From 369054b02d2ec33daddf0a667fddaa1d83e0031a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?=
Date: Fri, 24 Jan 2025 14:27:17 -0800
Subject: [PATCH] feat(processor): fetch YouTube watch time in bulk using the API

---
 internal/reader/processor/bilibili.go     |   2 +-
 internal/reader/processor/filters.go      | 200 +++++++++++++++
 internal/reader/processor/nebula.go       |   2 +-
 internal/reader/processor/odysee.go       |   2 +-
 internal/reader/processor/processor.go    | 283 +--------------------
 internal/reader/processor/reading_time.go |  63 +++++
 internal/reader/processor/youtube.go      | 140 +++++++----
 internal/reader/processor/youtube_test.go |  34 +++
 8 files changed, 396 insertions(+), 330 deletions(-)
 create mode 100644 internal/reader/processor/filters.go
 create mode 100644 internal/reader/processor/reading_time.go

diff --git a/internal/reader/processor/bilibili.go b/internal/reader/processor/bilibili.go
index b207ff03..d145d1b6 100644
--- a/internal/reader/processor/bilibili.go
+++ b/internal/reader/processor/bilibili.go
@@ -1,7 +1,7 @@
 // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-package processor
+package processor // import "miniflux.app/v2/internal/reader/processor"
 
 import (
 	"encoding/json"
diff --git a/internal/reader/processor/filters.go b/internal/reader/processor/filters.go
new file mode 100644
index 00000000..ae314f96
--- /dev/null
+++ b/internal/reader/processor/filters.go
@@ -0,0 +1,200 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package processor // import "miniflux.app/v2/internal/reader/processor"
+
+import (
+	"log/slog"
+	"regexp"
+	"slices"
+	"strings"
+	"time"
+
+	"miniflux.app/v2/internal/model"
+)
+
+func isBlockedEntry(feed *model.Feed, entry *model.Entry, user *model.User) bool {
+	if user.BlockFilterEntryRules != "" {
+		rules := strings.Split(user.BlockFilterEntryRules, "\n")
+		for _, rule := range rules {
+			parts := strings.SplitN(rule, "=", 2)
+
+			var match bool
+			switch parts[0] {
+			case "EntryDate":
+				datePattern := parts[1]
+				match = isDateMatchingPattern(entry.Date, datePattern)
+			case "EntryTitle":
+				match, _ = regexp.MatchString(parts[1], entry.Title)
+			case "EntryURL":
+				match, _ = regexp.MatchString(parts[1], entry.URL)
+			case "EntryCommentsURL":
+				match, _ = regexp.MatchString(parts[1], entry.CommentsURL)
+			case "EntryContent":
+				match, _ = regexp.MatchString(parts[1], entry.Content)
+			case "EntryAuthor":
+				match, _ = regexp.MatchString(parts[1], entry.Author)
+			case "EntryTag":
+				containsTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
+					match, _ = regexp.MatchString(parts[1], tag)
+					return match
+				})
+				if containsTag {
+					match = true
+				}
+			}
+
+			if match {
+				slog.Debug("Blocking entry based on rule",
+					slog.String("entry_url", entry.URL),
+					slog.Int64("feed_id", feed.ID),
+					slog.String("feed_url", feed.FeedURL),
+					slog.String("rule", rule),
+				)
+				return true
+			}
+		}
+	}
+
+	if feed.BlocklistRules == "" {
+		return false
+	}
+
+	compiledBlocklist, err := regexp.Compile(feed.BlocklistRules)
+	if err != nil {
+		slog.Debug("Failed on regexp compilation",
+			slog.String("pattern", feed.BlocklistRules),
+			slog.Any("error", err),
+		)
+		return false
+	}
+
+	containsBlockedTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
+		return compiledBlocklist.MatchString(tag)
+	})
+
+	if compiledBlocklist.MatchString(entry.URL) || compiledBlocklist.MatchString(entry.Title) || compiledBlocklist.MatchString(entry.Author) || containsBlockedTag {
+		slog.Debug("Blocking entry based on rule",
+			slog.String("entry_url", entry.URL),
+			slog.Int64("feed_id", feed.ID),
+			slog.String("feed_url", feed.FeedURL),
+			slog.String("rule", feed.BlocklistRules),
+		)
+		return true
+	}
+
+	return false
+}
+
+func isAllowedEntry(feed *model.Feed, entry *model.Entry, user *model.User) bool {
+	if user.KeepFilterEntryRules != "" {
+		rules := strings.Split(user.KeepFilterEntryRules, "\n")
+		for _, rule := range rules {
+			parts := strings.SplitN(rule, "=", 2)
+
+			var match bool
+			switch parts[0] {
+			case "EntryDate":
+				datePattern := parts[1]
+				match = isDateMatchingPattern(entry.Date, datePattern)
+			case "EntryTitle":
+				match, _ = regexp.MatchString(parts[1], entry.Title)
+			case "EntryURL":
+				match, _ = regexp.MatchString(parts[1], entry.URL)
+			case "EntryCommentsURL":
+				match, _ = regexp.MatchString(parts[1], entry.CommentsURL)
+			case "EntryContent":
+				match, _ = regexp.MatchString(parts[1], entry.Content)
+			case "EntryAuthor":
+				match, _ = regexp.MatchString(parts[1], entry.Author)
+			case "EntryTag":
+				containsTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
+					match, _ = regexp.MatchString(parts[1], tag)
+					return match
+				})
+				if containsTag {
+					match = true
+				}
+			}
+
+			if match {
+				slog.Debug("Allowing entry based on rule",
+					slog.String("entry_url", entry.URL),
+					slog.Int64("feed_id", feed.ID),
+					slog.String("feed_url", feed.FeedURL),
+					slog.String("rule", rule),
+				)
+				return true
+			}
+		}
+		return false
+	}
+
+	if feed.KeeplistRules == "" {
+		return true
+	}
+
+	compiledKeeplist, err := regexp.Compile(feed.KeeplistRules)
+	if err != nil {
+		slog.Debug("Failed on regexp compilation",
+			slog.String("pattern", feed.KeeplistRules),
+			slog.Any("error", err),
+		)
+		return false
+	}
+	containsAllowedTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
+		return compiledKeeplist.MatchString(tag)
+	})
+
+	if compiledKeeplist.MatchString(entry.URL) || compiledKeeplist.MatchString(entry.Title) || compiledKeeplist.MatchString(entry.Author) || containsAllowedTag {
+		slog.Debug("Allowing entry based on rule",
+			slog.String("entry_url", entry.URL),
+			slog.Int64("feed_id", feed.ID),
+			slog.String("feed_url", feed.FeedURL),
+			slog.String("rule", feed.KeeplistRules),
+		)
+		return true
+	}
+	return false
+}
+
+func isDateMatchingPattern(entryDate time.Time, pattern string) bool {
+	if pattern == "future" {
+		return entryDate.After(time.Now())
+	}
+
+	parts := strings.SplitN(pattern, ":", 2)
+	if len(parts) != 2 {
+		return false
+	}
+
+	operator := parts[0]
+	dateStr := parts[1]
+
+	switch operator {
+	case "before":
+		targetDate, err := time.Parse("2006-01-02", dateStr)
+		if err != nil {
+			return false
+		}
+		return entryDate.Before(targetDate)
+	case "after":
+		targetDate, err := time.Parse("2006-01-02", dateStr)
+		if err != nil {
+			return false
+		}
+		return entryDate.After(targetDate)
+	case "between":
+		dates := strings.Split(dateStr, ",")
+		if len(dates) != 2 {
+			return false
+		}
+		startDate, err1 := time.Parse("2006-01-02", dates[0])
+		endDate, err2 := time.Parse("2006-01-02", dates[1])
+		if err1 != nil || err2 != nil {
+			return false
+		}
+		return entryDate.After(startDate) && entryDate.Before(endDate)
+	}
+	return false
+}
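Context for the rules parsed above, as an aside rather than part of the patch: user-level filters are stored one per line as "Field=pattern". Every field except EntryDate takes a Go regular expression (for example, "EntryTitle=(?i)sponsored" blocks sponsored posts case-insensitively), while EntryDate takes the patterns handled by isDateMatchingPattern. A sketch of the expected date matches, with invented dates:

    // Sketch only; dates are invented. Note that before/after/between
    // compare exclusively, so boundary dates do not match.
    entryDate, _ := time.Parse("2006-01-02", "2024-06-15")
    isDateMatchingPattern(entryDate, "before:2025-01-01")             // true
    isDateMatchingPattern(entryDate, "after:2025-01-01")              // false
    isDateMatchingPattern(entryDate, "between:2024-01-01,2024-12-31") // true
    isDateMatchingPattern(entryDate, "future")                        // true only for dates past time.Now()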
diff --git a/internal/reader/processor/nebula.go b/internal/reader/processor/nebula.go
index 216e9b34..fa3d75da 100644
--- a/internal/reader/processor/nebula.go
+++ b/internal/reader/processor/nebula.go
@@ -1,7 +1,7 @@
 // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-package processor
+package processor // import "miniflux.app/v2/internal/reader/processor"
 
 import (
 	"errors"
diff --git a/internal/reader/processor/odysee.go b/internal/reader/processor/odysee.go
index 873ae60c..bdbcc32f 100644
--- a/internal/reader/processor/odysee.go
+++ b/internal/reader/processor/odysee.go
@@ -1,7 +1,7 @@
 // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-package processor
+package processor // import "miniflux.app/v2/internal/reader/processor"
 
 import (
 	"errors"
diff --git a/internal/reader/processor/processor.go b/internal/reader/processor/processor.go
index 3c824b66..f7e9d101 100644
--- a/internal/reader/processor/processor.go
+++ b/internal/reader/processor/processor.go
@@ -1,13 +1,11 @@
 // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-package processor
+package processor // import "miniflux.app/v2/internal/reader/processor"
 
 import (
 	"log/slog"
 	"regexp"
-	"slices"
-	"strings"
 	"time"
 
 	"github.com/tdewolff/minify/v2"
@@ -127,157 +125,17 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
 		entry.Content = sanitizer.Sanitize(pageBaseURL, entry.Content)
 
 		updateEntryReadingTime(store, feed, entry, entryIsNew, user)
+
 		filteredEntries = append(filteredEntries, entry)
 	}
 
+	if user.ShowReadingTime && shouldFetchYouTubeWatchTimeInBulk() {
+		fetchYouTubeWatchTimeInBulk(filteredEntries)
+	}
+
 	feed.Entries = filteredEntries
 }
 
-func isBlockedEntry(feed *model.Feed, entry *model.Entry, user *model.User) bool {
-	if user.BlockFilterEntryRules != "" {
-		rules := strings.Split(user.BlockFilterEntryRules, "\n")
-		for _, rule := range rules {
-			parts := strings.SplitN(rule, "=", 2)
-
-			var match bool
-			switch parts[0] {
-			case "EntryDate":
-				datePattern := parts[1]
-				match = isDateMatchingPattern(entry.Date, datePattern)
-			case "EntryTitle":
-				match, _ = regexp.MatchString(parts[1], entry.Title)
-			case "EntryURL":
-				match, _ = regexp.MatchString(parts[1], entry.URL)
-			case "EntryCommentsURL":
-				match, _ = regexp.MatchString(parts[1], entry.CommentsURL)
-			case "EntryContent":
-				match, _ = regexp.MatchString(parts[1], entry.Content)
-			case "EntryAuthor":
-				match, _ = regexp.MatchString(parts[1], entry.Author)
-			case "EntryTag":
-				containsTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
-					match, _ = regexp.MatchString(parts[1], tag)
-					return match
-				})
-				if containsTag {
-					match = true
-				}
-			}
-
-			if match {
-				slog.Debug("Blocking entry based on rule",
-					slog.String("entry_url", entry.URL),
-					slog.Int64("feed_id", feed.ID),
-					slog.String("feed_url", feed.FeedURL),
-					slog.String("rule", rule),
-				)
-				return true
-			}
-		}
-	}
-
-	if feed.BlocklistRules == "" {
-		return false
-	}
-
-	compiledBlocklist, err := regexp.Compile(feed.BlocklistRules)
-	if err != nil {
-		slog.Debug("Failed on regexp compilation",
-			slog.String("pattern", feed.BlocklistRules),
-			slog.Any("error", err),
-		)
-		return false
-	}
-
-	containsBlockedTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
-		return compiledBlocklist.MatchString(tag)
-	})
-
-	if compiledBlocklist.MatchString(entry.URL) || compiledBlocklist.MatchString(entry.Title) || compiledBlocklist.MatchString(entry.Author) || containsBlockedTag {
-		slog.Debug("Blocking entry based on rule",
-			slog.String("entry_url", entry.URL),
-			slog.Int64("feed_id", feed.ID),
-			slog.String("feed_url", feed.FeedURL),
-			slog.String("rule", feed.BlocklistRules),
-		)
-		return true
-	}
-
-	return false
-}
-
-func isAllowedEntry(feed *model.Feed, entry *model.Entry, user *model.User) bool {
-	if user.KeepFilterEntryRules != "" {
-		rules := strings.Split(user.KeepFilterEntryRules, "\n")
-		for _, rule := range rules {
-			parts := strings.SplitN(rule, "=", 2)
-
-			var match bool
-			switch parts[0] {
-			case "EntryDate":
-				datePattern := parts[1]
-				match = isDateMatchingPattern(entry.Date, datePattern)
-			case "EntryTitle":
-				match, _ = regexp.MatchString(parts[1], entry.Title)
-			case "EntryURL":
-				match, _ = regexp.MatchString(parts[1], entry.URL)
-			case "EntryCommentsURL":
-				match, _ = regexp.MatchString(parts[1], entry.CommentsURL)
-			case "EntryContent":
-				match, _ = regexp.MatchString(parts[1], entry.Content)
-			case "EntryAuthor":
-				match, _ = regexp.MatchString(parts[1], entry.Author)
-			case "EntryTag":
-				containsTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
-					match, _ = regexp.MatchString(parts[1], tag)
-					return match
-				})
-				if containsTag {
-					match = true
-				}
-			}
-
-			if match {
-				slog.Debug("Allowing entry based on rule",
-					slog.String("entry_url", entry.URL),
-					slog.Int64("feed_id", feed.ID),
-					slog.String("feed_url", feed.FeedURL),
-					slog.String("rule", rule),
-				)
-				return true
-			}
-		}
-		return false
-	}
-
-	if feed.KeeplistRules == "" {
-		return true
-	}
-
-	compiledKeeplist, err := regexp.Compile(feed.KeeplistRules)
-	if err != nil {
-		slog.Debug("Failed on regexp compilation",
-			slog.String("pattern", feed.KeeplistRules),
-			slog.Any("error", err),
-		)
-		return false
-	}
-	containsAllowedTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
-		return compiledKeeplist.MatchString(tag)
-	})
-
-	if compiledKeeplist.MatchString(entry.URL) || compiledKeeplist.MatchString(entry.Title) || compiledKeeplist.MatchString(entry.Author) || containsAllowedTag {
-		slog.Debug("Allow entry based on rule",
-			slog.String("entry_url", entry.URL),
-			slog.Int64("feed_id", feed.ID),
-			slog.String("feed_url", feed.FeedURL),
-			slog.String("rule", feed.KeeplistRules),
-		)
-		return true
-	}
-	return false
-}
-
 // ProcessEntryWebPage downloads the entry web page and apply rewrite rules.
 func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) error {
 	startTime := time.Now()
@@ -358,94 +216,6 @@ func rewriteEntryURL(feed *model.Feed, entry *model.Entry) string {
 	return rewrittenURL
 }
 
-func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *model.Entry, entryIsNew bool, user *model.User) {
-	if !user.ShowReadingTime {
-		slog.Debug("Skip reading time estimation for this user", slog.Int64("user_id", user.ID))
-		return
-	}
-
-	if shouldFetchYouTubeWatchTime(entry) {
-		if entryIsNew {
-			watchTime, err := fetchYouTubeWatchTime(entry.URL)
-			if err != nil {
-				slog.Warn("Unable to fetch YouTube watch time",
-					slog.Int64("user_id", user.ID),
-					slog.Int64("entry_id", entry.ID),
-					slog.String("entry_url", entry.URL),
-					slog.Int64("feed_id", feed.ID),
-					slog.String("feed_url", feed.FeedURL),
-					slog.Any("error", err),
-				)
-			}
-			entry.ReadingTime = watchTime
-		} else {
-			entry.ReadingTime = store.GetReadTime(feed.ID, entry.Hash)
-		}
-	}
-
-	if shouldFetchNebulaWatchTime(entry) {
-		if entryIsNew {
-			watchTime, err := fetchNebulaWatchTime(entry.URL)
-			if err != nil {
-				slog.Warn("Unable to fetch Nebula watch time",
-					slog.Int64("user_id", user.ID),
-					slog.Int64("entry_id", entry.ID),
-					slog.String("entry_url", entry.URL),
-					slog.Int64("feed_id", feed.ID),
-					slog.String("feed_url", feed.FeedURL),
-					slog.Any("error", err),
-				)
-			}
-			entry.ReadingTime = watchTime
-		} else {
-			entry.ReadingTime = store.GetReadTime(feed.ID, entry.Hash)
-		}
-	}
-
-	if shouldFetchOdyseeWatchTime(entry) {
-		if entryIsNew {
-			watchTime, err := fetchOdyseeWatchTime(entry.URL)
-			if err != nil {
-				slog.Warn("Unable to fetch Odysee watch time",
-					slog.Int64("user_id", user.ID),
-					slog.Int64("entry_id", entry.ID),
-					slog.String("entry_url", entry.URL),
-					slog.Int64("feed_id", feed.ID),
-					slog.String("feed_url", feed.FeedURL),
-					slog.Any("error", err),
-				)
-			}
-			entry.ReadingTime = watchTime
-		} else {
-			entry.ReadingTime = store.GetReadTime(feed.ID, entry.Hash)
-		}
-	}
-
-	if shouldFetchBilibiliWatchTime(entry) {
-		if entryIsNew {
-			watchTime, err := fetchBilibiliWatchTime(entry.URL)
-			if err != nil {
-				slog.Warn("Unable to fetch Bilibili watch time",
-					slog.Int64("user_id", user.ID),
-					slog.Int64("entry_id", entry.ID),
-					slog.String("entry_url", entry.URL),
-					slog.Int64("feed_id", feed.ID),
-					slog.String("feed_url", feed.FeedURL),
-					slog.Any("error", err),
-				)
-			}
-			entry.ReadingTime = watchTime
-		} else {
-			entry.ReadingTime = store.GetReadTime(feed.ID, entry.Hash)
-		}
-	}
-
-	// Handle YT error case and non-YT entries.
-	if entry.ReadingTime == 0 {
-		entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed)
-	}
-}
-
 func isRecentEntry(entry *model.Entry) bool {
 	if config.Opts.FilterEntryMaxAgeDays() == 0 || entry.Date.After(time.Now().AddDate(0, 0, -config.Opts.FilterEntryMaxAgeDays())) {
 		return true
@@ -468,44 +238,3 @@ func minifyEntryContent(entryContent string) string {
 
 	return entryContent
 }
-
-func isDateMatchingPattern(entryDate time.Time, pattern string) bool {
-	if pattern == "future" {
-		return entryDate.After(time.Now())
-	}
-
-	parts := strings.SplitN(pattern, ":", 2)
-	if len(parts) != 2 {
-		return false
-	}
-
-	operator := parts[0]
-	dateStr := parts[1]
-
-	switch operator {
-	case "before":
-		targetDate, err := time.Parse("2006-01-02", dateStr)
-		if err != nil {
-			return false
-		}
-		return entryDate.Before(targetDate)
-	case "after":
-		targetDate, err := time.Parse("2006-01-02", dateStr)
-		if err != nil {
-			return false
-		}
-		return entryDate.After(targetDate)
-	case "between":
-		dates := strings.Split(dateStr, ",")
-		if len(dates) != 2 {
-			return false
-		}
-		startDate, err1 := time.Parse("2006-01-02", dates[0])
-		endDate, err2 := time.Parse("2006-01-02", dates[1])
-		if err1 != nil || err2 != nil {
-			return false
-		}
-		return entryDate.After(startDate) && entryDate.Before(endDate)
-	}
-	return false
-}
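A consolidated view of the resulting control flow in ProcessFeedEntries, as an illustration rather than a verbatim excerpt (identifiers are from this patch; the loop body is elided): per-entry estimation still runs for every entry, but when a YouTube API key is configured the per-entry YouTube fetch becomes a no-op and a single bulk pass runs after the loop, so refreshing a feed with dozens of videos costs one API round-trip instead of one page fetch per entry.

    for _, entry := range feed.Entries {
        // ...filtering, scraping, rewriting, sanitizing...
        updateEntryReadingTime(store, feed, entry, entryIsNew, user) // skips YouTube when an API key is set
        filteredEntries = append(filteredEntries, entry)
    }
    if user.ShowReadingTime && shouldFetchYouTubeWatchTimeInBulk() {
        fetchYouTubeWatchTimeInBulk(filteredEntries) // one videos.list call per refresh
    }
    feed.Entries = filteredEntries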
diff --git a/internal/reader/processor/reading_time.go b/internal/reader/processor/reading_time.go
new file mode 100644
index 00000000..8fe9b12e
--- /dev/null
+++ b/internal/reader/processor/reading_time.go
@@ -0,0 +1,63 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package processor // import "miniflux.app/v2/internal/reader/processor"
+
+import (
+	"log/slog"
+
+	"miniflux.app/v2/internal/model"
+	"miniflux.app/v2/internal/reader/readingtime"
+	"miniflux.app/v2/internal/storage"
+)
+
+func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *model.Entry, entryIsNew bool, user *model.User) {
+	if !user.ShowReadingTime {
+		slog.Debug("Skip reading time estimation for this user", slog.Int64("user_id", user.ID))
+		return
+	}
+
+	// Define a type for watch time fetching functions
+	type watchTimeFetcher func(string) (int, error)
+
+	// Define watch time fetching scenarios
+	watchTimeScenarios := []struct {
+		shouldFetch func(*model.Entry) bool
+		fetchFunc   watchTimeFetcher
+		platform    string
+	}{
+		{shouldFetchYouTubeWatchTimeForSingleEntry, fetchYouTubeWatchTimeForSingleEntry, "YouTube"},
+		{shouldFetchNebulaWatchTime, fetchNebulaWatchTime, "Nebula"},
+		{shouldFetchOdyseeWatchTime, fetchOdyseeWatchTime, "Odysee"},
+		{shouldFetchBilibiliWatchTime, fetchBilibiliWatchTime, "Bilibili"},
+	}
+
+	// Iterate through scenarios and attempt to fetch watch time
+	for _, scenario := range watchTimeScenarios {
+		if scenario.shouldFetch(entry) {
+			if entryIsNew {
+				if watchTime, err := scenario.fetchFunc(entry.URL); err != nil {
+					slog.Warn("Unable to fetch watch time",
+						slog.String("platform", scenario.platform),
+						slog.Int64("user_id", user.ID),
+						slog.Int64("entry_id", entry.ID),
+						slog.String("entry_url", entry.URL),
+						slog.Int64("feed_id", feed.ID),
+						slog.String("feed_url", feed.FeedURL),
+						slog.Any("error", err),
+					)
+				} else {
+					entry.ReadingTime = watchTime
+				}
+			} else {
+				entry.ReadingTime = store.GetReadTime(feed.ID, entry.Hash)
+			}
+			break
+		}
+	}
+
+	// Fallback to text-based reading time estimation
+	if entry.ReadingTime == 0 && entry.Content != "" {
+		entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed)
+	}
+}
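The scenario table above pairs each platform's URL matcher with its fetcher, and the break after the first match means at most one watch-time fetch runs per entry. Purely illustrative, with invented Vimeo helpers: supporting another platform would be one more row, provided the helpers match the func(*model.Entry) bool and func(string) (int, error) signatures.

    // Hypothetical row; shouldFetchVimeoWatchTime and fetchVimeoWatchTime
    // do not exist in Miniflux.
    {shouldFetchVimeoWatchTime, fetchVimeoWatchTime, "Vimeo"},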
entry.Content != "" { + entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed) + } +} diff --git a/internal/reader/processor/youtube.go b/internal/reader/processor/youtube.go index c8ae1d3c..4ebbc5a0 100644 --- a/internal/reader/processor/youtube.go +++ b/internal/reader/processor/youtube.go @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -package processor +package processor // import "miniflux.app/v2/internal/reader/processor import ( "encoding/json" @@ -11,6 +11,7 @@ import ( "net/url" "regexp" "strconv" + "strings" "time" "github.com/PuerkitoBio/goquery" @@ -25,24 +26,30 @@ var ( iso8601Regex = regexp.MustCompile(`^P((?P\d+)Y)?((?P\d+)M)?((?P\d+)W)?((?P\d+)D)?(T((?P\d+)H)?((?P\d+)M)?((?P\d+)S)?)?$`) ) -func shouldFetchYouTubeWatchTime(entry *model.Entry) bool { - if !config.Opts.FetchYouTubeWatchTime() { - return false - } - matches := youtubeRegex.FindStringSubmatch(entry.URL) - urlMatchesYouTubePattern := len(matches) == 2 - return urlMatchesYouTubePattern +func isYouTubeVideoURL(websiteURL string) bool { + return len(youtubeRegex.FindStringSubmatch(websiteURL)) == 2 } -func fetchYouTubeWatchTime(websiteURL string) (int, error) { - if config.Opts.YouTubeApiKey() == "" { - return fetchYouTubeWatchTimeFromWebsite(websiteURL) - } else { - return fetchYouTubeWatchTimeFromApi(websiteURL) +func getVideoIDFromYouTubeURL(websiteURL string) string { + parsedWebsiteURL, err := url.Parse(websiteURL) + if err != nil { + return "" } + + return parsedWebsiteURL.Query().Get("v") } -func fetchYouTubeWatchTimeFromWebsite(websiteURL string) (int, error) { +func shouldFetchYouTubeWatchTimeForSingleEntry(entry *model.Entry) bool { + return config.Opts.FetchYouTubeWatchTime() && config.Opts.YouTubeApiKey() == "" && isYouTubeVideoURL(entry.URL) +} + +func shouldFetchYouTubeWatchTimeInBulk() bool { + return config.Opts.FetchYouTubeWatchTime() && config.Opts.YouTubeApiKey() != "" +} + +func fetchYouTubeWatchTimeForSingleEntry(websiteURL string) (int, error) { + slog.Debug("Fetching YouTube watch time for a single entry", slog.String("website_url", websiteURL)) + requestBuilder := fetcher.NewRequestBuilder() requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout()) requestBuilder.WithProxy(config.Opts.HTTPClientProxy()) @@ -60,31 +67,59 @@ func fetchYouTubeWatchTimeFromWebsite(websiteURL string) (int, error) { return 0, docErr } - durs, exists := doc.FindMatcher(goquery.Single(`meta[itemprop="duration"]`)).Attr("content") + htmlDuration, exists := doc.FindMatcher(goquery.Single(`meta[itemprop="duration"]`)).Attr("content") if !exists { - return 0, errors.New("duration has not found") + return 0, errors.New("youtube: duration has not found") } - dur, err := parseISO8601(durs) + parsedDuration, err := parseISO8601(htmlDuration) if err != nil { - return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err) + return 0, fmt.Errorf("youtube: unable to parse duration %s: %v", htmlDuration, err) } - return int(dur.Minutes()), nil + return int(parsedDuration.Minutes()), nil } -func fetchYouTubeWatchTimeFromApi(websiteURL string) (int, error) { - requestBuilder := fetcher.NewRequestBuilder() - requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout()) - requestBuilder.WithProxy(config.Opts.HTTPClientProxy()) +func fetchYouTubeWatchTimeInBulk(entries []*model.Entry) { + var videosEntriesMapping = make(map[string]*model.Entry) + var videoIDs []string - 
parsedWebsiteURL, err := url.Parse(websiteURL) - if err != nil { - return 0, fmt.Errorf("unable to parse URL: %v", err) + for _, entry := range entries { + if !isYouTubeVideoURL(entry.URL) { + continue + } + + youtubeVideoID := getVideoIDFromYouTubeURL(entry.URL) + if youtubeVideoID == "" { + continue + } + + videosEntriesMapping[getVideoIDFromYouTubeURL(entry.URL)] = entry + videoIDs = append(videoIDs, youtubeVideoID) } + if len(videoIDs) == 0 { + return + } + + watchTimeMap, err := fetchYouTubeWatchTimeFromApiInBulk(videoIDs) + if err != nil { + slog.Warn("Unable to fetch YouTube watch time in bulk", slog.Any("error", err)) + return + } + + for videoID, watchTime := range watchTimeMap { + if entry, ok := videosEntriesMapping[videoID]; ok { + entry.ReadingTime = int(watchTime.Minutes()) + } + } +} + +func fetchYouTubeWatchTimeFromApiInBulk(videoIDs []string) (map[string]time.Duration, error) { + slog.Debug("Fetching YouTube watch time in bulk", slog.Any("video_ids", videoIDs)) + apiQuery := url.Values{} - apiQuery.Set("id", parsedWebsiteURL.Query().Get("v")) + apiQuery.Set("id", strings.Join(videoIDs, ",")) apiQuery.Set("key", config.Opts.YouTubeApiKey()) apiQuery.Set("part", "contentDetails") @@ -95,37 +130,33 @@ func fetchYouTubeWatchTimeFromApi(websiteURL string) (int, error) { RawQuery: apiQuery.Encode(), } + requestBuilder := fetcher.NewRequestBuilder() + requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout()) + requestBuilder.WithProxy(config.Opts.HTTPClientProxy()) + responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(apiURL.String())) defer responseHandler.Close() if localizedError := responseHandler.LocalizedError(); localizedError != nil { - slog.Warn("Unable to fetch contentDetails from YouTube API", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error())) - return 0, localizedError.Error() - } - - var videos struct { - Items []struct { - ContentDetails struct { - Duration string `json:"duration"` - } `json:"contentDetails"` - } `json:"items"` + slog.Warn("Unable to fetch contentDetails from YouTube API", slog.Any("error", localizedError.Error())) + return nil, localizedError.Error() } + var videos youtubeVideoListResponse if err := json.NewDecoder(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())).Decode(&videos); err != nil { - return 0, fmt.Errorf("unable to decode JSON: %v", err) + return nil, fmt.Errorf("youtube: unable to decode JSON: %v", err) } - if n := len(videos.Items); n != 1 { - return 0, fmt.Errorf("invalid items length: %d", n) + watchTimeMap := make(map[string]time.Duration) + for _, video := range videos.Items { + duration, err := parseISO8601(video.ContentDetails.Duration) + if err != nil { + slog.Warn("Unable to parse ISO8601 duration", slog.Any("error", err)) + continue + } + watchTimeMap[video.ID] = duration } - - durs := videos.Items[0].ContentDetails.Duration - dur, err := parseISO8601(durs) - if err != nil { - return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err) - } - - return int(dur.Minutes()), nil + return watchTimeMap, nil } func parseISO8601(from string) (time.Duration, error) { @@ -135,7 +166,7 @@ func parseISO8601(from string) (time.Duration, error) { if iso8601Regex.MatchString(from) { match = iso8601Regex.FindStringSubmatch(from) } else { - return 0, errors.New("could not parse duration string") + return 0, errors.New("youtube: could not parse duration string") } for i, name := range iso8601Regex.SubexpNames() { @@ -157,9 +188,18 @@ func parseISO8601(from string) 
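For reference, the wire format this assembles, assuming the standard YouTube Data API v3 videos.list endpoint that the elided apiURL fields point at; the IDs, durations, and key below are invented. Note that videos.list documents a cap of 50 IDs per request, so an unusually large batch would need chunking, which this patch does not attempt.

    // Request produced by fetchYouTubeWatchTimeFromApiInBulk for two entries
    // (url.Values.Encode sorts parameters alphabetically and escapes the comma):
    //
    //   https://www.googleapis.com/youtube/v3/videos?id=HLrqNhgdiC0%2CdQw4w9WgXcQ&key=API_KEY&part=contentDetails
    //
    // Response body decoded into youtubeVideoListResponse:
    //
    //   {"items": [
    //     {"id": "HLrqNhgdiC0", "contentDetails": {"duration": "PT19M35S"}},
    //     {"id": "dQw4w9WgXcQ", "contentDetails": {"duration": "PT3M33S"}}
    //   ]}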
@@ -135,7 +166,7 @@ func parseISO8601(from string) (time.Duration, error) {
 	if iso8601Regex.MatchString(from) {
 		match = iso8601Regex.FindStringSubmatch(from)
 	} else {
-		return 0, errors.New("could not parse duration string")
+		return 0, errors.New("youtube: could not parse duration string")
 	}
 
 	for i, name := range iso8601Regex.SubexpNames() {
@@ -157,9 +188,18 @@ func parseISO8601(from string) (time.Duration, error) {
 		case "second":
 			d += time.Duration(val) * time.Second
 		default:
-			return 0, fmt.Errorf("unknown field %s", name)
+			return 0, fmt.Errorf("youtube: unknown field %s", name)
 		}
 	}
 
 	return d, nil
 }
+
+type youtubeVideoListResponse struct {
+	Items []struct {
+		ID             string `json:"id"`
+		ContentDetails struct {
+			Duration string `json:"duration"`
+		} `json:"contentDetails"`
+	} `json:"items"`
+}
diff --git a/internal/reader/processor/youtube_test.go b/internal/reader/processor/youtube_test.go
index 66f72661..9018abcd 100644
--- a/internal/reader/processor/youtube_test.go
+++ b/internal/reader/processor/youtube_test.go
@@ -36,3 +36,37 @@ func TestParseISO8601(t *testing.T) {
 		}
 	}
 }
+
+func TestGetYouTubeVideoIDFromURL(t *testing.T) {
+	scenarios := []struct {
+		url      string
+		expected string
+	}{
+		{"https://www.youtube.com/watch?v=HLrqNhgdiC0", "HLrqNhgdiC0"},
+		{"https://www.youtube.com/watch?v=HLrqNhgdiC0&feature=youtu.be", "HLrqNhgdiC0"},
+		{"https://example.org/test", ""},
+	}
+	for _, tc := range scenarios {
+		result := getVideoIDFromYouTubeURL(tc.url)
+		if tc.expected != result {
+			t.Errorf(`Unexpected result, got %q for url %q`, result, tc.url)
+		}
+	}
+}
+
+func TestIsYouTubeVideoURL(t *testing.T) {
+	scenarios := []struct {
+		url      string
+		expected bool
+	}{
+		{"https://www.youtube.com/watch?v=HLrqNhgdiC0", true},
+		{"https://www.youtube.com/watch?v=HLrqNhgdiC0&feature=youtu.be", true},
+		{"https://example.org/test", false},
+	}
+	for _, tc := range scenarios {
+		result := isYouTubeVideoURL(tc.url)
+		if tc.expected != result {
+			t.Errorf(`Unexpected result, got %v for url %q`, result, tc.url)
+		}
+	}
+}
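A worked example of the duration parsing shared by both fetch paths (values invented; the int conversion truncates toward zero, so sub-minute remainders are dropped):

    d, _ := parseISO8601("PT1H2M3S") // 1h2m3s
    _ = int(d.Minutes())             // 62, stored as a 62-minute watch time
    d, _ = parseISO8601("PT3M33S")
    _ = int(d.Minutes())             // 3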