mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

feat(processor): fetch YouTube watch time in bulk using the API

Frédéric Guillot 2025-01-24 14:27:17 -08:00
parent c3c42b0c37
commit 369054b02d
8 changed files with 396 additions and 330 deletions
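For context, the bulk lookup introduced here targets the YouTube Data API v3 videos endpoint, which accepts a comma-separated list of video IDs (the API documents a cap of 50 IDs per request), so one HTTP call can resolve a whole batch of entries instead of one scrape per video. Below is a minimal standalone sketch of the request the new code assembles; the endpoint and query parameters follow the public API, while the key and the video IDs are placeholders:

    package main

    import (
    	"fmt"
    	"net/url"
    	"strings"
    )

    func main() {
    	// Example video IDs; in Miniflux these come from the feed entries.
    	videoIDs := []string{"HLrqNhgdiC0", "dQw4w9WgXcQ"}

    	apiQuery := url.Values{}
    	apiQuery.Set("id", strings.Join(videoIDs, ","))
    	apiQuery.Set("key", "YOUR_API_KEY") // placeholder, not a real key
    	apiQuery.Set("part", "contentDetails")

    	apiURL := url.URL{
    		Scheme:   "https",
    		Host:     "www.googleapis.com",
    		Path:     "/youtube/v3/videos",
    		RawQuery: apiQuery.Encode(),
    	}
    	fmt.Println(apiURL.String())
    	// https://www.googleapis.com/youtube/v3/videos?id=HLrqNhgdiC0%2CdQw4w9WgXcQ&key=YOUR_API_KEY&part=contentDetails
    }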

View file

@@ -1,7 +1,7 @@
 // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

-package processor
+package processor // import "miniflux.app/v2/internal/reader/processor"

 import (
 	"encoding/json"

View file

@@ -0,0 +1,200 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package processor // import "miniflux.app/v2/internal/reader/processor"
+
+import (
+	"log/slog"
+	"regexp"
+	"slices"
+	"strings"
+	"time"
+
+	"miniflux.app/v2/internal/model"
+)
+
+func isBlockedEntry(feed *model.Feed, entry *model.Entry, user *model.User) bool {
+	if user.BlockFilterEntryRules != "" {
+		rules := strings.Split(user.BlockFilterEntryRules, "\n")
+		for _, rule := range rules {
+			parts := strings.SplitN(rule, "=", 2)
+
+			var match bool
+			switch parts[0] {
+			case "EntryDate":
+				datePattern := parts[1]
+				match = isDateMatchingPattern(entry.Date, datePattern)
+			case "EntryTitle":
+				match, _ = regexp.MatchString(parts[1], entry.Title)
+			case "EntryURL":
+				match, _ = regexp.MatchString(parts[1], entry.URL)
+			case "EntryCommentsURL":
+				match, _ = regexp.MatchString(parts[1], entry.CommentsURL)
+			case "EntryContent":
+				match, _ = regexp.MatchString(parts[1], entry.Content)
+			case "EntryAuthor":
+				match, _ = regexp.MatchString(parts[1], entry.Author)
+			case "EntryTag":
+				containsTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
+					match, _ = regexp.MatchString(parts[1], tag)
+					return match
+				})
+				if containsTag {
+					match = true
+				}
+			}
+
+			if match {
+				slog.Debug("Blocking entry based on rule",
+					slog.String("entry_url", entry.URL),
+					slog.Int64("feed_id", feed.ID),
+					slog.String("feed_url", feed.FeedURL),
+					slog.String("rule", rule),
+				)
+				return true
+			}
+		}
+	}
+
+	if feed.BlocklistRules == "" {
+		return false
+	}
+
+	compiledBlocklist, err := regexp.Compile(feed.BlocklistRules)
+	if err != nil {
+		slog.Debug("Failed on regexp compilation",
+			slog.String("pattern", feed.BlocklistRules),
+			slog.Any("error", err),
+		)
+		return false
+	}
+
+	containsBlockedTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
+		return compiledBlocklist.MatchString(tag)
+	})
+
+	if compiledBlocklist.MatchString(entry.URL) || compiledBlocklist.MatchString(entry.Title) || compiledBlocklist.MatchString(entry.Author) || containsBlockedTag {
+		slog.Debug("Blocking entry based on rule",
+			slog.String("entry_url", entry.URL),
+			slog.Int64("feed_id", feed.ID),
+			slog.String("feed_url", feed.FeedURL),
+			slog.String("rule", feed.BlocklistRules),
+		)
+		return true
+	}
+
+	return false
+}
+
+func isAllowedEntry(feed *model.Feed, entry *model.Entry, user *model.User) bool {
+	if user.KeepFilterEntryRules != "" {
+		rules := strings.Split(user.KeepFilterEntryRules, "\n")
+		for _, rule := range rules {
+			parts := strings.SplitN(rule, "=", 2)
+
+			var match bool
+			switch parts[0] {
+			case "EntryDate":
+				datePattern := parts[1]
+				match = isDateMatchingPattern(entry.Date, datePattern)
+			case "EntryTitle":
+				match, _ = regexp.MatchString(parts[1], entry.Title)
+			case "EntryURL":
+				match, _ = regexp.MatchString(parts[1], entry.URL)
+			case "EntryCommentsURL":
+				match, _ = regexp.MatchString(parts[1], entry.CommentsURL)
+			case "EntryContent":
+				match, _ = regexp.MatchString(parts[1], entry.Content)
+			case "EntryAuthor":
+				match, _ = regexp.MatchString(parts[1], entry.Author)
+			case "EntryTag":
+				containsTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
+					match, _ = regexp.MatchString(parts[1], tag)
+					return match
+				})
+				if containsTag {
+					match = true
+				}
+			}
+
+			if match {
+				slog.Debug("Allowing entry based on rule",
+					slog.String("entry_url", entry.URL),
+					slog.Int64("feed_id", feed.ID),
+					slog.String("feed_url", feed.FeedURL),
+					slog.String("rule", rule),
+				)
+				return true
+			}
+		}
+		return false
+	}
+
+	if feed.KeeplistRules == "" {
+		return true
+	}
+
+	compiledKeeplist, err := regexp.Compile(feed.KeeplistRules)
+	if err != nil {
+		slog.Debug("Failed on regexp compilation",
+			slog.String("pattern", feed.KeeplistRules),
+			slog.Any("error", err),
+		)
+		return false
+	}
+
+	containsAllowedTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
+		return compiledKeeplist.MatchString(tag)
+	})
+
+	if compiledKeeplist.MatchString(entry.URL) || compiledKeeplist.MatchString(entry.Title) || compiledKeeplist.MatchString(entry.Author) || containsAllowedTag {
+		slog.Debug("Allow entry based on rule",
+			slog.String("entry_url", entry.URL),
+			slog.Int64("feed_id", feed.ID),
+			slog.String("feed_url", feed.FeedURL),
+			slog.String("rule", feed.KeeplistRules),
+		)
+		return true
+	}
+
+	return false
+}
+
+func isDateMatchingPattern(entryDate time.Time, pattern string) bool {
+	if pattern == "future" {
+		return entryDate.After(time.Now())
+	}
+
+	parts := strings.SplitN(pattern, ":", 2)
+	if len(parts) != 2 {
+		return false
+	}
+
+	operator := parts[0]
+	dateStr := parts[1]
+
+	switch operator {
+	case "before":
+		targetDate, err := time.Parse("2006-01-02", dateStr)
+		if err != nil {
+			return false
+		}
+		return entryDate.Before(targetDate)
+	case "after":
+		targetDate, err := time.Parse("2006-01-02", dateStr)
+		if err != nil {
+			return false
+		}
+		return entryDate.After(targetDate)
+	case "between":
+		dates := strings.Split(dateStr, ",")
+		if len(dates) != 2 {
+			return false
+		}
+		startDate, err1 := time.Parse("2006-01-02", dates[0])
+		endDate, err2 := time.Parse("2006-01-02", dates[1])
+		if err1 != nil || err2 != nil {
+			return false
+		}
+		return entryDate.After(startDate) && entryDate.Before(endDate)
+	}
+	return false
+}
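As a side note on the rule format these functions parse: each line is one Field=value pair, where value is a Go regular expression for the text fields and one of the special forms handled by isDateMatchingPattern for EntryDate (dates use the 2006-01-02, i.e. YYYY-MM-DD, layout). Feed-level BlocklistRules and KeeplistRules are instead a single regex matched against URL, title, author, and tags. A hypothetical block-filter configuration, with made-up patterns:

    EntryTitle=(?i)sponsored
    EntryURL=example\.org/ads/
    EntryDate=before:2024-01-01
    EntryDate=between:2024-06-01,2024-06-30
    EntryDate=future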

View file

@@ -1,7 +1,7 @@
 // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

-package processor
+package processor // import "miniflux.app/v2/internal/reader/processor"

 import (
 	"errors"

View file

@@ -1,7 +1,7 @@
 // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

-package processor
+package processor // import "miniflux.app/v2/internal/reader/processor"

 import (
 	"errors"

View file

@@ -1,13 +1,11 @@
 // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

-package processor
+package processor // import "miniflux.app/v2/internal/reader/processor"

 import (
 	"log/slog"
 	"regexp"
-	"slices"
-	"strings"
 	"time"

 	"github.com/tdewolff/minify/v2"
@@ -127,157 +125,17 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
 		entry.Content = sanitizer.Sanitize(pageBaseURL, entry.Content)
 		updateEntryReadingTime(store, feed, entry, entryIsNew, user)
 		filteredEntries = append(filteredEntries, entry)
 	}

+	if user.ShowReadingTime && shouldFetchYouTubeWatchTimeInBulk() {
+		fetchYouTubeWatchTimeInBulk(filteredEntries)
+	}
+
 	feed.Entries = filteredEntries
 }
-func isBlockedEntry(feed *model.Feed, entry *model.Entry, user *model.User) bool {
-	if user.BlockFilterEntryRules != "" {
-		rules := strings.Split(user.BlockFilterEntryRules, "\n")
-		for _, rule := range rules {
-			parts := strings.SplitN(rule, "=", 2)
-
-			var match bool
-			switch parts[0] {
-			case "EntryDate":
-				datePattern := parts[1]
-				match = isDateMatchingPattern(entry.Date, datePattern)
-			case "EntryTitle":
-				match, _ = regexp.MatchString(parts[1], entry.Title)
-			case "EntryURL":
-				match, _ = regexp.MatchString(parts[1], entry.URL)
-			case "EntryCommentsURL":
-				match, _ = regexp.MatchString(parts[1], entry.CommentsURL)
-			case "EntryContent":
-				match, _ = regexp.MatchString(parts[1], entry.Content)
-			case "EntryAuthor":
-				match, _ = regexp.MatchString(parts[1], entry.Author)
-			case "EntryTag":
-				containsTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
-					match, _ = regexp.MatchString(parts[1], tag)
-					return match
-				})
-				if containsTag {
-					match = true
-				}
-			}
-
-			if match {
-				slog.Debug("Blocking entry based on rule",
-					slog.String("entry_url", entry.URL),
-					slog.Int64("feed_id", feed.ID),
-					slog.String("feed_url", feed.FeedURL),
-					slog.String("rule", rule),
-				)
-				return true
-			}
-		}
-	}
-
-	if feed.BlocklistRules == "" {
-		return false
-	}
-
-	compiledBlocklist, err := regexp.Compile(feed.BlocklistRules)
-	if err != nil {
-		slog.Debug("Failed on regexp compilation",
-			slog.String("pattern", feed.BlocklistRules),
-			slog.Any("error", err),
-		)
-		return false
-	}
-
-	containsBlockedTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
-		return compiledBlocklist.MatchString(tag)
-	})
-
-	if compiledBlocklist.MatchString(entry.URL) || compiledBlocklist.MatchString(entry.Title) || compiledBlocklist.MatchString(entry.Author) || containsBlockedTag {
-		slog.Debug("Blocking entry based on rule",
-			slog.String("entry_url", entry.URL),
-			slog.Int64("feed_id", feed.ID),
-			slog.String("feed_url", feed.FeedURL),
-			slog.String("rule", feed.BlocklistRules),
-		)
-		return true
-	}
-
-	return false
-}
-
-func isAllowedEntry(feed *model.Feed, entry *model.Entry, user *model.User) bool {
-	if user.KeepFilterEntryRules != "" {
-		rules := strings.Split(user.KeepFilterEntryRules, "\n")
-		for _, rule := range rules {
-			parts := strings.SplitN(rule, "=", 2)
-
-			var match bool
-			switch parts[0] {
-			case "EntryDate":
-				datePattern := parts[1]
-				match = isDateMatchingPattern(entry.Date, datePattern)
-			case "EntryTitle":
-				match, _ = regexp.MatchString(parts[1], entry.Title)
-			case "EntryURL":
-				match, _ = regexp.MatchString(parts[1], entry.URL)
-			case "EntryCommentsURL":
-				match, _ = regexp.MatchString(parts[1], entry.CommentsURL)
-			case "EntryContent":
-				match, _ = regexp.MatchString(parts[1], entry.Content)
-			case "EntryAuthor":
-				match, _ = regexp.MatchString(parts[1], entry.Author)
-			case "EntryTag":
-				containsTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
-					match, _ = regexp.MatchString(parts[1], tag)
-					return match
-				})
-				if containsTag {
-					match = true
-				}
-			}
-
-			if match {
-				slog.Debug("Allowing entry based on rule",
-					slog.String("entry_url", entry.URL),
-					slog.Int64("feed_id", feed.ID),
-					slog.String("feed_url", feed.FeedURL),
-					slog.String("rule", rule),
-				)
-				return true
-			}
-		}
-		return false
-	}
-
-	if feed.KeeplistRules == "" {
-		return true
-	}
-
-	compiledKeeplist, err := regexp.Compile(feed.KeeplistRules)
-	if err != nil {
-		slog.Debug("Failed on regexp compilation",
-			slog.String("pattern", feed.KeeplistRules),
-			slog.Any("error", err),
-		)
-		return false
-	}
-
-	containsAllowedTag := slices.ContainsFunc(entry.Tags, func(tag string) bool {
-		return compiledKeeplist.MatchString(tag)
-	})
-
-	if compiledKeeplist.MatchString(entry.URL) || compiledKeeplist.MatchString(entry.Title) || compiledKeeplist.MatchString(entry.Author) || containsAllowedTag {
-		slog.Debug("Allow entry based on rule",
-			slog.String("entry_url", entry.URL),
-			slog.Int64("feed_id", feed.ID),
-			slog.String("feed_url", feed.FeedURL),
-			slog.String("rule", feed.KeeplistRules),
-		)
-		return true
-	}
-
-	return false
-}
-
 // ProcessEntryWebPage downloads the entry web page and apply rewrite rules.
 func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) error {
 	startTime := time.Now()
@@ -358,94 +216,6 @@ func rewriteEntryURL(feed *model.Feed, entry *model.Entry) string {
 	return rewrittenURL
 }
-func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *model.Entry, entryIsNew bool, user *model.User) {
-	if !user.ShowReadingTime {
-		slog.Debug("Skip reading time estimation for this user", slog.Int64("user_id", user.ID))
-		return
-	}
-
-	if shouldFetchYouTubeWatchTime(entry) {
-		if entryIsNew {
-			watchTime, err := fetchYouTubeWatchTime(entry.URL)
-			if err != nil {
-				slog.Warn("Unable to fetch YouTube watch time",
-					slog.Int64("user_id", user.ID),
-					slog.Int64("entry_id", entry.ID),
-					slog.String("entry_url", entry.URL),
-					slog.Int64("feed_id", feed.ID),
-					slog.String("feed_url", feed.FeedURL),
-					slog.Any("error", err),
-				)
-			}
-			entry.ReadingTime = watchTime
-		} else {
-			entry.ReadingTime = store.GetReadTime(feed.ID, entry.Hash)
-		}
-	}
-
-	if shouldFetchNebulaWatchTime(entry) {
-		if entryIsNew {
-			watchTime, err := fetchNebulaWatchTime(entry.URL)
-			if err != nil {
-				slog.Warn("Unable to fetch Nebula watch time",
-					slog.Int64("user_id", user.ID),
-					slog.Int64("entry_id", entry.ID),
-					slog.String("entry_url", entry.URL),
-					slog.Int64("feed_id", feed.ID),
-					slog.String("feed_url", feed.FeedURL),
-					slog.Any("error", err),
-				)
-			}
-			entry.ReadingTime = watchTime
-		} else {
-			entry.ReadingTime = store.GetReadTime(feed.ID, entry.Hash)
-		}
-	}
-
-	if shouldFetchOdyseeWatchTime(entry) {
-		if entryIsNew {
-			watchTime, err := fetchOdyseeWatchTime(entry.URL)
-			if err != nil {
-				slog.Warn("Unable to fetch Odysee watch time",
-					slog.Int64("user_id", user.ID),
-					slog.Int64("entry_id", entry.ID),
-					slog.String("entry_url", entry.URL),
-					slog.Int64("feed_id", feed.ID),
-					slog.String("feed_url", feed.FeedURL),
-					slog.Any("error", err),
-				)
-			}
-			entry.ReadingTime = watchTime
-		} else {
-			entry.ReadingTime = store.GetReadTime(feed.ID, entry.Hash)
-		}
-	}
-
-	if shouldFetchBilibiliWatchTime(entry) {
-		if entryIsNew {
-			watchTime, err := fetchBilibiliWatchTime(entry.URL)
-			if err != nil {
-				slog.Warn("Unable to fetch Bilibili watch time",
-					slog.Int64("user_id", user.ID),
-					slog.Int64("entry_id", entry.ID),
-					slog.String("entry_url", entry.URL),
-					slog.Int64("feed_id", feed.ID),
-					slog.String("feed_url", feed.FeedURL),
-					slog.Any("error", err),
-				)
-			}
-			entry.ReadingTime = watchTime
-		} else {
-			entry.ReadingTime = store.GetReadTime(feed.ID, entry.Hash)
-		}
-	}
-
-	// Handle YT error case and non-YT entries.
-	if entry.ReadingTime == 0 {
-		entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed)
-	}
-}
 func isRecentEntry(entry *model.Entry) bool {
 	if config.Opts.FilterEntryMaxAgeDays() == 0 || entry.Date.After(time.Now().AddDate(0, 0, -config.Opts.FilterEntryMaxAgeDays())) {
 		return true
@@ -468,44 +238,3 @@ func minifyEntryContent(entryContent string) string {
 	return entryContent
 }
-func isDateMatchingPattern(entryDate time.Time, pattern string) bool {
-	if pattern == "future" {
-		return entryDate.After(time.Now())
-	}
-
-	parts := strings.SplitN(pattern, ":", 2)
-	if len(parts) != 2 {
-		return false
-	}
-
-	operator := parts[0]
-	dateStr := parts[1]
-
-	switch operator {
-	case "before":
-		targetDate, err := time.Parse("2006-01-02", dateStr)
-		if err != nil {
-			return false
-		}
-		return entryDate.Before(targetDate)
-	case "after":
-		targetDate, err := time.Parse("2006-01-02", dateStr)
-		if err != nil {
-			return false
-		}
-		return entryDate.After(targetDate)
-	case "between":
-		dates := strings.Split(dateStr, ",")
-		if len(dates) != 2 {
-			return false
-		}
-		startDate, err1 := time.Parse("2006-01-02", dates[0])
-		endDate, err2 := time.Parse("2006-01-02", dates[1])
-		if err1 != nil || err2 != nil {
-			return false
-		}
-		return entryDate.After(startDate) && entryDate.Before(endDate)
-	}
-	return false
-}
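Taken together, this file's diff moves all per-entry network lookups out of the refresh loop: per-entry work stays synchronous, while the expensive lookups are deferred to one batch pass over the filtered slice. A toy sketch of that two-phase shape, with all names invented for illustration:

    package main

    import "fmt"

    type item struct {
    	id          string
    	readingTime int
    }

    // batchResolve stands in for fetchYouTubeWatchTimeInBulk: a single call
    // resolves every pending item instead of one request per item.
    func batchResolve(items []*item) {
    	for _, it := range items {
    		it.readingTime = 42 // pretend this came from one bulk API call
    	}
    }

    func main() {
    	var filtered []*item
    	for _, id := range []string{"a", "b"} {
    		filtered = append(filtered, &item{id: id}) // phase 1: per-item processing
    	}
    	batchResolve(filtered) // phase 2: one bulk lookup for the whole batch
    	for _, it := range filtered {
    		fmt.Println(it.id, it.readingTime)
    	}
    }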

View file

@@ -0,0 +1,63 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package processor // import "miniflux.app/v2/internal/reader/processor"
+
+import (
+	"log/slog"
+
+	"miniflux.app/v2/internal/model"
+	"miniflux.app/v2/internal/reader/readingtime"
+	"miniflux.app/v2/internal/storage"
+)
+
+func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *model.Entry, entryIsNew bool, user *model.User) {
+	if !user.ShowReadingTime {
+		slog.Debug("Skip reading time estimation for this user", slog.Int64("user_id", user.ID))
+		return
+	}
+
+	// Define a type for watch time fetching functions
+	type watchTimeFetcher func(string) (int, error)
+
+	// Define watch time fetching scenarios
+	watchTimeScenarios := []struct {
+		shouldFetch func(*model.Entry) bool
+		fetchFunc   watchTimeFetcher
+		platform    string
+	}{
+		{shouldFetchYouTubeWatchTimeForSingleEntry, fetchYouTubeWatchTimeForSingleEntry, "YouTube"},
+		{shouldFetchNebulaWatchTime, fetchNebulaWatchTime, "Nebula"},
+		{shouldFetchOdyseeWatchTime, fetchOdyseeWatchTime, "Odysee"},
+		{shouldFetchBilibiliWatchTime, fetchBilibiliWatchTime, "Bilibili"},
+	}
+
+	// Iterate through scenarios and attempt to fetch watch time
+	for _, scenario := range watchTimeScenarios {
+		if scenario.shouldFetch(entry) {
+			if entryIsNew {
+				if watchTime, err := scenario.fetchFunc(entry.URL); err != nil {
+					slog.Warn("Unable to fetch watch time",
+						slog.String("platform", scenario.platform),
+						slog.Int64("user_id", user.ID),
+						slog.Int64("entry_id", entry.ID),
+						slog.String("entry_url", entry.URL),
+						slog.Int64("feed_id", feed.ID),
+						slog.String("feed_url", feed.FeedURL),
+						slog.Any("error", err),
+					)
+				} else {
+					entry.ReadingTime = watchTime
+				}
+			} else {
+				entry.ReadingTime = store.GetReadTime(feed.ID, entry.Hash)
+			}
+			break
+		}
+	}
+
+	// Fallback to text-based reading time estimation
+	if entry.ReadingTime == 0 && entry.Content != "" {
+		entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed)
+	}
+}
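This scenario table replaces four near-identical fetch-or-lookup branches, and the loop now breaks after the first platform that claims the entry, so an entry gets at most one watch-time source. A standalone sketch of that table-driven dispatch pattern, stripped of Miniflux types so it runs on its own (all names and values here are illustrative only):

    package main

    import "fmt"

    type watchTimeFetcher func(string) (int, error)

    func main() {
    	scenarios := []struct {
    		shouldFetch func(string) bool
    		fetchFunc   watchTimeFetcher
    		platform    string
    	}{
    		{
    			shouldFetch: func(u string) bool { return len(u) > 0 },
    			fetchFunc:   func(u string) (int, error) { return 42, nil },
    			platform:    "ExampleTube",
    		},
    	}

    	url := "https://example.org/video"
    	for _, scenario := range scenarios {
    		if scenario.shouldFetch(url) {
    			if watchTime, err := scenario.fetchFunc(url); err == nil {
    				fmt.Println(scenario.platform, watchTime) // ExampleTube 42
    			}
    			break // first matching platform wins, as in updateEntryReadingTime
    		}
    	}
    }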

View file

@@ -1,7 +1,7 @@
 // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

-package processor
+package processor // import "miniflux.app/v2/internal/reader/processor"

 import (
 	"encoding/json"
@@ -11,6 +11,7 @@ import (
 	"net/url"
 	"regexp"
 	"strconv"
+	"strings"
 	"time"

 	"github.com/PuerkitoBio/goquery"
@@ -25,24 +26,30 @@ var (
 	iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
 )

-func shouldFetchYouTubeWatchTime(entry *model.Entry) bool {
-	if !config.Opts.FetchYouTubeWatchTime() {
-		return false
-	}
-	matches := youtubeRegex.FindStringSubmatch(entry.URL)
-	urlMatchesYouTubePattern := len(matches) == 2
-	return urlMatchesYouTubePattern
+func isYouTubeVideoURL(websiteURL string) bool {
+	return len(youtubeRegex.FindStringSubmatch(websiteURL)) == 2
 }

-func fetchYouTubeWatchTime(websiteURL string) (int, error) {
-	if config.Opts.YouTubeApiKey() == "" {
-		return fetchYouTubeWatchTimeFromWebsite(websiteURL)
-	} else {
-		return fetchYouTubeWatchTimeFromApi(websiteURL)
+func getVideoIDFromYouTubeURL(websiteURL string) string {
+	parsedWebsiteURL, err := url.Parse(websiteURL)
+	if err != nil {
+		return ""
 	}
+	return parsedWebsiteURL.Query().Get("v")
 }

-func fetchYouTubeWatchTimeFromWebsite(websiteURL string) (int, error) {
+func shouldFetchYouTubeWatchTimeForSingleEntry(entry *model.Entry) bool {
+	return config.Opts.FetchYouTubeWatchTime() && config.Opts.YouTubeApiKey() == "" && isYouTubeVideoURL(entry.URL)
+}
+
+func shouldFetchYouTubeWatchTimeInBulk() bool {
+	return config.Opts.FetchYouTubeWatchTime() && config.Opts.YouTubeApiKey() != ""
+}
+
+func fetchYouTubeWatchTimeForSingleEntry(websiteURL string) (int, error) {
+	slog.Debug("Fetching YouTube watch time for a single entry", slog.String("website_url", websiteURL))
+
 	requestBuilder := fetcher.NewRequestBuilder()
 	requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
 	requestBuilder.WithProxy(config.Opts.HTTPClientProxy())
@@ -60,31 +67,59 @@ func fetchYouTubeWatchTimeFromWebsite(websiteURL string) (int, error) {
 		return 0, docErr
 	}

-	durs, exists := doc.FindMatcher(goquery.Single(`meta[itemprop="duration"]`)).Attr("content")
+	htmlDuration, exists := doc.FindMatcher(goquery.Single(`meta[itemprop="duration"]`)).Attr("content")
 	if !exists {
-		return 0, errors.New("duration has not found")
+		return 0, errors.New("youtube: duration has not found")
 	}

-	dur, err := parseISO8601(durs)
+	parsedDuration, err := parseISO8601(htmlDuration)
 	if err != nil {
-		return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err)
+		return 0, fmt.Errorf("youtube: unable to parse duration %s: %v", htmlDuration, err)
 	}

-	return int(dur.Minutes()), nil
+	return int(parsedDuration.Minutes()), nil
 }

-func fetchYouTubeWatchTimeFromApi(websiteURL string) (int, error) {
-	requestBuilder := fetcher.NewRequestBuilder()
-	requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
-	requestBuilder.WithProxy(config.Opts.HTTPClientProxy())
-
-	parsedWebsiteURL, err := url.Parse(websiteURL)
-	if err != nil {
-		return 0, fmt.Errorf("unable to parse URL: %v", err)
-	}
+func fetchYouTubeWatchTimeInBulk(entries []*model.Entry) {
+	var videosEntriesMapping = make(map[string]*model.Entry)
+	var videoIDs []string
+
+	for _, entry := range entries {
+		if !isYouTubeVideoURL(entry.URL) {
+			continue
+		}
+
+		youtubeVideoID := getVideoIDFromYouTubeURL(entry.URL)
+		if youtubeVideoID == "" {
+			continue
+		}
+
+		videosEntriesMapping[getVideoIDFromYouTubeURL(entry.URL)] = entry
+		videoIDs = append(videoIDs, youtubeVideoID)
+	}
+
+	if len(videoIDs) == 0 {
+		return
+	}
+
+	watchTimeMap, err := fetchYouTubeWatchTimeFromApiInBulk(videoIDs)
+	if err != nil {
+		slog.Warn("Unable to fetch YouTube watch time in bulk", slog.Any("error", err))
+		return
+	}
+
+	for videoID, watchTime := range watchTimeMap {
+		if entry, ok := videosEntriesMapping[videoID]; ok {
+			entry.ReadingTime = int(watchTime.Minutes())
+		}
+	}
+}
+
+func fetchYouTubeWatchTimeFromApiInBulk(videoIDs []string) (map[string]time.Duration, error) {
+	slog.Debug("Fetching YouTube watch time in bulk", slog.Any("video_ids", videoIDs))

 	apiQuery := url.Values{}
-	apiQuery.Set("id", parsedWebsiteURL.Query().Get("v"))
+	apiQuery.Set("id", strings.Join(videoIDs, ","))
 	apiQuery.Set("key", config.Opts.YouTubeApiKey())
 	apiQuery.Set("part", "contentDetails")
@@ -95,37 +130,33 @@ func fetchYouTubeWatchTimeFromApi(websiteURL string) (int, error) {
 		RawQuery: apiQuery.Encode(),
 	}

+	requestBuilder := fetcher.NewRequestBuilder()
+	requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
+	requestBuilder.WithProxy(config.Opts.HTTPClientProxy())
+
 	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(apiURL.String()))
 	defer responseHandler.Close()

 	if localizedError := responseHandler.LocalizedError(); localizedError != nil {
-		slog.Warn("Unable to fetch contentDetails from YouTube API", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
-		return 0, localizedError.Error()
+		slog.Warn("Unable to fetch contentDetails from YouTube API", slog.Any("error", localizedError.Error()))
+		return nil, localizedError.Error()
 	}

-	var videos struct {
-		Items []struct {
-			ContentDetails struct {
-				Duration string `json:"duration"`
-			} `json:"contentDetails"`
-		} `json:"items"`
-	}
+	var videos youtubeVideoListResponse
 	if err := json.NewDecoder(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())).Decode(&videos); err != nil {
-		return 0, fmt.Errorf("unable to decode JSON: %v", err)
+		return nil, fmt.Errorf("youtube: unable to decode JSON: %v", err)
 	}

-	if n := len(videos.Items); n != 1 {
-		return 0, fmt.Errorf("invalid items length: %d", n)
-	}
-
-	durs := videos.Items[0].ContentDetails.Duration
-	dur, err := parseISO8601(durs)
-	if err != nil {
-		return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err)
-	}
-	return int(dur.Minutes()), nil
+	watchTimeMap := make(map[string]time.Duration)
+	for _, video := range videos.Items {
+		duration, err := parseISO8601(video.ContentDetails.Duration)
+		if err != nil {
+			slog.Warn("Unable to parse ISO8601 duration", slog.Any("error", err))
+			continue
+		}
+		watchTimeMap[video.ID] = duration
+	}
+	return watchTimeMap, nil
 }

 func parseISO8601(from string) (time.Duration, error) {
@@ -135,7 +166,7 @@ func parseISO8601(from string) (time.Duration, error) {
 	if iso8601Regex.MatchString(from) {
 		match = iso8601Regex.FindStringSubmatch(from)
 	} else {
-		return 0, errors.New("could not parse duration string")
+		return 0, errors.New("youtube: could not parse duration string")
 	}

 	for i, name := range iso8601Regex.SubexpNames() {
@@ -157,9 +188,18 @@ func parseISO8601(from string) (time.Duration, error) {
 		case "second":
 			d += time.Duration(val) * time.Second
 		default:
-			return 0, fmt.Errorf("unknown field %s", name)
+			return 0, fmt.Errorf("youtube: unknown field %s", name)
 		}
 	}

 	return d, nil
 }
+
+type youtubeVideoListResponse struct {
+	Items []struct {
+		ID             string `json:"id"`
+		ContentDetails struct {
+			Duration string `json:"duration"`
+		} `json:"contentDetails"`
+	} `json:"items"`
+}
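To make the decoding step concrete, here is a self-contained sketch that runs an illustrative videos.list payload through the same struct shape as youtubeVideoListResponse. The JSON values below are invented, but the field layout matches the struct above; in the processor each decoded duration string is then handed to parseISO8601:

    package main

    import (
    	"encoding/json"
    	"fmt"
    	"strings"
    )

    func main() {
    	// Invented payload for illustration; a real API response carries
    	// the same "items", "id", and "contentDetails.duration" fields.
    	payload := `{"items":[
    		{"id":"HLrqNhgdiC0","contentDetails":{"duration":"PT4M13S"}},
    		{"id":"dQw4w9WgXcQ","contentDetails":{"duration":"PT3M33S"}}
    	]}`

    	var videos struct {
    		Items []struct {
    			ID             string `json:"id"`
    			ContentDetails struct {
    				Duration string `json:"duration"`
    			} `json:"contentDetails"`
    		} `json:"items"`
    	}
    	if err := json.NewDecoder(strings.NewReader(payload)).Decode(&videos); err != nil {
    		panic(err)
    	}
    	for _, video := range videos.Items {
    		fmt.Println(video.ID, video.ContentDetails.Duration)
    		// HLrqNhgdiC0 PT4M13S
    		// dQw4w9WgXcQ PT3M33S
    	}
    }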

View file

@@ -36,3 +36,37 @@ func TestParseISO8601(t *testing.T) {
 		}
 	}
 }
+
+func TestGetYouTubeVideoIDFromURL(t *testing.T) {
+	scenarios := []struct {
+		url      string
+		expected string
+	}{
+		{"https://www.youtube.com/watch?v=HLrqNhgdiC0", "HLrqNhgdiC0"},
+		{"https://www.youtube.com/watch?v=HLrqNhgdiC0&feature=youtu.be", "HLrqNhgdiC0"},
+		{"https://example.org/test", ""},
+	}
+	for _, tc := range scenarios {
+		result := getVideoIDFromYouTubeURL(tc.url)
+		if tc.expected != result {
+			t.Errorf(`Unexpected result, got %q for url %q`, result, tc.url)
+		}
+	}
+}
+
+func TestIsYouTubeVideoURL(t *testing.T) {
+	scenarios := []struct {
+		url      string
+		expected bool
+	}{
+		{"https://www.youtube.com/watch?v=HLrqNhgdiC0", true},
+		{"https://www.youtube.com/watch?v=HLrqNhgdiC0&feature=youtu.be", true},
+		{"https://example.org/test", false},
+	}
+	for _, tc := range scenarios {
+		result := isYouTubeVideoURL(tc.url)
+		if tc.expected != result {
+			t.Errorf(`Unexpected result, got %v for url %q`, result, tc.url)
+		}
+	}
+}