Use YouTube video duration as read time

This feature works by scraping YouTube website. To enable it, set the FETCH_YOUTUBE_WATCH_TIME environment variable to 1. Resolves #972.
2025-09-30 19:22:11 +00:00 · 2021-01-27 12:50:34 +00:00 · 2021-01-27 12:50:34 +00:00 · c3f871b49b
commit c3f871b49b
parent e5b2eab727
6 changed files with 153 additions and 1 deletions
--- a/reader/processor/processor.go
+++ b/reader/processor/processor.go
@ -5,24 +5,35 @@
 package processor

 import (
+	"errors"
+	"fmt"
 	"math"
 	"regexp"
+	"strconv"
 	"strings"
 	"time"
 	"unicode/utf8"

 	"miniflux.app/config"
+	"miniflux.app/http/client"
 	"miniflux.app/logger"
 	"miniflux.app/metric"
 	"miniflux.app/model"
+	"miniflux.app/reader/browser"
 	"miniflux.app/reader/rewrite"
 	"miniflux.app/reader/sanitizer"
 	"miniflux.app/reader/scraper"
 	"miniflux.app/storage"

+	"github.com/PuerkitoBio/goquery"
 	"github.com/rylans/getlang"
 )

+var (
+	youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
+	iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
+)
+
 // ProcessFeedEntries downloads original web page for entries and apply filters.
 func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
 	var filteredEntries model.Entries
@ -63,7 +74,20 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
 		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
 		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)

-		entry.ReadingTime = calculateReadingTime(entry.Content)
+		if config.Opts.FetchYouTubeWatchTime() {
+			if matches := youtubeRegex.FindStringSubmatch(entry.URL); len(matches) == 2 {
+				watchTime, err := fetchYouTubeWatchTime(entry.URL)
+				if err != nil {
+					logger.Error("[Processor] Unable to fetch YouTube watch time: %q => %v", entry.URL, err)
+				}
+				entry.ReadingTime = watchTime
+			}
+		}
+
+		if entry.ReadingTime == 0 {
+			entry.ReadingTime = calculateReadingTime(entry.Content)
+		}
+
 		filteredEntries = append(filteredEntries, entry)
 	}

@ -120,6 +144,68 @@ func ProcessEntryWebPage(entry *model.Entry) error {
 	return nil
 }

+func fetchYouTubeWatchTime(url string) (int, error) {
+	clt := client.NewClientWithConfig(url, config.Opts)
+	response, browserErr := browser.Exec(clt)
+	if browserErr != nil {
+		return 0, browserErr
+	}
+
+	doc, docErr := goquery.NewDocumentFromReader(response.Body)
+	if docErr != nil {
+		return 0, docErr
+	}
+
+	durs, exists := doc.Find(`meta[itemprop="duration"]`).First().Attr("content")
+	if !exists {
+		return 0, errors.New("duration has not found")
+	}
+
+	dur, err := parseISO8601(durs)
+	if err != nil {
+		return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err)
+	}
+
+	return int(dur.Minutes()), nil
+}
+
+// parseISO8601 parses an ISO 8601 duration string.
+func parseISO8601(from string) (time.Duration, error) {
+	var match []string
+	var d time.Duration
+
+	if iso8601Regex.MatchString(from) {
+		match = iso8601Regex.FindStringSubmatch(from)
+	} else {
+		return 0, errors.New("could not parse duration string")
+	}
+
+	for i, name := range iso8601Regex.SubexpNames() {
+		part := match[i]
+		if i == 0 || name == "" || part == "" {
+			continue
+		}
+
+		val, err := strconv.ParseInt(part, 10, 64)
+		if err != nil {
+			return 0, err
+		}
+
+		switch name {
+		case "hour":
+			d = d + (time.Duration(val) * time.Hour)
+		case "minute":
+			d = d + (time.Duration(val) * time.Minute)
+		case "second":
+			d = d + (time.Duration(val) * time.Second)
+		default:
+			return 0, fmt.Errorf("unknown field %s", name)
+		}
+	}
+
+	return d, nil
+}
+
 func calculateReadingTime(content string) int {
 	sanitizedContent := sanitizer.StripTags(content)
 	languageInfo := getlang.FromString(sanitizedContent)