1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

Use YouTube video duration as read time

This feature works by scraping YouTube website.

To enable it, set the FETCH_YOUTUBE_WATCH_TIME environment variable to
1.

Resolves #972.
This commit is contained in:
Ilya Mateyko 2021-01-27 12:50:34 +00:00 committed by fguillot
parent e5b2eab727
commit c3f871b49b
6 changed files with 153 additions and 1 deletions

View file

@ -5,24 +5,35 @@
package processor
import (
"errors"
"fmt"
"math"
"regexp"
"strconv"
"strings"
"time"
"unicode/utf8"
"miniflux.app/config"
"miniflux.app/http/client"
"miniflux.app/logger"
"miniflux.app/metric"
"miniflux.app/model"
"miniflux.app/reader/browser"
"miniflux.app/reader/rewrite"
"miniflux.app/reader/sanitizer"
"miniflux.app/reader/scraper"
"miniflux.app/storage"
"github.com/PuerkitoBio/goquery"
"github.com/rylans/getlang"
)
var (
youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
)
// ProcessFeedEntries downloads original web page for entries and apply filters.
func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
var filteredEntries model.Entries
@ -63,7 +74,20 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
entry.ReadingTime = calculateReadingTime(entry.Content)
if config.Opts.FetchYouTubeWatchTime() {
if matches := youtubeRegex.FindStringSubmatch(entry.URL); len(matches) == 2 {
watchTime, err := fetchYouTubeWatchTime(entry.URL)
if err != nil {
logger.Error("[Processor] Unable to fetch YouTube watch time: %q => %v", entry.URL, err)
}
entry.ReadingTime = watchTime
}
}
if entry.ReadingTime == 0 {
entry.ReadingTime = calculateReadingTime(entry.Content)
}
filteredEntries = append(filteredEntries, entry)
}
@ -120,6 +144,68 @@ func ProcessEntryWebPage(entry *model.Entry) error {
return nil
}
func fetchYouTubeWatchTime(url string) (int, error) {
clt := client.NewClientWithConfig(url, config.Opts)
response, browserErr := browser.Exec(clt)
if browserErr != nil {
return 0, browserErr
}
doc, docErr := goquery.NewDocumentFromReader(response.Body)
if docErr != nil {
return 0, docErr
}
durs, exists := doc.Find(`meta[itemprop="duration"]`).First().Attr("content")
if !exists {
return 0, errors.New("duration has not found")
}
dur, err := parseISO8601(durs)
if err != nil {
return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err)
}
return int(dur.Minutes()), nil
}
// parseISO8601 parses an ISO 8601 duration string.
func parseISO8601(from string) (time.Duration, error) {
var match []string
var d time.Duration
if iso8601Regex.MatchString(from) {
match = iso8601Regex.FindStringSubmatch(from)
} else {
return 0, errors.New("could not parse duration string")
}
for i, name := range iso8601Regex.SubexpNames() {
part := match[i]
if i == 0 || name == "" || part == "" {
continue
}
val, err := strconv.ParseInt(part, 10, 64)
if err != nil {
return 0, err
}
switch name {
case "hour":
d = d + (time.Duration(val) * time.Hour)
case "minute":
d = d + (time.Duration(val) * time.Minute)
case "second":
d = d + (time.Duration(val) * time.Second)
default:
return 0, fmt.Errorf("unknown field %s", name)
}
}
return d, nil
}
func calculateReadingTime(content string) int {
sanitizedContent := sanitizer.StripTags(content)
languageInfo := getlang.FromString(sanitizedContent)