diff --git a/internal/reader/processor/bilibili.go b/internal/reader/processor/bilibili.go new file mode 100644 index 00000000..b207ff03 --- /dev/null +++ b/internal/reader/processor/bilibili.go @@ -0,0 +1,92 @@ +// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package processor + +import ( + "encoding/json" + "fmt" + "log/slog" + "regexp" + + "miniflux.app/v2/internal/config" + "miniflux.app/v2/internal/model" + "miniflux.app/v2/internal/reader/fetcher" +) + +var ( + bilibiliURLRegex = regexp.MustCompile(`bilibili\.com/video/(.*)$`) + bilibiliVideoIdRegex = regexp.MustCompile(`/video/(?:av(\d+)|BV([a-zA-Z0-9]+))`) +) + +func shouldFetchBilibiliWatchTime(entry *model.Entry) bool { + if !config.Opts.FetchBilibiliWatchTime() { + return false + } + matches := bilibiliURLRegex.FindStringSubmatch(entry.URL) + urlMatchesBilibiliPattern := len(matches) == 2 + return urlMatchesBilibiliPattern +} + +func extractBilibiliVideoID(websiteURL string) (string, string, error) { + matches := bilibiliVideoIdRegex.FindStringSubmatch(websiteURL) + if matches == nil { + return "", "", fmt.Errorf("no video ID found in URL: %s", websiteURL) + } + if matches[1] != "" { + return "aid", matches[1], nil + } + if matches[2] != "" { + return "bvid", matches[2], nil + } + return "", "", fmt.Errorf("unexpected regex match result for URL: %s", websiteURL) +} + +func fetchBilibiliWatchTime(websiteURL string) (int, error) { + requestBuilder := fetcher.NewRequestBuilder() + requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout()) + requestBuilder.WithProxy(config.Opts.HTTPClientProxy()) + + idType, videoID, extractErr := extractBilibiliVideoID(websiteURL) + if extractErr != nil { + return 0, extractErr + } + bilibiliApiURL := fmt.Sprintf("https://api.bilibili.com/x/web-interface/view?%s=%s", idType, videoID) + + responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(bilibiliApiURL)) + defer 
responseHandler.Close() + + if localizedError := responseHandler.LocalizedError(); localizedError != nil { + slog.Warn("Unable to fetch Bilibili API", + slog.String("website_url", websiteURL), + slog.String("api_url", bilibiliApiURL), + slog.Any("error", localizedError.Error())) + return 0, localizedError.Error() + } + + var result map[string]interface{} + doc := json.NewDecoder(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())) + if docErr := doc.Decode(&result); docErr != nil { + return 0, fmt.Errorf("failed to decode API response: %v", docErr) + } + + if code, ok := result["code"].(float64); !ok || code != 0 { + return 0, fmt.Errorf("API returned error code: %v", result["code"]) + } + + data, ok := result["data"].(map[string]interface{}) + if !ok { + return 0, fmt.Errorf("data field not found or not an object") + } + + duration, ok := data["duration"].(float64) + if !ok { + return 0, fmt.Errorf("duration not found or not a number") + } + intDuration := int(duration) + durationMin := intDuration / 60 + if intDuration%60 != 0 { + durationMin++ + } + return durationMin, nil +} diff --git a/internal/reader/processor/nebula.go b/internal/reader/processor/nebula.go new file mode 100644 index 00000000..d0b0b6ef --- /dev/null +++ b/internal/reader/processor/nebula.go @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +package processor + +import ( + "errors" + "fmt" + "log/slog" + "regexp" + "strconv" + + "github.com/PuerkitoBio/goquery" + + "miniflux.app/v2/internal/config" + "miniflux.app/v2/internal/model" + "miniflux.app/v2/internal/reader/fetcher" +) + +var nebulaRegex = regexp.MustCompile(`^https://nebula\.tv`) + +func shouldFetchNebulaWatchTime(entry *model.Entry) bool { + if !config.Opts.FetchNebulaWatchTime() { + return false + } + matches := nebulaRegex.FindStringSubmatch(entry.URL) + return matches != nil +} + +func fetchNebulaWatchTime(websiteURL string) (int, error) { + requestBuilder := fetcher.NewRequestBuilder() + requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout()) + requestBuilder.WithProxy(config.Opts.HTTPClientProxy()) + + responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL)) + defer responseHandler.Close() + + if localizedError := responseHandler.LocalizedError(); localizedError != nil { + slog.Warn("Unable to fetch Nebula watch time", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error())) + return 0, localizedError.Error() + } + + doc, docErr := goquery.NewDocumentFromReader(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())) + if docErr != nil { + return 0, docErr + } + + durs, exists := doc.Find(`meta[property="video:duration"]`).First().Attr("content") + // durs contains video watch time in seconds + if !exists { + return 0, errors.New("duration has not found") + } + + dur, err := strconv.ParseInt(durs, 10, 64) + if err != nil { + return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err) + } + + return int(dur / 60), nil +} diff --git a/internal/reader/processor/odysee.go b/internal/reader/processor/odysee.go new file mode 100644 index 00000000..90733b2f --- /dev/null +++ b/internal/reader/processor/odysee.go @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +package processor + +import ( + "errors" + "fmt" + "log/slog" + "regexp" + "strconv" + + "github.com/PuerkitoBio/goquery" + + "miniflux.app/v2/internal/config" + "miniflux.app/v2/internal/model" + "miniflux.app/v2/internal/reader/fetcher" +) + +var odyseeRegex = regexp.MustCompile(`^https://odysee\.com`) + +func shouldFetchOdyseeWatchTime(entry *model.Entry) bool { + if !config.Opts.FetchOdyseeWatchTime() { + return false + } + matches := odyseeRegex.FindStringSubmatch(entry.URL) + return matches != nil +} + +func fetchOdyseeWatchTime(websiteURL string) (int, error) { + requestBuilder := fetcher.NewRequestBuilder() + requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout()) + requestBuilder.WithProxy(config.Opts.HTTPClientProxy()) + + responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL)) + defer responseHandler.Close() + + if localizedError := responseHandler.LocalizedError(); localizedError != nil { + slog.Warn("Unable to fetch Odysee watch time", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error())) + return 0, localizedError.Error() + } + + doc, docErr := goquery.NewDocumentFromReader(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())) + if docErr != nil { + return 0, docErr + } + + durs, exists := doc.Find(`meta[property="og:video:duration"]`).First().Attr("content") + // durs contains video watch time in seconds + if !exists { + return 0, errors.New("duration has not found") + } + + dur, err := strconv.ParseInt(durs, 10, 64) + if err != nil { + return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err) + } + + return int(dur / 60), nil +} diff --git a/internal/reader/processor/processor.go b/internal/reader/processor/processor.go index f07fd936..53222a1f 100644 --- a/internal/reader/processor/processor.go +++ b/internal/reader/processor/processor.go @@ -4,13 +4,9 @@ package processor import ( - "encoding/json" - "errors" - "fmt" 
"log/slog" "regexp" "slices" - "strconv" "strings" "time" @@ -25,20 +21,11 @@ import ( "miniflux.app/v2/internal/reader/urlcleaner" "miniflux.app/v2/internal/storage" - "github.com/PuerkitoBio/goquery" "github.com/tdewolff/minify/v2" "github.com/tdewolff/minify/v2/html" ) -var ( - youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)$`) - nebulaRegex = regexp.MustCompile(`^https://nebula\.tv`) - odyseeRegex = regexp.MustCompile(`^https://odysee\.com`) - bilibiliURLRegex = regexp.MustCompile(`bilibili\.com/video/(.*)$`) - bilibiliVideoIdRegex = regexp.MustCompile(`/video/(?:av(\d+)|BV([a-zA-Z0-9]+))`) - iso8601Regex = regexp.MustCompile(`^P((?P\d+)Y)?((?P\d+)M)?((?P\d+)W)?((?P\d+)D)?(T((?P\d+)H)?((?P\d+)M)?((?P\d+)S)?)?$`) - customReplaceRuleRegex = regexp.MustCompile(`rewrite\("(.*)"\|"(.*)"\)`) -) +var customReplaceRuleRegex = regexp.MustCompile(`rewrite\("(.*)"\|"(.*)"\)`) // ProcessFeedEntries downloads original web page for entries and apply filters. func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.User, forceRefresh bool) { @@ -446,234 +433,6 @@ func updateEntryReadingTime(store *storage.Storage, feed *model.Feed, entry *mod } } -func shouldFetchYouTubeWatchTime(entry *model.Entry) bool { - if !config.Opts.FetchYouTubeWatchTime() { - return false - } - matches := youtubeRegex.FindStringSubmatch(entry.URL) - urlMatchesYouTubePattern := len(matches) == 2 - return urlMatchesYouTubePattern -} - -func shouldFetchNebulaWatchTime(entry *model.Entry) bool { - if !config.Opts.FetchNebulaWatchTime() { - return false - } - matches := nebulaRegex.FindStringSubmatch(entry.URL) - return matches != nil -} - -func shouldFetchOdyseeWatchTime(entry *model.Entry) bool { - if !config.Opts.FetchOdyseeWatchTime() { - return false - } - matches := odyseeRegex.FindStringSubmatch(entry.URL) - return matches != nil -} - -func shouldFetchBilibiliWatchTime(entry *model.Entry) bool { - if !config.Opts.FetchBilibiliWatchTime() { - return false - } - 
matches := bilibiliURLRegex.FindStringSubmatch(entry.URL) - urlMatchesBilibiliPattern := len(matches) == 2 - return urlMatchesBilibiliPattern -} - -func fetchYouTubeWatchTime(websiteURL string) (int, error) { - requestBuilder := fetcher.NewRequestBuilder() - requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout()) - requestBuilder.WithProxy(config.Opts.HTTPClientProxy()) - - responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL)) - defer responseHandler.Close() - - if localizedError := responseHandler.LocalizedError(); localizedError != nil { - slog.Warn("Unable to fetch YouTube page", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error())) - return 0, localizedError.Error() - } - - doc, docErr := goquery.NewDocumentFromReader(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())) - if docErr != nil { - return 0, docErr - } - - durs, exists := doc.Find(`meta[itemprop="duration"]`).First().Attr("content") - if !exists { - return 0, errors.New("duration has not found") - } - - dur, err := parseISO8601(durs) - if err != nil { - return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err) - } - - return int(dur.Minutes()), nil -} - -func fetchNebulaWatchTime(websiteURL string) (int, error) { - requestBuilder := fetcher.NewRequestBuilder() - requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout()) - requestBuilder.WithProxy(config.Opts.HTTPClientProxy()) - - responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL)) - defer responseHandler.Close() - - if localizedError := responseHandler.LocalizedError(); localizedError != nil { - slog.Warn("Unable to fetch Nebula watch time", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error())) - return 0, localizedError.Error() - } - - doc, docErr := goquery.NewDocumentFromReader(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())) - if docErr != nil { - return 0, docErr - } - - durs, exists := 
doc.Find(`meta[property="video:duration"]`).First().Attr("content") - // durs contains video watch time in seconds - if !exists { - return 0, errors.New("duration has not found") - } - - dur, err := strconv.ParseInt(durs, 10, 64) - if err != nil { - return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err) - } - - return int(dur / 60), nil -} - -func fetchOdyseeWatchTime(websiteURL string) (int, error) { - requestBuilder := fetcher.NewRequestBuilder() - requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout()) - requestBuilder.WithProxy(config.Opts.HTTPClientProxy()) - - responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL)) - defer responseHandler.Close() - - if localizedError := responseHandler.LocalizedError(); localizedError != nil { - slog.Warn("Unable to fetch Odysee watch time", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error())) - return 0, localizedError.Error() - } - - doc, docErr := goquery.NewDocumentFromReader(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())) - if docErr != nil { - return 0, docErr - } - - durs, exists := doc.Find(`meta[property="og:video:duration"]`).First().Attr("content") - // durs contains video watch time in seconds - if !exists { - return 0, errors.New("duration has not found") - } - - dur, err := strconv.ParseInt(durs, 10, 64) - if err != nil { - return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err) - } - - return int(dur / 60), nil -} - -func extractBilibiliVideoID(websiteURL string) (string, string, error) { - matches := bilibiliVideoIdRegex.FindStringSubmatch(websiteURL) - if matches == nil { - return "", "", fmt.Errorf("no video ID found in URL: %s", websiteURL) - } - if matches[1] != "" { - return "aid", matches[1], nil - } - if matches[2] != "" { - return "bvid", matches[2], nil - } - return "", "", fmt.Errorf("unexpected regex match result for URL: %s", websiteURL) -} - -func fetchBilibiliWatchTime(websiteURL string) (int, 
error) { - requestBuilder := fetcher.NewRequestBuilder() - requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout()) - requestBuilder.WithProxy(config.Opts.HTTPClientProxy()) - - idType, videoID, extractErr := extractBilibiliVideoID(websiteURL) - if extractErr != nil { - return 0, extractErr - } - bilibiliApiURL := fmt.Sprintf("https://api.bilibili.com/x/web-interface/view?%s=%s", idType, videoID) - - responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(bilibiliApiURL)) - defer responseHandler.Close() - - if localizedError := responseHandler.LocalizedError(); localizedError != nil { - slog.Warn("Unable to fetch Bilibili API", - slog.String("website_url", bilibiliApiURL), - slog.Any("error", localizedError.Error())) - return 0, localizedError.Error() - } - - var result map[string]interface{} - doc := json.NewDecoder(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())) - if docErr := doc.Decode(&result); docErr != nil { - return 0, fmt.Errorf("failed to decode API response: %v", docErr) - } - - if code, ok := result["code"].(float64); !ok || code != 0 { - return 0, fmt.Errorf("API returned error code: %v", result["code"]) - } - - data, ok := result["data"].(map[string]interface{}) - if !ok { - return 0, fmt.Errorf("data field not found or not an object") - } - - duration, ok := data["duration"].(float64) - if !ok { - return 0, fmt.Errorf("duration not found or not a number") - } - intDuration := int(duration) - durationMin := intDuration / 60 - if intDuration%60 != 0 { - durationMin++ - } - return durationMin, nil -} - -// parseISO8601 parses an ISO 8601 duration string. 
-func parseISO8601(from string) (time.Duration, error) { - var match []string - var d time.Duration - - if iso8601Regex.MatchString(from) { - match = iso8601Regex.FindStringSubmatch(from) - } else { - return 0, errors.New("could not parse duration string") - } - - for i, name := range iso8601Regex.SubexpNames() { - part := match[i] - if i == 0 || name == "" || part == "" { - continue - } - - val, err := strconv.ParseInt(part, 10, 64) - if err != nil { - return 0, err - } - - switch name { - case "hour": - d += (time.Duration(val) * time.Hour) - case "minute": - d += (time.Duration(val) * time.Minute) - case "second": - d += (time.Duration(val) * time.Second) - default: - return 0, fmt.Errorf("unknown field %s", name) - } - } - - return d, nil -} - func isRecentEntry(entry *model.Entry) bool { if config.Opts.FilterEntryMaxAgeDays() == 0 || entry.Date.After(time.Now().AddDate(0, 0, -config.Opts.FilterEntryMaxAgeDays())) { return true diff --git a/internal/reader/processor/processor_test.go b/internal/reader/processor/processor_test.go index 57d34e47..2a594a4a 100644 --- a/internal/reader/processor/processor_test.go +++ b/internal/reader/processor/processor_test.go @@ -85,35 +85,6 @@ func TestAllowEntries(t *testing.T) { } } -func TestParseISO8601(t *testing.T) { - var scenarios = []struct { - duration string - expected time.Duration - }{ - // Live streams and radio. 
- {"PT0M0S", 0}, - // https://www.youtube.com/watch?v=HLrqNhgdiC0 - {"PT6M20S", (6 * time.Minute) + (20 * time.Second)}, - // https://www.youtube.com/watch?v=LZa5KKfqHtA - {"PT5M41S", (5 * time.Minute) + (41 * time.Second)}, - // https://www.youtube.com/watch?v=yIxEEgEuhT4 - {"PT51M52S", (51 * time.Minute) + (52 * time.Second)}, - // https://www.youtube.com/watch?v=bpHf1XcoiFs - {"PT80M42S", (1 * time.Hour) + (20 * time.Minute) + (42 * time.Second)}, - } - - for _, tc := range scenarios { - result, err := parseISO8601(tc.duration) - if err != nil { - t.Errorf("Got an error when parsing %q: %v", tc.duration, err) - } - - if tc.expected != result { - t.Errorf(`Unexpected result, got %v for duration %q`, result, tc.duration) - } - } -} - func TestIsRecentEntry(t *testing.T) { parser := config.NewParser() var err error diff --git a/internal/reader/processor/youtube.go b/internal/reader/processor/youtube.go new file mode 100644 index 00000000..52de18c4 --- /dev/null +++ b/internal/reader/processor/youtube.go @@ -0,0 +1,100 @@ +// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +package processor + +import ( + "errors" + "fmt" + "log/slog" + "regexp" + "strconv" + "time" + + "github.com/PuerkitoBio/goquery" + + "miniflux.app/v2/internal/config" + "miniflux.app/v2/internal/model" + "miniflux.app/v2/internal/reader/fetcher" +) + +var ( + youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)$`) + iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`) +) + +func shouldFetchYouTubeWatchTime(entry *model.Entry) bool { + if !config.Opts.FetchYouTubeWatchTime() { + return false + } + matches := youtubeRegex.FindStringSubmatch(entry.URL) + urlMatchesYouTubePattern := len(matches) == 2 + return urlMatchesYouTubePattern +} + +func fetchYouTubeWatchTime(websiteURL string) (int, error) { + requestBuilder := fetcher.NewRequestBuilder() + requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout()) + requestBuilder.WithProxy(config.Opts.HTTPClientProxy()) + + responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL)) + defer responseHandler.Close() + + if localizedError := responseHandler.LocalizedError(); localizedError != nil { + slog.Warn("Unable to fetch YouTube page", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error())) + return 0, localizedError.Error() + } + + doc, docErr := goquery.NewDocumentFromReader(responseHandler.Body(config.Opts.HTTPClientMaxBodySize())) + if docErr != nil { + return 0, docErr + } + + durs, exists := doc.Find(`meta[itemprop="duration"]`).First().Attr("content") + if !exists { + return 0, errors.New("duration has not found") + } + + dur, err := parseISO8601(durs) + if err != nil { + return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err) + } + + return int(dur.Minutes()), nil +} + +func parseISO8601(from string) (time.Duration, error) { + var match []string + var d time.Duration + + if iso8601Regex.MatchString(from) { + match = 
iso8601Regex.FindStringSubmatch(from) + } else { + return 0, errors.New("could not parse duration string") + } + + for i, name := range iso8601Regex.SubexpNames() { + part := match[i] + if i == 0 || name == "" || part == "" { + continue + } + + val, err := strconv.ParseInt(part, 10, 64) + if err != nil { + return 0, err + } + + switch name { + case "hour": + d += (time.Duration(val) * time.Hour) + case "minute": + d += (time.Duration(val) * time.Minute) + case "second": + d += (time.Duration(val) * time.Second) + default: + return 0, fmt.Errorf("unknown field %s", name) + } + } + + return d, nil +} diff --git a/internal/reader/processor/youtube_test.go b/internal/reader/processor/youtube_test.go new file mode 100644 index 00000000..66f72661 --- /dev/null +++ b/internal/reader/processor/youtube_test.go @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package processor // import "miniflux.app/v2/internal/reader/processor" + +import ( + "testing" + "time" +) + +func TestParseISO8601(t *testing.T) { + var scenarios = []struct { + duration string + expected time.Duration + }{ + // Live streams and radio. + {"PT0M0S", 0}, + // https://www.youtube.com/watch?v=HLrqNhgdiC0 + {"PT6M20S", (6 * time.Minute) + (20 * time.Second)}, + // https://www.youtube.com/watch?v=LZa5KKfqHtA + {"PT5M41S", (5 * time.Minute) + (41 * time.Second)}, + // https://www.youtube.com/watch?v=yIxEEgEuhT4 + {"PT51M52S", (51 * time.Minute) + (52 * time.Second)}, + // https://www.youtube.com/watch?v=bpHf1XcoiFs + {"PT80M42S", (1 * time.Hour) + (20 * time.Minute) + (42 * time.Second)}, + } + + for _, tc := range scenarios { + result, err := parseISO8601(tc.duration) + if err != nil { + t.Errorf("Got an error when parsing %q: %v", tc.duration, err) + } + + if tc.expected != result { + t.Errorf(`Unexpected result, got %v for duration %q`, result, tc.duration) + } + } +}