From eed3fcf92aee72f352520213d34146aee171efde Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Tue, 10 Dec 2024 01:05:14 +0000 Subject: [PATCH 01/31] refactor(locale): delay parsing of translations until they're used While doing some profiling for #2900, I noticed that `miniflux.app/v2/internal/locale.LoadCatalogMessages` is responsible for more than 10% of the consumed memory. As most miniflux instances won't have enough diverse users to use all the available translations at the same time, it makes sense to load them on demand. The overhead is a single function call and a check in a map, per call to translation-related functions. --- internal/cli/cli.go | 5 ---- internal/locale/catalog.go | 15 +++++++++--- internal/locale/catalog_test.go | 6 ++--- internal/locale/locale.go | 42 ++++++++++++++++----------------- internal/locale/locale_test.go | 2 +- internal/locale/printer.go | 33 +++++++++++++++----------- internal/ui/settings_show.go | 2 +- internal/ui/settings_update.go | 2 +- internal/validator/user.go | 2 +- 9 files changed, 58 insertions(+), 51 deletions(-) diff --git a/internal/cli/cli.go b/internal/cli/cli.go index ca4f47bd..fc074717 100644 --- a/internal/cli/cli.go +++ b/internal/cli/cli.go @@ -13,7 +13,6 @@ import ( "miniflux.app/v2/internal/config" "miniflux.app/v2/internal/database" - "miniflux.app/v2/internal/locale" "miniflux.app/v2/internal/storage" "miniflux.app/v2/internal/ui/static" "miniflux.app/v2/internal/version" @@ -153,10 +152,6 @@ func Parse() { slog.Info("The default value for DATABASE_URL is used") } - if err := locale.LoadCatalogMessages(); err != nil { - printErrorAndExit(fmt.Errorf("unable to load translations: %v", err)) - } - if err := static.CalculateBinaryFileChecksums(); err != nil { printErrorAndExit(fmt.Errorf("unable to calculate binary file checksums: %v", err)) } diff --git a/internal/locale/catalog.go b/internal/locale/catalog.go index 61f5f27d..8ecdab74 100644 --- a/internal/locale/catalog.go +++ 
b/internal/locale/catalog.go @@ -12,17 +12,26 @@ import ( type translationDict map[string]interface{} type catalog map[string]translationDict -var defaultCatalog catalog +var defaultCatalog = make(catalog, len(AvailableLanguages)) //go:embed translations/*.json var translationFiles embed.FS +func GetTranslationDict(language string) (translationDict, error) { + if _, ok := defaultCatalog[language]; !ok { + var err error + if defaultCatalog[language], err = loadTranslationFile(language); err != nil { + return nil, err + } + } + return defaultCatalog[language], nil +} + // LoadCatalogMessages loads and parses all translations encoded in JSON. func LoadCatalogMessages() error { var err error - defaultCatalog = make(catalog, len(AvailableLanguages())) - for language := range AvailableLanguages() { + for language := range AvailableLanguages { defaultCatalog[language], err = loadTranslationFile(language) if err != nil { return err diff --git a/internal/locale/catalog_test.go b/internal/locale/catalog_test.go index 75537911..687b1de2 100644 --- a/internal/locale/catalog_test.go +++ b/internal/locale/catalog_test.go @@ -39,7 +39,7 @@ func TestLoadCatalog(t *testing.T) { } func TestAllKeysHaveValue(t *testing.T) { - for language := range AvailableLanguages() { + for language := range AvailableLanguages { messages, err := loadTranslationFile(language) if err != nil { t.Fatalf(`Unable to load translation messages for language %q`, language) @@ -71,7 +71,7 @@ func TestMissingTranslations(t *testing.T) { t.Fatal(`Unable to parse reference language`) } - for language := range AvailableLanguages() { + for language := range AvailableLanguages { if language == refLang { continue } @@ -110,7 +110,7 @@ func TestTranslationFilePluralForms(t *testing.T) { "uk_UA": 3, "id_ID": 1, } - for language := range AvailableLanguages() { + for language := range AvailableLanguages { messages, err := loadTranslationFile(language) if err != nil { t.Fatalf(`Unable to load translation messages for 
language %q`, language) diff --git a/internal/locale/locale.go b/internal/locale/locale.go index a5a1010b..aa6165b8 100644 --- a/internal/locale/locale.go +++ b/internal/locale/locale.go @@ -3,26 +3,24 @@ package locale // import "miniflux.app/v2/internal/locale" -// AvailableLanguages returns the list of available languages. -func AvailableLanguages() map[string]string { - return map[string]string{ - "en_US": "English", - "es_ES": "Español", - "fr_FR": "Français", - "de_DE": "Deutsch", - "pl_PL": "Polski", - "pt_BR": "Português Brasileiro", - "zh_CN": "简体中文", - "zh_TW": "繁體中文", - "nl_NL": "Nederlands", - "ru_RU": "Русский", - "it_IT": "Italiano", - "ja_JP": "日本語", - "tr_TR": "Türkçe", - "el_EL": "Ελληνικά", - "fi_FI": "Suomi", - "hi_IN": "हिन्दी", - "uk_UA": "Українська", - "id_ID": "Bahasa Indonesia", - } +// AvailableLanguages is the list of available languages. +var AvailableLanguages = map[string]string{ + "en_US": "English", + "es_ES": "Español", + "fr_FR": "Français", + "de_DE": "Deutsch", + "pl_PL": "Polski", + "pt_BR": "Português Brasileiro", + "zh_CN": "简体中文", + "zh_TW": "繁體中文", + "nl_NL": "Nederlands", + "ru_RU": "Русский", + "it_IT": "Italiano", + "ja_JP": "日本語", + "tr_TR": "Türkçe", + "el_EL": "Ελληνικά", + "fi_FI": "Suomi", + "hi_IN": "हिन्दी", + "uk_UA": "Українська", + "id_ID": "Bahasa Indonesia", } diff --git a/internal/locale/locale_test.go b/internal/locale/locale_test.go index 86b52820..32f6a40f 100644 --- a/internal/locale/locale_test.go +++ b/internal/locale/locale_test.go @@ -6,7 +6,7 @@ package locale // import "miniflux.app/v2/internal/locale" import "testing" func TestAvailableLanguages(t *testing.T) { - results := AvailableLanguages() + results := AvailableLanguages for k, v := range results { if k == "" { t.Errorf(`Empty language key detected`) diff --git a/internal/locale/printer.go b/internal/locale/printer.go index f85960fa..d997c1a7 100644 --- a/internal/locale/printer.go +++ b/internal/locale/printer.go @@ -11,9 +11,11 @@ type 
Printer struct { } func (p *Printer) Print(key string) string { - if str, ok := defaultCatalog[p.language][key]; ok { - if translation, ok := str.(string); ok { - return translation + if dict, err := GetTranslationDict(p.language); err == nil { + if str, ok := dict[key]; ok { + if translation, ok := str.(string); ok { + return translation + } } } return key @@ -21,16 +23,16 @@ func (p *Printer) Print(key string) string { // Printf is like fmt.Printf, but using language-specific formatting. func (p *Printer) Printf(key string, args ...interface{}) string { - var translation string + translation := key - str, found := defaultCatalog[p.language][key] - if !found { - translation = key - } else { - var valid bool - translation, valid = str.(string) - if !valid { - translation = key + if dict, err := GetTranslationDict(p.language); err == nil { + str, found := dict[key] + if found { + var valid bool + translation, valid = str.(string) + if !valid { + translation = key + } } } @@ -39,9 +41,12 @@ func (p *Printer) Printf(key string, args ...interface{}) string { // Plural returns the translation of the given key by using the language plural form. 
func (p *Printer) Plural(key string, n int, args ...interface{}) string { - choices, found := defaultCatalog[p.language][key] + dict, err := GetTranslationDict(p.language) + if err != nil { + return key + } - if found { + if choices, found := dict[key]; found { var plurals []string switch v := choices.(type) { diff --git a/internal/ui/settings_show.go b/internal/ui/settings_show.go index 179b9802..eae72a7f 100644 --- a/internal/ui/settings_show.go +++ b/internal/ui/settings_show.go @@ -71,7 +71,7 @@ func (h *handler) showSettingsPage(w http.ResponseWriter, r *http.Request) { "MarkAsReadOnlyOnPlayerCompletion": form.MarkAsReadOnlyOnPlayerCompletion, }) view.Set("themes", model.Themes()) - view.Set("languages", locale.AvailableLanguages()) + view.Set("languages", locale.AvailableLanguages) view.Set("timezones", timezones) view.Set("menu", "settings") view.Set("user", user) diff --git a/internal/ui/settings_update.go b/internal/ui/settings_update.go index be99adb5..5610a9a9 100644 --- a/internal/ui/settings_update.go +++ b/internal/ui/settings_update.go @@ -44,7 +44,7 @@ func (h *handler) updateSettings(w http.ResponseWriter, r *http.Request) { view := view.New(h.tpl, r, sess) view.Set("form", settingsForm) view.Set("themes", model.Themes()) - view.Set("languages", locale.AvailableLanguages()) + view.Set("languages", locale.AvailableLanguages) view.Set("timezones", timezones) view.Set("menu", "settings") view.Set("user", loggedUser) diff --git a/internal/validator/user.go b/internal/validator/user.go index 2e79785b..b461f912 100644 --- a/internal/validator/user.go +++ b/internal/validator/user.go @@ -155,7 +155,7 @@ func validateTheme(theme string) *locale.LocalizedError { } func validateLanguage(language string) *locale.LocalizedError { - languages := locale.AvailableLanguages() + languages := locale.AvailableLanguages if _, found := languages[language]; !found { return locale.NewLocalizedError("error.invalid_language") } From 728423339a012d4a4995bf417a69b800589ab032 
Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Tue, 10 Dec 2024 01:14:54 +0000 Subject: [PATCH 02/31] refactor(sanitizer): improve `rewriteIframeURL()` - Use `url.Parse` instead of a regex, as this is much faster and way more robust - Add support for Vimeo's Do Not Track parameter --- internal/reader/sanitizer/sanitizer.go | 29 ++++++++++++++++----- internal/reader/sanitizer/sanitizer_test.go | 4 +-- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/internal/reader/sanitizer/sanitizer.go b/internal/reader/sanitizer/sanitizer.go index 9e337075..203e8702 100644 --- a/internal/reader/sanitizer/sanitizer.go +++ b/internal/reader/sanitizer/sanitizer.go @@ -5,7 +5,7 @@ package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer" import ( "io" - "regexp" + "net/url" "slices" "strconv" "strings" @@ -18,8 +18,7 @@ import ( ) var ( - youtubeEmbedRegex = regexp.MustCompile(`^(?:https?:)?//(?:www\.)?youtube\.com/embed/(.+)$`) - tagAllowList = map[string][]string{ + tagAllowList = map[string][]string{ "a": {"href", "title", "id"}, "abbr": {"title"}, "acronym": {"title"}, @@ -397,9 +396,27 @@ func isValidIframeSource(baseURL, src string) bool { } func rewriteIframeURL(link string) string { - matches := youtubeEmbedRegex.FindStringSubmatch(link) - if len(matches) == 2 { - return config.Opts.YouTubeEmbedUrlOverride() + matches[1] + u, err := url.Parse(link) + if err != nil { + return link + } + + switch strings.TrimPrefix(u.Hostname(), "www.") { + case "youtube.com": + if strings.HasPrefix(u.Path, "/embed/") { + if len(u.RawQuery) > 0 { + return config.Opts.YouTubeEmbedUrlOverride() + strings.TrimPrefix(u.Path, "/embed/") + "?" 
+ u.RawQuery + } + return config.Opts.YouTubeEmbedUrlOverride() + strings.TrimPrefix(u.Path, "/embed/") + } + case "player.vimeo.com": + // See https://help.vimeo.com/hc/en-us/articles/12426260232977-About-Player-parameters + if strings.HasPrefix(u.Path, "/video/") { + if len(u.RawQuery) > 0 { + return link + "&dnt=1" + } + return link + "?dnt=1" + } } return link diff --git a/internal/reader/sanitizer/sanitizer_test.go b/internal/reader/sanitizer/sanitizer_test.go index a924c430..a0eb46e9 100644 --- a/internal/reader/sanitizer/sanitizer_test.go +++ b/internal/reader/sanitizer/sanitizer_test.go @@ -611,9 +611,9 @@ func TestReplaceYoutubeURLWithCustomURL(t *testing.T) { } } -func TestReplaceIframeURL(t *testing.T) { +func TestReplaceIframeVimedoDNTURL(t *testing.T) { input := `` - expected := `` + expected := `` output := Sanitize("http://example.org/", input) if expected != output { From 02c6d146599193098b19a64bc60b51f16a81bed1 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Tue, 10 Dec 2024 01:19:28 +0000 Subject: [PATCH 03/31] refactor(subscription): use `strings.HasSuffix` instead of a regex in `FindSubscriptionsFromYouTubePlaylistPage` --- internal/reader/subscription/finder.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/internal/reader/subscription/finder.go b/internal/reader/subscription/finder.go index 945fd1c6..514b7ffc 100644 --- a/internal/reader/subscription/finder.go +++ b/internal/reader/subscription/finder.go @@ -25,7 +25,6 @@ import ( ) var ( - youtubeHostRegex = regexp.MustCompile(`youtube\.com$`) youtubeChannelRegex = regexp.MustCompile(`channel/(.*)$`) ) @@ -284,7 +283,7 @@ func (f *SubscriptionFinder) FindSubscriptionsFromYouTubeChannelPage(websiteURL return nil, locale.NewLocalizedErrorWrapper(err, "error.invalid_site_url", err) } - if !youtubeHostRegex.MatchString(decodedUrl.Host) { + if !strings.HasSuffix(decodedUrl.Host, "youtube.com") { slog.Debug("This website is not a YouTube page, the regex doesn't match", 
slog.String("website_url", websiteURL)) return nil, nil } @@ -303,7 +302,7 @@ func (f *SubscriptionFinder) FindSubscriptionsFromYouTubePlaylistPage(websiteURL return nil, locale.NewLocalizedErrorWrapper(err, "error.invalid_site_url", err) } - if !youtubeHostRegex.MatchString(decodedUrl.Host) { + if !strings.HasSuffix(decodedUrl.Host, "youtube.com") { slog.Debug("This website is not a YouTube page, the regex doesn't match", slog.String("website_url", websiteURL)) return nil, nil } From 637fb85de07b6c4438f4b151ab388e83abfb3af3 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Tue, 10 Dec 2024 03:32:59 +0000 Subject: [PATCH 04/31] refactor(handler): delay `store.UserByID` as much as possible In internal/reader/handler/handler.go:RefreshFeed, there is a call to store.UserByID pretty early, which is only used for originalFeed.WithTranslatedErrorMessage(localizedError.Translate(user.Language)). Its only other usage is in processor.ProcessFeedEntries(store, originalFeed, user, forceRefresh), which is pretty late in RefreshFeed, and only called if there are new items in the feed. It makes sense to only fetch the user's language if the error localization function is used. Calls to `store.UserByID` take around 10% of the CPU time of RefreshFeed in my profiling. This commit also makes `processor.ProcessFeedEntries` take a `userID` instead of a `user`, to make the code a bit more concise. 
This should close #2984 --- internal/reader/handler/handler.go | 41 +++++++++++++++----------- internal/reader/processor/processor.go | 8 ++++- 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/internal/reader/handler/handler.go b/internal/reader/handler/handler.go index 185c57fa..937d7b78 100644 --- a/internal/reader/handler/handler.go +++ b/internal/reader/handler/handler.go @@ -31,11 +31,6 @@ func CreateFeedFromSubscriptionDiscovery(store *storage.Storage, userID int64, f slog.String("feed_url", feedCreationRequest.FeedURL), ) - user, storeErr := store.UserByID(userID) - if storeErr != nil { - return nil, locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) - } - if !store.CategoryIDExists(userID, feedCreationRequest.CategoryID) { return nil, locale.NewLocalizedErrorWrapper(ErrCategoryNotFound, "error.category_not_found") } @@ -71,7 +66,7 @@ func CreateFeedFromSubscriptionDiscovery(store *storage.Storage, userID int64, f subscription.WithCategoryID(feedCreationRequest.CategoryID) subscription.CheckedNow() - processor.ProcessFeedEntries(store, subscription, user, true) + processor.ProcessFeedEntries(store, subscription, userID, true) if storeErr := store.CreateFeed(subscription); storeErr != nil { return nil, locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) @@ -105,11 +100,6 @@ func CreateFeed(store *storage.Storage, userID int64, feedCreationRequest *model slog.String("feed_url", feedCreationRequest.FeedURL), ) - user, storeErr := store.UserByID(userID) - if storeErr != nil { - return nil, locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) - } - if !store.CategoryIDExists(userID, feedCreationRequest.CategoryID) { return nil, locale.NewLocalizedErrorWrapper(ErrCategoryNotFound, "error.category_not_found") } @@ -170,7 +160,7 @@ func CreateFeed(store *storage.Storage, userID int64, feedCreationRequest *model subscription.WithCategoryID(feedCreationRequest.CategoryID) 
subscription.CheckedNow() - processor.ProcessFeedEntries(store, subscription, user, true) + processor.ProcessFeedEntries(store, subscription, userID, true) if storeErr := store.CreateFeed(subscription); storeErr != nil { return nil, locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) @@ -195,11 +185,6 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool slog.Bool("force_refresh", forceRefresh), ) - user, storeErr := store.UserByID(userID) - if storeErr != nil { - return locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) - } - originalFeed, storeErr := store.FeedByID(userID, feedID) if storeErr != nil { return locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) @@ -256,6 +241,10 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool if localizedError := responseHandler.LocalizedError(); localizedError != nil { slog.Warn("Unable to fetch feed", slog.String("feed_url", originalFeed.FeedURL), slog.Any("error", localizedError.Error())) + user, storeErr := store.UserByID(userID) + if storeErr != nil { + return locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) + } originalFeed.WithTranslatedErrorMessage(localizedError.Translate(user.Language)) store.UpdateFeedError(originalFeed) return localizedError @@ -263,6 +252,10 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool if store.AnotherFeedURLExists(userID, originalFeed.ID, responseHandler.EffectiveURL()) { localizedError := locale.NewLocalizedErrorWrapper(ErrDuplicatedFeed, "error.duplicated_feed") + user, storeErr := store.UserByID(userID) + if storeErr != nil { + return locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) + } originalFeed.WithTranslatedErrorMessage(localizedError.Translate(user.Language)) store.UpdateFeedError(originalFeed) return localizedError @@ -289,6 +282,10 @@ func RefreshFeed(store 
*storage.Storage, userID, feedID int64, forceRefresh bool if errors.Is(parseErr, parser.ErrFeedFormatNotDetected) { localizedError = locale.NewLocalizedErrorWrapper(parseErr, "error.feed_format_not_detected", parseErr) } + user, storeErr := store.UserByID(userID) + if storeErr != nil { + return locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) + } originalFeed.WithTranslatedErrorMessage(localizedError.Translate(user.Language)) store.UpdateFeedError(originalFeed) @@ -309,13 +306,17 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool ) originalFeed.Entries = updatedFeed.Entries - processor.ProcessFeedEntries(store, originalFeed, user, forceRefresh) + processor.ProcessFeedEntries(store, originalFeed, userID, forceRefresh) // We don't update existing entries when the crawler is enabled (we crawl only inexisting entries). Unless it is forced to refresh updateExistingEntries := forceRefresh || !originalFeed.Crawler newEntries, storeErr := store.RefreshFeedEntries(originalFeed.UserID, originalFeed.ID, originalFeed.Entries, updateExistingEntries) if storeErr != nil { localizedError := locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) + user, storeErr := store.UserByID(userID) + if storeErr != nil { + return locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) + } originalFeed.WithTranslatedErrorMessage(localizedError.Translate(user.Language)) store.UpdateFeedError(originalFeed) return localizedError @@ -359,6 +360,10 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool if storeErr := store.UpdateFeed(originalFeed); storeErr != nil { localizedError := locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) + user, storeErr := store.UserByID(userID) + if storeErr != nil { + return locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr) + } 
originalFeed.WithTranslatedErrorMessage(localizedError.Translate(user.Language)) store.UpdateFeedError(originalFeed) return localizedError diff --git a/internal/reader/processor/processor.go b/internal/reader/processor/processor.go index ceae674c..0bdf0f61 100644 --- a/internal/reader/processor/processor.go +++ b/internal/reader/processor/processor.go @@ -28,9 +28,15 @@ import ( var customReplaceRuleRegex = regexp.MustCompile(`rewrite\("([^"]+)"\|"([^"]+)"\)`) // ProcessFeedEntries downloads original web page for entries and apply filters. -func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.User, forceRefresh bool) { +func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64, forceRefresh bool) { var filteredEntries model.Entries + user, storeErr := store.UserByID(userID) + if storeErr != nil { + slog.Error("Database error", slog.Any("error", storeErr)) + return + } + // Process older entries first for i := len(feed.Entries) - 1; i >= 0; i-- { entry := feed.Entries[i] From 3caa16ac3176e2a86bf26771d09cc8753da7c0ce Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Thu, 12 Dec 2024 03:30:59 +0000 Subject: [PATCH 05/31] refactor(processor): use URL parsing instead of a regex --- internal/reader/processor/nebula.go | 13 ++++++++----- internal/reader/processor/odysee.go | 13 ++++++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/internal/reader/processor/nebula.go b/internal/reader/processor/nebula.go index d0b0b6ef..cf8a70c0 100644 --- a/internal/reader/processor/nebula.go +++ b/internal/reader/processor/nebula.go @@ -7,7 +7,7 @@ import ( "errors" "fmt" "log/slog" - "regexp" + "net/url" "strconv" "github.com/PuerkitoBio/goquery" @@ -17,14 +17,17 @@ import ( "miniflux.app/v2/internal/reader/fetcher" ) -var nebulaRegex = regexp.MustCompile(`^https://nebula\.tv`) - func shouldFetchNebulaWatchTime(entry *model.Entry) bool { if !config.Opts.FetchNebulaWatchTime() { return false } - matches := 
nebulaRegex.FindStringSubmatch(entry.URL) - return matches != nil + + u, err := url.Parse(entry.URL) + if err != nil { + return false + } + + return u.Hostname() == "nebula.tv" } func fetchNebulaWatchTime(websiteURL string) (int, error) { diff --git a/internal/reader/processor/odysee.go b/internal/reader/processor/odysee.go index 90733b2f..7a174b5a 100644 --- a/internal/reader/processor/odysee.go +++ b/internal/reader/processor/odysee.go @@ -7,7 +7,7 @@ import ( "errors" "fmt" "log/slog" - "regexp" + "net/url" "strconv" "github.com/PuerkitoBio/goquery" @@ -17,14 +17,17 @@ import ( "miniflux.app/v2/internal/reader/fetcher" ) -var odyseeRegex = regexp.MustCompile(`^https://odysee\.com`) - func shouldFetchOdyseeWatchTime(entry *model.Entry) bool { if !config.Opts.FetchOdyseeWatchTime() { return false } - matches := odyseeRegex.FindStringSubmatch(entry.URL) - return matches != nil + + u, err := url.Parse(entry.URL) + if err != nil { + return false + } + + return u.Hostname() == "odysee.com" } func fetchOdyseeWatchTime(websiteURL string) (int, error) { From 68448b4abb4fc85ff40ceb8d9b37d4e24440b7bd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 11 Dec 2024 22:30:56 +0000 Subject: [PATCH 06/31] build(deps): bump golang.org/x/crypto from 0.30.0 to 0.31.0 Bumps [golang.org/x/crypto](https://github.com/golang/crypto) from 0.30.0 to 0.31.0. - [Commits](https://github.com/golang/crypto/compare/v0.30.0...v0.31.0) --- updated-dependencies: - dependency-name: golang.org/x/crypto dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 8d5ae72c..575e4d84 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/lib/pq v1.10.9 github.com/prometheus/client_golang v1.20.5 github.com/tdewolff/minify/v2 v2.21.2 - golang.org/x/crypto v0.30.0 + golang.org/x/crypto v0.31.0 golang.org/x/net v0.32.0 golang.org/x/oauth2 v0.24.0 golang.org/x/term v0.27.0 diff --git a/go.sum b/go.sum index 715e1a58..177cda60 100644 --- a/go.sum +++ b/go.sum @@ -68,8 +68,8 @@ github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3i github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.30.0 h1:RwoQn3GkWiMkzlX562cLB7OxWvjH1L8xutO2WoJcRoY= -golang.org/x/crypto v0.30.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= From 1b0b8b9c42604fdb01636f56aed37116f9858ef9 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Thu, 12 Dec 2024 03:40:55 +0000 Subject: [PATCH 07/31] =?UTF-8?q?refactor:=20use=20a=20better=20construct?= =?UTF-8?q?=20than=20`doc.Find(=E2=80=A6).First()`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As mentioned in goquery's documentation 
(https://pkg.go.dev/github.com/PuerkitoBio/goquery#Single): > By default, Selection.Find and other functions that accept a selector string to select nodes will use all matches corresponding to that selector. By using the Matcher returned by Single, at most the first match will be selected. > > The one using Single is optimized to be potentially much faster on large documents. --- internal/mediaproxy/rewriter.go | 2 +- internal/reader/processor/nebula.go | 2 +- internal/reader/processor/odysee.go | 2 +- internal/reader/processor/youtube.go | 2 +- internal/reader/readability/readability.go | 2 +- internal/reader/rewrite/rewrite_functions.go | 22 ++++++++++---------- internal/reader/scraper/scraper.go | 2 +- internal/reader/subscription/finder.go | 2 +- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/internal/mediaproxy/rewriter.go b/internal/mediaproxy/rewriter.go index bb5c2b78..39da1e8b 100644 --- a/internal/mediaproxy/rewriter.go +++ b/internal/mediaproxy/rewriter.go @@ -87,7 +87,7 @@ func genericProxyRewriter(router *mux.Router, proxifyFunction urlProxyRewriter, } } - output, err := doc.Find("body").First().Html() + output, err := doc.FindMatcher(goquery.Single("body")).Html() if err != nil { return htmlDocument } diff --git a/internal/reader/processor/nebula.go b/internal/reader/processor/nebula.go index cf8a70c0..216e9b34 100644 --- a/internal/reader/processor/nebula.go +++ b/internal/reader/processor/nebula.go @@ -48,7 +48,7 @@ func fetchNebulaWatchTime(websiteURL string) (int, error) { return 0, docErr } - durs, exists := doc.Find(`meta[property="video:duration"]`).First().Attr("content") + durs, exists := doc.FindMatcher(goquery.Single(`meta[property="video:duration"]`)).Attr("content") // durs contains video watch time in seconds if !exists { return 0, errors.New("duration has not found") diff --git a/internal/reader/processor/odysee.go b/internal/reader/processor/odysee.go index 7a174b5a..873ae60c 100644 --- 
a/internal/reader/processor/odysee.go +++ b/internal/reader/processor/odysee.go @@ -48,7 +48,7 @@ func fetchOdyseeWatchTime(websiteURL string) (int, error) { return 0, docErr } - durs, exists := doc.Find(`meta[property="og:video:duration"]`).First().Attr("content") + durs, exists := doc.FindMatcher(goquery.Single(`meta[property="og:video:duration"]`)).Attr("content") // durs contains video watch time in seconds if !exists { return 0, errors.New("duration has not found") diff --git a/internal/reader/processor/youtube.go b/internal/reader/processor/youtube.go index 2d41e11f..68e72ba6 100644 --- a/internal/reader/processor/youtube.go +++ b/internal/reader/processor/youtube.go @@ -60,7 +60,7 @@ func fetchYouTubeWatchTimeFromWebsite(websiteURL string) (int, error) { return 0, docErr } - durs, exists := doc.Find(`meta[itemprop="duration"]`).First().Attr("content") + durs, exists := doc.FindMatcher(goquery.Single(`meta[itemprop="duration"]`)).Attr("content") if !exists { return 0, errors.New("duration has not found") } diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 193edf07..299211f5 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -77,7 +77,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er return "", "", err } - if hrefValue, exists := document.Find("head base").First().Attr("href"); exists { + if hrefValue, exists := document.FindMatcher(goquery.Single("head base")).Attr("href"); exists { hrefValue = strings.TrimSpace(hrefValue) if urllib.IsAbsoluteURL(hrefValue) { baseURL = hrefValue diff --git a/internal/reader/rewrite/rewrite_functions.go b/internal/reader/rewrite/rewrite_functions.go index 78590031..2125f8f0 100644 --- a/internal/reader/rewrite/rewrite_functions.go +++ b/internal/reader/rewrite/rewrite_functions.go @@ -44,7 +44,7 @@ func addImageTitle(entryURL, entryContent string) string { img.ReplaceWithHtml(`
` + altAttr + `

` + html.EscapeString(titleAttr) + `

`) }) - output, _ := doc.Find("body").First().Html() + output, _ := doc.FindMatcher(goquery.Single("body")).Html() return output } @@ -76,7 +76,7 @@ func addMailtoSubject(entryURL, entryContent string) string { a.AppendHtml(" [" + html.EscapeString(subject) + "]") }) - output, _ := doc.Find("body").First().Html() + output, _ := doc.FindMatcher(goquery.Single("body")).Html() return output } @@ -160,7 +160,7 @@ func addDynamicImage(entryURL, entryContent string) string { } if changed { - output, _ := doc.Find("body").First().Html() + output, _ := doc.FindMatcher(goquery.Single("body")).Html() return output } @@ -197,7 +197,7 @@ func addDynamicIframe(entryURL, entryContent string) string { }) if changed { - output, _ := doc.Find("body").First().Html() + output, _ := doc.FindMatcher(goquery.Single("body")).Html() return output } @@ -217,7 +217,7 @@ func fixMediumImages(entryURL, entryContent string) string { } }) - output, _ := doc.Find("body").First().Html() + output, _ := doc.FindMatcher(goquery.Single("body")).Html() return output } @@ -239,7 +239,7 @@ func useNoScriptImages(entryURL, entryContent string) string { } }) - output, _ := doc.Find("body").First().Html() + output, _ := doc.FindMatcher(goquery.Single("body")).Html() return output } @@ -317,7 +317,7 @@ func removeCustom(entryContent string, selector string) string { doc.Find(selector).Remove() - output, _ := doc.Find("body").First().Html() + output, _ := doc.FindMatcher(goquery.Single("body")).Html() return output } @@ -344,7 +344,7 @@ func applyFuncOnTextContent(entryContent string, selector string, repl func(stri doc.Find(selector).Each(treatChildren) - output, _ := doc.Find("body").First().Html() + output, _ := doc.FindMatcher(goquery.Single("body")).Html() return output } @@ -401,7 +401,7 @@ func addHackerNewsLinksUsing(entryContent, app string) string { } }) - output, _ := doc.Find("body").First().Html() + output, _ := doc.FindMatcher(goquery.Single("body")).Html() return output } @@ -420,7 +420,7 @@ 
func removeTables(entryContent string) string { for _, selector := range selectors { for { - loopElement = doc.Find(selector).First() + loopElement = doc.FindMatcher(goquery.Single(selector)) if loopElement.Length() == 0 { break @@ -436,6 +436,6 @@ func removeTables(entryContent string) string { } } - output, _ := doc.Find("body").First().Html() + output, _ := doc.FindMatcher(goquery.Single("body")).Html() return output } diff --git a/internal/reader/scraper/scraper.go b/internal/reader/scraper/scraper.go index a200a587..de8e3afc 100644 --- a/internal/reader/scraper/scraper.go +++ b/internal/reader/scraper/scraper.go @@ -75,7 +75,7 @@ func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, return "", "", err } - if hrefValue, exists := document.Find("head base").First().Attr("href"); exists { + if hrefValue, exists := document.FindMatcher(goquery.Single("head base")).Attr("href"); exists { hrefValue = strings.TrimSpace(hrefValue) if urllib.IsAbsoluteURL(hrefValue) { baseURL = hrefValue diff --git a/internal/reader/subscription/finder.go b/internal/reader/subscription/finder.go index 514b7ffc..ebebe56a 100644 --- a/internal/reader/subscription/finder.go +++ b/internal/reader/subscription/finder.go @@ -146,7 +146,7 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL, contentTyp return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err) } - if hrefValue, exists := doc.Find("head base").First().Attr("href"); exists { + if hrefValue, exists := doc.FindMatcher(goquery.Single("head base")).Attr("href"); exists { hrefValue = strings.TrimSpace(hrefValue) if urllib.IsAbsoluteURL(hrefValue) { websiteURL = hrefValue From e6185b13931f48a4c3aea83970a89d7235496ac7 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Thu, 12 Dec 2024 03:43:14 +0000 Subject: [PATCH 08/31] refactor: use min/max instead of math.Min/math.Max This saves a couple of back'n'forth casts. 
--- internal/model/feed.go | 4 ++-- internal/reader/readability/readability.go | 5 ++--- internal/reader/readingtime/readingtime.go | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/internal/model/feed.go b/internal/model/feed.go index 9f1de1eb..1682b111 100644 --- a/internal/model/feed.go +++ b/internal/model/feed.go @@ -123,8 +123,8 @@ func (f *Feed) ScheduleNextCheck(weeklyCount int, refreshDelayInMinutes int) { intervalMinutes = config.Opts.SchedulerEntryFrequencyMaxInterval() } else { intervalMinutes = int(math.Round(float64(7*24*60) / float64(weeklyCount*config.Opts.SchedulerEntryFrequencyFactor()))) - intervalMinutes = int(math.Min(float64(intervalMinutes), float64(config.Opts.SchedulerEntryFrequencyMaxInterval()))) - intervalMinutes = int(math.Max(float64(intervalMinutes), float64(config.Opts.SchedulerEntryFrequencyMinInterval()))) + intervalMinutes = min(intervalMinutes, config.Opts.SchedulerEntryFrequencyMaxInterval()) + intervalMinutes = max(intervalMinutes, config.Opts.SchedulerEntryFrequencyMinInterval()) } } diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 299211f5..18f30ded 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -8,7 +8,6 @@ import ( "fmt" "io" "log/slog" - "math" "regexp" "strings" @@ -108,7 +107,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er // Things like preambles, content split by ads that we removed, etc. func getArticle(topCandidate *candidate, candidates candidateList) string { output := bytes.NewBufferString("
") - siblingScoreThreshold := float32(math.Max(10, float64(topCandidate.score*.2))) + siblingScoreThreshold := max(10, topCandidate.score*.2) topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) { append := false @@ -223,7 +222,7 @@ func getCandidates(document *goquery.Document) candidateList { contentScore += float32(strings.Count(text, ",") + 1) // For every 100 characters in this paragraph, add another point. Up to 3 points. - contentScore += float32(math.Min(float64(int(len(text)/100.0)), 3)) + contentScore += float32(min(int(len(text)/100.0), 3)) candidates[parentNode].score += contentScore if grandParentNode != nil { diff --git a/internal/reader/readingtime/readingtime.go b/internal/reader/readingtime/readingtime.go index 0cfb2b22..9159ee71 100644 --- a/internal/reader/readingtime/readingtime.go +++ b/internal/reader/readingtime/readingtime.go @@ -19,7 +19,7 @@ func EstimateReadingTime(content string, defaultReadingSpeed, cjkReadingSpeed in sanitizedContent := sanitizer.StripTags(content) // Litterature on language detection says that around 100 signes is enough, we're safe here. 
- truncationPoint := int(math.Min(float64(len(sanitizedContent)), 250)) + truncationPoint := min(len(sanitizedContent), 250) // We're only interested in identifying Japanse/Chinese/Korean options := whatlanggo.Options{ From 113abeea594ae568e0830e069e6723e299005c17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Thu, 12 Dec 2024 19:39:38 -0800 Subject: [PATCH 09/31] test(rewrite): add unit test for referer rewrite function --- .../reader/rewrite/referer_override_test.go | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 internal/reader/rewrite/referer_override_test.go diff --git a/internal/reader/rewrite/referer_override_test.go b/internal/reader/rewrite/referer_override_test.go new file mode 100644 index 00000000..7d527bb0 --- /dev/null +++ b/internal/reader/rewrite/referer_override_test.go @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package rewrite // import "miniflux.app/v2/internal/reader/rewrite" + +import ( + "testing" +) + +func TestGetRefererForURL(t *testing.T) { + testCases := []struct { + name string + url string + expected string + }{ + { + name: "Weibo Image URL", + url: "https://wx1.sinaimg.cn/large/example.jpg", + expected: "https://weibo.com", + }, + { + name: "Pixiv Image URL", + url: "https://i.pximg.net/img-master/example.jpg", + expected: "https://www.pixiv.net", + }, + { + name: "SSPai CDN URL", + url: "https://cdnfile.sspai.com/example.png", + expected: "https://sspai.com", + }, + { + name: "Instagram CDN URL", + url: "https://scontent-sjc3-1.cdninstagram.com/example.jpg", + expected: "https://www.instagram.com", + }, + { + name: "Piokok URL", + url: "https://sp1.piokok.com/example.jpg", + expected: "https://sp1.piokok.com", + }, + { + name: "Weibo Video URL", + url: "https://f.video.weibocdn.com/example.mp4", + expected: "https://weibo.com", + }, + { + name: "HelloGithub Image URL", + url: 
"https://img.hellogithub.com/example.png", + expected: "https://hellogithub.com", + }, + { + name: "Non-matching URL", + url: "https://example.com/image.jpg", + expected: "", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result := GetRefererForURL(tc.url) + if result != tc.expected { + t.Errorf("GetRefererForURL(%s): expected %s, got %s", + tc.url, tc.expected, result) + } + }) + } +} From 6ad5ad0bb23ab1faa60ff5c30e2f8ef03a6c22e3 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Fri, 13 Dec 2024 04:41:56 +0000 Subject: [PATCH 10/31] refactor(readability): various improvements and optimizations - Replace a completely overkill regex - Use `.Remove()` instead of a hand-rolled loop - Use a strings.Builder instead of a bytes.NewBufferString - Replace a call to Fprintf with string concatenation, as the latter are much faster - Remove a superfluous cast - Delay some computations - Add some tests --- internal/reader/readability/readability.go | 52 +++++++--------- .../reader/readability/readability_test.go | 61 +++++++++++++++++++ 2 files changed, 84 insertions(+), 29 deletions(-) diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 18f30ded..8f10b145 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -4,7 +4,6 @@ package readability // import "miniflux.app/v2/internal/reader/readability" import ( - "bytes" "fmt" "io" "log/slog" @@ -23,7 +22,6 @@ const ( var ( divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`) - sentenceRegexp = regexp.MustCompile(`\.( |$)`) blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`) okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`) @@ -84,7 +82,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er } document.Find("script,style").Each(func(i int, s *goquery.Selection) { - 
removeNodes(s) + s.Remove() }) transformMisusedDivsIntoParagraphs(document) @@ -106,7 +104,8 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er // Now that we have the top candidate, look through its siblings for content that might also be related. // Things like preambles, content split by ads that we removed, etc. func getArticle(topCandidate *candidate, candidates candidateList) string { - output := bytes.NewBufferString("
") + var output strings.Builder + output.WriteString("
") siblingScoreThreshold := max(10, topCandidate.score*.2) topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) { @@ -124,10 +123,14 @@ func getArticle(topCandidate *candidate, candidates candidateList) string { content := s.Text() contentLength := len(content) - if contentLength >= 80 && linkDensity < .25 { - append = true - } else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) { - append = true + if contentLength >= 80 { + if linkDensity < .25 { + append = true + } + } else { + if linkDensity == 0 && containsSentence(content) { + append = true + } } } @@ -138,7 +141,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string { } html, _ := s.Html() - fmt.Fprintf(output, "<%s>%s", tag, html, tag) + output.WriteString("<" + tag + ">" + html + "") } }) @@ -156,9 +159,9 @@ func removeUnlikelyCandidates(document *goquery.Document) { str := strings.ToLower(class + id) if blacklistCandidatesRegexp.MatchString(str) { - removeNodes(s) + s.Remove() } else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) { - removeNodes(s) + s.Remove() } }) } @@ -222,7 +225,7 @@ func getCandidates(document *goquery.Document) candidateList { contentScore += float32(strings.Count(text, ",") + 1) // For every 100 characters in this paragraph, add another point. Up to 3 points. - contentScore += float32(min(int(len(text)/100.0), 3)) + contentScore += float32(min(len(text)/100.0, 3)) candidates[parentNode].score += contentScore if grandParentNode != nil { @@ -261,13 +264,14 @@ func scoreNode(s *goquery.Selection) *candidate { // Get the density of links as a percentage of the content // This is the amount of text that is inside a link divided by the total text in the node. 
func getLinkDensity(s *goquery.Selection) float32 { - linkLength := len(s.Find("a").Text()) textLength := len(s.Text()) if textLength == 0 { return 0 } + linkLength := len(s.Find("a").Text()) + return float32(linkLength) / float32(textLength) } @@ -278,25 +282,20 @@ func getClassWeight(s *goquery.Selection) float32 { class, _ := s.Attr("class") id, _ := s.Attr("id") - class = strings.ToLower(class) - id = strings.ToLower(id) - if class != "" { + class = strings.ToLower(class) if negativeRegexp.MatchString(class) { weight -= 25 - } - - if positiveRegexp.MatchString(class) { + } else if positiveRegexp.MatchString(class) { weight += 25 } } if id != "" { + id = strings.ToLower(id) if negativeRegexp.MatchString(id) { weight -= 25 - } - - if positiveRegexp.MatchString(id) { + } else if positiveRegexp.MatchString(id) { weight += 25 } } @@ -314,11 +313,6 @@ func transformMisusedDivsIntoParagraphs(document *goquery.Document) { }) } -func removeNodes(s *goquery.Selection) { - s.Each(func(i int, s *goquery.Selection) { - parent := s.Parent() - if parent.Length() > 0 { - parent.Get(0).RemoveChild(s.Get(0)) - } - }) +func containsSentence(content string) bool { + return strings.HasSuffix(content, ".") || strings.Contains(content, ". ") } diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go index bd47d859..8baee1a0 100644 --- a/internal/reader/readability/readability_test.go +++ b/internal/reader/readability/readability_test.go @@ -100,3 +100,64 @@ func TestWithoutBaseURL(t *testing.T) { t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL) } } + +func TestRemoveStyleScript(t *testing.T) { + html := ` + + + Test + + + + + +
Some content
+ + ` + want := `
Somecontent
` + + _, content, err := ExtractContent(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + content = strings.ReplaceAll(content, "\n", "") + content = strings.ReplaceAll(content, " ", "") + content = strings.ReplaceAll(content, "\t", "") + + if content != want { + t.Errorf(`Invalid content, got %s instead of %s`, content, want) + } +} + +func TestRemoveBlacklist(t *testing.T) { + html := ` + + + Test + + +
Some content
+
Some other thing
+
And more
+
Valid!
+ + ` + want := `
Valid!
` + + _, content, err := ExtractContent(strings.NewReader(html)) + if err != nil { + t.Fatal(err) + } + + content = strings.ReplaceAll(content, "\n", "") + content = strings.ReplaceAll(content, " ", "") + content = strings.ReplaceAll(content, "\t", "") + + if content != want { + t.Errorf(`Invalid content, got %s instead of %s`, content, want) + } +} From c3649bd6b1d89d52162b198c5019cf7bc69dc6eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Thu, 12 Dec 2024 20:46:07 -0800 Subject: [PATCH 11/31] refactor(rewrite): remove unused function arguments --- internal/reader/rewrite/rewrite_functions.go | 12 ++++++------ internal/reader/rewrite/rewriter.go | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/internal/reader/rewrite/rewrite_functions.go b/internal/reader/rewrite/rewrite_functions.go index 2125f8f0..c1634955 100644 --- a/internal/reader/rewrite/rewrite_functions.go +++ b/internal/reader/rewrite/rewrite_functions.go @@ -27,7 +27,7 @@ var ( textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`) ) -func addImageTitle(entryURL, entryContent string) string { +func addImageTitle(entryContent string) string { doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent)) if err != nil { return entryContent @@ -51,7 +51,7 @@ func addImageTitle(entryURL, entryContent string) string { return entryContent } -func addMailtoSubject(entryURL, entryContent string) string { +func addMailtoSubject(entryContent string) string { doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent)) if err != nil { return entryContent @@ -83,7 +83,7 @@ func addMailtoSubject(entryURL, entryContent string) string { return entryContent } -func addDynamicImage(entryURL, entryContent string) string { +func addDynamicImage(entryContent string) string { doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent)) if err != nil { return entryContent 
@@ -167,7 +167,7 @@ func addDynamicImage(entryURL, entryContent string) string { return entryContent } -func addDynamicIframe(entryURL, entryContent string) string { +func addDynamicIframe(entryContent string) string { doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent)) if err != nil { return entryContent @@ -204,7 +204,7 @@ func addDynamicIframe(entryURL, entryContent string) string { return entryContent } -func fixMediumImages(entryURL, entryContent string) string { +func fixMediumImages(entryContent string) string { doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent)) if err != nil { return entryContent @@ -221,7 +221,7 @@ func fixMediumImages(entryURL, entryContent string) string { return output } -func useNoScriptImages(entryURL, entryContent string) string { +func useNoScriptImages(entryContent string) string { doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent)) if err != nil { return entryContent diff --git a/internal/reader/rewrite/rewriter.go b/internal/reader/rewrite/rewriter.go index 8ce7dce0..4f8ee951 100644 --- a/internal/reader/rewrite/rewriter.go +++ b/internal/reader/rewrite/rewriter.go @@ -24,13 +24,13 @@ type rule struct { func (rule rule) applyRule(entryURL string, entry *model.Entry) { switch rule.name { case "add_image_title": - entry.Content = addImageTitle(entryURL, entry.Content) + entry.Content = addImageTitle(entry.Content) case "add_mailto_subject": - entry.Content = addMailtoSubject(entryURL, entry.Content) + entry.Content = addMailtoSubject(entry.Content) case "add_dynamic_image": - entry.Content = addDynamicImage(entryURL, entry.Content) + entry.Content = addDynamicImage(entry.Content) case "add_dynamic_iframe": - entry.Content = addDynamicIframe(entryURL, entry.Content) + entry.Content = addDynamicIframe(entry.Content) case "add_youtube_video": entry.Content = addYoutubeVideo(entryURL, entry.Content) case "add_invidious_video": @@ -46,9 +46,9 @@ func (rule rule) 
applyRule(entryURL string, entry *model.Entry) { case "convert_text_link", "convert_text_links": entry.Content = replaceTextLinks(entry.Content) case "fix_medium_images": - entry.Content = fixMediumImages(entryURL, entry.Content) + entry.Content = fixMediumImages(entry.Content) case "use_noscript_figure_images": - entry.Content = useNoScriptImages(entryURL, entry.Content) + entry.Content = useNoScriptImages(entry.Content) case "replace": // Format: replace("search-term"|"replace-term") if len(rule.args) >= 2 { From 945d43605560c442287e1c1458553fad65c48084 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Fri, 13 Dec 2024 22:50:12 +0000 Subject: [PATCH 12/31] refactor(rewriter): replace regex with URL parsing for referrer override No need for brittle regex when matching plain strings or domain names. This should save some negligible amount of heap memory as well as tremendously speeding up the matching. --- internal/reader/rewrite/rules.go | 72 +++++++++++++------------------- 1 file changed, 29 insertions(+), 43 deletions(-) diff --git a/internal/reader/rewrite/rules.go b/internal/reader/rewrite/rules.go index 89cba31d..4a77b1ff 100644 --- a/internal/reader/rewrite/rules.go +++ b/internal/reader/rewrite/rules.go @@ -3,7 +3,10 @@ package rewrite // import "miniflux.app/v2/internal/reader/rewrite" -import "regexp" +import ( + "net/url" + "strings" +) // List of predefined rewrite rules (alphabetically sorted) // Available rules: "add_image_title", "add_youtube_video" @@ -39,49 +42,32 @@ var predefinedRules = map[string]string{ "youtube.com": "add_youtube_video", } -type RefererRule struct { - URLPattern *regexp.Regexp - Referer string -} - -// List of predefined referer rules -var PredefinedRefererRules = []RefererRule{ - { - URLPattern: regexp.MustCompile(`^https://\w+\.sinaimg\.cn`), - Referer: "https://weibo.com", - }, - { - URLPattern: regexp.MustCompile(`^https://i\.pximg\.net`), - Referer: "https://www.pixiv.net", - }, - { - URLPattern: 
regexp.MustCompile(`^https://cdnfile\.sspai\.com`), - Referer: "https://sspai.com", - }, - { - URLPattern: regexp.MustCompile(`^https://(?:\w|-)+\.cdninstagram\.com`), - Referer: "https://www.instagram.com", - }, - { - URLPattern: regexp.MustCompile(`^https://sp1\.piokok\.com`), - Referer: "https://sp1.piokok.com", - }, - { - URLPattern: regexp.MustCompile(`^https://f\.video\.weibocdn\.com`), - Referer: "https://weibo.com", - }, - { - URLPattern: regexp.MustCompile(`^https://img\.hellogithub\.com`), - Referer: "https://hellogithub.com", - }, -} - // GetRefererForURL returns the referer for the given URL if it exists, otherwise an empty string. -func GetRefererForURL(url string) string { - for _, rule := range PredefinedRefererRules { - if rule.URLPattern.MatchString(url) { - return rule.Referer - } +func GetRefererForURL(u string) string { + parsedUrl, err := url.Parse(u) + if err != nil { + return "" } + + switch parsedUrl.Hostname() { + case "i.pximg.net": + return "https://www.pixiv.net" + case "sp1.piokok.com": + return "https://sp1.piokok.com" + case "cdnfile.sspai.com": + return "https://sspai.com" + case "f.video.weibocdn.com": + return "https://weibo.com" + case "img.hellogithub.com": + return "https://hellogithub.com" + } + + switch { + case strings.HasSuffix(parsedUrl.Hostname(), ".sinaimg.cn"): + return "https://weibo.com" + case strings.HasSuffix(parsedUrl.Hostname(), ".cdninstagram.com"): + return "https://www.instagram.com" + } + return "" } From fd9cfd757a0d14575fb513d281627de183004d72 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 15 Dec 2024 19:34:12 +0100 Subject: [PATCH 13/31] Replace -ms-text-size-adjust with text-size-adjust https://caniuse.com/?search=text-size-adjust says that `ms-text-size-adjust` is supported in Edge. 
--- internal/ui/static/css/common.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/ui/static/css/common.css b/internal/ui/static/css/common.css index 498b7068..7fecff1a 100644 --- a/internal/ui/static/css/common.css +++ b/internal/ui/static/css/common.css @@ -7,7 +7,7 @@ html { -webkit-text-size-adjust: 100%; - -ms-text-size-adjust: 100%; + text-size-adjust: 100%; } body { From c3e842eba6703b00774fb81986aeaac0bbee060c Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 15 Dec 2024 19:37:18 +0100 Subject: [PATCH 14/31] Remove -webkit-clip-path https://caniuse.com/?search=clip-path says that `clip-path` is supported since Safari 13.1 --- internal/ui/static/css/common.css | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/ui/static/css/common.css b/internal/ui/static/css/common.css index 7fecff1a..cadd1e9e 100644 --- a/internal/ui/static/css/common.css +++ b/internal/ui/static/css/common.css @@ -53,7 +53,6 @@ a:hover { .sr-only { border: 0 !important; clip: rect(1px, 1px, 1px, 1px) !important; - -webkit-clip-path: inset(50%) !important; clip-path: inset(50%) !important; height: 1px !important; overflow: hidden !important; From 14a6e8ed3ab00ce0d08e309d3a63dcb904b22674 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 15 Dec 2024 19:44:19 +0100 Subject: [PATCH 15/31] Factorise .pagination-next and .pagination-last together --- internal/ui/static/css/common.css | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/internal/ui/static/css/common.css b/internal/ui/static/css/common.css index cadd1e9e..c8e08412 100644 --- a/internal/ui/static/css/common.css +++ b/internal/ui/static/css/common.css @@ -748,7 +748,7 @@ template { padding-left: 15px; } -.pagination-next { +.pagination-next, .pagination-last { text-align: right; } @@ -756,10 +756,6 @@ template { content: " ›"; } -.pagination-last { - text-align: right; -} - .pagination-last:after { content: " »"; } From cfda948c3af7b4bcf5e4f5d915a4022d4269004b Mon Sep 17 
00:00:00 2001 From: Julien Voisin Date: Mon, 16 Dec 2024 01:56:39 +0000 Subject: [PATCH 16/31] refactor(rewriter): avoid the use of regex in `addDynamicImage` See https://dustri.org/b/parsing-noscript-tags-with-goquery.html for the whole story. --- internal/reader/rewrite/rewrite_functions.go | 11 ++++------- internal/reader/rewrite/rewriter_test.go | 4 ++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/internal/reader/rewrite/rewrite_functions.go b/internal/reader/rewrite/rewrite_functions.go index c1634955..a696c22a 100644 --- a/internal/reader/rewrite/rewrite_functions.go +++ b/internal/reader/rewrite/rewrite_functions.go @@ -23,7 +23,6 @@ var ( youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)$`) youtubeIdRegex = regexp.MustCompile(`youtube_id"?\s*[:=]\s*"([a-zA-Z0-9_-]{11})"`) invidioRegex = regexp.MustCompile(`https?://(.*)/watch\?v=(.*)`) - imgRegex = regexp.MustCompile(`]+>`) textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`) ) @@ -84,10 +83,11 @@ func addMailtoSubject(entryContent string) string { } func addDynamicImage(entryContent string) string { - doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent)) + parserHtml, err := nethtml.ParseWithOptions(strings.NewReader(entryContent), nethtml.ParseOptionEnableScripting(false)) if err != nil { return entryContent } + doc := goquery.NewDocumentFromNode(parserHtml) // Ordered most preferred to least preferred. 
candidateAttrs := []string{ @@ -149,12 +149,9 @@ func addDynamicImage(entryContent string) string { if !changed { doc.Find("noscript").Each(func(i int, noscript *goquery.Selection) { - matches := imgRegex.FindAllString(noscript.Text(), 2) - - if len(matches) == 1 { + if img := noscript.Find("img"); img.Length() == 1 { + img.Unwrap() changed = true - - noscript.ReplaceWithHtml(matches[0]) } }) } diff --git a/internal/reader/rewrite/rewriter_test.go b/internal/reader/rewrite/rewriter_test.go index fa2b765b..93123dbb 100644 --- a/internal/reader/rewrite/rewriter_test.go +++ b/internal/reader/rewrite/rewriter_test.go @@ -256,7 +256,7 @@ func TestRewriteWithNoLazyImage(t *testing.T) { func TestRewriteWithLazyImage(t *testing.T) { controlEntry := &model.Entry{ Title: `A title`, - Content: `Image`, + Content: `Image`, } testEntry := &model.Entry{ Title: `A title`, @@ -272,7 +272,7 @@ func TestRewriteWithLazyImage(t *testing.T) { func TestRewriteWithLazyDivImage(t *testing.T) { controlEntry := &model.Entry{ Title: `A title`, - Content: `Image`, + Content: `Image`, } testEntry := &model.Entry{ Title: `A title`, From 777d0dd2481d120b8193130ea06442aedf14f738 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Mon, 16 Dec 2024 04:47:19 +0000 Subject: [PATCH 17/31] feat: resize favicons before storing them Some websites are using images of O(10kB), if not O(100kB), for their favicons. As miniflux only displays them with a 16x16 resolution, let's do our best to resize them before storing them in the database. This should make miniflux consume less bandwidth when serving pages, for the joy of mobile users on a small data plan. Of course, images that already are 16x16 aren't resized. 
--- go.mod | 1 + go.sum | 2 + internal/reader/icon/finder.go | 57 +++++++++++++++++++++++++++++ internal/reader/icon/finder_test.go | 54 +++++++++++++++++++++++++++ 4 files changed, 114 insertions(+) diff --git a/go.mod b/go.mod index 575e4d84..ec3e858b 100644 --- a/go.mod +++ b/go.mod @@ -13,6 +13,7 @@ require ( github.com/prometheus/client_golang v1.20.5 github.com/tdewolff/minify/v2 v2.21.2 golang.org/x/crypto v0.31.0 + golang.org/x/image v0.23.0 golang.org/x/net v0.32.0 golang.org/x/oauth2 v0.24.0 golang.org/x/term v0.27.0 diff --git a/go.sum b/go.sum index 177cda60..f580e312 100644 --- a/go.sum +++ b/go.sum @@ -70,6 +70,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/image v0.23.0 h1:HseQ7c2OpPKTPVzNjG5fwJsOTCiiwS4QdsYi5XU6H68= +golang.org/x/image v0.23.0/go.mod h1:wJJBTdLfCCf3tiHa1fNxpZmUI4mmoZvwMCPP0ddoNKY= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= diff --git a/internal/reader/icon/finder.go b/internal/reader/icon/finder.go index 835a3a14..e0efc90b 100644 --- a/internal/reader/icon/finder.go +++ b/internal/reader/icon/finder.go @@ -4,12 +4,18 @@ package icon // import "miniflux.app/v2/internal/reader/icon" import ( + "bytes" "encoding/base64" "fmt" + "image" + "image/gif" + "image/jpeg" + "image/png" "io" "log/slog" "net/url" "regexp" + "slices" "strings" "miniflux.app/v2/internal/config" @@ -19,6 +25,7 @@ import ( "miniflux.app/v2/internal/urllib" "github.com/PuerkitoBio/goquery" + 
"golang.org/x/image/draw" "golang.org/x/net/html/charset" ) @@ -180,9 +187,59 @@ func (f *IconFinder) DownloadIcon(iconURL string) (*model.Icon, error) { Content: responseBody, } + icon = resizeIcon(icon) + return icon, nil } +func resizeIcon(icon *model.Icon) *model.Icon { + r := bytes.NewReader(icon.Content) + + if !slices.Contains([]string{"image/jpeg", "image/png", "image/gif"}, icon.MimeType) { + slog.Info("icon isn't a png/gif/jpeg/ico, can't resize", slog.String("mimetype", icon.MimeType)) + return icon + } + + // Don't resize icons that we can't decode, or that already have the right size. + config, _, err := image.DecodeConfig(r) + if err != nil { + slog.Warn("unable to decode the metadata of the icon", slog.Any("error", err)) + return icon + } + if config.Height <= 16 && config.Width <= 16 { + slog.Debug("icon don't need to be rescaled", slog.Int("height", config.Height), slog.Int("width", config.Width)) + return icon + } + + r.Seek(0, io.SeekStart) + + var src image.Image + switch icon.MimeType { + case "image/jpeg": + src, err = jpeg.Decode(r) + case "image/png": + src, err = png.Decode(r) + case "image/gif": + src, err = gif.Decode(r) + } + if err != nil { + slog.Warn("unable to decode the icon", slog.Any("error", err)) + return icon + } + + dst := image.NewRGBA(image.Rect(0, 0, 16, 16)) + draw.BiLinear.Scale(dst, dst.Rect, src, src.Bounds(), draw.Over, nil) + + var b bytes.Buffer + if err = png.Encode(io.Writer(&b), dst); err != nil { + slog.Warn("unable to encode the new icon", slog.Any("error", err)) + } + + icon.Content = b.Bytes() + icon.MimeType = "image/png" + return icon +} + func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string, error) { queries := []string{ "link[rel='icon' i]", diff --git a/internal/reader/icon/finder_test.go b/internal/reader/icon/finder_test.go index 9bb71126..1cd632af 100644 --- a/internal/reader/icon/finder_test.go +++ b/internal/reader/icon/finder_test.go @@ -4,8 +4,13 @@ package icon // import 
"miniflux.app/v2/internal/reader/icon" import ( + "bytes" + "encoding/base64" + "image" "strings" "testing" + + "miniflux.app/v2/internal/model" ) func TestParseImageDataURL(t *testing.T) { @@ -125,3 +130,52 @@ func TestParseDocumentWithWhitespaceIconURL(t *testing.T) { t.Errorf(`Invalid icon URL, got %q`, iconURLs[0]) } } + +func TestResizeIconSmallGif(t *testing.T) { + data, err := base64.StdEncoding.DecodeString("R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==") + if err != nil { + t.Fatal(err) + } + icon := model.Icon{ + Content: data, + MimeType: "image/gif", + } + if !bytes.Equal(icon.Content, resizeIcon(&icon).Content) { + t.Fatalf("Converted gif smaller than 16x16") + } +} + +func TestResizeIconPng(t *testing.T) { + data, err := base64.StdEncoding.DecodeString("iVBORw0KGgoAAAANSUhEUgAAABEAAAARCAYAAAA7bUf6AAAAHElEQVR42mP8z/C/noFCwDhqyKgho4aMGkIlQwBrHSpf28Yx+gAAAABJRU5ErkJggg==") + if err != nil { + t.Fatal(err) + } + icon := model.Icon{ + Content: data, + MimeType: "image/png", + } + resizedIcon := resizeIcon(&icon) + + if bytes.Equal(data, resizedIcon.Content) { + t.Fatalf("Didn't convert png of 17x17") + } + + config, _, err := image.DecodeConfig(bytes.NewReader(resizedIcon.Content)) + if err != nil { + t.Fatalf("Couln't decode resulting png: %v", err) + } + + if config.Height != 16 || config.Width != 16 { + t.Fatalf("Was expecting an image of 16x16, got %dx%d", config.Width, config.Height) + } +} + +func TestResizeInvalidImage(t *testing.T) { + icon := model.Icon{ + Content: []byte("invalid data"), + MimeType: "image/gif", + } + if !bytes.Equal(icon.Content, resizeIcon(&icon).Content) { + t.Fatalf("Tried to convert an invalid image") + } +} From 2df59b48656459ab172dd01365f776f4ff258c29 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Fri, 13 Dec 2024 23:43:07 +0100 Subject: [PATCH 18/31] Refactor internal/reader/readability/testdata - Use chained strings.Contains instead of a regex for blacklistCandidatesRegexp, as this is a bit faster - Simplify a 
Find.Each.Remove to Find.Remove - Don't concatenate id and class for removeUnlikelyCandidates, as it makes no sense to match on overlaps. It might also marginally improve performances, as regex now have to run on two strings separately, instead of both. - Add a small benchmark --- internal/reader/readability/readability.go | 36 +++++++++++-------- .../reader/readability/readability_test.go | 21 +++++++++++ internal/reader/readability/testdata | 1 + 3 files changed, 43 insertions(+), 15 deletions(-) create mode 120000 internal/reader/readability/testdata diff --git a/internal/reader/readability/readability.go b/internal/reader/readability/readability.go index 8f10b145..46771eeb 100644 --- a/internal/reader/readability/readability.go +++ b/internal/reader/readability/readability.go @@ -23,7 +23,6 @@ const ( var ( divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`) - blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`) okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`) unlikelyCandidatesRegexp = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`) @@ -81,9 +80,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er } } - document.Find("script,style").Each(func(i int, s *goquery.Selection) { - s.Remove() - }) + document.Find("script,style").Remove() transformMisusedDivsIntoParagraphs(document) removeUnlikelyCandidates(document) @@ -150,18 +147,29 @@ func getArticle(topCandidate *candidate, candidates candidateList) string { } func removeUnlikelyCandidates(document *goquery.Document) { + var shouldRemove = func(str string) bool { + str = strings.ToLower(str) + if strings.Contains(str, "popupbody") || strings.Contains(str, "-ad") || 
strings.Contains(str, "g-plus") { + return true + } else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) { + return true + } + return false + } + document.Find("*").Each(func(i int, s *goquery.Selection) { if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" { return } - class, _ := s.Attr("class") - id, _ := s.Attr("id") - str := strings.ToLower(class + id) - if blacklistCandidatesRegexp.MatchString(str) { - s.Remove() - } else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) { - s.Remove() + if class, ok := s.Attr("class"); ok { + if shouldRemove(class) { + s.Remove() + } + } else if id, ok := s.Attr("id"); ok { + if shouldRemove(id) { + s.Remove() + } } }) } @@ -279,10 +287,8 @@ func getLinkDensity(s *goquery.Selection) float32 { // element looks good or bad. func getClassWeight(s *goquery.Selection) float32 { weight := 0 - class, _ := s.Attr("class") - id, _ := s.Attr("id") - if class != "" { + if class, ok := s.Attr("class"); ok { class = strings.ToLower(class) if negativeRegexp.MatchString(class) { weight -= 25 @@ -291,7 +297,7 @@ func getClassWeight(s *goquery.Selection) float32 { } } - if id != "" { + if id, ok := s.Attr("id"); ok { id = strings.ToLower(id) if negativeRegexp.MatchString(id) { weight -= 25 diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go index 8baee1a0..e6deb889 100644 --- a/internal/reader/readability/readability_test.go +++ b/internal/reader/readability/readability_test.go @@ -4,6 +4,8 @@ package readability // import "miniflux.app/v2/internal/reader/readability" import ( + "bytes" + "os" "strings" "testing" ) @@ -161,3 +163,22 @@ func TestRemoveBlacklist(t *testing.T) { t.Errorf(`Invalid content, got %s instead of %s`, content, want) } } + +func BenchmarkExtractContent(b *testing.B) { + var testCases = map[string][]byte{ + "miniflux_github.html": {}, + 
"miniflux_wikipedia.html": {}, + } + for filename := range testCases { + data, err := os.ReadFile("testdata/" + filename) + if err != nil { + b.Fatalf(`Unable to read file %q: %v`, filename, err) + } + testCases[filename] = data + } + for range b.N { + for _, v := range testCases { + ExtractContent(bytes.NewReader(v)) + } + } +} diff --git a/internal/reader/readability/testdata b/internal/reader/readability/testdata new file mode 120000 index 00000000..83507da4 --- /dev/null +++ b/internal/reader/readability/testdata @@ -0,0 +1 @@ +../../reader/sanitizer/testdata/ \ No newline at end of file From a06657b74d92bde0909acc8563df3f1aaa10a710 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Mon, 16 Dec 2024 04:41:43 +0100 Subject: [PATCH 19/31] Factorise a line in internal/ui/static/js/app.js --- internal/ui/static/js/app.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/internal/ui/static/js/app.js b/internal/ui/static/js/app.js index f4bfbff1..d2732fff 100644 --- a/internal/ui/static/js/app.js +++ b/internal/ui/static/js/app.js @@ -751,11 +751,10 @@ function checkShareAPI(title, url) { title: title, url: url }); - window.location.reload(); } catch (err) { console.error(err); - window.location.reload(); } + window.location.reload(); } function getCsrfToken() { From 7939b543410caa6d608a82b09dd6a654a2055546 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Mon, 16 Dec 2024 22:53:38 +0100 Subject: [PATCH 20/31] Resize favicons to 32x32 to account of scaling As suggested by @michaelkuhn in https://github.com/miniflux/v2/pull/2998#issuecomment-2546702212 --- internal/reader/icon/finder.go | 4 ++-- internal/reader/icon/finder_test.go | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/reader/icon/finder.go b/internal/reader/icon/finder.go index e0efc90b..21d179bb 100644 --- a/internal/reader/icon/finder.go +++ b/internal/reader/icon/finder.go @@ -206,7 +206,7 @@ func resizeIcon(icon *model.Icon) *model.Icon { slog.Warn("unable to 
decode the metadata of the icon", slog.Any("error", err)) return icon } - if config.Height <= 16 && config.Width <= 16 { + if config.Height <= 32 && config.Width <= 32 { slog.Debug("icon don't need to be rescaled", slog.Int("height", config.Height), slog.Int("width", config.Width)) return icon } @@ -227,7 +227,7 @@ func resizeIcon(icon *model.Icon) *model.Icon { return icon } - dst := image.NewRGBA(image.Rect(0, 0, 16, 16)) + dst := image.NewRGBA(image.Rect(0, 0, 32, 32)) draw.BiLinear.Scale(dst, dst.Rect, src, src.Bounds(), draw.Over, nil) var b bytes.Buffer diff --git a/internal/reader/icon/finder_test.go b/internal/reader/icon/finder_test.go index 1cd632af..3a06e35f 100644 --- a/internal/reader/icon/finder_test.go +++ b/internal/reader/icon/finder_test.go @@ -146,7 +146,7 @@ func TestResizeIconSmallGif(t *testing.T) { } func TestResizeIconPng(t *testing.T) { - data, err := base64.StdEncoding.DecodeString("iVBORw0KGgoAAAANSUhEUgAAABEAAAARCAYAAAA7bUf6AAAAHElEQVR42mP8z/C/noFCwDhqyKgho4aMGkIlQwBrHSpf28Yx+gAAAABJRU5ErkJggg==") + data, err := base64.StdEncoding.DecodeString("iVBORw0KGgoAAAANSUhEUgAAACEAAAAhCAYAAABX5MJvAAAALUlEQVR42u3OMQEAAAgDoJnc6BpjDyRgcrcpGwkJCQkJCQkJCQkJCQkJCYmyB7NfUj/Kk4FkAAAAAElFTkSuQmCC") if err != nil { t.Fatal(err) } @@ -157,7 +157,7 @@ func TestResizeIconPng(t *testing.T) { resizedIcon := resizeIcon(&icon) if bytes.Equal(data, resizedIcon.Content) { - t.Fatalf("Didn't convert png of 17x17") + t.Fatalf("Didn't convert png of 33x33") } config, _, err := image.DecodeConfig(bytes.NewReader(resizedIcon.Content)) @@ -165,7 +165,7 @@ func TestResizeIconPng(t *testing.T) { t.Fatalf("Couln't decode resulting png: %v", err) } - if config.Height != 16 || config.Width != 16 { + if config.Height != 32 || config.Width != 32 { t.Fatalf("Was expecting an image of 16x16, got %dx%d", config.Width, config.Height) } } From 7346d751cc0cf14c10cfe3a281304c3f12483552 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> 
Date: Mon, 16 Dec 2024 22:07:19 +0000 Subject: [PATCH 21/31] build(deps): bump library/alpine in /packaging/docker/alpine Bumps library/alpine from 3.20 to 3.21. --- updated-dependencies: - dependency-name: library/alpine dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- packaging/docker/alpine/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/docker/alpine/Dockerfile b/packaging/docker/alpine/Dockerfile index 3988e2b3..43a1b9e1 100644 --- a/packaging/docker/alpine/Dockerfile +++ b/packaging/docker/alpine/Dockerfile @@ -4,7 +4,7 @@ ADD . /go/src/app WORKDIR /go/src/app RUN make miniflux -FROM docker.io/library/alpine:3.20 +FROM docker.io/library/alpine:3.21 LABEL org.opencontainers.image.title=Miniflux LABEL org.opencontainers.image.description="Miniflux is a minimalist and opinionated feed reader" From bca9bea67677e21bc5e16bdaba77b1639927832e Mon Sep 17 00:00:00 2001 From: "Sevi.C" <91365763+Sevichecc@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:38:20 +0800 Subject: [PATCH 22/31] feat: add date-based entry filtering rules --- internal/reader/processor/processor.go | 53 +++++++++++++++++++-- internal/reader/processor/processor_test.go | 6 +++ internal/validator/user.go | 2 +- 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/internal/reader/processor/processor.go b/internal/reader/processor/processor.go index 0bdf0f61..3c824b66 100644 --- a/internal/reader/processor/processor.go +++ b/internal/reader/processor/processor.go @@ -10,6 +10,9 @@ import ( "strings" "time" + "github.com/tdewolff/minify/v2" + "github.com/tdewolff/minify/v2/html" + "miniflux.app/v2/internal/config" "miniflux.app/v2/internal/metric" "miniflux.app/v2/internal/model" @@ -20,9 +23,6 @@ import ( "miniflux.app/v2/internal/reader/scraper" "miniflux.app/v2/internal/reader/urlcleaner" "miniflux.app/v2/internal/storage" - - "github.com/tdewolff/minify/v2" - 
"github.com/tdewolff/minify/v2/html" ) var customReplaceRuleRegex = regexp.MustCompile(`rewrite\("([^"]+)"\|"([^"]+)"\)`) @@ -141,6 +141,9 @@ func isBlockedEntry(feed *model.Feed, entry *model.Entry, user *model.User) bool var match bool switch parts[0] { + case "EntryDate": + datePattern := parts[1] + match = isDateMatchingPattern(entry.Date, datePattern) case "EntryTitle": match, _ = regexp.MatchString(parts[1], entry.Title) case "EntryURL": @@ -211,6 +214,9 @@ func isAllowedEntry(feed *model.Feed, entry *model.Entry, user *model.User) bool var match bool switch parts[0] { + case "EntryDate": + datePattern := parts[1] + match = isDateMatchingPattern(entry.Date, datePattern) case "EntryTitle": match, _ = regexp.MatchString(parts[1], entry.Title) case "EntryURL": @@ -462,3 +468,44 @@ func minifyEntryContent(entryContent string) string { return entryContent } + +func isDateMatchingPattern(entryDate time.Time, pattern string) bool { + if pattern == "future" { + return entryDate.After(time.Now()) + } + + parts := strings.SplitN(pattern, ":", 2) + if len(parts) != 2 { + return false + } + + operator := parts[0] + dateStr := parts[1] + + switch operator { + case "before": + targetDate, err := time.Parse("2006-01-02", dateStr) + if err != nil { + return false + } + return entryDate.Before(targetDate) + case "after": + targetDate, err := time.Parse("2006-01-02", dateStr) + if err != nil { + return false + } + return entryDate.After(targetDate) + case "between": + dates := strings.Split(dateStr, ",") + if len(dates) != 2 { + return false + } + startDate, err1 := time.Parse("2006-01-02", dates[0]) + endDate, err2 := time.Parse("2006-01-02", dates[1]) + if err1 != nil || err2 != nil { + return false + } + return entryDate.After(startDate) && entryDate.Before(endDate) + } + return false +} diff --git a/internal/reader/processor/processor_test.go b/internal/reader/processor/processor_test.go index 2a594a4a..9e228366 100644 --- a/internal/reader/processor/processor_test.go +++ 
b/internal/reader/processor/processor_test.go @@ -75,6 +75,12 @@ func TestAllowEntries(t *testing.T) { {&model.Feed{ID: 1, BlocklistRules: ""}, &model.Entry{Author: "Example", Tags: []string{"example", "something else"}}, &model.User{KeepFilterEntryRules: "EntryAuthor=(?i)example\nEntryTag=(?i)Test"}, true}, {&model.Feed{ID: 1, BlocklistRules: ""}, &model.Entry{Author: "Different", Tags: []string{"example", "something else"}}, &model.User{KeepFilterEntryRules: "EntryAuthor=(?i)example\nEntryTag=(?i)example"}, true}, {&model.Feed{ID: 1, BlocklistRules: ""}, &model.Entry{Author: "Different", Tags: []string{"example", "something else"}}, &model.User{KeepFilterEntryRules: "EntryAuthor=(?i)example\nEntryTag=(?i)Test"}, false}, + {&model.Feed{ID: 1, BlocklistRules: ""}, &model.Entry{Date: time.Now().Add(24 * time.Hour)}, &model.User{KeepFilterEntryRules: "EntryDate=future"}, true}, + {&model.Feed{ID: 1, BlocklistRules: ""}, &model.Entry{Date: time.Now().Add(-24 * time.Hour)}, &model.User{KeepFilterEntryRules: "EntryDate=future"}, false}, + {&model.Feed{ID: 1, BlocklistRules: ""}, &model.Entry{Date: time.Date(2024, 3, 14, 0, 0, 0, 0, time.UTC)}, &model.User{KeepFilterEntryRules: "EntryDate=before:2024-03-15"}, true}, + {&model.Feed{ID: 1, BlocklistRules: ""}, &model.Entry{Date: time.Date(2024, 3, 16, 0, 0, 0, 0, time.UTC)}, &model.User{KeepFilterEntryRules: "EntryDate=after:2024-03-15"}, true}, + {&model.Feed{ID: 1, BlocklistRules: ""}, &model.Entry{Date: time.Date(2024, 3, 10, 0, 0, 0, 0, time.UTC)}, &model.User{KeepFilterEntryRules: "EntryDate=between:2024-03-01,2024-03-15"}, true}, + {&model.Feed{ID: 1, BlocklistRules: ""}, &model.Entry{Date: time.Date(2024, 2, 28, 0, 0, 0, 0, time.UTC)}, &model.User{KeepFilterEntryRules: "EntryDate=between:2024-03-01,2024-03-15"}, false}, } for _, tc := range scenarios { diff --git a/internal/validator/user.go b/internal/validator/user.go index b461f912..a7e05edb 100644 --- a/internal/validator/user.go +++ b/internal/validator/user.go 
@@ -219,7 +219,7 @@ func validateMediaPlaybackRate(mediaPlaybackRate float64) *locale.LocalizedError func isValidFilterRules(filterEntryRules string, filterType string) *locale.LocalizedError { // Valid Format: FieldName=RegEx\nFieldName=RegEx... - fieldNames := []string{"EntryTitle", "EntryURL", "EntryCommentsURL", "EntryContent", "EntryAuthor", "EntryTag"} + fieldNames := []string{"EntryTitle", "EntryURL", "EntryCommentsURL", "EntryContent", "EntryAuthor", "EntryTag", "EntryDate"} rules := strings.Split(filterEntryRules, "\n") for i, rule := range rules { From 276b2d8b0bdc8bf798fcb9379d65412bf3650940 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 18 Dec 2024 22:01:23 +0000 Subject: [PATCH 23/31] build(deps): bump golang.org/x/net from 0.32.0 to 0.33.0 Bumps [golang.org/x/net](https://github.com/golang/net) from 0.32.0 to 0.33.0. - [Commits](https://github.com/golang/net/compare/v0.32.0...v0.33.0) --- updated-dependencies: - dependency-name: golang.org/x/net dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index ec3e858b..50c78a06 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/tdewolff/minify/v2 v2.21.2 golang.org/x/crypto v0.31.0 golang.org/x/image v0.23.0 - golang.org/x/net v0.32.0 + golang.org/x/net v0.33.0 golang.org/x/oauth2 v0.24.0 golang.org/x/term v0.27.0 golang.org/x/text v0.21.0 diff --git a/go.sum b/go.sum index f580e312..68a5ed1c 100644 --- a/go.sum +++ b/go.sum @@ -79,8 +79,8 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI= -golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs= +golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= From bd91e5f320fb79e0d8109dca918664489956f005 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Fri, 20 Dec 2024 15:33:29 +0100 Subject: [PATCH 24/31] Add more referer spoofing Based on #2261. 
For moyu.im/jandan.net, see https://github.com/DIYgod/RSSHub/issues/11528 --- internal/reader/rewrite/rules.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/internal/reader/rewrite/rules.go b/internal/reader/rewrite/rules.go index 4a77b1ff..ee5b298e 100644 --- a/internal/reader/rewrite/rules.go +++ b/internal/reader/rewrite/rules.go @@ -50,6 +50,8 @@ func GetRefererForURL(u string) string { } switch parsedUrl.Hostname() { + case "moyu.im": + return "https://i.jandan.net" case "i.pximg.net": return "https://www.pixiv.net" case "sp1.piokok.com": @@ -60,6 +62,10 @@ func GetRefererForURL(u string) string { return "https://weibo.com" case "img.hellogithub.com": return "https://hellogithub.com" + case "bjp.org.cn": + return "https://bjp.org.cn" + case "appinn.com": + return "https://appinn.com" } switch { @@ -67,6 +73,8 @@ func GetRefererForURL(u string) string { return "https://weibo.com" case strings.HasSuffix(parsedUrl.Hostname(), ".cdninstagram.com"): return "https://www.instagram.com" + case strings.HasSuffix(parsedUrl.Hostname(), ".moyu.im"): + return "https://i.jandan.net" } return "" From d345c8737612c57b50935a4093f57c1284242ebd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Fri, 20 Dec 2024 13:05:52 -0800 Subject: [PATCH 25/31] docs(changelog): update release notes for version 2.2.4 --- ChangeLog | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ miniflux.1 | 2 +- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 3c273451..4a4d699e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,64 @@ +Version 2.2.4 (December 20, 2024) +--------------------------------- + +* test(rewrite): add unit test for referer rewrite function +* refactor(subscription): use `strings.HasSuffix` instead of a regex in `FindSubscriptionsFromYouTubePlaylistPage` +* refactor(sanitizer): use `token.String()` instead of `html.EscapeString(token.Data)` +* refactor(sanitizer): simplify `isValidTag` +* 
refactor(sanitizer): simplify `hasRequiredAttributes` +* refactor(sanitizer): remove condition because `config.Opts` is guaranteed to never be nil +* refactor(sanitizer): remove a now-useless function after refactoring +* refactor(sanitizer): refactor conditions to highlight their similitude, enabling further refactoring +* refactor(sanitizer): optimize `strip_tags.go` +* refactor(sanitizer): micro-optimizations of `srcset.go` +* refactor(sanitizer): merge two conditions +* refactor(sanitizer): inline a function in `sanitizeAttributes` and fix a bug in it +* refactor(sanitizer): inline a condition in `sanitizeSrcsetAttr` +* refactor(sanitizer): improve `rewriteIframeURL()` +* refactor(sanitizer): Google+ isn't a thing anymore +* refactor(sanitizer): change the scope of a variable +* refactor(rewriter): replace regex with URL parsing for referrer override +* refactor(rewriter): avoid the use of regex in `addDynamicImage` +* refactor(rewrite): remove unused function arguments +* refactor(readability): various improvements and optimizations +* refactor(readability): simplify the regexes in `readability.go` +* refactor(processor): use URL parsing instead of a regex +* refactor(processor): improve the `rewrite` URL rule regex +* refactor(locale): delay parsing of translations until they're used +* refactor(js): factorise a line in `app.js` +* refactor(handler): delay `store.UserByID()` as much as possible +* refactor(css): replace `-ms-text-size-adjust` with `text-size-adjust` +* refactor(css): remove `-webkit-clip-path` +* refactor(css): factorise `.pagination-next` and `.pagination-last` together +* refactor: use a better construct than `doc.Find(…).First()` +* refactor: use `min/max` instead of `math.Min/math.Max` +* refactor: refactor `internal/reader/readability/testdata` +* refactor: optimize `sanitizeAttributes` +* refactor: get rid of `numberOfPluralFormsPerLanguage` test-only variable +* fix(storage): replace timezone function call with view +* 
fix(consistency): align feed modification behavior between API and UI +* fix(ci): fix grammar in pull-request template +* fix: load icon from site URL instead of feed URL +* fix: feed icon from xml ignored during force refresh +* feat(rewrite)!: remove `parse_markdown` rewrite rule +* feat(mediaproxy): update predefined referer spoofing rules for restricted media resources +* feat(locale): update translations to clarify readeck URL instead of readeck API endpoint +* feat(locale): update German translations +* feat(locale): update Chinese translations +* feat(apprise): update `SendNotification` to handle multiple entries and add logging +* feat(apprise): add title in notification request body +* feat: resize favicons before storing them in the database +* feat: optionally fetch watch time from YouTube API instead of website +* feat: only show the commit URL if it's not empty on `/about` +* feat: add predefined scraper rules for `arstechnica.com` +* feat: add date-based entry filtering rules +* chore: remove `blog.laravel.com` rewrite rule +* build(deps): bump `library/alpine` in `/packaging/docker/alpine` to `3.21` +* build(deps): bump `golang.org/x/term` from `0.26.0` to `0.27.0` +* build(deps): bump `golang.org/x/net` from `0.31.0` to `0.33.0` +* build(deps): bump `golang.org/x/crypto` from `0.30.0` to `0.31.0` +* build(deps): bump `github.com/tdewolff/minify/v2` from `2.21.1` to `2.21.2` + Version 2.2.3 (November 10, 2024) --------------------------------- diff --git a/miniflux.1 b/miniflux.1 index 528b43fd..05093890 100644 --- a/miniflux.1 +++ b/miniflux.1 @@ -1,5 +1,5 @@ .\" Manpage for miniflux. 
-.TH "MINIFLUX" "1" "October 26, 2024" "\ \&" "\ \&" +.TH "MINIFLUX" "1" "December 7, 2024" "\ \&" "\ \&" .SH NAME miniflux \- Minimalist and opinionated feed reader From 28fe05332989bdf41d700c3efed1ee5e9c362eb5 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Mon, 23 Dec 2024 06:00:06 +0000 Subject: [PATCH 26/31] ci: don't specify languages for CodeQL As stated in the [documentation](https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning#changing-the-languages-that-are-analyzed): > CodeQL code scanning automatically detects code written in the supported languages. This will also reduce the number of CodeQL jobs from two to one. See #3029 --- .github/workflows/codeql-analysis.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index f9df2dd0..b2eb4d95 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -22,8 +22,6 @@ jobs: strategy: fail-fast: false - matrix: - language: [ 'go', 'javascript' ] steps: - name: Checkout repository From 057f7601960a5390f6319c37b53c7ecf4ce30705 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Mon, 23 Dec 2024 06:04:00 +0000 Subject: [PATCH 27/31] ci: don't run `go vet ./...` as it's run as part of `golangci-lint` See https://golangci-lint.run/usage/linters/#govet --- .github/workflows/linters.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/linters.yml b/.github/workflows/linters.yml index 695dea78..c26957ab 100644 --- a/.github/workflows/linters.yml +++ b/.github/workflows/linters.yml @@ -29,7 +29,6 @@ jobs: - uses: actions/setup-go@v5 with: go-version: "1.23.x" - - run: "go vet ./..." 
- uses: golangci/golangci-lint-action@v6 with: args: > From b93543f416f031ac64ef8867d6205353b8ab4e07 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Mon, 23 Dec 2024 19:34:29 +0000 Subject: [PATCH 28/31] feat: replace `%{?systemd_requires}` with `%{?systemd_ordering}` As said [in the documentation](https://docs.fedoraproject.org/en-US/packaging-guidelines/Scriptlets/#_dependencies_on_the_systemd_package): > If the package wants to use systemd tools if they are available, but does not want to declare a dependency, then the `%{?systemd_ordering}` macro MAY be used as a weaker form of %{?systemd_requires} that only declares an ordering during an RPM transaction. See https://github.com/systemd/systemd/commit/2424b6bd716f0c1c3bf3406b1fd1a16ba1b6a556 and https://pagure.io/packaging-committee/issue/644 for more information. And also use `--setopt=install_weak_deps=False` to avoid installing a lot of useless dependencies. --- packaging/rpm/Dockerfile | 2 +- packaging/rpm/miniflux.spec | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/packaging/rpm/Dockerfile b/packaging/rpm/Dockerfile index 87eae784..58cbd46d 100644 --- a/packaging/rpm/Dockerfile +++ b/packaging/rpm/Dockerfile @@ -5,7 +5,7 @@ WORKDIR /go/src/app RUN make miniflux FROM rockylinux:9 -RUN dnf install -y rpm-build systemd +RUN dnf install --setopt=install_weak_deps=False -y rpm-build systemd-rpm-macros RUN mkdir -p /root/rpmbuild/{BUILD,RPMS,SOURCES,SPECS,SRPMS} RUN echo "%_topdir /root/rpmbuild" >> .rpmmacros COPY --from=build /go/src/app/miniflux /root/rpmbuild/SOURCES/miniflux diff --git a/packaging/rpm/miniflux.spec b/packaging/rpm/miniflux.spec index e5617215..827ecac5 100644 --- a/packaging/rpm/miniflux.spec +++ b/packaging/rpm/miniflux.spec @@ -16,8 +16,7 @@ BuildRoot: %{_topdir}/BUILD/%{name}-%{version}-%{release} BuildArch: x86_64 Requires(pre): shadow-utils -%{?systemd_requires} -BuildRequires: systemd +%{?systemd_ordering} AutoReqProv: no From 
f52411f734e4294683bcfd9ea4ba24c9e09a5615 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Mon, 23 Dec 2024 19:45:45 +0000 Subject: [PATCH 29/31] ci: only run `-race -cover` on Ubuntu The coverage information isn't used anywhere in the CI, so no need to have it for every OS. As for `-race`, there is no point in using it everywhere, one time should be enough, especially since it's taking a lot of time on Windows. --- .github/workflows/tests.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ff6d16da..a471ad7a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,8 +23,12 @@ jobs: go-version: ${{ matrix.go-version }} - name: Checkout uses: actions/checkout@v4 - - name: Run unit tests + - name: Run unit tests with coverage and race conditions checking + if: matrix.os == 'ubuntu-latest' run: make test + - name: Run unit tests without coverage and race conditions checking + if: matrix.os != 'ubuntu-latest' + run: go test -count=1 ./... integration-tests: name: Integration Tests From 195b75d1859fcd8d15e6e3152a210789c23df2f4 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Tue, 24 Dec 2024 05:16:02 +0000 Subject: [PATCH 30/31] refactor(rewriter): use custom title case converter implementation instead of `golang.org/x/text/cases.Title()` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The implementation is equivalent to `cases.Title(language.English).String(strings.ToLower(…))`, and this is the only place in miniflux where "golang.org/x/text/cases" and "golang.org/x/text/language" are (directly) used. This reduces the binary size from 27015590 to 26686112 on my machine. Kudos to https://gsa.zxilly.dev for making it straightforward to catch things like this. 
--- go.mod | 2 +- internal/reader/rewrite/rewrite_functions.go | 19 +++++++++++++++++++ internal/reader/rewrite/rewriter.go | 5 +---- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 50c78a06..38341a40 100644 --- a/go.mod +++ b/go.mod @@ -17,7 +17,6 @@ require ( golang.org/x/net v0.33.0 golang.org/x/oauth2 v0.24.0 golang.org/x/term v0.27.0 - golang.org/x/text v0.21.0 ) require ( @@ -42,6 +41,7 @@ require ( github.com/tdewolff/parse/v2 v2.7.19 // indirect github.com/x448/float16 v0.8.4 // indirect golang.org/x/sys v0.28.0 // indirect + golang.org/x/text v0.21.0 // indirect google.golang.org/protobuf v1.34.2 // indirect ) diff --git a/internal/reader/rewrite/rewrite_functions.go b/internal/reader/rewrite/rewrite_functions.go index a696c22a..1b48eb9b 100644 --- a/internal/reader/rewrite/rewrite_functions.go +++ b/internal/reader/rewrite/rewrite_functions.go @@ -11,6 +11,7 @@ import ( "net/url" "regexp" "strings" + "unicode" "miniflux.app/v2/internal/config" @@ -26,6 +27,24 @@ var ( textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`) ) +// titlelize returns a copy of the string s with all Unicode letters that begin words +// mapped to their Unicode title case. +func titlelize(s string) string { + // A closure is used here to remember the previous character + // so that we can check if there is a space preceding the current + // character. 
+ previous := ' ' + return strings.Map( + func(current rune) rune { + if unicode.IsSpace(previous) { + previous = current + return unicode.ToTitle(current) + } + previous = current + return current + }, strings.ToLower(s)) +} + func addImageTitle(entryContent string) string { doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent)) if err != nil { diff --git a/internal/reader/rewrite/rewriter.go b/internal/reader/rewrite/rewriter.go index 4f8ee951..e2c26b6c 100644 --- a/internal/reader/rewrite/rewriter.go +++ b/internal/reader/rewrite/rewriter.go @@ -11,9 +11,6 @@ import ( "miniflux.app/v2/internal/model" "miniflux.app/v2/internal/urllib" - - "golang.org/x/text/cases" - "golang.org/x/text/language" ) type rule struct { @@ -94,7 +91,7 @@ func (rule rule) applyRule(entryURL string, entry *model.Entry) { case "remove_tables": entry.Content = removeTables(entry.Content) case "remove_clickbait": - entry.Title = cases.Title(language.English).String(strings.ToLower(entry.Title)) + entry.Title = titlelize(entry.Title) } } From f3989cdb2fdfba810d7eb98e5f64819ee3ad0a86 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Tue, 24 Dec 2024 05:24:22 +0000 Subject: [PATCH 31/31] ci: checkout before installing Go Obtaining the code before deploying go allows better caching, as the go.sum file becomes available. 
See https://github.com/actions/setup-go/issues/281 --- .github/workflows/build_binaries.yml | 4 ++-- .github/workflows/tests.yml | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build_binaries.yml b/.github/workflows/build_binaries.yml index 7de6e733..62ff3a4e 100644 --- a/.github/workflows/build_binaries.yml +++ b/.github/workflows/build_binaries.yml @@ -9,13 +9,13 @@ jobs: name: Build runs-on: ubuntu-latest steps: + - name: Checkout + uses: actions/checkout@v4 - name: Set up Golang uses: actions/setup-go@v5 with: go-version: "1.23.x" check-latest: true - - name: Checkout - uses: actions/checkout@v4 - name: Compile binaries env: CGO_ENABLED: 0 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a471ad7a..1d4af838 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -17,18 +17,18 @@ jobs: os: [ubuntu-latest, windows-latest, macOS-latest] go-version: ["1.23.x"] steps: + - name: Checkout + uses: actions/checkout@v4 - name: Set up Go uses: actions/setup-go@v5 with: go-version: ${{ matrix.go-version }} - - name: Checkout - uses: actions/checkout@v4 - name: Run unit tests with coverage and race conditions checking if: matrix.os == 'ubuntu-latest' run: make test - name: Run unit tests without coverage and race conditions checking if: matrix.os != 'ubuntu-latest' - run: go test -count=1 ./... + run: go test ./... integration-tests: name: Integration Tests @@ -44,12 +44,12 @@ jobs: - 5432:5432 options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 steps: + - name: Checkout + uses: actions/checkout@v4 - name: Set up Go uses: actions/setup-go@v5 with: go-version: "1.23.x" - - name: Checkout - uses: actions/checkout@v4 - name: Install Postgres client run: sudo apt update && sudo apt install -y postgresql-client - name: Run integration tests