1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-10-05 19:31:01 +00:00
miniflux-v2/internal/reader/rewrite/content_rewrite_functions.go
Axel Verhaeghe b8bc367a00
feat(rewrite): add remove_img_blur_params rule
Adds a new content rewrite rule to strip image URL query parameters from blurred images.

This addresses issues with sites like Belgian national news that use blurry placeholder images which get replaced with high-quality versions, allowing Miniflux to fetch the original images instead of the placeholders.
2025-10-01 20:41:08 -07:00

590 lines
14 KiB
Go

// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package rewrite // import "miniflux.app/v2/internal/reader/rewrite"
import (
"encoding/base64"
"fmt"
"html"
"log/slog"
"net/url"
"regexp"
"strconv"
"strings"
"unicode"
"miniflux.app/v2/internal/config"
nethtml "golang.org/x/net/html"
"github.com/PuerkitoBio/goquery"
)
var (
youtubeIdRegex = regexp.MustCompile(`youtube_id"?\s*[:=]\s*"([a-zA-Z0-9_-]{11})"`)
textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`)
)
// titlelize returns a copy of the string s with all Unicode letters that begin words
// mapped to their Unicode title case.
func titlelize(s string) string {
// A closure is used here to remember the previous character
// so that we can check if there is a space preceding the current
// character.
previous := ' '
return strings.Map(
func(current rune) rune {
if unicode.IsSpace(previous) {
previous = current
return unicode.ToTitle(current)
}
previous = current
return current
}, strings.ToLower(s))
}
func addImageTitle(entryContent string) string {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
if err != nil {
return entryContent
}
matches := doc.Find("img[src][title]")
if matches.Length() > 0 {
matches.Each(func(i int, img *goquery.Selection) {
altAttr := img.AttrOr("alt", "")
srcAttr, _ := img.Attr("src")
titleAttr, _ := img.Attr("title")
img.ReplaceWithHtml(`<figure><img src="` + srcAttr + `" alt="` + altAttr + `"/><figcaption><p>` + html.EscapeString(titleAttr) + `</p></figcaption></figure>`)
})
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
return output
}
return entryContent
}
func addMailtoSubject(entryContent string) string {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
if err != nil {
return entryContent
}
matches := doc.Find(`a[href^="mailto:"]`)
if matches.Length() > 0 {
matches.Each(func(i int, a *goquery.Selection) {
hrefAttr, _ := a.Attr("href")
mailto, err := url.Parse(hrefAttr)
if err != nil {
return
}
subject := mailto.Query().Get("subject")
if subject == "" {
return
}
a.AppendHtml(" [" + html.EscapeString(subject) + "]")
})
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
return output
}
return entryContent
}
func addDynamicImage(entryContent string) string {
parserHtml, err := nethtml.ParseWithOptions(strings.NewReader(entryContent), nethtml.ParseOptionEnableScripting(false))
if err != nil {
return entryContent
}
doc := goquery.NewDocumentFromNode(parserHtml)
// Ordered most preferred to least preferred.
candidateAttrs := [...]string{
"data-src",
"data-original",
"data-orig",
"data-url",
"data-orig-file",
"data-large-file",
"data-medium-file",
"data-original-mos",
"data-2000src",
"data-1000src",
"data-800src",
"data-655src",
"data-500src",
"data-380src",
}
candidateSrcsetAttrs := [...]string{
"data-srcset",
}
changed := false
doc.Find("img,div").Each(func(i int, img *goquery.Selection) {
// Src-linked candidates
for _, candidateAttr := range candidateAttrs {
if srcAttr, found := img.Attr(candidateAttr); found {
changed = true
if img.Is("img") {
img.SetAttr("src", srcAttr)
} else {
altAttr := img.AttrOr("alt", "")
img.ReplaceWithHtml(`<img src="` + srcAttr + `" alt="` + altAttr + `"/>`)
}
break
}
}
// Srcset-linked candidates
for _, candidateAttr := range candidateSrcsetAttrs {
if srcAttr, found := img.Attr(candidateAttr); found {
changed = true
if img.Is("img") {
img.SetAttr("srcset", srcAttr)
} else {
altAttr := img.AttrOr("alt", "")
img.ReplaceWithHtml(`<img srcset="` + srcAttr + `" alt="` + altAttr + `"/>`)
}
break
}
}
})
if !changed {
doc.Find("noscript").Each(func(i int, noscript *goquery.Selection) {
if img := noscript.Find("img"); img.Length() == 1 {
img.Unwrap()
changed = true
}
})
}
if changed {
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
return output
}
return entryContent
}
func addDynamicIframe(entryContent string) string {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
if err != nil {
return entryContent
}
// Ordered most preferred to least preferred.
candidateAttrs := []string{
"data-src",
"data-original",
"data-orig",
"data-url",
"data-lazy-src",
}
changed := false
doc.Find("iframe").Each(func(i int, iframe *goquery.Selection) {
for _, candidateAttr := range candidateAttrs {
if srcAttr, found := iframe.Attr(candidateAttr); found {
changed = true
iframe.SetAttr("src", srcAttr)
break
}
}
})
if changed {
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
return output
}
return entryContent
}
func fixMediumImages(entryContent string) string {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
if err != nil {
return entryContent
}
doc.Find("figure.paragraph-image").Each(func(i int, paragraphImage *goquery.Selection) {
noscriptElement := paragraphImage.Find("noscript")
if noscriptElement.Length() > 0 {
paragraphImage.ReplaceWithHtml(noscriptElement.Text())
}
})
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
return output
}
func useNoScriptImages(entryContent string) string {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
if err != nil {
return entryContent
}
doc.Find("figure").Each(func(i int, figureElement *goquery.Selection) {
imgElement := figureElement.Find("img")
if imgElement.Length() > 0 {
noscriptElement := figureElement.Find("noscript")
if noscriptElement.Length() > 0 {
figureElement.PrependHtml(noscriptElement.Text())
imgElement.Remove()
noscriptElement.Remove()
}
}
})
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
return output
}
func getYoutubVideoIDFromURL(entryURL string) string {
u, err := url.Parse(entryURL)
if err != nil {
return ""
}
if !strings.HasSuffix(u.Hostname(), "youtube.com") {
return ""
}
if u.Path == "/watch" {
if v := u.Query().Get("v"); v != "" {
return v
}
return ""
}
if id, found := strings.CutPrefix(u.Path, "/shorts/"); found {
if len(id) == 11 {
// youtube shorts id are always 11 chars.
return id
}
}
return ""
}
func buildVideoPlayerIframe(absoluteVideoURL string) string {
return `<iframe width="650" height="350" frameborder="0" src="` + absoluteVideoURL + `" allowfullscreen></iframe>`
}
func addVideoPlayerIframe(absoluteVideoURL, entryContent string) string {
return buildVideoPlayerIframe(absoluteVideoURL) + `<br>` + entryContent
}
func addYoutubeVideoRewriteRule(entryURL, entryContent string) string {
if videoURL := getYoutubVideoIDFromURL(entryURL); videoURL != "" {
return addVideoPlayerIframe(config.Opts.YouTubeEmbedUrlOverride()+videoURL, entryContent)
}
return entryContent
}
func addYoutubeVideoUsingInvidiousPlayer(entryURL, entryContent string) string {
if videoURL := getYoutubVideoIDFromURL(entryURL); videoURL != "" {
return addVideoPlayerIframe(`https://`+config.Opts.InvidiousInstance()+`/embed/`+videoURL, entryContent)
}
return entryContent
}
// For reference: https://github.com/miniflux/v2/pull/1314
func addYoutubeVideoFromId(entryContent string) string {
matches := youtubeIdRegex.FindAllStringSubmatch(entryContent, -1)
if matches == nil {
return entryContent
}
videoPlayerHTML := ""
for _, match := range matches {
if len(match) == 2 {
videoPlayerHTML += buildVideoPlayerIframe(config.Opts.YouTubeEmbedUrlOverride()+match[1]) + "<br>"
}
}
return videoPlayerHTML + entryContent
}
func addInvidiousVideo(entryURL, entryContent string) string {
u, err := url.Parse(entryURL)
if err != nil {
return entryContent
}
if u.Path != "/watch" {
return entryContent
}
qs := u.Query()
videoID := qs.Get("v")
if videoID == "" {
return entryContent
}
qs.Del("v")
embedVideoURL := "https://" + u.Hostname() + `/embed/` + videoID
if len(qs) > 0 {
embedVideoURL += "?" + qs.Encode()
}
return addVideoPlayerIframe(embedVideoURL, entryContent)
}
func addPDFLink(entryURL, entryContent string) string {
if strings.HasSuffix(entryURL, ".pdf") {
return fmt.Sprintf(`<a href=%q>PDF</a><br>%s`, entryURL, entryContent)
}
return entryContent
}
func replaceTextLinks(input string) string {
return textLinkRegex.ReplaceAllString(input, `<a href="${1}">${1}</a>`)
}
func replaceCustom(entryContent string, searchTerm string, replaceTerm string) string {
re, err := regexp.Compile(searchTerm)
if err == nil {
return re.ReplaceAllString(entryContent, replaceTerm)
}
return entryContent
}
func removeCustom(entryContent string, selector string) string {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
if err != nil {
return entryContent
}
doc.Find(selector).Remove()
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
return output
}
func addCastopodEpisode(entryURL, entryContent string) string {
player := `<iframe width="650" frameborder="0" src="` + entryURL + `/embed/light"></iframe>`
return player + `<br>` + entryContent
}
func applyFuncOnTextContent(entryContent string, selector string, repl func(string) string) string {
var treatChildren func(i int, s *goquery.Selection)
treatChildren = func(i int, s *goquery.Selection) {
if s.Nodes[0].Type == nethtml.TextNode {
s.ReplaceWithHtml(repl(s.Nodes[0].Data))
} else {
s.Contents().Each(treatChildren)
}
}
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
if err != nil {
return entryContent
}
doc.Find(selector).Each(treatChildren)
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
return output
}
func decodeBase64Content(entryContent string) string {
if ret, err := base64.StdEncoding.DecodeString(strings.TrimSpace(entryContent)); err != nil {
return entryContent
} else {
return html.EscapeString(string(ret))
}
}
func addHackerNewsLinksUsing(entryContent, app string) string {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
if err != nil {
return entryContent
}
hn_prefix := "https://news.ycombinator.com/"
matches := doc.Find(`a[href^="` + hn_prefix + `"]`)
if matches.Length() > 0 {
matches.Each(func(i int, a *goquery.Selection) {
hrefAttr, _ := a.Attr("href")
hn_uri, err := url.Parse(hrefAttr)
if err != nil {
return
}
switch app {
case "opener":
params := url.Values{}
params.Add("url", hn_uri.String())
url := url.URL{
Scheme: "opener",
Host: "x-callback-url",
Path: "show-options",
RawQuery: params.Encode(),
}
open_with_opener := `<a href="` + url.String() + `">Open with Opener</a>`
a.Parent().AppendHtml(" " + open_with_opener)
case "hack":
url := strings.Replace(hn_uri.String(), hn_prefix, "hack://", 1)
open_with_hack := `<a href="` + url + `">Open with HACK</a>`
a.Parent().AppendHtml(" " + open_with_hack)
default:
slog.Warn("Unknown app provided for openHackerNewsLinksWith rewrite rule",
slog.String("app", app),
)
return
}
})
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
return output
}
return entryContent
}
func removeTables(entryContent string) string {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
if err != nil {
return entryContent
}
selectors := []string{"table", "tbody", "thead", "td", "th", "td"}
var loopElement *goquery.Selection
for _, selector := range selectors {
for {
loopElement = doc.FindMatcher(goquery.Single(selector))
if loopElement.Length() == 0 {
break
}
innerHtml, err := loopElement.Html()
if err != nil {
break
}
loopElement.Parent().AppendHtml(innerHtml)
loopElement.Remove()
}
}
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
return output
}
func fixGhostCards(entryContent string) string {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
if err != nil {
return entryContent
}
const cardSelector = "figure.kg-card"
var currentList *goquery.Selection
doc.Find(cardSelector).Each(func(i int, s *goquery.Selection) {
title := s.Find(".kg-bookmark-title").First().Text()
author := s.Find(".kg-bookmark-author").First().Text()
href := s.Find("a.kg-bookmark-container").First().AttrOr("href", "")
// if there is no link or title, skip processing
if href == "" || title == "" {
return
}
link := ""
if author == "" || strings.HasSuffix(title, author) {
link = fmt.Sprintf("<a href=\"%s\">%s</a>", href, title)
} else {
link = fmt.Sprintf("<a href=\"%s\">%s - %s</a>", href, title, author)
}
next := s.Next()
// if the next element is also a card, start a list
if next.Is(cardSelector) && currentList == nil {
currentList = s.BeforeHtml("<ul></ul>").Prev()
}
if currentList != nil {
// add this card to the list, then delete it
currentList.AppendHtml("<li>" + link + "</li>")
s.Remove()
} else {
// replace single card
s.ReplaceWithHtml(link)
}
// if the next element is not a card, start a new list
if !next.Is(cardSelector) && currentList != nil {
currentList = nil
}
})
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
return strings.TrimSpace(output)
}
func removeImgBlurParams(entryContent string) string {
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
if err != nil {
return entryContent
}
changed := false
doc.Find("img[src]").Each(func(i int, img *goquery.Selection) {
srcAttr, exists := img.Attr("src")
if !exists {
return
}
parsedURL, err := url.Parse(srcAttr)
if err != nil {
return
}
// Only strip query parameters if this is a blurry placeholder image
if parsedURL.RawQuery != "" {
// Check if there's a blur parameter with a non-zero value
if blurValue := parsedURL.Query().Get("blur"); blurValue != "" {
if blurInt, err := strconv.Atoi(blurValue); err == nil && blurInt > 0 {
parsedURL.RawQuery = ""
img.SetAttr("src", parsedURL.String())
changed = true
}
}
}
})
if changed {
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
return output
}
return entryContent
}