mirror of
https://github.com/miniflux/v2.git
synced 2025-10-05 19:31:01 +00:00
Adds a new content rewrite rule to strip image URL query parameters from blurred images. This addresses issues with sites like Belgian national news that use blurry placeholder images which get replaced with high-quality versions, allowing Miniflux to fetch the original images instead of the placeholders.
590 lines
14 KiB
Go
590 lines
14 KiB
Go
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
|
|
package rewrite // import "miniflux.app/v2/internal/reader/rewrite"
|
|
|
|
import (
|
|
"encoding/base64"
|
|
"fmt"
|
|
"html"
|
|
"log/slog"
|
|
"net/url"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"unicode"
|
|
|
|
"miniflux.app/v2/internal/config"
|
|
|
|
nethtml "golang.org/x/net/html"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// Patterns compiled once at package initialisation and shared by the rewrite
// helpers in this file.
var (
	// youtubeIdRegex captures the 11-character identifier that follows a
	// `youtube_id` key (optionally closed by a double quote) and a ':' or
	// '=' separator, e.g. `youtube_id: "dQw4w9WgXcQ"`.
	youtubeIdRegex = regexp.MustCompile(`youtube_id"?\s*[:=]\s*"([a-zA-Z0-9_-]{11})"`)

	// textLinkRegex captures bare http:// and https:// URLs, matching
	// case-insensitively. The final character class omits ?!:,.; so a match
	// never ends on one of those punctuation characters.
	textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`)
)
|
|
|
|
// titlelize lowercases s and then maps the first rune of the string, and every
// rune that directly follows a whitespace rune, to its Unicode title case.
func titlelize(s string) string {
	// The mapper is stateful: atWordStart records whether the rune handled
	// just before the current one was whitespace. The beginning of the string
	// counts as a word boundary.
	atWordStart := true

	mapRune := func(r rune) rune {
		mapped := r
		if atWordStart {
			mapped = unicode.ToTitle(r)
		}
		atWordStart = unicode.IsSpace(r)
		return mapped
	}

	return strings.Map(mapRune, strings.ToLower(s))
}
|
|
|
|
func addImageTitle(entryContent string) string {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
|
|
matches := doc.Find("img[src][title]")
|
|
|
|
if matches.Length() > 0 {
|
|
matches.Each(func(i int, img *goquery.Selection) {
|
|
altAttr := img.AttrOr("alt", "")
|
|
srcAttr, _ := img.Attr("src")
|
|
titleAttr, _ := img.Attr("title")
|
|
|
|
img.ReplaceWithHtml(`<figure><img src="` + srcAttr + `" alt="` + altAttr + `"/><figcaption><p>` + html.EscapeString(titleAttr) + `</p></figcaption></figure>`)
|
|
})
|
|
|
|
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
|
|
return output
|
|
}
|
|
|
|
return entryContent
|
|
}
|
|
|
|
func addMailtoSubject(entryContent string) string {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
|
|
matches := doc.Find(`a[href^="mailto:"]`)
|
|
|
|
if matches.Length() > 0 {
|
|
matches.Each(func(i int, a *goquery.Selection) {
|
|
hrefAttr, _ := a.Attr("href")
|
|
|
|
mailto, err := url.Parse(hrefAttr)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
subject := mailto.Query().Get("subject")
|
|
if subject == "" {
|
|
return
|
|
}
|
|
|
|
a.AppendHtml(" [" + html.EscapeString(subject) + "]")
|
|
})
|
|
|
|
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
|
|
return output
|
|
}
|
|
|
|
return entryContent
|
|
}
|
|
|
|
func addDynamicImage(entryContent string) string {
|
|
parserHtml, err := nethtml.ParseWithOptions(strings.NewReader(entryContent), nethtml.ParseOptionEnableScripting(false))
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
doc := goquery.NewDocumentFromNode(parserHtml)
|
|
|
|
// Ordered most preferred to least preferred.
|
|
candidateAttrs := [...]string{
|
|
"data-src",
|
|
"data-original",
|
|
"data-orig",
|
|
"data-url",
|
|
"data-orig-file",
|
|
"data-large-file",
|
|
"data-medium-file",
|
|
"data-original-mos",
|
|
"data-2000src",
|
|
"data-1000src",
|
|
"data-800src",
|
|
"data-655src",
|
|
"data-500src",
|
|
"data-380src",
|
|
}
|
|
|
|
candidateSrcsetAttrs := [...]string{
|
|
"data-srcset",
|
|
}
|
|
|
|
changed := false
|
|
|
|
doc.Find("img,div").Each(func(i int, img *goquery.Selection) {
|
|
// Src-linked candidates
|
|
for _, candidateAttr := range candidateAttrs {
|
|
if srcAttr, found := img.Attr(candidateAttr); found {
|
|
changed = true
|
|
|
|
if img.Is("img") {
|
|
img.SetAttr("src", srcAttr)
|
|
} else {
|
|
altAttr := img.AttrOr("alt", "")
|
|
img.ReplaceWithHtml(`<img src="` + srcAttr + `" alt="` + altAttr + `"/>`)
|
|
}
|
|
|
|
break
|
|
}
|
|
}
|
|
|
|
// Srcset-linked candidates
|
|
for _, candidateAttr := range candidateSrcsetAttrs {
|
|
if srcAttr, found := img.Attr(candidateAttr); found {
|
|
changed = true
|
|
|
|
if img.Is("img") {
|
|
img.SetAttr("srcset", srcAttr)
|
|
} else {
|
|
altAttr := img.AttrOr("alt", "")
|
|
img.ReplaceWithHtml(`<img srcset="` + srcAttr + `" alt="` + altAttr + `"/>`)
|
|
}
|
|
|
|
break
|
|
}
|
|
}
|
|
})
|
|
|
|
if !changed {
|
|
doc.Find("noscript").Each(func(i int, noscript *goquery.Selection) {
|
|
if img := noscript.Find("img"); img.Length() == 1 {
|
|
img.Unwrap()
|
|
changed = true
|
|
}
|
|
})
|
|
}
|
|
|
|
if changed {
|
|
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
|
|
return output
|
|
}
|
|
|
|
return entryContent
|
|
}
|
|
|
|
func addDynamicIframe(entryContent string) string {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
|
|
// Ordered most preferred to least preferred.
|
|
candidateAttrs := []string{
|
|
"data-src",
|
|
"data-original",
|
|
"data-orig",
|
|
"data-url",
|
|
"data-lazy-src",
|
|
}
|
|
|
|
changed := false
|
|
|
|
doc.Find("iframe").Each(func(i int, iframe *goquery.Selection) {
|
|
for _, candidateAttr := range candidateAttrs {
|
|
if srcAttr, found := iframe.Attr(candidateAttr); found {
|
|
changed = true
|
|
|
|
iframe.SetAttr("src", srcAttr)
|
|
|
|
break
|
|
}
|
|
}
|
|
})
|
|
|
|
if changed {
|
|
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
|
|
return output
|
|
}
|
|
|
|
return entryContent
|
|
}
|
|
|
|
func fixMediumImages(entryContent string) string {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
|
|
doc.Find("figure.paragraph-image").Each(func(i int, paragraphImage *goquery.Selection) {
|
|
noscriptElement := paragraphImage.Find("noscript")
|
|
if noscriptElement.Length() > 0 {
|
|
paragraphImage.ReplaceWithHtml(noscriptElement.Text())
|
|
}
|
|
})
|
|
|
|
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
|
|
return output
|
|
}
|
|
|
|
func useNoScriptImages(entryContent string) string {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
|
|
doc.Find("figure").Each(func(i int, figureElement *goquery.Selection) {
|
|
imgElement := figureElement.Find("img")
|
|
if imgElement.Length() > 0 {
|
|
noscriptElement := figureElement.Find("noscript")
|
|
if noscriptElement.Length() > 0 {
|
|
figureElement.PrependHtml(noscriptElement.Text())
|
|
imgElement.Remove()
|
|
noscriptElement.Remove()
|
|
}
|
|
}
|
|
})
|
|
|
|
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
|
|
return output
|
|
}
|
|
|
|
// getYoutubVideoIDFromURL extracts the video identifier from a YouTube URL.
//
// Two URL shapes are recognised on youtube.com and its subdomains:
//   - /watch?v=<id>  returns the value of the "v" query parameter
//   - /shorts/<id>   returns <id> when it is exactly 11 bytes long
//
// An empty string is returned for unparseable URLs, other hosts, and any other
// path.
func getYoutubVideoIDFromURL(entryURL string) string {
	u, err := url.Parse(entryURL)
	if err != nil {
		return ""
	}

	// Host names are case-insensitive. A bare suffix check would also accept
	// unrelated domains such as "notyoutube.com", so require either the exact
	// domain or a dot-separated subdomain.
	host := strings.ToLower(u.Hostname())
	if host != "youtube.com" && !strings.HasSuffix(host, ".youtube.com") {
		return ""
	}

	if u.Path == "/watch" {
		// Get returns "" when the parameter is missing.
		return u.Query().Get("v")
	}

	// youtube shorts id are always 11 chars.
	if id, found := strings.CutPrefix(u.Path, "/shorts/"); found && len(id) == 11 {
		return id
	}

	return ""
}
|
|
|
|
// buildVideoPlayerIframe returns the markup of a 650x350 <iframe> whose src is
// absoluteVideoURL.
//
// The URL is HTML-escaped before being placed in the attribute so that quotes,
// angle brackets or ampersands in it cannot terminate the attribute or be
// misread as character references.
func buildVideoPlayerIframe(absoluteVideoURL string) string {
	return `<iframe width="650" height="350" frameborder="0" src="` + html.EscapeString(absoluteVideoURL) + `" allowfullscreen></iframe>`
}
|
|
|
|
func addVideoPlayerIframe(absoluteVideoURL, entryContent string) string {
|
|
return buildVideoPlayerIframe(absoluteVideoURL) + `<br>` + entryContent
|
|
}
|
|
|
|
func addYoutubeVideoRewriteRule(entryURL, entryContent string) string {
|
|
if videoURL := getYoutubVideoIDFromURL(entryURL); videoURL != "" {
|
|
return addVideoPlayerIframe(config.Opts.YouTubeEmbedUrlOverride()+videoURL, entryContent)
|
|
}
|
|
return entryContent
|
|
}
|
|
|
|
func addYoutubeVideoUsingInvidiousPlayer(entryURL, entryContent string) string {
|
|
if videoURL := getYoutubVideoIDFromURL(entryURL); videoURL != "" {
|
|
return addVideoPlayerIframe(`https://`+config.Opts.InvidiousInstance()+`/embed/`+videoURL, entryContent)
|
|
}
|
|
return entryContent
|
|
}
|
|
|
|
// For reference: https://github.com/miniflux/v2/pull/1314
|
|
func addYoutubeVideoFromId(entryContent string) string {
|
|
matches := youtubeIdRegex.FindAllStringSubmatch(entryContent, -1)
|
|
if matches == nil {
|
|
return entryContent
|
|
}
|
|
videoPlayerHTML := ""
|
|
for _, match := range matches {
|
|
if len(match) == 2 {
|
|
videoPlayerHTML += buildVideoPlayerIframe(config.Opts.YouTubeEmbedUrlOverride()+match[1]) + "<br>"
|
|
}
|
|
}
|
|
return videoPlayerHTML + entryContent
|
|
}
|
|
|
|
func addInvidiousVideo(entryURL, entryContent string) string {
|
|
u, err := url.Parse(entryURL)
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
|
|
if u.Path != "/watch" {
|
|
return entryContent
|
|
}
|
|
|
|
qs := u.Query()
|
|
videoID := qs.Get("v")
|
|
if videoID == "" {
|
|
return entryContent
|
|
}
|
|
qs.Del("v")
|
|
|
|
embedVideoURL := "https://" + u.Hostname() + `/embed/` + videoID
|
|
if len(qs) > 0 {
|
|
embedVideoURL += "?" + qs.Encode()
|
|
}
|
|
|
|
return addVideoPlayerIframe(embedVideoURL, entryContent)
|
|
}
|
|
|
|
// addPDFLink prepends a "PDF" link pointing at entryURL, followed by a <br>,
// to entryContent when entryURL ends with ".pdf". Otherwise the content is
// returned unchanged.
func addPDFLink(entryURL, entryContent string) string {
	if !strings.HasSuffix(entryURL, ".pdf") {
		return entryContent
	}

	// The URL is HTML-escaped rather than formatted with %q: %q applies Go
	// string quoting (a double quote becomes \"), which does not protect an
	// HTML attribute value.
	return fmt.Sprintf(`<a href="%s">PDF</a><br>%s`, html.EscapeString(entryURL), entryContent)
}
|
|
|
|
func replaceTextLinks(input string) string {
|
|
return textLinkRegex.ReplaceAllString(input, `<a href="${1}">${1}</a>`)
|
|
}
|
|
|
|
// replaceCustom replaces every match of the regular expression searchTerm in
// entryContent with replaceTerm, which may reference capture groups using the
// regexp package's $ syntax. When searchTerm is not a valid expression the
// content is returned unchanged.
func replaceCustom(entryContent string, searchTerm string, replaceTerm string) string {
	pattern, compileErr := regexp.Compile(searchTerm)
	if compileErr != nil {
		return entryContent
	}

	return pattern.ReplaceAllString(entryContent, replaceTerm)
}
|
|
|
|
func removeCustom(entryContent string, selector string) string {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
|
|
doc.Find(selector).Remove()
|
|
|
|
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
|
|
return output
|
|
}
|
|
|
|
// addCastopodEpisode prepends an <iframe> pointing at entryURL + "/embed/light",
// followed by a <br>, to entryContent.
//
// The URL is HTML-escaped before being placed in the src attribute so that
// quotes or angle brackets in it cannot break out of the attribute.
func addCastopodEpisode(entryURL, entryContent string) string {
	player := `<iframe width="650" frameborder="0" src="` + html.EscapeString(entryURL) + `/embed/light"></iframe>`

	return player + `<br>` + entryContent
}
|
|
|
|
func applyFuncOnTextContent(entryContent string, selector string, repl func(string) string) string {
|
|
var treatChildren func(i int, s *goquery.Selection)
|
|
treatChildren = func(i int, s *goquery.Selection) {
|
|
if s.Nodes[0].Type == nethtml.TextNode {
|
|
s.ReplaceWithHtml(repl(s.Nodes[0].Data))
|
|
} else {
|
|
s.Contents().Each(treatChildren)
|
|
}
|
|
}
|
|
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
|
|
doc.Find(selector).Each(treatChildren)
|
|
|
|
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
|
|
return output
|
|
}
|
|
|
|
// decodeBase64Content decodes entryContent, after trimming surrounding
// whitespace, as standard base64 and returns the decoded text HTML-escaped.
// When the content is not valid base64 it is returned as given, untrimmed.
func decodeBase64Content(entryContent string) string {
	decoded, err := base64.StdEncoding.DecodeString(strings.TrimSpace(entryContent))
	if err != nil {
		return entryContent
	}

	return html.EscapeString(string(decoded))
}
|
|
|
|
func addHackerNewsLinksUsing(entryContent, app string) string {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
|
|
hn_prefix := "https://news.ycombinator.com/"
|
|
matches := doc.Find(`a[href^="` + hn_prefix + `"]`)
|
|
|
|
if matches.Length() > 0 {
|
|
matches.Each(func(i int, a *goquery.Selection) {
|
|
hrefAttr, _ := a.Attr("href")
|
|
|
|
hn_uri, err := url.Parse(hrefAttr)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
switch app {
|
|
case "opener":
|
|
params := url.Values{}
|
|
params.Add("url", hn_uri.String())
|
|
|
|
url := url.URL{
|
|
Scheme: "opener",
|
|
Host: "x-callback-url",
|
|
Path: "show-options",
|
|
RawQuery: params.Encode(),
|
|
}
|
|
|
|
open_with_opener := `<a href="` + url.String() + `">Open with Opener</a>`
|
|
a.Parent().AppendHtml(" " + open_with_opener)
|
|
case "hack":
|
|
url := strings.Replace(hn_uri.String(), hn_prefix, "hack://", 1)
|
|
|
|
open_with_hack := `<a href="` + url + `">Open with HACK</a>`
|
|
a.Parent().AppendHtml(" " + open_with_hack)
|
|
default:
|
|
slog.Warn("Unknown app provided for openHackerNewsLinksWith rewrite rule",
|
|
slog.String("app", app),
|
|
)
|
|
return
|
|
}
|
|
})
|
|
|
|
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
|
|
return output
|
|
}
|
|
|
|
return entryContent
|
|
}
|
|
|
|
func removeTables(entryContent string) string {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
|
|
selectors := []string{"table", "tbody", "thead", "td", "th", "td"}
|
|
|
|
var loopElement *goquery.Selection
|
|
|
|
for _, selector := range selectors {
|
|
for {
|
|
loopElement = doc.FindMatcher(goquery.Single(selector))
|
|
|
|
if loopElement.Length() == 0 {
|
|
break
|
|
}
|
|
|
|
innerHtml, err := loopElement.Html()
|
|
if err != nil {
|
|
break
|
|
}
|
|
|
|
loopElement.Parent().AppendHtml(innerHtml)
|
|
loopElement.Remove()
|
|
}
|
|
}
|
|
|
|
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
|
|
return output
|
|
}
|
|
|
|
func fixGhostCards(entryContent string) string {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
|
|
const cardSelector = "figure.kg-card"
|
|
var currentList *goquery.Selection
|
|
|
|
doc.Find(cardSelector).Each(func(i int, s *goquery.Selection) {
|
|
title := s.Find(".kg-bookmark-title").First().Text()
|
|
author := s.Find(".kg-bookmark-author").First().Text()
|
|
href := s.Find("a.kg-bookmark-container").First().AttrOr("href", "")
|
|
|
|
// if there is no link or title, skip processing
|
|
if href == "" || title == "" {
|
|
return
|
|
}
|
|
|
|
link := ""
|
|
if author == "" || strings.HasSuffix(title, author) {
|
|
link = fmt.Sprintf("<a href=\"%s\">%s</a>", href, title)
|
|
} else {
|
|
link = fmt.Sprintf("<a href=\"%s\">%s - %s</a>", href, title, author)
|
|
}
|
|
|
|
next := s.Next()
|
|
|
|
// if the next element is also a card, start a list
|
|
if next.Is(cardSelector) && currentList == nil {
|
|
currentList = s.BeforeHtml("<ul></ul>").Prev()
|
|
}
|
|
|
|
if currentList != nil {
|
|
// add this card to the list, then delete it
|
|
currentList.AppendHtml("<li>" + link + "</li>")
|
|
s.Remove()
|
|
} else {
|
|
// replace single card
|
|
s.ReplaceWithHtml(link)
|
|
}
|
|
|
|
// if the next element is not a card, start a new list
|
|
if !next.Is(cardSelector) && currentList != nil {
|
|
currentList = nil
|
|
}
|
|
})
|
|
|
|
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
|
|
return strings.TrimSpace(output)
|
|
}
|
|
|
|
func removeImgBlurParams(entryContent string) string {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
|
|
if err != nil {
|
|
return entryContent
|
|
}
|
|
|
|
changed := false
|
|
|
|
doc.Find("img[src]").Each(func(i int, img *goquery.Selection) {
|
|
srcAttr, exists := img.Attr("src")
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
parsedURL, err := url.Parse(srcAttr)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
// Only strip query parameters if this is a blurry placeholder image
|
|
if parsedURL.RawQuery != "" {
|
|
// Check if there's a blur parameter with a non-zero value
|
|
if blurValue := parsedURL.Query().Get("blur"); blurValue != "" {
|
|
if blurInt, err := strconv.Atoi(blurValue); err == nil && blurInt > 0 {
|
|
parsedURL.RawQuery = ""
|
|
img.SetAttr("src", parsedURL.String())
|
|
changed = true
|
|
}
|
|
}
|
|
}
|
|
})
|
|
|
|
if changed {
|
|
output, _ := doc.FindMatcher(goquery.Single("body")).Html()
|
|
return output
|
|
}
|
|
|
|
return entryContent
|
|
}
|