1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00
miniflux-v2/internal/reader/rewrite/rewrite_functions.go
Julien Voisin a913f3f75f
feat(rewrite)!: remove parse_markdown rewrite rule
It was added in 2022 by #1513, to support blog.laravel.com, which has
since switched to HTML. The Atom 0.3/1.0, RSS 1.0/2.0, RDF, and JSON formats
don't support markdown in their spec, and any website serving it there should
be considered as buggy and fixed.

This shaves off 2MB from the miniflux binary, which is quite steep for a
feature that nobody is/should be using, and removes a dependency, which is always
a good thing.
2024-12-08 14:34:47 -08:00

441 lines
11 KiB
Go

// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package rewrite // import "miniflux.app/v2/internal/reader/rewrite"
import (
"encoding/base64"
"fmt"
"html"
"log/slog"
"net/url"
"regexp"
"strings"
"miniflux.app/v2/internal/config"
nethtml "golang.org/x/net/html"
"github.com/PuerkitoBio/goquery"
)
// Precompiled patterns used by the rewrite functions in this file.
var (
// youtubeRegex captures everything after "youtube.com/watch?v=" up to the end of the input.
youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)$`)
// youtubeIdRegex captures an 11-character ID from a `youtube_id` key followed by ":" or "=" and a double-quoted value.
youtubeIdRegex = regexp.MustCompile(`youtube_id"?\s*[:=]\s*"([a-zA-Z0-9_-]{11})"`)
// invidioRegex captures the text between "http(s)://" and "/watch?v=" (group 1) and everything after it (group 2).
invidioRegex = regexp.MustCompile(`https?://(.*)/watch\?v=(.*)`)
// imgRegex matches an "<img " tag up to its closing ">".
imgRegex = regexp.MustCompile(`<img [^>]+>`)
// textLinkRegex matches http(s) URLs case-insensitively; the final character class rejects a trailing "?", "!", ":", ",", "." or ";".
textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`)
)
// addImageTitle wraps every <img> carrying both a src and a title attribute
// in a <figure> and exposes the title text as a <figcaption>.
//
// entryURL is not used. entryContent is returned untouched when it cannot be
// parsed or when no image matches; otherwise the re-rendered <body> content is
// returned.
func addImageTitle(entryURL, entryContent string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	images := doc.Find("img[src][title]")
	if images.Length() == 0 {
		return entryContent
	}

	images.Each(func(_ int, img *goquery.Selection) {
		// Attribute values read from the parsed document are already
		// entity-decoded, so every one of them has to be escaped again before
		// it is spliced into new markup; otherwise a quote or angle bracket in
		// src/alt would terminate the attribute early.
		src := html.EscapeString(img.AttrOr("src", ""))
		alt := html.EscapeString(img.AttrOr("alt", ""))
		title := html.EscapeString(img.AttrOr("title", ""))

		img.ReplaceWithHtml(`<figure><img src="` + src + `" alt="` + alt + `"/><figcaption><p>` + title + `</p></figcaption></figure>`)
	})

	output, _ := doc.Find("body").First().Html()
	return output
}
// addMailtoSubject appends " [subject]" inside every mailto: link whose href
// carries a non-empty "subject" query parameter.
//
// entryURL is not used. The input is returned as-is when parsing fails or when
// the document holds no mailto: link.
func addMailtoSubject(entryURL, entryContent string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	links := doc.Find(`a[href^="mailto:"]`)
	if links.Length() == 0 {
		return entryContent
	}

	links.Each(func(_ int, link *goquery.Selection) {
		href, _ := link.Attr("href")
		parsed, parseErr := url.Parse(href)
		if parseErr != nil {
			return
		}
		if subject := parsed.Query().Get("subject"); subject != "" {
			link.AppendHtml(" [" + html.EscapeString(subject) + "]")
		}
	})

	body, _ := doc.Find("body").First().Html()
	return body
}
// addDynamicImage copies image URLs stored in data-* attributes (presumably
// written by lazy-loading scripts) into real src/srcset attributes.
//
// For <img> elements the attribute is set in place; a <div> carrying one of
// the candidate attributes is replaced by a freshly built <img>. When no
// element was changed that way, each <noscript> whose text contains exactly
// one <img ...> tag is replaced by that tag.
//
// entryURL is not used. entryContent is returned untouched when it cannot be
// parsed or when nothing was changed.
func addDynamicImage(entryURL, entryContent string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	// Ordered most preferred to least preferred.
	candidateAttrs := []string{
		"data-src",
		"data-original",
		"data-orig",
		"data-url",
		"data-orig-file",
		"data-large-file",
		"data-medium-file",
		"data-original-mos",
		"data-2000src",
		"data-1000src",
		"data-800src",
		"data-655src",
		"data-500src",
		"data-380src",
	}

	candidateSrcsetAttrs := []string{
		"data-srcset",
	}

	changed := false

	doc.Find("img,div").Each(func(_ int, img *goquery.Selection) {
		// Src-linked candidates
		for _, candidateAttr := range candidateAttrs {
			srcAttr, found := img.Attr(candidateAttr)
			if !found {
				continue
			}
			changed = true
			if img.Is("img") {
				img.SetAttr("src", srcAttr)
			} else {
				// Attribute values come back entity-decoded; escape them
				// before building markup so quotes cannot break the tag.
				altAttr := img.AttrOr("alt", "")
				img.ReplaceWithHtml(`<img src="` + html.EscapeString(srcAttr) + `" alt="` + html.EscapeString(altAttr) + `"/>`)
			}
			break
		}

		// Srcset-linked candidates
		for _, candidateAttr := range candidateSrcsetAttrs {
			srcAttr, found := img.Attr(candidateAttr)
			if !found {
				continue
			}
			changed = true
			if img.Is("img") {
				img.SetAttr("srcset", srcAttr)
			} else {
				altAttr := img.AttrOr("alt", "")
				img.ReplaceWithHtml(`<img srcset="` + html.EscapeString(srcAttr) + `" alt="` + html.EscapeString(altAttr) + `"/>`)
			}
			break
		}
	})

	if !changed {
		doc.Find("noscript").Each(func(_ int, noscript *goquery.Selection) {
			// Ask for at most two matches so that "exactly one image" can be
			// told apart from "several images".
			matches := imgRegex.FindAllString(noscript.Text(), 2)
			if len(matches) == 1 {
				changed = true
				noscript.ReplaceWithHtml(matches[0])
			}
		})
	}

	if changed {
		output, _ := doc.Find("body").First().Html()
		return output
	}

	return entryContent
}
// addDynamicIframe sets the src of every <iframe> from the first data-*
// candidate attribute it carries.
//
// entryURL is not used. The input is returned as-is when parsing fails or when
// no iframe was modified.
func addDynamicIframe(entryURL, entryContent string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	// Listed from most to least preferred; the first one present wins.
	sourceAttrs := [...]string{
		"data-src",
		"data-original",
		"data-orig",
		"data-url",
		"data-lazy-src",
	}

	rewritten := false
	doc.Find("iframe").Each(func(_ int, frame *goquery.Selection) {
		for _, name := range sourceAttrs {
			value, ok := frame.Attr(name)
			if !ok {
				continue
			}
			rewritten = true
			frame.SetAttr("src", value)
			break
		}
	})

	if !rewritten {
		return entryContent
	}

	body, _ := doc.Find("body").First().Html()
	return body
}
// fixMediumImages replaces each <figure class="paragraph-image"> that contains
// a <noscript> element with the text content of that <noscript>, parsed as
// HTML.
//
// entryURL is not used. On a parse failure the input is returned unchanged;
// otherwise the re-rendered <body> content is returned, even if nothing
// matched.
func fixMediumImages(entryURL, entryContent string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	doc.Find("figure.paragraph-image").Each(func(_ int, figure *goquery.Selection) {
		fallback := figure.Find("noscript")
		if fallback.Length() == 0 {
			return
		}
		figure.ReplaceWithHtml(fallback.Text())
	})

	body, _ := doc.Find("body").First().Html()
	return body
}
// useNoScriptImages rewrites each <figure> that contains both an <img> and a
// <noscript>: the <noscript> text is prepended to the figure as HTML, then the
// previously found <img> and <noscript> elements are removed.
//
// entryURL is not used. On a parse failure the input is returned unchanged;
// otherwise the re-rendered <body> content is returned.
func useNoScriptImages(entryURL, entryContent string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	doc.Find("figure").Each(func(_ int, figure *goquery.Selection) {
		images := figure.Find("img")
		if images.Length() == 0 {
			return
		}
		fallback := figure.Find("noscript")
		if fallback.Length() == 0 {
			return
		}

		// Both selections were captured before the insertion, so the markup
		// prepended here is not part of what gets removed below.
		figure.PrependHtml(fallback.Text())
		images.Remove()
		fallback.Remove()
	})

	body, _ := doc.Find("body").First().Html()
	return body
}
// addYoutubeVideo prepends an embedded player <iframe> to entryContent when
// entryURL matches youtubeRegex.
//
// The iframe source is the value of config.Opts.YouTubeEmbedUrlOverride()
// followed by the text captured after "watch?v=". entryContent is returned
// unchanged when the URL does not match.
func addYoutubeVideo(entryURL, entryContent string) string {
	matches := youtubeRegex.FindStringSubmatch(entryURL)
	if len(matches) != 2 {
		return entryContent
	}

	// The capture group accepts any characters, so the URL is HTML-escaped
	// before being placed inside the attribute.
	embedURL := html.EscapeString(config.Opts.YouTubeEmbedUrlOverride() + matches[1])
	video := `<iframe width="650" height="350" frameborder="0" src="` + embedURL + `" allowfullscreen></iframe>`
	return video + `<br>` + entryContent
}
// addYoutubeVideoUsingInvidiousPlayer prepends an embedded player <iframe> to
// entryContent when entryURL matches youtubeRegex.
//
// The iframe source is "https://" + config.Opts.InvidiousInstance() +
// "/embed/" followed by the text captured after "watch?v=". entryContent is
// returned unchanged when the URL does not match.
func addYoutubeVideoUsingInvidiousPlayer(entryURL, entryContent string) string {
	matches := youtubeRegex.FindStringSubmatch(entryURL)
	if len(matches) != 2 {
		return entryContent
	}

	// The capture group accepts any characters, so the URL is HTML-escaped
	// before being placed inside the attribute.
	embedURL := html.EscapeString(`https://` + config.Opts.InvidiousInstance() + `/embed/` + matches[1])
	video := `<iframe width="650" height="350" frameborder="0" src="` + embedURL + `" allowfullscreen></iframe>`
	return video + `<br>` + entryContent
}
// addYoutubeVideoFromId prepends one embedded player <iframe> (followed by
// <br>) per youtubeIdRegex match found in entryContent, in match order, and
// returns the result. Content without any match is returned unchanged.
func addYoutubeVideoFromId(entryContent string) string {
	found := youtubeIdRegex.FindAllStringSubmatch(entryContent, -1)
	if len(found) == 0 {
		return entryContent
	}

	var out strings.Builder
	for _, groups := range found {
		if len(groups) != 2 {
			continue
		}
		out.WriteString(`<iframe width="650" height="350" frameborder="0" src="`)
		out.WriteString(config.Opts.YouTubeEmbedUrlOverride())
		out.WriteString(groups[1])
		out.WriteString(`" allowfullscreen></iframe><br>`)
	}
	out.WriteString(entryContent)

	return out.String()
}
// addInvidiousVideo prepends an embedded player <iframe> to entryContent when
// entryURL matches invidioRegex.
//
// The iframe source is "https://" + first capture + "/embed/" + second
// capture. entryContent is returned unchanged when the URL does not match.
func addInvidiousVideo(entryURL, entryContent string) string {
	matches := invidioRegex.FindStringSubmatch(entryURL)
	if len(matches) != 3 {
		return entryContent
	}

	// Both capture groups accept any characters, so the URL is HTML-escaped
	// before being placed inside the attribute.
	embedURL := html.EscapeString(`https://` + matches[1] + `/embed/` + matches[2])
	video := `<iframe width="650" height="350" frameborder="0" src="` + embedURL + `" allowfullscreen></iframe>`
	return video + `<br>` + entryContent
}
// addPDFLink prepends a "PDF" link pointing at entryURL to entryContent when
// entryURL ends with ".pdf" (case-sensitive); any other URL leaves the content
// unchanged.
func addPDFLink(entryURL, entryContent string) string {
	if !strings.HasSuffix(entryURL, ".pdf") {
		return entryContent
	}

	// The href must be HTML-escaped, not Go-quoted: %q would emit \" for a
	// double quote (which ends the attribute early) and would double every
	// backslash, altering the link target.
	return fmt.Sprintf(`<a href="%s">PDF</a><br>%s`, html.EscapeString(entryURL), entryContent)
}
// replaceTextLinks wraps every textLinkRegex match found in input in an <a>
// element whose href and link text are both the matched URL.
func replaceTextLinks(input string) string {
	const anchorTemplate = `<a href="${1}">${1}</a>`
	linked := textLinkRegex.ReplaceAllString(input, anchorTemplate)
	return linked
}
// replaceCustom replaces every match of the regular expression searchTerm in
// entryContent with replaceTerm, which may reference capture groups using the
// regexp package's $-expansion syntax.
//
// When searchTerm is not a valid expression, entryContent is returned
// unchanged.
func replaceCustom(entryContent string, searchTerm string, replaceTerm string) string {
	pattern, compileErr := regexp.Compile(searchTerm)
	if compileErr != nil {
		return entryContent
	}
	return pattern.ReplaceAllString(entryContent, replaceTerm)
}
// removeCustom deletes every element matching the CSS selector from
// entryContent and returns the re-rendered <body> content. The input is
// returned unchanged when it cannot be parsed.
func removeCustom(entryContent string, selector string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	unwanted := doc.Find(selector)
	unwanted.Remove()

	body := doc.Find("body").First()
	rendered, _ := body.Html()
	return rendered
}
// addCastopodEpisode prepends a player <iframe> whose source is entryURL
// followed by "/embed/light", then a <br>, to entryContent.
func addCastopodEpisode(entryURL, entryContent string) string {
	// entryURL is inserted into an attribute, so it is HTML-escaped to keep a
	// quote or angle bracket in the URL from breaking out of the src value.
	embedURL := html.EscapeString(entryURL + `/embed/light`)
	player := `<iframe width="650" frameborder="0" src="` + embedURL + `"></iframe>`
	return player + `<br>` + entryContent
}
// applyFuncOnTextContent walks every node matched by selector, descending
// through child nodes, and replaces each text node with the HTML returned by
// repl for that node's text.
//
// The input is returned unchanged when it cannot be parsed; otherwise the
// re-rendered <body> content is returned.
func applyFuncOnTextContent(entryContent string, selector string, repl func(string) string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	// Declared before assignment so the closure can recurse into itself.
	var visit func(int, *goquery.Selection)
	visit = func(_ int, sel *goquery.Selection) {
		node := sel.Nodes[0]
		if node.Type != nethtml.TextNode {
			sel.Contents().Each(visit)
			return
		}
		sel.ReplaceWithHtml(repl(node.Data))
	}

	doc.Find(selector).Each(visit)

	body, _ := doc.Find("body").First().Html()
	return body
}
// decodeBase64Content decodes entryContent as standard (padded) base64 after
// trimming surrounding whitespace and returns the decoded text HTML-escaped.
// Content that is not valid base64 is returned exactly as it was given.
func decodeBase64Content(entryContent string) string {
	trimmed := strings.TrimSpace(entryContent)

	decoded, err := base64.StdEncoding.DecodeString(trimmed)
	if err != nil {
		return entryContent
	}

	return html.EscapeString(string(decoded))
}
// addHackerNewsLinksUsing appends, next to every link whose href starts with
// "https://news.ycombinator.com/", an extra link that opens the same page in
// the given app.
//
// Supported app values are "opener" (an opener://x-callback-url/show-options
// URL carrying the original link in its "url" query parameter) and "hack" (the
// original URL with its prefix swapped for "hack://"). Any other value logs a
// warning and adds nothing for that link.
//
// The input is returned unchanged when it cannot be parsed or when it holds no
// matching link.
func addHackerNewsLinksUsing(entryContent, app string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	const hnPrefix = "https://news.ycombinator.com/"

	links := doc.Find(`a[href^="` + hnPrefix + `"]`)
	if links.Length() == 0 {
		return entryContent
	}

	links.Each(func(_ int, link *goquery.Selection) {
		href, _ := link.Attr("href")
		hnURL, parseErr := url.Parse(href)
		if parseErr != nil {
			return
		}

		switch app {
		case "opener":
			query := url.Values{}
			query.Add("url", hnURL.String())

			openerURL := url.URL{
				Scheme:   "opener",
				Host:     "x-callback-url",
				Path:     "show-options",
				RawQuery: query.Encode(),
			}

			openerLink := `<a href="` + openerURL.String() + `">Open with Opener</a>`
			link.Parent().AppendHtml(" " + openerLink)
		case "hack":
			hackURL := strings.Replace(hnURL.String(), hnPrefix, "hack://", 1)
			hackLink := `<a href="` + hackURL + `">Open with HACK</a>`
			link.Parent().AppendHtml(" " + hackLink)
		default:
			slog.Warn("Unknown app provided for openHackerNewsLinksWith rewrite rule",
				slog.String("app", app),
			)
		}
	})

	body, _ := doc.Find("body").First().Html()
	return body
}
// removeTables strips table markup from entryContent by replacing each table
// element with its own inner HTML, keeping the content where the table stood.
//
// The input is returned unchanged when it cannot be parsed; otherwise the
// re-rendered <body> content is returned.
func removeTables(entryContent string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	// NOTE(review): the list previously named "td" twice; "tr" is assumed to
	// be the entry that was intended.
	selectors := []string{"table", "tbody", "thead", "tr", "td", "th"}

	for _, selector := range selectors {
		// Re-run the query after each replacement: unwrapping one element can
		// expose nested elements matching the same selector.
		for {
			element := doc.Find(selector).First()
			if element.Length() == 0 {
				break
			}

			innerHTML, err := element.Html()
			if err != nil {
				break
			}

			// Replace in place. Appending the inner HTML to the parent and
			// then removing the element would move the table's content after
			// all of its following siblings, reordering the document.
			element.ReplaceWithHtml(innerHTML)
		}
	}

	output, _ := doc.Find("body").First().Html()
	return output
}