Move internal packages to an internal folder

For reference: https://go.dev/doc/go1.4#internalpackages
2025-09-15 18:57:04 +00:00 · 2023-08-10 19:46:45 -07:00 · 2023-08-10 19:46:45 -07:00 · 168a870c02
commit 168a870c02
parent c234903255
433 changed files with 1121 additions and 1123 deletions
--- a/internal/reader/scraper/rules.go
+++ b/internal/reader/scraper/rules.go
@ -0,0 +1,57 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package scraper // import "miniflux.app/v2/internal/reader/scraper"
+
+// predefinedRules lists the built-in scraper rules, keyed by website domain.
+// Each value is a CSS selector (possibly a comma-separated selector group)
+// identifying the elements that hold the relevant content on that site.
+//
+// Keep the entries sorted alphabetically by domain.
+var predefinedRules = map[string]string{
+	"bbc.co.uk":            "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
+	"blog.cloudflare.com":  "div.post-content",
+	"cbc.ca":               ".story-content",
+	"darkreading.com":      "#article-main:not(header)",
+	"developpez.com":       "div[itemprop=articleBody]",
+	"dilbert.com":          "span.comic-title-name, img.img-comic",
+	"explosm.net":          "div#comic",
+	"financialsamurai.com": "article",
+	"francetvinfo.fr":      ".text",
+	"github.com":           "article.entry-content",
+	"heise.de":             "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
+	"igen.fr":              "section.corps",
+	"ikiwiki.iki.fi":       ".page.group",
+	"ilpost.it":            ".entry-content",
+	"ing.dk":               "section.body",
+	"lapresse.ca":          ".amorce, .entry",
+	"lemonde.fr":           "article",
+	"lepoint.fr":           ".art-text",
+	"lesjoiesducode.fr":    ".blog-post-content img",
+	"lesnumeriques.com":    ".text",
+	"linux.com":            "div.content, div[property]",
+	"mac4ever.com":         "div[itemprop=articleBody]",
+	"monwindows.com":       ".blog-post-body",
+	"npr.org":              "#storytext",
+	"oneindia.com":         ".io-article-body",
+	"openingsource.org":    "article.suxing-popup-gallery",
+	"opensource.com":       "div[property]",
+	"osnews.com":           "div.newscontent1",
+	"phoronix.com":         "div.content",
+	"pseudo-sciences.org":  "#art_main",
+	"quantamagazine.org":   ".outer--content, figure, script",
+	"raywenderlich.com":    "article",
+	"royalroad.com":        ".author-note-portlet,.chapter-content",
+	"slate.fr":             ".field-items",
+	"smbc-comics.com":      "div#cc-comicbody, div#aftercomic",
+	"swordscomic.com":      "img#comic-image, div#info-frame.tab-content-area",
+	"techcrunch.com":       "div.article-entry",
+	"theoatmeal.com":       "div#comic",
+	"theregister.com":      "#top-col-story h2, #body",
+	"theverge.com":         "h2.inline:nth-child(2),h2.duet--article--dangerously-set-cms-markup,figure.w-full,div.duet--article--article-body-component",
+	"turnoff.us":           "article.post-content",
+	"universfreebox.com":   "#corps_corps",
+	"version2.dk":          "section.body",
+	"wdwnt.com":            "div.entry-content",
+	"webtoons.com":         ".viewer_img",
+	"wired.com":            "main figure, article",
+	"zdnet.com":            "div.storyBody",
+	"zeit.de":              ".summary, .article-body",
+}
--- a/internal/reader/scraper/scraper.go
+++ b/internal/reader/scraper/scraper.go
@ -0,0 +1,105 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package scraper // import "miniflux.app/v2/internal/reader/scraper"
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+
+	"miniflux.app/v2/internal/config"
+	"miniflux.app/v2/internal/http/client"
+	"miniflux.app/v2/internal/logger"
+	"miniflux.app/v2/internal/reader/readability"
+	"miniflux.app/v2/internal/url"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// Fetch downloads websiteURL and returns the content extracted from the page.
+//
+// The HTTP client is built from config.Opts and configured with userAgent and
+// cookie; the proxy is enabled only when useProxy is true, and
+// allowSelfSignedCertificates is copied onto the client as-is.
+//
+// rules is a CSS selector group chosen by the caller. When it is empty, the
+// predefined rules for the final (post-redirect) URL are looked up instead.
+// Selector-based scraping is used only if a rule is available and the final
+// URL is on the same domain as websiteURL; in every other case the page is
+// handed to the readability extractor.
+//
+// An error is returned when the request fails, the response reports a server
+// failure, the content type is not HTML/XHTML, the body cannot be converted
+// to Unicode, or the selected extractor fails.
+func Fetch(websiteURL, rules, userAgent string, cookie string, allowSelfSignedCertificates, useProxy bool) (string, error) {
+	httpClient := client.NewClientWithConfig(websiteURL, config.Opts)
+	httpClient.WithUserAgent(userAgent)
+	httpClient.WithCookie(cookie)
+	if useProxy {
+		httpClient.WithProxy()
+	}
+	httpClient.AllowSelfSignedCertificates = allowSelfSignedCertificates
+
+	response, err := httpClient.Get()
+	if err != nil {
+		return "", err
+	}
+
+	// Reject responses that cannot be scraped before touching the body.
+	switch {
+	case response.HasServerFailure():
+		return "", errors.New("scraper: unable to download web page")
+	case !isAllowedContentType(response.ContentType):
+		return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
+	}
+
+	if err := response.EnsureUnicodeBody(); err != nil {
+		return "", err
+	}
+
+	// The entry URL could redirect somewhere else: from here on only the
+	// effective URL is used, but remember whether the redirect left the
+	// original domain, since site-specific selectors would not apply there.
+	effectiveURL := response.EffectiveURL
+	redirectedOffSite := url.Domain(websiteURL) != url.Domain(effectiveURL)
+
+	selectors := rules
+	if selectors == "" {
+		selectors = getPredefinedScraperRules(effectiveURL)
+	}
+
+	var content string
+	if redirectedOffSite || selectors == "" {
+		logger.Debug(`[Scraper] Using readability for %q`, effectiveURL)
+		content, err = readability.ExtractContent(response.Body)
+	} else {
+		logger.Debug(`[Scraper] Using rules %q for %q`, selectors, effectiveURL)
+		content, err = scrapContent(response.Body, selectors)
+	}
+	if err != nil {
+		return "", err
+	}
+
+	return content, nil
+}
+
+// scrapContent parses page as an HTML document and returns the concatenated
+// outer HTML of every element matched by rules, in the order the selection
+// yields them. rules is a CSS selector (or comma-separated selector group).
+//
+// It returns an empty string with a nil error when nothing matches, and an
+// error only when the document itself cannot be parsed.
+func scrapContent(page io.Reader, rules string) (string, error) {
+	document, err := goquery.NewDocumentFromReader(page)
+	if err != nil {
+		return "", err
+	}
+
+	// A builder avoids re-copying the accumulated HTML for every match.
+	var contents strings.Builder
+	document.Find(rules).Each(func(_ int, s *goquery.Selection) {
+		// Rendering errors are ignored here: whatever OuterHtml returns for
+		// the node is appended and scraping continues with the next match.
+		content, _ := goquery.OuterHtml(s)
+		contents.WriteString(content)
+	})
+
+	return contents.String(), nil
+}
+
+// getPredefinedScraperRules returns the predefined CSS selectors for the
+// domain of websiteURL, or an empty string when no rule applies.
+//
+// A rule applies when the host is exactly the rule's domain or one of its
+// subdomains (e.g. "www.linux.com" matches "linux.com"). Unrelated hosts that
+// merely contain the rule's domain as a substring (e.g. "notcbc.ca") do not
+// match. If several rules apply, the most specific (longest) domain wins, so
+// the result does not depend on map iteration order.
+func getPredefinedScraperRules(websiteURL string) string {
+	host := url.Domain(websiteURL)
+
+	// NOTE(review): url.Domain is not visible here; in case it returns
+	// "host:port", drop the port so hosts on non-default ports still match.
+	if i := strings.IndexByte(host, ':'); i >= 0 {
+		host = host[:i]
+	}
+
+	matchedDomain, matchedRules := "", ""
+	for domain, rules := range predefinedRules {
+		if len(domain) <= len(matchedDomain) {
+			continue // a more specific rule was already found
+		}
+		if host == domain || strings.HasSuffix(host, "."+domain) {
+			matchedDomain, matchedRules = domain, rules
+		}
+	}
+
+	return matchedRules
+}
+
+// isAllowedContentType reports whether contentType designates an HTML or
+// XHTML document. The comparison is case-insensitive and prefix-based, so
+// trailing parameters such as "; charset=utf-8" are accepted.
+func isAllowedContentType(contentType string) bool {
+	normalized := strings.ToLower(contentType)
+	for _, prefix := range []string{"text/html", "application/xhtml+xml"} {
+		if strings.HasPrefix(normalized, prefix) {
+			return true
+		}
+	}
+	return false
+}
--- a/internal/reader/scraper/scraper_test.go
+++ b/internal/reader/scraper/scraper_test.go
@ -0,0 +1,75 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package scraper // import "miniflux.app/v2/internal/reader/scraper"
+
+import (
+	"bytes"
+	"os"
+	"strings"
+	"testing"
+)
+
+// TestGetPredefinedRules checks that known domains resolve to a predefined
+// rule and that a domain without a rule yields an empty string.
+func TestGetPredefinedRules(t *testing.T) {
+	scenarios := []struct {
+		websiteURL string
+		wantRules  bool
+		failure    string
+	}{
+		{"http://www.phoronix.com/", true, "Unable to find rule for phoronix.com"},
+		{"https://www.linux.com/", true, "Unable to find rule for linux.com"},
+		{"https://example.org/", false, "A rule not defined should not return anything"},
+	}
+
+	for _, scenario := range scenarios {
+		hasRules := getPredefinedScraperRules(scenario.websiteURL) != ""
+		if hasRules != scenario.wantRules {
+			t.Error(scenario.failure)
+		}
+	}
+}
+
+// TestWhitelistedContentTypes verifies that only HTML and XHTML content
+// types (in any letter case, with or without parameters) are accepted.
+func TestWhitelistedContentTypes(t *testing.T) {
+	scenarios := map[string]bool{
+		"text/html":                            true,
+		"TeXt/hTmL":                            true,
+		"application/xhtml+xml":                true,
+		"text/html; charset=utf-8":             true,
+		"application/xhtml+xml; charset=utf-8": true,
+		"text/css":                             false,
+		"application/javascript":               false,
+		"image/png":                            false,
+		"application/pdf":                      false,
+	}
+
+	for inputValue, expectedResult := range scenarios {
+		actualResult := isAllowedContentType(inputValue)
+		if actualResult != expectedResult {
+			// Name the offending input so a failure can be traced to its scenario.
+			t.Errorf(`Unexpected result for content type %q, got %t instead of %t`, inputValue, actualResult, expectedResult)
+		}
+	}
+}
+
+// TestSelectorRules scrapes each fixture in testdata with its selector and
+// compares the output with the whitespace-trimmed content of the matching
+// "<fixture>-result" file.
+func TestSelectorRules(t *testing.T) {
+	var ruleTestCases = map[string]string{
+		"img.html":    "article > img",
+		"iframe.html": "article > iframe",
+		"p.html":      "article > p",
+	}
+
+	for filename, rule := range ruleTestCases {
+		html, err := os.ReadFile("testdata/" + filename)
+		if err != nil {
+			t.Fatalf(`Unable to read file %q: %v`, filename, err)
+		}
+
+		actualResult, err := scrapContent(bytes.NewReader(html), rule)
+		if err != nil {
+			t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
+		}
+
+		// Report the result file, not the input fixture, when it is the one
+		// that cannot be read.
+		resultFilename := filename + "-result"
+		rawExpected, err := os.ReadFile("testdata/" + resultFilename)
+		if err != nil {
+			t.Fatalf(`Unable to read file %q: %v`, resultFilename, err)
+		}
+
+		// Compare and report the same trimmed string so the failure message
+		// shows exactly what was compared.
+		expectedResult := strings.TrimSpace(string(rawExpected))
+		if actualResult != expectedResult {
+			t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
+		}
+	}
+}
--- a/internal/reader/scraper/testdata/iframe.html
+++ b/internal/reader/scraper/testdata/iframe.html
@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html lang="en-US">
+	<body>
+		<article>
+			<iframe id="1" src="about:blank"></iframe>
+			<iframe id="2" src="about:blank"></iframe>
+			<iframe id="3" src="about:blank"></iframe>
+			<iframe id="4" src="about:blank"></iframe>
+			<iframe id="5" src="about:blank"></iframe>
+		</article>
+	</body>
+</html>
--- a/internal/reader/scraper/testdata/iframe.html-result
+++ b/internal/reader/scraper/testdata/iframe.html-result
@ -0,0 +1 @@
+<iframe id="1" src="about:blank"></iframe><iframe id="2" src="about:blank"></iframe><iframe id="3" src="about:blank"></iframe><iframe id="4" src="about:blank"></iframe><iframe id="5" src="about:blank"></iframe>
--- a/internal/reader/scraper/testdata/img.html
+++ b/internal/reader/scraper/testdata/img.html
@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html lang="en-US">
+	<body>
+		<article>
+			<img id="1" src="#" alt="" />
+			<img id="2" src="#" alt="" />
+			<img id="3" src="#" alt="" />
+			<img id="4" src="#" alt="" />
+			<img id="5" src="#" alt="" />
+		</article>
+	</body>
+</html>
--- a/internal/reader/scraper/testdata/img.html-result
+++ b/internal/reader/scraper/testdata/img.html-result
@ -0,0 +1 @@
+<img id="1" src="#" alt=""/><img id="2" src="#" alt=""/><img id="3" src="#" alt=""/><img id="4" src="#" alt=""/><img id="5" src="#" alt=""/>
--- a/internal/reader/scraper/testdata/p.html
+++ b/internal/reader/scraper/testdata/p.html
@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html lang="en-US">
+	<body>
+		<article>
+			<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p>
+			<p>Apquam tincidunt mauris eu risus.</p>
+			<p>Vestibulum auctor dapibus neque.</p>
+		</article>
+	</body>
+</html>
--- a/internal/reader/scraper/testdata/p.html-result
+++ b/internal/reader/scraper/testdata/p.html-result
@ -0,0 +1 @@
+<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p><p>Apquam tincidunt mauris eu risus.</p><p>Vestibulum auctor dapibus neque.</p>
				`@ -0,0 +1 @@`
				`<iframe id="1" src="about:blank"></iframe><iframe id="2" src="about:blank"></iframe><iframe id="3" src="about:blank"></iframe><iframe id="4" src="about:blank"></iframe><iframe id="5" src="about:blank"></iframe>`
				`@ -0,0 +1 @@`
				`<img id="1" src="#" alt=""/><img id="2" src="#" alt=""/><img id="3" src="#" alt=""/><img id="4" src="#" alt=""/><img id="5" src="#" alt=""/>`
				`@ -0,0 +1 @@`
				`<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p><p>Apquam tincidunt mauris eu risus.</p><p>Vestibulum auctor dapibus neque.</p>`