Add scraper rules

2025-09-15 18:57:04 +00:00 · 2017-12-10 20:51:04 -08:00 · 2017-12-10 20:51:04 -08:00 · 87ccad5c7f
commit 87ccad5c7f
parent 7a35c58f53
16 changed files with 140 additions and 34 deletions
--- a/reader/scraper/rules.go
+++ b/reader/scraper/rules.go
@ -0,0 +1,16 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+// predefinedRules lists the built-in scraper rules, keyed by domain name and kept alphabetically sorted.
+// Each value holds the CSS selectors (a comma-separated group when several apply) used to extract that site's content.
+var predefinedRules = map[string]string{
+	"lemonde.fr":        "div#articleBody",
+	"lesjoiesducode.fr": ".blog-post-content img",
+	"linux.com":         "div.content, div[property]",
+	"opensource.com":    "div[property]",
+	"phoronix.com":      "div.content",
+	"techcrunch.com":    "div.article-entry",
+}
--- a/reader/scraper/scraper.go
+++ b/reader/scraper/scraper.go
@ -6,14 +6,19 @@ package scraper

 import (
 	"errors"
+	"io"
+	"log"
+	"strings"

+	"github.com/PuerkitoBio/goquery"
 	"github.com/miniflux/miniflux2/http"
 	"github.com/miniflux/miniflux2/reader/readability"
 	"github.com/miniflux/miniflux2/reader/sanitizer"
+	"github.com/miniflux/miniflux2/url"
 )

 // Fetch downloads a web page and returns relevant contents.
-func Fetch(websiteURL string) (string, error) {
+func Fetch(websiteURL, rules string) (string, error) {
 	client := http.NewClient(websiteURL)
 	response, err := client.Get()
 	if err != nil {
@ -29,10 +34,57 @@ func Fetch(websiteURL string) (string, error) {
 		return "", err
 	}

-	content, err := readability.ExtractContent(page)
+	var content string
+	if rules == "" {
+		rules = getPredefinedScraperRules(websiteURL)
+	}
+
+	if rules != "" {
+		log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
+		content, err = scrapContent(page, rules)
+	} else {
+		log.Printf(`[Scraper] Using readability for "%s"`, websiteURL)
+		content, err = readability.ExtractContent(page)
+	}
+
 	if err != nil {
 		return "", err
 	}

 	return sanitizer.Sanitize(websiteURL, content), nil
 }
+
+// scrapContent parses page as an HTML document and returns the concatenated
+// inner HTML of every element matched by the CSS selectors in rules, in the
+// order the matches are visited.
+//
+// A matched <img> contributes the inner HTML of its parent rather than its
+// own, so several matched images under the same parent append that parent's
+// markup once per image. An error is returned only when the document cannot
+// be parsed.
+func scrapContent(page io.Reader, rules string) (string, error) {
+	document, err := goquery.NewDocumentFromReader(page)
+	if err != nil {
+		return "", err
+	}
+
+	var fragments []string
+	document.Find(rules).Each(func(_ int, match *goquery.Selection) {
+		// Inline elements such as images carry no useful inner HTML,
+		// so the enclosing element is rendered instead.
+		source := match
+		if match.Is("img") {
+			source = match.Parent()
+		}
+
+		// Rendering errors are ignored; whatever Html returned is still appended.
+		fragment, _ := source.Html()
+		fragments = append(fragments, fragment)
+	})
+
+	return strings.Join(fragments, ""), nil
+}
+
+// getPredefinedScraperRules returns the predefined CSS selectors registered
+// for the site that websiteURL points to, or an empty string when no
+// predefined rule applies.
+//
+// A rule applies when the URL's host is exactly the rule's domain or one of
+// its subdomains (for example "www.linux.com" for "linux.com"). Plain
+// substring matching is deliberately avoided: it would also select rules for
+// unrelated hosts such as "notlinux.com" or "linux.com.example.net".
+func getPredefinedScraperRules(websiteURL string) string {
+	// Host names are case-insensitive, while the rule keys are lower case.
+	host := strings.ToLower(url.Domain(websiteURL))
+
+	// NOTE(review): url.Domain may return a host that still carries a
+	// ":port" suffix (to be confirmed). Drop a purely numeric port so the
+	// suffix comparison below still sees the bare host name.
+	if i := strings.LastIndex(host, ":"); i != -1 {
+		port := host[i+1:]
+		if port != "" && strings.Trim(port, "0123456789") == "" {
+			host = host[:i]
+		}
+	}
+
+	for domain, rules := range predefinedRules {
+		if host == domain || strings.HasSuffix(host, "."+domain) {
+			return rules
+		}
+	}
+
+	return ""
+}
--- a/reader/scraper/scraper_test.go
+++ b/reader/scraper/scraper_test.go
@ -0,0 +1,21 @@
+// Copyright 2017 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package scraper
+
+import "testing"
+
+// TestGetPredefinedRules verifies that known domains resolve to a predefined
+// rule and that an unknown domain yields no rule at all.
+func TestGetPredefinedRules(t *testing.T) {
+	scenarios := []struct {
+		websiteURL string
+		wantRules  bool
+		failure    string
+	}{
+		{"http://www.phoronix.com/", true, "Unable to find rule for phoronix.com"},
+		{"https://www.linux.com/", true, "Unable to find rule for linux.com"},
+		{"https://example.org/", false, "A rule not defined should not return anything"},
+	}
+
+	for _, scenario := range scenarios {
+		hasRules := getPredefinedScraperRules(scenario.websiteURL) != ""
+		if hasRules != scenario.wantRules {
+			t.Error(scenario.failure)
+		}
+	}
+}