mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
Return outer HTML when scraping elements
This commit is contained in:
parent
30f22fbd78
commit
8e1ed8bef3
8 changed files with 73 additions and 8 deletions
|
@ -75,13 +75,7 @@ func scrapContent(page io.Reader, rules string) (string, error) {
|
||||||
document.Find(rules).Each(func(i int, s *goquery.Selection) {
|
document.Find(rules).Each(func(i int, s *goquery.Selection) {
|
||||||
var content string
|
var content string
|
||||||
|
|
||||||
// For some inline elements, we get the parent.
|
content, _ = goquery.OuterHtml(s)
|
||||||
if s.Is("img") || s.Is("iframe") {
|
|
||||||
content, _ = s.Parent().Html()
|
|
||||||
} else {
|
|
||||||
content, _ = s.Html()
|
|
||||||
}
|
|
||||||
|
|
||||||
contents += content
|
contents += content
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,12 @@
|
||||||
|
|
||||||
package scraper // import "miniflux.app/reader/scraper"
|
package scraper // import "miniflux.app/reader/scraper"
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"bytes"
|
||||||
|
"io/ioutil"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
func TestGetPredefinedRules(t *testing.T) {
|
func TestGetPredefinedRules(t *testing.T) {
|
||||||
if getPredefinedScraperRules("http://www.phoronix.com/") == "" {
|
if getPredefinedScraperRules("http://www.phoronix.com/") == "" {
|
||||||
|
@ -40,3 +45,32 @@ func TestWhitelistedContentTypes(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSelectorRules(t *testing.T) {
|
||||||
|
var ruleTestCases = map[string]string {
|
||||||
|
"img.html": "article > img",
|
||||||
|
"iframe.html": "article > iframe",
|
||||||
|
"p.html": "article > p",
|
||||||
|
}
|
||||||
|
|
||||||
|
for filename, rule := range ruleTestCases {
|
||||||
|
html, err := ioutil.ReadFile("testdata/" + filename)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf(`Unable to read file %q: %v`, filename, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
actualResult, err := scrapContent(bytes.NewReader(html), rule)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedResult, err := ioutil.ReadFile("testdata/" + filename + "-result")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf(`Unable to read file %q: %v`, filename, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if actualResult != strings.TrimSpace(string(expectedResult)) {
|
||||||
|
t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
12
reader/scraper/testdata/iframe.html
vendored
Normal file
12
reader/scraper/testdata/iframe.html
vendored
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en-US">
|
||||||
|
<body>
|
||||||
|
<article>
|
||||||
|
<iframe id="1" src="about:blank"></iframe>
|
||||||
|
<iframe id="2" src="about:blank"></iframe>
|
||||||
|
<iframe id="3" src="about:blank"></iframe>
|
||||||
|
<iframe id="4" src="about:blank"></iframe>
|
||||||
|
<iframe id="5" src="about:blank"></iframe>
|
||||||
|
</article>
|
||||||
|
</body>
|
||||||
|
</html>
|
1
reader/scraper/testdata/iframe.html-result
vendored
Normal file
1
reader/scraper/testdata/iframe.html-result
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
<iframe id="1" src="about:blank"></iframe><iframe id="2" src="about:blank"></iframe><iframe id="3" src="about:blank"></iframe><iframe id="4" src="about:blank"></iframe><iframe id="5" src="about:blank"></iframe>
|
12
reader/scraper/testdata/img.html
vendored
Normal file
12
reader/scraper/testdata/img.html
vendored
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en-US">
|
||||||
|
<body>
|
||||||
|
<article>
|
||||||
|
<img id="1" src="#" alt="" />
|
||||||
|
<img id="2" src="#" alt="" />
|
||||||
|
<img id="3" src="#" alt="" />
|
||||||
|
<img id="4" src="#" alt="" />
|
||||||
|
<img id="5" src="#" alt="" />
|
||||||
|
</article>
|
||||||
|
</body>
|
||||||
|
</html>
|
1
reader/scraper/testdata/img.html-result
vendored
Normal file
1
reader/scraper/testdata/img.html-result
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
<img id="1" src="#" alt=""/><img id="2" src="#" alt=""/><img id="3" src="#" alt=""/><img id="4" src="#" alt=""/><img id="5" src="#" alt=""/>
|
10
reader/scraper/testdata/p.html
vendored
Normal file
10
reader/scraper/testdata/p.html
vendored
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en-US">
|
||||||
|
<body>
|
||||||
|
<article>
|
||||||
|
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p>
|
||||||
|
<p>Apquam tincidunt mauris eu risus.</p>
|
||||||
|
<p>Vestibulum auctor dapibus neque.</p>
|
||||||
|
</article>
|
||||||
|
</body>
|
||||||
|
</html>
|
1
reader/scraper/testdata/p.html-result
vendored
Normal file
1
reader/scraper/testdata/p.html-result
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p><p>Apquam tincidunt mauris eu risus.</p><p>Vestibulum auctor dapibus neque.</p>
|
Loading…
Add table
Add a link
Reference in a new issue