")
siblingScoreThreshold := max(10, topCandidate.score*.2)
topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
@@ -124,10 +123,14 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
content := s.Text()
contentLength := len(content)
- if contentLength >= 80 && linkDensity < .25 {
- append = true
- } else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
- append = true
+ if contentLength >= 80 {
+ if linkDensity < .25 {
+ append = true
+ }
+ } else {
+ if linkDensity == 0 && containsSentence(content) {
+ append = true
+ }
}
}
@@ -138,7 +141,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
}
html, _ := s.Html()
- fmt.Fprintf(output, "<%s>%s</%s>", tag, html, tag)
+ output.WriteString("<" + tag + ">" + html + "</" + tag + ">")
}
})
@@ -156,9 +159,9 @@ func removeUnlikelyCandidates(document *goquery.Document) {
str := strings.ToLower(class + id)
if blacklistCandidatesRegexp.MatchString(str) {
- removeNodes(s)
+ s.Remove()
} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
- removeNodes(s)
+ s.Remove()
}
})
}
@@ -222,7 +225,7 @@ func getCandidates(document *goquery.Document) candidateList {
contentScore += float32(strings.Count(text, ",") + 1)
// For every 100 characters in this paragraph, add another point. Up to 3 points.
- contentScore += float32(min(int(len(text)/100.0), 3))
+ contentScore += float32(min(len(text)/100.0, 3))
candidates[parentNode].score += contentScore
if grandParentNode != nil {
@@ -261,13 +264,14 @@ func scoreNode(s *goquery.Selection) *candidate {
// Get the density of links as a percentage of the content
// This is the amount of text that is inside a link divided by the total text in the node.
func getLinkDensity(s *goquery.Selection) float32 {
- linkLength := len(s.Find("a").Text())
textLength := len(s.Text())
if textLength == 0 {
return 0
}
+ linkLength := len(s.Find("a").Text())
+
return float32(linkLength) / float32(textLength)
}
@@ -278,25 +282,20 @@ func getClassWeight(s *goquery.Selection) float32 {
class, _ := s.Attr("class")
id, _ := s.Attr("id")
- class = strings.ToLower(class)
- id = strings.ToLower(id)
-
if class != "" {
+ class = strings.ToLower(class)
if negativeRegexp.MatchString(class) {
weight -= 25
- }
-
- if positiveRegexp.MatchString(class) {
+ } else if positiveRegexp.MatchString(class) {
weight += 25
}
}
if id != "" {
+ id = strings.ToLower(id)
if negativeRegexp.MatchString(id) {
weight -= 25
- }
-
- if positiveRegexp.MatchString(id) {
+ } else if positiveRegexp.MatchString(id) {
weight += 25
}
}
@@ -314,11 +313,6 @@ func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
})
}
-func removeNodes(s *goquery.Selection) {
- s.Each(func(i int, s *goquery.Selection) {
- parent := s.Parent()
- if parent.Length() > 0 {
- parent.Get(0).RemoveChild(s.Get(0))
- }
- })
+func containsSentence(content string) bool {
+ return strings.HasSuffix(content, ".") || strings.Contains(content, ". ")
}
diff --git a/internal/reader/readability/readability_test.go b/internal/reader/readability/readability_test.go
index bd47d859..8baee1a0 100644
--- a/internal/reader/readability/readability_test.go
+++ b/internal/reader/readability/readability_test.go
@@ -100,3 +100,64 @@ func TestWithoutBaseURL(t *testing.T) {
t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
}
}
+
+func TestRemoveStyleScript(t *testing.T) {
+ html := `
+
+
+
Test
+
+
+
+
+
+
Some content
+
+ `
+ want := `
`
+
+ _, content, err := ExtractContent(strings.NewReader(html))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ content = strings.ReplaceAll(content, "\n", "")
+ content = strings.ReplaceAll(content, " ", "")
+ content = strings.ReplaceAll(content, "\t", "")
+
+ if content != want {
+ t.Errorf(`Invalid content, got %s instead of %s`, content, want)
+ }
+}
+
+func TestRemoveBlacklist(t *testing.T) {
+ html := `
+
+
+
Test
+
+
+
Some content
+
Some other thing
+
And more
+
Valid!
+
+ `
+ want := `
`
+
+ _, content, err := ExtractContent(strings.NewReader(html))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ content = strings.ReplaceAll(content, "\n", "")
+ content = strings.ReplaceAll(content, " ", "")
+ content = strings.ReplaceAll(content, "\t", "")
+
+ if content != want {
+ t.Errorf(`Invalid content, got %s instead of %s`, content, want)
+ }
+}