1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-01 17:38:37 +00:00

feat: implement base element handling in content scraper

This commit is contained in:
Frédéric Guillot 2024-07-24 21:41:09 -07:00
parent c0f6e32a99
commit 29387f2d60
5 changed files with 224 additions and 38 deletions

View file

@ -18,72 +18,77 @@ import (
"golang.org/x/net/html/charset"
)
func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules string) (string, error) {
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(pageURL))
defer responseHandler.Close()
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
slog.Warn("Unable to scrape website", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
return "", localizedError.Error()
slog.Warn("Unable to scrape website", slog.String("website_url", pageURL), slog.Any("error", localizedError.Error()))
return "", "", localizedError.Error()
}
if !isAllowedContentType(responseHandler.ContentType()) {
return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
return "", "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
}
// The entry URL could redirect somewhere else.
sameSite := urllib.Domain(websiteURL) == urllib.Domain(responseHandler.EffectiveURL())
websiteURL = responseHandler.EffectiveURL()
sameSite := urllib.Domain(pageURL) == urllib.Domain(responseHandler.EffectiveURL())
pageURL = responseHandler.EffectiveURL()
if rules == "" {
rules = getPredefinedScraperRules(websiteURL)
rules = getPredefinedScraperRules(pageURL)
}
var content string
var err error
htmlDocumentReader, err := charset.NewReader(
responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
responseHandler.ContentType(),
)
if err != nil {
return "", fmt.Errorf("scraper: unable to read HTML document: %v", err)
return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
}
if sameSite && rules != "" {
slog.Debug("Extracting content with custom rules",
"url", websiteURL,
"url", pageURL,
"rules", rules,
)
content, err = findContentUsingCustomRules(htmlDocumentReader, rules)
baseURL, extractedContent, err = findContentUsingCustomRules(htmlDocumentReader, rules)
} else {
slog.Debug("Extracting content with readability",
"url", websiteURL,
"url", pageURL,
)
content, err = readability.ExtractContent(htmlDocumentReader)
baseURL, extractedContent, err = readability.ExtractContent(htmlDocumentReader)
}
if err != nil {
return "", err
if baseURL == "" {
baseURL = pageURL
} else {
slog.Debug("Using base URL from HTML document", "base_url", baseURL)
}
return content, nil
return baseURL, extractedContent, nil
}
func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, extractedContent string, err error) {
document, err := goquery.NewDocumentFromReader(page)
if err != nil {
return "", err
return "", "", err
}
if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
hrefValue = strings.TrimSpace(hrefValue)
if urllib.IsAbsoluteURL(hrefValue) {
baseURL = hrefValue
}
}
contents := ""
document.Find(rules).Each(func(i int, s *goquery.Selection) {
if content, err := goquery.OuterHtml(s); err == nil {
contents += content
extractedContent += content
}
})
return contents, nil
return baseURL, extractedContent, nil
}
func getPredefinedScraperRules(websiteURL string) string {