Mirror of https://github.com/miniflux/v2.git (synced 2025-08-01 17:38:37 +00:00)

feat: implement base element handling in content scraper

Commit: 29387f2d60
Parent: c0f6e32a99
5 changed files with 224 additions and 38 deletions
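
This change threads the HTML base element through the content scraper. ScrapeWebsite and findContentUsingCustomRules now return the document's base URL alongside the extracted content, so callers can resolve relative links found in scraped fragments. findContentUsingCustomRules reads the href attribute of the first base element under head and keeps it only when it is an absolute URL; when no usable base is found, ScrapeWebsite falls back to the effective page URL after redirects. Existing call sites gain a return value; an illustrative sketch, with variable names invented here:

	// Before: extracted content and an error.
	content, err := scraper.ScrapeWebsite(requestBuilder, pageURL, rules)

	// After: the document's base URL comes first.
	baseURL, content, err := scraper.ScrapeWebsite(requestBuilder, pageURL, rules)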
@@ -18,72 +18,77 @@ import (
 	"golang.org/x/net/html/charset"
 )
 
-func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules string) (string, error) {
-	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
+func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
+	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(pageURL))
 	defer responseHandler.Close()
 
 	if localizedError := responseHandler.LocalizedError(); localizedError != nil {
-		slog.Warn("Unable to scrape website", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
-		return "", localizedError.Error()
+		slog.Warn("Unable to scrape website", slog.String("website_url", pageURL), slog.Any("error", localizedError.Error()))
+		return "", "", localizedError.Error()
 	}
 
 	if !isAllowedContentType(responseHandler.ContentType()) {
-		return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
+		return "", "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
 	}
 
 	// The entry URL could redirect somewhere else.
-	sameSite := urllib.Domain(websiteURL) == urllib.Domain(responseHandler.EffectiveURL())
-	websiteURL = responseHandler.EffectiveURL()
+	sameSite := urllib.Domain(pageURL) == urllib.Domain(responseHandler.EffectiveURL())
+	pageURL = responseHandler.EffectiveURL()
 
 	if rules == "" {
-		rules = getPredefinedScraperRules(websiteURL)
+		rules = getPredefinedScraperRules(pageURL)
 	}
 
-	var content string
-	var err error
-
 	htmlDocumentReader, err := charset.NewReader(
 		responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
 		responseHandler.ContentType(),
 	)
 	if err != nil {
-		return "", fmt.Errorf("scraper: unable to read HTML document: %v", err)
+		return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
 	}
 
 	if sameSite && rules != "" {
 		slog.Debug("Extracting content with custom rules",
-			"url", websiteURL,
+			"url", pageURL,
 			"rules", rules,
 		)
-		content, err = findContentUsingCustomRules(htmlDocumentReader, rules)
+		baseURL, extractedContent, err = findContentUsingCustomRules(htmlDocumentReader, rules)
 	} else {
 		slog.Debug("Extracting content with readability",
-			"url", websiteURL,
+			"url", pageURL,
 		)
-		content, err = readability.ExtractContent(htmlDocumentReader)
+		baseURL, extractedContent, err = readability.ExtractContent(htmlDocumentReader)
 	}
 
-	if err != nil {
-		return "", err
+	if baseURL == "" {
+		baseURL = pageURL
+	} else {
+		slog.Debug("Using base URL from HTML document", "base_url", baseURL)
 	}
 
-	return content, nil
+	return baseURL, extractedContent, nil
 }
 
-func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
+func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, extractedContent string, err error) {
 	document, err := goquery.NewDocumentFromReader(page)
 	if err != nil {
-		return "", err
+		return "", "", err
 	}
 
-	contents := ""
+	if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
+		hrefValue = strings.TrimSpace(hrefValue)
+		if urllib.IsAbsoluteURL(hrefValue) {
+			baseURL = hrefValue
+		}
+	}
+
 	document.Find(rules).Each(func(i int, s *goquery.Selection) {
 		if content, err := goquery.OuterHtml(s); err == nil {
-			contents += content
+			extractedContent += content
 		}
 	})
 
-	return contents, nil
+	return baseURL, extractedContent, nil
 }
 
 func getPredefinedScraperRules(websiteURL string) string {
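
For illustration, a minimal standalone sketch of the same base-element lookup, assuming only goquery and the standard library; net/url's IsAbs stands in for miniflux's internal urllib.IsAbsoluteURL helper, and the sample page is invented:

package main

import (
	"fmt"
	"net/url"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	page := `<html><head><base href="https://example.org/articles/"></head>` +
		`<body><a href="part-two.html">next</a></body></html>`

	document, err := goquery.NewDocumentFromReader(strings.NewReader(page))
	if err != nil {
		panic(err)
	}

	// Same lookup as the commit: first <base> inside <head>, absolute URLs only.
	var baseURL string
	if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
		hrefValue = strings.TrimSpace(hrefValue)
		if parsed, parseErr := url.Parse(hrefValue); parseErr == nil && parsed.IsAbs() {
			baseURL = hrefValue
		}
	}

	// With a base URL in hand, relative links from the scraped content resolve cleanly.
	base, _ := url.Parse(baseURL)
	link, _ := url.Parse("part-two.html")
	fmt.Println(base.ResolveReference(link)) // https://example.org/articles/part-two.html
}

Accepting only absolute hrefs keeps the behavior well defined: a relative base href would itself need a resolution context, whereas falling back to the effective page URL, as ScrapeWebsite does, always yields an absolute base.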