mirror of
				https://github.com/miniflux/v2.git
				synced 2025-10-20 19:52:08 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			108 lines
		
	
	
	
		
			3.2 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			108 lines
		
	
	
	
		
			3.2 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
 | |
| // SPDX-License-Identifier: Apache-2.0
 | |
| 
 | |
| package scraper // import "miniflux.app/v2/internal/reader/scraper"
 | |
| 
 | |
| import (
 | |
| 	"fmt"
 | |
| 	"io"
 | |
| 	"log/slog"
 | |
| 	"strings"
 | |
| 
 | |
| 	"miniflux.app/v2/internal/config"
 | |
| 	"miniflux.app/v2/internal/reader/fetcher"
 | |
| 	"miniflux.app/v2/internal/reader/readability"
 | |
| 	"miniflux.app/v2/internal/urllib"
 | |
| 
 | |
| 	"github.com/PuerkitoBio/goquery"
 | |
| 	"golang.org/x/net/html/charset"
 | |
| )
 | |
| 
 | |
| func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
 | |
| 	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(pageURL))
 | |
| 	defer responseHandler.Close()
 | |
| 
 | |
| 	if localizedError := responseHandler.LocalizedError(); localizedError != nil {
 | |
| 		slog.Warn("Unable to scrape website", slog.String("website_url", pageURL), slog.Any("error", localizedError.Error()))
 | |
| 		return "", "", localizedError.Error()
 | |
| 	}
 | |
| 
 | |
| 	if !isAllowedContentType(responseHandler.ContentType()) {
 | |
| 		return "", "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
 | |
| 	}
 | |
| 
 | |
| 	// The entry URL could redirect somewhere else.
 | |
| 	sameSite := urllib.Domain(pageURL) == urllib.Domain(responseHandler.EffectiveURL())
 | |
| 	pageURL = responseHandler.EffectiveURL()
 | |
| 
 | |
| 	if rules == "" {
 | |
| 		rules = getPredefinedScraperRules(pageURL)
 | |
| 	}
 | |
| 
 | |
| 	htmlDocumentReader, err := charset.NewReader(
 | |
| 		responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
 | |
| 		responseHandler.ContentType(),
 | |
| 	)
 | |
| 	if err != nil {
 | |
| 		return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	if sameSite && rules != "" {
 | |
| 		slog.Debug("Extracting content with custom rules",
 | |
| 			"url", pageURL,
 | |
| 			"rules", rules,
 | |
| 		)
 | |
| 		baseURL, extractedContent, err = findContentUsingCustomRules(htmlDocumentReader, rules)
 | |
| 	} else {
 | |
| 		slog.Debug("Extracting content with readability",
 | |
| 			"url", pageURL,
 | |
| 		)
 | |
| 		baseURL, extractedContent, err = readability.ExtractContent(htmlDocumentReader)
 | |
| 	}
 | |
| 
 | |
| 	if baseURL == "" {
 | |
| 		baseURL = pageURL
 | |
| 	} else {
 | |
| 		slog.Debug("Using base URL from HTML document", "base_url", baseURL)
 | |
| 	}
 | |
| 
 | |
| 	return baseURL, extractedContent, nil
 | |
| }
 | |
| 
 | |
| func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, extractedContent string, err error) {
 | |
| 	document, err := goquery.NewDocumentFromReader(page)
 | |
| 	if err != nil {
 | |
| 		return "", "", err
 | |
| 	}
 | |
| 
 | |
| 	if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
 | |
| 		hrefValue = strings.TrimSpace(hrefValue)
 | |
| 		if urllib.IsAbsoluteURL(hrefValue) {
 | |
| 			baseURL = hrefValue
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	document.Find(rules).Each(func(i int, s *goquery.Selection) {
 | |
| 		if content, err := goquery.OuterHtml(s); err == nil {
 | |
| 			extractedContent += content
 | |
| 		}
 | |
| 	})
 | |
| 
 | |
| 	return baseURL, extractedContent, nil
 | |
| }
 | |
| 
 | |
| func getPredefinedScraperRules(websiteURL string) string {
 | |
| 	urlDomain := urllib.Domain(websiteURL)
 | |
| 	urlDomain = strings.TrimPrefix(urlDomain, "www.")
 | |
| 
 | |
| 	if rules, ok := predefinedRules[urlDomain]; ok {
 | |
| 		return rules
 | |
| 	}
 | |
| 	return ""
 | |
| }
 | |
| 
 | |
// isAllowedContentType reports whether contentType, compared
// case-insensitively, starts with an HTML or XHTML media type.
func isAllowedContentType(contentType string) bool {
	normalized := strings.ToLower(contentType)
	for _, prefix := range []string{"text/html", "application/xhtml+xml"} {
		if strings.HasPrefix(normalized, prefix) {
			return true
		}
	}
	return false
}
 |