
Refactor HTTP Client and LocalizedError packages

Frédéric Guillot 2023-10-21 19:50:29 -07:00
parent 120aabfbce
commit 14e25ab9fe
104 changed files with 1277 additions and 10672 deletions
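
An illustrative summary of the new API, assembled from the call sites in this diff: the internal/http/client and internal/errors packages are replaced by a reader/fetcher package, where RequestBuilder configures and executes outgoing requests and ResponseHandler wraps the HTTP response together with any client error. A minimal sketch of the new call pattern (the feed URL and user agent below are placeholders):

requestBuilder := fetcher.NewRequestBuilder()
requestBuilder.WithUserAgent("Miniflux/2.x") // placeholder value
requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
requestBuilder.WithProxy(config.Opts.HTTPClientProxy())

responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest("https://example.org/feed.xml"))
defer responseHandler.Close()

if localizedError := responseHandler.LocalizedError(); localizedError != nil {
	return nil, localizedError // the wrapper carries both the Go error and a translation key
}

responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
if localizedError != nil {
	return nil, localizedError
}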

View file

@@ -6,9 +6,9 @@ package atom // import "miniflux.app/v2/internal/reader/atom"
import (
"bytes"
"encoding/xml"
"fmt"
"io"
"miniflux.app/v2/internal/errors"
"miniflux.app/v2/internal/model"
xml_decoder "miniflux.app/v2/internal/reader/xml"
)
@@ -18,7 +18,7 @@ type atomFeed interface {
}
// Parse returns a normalized feed struct from an Atom feed.
func Parse(baseURL string, r io.Reader) (*model.Feed, *errors.LocalizedError) {
func Parse(baseURL string, r io.Reader) (*model.Feed, error) {
var buf bytes.Buffer
tee := io.TeeReader(r, &buf)
@@ -29,10 +29,8 @@ func Parse(baseURL string, r io.Reader) (*model.Feed, *errors.LocalizedError) {
rawFeed = new(atom10Feed)
}
decoder := xml_decoder.NewDecoder(&buf)
err := decoder.Decode(rawFeed)
if err != nil {
return nil, errors.NewLocalizedError("Unable to parse Atom feed: %q", err)
if err := xml_decoder.NewDecoder(&buf).Decode(rawFeed); err != nil {
return nil, fmt.Errorf("atom: unable to parse feed: %w", err)
}
return rawFeed.Transform(baseURL), nil
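
Because the parsers now wrap failures with %w and return plain error values instead of *errors.LocalizedError, callers can match sentinel errors through the chain with errors.Is. The RefreshFeed handler later in this diff relies on exactly this, e.g. (a condensed sketch):

if errors.Is(parseErr, parser.ErrFeedFormatNotDetected) {
	// map the sentinel to the "error.feed_format_not_detected" translation key
}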

View file

@@ -1,54 +0,0 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package browser // import "miniflux.app/v2/internal/reader/browser"
import (
"miniflux.app/v2/internal/errors"
"miniflux.app/v2/internal/http/client"
)
var (
errRequestFailed = "Unable to open this link: %v"
errServerFailure = "Unable to fetch this resource (Status Code = %d)"
errEncoding = "Unable to normalize encoding: %q"
errEmptyFeed = "This feed is empty"
errResourceNotFound = "Resource not found (404), this feed doesn't exist anymore, check the feed URL"
errNotAuthorized = "You are not authorized to access this resource (invalid username/password)"
)
// Exec executes an HTTP request and handles errors.
func Exec(request *client.Client) (*client.Response, *errors.LocalizedError) {
response, err := request.Get()
if err != nil {
if e, ok := err.(*errors.LocalizedError); ok {
return nil, e
}
return nil, errors.NewLocalizedError(errRequestFailed, err)
}
if response.IsNotFound() {
return nil, errors.NewLocalizedError(errResourceNotFound)
}
if response.IsNotAuthorized() {
return nil, errors.NewLocalizedError(errNotAuthorized)
}
if response.HasServerFailure() {
return nil, errors.NewLocalizedError(errServerFailure, response.StatusCode)
}
if response.StatusCode != 304 {
// Content-Length = -1 when no Content-Length header is sent.
if response.ContentLength == 0 {
return nil, errors.NewLocalizedError(errEmptyFeed)
}
if err := response.EnsureUnicodeBody(); err != nil {
return nil, errors.NewLocalizedError(errEncoding, err)
}
}
return response, nil
}

View file

@@ -0,0 +1,168 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package fetcher // import "miniflux.app/v2/internal/reader/fetcher"
import (
"crypto/tls"
"encoding/base64"
"log/slog"
"net"
"net/http"
"net/url"
"time"
)
const (
defaultHTTPClientTimeout = 20
defaultHTTPClientMaxBodySize = 15 * 1024 * 1024
)
type RequestBuilder struct {
headers http.Header
clientProxyURL string
useClientProxy bool
clientTimeout int
withoutRedirects bool
ignoreTLSErrors bool
}
func NewRequestBuilder() *RequestBuilder {
return &RequestBuilder{
headers: make(http.Header),
clientTimeout: defaultHTTPClientTimeout,
}
}
func (r *RequestBuilder) WithHeader(key, value string) *RequestBuilder {
r.headers.Set(key, value)
return r
}
func (r *RequestBuilder) WithETag(etag string) *RequestBuilder {
if etag != "" {
r.headers.Set("If-None-Match", etag)
}
return r
}
func (r *RequestBuilder) WithLastModified(lastModified string) *RequestBuilder {
if lastModified != "" {
r.headers.Set("If-Modified-Since", lastModified)
}
return r
}
func (r *RequestBuilder) WithUserAgent(userAgent string) *RequestBuilder {
if userAgent != "" {
r.headers.Set("User-Agent", userAgent)
} else {
r.headers.Del("User-Agent")
}
return r
}
func (r *RequestBuilder) WithCookie(cookie string) *RequestBuilder {
if cookie != "" {
r.headers.Set("Cookie", cookie)
}
return r
}
func (r *RequestBuilder) WithUsernameAndPassword(username, password string) *RequestBuilder {
if username != "" && password != "" {
r.headers.Set("Authorization", "Basic "+base64.StdEncoding.EncodeToString([]byte(username+":"+password)))
}
return r
}
func (r *RequestBuilder) WithProxy(proxyURL string) *RequestBuilder {
r.clientProxyURL = proxyURL
return r
}
func (r *RequestBuilder) UseProxy(value bool) *RequestBuilder {
r.useClientProxy = value
return r
}
func (r *RequestBuilder) WithTimeout(timeout int) *RequestBuilder {
r.clientTimeout = timeout
return r
}
func (r *RequestBuilder) WithoutRedirects() *RequestBuilder {
r.withoutRedirects = true
return r
}
func (r *RequestBuilder) IgnoreTLSErrors(value bool) *RequestBuilder {
r.ignoreTLSErrors = value
return r
}
func (r *RequestBuilder) ExecuteRequest(requestURL string) (*http.Response, error) {
transport := &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
// Default is 30s.
Timeout: 10 * time.Second,
// Default is 30s.
KeepAlive: 15 * time.Second,
}).DialContext,
// Default is 100.
MaxIdleConns: 50,
// Default is 90s.
IdleConnTimeout: 10 * time.Second,
TLSClientConfig: &tls.Config{
InsecureSkipVerify: r.ignoreTLSErrors,
},
}
if r.useClientProxy && r.clientProxyURL != "" {
if proxyURL, err := url.Parse(r.clientProxyURL); err != nil {
slog.Warn("Unable to parse proxy URL",
slog.String("proxy_url", r.clientProxyURL),
slog.Any("error", err),
)
} else {
transport.Proxy = http.ProxyURL(proxyURL)
}
}
client := &http.Client{
Timeout: time.Duration(r.clientTimeout) * time.Second,
}
if r.withoutRedirects {
client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
return http.ErrUseLastResponse
}
}
client.Transport = transport
req, err := http.NewRequest("GET", requestURL, nil)
if err != nil {
return nil, err
}
req.Header = r.headers
req.Header.Set("Accept", "*/*")
req.Header.Set("Connection", "close")
slog.Debug("Making outgoing request", slog.Group("request",
slog.String("method", req.Method),
slog.String("url", req.URL.String()),
slog.Any("headers", req.Header),
slog.Bool("without_redirects", r.withoutRedirects),
slog.Bool("with_proxy", r.useClientProxy),
slog.String("proxy_url", r.clientProxyURL),
))
return client.Do(req)
}
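
Each setter returns the receiver, so a builder can also be configured as one chained expression before ExecuteRequest, which constructs a fresh http.Transport and http.Client on every call. A short sketch (the URL and values are placeholders):

httpResponse, err := fetcher.NewRequestBuilder().
	WithUserAgent("Miniflux/2.x").
	WithTimeout(30).
	ExecuteRequest("https://example.org/feed.xml")

Since the transport is rebuilt per call, idle connections are never reused across requests, which is consistent with the explicit "Connection: close" request header set above.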

View file

@@ -0,0 +1,147 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package fetcher // import "miniflux.app/v2/internal/reader/fetcher"
import (
"crypto/x509"
"errors"
"fmt"
"io"
"net"
"net/http"
"miniflux.app/v2/internal/locale"
)
type ResponseHandler struct {
httpResponse *http.Response
clientErr error
}
func NewResponseHandler(httpResponse *http.Response, clientErr error) *ResponseHandler {
return &ResponseHandler{httpResponse: httpResponse, clientErr: clientErr}
}
func (r *ResponseHandler) EffectiveURL() string {
return r.httpResponse.Request.URL.String()
}
func (r *ResponseHandler) ContentType() string {
return r.httpResponse.Header.Get("Content-Type")
}
func (r *ResponseHandler) LastModified() string {
// Ignore caching headers for feeds that do not want any cache.
if r.httpResponse.Header.Get("Expires") == "0" {
return ""
}
return r.httpResponse.Header.Get("Last-Modified")
}
func (r *ResponseHandler) ETag() string {
// Ignore caching headers for feeds that do not want any cache.
if r.httpResponse.Header.Get("Expires") == "0" {
return ""
}
return r.httpResponse.Header.Get("ETag")
}
func (r *ResponseHandler) IsModified(lastEtagValue, lastModifiedValue string) bool {
if r.httpResponse.StatusCode == http.StatusNotModified {
return false
}
if r.ETag() != "" && r.ETag() == lastEtagValue {
return false
}
if r.LastModified() != "" && r.LastModified() == lastModifiedValue {
return false
}
return true
}
func (r *ResponseHandler) Close() {
if r.httpResponse != nil && r.httpResponse.Body != nil && r.clientErr == nil {
r.httpResponse.Body.Close()
}
}
func (r *ResponseHandler) Body(maxBodySize int64) io.ReadCloser {
return http.MaxBytesReader(nil, r.httpResponse.Body, maxBodySize)
}
func (r *ResponseHandler) ReadBody(maxBodySize int64) ([]byte, *locale.LocalizedErrorWrapper) {
limitedReader := http.MaxBytesReader(nil, r.httpResponse.Body, maxBodySize)
buffer, err := io.ReadAll(limitedReader)
if err != nil && err != io.EOF {
if err == io.ErrUnexpectedEOF {
return nil, locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: response body too large: %w", err), "error.http_response_too_large")
}
return nil, locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: unable to read response body: %w", err), "error.http_body_read", err)
}
if len(buffer) == 0 {
return nil, locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: empty response body"), "error.http_empty_response_body")
}
return buffer, nil
}
func (r *ResponseHandler) LocalizedError() *locale.LocalizedErrorWrapper {
if r.clientErr != nil {
switch r.clientErr.(type) {
case x509.CertificateInvalidError, x509.UnknownAuthorityError, x509.HostnameError:
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: %w", r.clientErr), "error.tls_error", r.clientErr.Error())
case *net.OpError:
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: %w", r.clientErr), "error.network_operation", r.clientErr.Error())
case net.Error:
networkErr := r.clientErr.(net.Error)
if networkErr.Timeout() {
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: %w", r.clientErr), "error.network_timeout", r.clientErr.Error())
}
}
if errors.Is(r.clientErr, io.EOF) {
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: %w", r.clientErr), "error.http_empty_response")
}
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: %w", r.clientErr), "error.http_client_error", r.clientErr.Error())
}
switch r.httpResponse.StatusCode {
case http.StatusUnauthorized:
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: access unauthorized (401 status code)"), "error.http_not_authorized")
case http.StatusForbidden:
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: access forbidden (403 status code)"), "error.http_forbidden")
case http.StatusTooManyRequests:
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: too many requests (429 status code)"), "error.http_too_many_requests")
case http.StatusNotFound, http.StatusGone:
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: resource not found (%d status code)", r.httpResponse.StatusCode), "error.http_resource_not_found")
case http.StatusInternalServerError:
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: remote server error (%d status code)", r.httpResponse.StatusCode), "error.http_internal_server_error")
case http.StatusBadGateway:
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: bad gateway (%d status code)", r.httpResponse.StatusCode), "error.http_bad_gateway")
case http.StatusServiceUnavailable:
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: service unavailable (%d status code)", r.httpResponse.StatusCode), "error.http_service_unavailable")
case http.StatusGatewayTimeout:
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: gateway timeout (%d status code)", r.httpResponse.StatusCode), "error.http_gateway_timeout")
}
if r.httpResponse.StatusCode >= 400 {
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: unexpected status code (%d status code)", r.httpResponse.StatusCode), "error.http_unexpected_status_code", r.httpResponse.StatusCode)
}
if r.httpResponse.StatusCode != 304 {
// Content-Length = -1 when no Content-Length header is sent.
if r.httpResponse.ContentLength == 0 {
return locale.NewLocalizedErrorWrapper(fmt.Errorf("fetcher: empty response body"), "error.http_empty_response_body")
}
}
return nil
}
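
The validator accessors above pair with WithETag and WithLastModified on the builder to implement conditional requests. A condensed sketch of how the pieces can fit together for revalidation (feed is illustrative; RefreshFeed below uses the same accessors):

requestBuilder.WithETag(feed.EtagHeader)
requestBuilder.WithLastModified(feed.LastModifiedHeader)

responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(feed.FeedURL))
defer responseHandler.Close()

if responseHandler.IsModified(feed.EtagHeader, feed.LastModifiedHeader) {
	// Fresh content: parse the body, then persist the new validators.
	feed.EtagHeader = responseHandler.ETag()
	feed.LastModifiedHeader = responseHandler.LastModified()
}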

View file

@@ -4,16 +4,15 @@
package handler // import "miniflux.app/v2/internal/reader/handler"
import (
"errors"
"log/slog"
"time"
"miniflux.app/v2/internal/config"
"miniflux.app/v2/internal/errors"
"miniflux.app/v2/internal/http/client"
"miniflux.app/v2/internal/integration"
"miniflux.app/v2/internal/locale"
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/browser"
"miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/reader/icon"
"miniflux.app/v2/internal/reader/parser"
"miniflux.app/v2/internal/reader/processor"
@@ -21,13 +20,13 @@ import (
)
var (
errDuplicate = "This feed already exists (%s)"
errNotFound = "Feed %d not found"
errCategoryNotFound = "Category not found for this user"
ErrCategoryNotFound = errors.New("fetcher: category not found")
ErrFeedNotFound = errors.New("fetcher: feed not found")
ErrDuplicatedFeed = errors.New("fetcher: duplicated feed")
)
// CreateFeed fetches, parses, and stores a new feed.
func CreateFeed(store *storage.Storage, userID int64, feedCreationRequest *model.FeedCreationRequest) (*model.Feed, error) {
func CreateFeed(store *storage.Storage, userID int64, feedCreationRequest *model.FeedCreationRequest) (*model.Feed, *locale.LocalizedErrorWrapper) {
slog.Debug("Begin feed creation process",
slog.Int64("user_id", userID),
slog.String("feed_url", feedCreationRequest.FeedURL),
@@ -35,35 +34,43 @@ func CreateFeed(store *storage.Storage, userID int64, feedCreationRequest *model
user, storeErr := store.UserByID(userID)
if storeErr != nil {
return nil, storeErr
return nil, locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr)
}
if !store.CategoryIDExists(userID, feedCreationRequest.CategoryID) {
return nil, errors.NewLocalizedError(errCategoryNotFound)
return nil, locale.NewLocalizedErrorWrapper(ErrCategoryNotFound, "error.category_not_found")
}
request := client.NewClientWithConfig(feedCreationRequest.FeedURL, config.Opts)
request.WithCredentials(feedCreationRequest.Username, feedCreationRequest.Password)
request.WithUserAgent(feedCreationRequest.UserAgent)
request.WithCookie(feedCreationRequest.Cookie)
request.AllowSelfSignedCertificates = feedCreationRequest.AllowSelfSignedCertificates
requestBuilder := fetcher.NewRequestBuilder()
requestBuilder.WithUsernameAndPassword(feedCreationRequest.Username, feedCreationRequest.Password)
requestBuilder.WithUserAgent(feedCreationRequest.UserAgent)
requestBuilder.WithCookie(feedCreationRequest.Cookie)
requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
requestBuilder.WithProxy(config.Opts.HTTPClientProxy())
requestBuilder.UseProxy(feedCreationRequest.FetchViaProxy)
requestBuilder.IgnoreTLSErrors(feedCreationRequest.AllowSelfSignedCertificates)
if feedCreationRequest.FetchViaProxy {
request.WithProxy()
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(feedCreationRequest.FeedURL))
defer responseHandler.Close()
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
slog.Warn("Unable to fetch feed", slog.String("feed_url", feedCreationRequest.FeedURL), slog.Any("error", localizedError.Error()))
return nil, localizedError
}
response, requestErr := browser.Exec(request)
if requestErr != nil {
return nil, requestErr
responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
if localizedError != nil {
slog.Warn("Unable to fetch feed", slog.String("feed_url", feedCreationRequest.FeedURL), slog.Any("error", localizedError.Error()))
return nil, localizedError
}
if store.FeedURLExists(userID, response.EffectiveURL) {
return nil, errors.NewLocalizedError(errDuplicate, response.EffectiveURL)
if store.FeedURLExists(userID, responseHandler.EffectiveURL()) {
return nil, locale.NewLocalizedErrorWrapper(ErrDuplicatedFeed, "error.duplicated_feed")
}
subscription, parseErr := parser.ParseFeed(response.EffectiveURL, response.BodyAsString())
subscription, parseErr := parser.ParseFeed(responseHandler.EffectiveURL(), string(responseBody))
if parseErr != nil {
return nil, parseErr
return nil, locale.NewLocalizedErrorWrapper(parseErr, "error.unable_to_parse_feed", parseErr)
}
subscription.UserID = userID
@@ -81,14 +88,16 @@ func CreateFeed(store *storage.Storage, userID int64, feedCreationRequest *model
subscription.BlocklistRules = feedCreationRequest.BlocklistRules
subscription.KeeplistRules = feedCreationRequest.KeeplistRules
subscription.UrlRewriteRules = feedCreationRequest.UrlRewriteRules
subscription.EtagHeader = responseHandler.ETag()
subscription.LastModifiedHeader = responseHandler.LastModified()
subscription.FeedURL = responseHandler.EffectiveURL()
subscription.WithCategoryID(feedCreationRequest.CategoryID)
subscription.WithClientResponse(response)
subscription.CheckedNow()
processor.ProcessFeedEntries(store, subscription, user, true)
if storeErr := store.CreateFeed(subscription); storeErr != nil {
return nil, storeErr
return nil, locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr)
}
slog.Debug("Created feed",
@@ -99,18 +108,16 @@ func CreateFeed(store *storage.Storage, userID int64, feedCreationRequest *model
checkFeedIcon(
store,
requestBuilder,
subscription.ID,
subscription.SiteURL,
subscription.IconURL,
feedCreationRequest.UserAgent,
feedCreationRequest.FetchViaProxy,
feedCreationRequest.AllowSelfSignedCertificates,
)
return subscription, nil
}
// RefreshFeed refreshes a feed.
func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool) error {
func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool) *locale.LocalizedErrorWrapper {
slog.Debug("Begin feed refresh process",
slog.Int64("user_id", userID),
slog.Int64("feed_id", feedID),
@@ -119,18 +126,16 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool
user, storeErr := store.UserByID(userID)
if storeErr != nil {
return storeErr
return locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr)
}
printer := locale.NewPrinter(user.Language)
originalFeed, storeErr := store.FeedByID(userID, feedID)
if storeErr != nil {
return storeErr
return locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr)
}
if originalFeed == nil {
return errors.NewLocalizedError(errNotFound, feedID)
return locale.NewLocalizedErrorWrapper(ErrFeedNotFound, "error.feed_not_found")
}
weeklyEntryCount := 0
@@ -138,52 +143,62 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool
var weeklyCountErr error
weeklyEntryCount, weeklyCountErr = store.WeeklyFeedEntryCount(userID, feedID)
if weeklyCountErr != nil {
return weeklyCountErr
return locale.NewLocalizedErrorWrapper(weeklyCountErr, "error.database_error", weeklyCountErr)
}
}
originalFeed.CheckedNow()
originalFeed.ScheduleNextCheck(weeklyEntryCount)
request := client.NewClientWithConfig(originalFeed.FeedURL, config.Opts)
request.WithCredentials(originalFeed.Username, originalFeed.Password)
request.WithUserAgent(originalFeed.UserAgent)
request.WithCookie(originalFeed.Cookie)
request.AllowSelfSignedCertificates = originalFeed.AllowSelfSignedCertificates
requestBuilder := fetcher.NewRequestBuilder()
requestBuilder.WithUsernameAndPassword(originalFeed.Username, originalFeed.Password)
requestBuilder.WithUserAgent(originalFeed.UserAgent)
requestBuilder.WithCookie(originalFeed.Cookie)
requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
requestBuilder.WithProxy(config.Opts.HTTPClientProxy())
requestBuilder.UseProxy(originalFeed.FetchViaProxy)
requestBuilder.IgnoreTLSErrors(originalFeed.AllowSelfSignedCertificates)
if !originalFeed.IgnoreHTTPCache {
request.WithCacheHeaders(originalFeed.EtagHeader, originalFeed.LastModifiedHeader)
}
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(originalFeed.FeedURL))
defer responseHandler.Close()
if originalFeed.FetchViaProxy {
request.WithProxy()
}
response, requestErr := browser.Exec(request)
if requestErr != nil {
originalFeed.WithError(requestErr.Localize(printer))
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
slog.Warn("Unable to fetch feed", slog.String("feed_url", originalFeed.FeedURL), slog.Any("error", localizedError.Error()))
originalFeed.WithTranslatedErrorMessage(localizedError.Translate(user.Language))
store.UpdateFeedError(originalFeed)
return requestErr
return localizedError
}
if store.AnotherFeedURLExists(userID, originalFeed.ID, response.EffectiveURL) {
storeErr := errors.NewLocalizedError(errDuplicate, response.EffectiveURL)
originalFeed.WithError(storeErr.Error())
if store.AnotherFeedURLExists(userID, originalFeed.ID, responseHandler.EffectiveURL()) {
localizedError := locale.NewLocalizedErrorWrapper(ErrDuplicatedFeed, "error.duplicated_feed")
originalFeed.WithTranslatedErrorMessage(localizedError.Translate(user.Language))
store.UpdateFeedError(originalFeed)
return storeErr
return localizedError
}
if originalFeed.IgnoreHTTPCache || response.IsModified(originalFeed.EtagHeader, originalFeed.LastModifiedHeader) {
if originalFeed.IgnoreHTTPCache || responseHandler.IsModified(originalFeed.EtagHeader, originalFeed.LastModifiedHeader) {
slog.Debug("Feed modified",
slog.Int64("user_id", userID),
slog.Int64("feed_id", feedID),
)
updatedFeed, parseErr := parser.ParseFeed(response.EffectiveURL, response.BodyAsString())
responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
if localizedError != nil {
slog.Warn("Unable to fetch feed", slog.String("feed_url", originalFeed.FeedURL), slog.Any("error", localizedError.Error()))
return localizedError
}
updatedFeed, parseErr := parser.ParseFeed(responseHandler.EffectiveURL(), string(responseBody))
if parseErr != nil {
originalFeed.WithError(parseErr.Localize(printer))
localizedError := locale.NewLocalizedErrorWrapper(parseErr, "error.unable_to_parse_feed")
if errors.Is(parseErr, parser.ErrFeedFormatNotDetected) {
localizedError = locale.NewLocalizedErrorWrapper(parseErr, "error.feed_format_not_detected", parseErr)
}
originalFeed.WithTranslatedErrorMessage(localizedError.Translate(user.Language))
store.UpdateFeedError(originalFeed)
return parseErr
return localizedError
}
// If the feed has a TTL defined, we use it to make sure we don't check it too often.
@@ -215,9 +230,10 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool
updateExistingEntries := forceRefresh || !originalFeed.Crawler
newEntries, storeErr := store.RefreshFeedEntries(originalFeed.UserID, originalFeed.ID, originalFeed.Entries, updateExistingEntries)
if storeErr != nil {
originalFeed.WithError(storeErr.Error())
localizedError := locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr)
originalFeed.WithTranslatedErrorMessage(localizedError.Translate(user.Language))
store.UpdateFeedError(originalFeed)
return storeErr
return localizedError
}
userIntegrations, intErr := store.Integration(userID)
@@ -233,16 +249,15 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool
// We update caching headers only if the feed has been modified,
// because some websites don't return the same headers when replying with a 304.
originalFeed.WithClientResponse(response)
originalFeed.EtagHeader = responseHandler.ETag()
originalFeed.LastModifiedHeader = responseHandler.LastModified()
checkFeedIcon(
store,
requestBuilder,
originalFeed.ID,
originalFeed.SiteURL,
updatedFeed.IconURL,
originalFeed.UserAgent,
originalFeed.FetchViaProxy,
originalFeed.AllowSelfSignedCertificates,
)
} else {
slog.Debug("Feed not modified",
@@ -254,17 +269,18 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool
originalFeed.ResetErrorCounter()
if storeErr := store.UpdateFeed(originalFeed); storeErr != nil {
originalFeed.WithError(storeErr.Error())
localizedError := locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr)
originalFeed.WithTranslatedErrorMessage(localizedError.Translate(user.Language))
store.UpdateFeedError(originalFeed)
return storeErr
return localizedError
}
return nil
}
func checkFeedIcon(store *storage.Storage, feedID int64, websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) {
func checkFeedIcon(store *storage.Storage, requestBuilder *fetcher.RequestBuilder, feedID int64, websiteURL, feedIconURL string) {
if !store.HasIcon(feedID) {
iconFinder := icon.NewIconFinder(websiteURL, feedIconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates)
iconFinder := icon.NewIconFinder(requestBuilder, websiteURL, feedIconURL)
if icon, err := iconFinder.FindIcon(); err != nil {
slog.Debug("Unable to find feed icon",
slog.Int64("feed_id", feedID),

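Throughout this handler, failures follow one recurring pattern: locale.NewLocalizedErrorWrapper keeps the raw Go error (returned by Error(), used for slog output) alongside a translation key whose rendering, via Translate(user.Language), is stored on the feed. Condensed, with storeErr as an illustrative wrapped error:

localizedError := locale.NewLocalizedErrorWrapper(storeErr, "error.database_error", storeErr)
originalFeed.WithTranslatedErrorMessage(localizedError.Translate(user.Language))
store.UpdateFeedError(originalFeed)
return localizedError
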
View file

@@ -13,28 +13,24 @@ import (
"miniflux.app/v2/internal/config"
"miniflux.app/v2/internal/crypto"
"miniflux.app/v2/internal/http/client"
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/urllib"
"github.com/PuerkitoBio/goquery"
)
type IconFinder struct {
websiteURL string
feedIconURL string
userAgent string
fetchViaProxy bool
allowSelfSignedCertificates bool
requestBuilder *fetcher.RequestBuilder
websiteURL string
feedIconURL string
}
func NewIconFinder(websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) *IconFinder {
func NewIconFinder(requestBuilder *fetcher.RequestBuilder, websiteURL, feedIconURL string) *IconFinder {
return &IconFinder{
websiteURL: websiteURL,
feedIconURL: feedIconURL,
userAgent: userAgent,
fetchViaProxy: fetchViaProxy,
allowSelfSignedCertificates: allowSelfSignedCertificates,
requestBuilder: requestBuilder,
websiteURL: websiteURL,
feedIconURL: feedIconURL,
}
}
@@ -105,12 +101,16 @@ func (f *IconFinder) FetchIconsFromHTMLDocument() (*model.Icon, error) {
slog.String("website_url", f.websiteURL),
)
documentBody, err := f.FetchRootDocument()
if err != nil {
return nil, err
rootURL := urllib.RootURL(f.websiteURL)
responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(rootURL))
defer responseHandler.Close()
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
return nil, fmt.Errorf("icon: unable to download website index page: %w", localizedError.Error())
}
iconURLs, err := findIconURLsFromHTMLDocument(documentBody)
iconURLs, err := findIconURLsFromHTMLDocument(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()))
if err != nil {
return nil, err
}
@@ -151,64 +151,28 @@ func (f *IconFinder) FetchIconsFromHTMLDocument() (*model.Icon, error) {
return nil, nil
}
func (f *IconFinder) FetchRootDocument() (io.Reader, error) {
rootURL := urllib.RootURL(f.websiteURL)
clt := client.NewClientWithConfig(rootURL, config.Opts)
clt.WithUserAgent(f.userAgent)
clt.AllowSelfSignedCertificates = f.allowSelfSignedCertificates
if f.fetchViaProxy {
clt.WithProxy()
}
response, err := clt.Get()
if err != nil {
return nil, fmt.Errorf("icon: unable to download website index page: %v", err)
}
if response.HasServerFailure() {
return nil, fmt.Errorf("icon: unable to download website index page: status=%d", response.StatusCode)
}
return response.Body, nil
}
func (f *IconFinder) DownloadIcon(iconURL string) (*model.Icon, error) {
slog.Debug("Downloading icon",
slog.String("website_url", f.websiteURL),
slog.String("icon_url", iconURL),
)
clt := client.NewClientWithConfig(iconURL, config.Opts)
clt.WithUserAgent(f.userAgent)
clt.AllowSelfSignedCertificates = f.allowSelfSignedCertificates
if f.fetchViaProxy {
clt.WithProxy()
responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(iconURL))
defer responseHandler.Close()
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
return nil, fmt.Errorf("icon: unable to download website icon: %w", localizedError.Error())
}
response, err := clt.Get()
if err != nil {
return nil, fmt.Errorf("icon: unable to download icon %s: %v", iconURL, err)
}
if response.HasServerFailure() {
return nil, fmt.Errorf("icon: unable to download icon %s: status=%d", iconURL, response.StatusCode)
}
body, err := io.ReadAll(response.Body)
if err != nil {
return nil, fmt.Errorf("icon: unable to read downloaded icon from %s: %v", iconURL, err)
}
if len(body) == 0 {
return nil, fmt.Errorf("icon: downloaded icon is empty, iconURL=%s", iconURL)
responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
if localizedError != nil {
return nil, fmt.Errorf("icon: unable to read response body: %w", localizedError.Error())
}
icon := &model.Icon{
Hash: crypto.HashFromBytes(body),
MimeType: response.ContentType,
Content: body,
Hash: crypto.HashFromBytes(responseBody),
MimeType: responseHandler.ContentType(),
Content: responseBody,
}
return icon, nil

View file

@@ -5,18 +5,17 @@ package json // import "miniflux.app/v2/internal/reader/json"
import (
"encoding/json"
"fmt"
"io"
"miniflux.app/v2/internal/errors"
"miniflux.app/v2/internal/model"
)
// Parse returns a normalized feed struct from a JSON feed.
func Parse(baseURL string, data io.Reader) (*model.Feed, *errors.LocalizedError) {
func Parse(baseURL string, data io.Reader) (*model.Feed, error) {
feed := new(jsonFeed)
decoder := json.NewDecoder(data)
if err := decoder.Decode(&feed); err != nil {
return nil, errors.NewLocalizedError("Unable to parse JSON Feed: %q", err)
if err := json.NewDecoder(data).Decode(&feed); err != nil {
return nil, fmt.Errorf("json: unable to parse feed: %w", err)
}
return feed.Transform(baseURL), nil

View file

@@ -5,14 +5,14 @@ package opml // import "miniflux.app/v2/internal/reader/opml"
import (
"encoding/xml"
"fmt"
"io"
"miniflux.app/v2/internal/errors"
"miniflux.app/v2/internal/reader/encoding"
)
// Parse reads an OPML file and returns a SubcriptionList.
func Parse(data io.Reader) (SubcriptionList, *errors.LocalizedError) {
func Parse(data io.Reader) (SubcriptionList, error) {
opmlDocument := NewOPMLDocument()
decoder := xml.NewDecoder(data)
decoder.Entity = xml.HTMLEntity
@@ -21,7 +21,7 @@ func Parse(data io.Reader) (SubcriptionList, *errors.LocalizedError) {
err := decoder.Decode(opmlDocument)
if err != nil {
return nil, errors.NewLocalizedError("Unable to parse OPML file: %q", err)
return nil, fmt.Errorf("opml: unable to parse document: %w", err)
}
return getSubscriptionsFromOutlines(opmlDocument.Outlines, ""), nil

View file

@@ -4,9 +4,9 @@
package parser // import "miniflux.app/v2/internal/reader/parser"
import (
"errors"
"strings"
"miniflux.app/v2/internal/errors"
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/atom"
"miniflux.app/v2/internal/reader/json"
@@ -14,8 +14,10 @@ import (
"miniflux.app/v2/internal/reader/rss"
)
var ErrFeedFormatNotDetected = errors.New("parser: unable to detect feed format")
// ParseFeed analyzes the input data and returns a normalized feed object.
func ParseFeed(baseURL, data string) (*model.Feed, *errors.LocalizedError) {
func ParseFeed(baseURL, data string) (*model.Feed, error) {
switch DetectFeedFormat(data) {
case FormatAtom:
return atom.Parse(baseURL, strings.NewReader(data))
@@ -26,6 +28,6 @@ func ParseFeed(baseURL, data string) (*model.Feed, *errors.LocalizedError) {
case FormatRDF:
return rdf.Parse(baseURL, strings.NewReader(data))
default:
return nil, errors.NewLocalizedError("Unsupported feed format")
return nil, ErrFeedFormatNotDetected
}
}

View file

@@ -4,11 +4,7 @@
package parser // import "miniflux.app/v2/internal/reader/parser"
import (
"bytes"
"os"
"testing"
"miniflux.app/v2/internal/http/client"
)
func TestParseAtom(t *testing.T) {
@@ -301,50 +297,3 @@ func TestParseEmptyFeed(t *testing.T) {
t.Error("ParseFeed must return an error")
}
}
func TestDifferentEncodingWithResponse(t *testing.T) {
var unicodeTestCases = []struct {
filename, contentType string
index int
title string
}{
// Arabic language encoded in UTF-8.
{"urdu_UTF8.xml", "text/xml; charset=utf-8", 0, "امریکی عسکری امداد کی بندش کی وجوہات: انڈیا سے جنگ، جوہری پروگرام اور اب دہشت گردوں کی پشت پناہی"},
// Windows-1251 encoding and no charset in the HTTP header.
{"encoding_WINDOWS-1251.xml", "text/xml", 0, "Цитата #17703"},
// No encoding in XML, but defined in HTTP Content-Type header.
{"no_encoding_ISO-8859-1.xml", "application/xml; charset=ISO-8859-1", 2, "La criminalité liée surtout à... l'ennui ?"},
// ISO-8859-1 encoding defined in XML and HTTP header.
{"encoding_ISO-8859-1.xml", "application/rss+xml; charset=ISO-8859-1", 5, "Projekt Jedi: Microsoft will weiter mit US-Militär zusammenarbeiten"},
// UTF-8 encoding defined in RDF document and HTTP header.
{"rdf_UTF8.xml", "application/rss+xml; charset=utf-8", 1, "Mega-Deal: IBM übernimmt Red Hat"},
// UTF-8 encoding defined only in RDF document.
{"rdf_UTF8.xml", "application/rss+xml", 1, "Mega-Deal: IBM übernimmt Red Hat"},
}
for _, tc := range unicodeTestCases {
content, err := os.ReadFile("testdata/" + tc.filename)
if err != nil {
t.Fatalf(`Unable to read file %q: %v`, tc.filename, err)
}
r := &client.Response{Body: bytes.NewReader(content), ContentType: tc.contentType}
if encodingErr := r.EnsureUnicodeBody(); encodingErr != nil {
t.Fatalf(`Encoding error for %q: %v`, tc.filename, encodingErr)
}
feed, parseErr := ParseFeed("https://example.org/", r.BodyAsString())
if parseErr != nil {
t.Fatalf(`Parsing error for %q - %q: %v`, tc.filename, tc.contentType, parseErr)
}
if feed.Entries[tc.index].Title != tc.title {
t.Errorf(`Unexpected title, got %q instead of %q`, feed.Entries[tc.index].Title, tc.title)
}
}
}

View file

@@ -12,10 +12,9 @@ import (
"time"
"miniflux.app/v2/internal/config"
"miniflux.app/v2/internal/http/client"
"miniflux.app/v2/internal/metric"
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/browser"
"miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/reader/readingtime"
"miniflux.app/v2/internal/reader/rewrite"
"miniflux.app/v2/internal/reader/sanitizer"
@@ -52,7 +51,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
continue
}
url := getUrlFromEntry(feed, entry)
websiteURL := getUrlFromEntry(feed, entry)
entryIsNew := !store.EntryURLExists(feed.ID, entry.URL)
if feed.Crawler && (entryIsNew || forceRefresh) {
slog.Debug("Scraping entry",
@@ -64,13 +63,19 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
)
startTime := time.Now()
content, scraperErr := scraper.Fetch(
url,
requestBuilder := fetcher.NewRequestBuilder()
requestBuilder.WithUserAgent(feed.UserAgent)
requestBuilder.WithCookie(feed.Cookie)
requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
requestBuilder.WithProxy(config.Opts.HTTPClientProxy())
requestBuilder.UseProxy(feed.FetchViaProxy)
requestBuilder.IgnoreTLSErrors(feed.AllowSelfSignedCertificates)
content, scraperErr := scraper.ScrapeWebsite(
requestBuilder,
websiteURL,
feed.ScraperRules,
feed.UserAgent,
feed.Cookie,
feed.AllowSelfSignedCertificates,
feed.FetchViaProxy,
)
if config.Opts.HasMetricsCollector() {
@@ -96,10 +101,10 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
}
}
rewrite.Rewriter(url, entry, feed.RewriteRules)
rewrite.Rewriter(websiteURL, entry, feed.RewriteRules)
// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
entry.Content = sanitizer.Sanitize(url, entry.Content)
entry.Content = sanitizer.Sanitize(websiteURL, entry.Content)
updateEntryReadingTime(store, feed, entry, entryIsNew, user)
filteredEntries = append(filteredEntries, entry)
@@ -146,15 +151,20 @@ func isAllowedEntry(feed *model.Feed, entry *model.Entry) bool {
// ProcessEntryWebPage downloads the entry web page and applies rewrite rules.
func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) error {
startTime := time.Now()
url := getUrlFromEntry(feed, entry)
websiteURL := getUrlFromEntry(feed, entry)
content, scraperErr := scraper.Fetch(
url,
entry.Feed.ScraperRules,
entry.Feed.UserAgent,
entry.Feed.Cookie,
feed.AllowSelfSignedCertificates,
feed.FetchViaProxy,
requestBuilder := fetcher.NewRequestBuilder()
requestBuilder.WithUserAgent(feed.UserAgent)
requestBuilder.WithCookie(feed.Cookie)
requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
requestBuilder.WithProxy(config.Opts.HTTPClientProxy())
requestBuilder.UseProxy(feed.FetchViaProxy)
requestBuilder.IgnoreTLSErrors(feed.AllowSelfSignedCertificates)
content, scraperErr := scraper.ScrapeWebsite(
requestBuilder,
websiteURL,
feed.ScraperRules,
)
if config.Opts.HasMetricsCollector() {
@@ -174,8 +184,8 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed)
}
rewrite.Rewriter(url, entry, entry.Feed.RewriteRules)
entry.Content = sanitizer.Sanitize(url, entry.Content)
rewrite.Rewriter(websiteURL, entry, entry.Feed.RewriteRules)
entry.Content = sanitizer.Sanitize(websiteURL, entry.Content)
return nil
}
@@ -270,14 +280,20 @@ func shouldFetchOdyseeWatchTime(entry *model.Entry) bool {
return matches != nil
}
func fetchYouTubeWatchTime(url string) (int, error) {
clt := client.NewClientWithConfig(url, config.Opts)
response, browserErr := browser.Exec(clt)
if browserErr != nil {
return 0, browserErr
func fetchYouTubeWatchTime(websiteURL string) (int, error) {
requestBuilder := fetcher.NewRequestBuilder()
requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
requestBuilder.WithProxy(config.Opts.HTTPClientProxy())
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
defer responseHandler.Close()
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
slog.Warn("Unable to fetch YouTube page", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
return 0, localizedError.Error()
}
doc, docErr := goquery.NewDocumentFromReader(response.Body)
doc, docErr := goquery.NewDocumentFromReader(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()))
if docErr != nil {
return 0, docErr
}
@@ -295,14 +311,20 @@ func fetchYouTubeWatchTime(url string) (int, error) {
return int(dur.Minutes()), nil
}
func fetchOdyseeWatchTime(url string) (int, error) {
clt := client.NewClientWithConfig(url, config.Opts)
response, browserErr := browser.Exec(clt)
if browserErr != nil {
return 0, browserErr
func fetchOdyseeWatchTime(websiteURL string) (int, error) {
requestBuilder := fetcher.NewRequestBuilder()
requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
requestBuilder.WithProxy(config.Opts.HTTPClientProxy())
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
defer responseHandler.Close()
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
slog.Warn("Unable to fetch Odysee watch time", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
return 0, localizedError.Error()
}
doc, docErr := goquery.NewDocumentFromReader(response.Body)
doc, docErr := goquery.NewDocumentFromReader(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()))
if docErr != nil {
return 0, docErr
}

View file

@@ -4,20 +4,18 @@
package rdf // import "miniflux.app/v2/internal/reader/rdf"
import (
"fmt"
"io"
"miniflux.app/v2/internal/errors"
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/xml"
)
// Parse returns a normalized feed struct from an RDF feed.
func Parse(baseURL string, data io.Reader) (*model.Feed, *errors.LocalizedError) {
func Parse(baseURL string, data io.Reader) (*model.Feed, error) {
feed := new(rdfFeed)
decoder := xml.NewDecoder(data)
err := decoder.Decode(feed)
if err != nil {
return nil, errors.NewLocalizedError("Unable to parse RDF feed: %q", err)
if err := xml.NewDecoder(data).Decode(feed); err != nil {
return nil, fmt.Errorf("rdf: unable to parse feed: %w", err)
}
return feed.Transform(baseURL), nil

View file

@@ -4,21 +4,18 @@
package rss // import "miniflux.app/v2/internal/reader/rss"
import (
"fmt"
"io"
"miniflux.app/v2/internal/errors"
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/xml"
)
// Parse returns a normalized feed struct from an RSS feed.
func Parse(baseURL string, data io.Reader) (*model.Feed, *errors.LocalizedError) {
func Parse(baseURL string, data io.Reader) (*model.Feed, error) {
feed := new(rssFeed)
decoder := xml.NewDecoder(data)
err := decoder.Decode(feed)
if err != nil {
return nil, errors.NewLocalizedError("Unable to parse RSS feed: %q", err)
if err := xml.NewDecoder(data).Decode(feed); err != nil {
return nil, fmt.Errorf("rss: unable to parse feed: %w", err)
}
return feed.Transform(baseURL), nil
}

View file

@@ -4,67 +4,54 @@
package scraper // import "miniflux.app/v2/internal/reader/scraper"
import (
"errors"
"fmt"
"io"
"log/slog"
"strings"
"miniflux.app/v2/internal/config"
"miniflux.app/v2/internal/http/client"
"miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/reader/readability"
"miniflux.app/v2/internal/urllib"
"github.com/PuerkitoBio/goquery"
)
// Fetch downloads a web page and returns relevant contents.
func Fetch(websiteURL, rules, userAgent string, cookie string, allowSelfSignedCertificates, useProxy bool) (string, error) {
clt := client.NewClientWithConfig(websiteURL, config.Opts)
clt.WithUserAgent(userAgent)
clt.WithCookie(cookie)
if useProxy {
clt.WithProxy()
}
clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules string) (string, error) {
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
defer responseHandler.Close()
response, err := clt.Get()
if err != nil {
return "", err
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
slog.Warn("Unable to scrape website", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
return "", localizedError.Error()
}
if response.HasServerFailure() {
return "", errors.New("scraper: unable to download web page")
}
if !isAllowedContentType(response.ContentType) {
return "", fmt.Errorf("scraper: this resource is not an HTML document (%s)", response.ContentType)
}
if err = response.EnsureUnicodeBody(); err != nil {
return "", err
if !isAllowedContentType(responseHandler.ContentType()) {
return "", fmt.Errorf("scraper: this resource is not an HTML document (%s)", responseHandler.ContentType())
}
// The entry URL could redirect somewhere else.
sameSite := urllib.Domain(websiteURL) == urllib.Domain(response.EffectiveURL)
websiteURL = response.EffectiveURL
sameSite := urllib.Domain(websiteURL) == urllib.Domain(responseHandler.EffectiveURL())
websiteURL = responseHandler.EffectiveURL()
if rules == "" {
rules = getPredefinedScraperRules(websiteURL)
}
var content string
var err error
if sameSite && rules != "" {
slog.Debug("Extracting content with custom rules",
"url", websiteURL,
"rules", rules,
)
content, err = scrapContent(response.Body, rules)
content, err = findContentUsingCustomRules(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()), rules)
} else {
slog.Debug("Extracting content with readability",
"url", websiteURL,
)
content, err = readability.ExtractContent(response.Body)
content, err = readability.ExtractContent(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()))
}
if err != nil {
@@ -74,7 +61,7 @@ func Fetch(websiteURL, rules, userAgent string, cookie string, allowSelfSignedCe
return content, nil
}
func scrapContent(page io.Reader, rules string) (string, error) {
func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
document, err := goquery.NewDocumentFromReader(page)
if err != nil {
return "", err

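The scraper migration above is representative of the whole commit: per-call options (user agent, cookie, proxy, TLS verification) move off the function signature and onto the shared RequestBuilder. A before/after sketch, with argument values named after the old Fetch parameters:

// Before:
content, scraperErr := scraper.Fetch(websiteURL, rules, userAgent, cookie, allowSelfSignedCertificates, useProxy)

// After:
content, scraperErr := scraper.ScrapeWebsite(requestBuilder, websiteURL, rules)
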
View file

@@ -58,7 +58,7 @@ func TestSelectorRules(t *testing.T) {
t.Fatalf(`Unable to read file %q: %v`, filename, err)
}
actualResult, err := scrapContent(bytes.NewReader(html), rule)
actualResult, err := findContentUsingCustomRules(bytes.NewReader(html), rule)
if err != nil {
t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
}

View file

@@ -4,16 +4,16 @@
package subscription // import "miniflux.app/v2/internal/reader/subscription"
import (
"bytes"
"fmt"
"io"
"log/slog"
"regexp"
"strings"
"miniflux.app/v2/internal/config"
"miniflux.app/v2/internal/errors"
"miniflux.app/v2/internal/http/client"
"miniflux.app/v2/internal/integration/rssbridge"
"miniflux.app/v2/internal/reader/browser"
"miniflux.app/v2/internal/locale"
"miniflux.app/v2/internal/reader/fetcher"
"miniflux.app/v2/internal/reader/parser"
"miniflux.app/v2/internal/urllib"
@@ -21,18 +21,70 @@ import (
)
var (
errUnreadableDoc = "Unable to analyze this page: %v"
youtubeChannelRegex = regexp.MustCompile(`youtube\.com/channel/(.*)`)
youtubeVideoRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
)
// FindSubscriptions downloads and tries to find one or more subscriptions from a URL.
func FindSubscriptions(websiteURL, userAgent, cookie, username, password string, fetchViaProxy, allowSelfSignedCertificates bool, rssbridgeURL string) (Subscriptions, *errors.LocalizedError) {
func FindSubscriptions(websiteURL, userAgent, cookie, username, password string, fetchViaProxy, allowSelfSignedCertificates bool, rssbridgeURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
websiteURL = findYoutubeChannelFeed(websiteURL)
websiteURL = parseYoutubeVideoPage(websiteURL)
requestBuilder := fetcher.NewRequestBuilder()
requestBuilder.WithUsernameAndPassword(username, password)
requestBuilder.WithUserAgent(userAgent)
requestBuilder.WithCookie(cookie)
requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
requestBuilder.WithProxy(config.Opts.HTTPClientProxy())
requestBuilder.UseProxy(fetchViaProxy)
requestBuilder.IgnoreTLSErrors(allowSelfSignedCertificates)
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
defer responseHandler.Close()
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
slog.Warn("Unable to find subscriptions", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
return nil, localizedError
}
responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
if localizedError != nil {
slog.Warn("Unable to find subscriptions", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
return nil, localizedError
}
if format := parser.DetectFeedFormat(string(responseBody)); format != parser.FormatUnknown {
var subscriptions Subscriptions
subscriptions = append(subscriptions, &Subscription{
Title: responseHandler.EffectiveURL(),
URL: responseHandler.EffectiveURL(),
Type: format,
})
return subscriptions, nil
}
subscriptions, localizedError := parseWebPage(responseHandler.EffectiveURL(), bytes.NewReader(responseBody))
if localizedError != nil || subscriptions != nil {
return subscriptions, localizedError
}
if rssbridgeURL != "" {
slog.Debug("Trying to detect feeds using RSS-Bridge",
slog.String("website_url", websiteURL),
slog.String("rssbridge_url", rssbridgeURL),
)
bridges, err := rssbridge.DetectBridges(rssbridgeURL, websiteURL)
if err != nil {
return nil, errors.NewLocalizedError("RSS-Bridge: %v", err)
return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_detect_rssbridge", err)
}
slog.Debug("RSS-Bridge results",
slog.String("website_url", websiteURL),
slog.String("rssbridge_url", rssbridgeURL),
slog.Int("nb_bridges", len(bridges)),
)
if len(bridges) > 0 {
var subscriptions Subscriptions
for _, bridge := range bridges {
@@ -46,45 +98,10 @@ func FindSubscriptions(websiteURL, userAgent, cookie, username, password string,
}
}
websiteURL = findYoutubeChannelFeed(websiteURL)
websiteURL = parseYoutubeVideoPage(websiteURL)
clt := client.NewClientWithConfig(websiteURL, config.Opts)
clt.WithCredentials(username, password)
clt.WithUserAgent(userAgent)
clt.WithCookie(cookie)
clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
if fetchViaProxy {
clt.WithProxy()
}
response, err := browser.Exec(clt)
if err != nil {
return nil, err
}
body := response.BodyAsString()
if format := parser.DetectFeedFormat(body); format != parser.FormatUnknown {
var subscriptions Subscriptions
subscriptions = append(subscriptions, &Subscription{
Title: response.EffectiveURL,
URL: response.EffectiveURL,
Type: format,
})
return subscriptions, nil
}
subscriptions, err := parseWebPage(response.EffectiveURL, strings.NewReader(body))
if err != nil || subscriptions != nil {
return subscriptions, err
}
return tryWellKnownUrls(websiteURL, userAgent, cookie, username, password)
return tryWellKnownUrls(websiteURL, userAgent, cookie, username, password, fetchViaProxy, allowSelfSignedCertificates)
}
func parseWebPage(websiteURL string, data io.Reader) (Subscriptions, *errors.LocalizedError) {
func parseWebPage(websiteURL string, data io.Reader) (Subscriptions, *locale.LocalizedErrorWrapper) {
var subscriptions Subscriptions
queries := map[string]string{
"link[type='application/rss+xml']": "rss",
@@ -95,7 +112,7 @@ func parseWebPage(websiteURL string, data io.Reader) (Subscriptions, *errors.Loc
doc, err := goquery.NewDocumentFromReader(data)
if err != nil {
return nil, errors.NewLocalizedError(errUnreadableDoc, err)
return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
}
for query, kind := range queries {
@@ -140,13 +157,19 @@ func parseYoutubeVideoPage(websiteURL string) string {
return websiteURL
}
clt := client.NewClientWithConfig(websiteURL, config.Opts)
response, browserErr := browser.Exec(clt)
if browserErr != nil {
requestBuilder := fetcher.NewRequestBuilder()
requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
requestBuilder.WithProxy(config.Opts.HTTPClientProxy())
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
defer responseHandler.Close()
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
slog.Warn("Unable to find subscriptions", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
return websiteURL
}
doc, docErr := goquery.NewDocumentFromReader(response.Body)
doc, docErr := goquery.NewDocumentFromReader(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()))
if docErr != nil {
return websiteURL
}
@@ -158,7 +181,7 @@ func parseYoutubeVideoPage(websiteURL string) string {
return websiteURL
}
func tryWellKnownUrls(websiteURL, userAgent, cookie, username, password string) (Subscriptions, *errors.LocalizedError) {
func tryWellKnownUrls(websiteURL, userAgent, cookie, username, password string, fetchViaProxy, allowSelfSignedCertificates bool) (Subscriptions, *locale.LocalizedErrorWrapper) {
var subscriptions Subscriptions
knownURLs := map[string]string{
"atom.xml": "atom",
@@ -173,6 +196,7 @@ func tryWellKnownUrls(websiteURL, userAgent, cookie, username, password string)
// Look for knownURLs in the root.
websiteURLRoot,
}
// Look for knownURLs in the current subdirectory, such as 'example.com/blog/'.
websiteURL, _ = urllib.AbsoluteURL(websiteURL, "./")
if websiteURL != websiteURLRoot {
@@ -185,30 +209,33 @@ func tryWellKnownUrls(websiteURL, userAgent, cookie, username, password string)
if err != nil {
continue
}
clt := client.NewClientWithConfig(fullURL, config.Opts)
clt.WithCredentials(username, password)
clt.WithUserAgent(userAgent)
clt.WithCookie(cookie)
requestBuilder := fetcher.NewRequestBuilder()
requestBuilder.WithUsernameAndPassword(username, password)
requestBuilder.WithUserAgent(userAgent)
requestBuilder.WithCookie(cookie)
requestBuilder.WithTimeout(config.Opts.HTTPClientTimeout())
requestBuilder.WithProxy(config.Opts.HTTPClientProxy())
requestBuilder.UseProxy(fetchViaProxy)
requestBuilder.IgnoreTLSErrors(allowSelfSignedCertificates)
// Some websites redirect unknown URLs to the home page.
// As a result, the list of known URLs would be returned in the subscription list.
// We don't want the user to choose between invalid feed URLs.
clt.WithoutRedirects()
requestBuilder.WithoutRedirects()
response, err := clt.Get()
if err != nil {
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(fullURL))
defer responseHandler.Close()
if localizedError := responseHandler.LocalizedError(); localizedError != nil {
continue
}
if response != nil && response.StatusCode == 200 {
subscription := new(Subscription)
subscription.Type = kind
subscription.Title = fullURL
subscription.URL = fullURL
if subscription.URL != "" {
subscriptions = append(subscriptions, subscription)
}
}
subscription := new(Subscription)
subscription.Type = kind
subscription.Title = fullURL
subscription.URL = fullURL
subscriptions = append(subscriptions, subscription)
}
}