1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-11 17:51:01 +00:00

feat: add POLLING_LIMIT_PER_HOST to limit concurrent requests per host

Each batch of feeds sent to the worker pool is now guaranteed to contain unique feed URLs.

When `POLLING_LIMIT_PER_HOST` is set, an additional limit is applied to the number of concurrent requests per hostname, helping to prevent overloading a single server.

Note: Additional requests may still be made during feed refresh. For example, to fetch feed icons or when the web scraper is enabled for a particular feed.
This commit is contained in:
Frédéric Guillot 2025-08-08 12:19:01 -07:00
parent a4f672b589
commit 34499b887b
13 changed files with 146 additions and 34 deletions

View file

@ -143,6 +143,7 @@ func (h *handler) refreshCategory(w http.ResponseWriter, r *http.Request) {
batchBuilder.WithUserID(userID) batchBuilder.WithUserID(userID)
batchBuilder.WithCategoryID(categoryID) batchBuilder.WithCategoryID(categoryID)
batchBuilder.WithNextCheckExpired() batchBuilder.WithNextCheckExpired()
batchBuilder.WithLimitPerHost(config.Opts.PollingLimitPerHost())
jobs, err := batchBuilder.FetchJobs() jobs, err := batchBuilder.FetchJobs()
if err != nil { if err != nil {

View file

@ -76,6 +76,7 @@ func (h *handler) refreshAllFeeds(w http.ResponseWriter, r *http.Request) {
batchBuilder.WithoutDisabledFeeds() batchBuilder.WithoutDisabledFeeds()
batchBuilder.WithNextCheckExpired() batchBuilder.WithNextCheckExpired()
batchBuilder.WithUserID(userID) batchBuilder.WithUserID(userID)
batchBuilder.WithLimitPerHost(config.Opts.PollingLimitPerHost())
jobs, err := batchBuilder.FetchJobs() jobs, err := batchBuilder.FetchJobs()
if err != nil { if err != nil {

View file

@ -25,6 +25,7 @@ func refreshFeeds(store *storage.Storage) {
batchBuilder.WithErrorLimit(config.Opts.PollingParsingErrorLimit()) batchBuilder.WithErrorLimit(config.Opts.PollingParsingErrorLimit())
batchBuilder.WithoutDisabledFeeds() batchBuilder.WithoutDisabledFeeds()
batchBuilder.WithNextCheckExpired() batchBuilder.WithNextCheckExpired()
batchBuilder.WithLimitPerHost(config.Opts.PollingLimitPerHost())
jobs, err := batchBuilder.FetchJobs() jobs, err := batchBuilder.FetchJobs()
if err != nil { if err != nil {
@ -39,6 +40,8 @@ func refreshFeeds(store *storage.Storage) {
slog.Int("batch_size", config.Opts.BatchSize()), slog.Int("batch_size", config.Opts.BatchSize()),
) )
slog.Debug("Feed URLs in this batch", slog.Any("feed_urls", jobs.FeedURLs()))
var jobQueue = make(chan model.Job, nbJobs) var jobQueue = make(chan model.Job, nbJobs)
slog.Info("Starting a pool of workers", slog.Info("Starting a pool of workers",

View file

@ -21,6 +21,7 @@ func runScheduler(store *storage.Storage, pool *worker.Pool) {
config.Opts.PollingFrequency(), config.Opts.PollingFrequency(),
config.Opts.BatchSize(), config.Opts.BatchSize(),
config.Opts.PollingParsingErrorLimit(), config.Opts.PollingParsingErrorLimit(),
config.Opts.PollingLimitPerHost(),
) )
go cleanupScheduler( go cleanupScheduler(
@ -29,7 +30,7 @@ func runScheduler(store *storage.Storage, pool *worker.Pool) {
) )
} }
func feedScheduler(store *storage.Storage, pool *worker.Pool, frequency, batchSize, errorLimit int) { func feedScheduler(store *storage.Storage, pool *worker.Pool, frequency, batchSize, errorLimit, limitPerHost int) {
for range time.Tick(time.Duration(frequency) * time.Minute) { for range time.Tick(time.Duration(frequency) * time.Minute) {
// Generate a batch of feeds for any user that has feeds to refresh. // Generate a batch of feeds for any user that has feeds to refresh.
batchBuilder := store.NewBatchBuilder() batchBuilder := store.NewBatchBuilder()
@ -37,6 +38,7 @@ func feedScheduler(store *storage.Storage, pool *worker.Pool, frequency, batchSi
batchBuilder.WithErrorLimit(errorLimit) batchBuilder.WithErrorLimit(errorLimit)
batchBuilder.WithoutDisabledFeeds() batchBuilder.WithoutDisabledFeeds()
batchBuilder.WithNextCheckExpired() batchBuilder.WithNextCheckExpired()
batchBuilder.WithLimitPerHost(limitPerHost)
if jobs, err := batchBuilder.FetchJobs(); err != nil { if jobs, err := batchBuilder.FetchJobs(); err != nil {
slog.Error("Unable to fetch jobs from database", slog.Any("error", err)) slog.Error("Unable to fetch jobs from database", slog.Any("error", err))
@ -44,6 +46,7 @@ func feedScheduler(store *storage.Storage, pool *worker.Pool, frequency, batchSi
slog.Info("Created a batch of feeds", slog.Info("Created a batch of feeds",
slog.Int("nb_jobs", len(jobs)), slog.Int("nb_jobs", len(jobs)),
) )
slog.Debug("Feed URLs in this batch", slog.Any("feed_urls", jobs.FeedURLs()))
pool.Push(jobs) pool.Push(jobs)
} }
} }

View file

@ -2104,3 +2104,36 @@ func TestInvalidHTTPClientProxy(t *testing.T) {
t.Fatalf(`Expected error for invalid HTTP_CLIENT_PROXY value, but got none`) t.Fatalf(`Expected error for invalid HTTP_CLIENT_PROXY value, but got none`)
} }
} }
// TestDefaultPollingLimitPerHost verifies that PollingLimitPerHost defaults to
// zero (per-host limiting disabled) when POLLING_LIMIT_PER_HOST is not set.
func TestDefaultPollingLimitPerHost(t *testing.T) {
	os.Clearenv()

	opts, err := NewParser().ParseEnvironmentVariables()
	if err != nil {
		t.Fatalf(`Parsing failure: %v`, err)
	}

	const want = 0
	if got := opts.PollingLimitPerHost(); got != want {
		t.Fatalf(`Unexpected default PollingLimitPerHost value, got %v instead of %v`, got, want)
	}
}
// TestCustomPollingLimitPerHost verifies that PollingLimitPerHost reflects the
// value supplied through the POLLING_LIMIT_PER_HOST environment variable.
func TestCustomPollingLimitPerHost(t *testing.T) {
	os.Clearenv()
	os.Setenv("POLLING_LIMIT_PER_HOST", "10")

	opts, err := NewParser().ParseEnvironmentVariables()
	if err != nil {
		t.Fatalf(`Parsing failure: %v`, err)
	}

	const want = 10
	if got := opts.PollingLimitPerHost(); got != want {
		t.Fatalf(`Unexpected custom PollingLimitPerHost value, got %v instead of %v`, got, want)
	}
}

View file

@ -130,16 +130,17 @@ type options struct {
cleanupArchiveUnreadDays int cleanupArchiveUnreadDays int
cleanupArchiveBatchSize int cleanupArchiveBatchSize int
cleanupRemoveSessionsDays int cleanupRemoveSessionsDays int
pollingFrequency int
forceRefreshInterval int forceRefreshInterval int
batchSize int batchSize int
pollingScheduler string
schedulerEntryFrequencyMinInterval int schedulerEntryFrequencyMinInterval int
schedulerEntryFrequencyMaxInterval int schedulerEntryFrequencyMaxInterval int
schedulerEntryFrequencyFactor int schedulerEntryFrequencyFactor int
schedulerRoundRobinMinInterval int schedulerRoundRobinMinInterval int
schedulerRoundRobinMaxInterval int schedulerRoundRobinMaxInterval int
pollingFrequency int
pollingLimitPerHost int
pollingParsingErrorLimit int pollingParsingErrorLimit int
pollingScheduler string
workerPoolSize int workerPoolSize int
createAdmin bool createAdmin bool
adminUsername string adminUsername string
@ -390,11 +391,6 @@ func (o *options) WorkerPoolSize() int {
return o.workerPoolSize return o.workerPoolSize
} }
// PollingFrequency returns the interval to refresh feeds in the background.
func (o *options) PollingFrequency() int {
return o.pollingFrequency
}
// ForceRefreshInterval returns the force refresh interval // ForceRefreshInterval returns the force refresh interval
func (o *options) ForceRefreshInterval() int { func (o *options) ForceRefreshInterval() int {
return o.forceRefreshInterval return o.forceRefreshInterval
@ -405,6 +401,22 @@ func (o *options) BatchSize() int {
return o.batchSize return o.batchSize
} }
// PollingFrequency returns the interval, in minutes, between background feed
// refresh batches (the feed scheduler ticks at this frequency).
func (o *options) PollingFrequency() int {
	return o.pollingFrequency
}
// PollingLimitPerHost returns the maximum number of feeds per hostname that a
// single refresh batch may contain (POLLING_LIMIT_PER_HOST), which bounds
// concurrent requests made to the same host by the worker pool.
// Set to zero (the default) to disable.
func (o *options) PollingLimitPerHost() int {
	return o.pollingLimitPerHost
}
// PollingParsingErrorLimit returns the number of parsing errors after which a
// feed is no longer polled automatically. Zero means unlimited.
func (o *options) PollingParsingErrorLimit() int {
	return o.pollingParsingErrorLimit
}
// PollingScheduler returns the scheduler used for polling feeds. // PollingScheduler returns the scheduler used for polling feeds.
func (o *options) PollingScheduler() string { func (o *options) PollingScheduler() string {
return o.pollingScheduler return o.pollingScheduler
@ -433,11 +445,6 @@ func (o *options) SchedulerRoundRobinMaxInterval() int {
return o.schedulerRoundRobinMaxInterval return o.schedulerRoundRobinMaxInterval
} }
// PollingParsingErrorLimit returns the limit of errors when to stop polling.
func (o *options) PollingParsingErrorLimit() int {
return o.pollingParsingErrorLimit
}
// IsOAuth2UserCreationAllowed returns true if user creation is allowed for OAuth2 users. // IsOAuth2UserCreationAllowed returns true if user creation is allowed for OAuth2 users.
func (o *options) IsOAuth2UserCreationAllowed() bool { func (o *options) IsOAuth2UserCreationAllowed() bool {
return o.oauth2UserCreationAllowed return o.oauth2UserCreationAllowed
@ -762,8 +769,9 @@ func (o *options) SortedOptions(redactSecret bool) []*option {
"OAUTH2_REDIRECT_URL": o.oauth2RedirectURL, "OAUTH2_REDIRECT_URL": o.oauth2RedirectURL,
"OAUTH2_USER_CREATION": o.oauth2UserCreationAllowed, "OAUTH2_USER_CREATION": o.oauth2UserCreationAllowed,
"DISABLE_LOCAL_AUTH": o.disableLocalAuth, "DISABLE_LOCAL_AUTH": o.disableLocalAuth,
"POLLING_FREQUENCY": o.pollingFrequency,
"FORCE_REFRESH_INTERVAL": o.forceRefreshInterval, "FORCE_REFRESH_INTERVAL": o.forceRefreshInterval,
"POLLING_FREQUENCY": o.pollingFrequency,
"POLLING_LIMIT_PER_HOST": o.pollingLimitPerHost,
"POLLING_PARSING_ERROR_LIMIT": o.pollingParsingErrorLimit, "POLLING_PARSING_ERROR_LIMIT": o.pollingParsingErrorLimit,
"POLLING_SCHEDULER": o.pollingScheduler, "POLLING_SCHEDULER": o.pollingScheduler,
"MEDIA_PROXY_HTTP_CLIENT_TIMEOUT": o.mediaProxyHTTPClientTimeout, "MEDIA_PROXY_HTTP_CLIENT_TIMEOUT": o.mediaProxyHTTPClientTimeout,

View file

@ -137,12 +137,16 @@ func (p *parser) parseLines(lines []string) (err error) {
p.opts.cleanupRemoveSessionsDays = parseInt(value, defaultCleanupRemoveSessionsDays) p.opts.cleanupRemoveSessionsDays = parseInt(value, defaultCleanupRemoveSessionsDays)
case "WORKER_POOL_SIZE": case "WORKER_POOL_SIZE":
p.opts.workerPoolSize = parseInt(value, defaultWorkerPoolSize) p.opts.workerPoolSize = parseInt(value, defaultWorkerPoolSize)
case "POLLING_FREQUENCY":
p.opts.pollingFrequency = parseInt(value, defaultPollingFrequency)
case "FORCE_REFRESH_INTERVAL": case "FORCE_REFRESH_INTERVAL":
p.opts.forceRefreshInterval = parseInt(value, defaultForceRefreshInterval) p.opts.forceRefreshInterval = parseInt(value, defaultForceRefreshInterval)
case "BATCH_SIZE": case "BATCH_SIZE":
p.opts.batchSize = parseInt(value, defaultBatchSize) p.opts.batchSize = parseInt(value, defaultBatchSize)
case "POLLING_FREQUENCY":
p.opts.pollingFrequency = parseInt(value, defaultPollingFrequency)
case "POLLING_LIMIT_PER_HOST":
p.opts.pollingLimitPerHost = parseInt(value, 0)
case "POLLING_PARSING_ERROR_LIMIT":
p.opts.pollingParsingErrorLimit = parseInt(value, defaultPollingParsingErrorLimit)
case "POLLING_SCHEDULER": case "POLLING_SCHEDULER":
p.opts.pollingScheduler = strings.ToLower(parseString(value, defaultPollingScheduler)) p.opts.pollingScheduler = strings.ToLower(parseString(value, defaultPollingScheduler))
case "SCHEDULER_ENTRY_FREQUENCY_MAX_INTERVAL": case "SCHEDULER_ENTRY_FREQUENCY_MAX_INTERVAL":
@ -155,8 +159,6 @@ func (p *parser) parseLines(lines []string) (err error) {
p.opts.schedulerRoundRobinMinInterval = parseInt(value, defaultSchedulerRoundRobinMinInterval) p.opts.schedulerRoundRobinMinInterval = parseInt(value, defaultSchedulerRoundRobinMinInterval)
case "SCHEDULER_ROUND_ROBIN_MAX_INTERVAL": case "SCHEDULER_ROUND_ROBIN_MAX_INTERVAL":
p.opts.schedulerRoundRobinMaxInterval = parseInt(value, defaultSchedulerRoundRobinMaxInterval) p.opts.schedulerRoundRobinMaxInterval = parseInt(value, defaultSchedulerRoundRobinMaxInterval)
case "POLLING_PARSING_ERROR_LIMIT":
p.opts.pollingParsingErrorLimit = parseInt(value, defaultPollingParsingErrorLimit)
case "MEDIA_PROXY_HTTP_CLIENT_TIMEOUT": case "MEDIA_PROXY_HTTP_CLIENT_TIMEOUT":
p.opts.mediaProxyHTTPClientTimeout = parseInt(value, defaultMediaProxyHTTPClientTimeout) p.opts.mediaProxyHTTPClientTimeout = parseInt(value, defaultMediaProxyHTTPClientTimeout)
case "MEDIA_PROXY_MODE": case "MEDIA_PROXY_MODE":

View file

@ -7,7 +7,18 @@ package model // import "miniflux.app/v2/internal/model"
type Job struct { type Job struct {
UserID int64 UserID int64
FeedID int64 FeedID int64
FeedURL string
} }
// JobList represents a list of jobs. // JobList represents a list of jobs.
type JobList []Job type JobList []Job
// FeedURLs returns the feed URL of every job in the list, in order.
// It is intended for logging and debugging, to show which feeds a batch contains.
func (jl *JobList) FeedURLs() []string {
	urls := make([]string, 0, len(*jl))
	for _, job := range *jl {
		urls = append(urls, job.FeedURL)
	}
	return urls
}

View file

@ -6,9 +6,11 @@ package storage // import "miniflux.app/v2/internal/storage"
import ( import (
"database/sql" "database/sql"
"fmt" "fmt"
"log/slog"
"strings" "strings"
"miniflux.app/v2/internal/model" "miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/urllib"
) )
type BatchBuilder struct { type BatchBuilder struct {
@ -16,6 +18,7 @@ type BatchBuilder struct {
args []any args []any
conditions []string conditions []string
limit int limit int
limitPerHost int
} }
func (s *Storage) NewBatchBuilder() *BatchBuilder { func (s *Storage) NewBatchBuilder() *BatchBuilder {
@ -59,15 +62,27 @@ func (b *BatchBuilder) WithoutDisabledFeeds() *BatchBuilder {
return b return b
} }
// WithLimitPerHost caps how many jobs per feed hostname a batch may contain.
// Non-positive values are ignored, leaving the per-host limit disabled.
// It returns the builder to allow call chaining.
func (b *BatchBuilder) WithLimitPerHost(limit int) *BatchBuilder {
	if limit <= 0 {
		return b
	}
	b.limitPerHost = limit
	return b
}
// FetchJobs retrieves a batch of jobs based on the conditions set in the builder.
// It ensures that each job is unique by feed URL to avoid making too many concurrent requests to the same website.
// When limitPerHost is set, it limits the number of jobs per feed hostname to prevent overwhelming a single host.
func (b *BatchBuilder) FetchJobs() (model.JobList, error) { func (b *BatchBuilder) FetchJobs() (model.JobList, error) {
query := `SELECT id, user_id FROM feeds` query := `SELECT DISTINCT ON (feed_url) id, user_id, feed_url FROM feeds`
if len(b.conditions) > 0 { if len(b.conditions) > 0 {
query += " WHERE " + strings.Join(b.conditions, " AND ") query += " WHERE " + strings.Join(b.conditions, " AND ")
} }
query += " ORDER BY feed_url, next_check_at ASC"
if b.limit > 0 { if b.limit > 0 {
query += fmt.Sprintf(" ORDER BY next_check_at ASC LIMIT %d", b.limit) query += fmt.Sprintf(" LIMIT %d", b.limit)
} }
rows, err := b.db.Query(query, b.args...) rows, err := b.db.Query(query, b.args...)
@ -77,15 +92,34 @@ func (b *BatchBuilder) FetchJobs() (model.JobList, error) {
defer rows.Close() defer rows.Close()
jobs := make(model.JobList, 0, b.limit) jobs := make(model.JobList, 0, b.limit)
hosts := make(map[string]int)
for rows.Next() { for rows.Next() {
var job model.Job var job model.Job
if err := rows.Scan(&job.FeedID, &job.UserID); err != nil { if err := rows.Scan(&job.FeedID, &job.UserID, &job.FeedURL); err != nil {
return nil, fmt.Errorf(`store: unable to fetch job: %v`, err) return nil, fmt.Errorf(`store: unable to fetch job record: %v`, err)
}
if b.limitPerHost > 0 {
feedHostname := urllib.Domain(job.FeedURL)
if hosts[feedHostname] >= b.limitPerHost {
slog.Debug("Feed host limit reached for this batch",
slog.String("feed_url", job.FeedURL),
slog.String("feed_hostname", feedHostname),
slog.Int("limit_per_host", b.limitPerHost),
slog.Int("current", hosts[feedHostname]),
)
continue
}
hosts[feedHostname]++
} }
jobs = append(jobs, job) jobs = append(jobs, job)
} }
if err := rows.Err(); err != nil {
return nil, fmt.Errorf(`store: error iterating on job records: %v`, err)
}
return jobs, nil return jobs, nil
} }

View file

@ -43,6 +43,7 @@ func (h *handler) refreshCategory(w http.ResponseWriter, r *http.Request) int64
batchBuilder.WithoutDisabledFeeds() batchBuilder.WithoutDisabledFeeds()
batchBuilder.WithUserID(userID) batchBuilder.WithUserID(userID)
batchBuilder.WithCategoryID(categoryID) batchBuilder.WithCategoryID(categoryID)
batchBuilder.WithLimitPerHost(config.Opts.PollingLimitPerHost())
jobs, err := batchBuilder.FetchJobs() jobs, err := batchBuilder.FetchJobs()
if err != nil { if err != nil {

View file

@ -47,6 +47,7 @@ func (h *handler) refreshAllFeeds(w http.ResponseWriter, r *http.Request) {
batchBuilder := h.store.NewBatchBuilder() batchBuilder := h.store.NewBatchBuilder()
batchBuilder.WithoutDisabledFeeds() batchBuilder.WithoutDisabledFeeds()
batchBuilder.WithUserID(userID) batchBuilder.WithUserID(userID)
batchBuilder.WithLimitPerHost(config.Opts.PollingLimitPerHost())
jobs, err := batchBuilder.FetchJobs() jobs, err := batchBuilder.FetchJobs()
if err != nil { if err != nil {

View file

@ -32,6 +32,7 @@ func (w *worker) Run(c <-chan model.Job) {
slog.Int("worker_id", w.id), slog.Int("worker_id", w.id),
slog.Int64("user_id", job.UserID), slog.Int64("user_id", job.UserID),
slog.Int64("feed_id", job.FeedID), slog.Int64("feed_id", job.FeedID),
slog.String("feed_url", job.FeedURL),
) )
startTime := time.Now() startTime := time.Now()

View file

@ -1,5 +1,5 @@
.\" Manpage for miniflux. .\" Manpage for miniflux.
.TH "MINIFLUX" "1" "June 23, 2025" "\ \&" "\ \&" .TH "MINIFLUX" "1" "August 8, 2025" "\ \&" "\ \&"
.SH NAME .SH NAME
miniflux \- Minimalist and opinionated feed reader miniflux \- Minimalist and opinionated feed reader
@ -490,19 +490,32 @@ Refresh interval in minutes for feeds\&.
.br .br
Default is 60 minutes\&. Default is 60 minutes\&.
.TP .TP
.B POLLING_LIMIT_PER_HOST
Limits the number of concurrent requests to the same hostname when polling feeds\&.
.br
This helps prevent overwhelming a single server during batch processing by the worker pool\&.
.br
Default is 0 (disabled)\&.
.TP
.B POLLING_PARSING_ERROR_LIMIT .B POLLING_PARSING_ERROR_LIMIT
The maximum number of parsing errors that the program will try before stopping polling a feed. Once the limit is reached, the user must refresh the feed manually. Set to 0 for unlimited. The maximum number of parsing errors that the program will try before stopping polling a feed.
.br
Once the limit is reached, the user must refresh the feed manually. Set to 0 for unlimited.
.br .br
Default is 3\&. Default is 3\&.
.TP .TP
.B POLLING_SCHEDULER .B POLLING_SCHEDULER
Scheduler used for polling feeds. Possible values are "round_robin" or "entry_frequency"\&. Determines the strategy used to schedule feed polling.
.br .br
The maximum number of feeds polled for a given period is subject to POLLING_FREQUENCY and BATCH_SIZE\&. Supported values are "round_robin" and "entry_frequency".
.br .br
When "entry_frequency" is selected, the refresh interval for a given feed is equal to the average updating interval of the last week of the feed\&. - "round_robin": Feeds are polled in a fixed, rotating order.
.br .br
The actual number of feeds polled will not exceed the maximum number of feeds that could be polled for a given period\&. - "entry_frequency": The polling interval for each feed is based on the average update frequency over the past week.
.br
The number of feeds polled in a given period is limited by the POLLING_FREQUENCY and BATCH_SIZE settings.
.br
Regardless of the scheduler used, the total number of polled feeds will not exceed the maximum allowed per polling cycle.
.br .br
Default is "round_robin"\&. Default is "round_robin"\&.
.TP .TP