1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-11 17:51:01 +00:00

feat: add POLLING_LIMIT_PER_HOST to limit concurrent requests per host

Each batch of feeds sent to the worker pool is now guaranteed to contain unique feed URLs.

When `POLLING_LIMIT_PER_HOST` is set, an additional limit is applied to the number of concurrent requests per hostname, helping to prevent overloading a single server.

Note: Additional requests may still be made during feed refresh. For example, to fetch feed icons or when the web scraper is enabled for a particular feed.
This commit is contained in:
Frédéric Guillot 2025-08-08 12:19:01 -07:00
parent a4f672b589
commit 34499b887b
13 changed files with 146 additions and 34 deletions

View file

@ -143,6 +143,7 @@ func (h *handler) refreshCategory(w http.ResponseWriter, r *http.Request) {
batchBuilder.WithUserID(userID)
batchBuilder.WithCategoryID(categoryID)
batchBuilder.WithNextCheckExpired()
batchBuilder.WithLimitPerHost(config.Opts.PollingLimitPerHost())
jobs, err := batchBuilder.FetchJobs()
if err != nil {

View file

@ -76,6 +76,7 @@ func (h *handler) refreshAllFeeds(w http.ResponseWriter, r *http.Request) {
batchBuilder.WithoutDisabledFeeds()
batchBuilder.WithNextCheckExpired()
batchBuilder.WithUserID(userID)
batchBuilder.WithLimitPerHost(config.Opts.PollingLimitPerHost())
jobs, err := batchBuilder.FetchJobs()
if err != nil {

View file

@ -25,6 +25,7 @@ func refreshFeeds(store *storage.Storage) {
batchBuilder.WithErrorLimit(config.Opts.PollingParsingErrorLimit())
batchBuilder.WithoutDisabledFeeds()
batchBuilder.WithNextCheckExpired()
batchBuilder.WithLimitPerHost(config.Opts.PollingLimitPerHost())
jobs, err := batchBuilder.FetchJobs()
if err != nil {
@ -39,6 +40,8 @@ func refreshFeeds(store *storage.Storage) {
slog.Int("batch_size", config.Opts.BatchSize()),
)
slog.Debug("Feed URLs in this batch", slog.Any("feed_urls", jobs.FeedURLs()))
var jobQueue = make(chan model.Job, nbJobs)
slog.Info("Starting a pool of workers",

View file

@ -21,6 +21,7 @@ func runScheduler(store *storage.Storage, pool *worker.Pool) {
config.Opts.PollingFrequency(),
config.Opts.BatchSize(),
config.Opts.PollingParsingErrorLimit(),
config.Opts.PollingLimitPerHost(),
)
go cleanupScheduler(
@ -29,7 +30,7 @@ func runScheduler(store *storage.Storage, pool *worker.Pool) {
)
}
func feedScheduler(store *storage.Storage, pool *worker.Pool, frequency, batchSize, errorLimit int) {
func feedScheduler(store *storage.Storage, pool *worker.Pool, frequency, batchSize, errorLimit, limitPerHost int) {
for range time.Tick(time.Duration(frequency) * time.Minute) {
// Generate a batch of feeds for any user that has feeds to refresh.
batchBuilder := store.NewBatchBuilder()
@ -37,6 +38,7 @@ func feedScheduler(store *storage.Storage, pool *worker.Pool, frequency, batchSi
batchBuilder.WithErrorLimit(errorLimit)
batchBuilder.WithoutDisabledFeeds()
batchBuilder.WithNextCheckExpired()
batchBuilder.WithLimitPerHost(limitPerHost)
if jobs, err := batchBuilder.FetchJobs(); err != nil {
slog.Error("Unable to fetch jobs from database", slog.Any("error", err))
@ -44,6 +46,7 @@ func feedScheduler(store *storage.Storage, pool *worker.Pool, frequency, batchSi
slog.Info("Created a batch of feeds",
slog.Int("nb_jobs", len(jobs)),
)
slog.Debug("Feed URLs in this batch", slog.Any("feed_urls", jobs.FeedURLs()))
pool.Push(jobs)
}
}

View file

@ -2104,3 +2104,36 @@ func TestInvalidHTTPClientProxy(t *testing.T) {
t.Fatalf(`Expected error for invalid HTTP_CLIENT_PROXY value, but got none`)
}
}
func TestDefaultPollingLimitPerHost(t *testing.T) {
	os.Clearenv()

	// With no environment variables set, the per-host polling limit must default to 0 (disabled).
	opts, err := NewParser().ParseEnvironmentVariables()
	if err != nil {
		t.Fatalf(`Parsing failure: %v`, err)
	}

	if got := opts.PollingLimitPerHost(); got != 0 {
		t.Fatalf(`Unexpected default PollingLimitPerHost value, got %v instead of %v`, got, 0)
	}
}
func TestCustomPollingLimitPerHost(t *testing.T) {
	os.Clearenv()
	os.Setenv("POLLING_LIMIT_PER_HOST", "10")

	// The parser must pick up the custom POLLING_LIMIT_PER_HOST value.
	opts, err := NewParser().ParseEnvironmentVariables()
	if err != nil {
		t.Fatalf(`Parsing failure: %v`, err)
	}

	if got := opts.PollingLimitPerHost(); got != 10 {
		t.Fatalf(`Unexpected custom PollingLimitPerHost value, got %v instead of %v`, got, 10)
	}
}

View file

@ -130,16 +130,17 @@ type options struct {
cleanupArchiveUnreadDays int
cleanupArchiveBatchSize int
cleanupRemoveSessionsDays int
pollingFrequency int
forceRefreshInterval int
batchSize int
pollingScheduler string
schedulerEntryFrequencyMinInterval int
schedulerEntryFrequencyMaxInterval int
schedulerEntryFrequencyFactor int
schedulerRoundRobinMinInterval int
schedulerRoundRobinMaxInterval int
pollingFrequency int
pollingLimitPerHost int
pollingParsingErrorLimit int
pollingScheduler string
workerPoolSize int
createAdmin bool
adminUsername string
@ -390,11 +391,6 @@ func (o *options) WorkerPoolSize() int {
return o.workerPoolSize
}
// PollingFrequency returns the interval, in minutes, used to refresh feeds in the background.
func (o *options) PollingFrequency() int {
	return o.pollingFrequency
}
// ForceRefreshInterval returns the force refresh interval
func (o *options) ForceRefreshInterval() int {
return o.forceRefreshInterval
@ -405,6 +401,22 @@ func (o *options) BatchSize() int {
return o.batchSize
}
// PollingFrequency returns the interval, in minutes, used to refresh feeds in the background.
func (o *options) PollingFrequency() int {
	return o.pollingFrequency
}
// PollingLimitPerHost returns the maximum number of concurrent requests per feed hostname
// within a refresh batch (POLLING_LIMIT_PER_HOST).
// Set to zero to disable; zero is the default.
func (o *options) PollingLimitPerHost() int {
	return o.pollingLimitPerHost
}
// PollingParsingErrorLimit returns the limit of parsing errors after which polling
// of a feed stops. Zero means unlimited; the default is 3.
func (o *options) PollingParsingErrorLimit() int {
	return o.pollingParsingErrorLimit
}
// PollingScheduler returns the scheduler used for polling feeds.
func (o *options) PollingScheduler() string {
return o.pollingScheduler
@ -433,11 +445,6 @@ func (o *options) SchedulerRoundRobinMaxInterval() int {
return o.schedulerRoundRobinMaxInterval
}
// PollingParsingErrorLimit returns the limit of parsing errors after which polling
// of a feed stops. Zero means unlimited; the default is 3.
func (o *options) PollingParsingErrorLimit() int {
	return o.pollingParsingErrorLimit
}
// IsOAuth2UserCreationAllowed returns true if user creation is allowed for OAuth2 users.
func (o *options) IsOAuth2UserCreationAllowed() bool {
return o.oauth2UserCreationAllowed
@ -762,8 +769,9 @@ func (o *options) SortedOptions(redactSecret bool) []*option {
"OAUTH2_REDIRECT_URL": o.oauth2RedirectURL,
"OAUTH2_USER_CREATION": o.oauth2UserCreationAllowed,
"DISABLE_LOCAL_AUTH": o.disableLocalAuth,
"POLLING_FREQUENCY": o.pollingFrequency,
"FORCE_REFRESH_INTERVAL": o.forceRefreshInterval,
"POLLING_FREQUENCY": o.pollingFrequency,
"POLLING_LIMIT_PER_HOST": o.pollingLimitPerHost,
"POLLING_PARSING_ERROR_LIMIT": o.pollingParsingErrorLimit,
"POLLING_SCHEDULER": o.pollingScheduler,
"MEDIA_PROXY_HTTP_CLIENT_TIMEOUT": o.mediaProxyHTTPClientTimeout,

View file

@ -137,12 +137,16 @@ func (p *parser) parseLines(lines []string) (err error) {
p.opts.cleanupRemoveSessionsDays = parseInt(value, defaultCleanupRemoveSessionsDays)
case "WORKER_POOL_SIZE":
p.opts.workerPoolSize = parseInt(value, defaultWorkerPoolSize)
case "POLLING_FREQUENCY":
p.opts.pollingFrequency = parseInt(value, defaultPollingFrequency)
case "FORCE_REFRESH_INTERVAL":
p.opts.forceRefreshInterval = parseInt(value, defaultForceRefreshInterval)
case "BATCH_SIZE":
p.opts.batchSize = parseInt(value, defaultBatchSize)
case "POLLING_FREQUENCY":
p.opts.pollingFrequency = parseInt(value, defaultPollingFrequency)
case "POLLING_LIMIT_PER_HOST":
p.opts.pollingLimitPerHost = parseInt(value, 0)
case "POLLING_PARSING_ERROR_LIMIT":
p.opts.pollingParsingErrorLimit = parseInt(value, defaultPollingParsingErrorLimit)
case "POLLING_SCHEDULER":
p.opts.pollingScheduler = strings.ToLower(parseString(value, defaultPollingScheduler))
case "SCHEDULER_ENTRY_FREQUENCY_MAX_INTERVAL":
@ -155,8 +159,6 @@ func (p *parser) parseLines(lines []string) (err error) {
p.opts.schedulerRoundRobinMinInterval = parseInt(value, defaultSchedulerRoundRobinMinInterval)
case "SCHEDULER_ROUND_ROBIN_MAX_INTERVAL":
p.opts.schedulerRoundRobinMaxInterval = parseInt(value, defaultSchedulerRoundRobinMaxInterval)
case "POLLING_PARSING_ERROR_LIMIT":
p.opts.pollingParsingErrorLimit = parseInt(value, defaultPollingParsingErrorLimit)
case "MEDIA_PROXY_HTTP_CLIENT_TIMEOUT":
p.opts.mediaProxyHTTPClientTimeout = parseInt(value, defaultMediaProxyHTTPClientTimeout)
case "MEDIA_PROXY_MODE":

View file

@ -7,7 +7,18 @@ package model // import "miniflux.app/v2/internal/model"
// Job represents one unit of work for the worker pool: a single feed to refresh.
type Job struct {
	UserID  int64  // owner of the feed
	FeedID  int64  // feed to refresh
	FeedURL string // feed URL, used for logging and per-host batch limiting
}

// JobList represents a list of jobs.
type JobList []Job

// FeedURLs returns a list of feed URLs from the job list.
// This is useful for logging or debugging purposes to see which feeds are being processed.
//
// Note: a value receiver is idiomatic for slice types (the slice header is cheap to
// copy and the method never mutates it); it also keeps the method callable on
// non-addressable values while remaining in *JobList's method set.
func (jl JobList) FeedURLs() []string {
	feedURLs := make([]string, len(jl))
	for i, job := range jl {
		feedURLs[i] = job.FeedURL
	}
	return feedURLs
}

View file

@ -6,9 +6,11 @@ package storage // import "miniflux.app/v2/internal/storage"
import (
"database/sql"
"fmt"
"log/slog"
"strings"
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/urllib"
)
type BatchBuilder struct {
@ -16,6 +18,7 @@ type BatchBuilder struct {
args []any
conditions []string
limit int
limitPerHost int
}
func (s *Storage) NewBatchBuilder() *BatchBuilder {
@ -59,15 +62,27 @@ func (b *BatchBuilder) WithoutDisabledFeeds() *BatchBuilder {
return b
}
// WithLimitPerHost caps the number of jobs per feed hostname in the batch.
// A limit of zero or less is ignored and leaves the builder unchanged.
func (b *BatchBuilder) WithLimitPerHost(limit int) *BatchBuilder {
	if limit <= 0 {
		return b
	}
	b.limitPerHost = limit
	return b
}
// FetchJobs retrieves a batch of jobs based on the conditions set in the builder.
// It ensures that each job is unique by feed URL to avoid making too many concurrent requests to the same website.
// When limitPerHost is set, it limits the number of jobs per feed hostname to prevent overwhelming a single host.
func (b *BatchBuilder) FetchJobs() (model.JobList, error) {
query := `SELECT id, user_id FROM feeds`
query := `SELECT DISTINCT ON (feed_url) id, user_id, feed_url FROM feeds`
if len(b.conditions) > 0 {
query += " WHERE " + strings.Join(b.conditions, " AND ")
}
query += " ORDER BY feed_url, next_check_at ASC"
if b.limit > 0 {
query += fmt.Sprintf(" ORDER BY next_check_at ASC LIMIT %d", b.limit)
query += fmt.Sprintf(" LIMIT %d", b.limit)
}
rows, err := b.db.Query(query, b.args...)
@ -77,15 +92,34 @@ func (b *BatchBuilder) FetchJobs() (model.JobList, error) {
defer rows.Close()
jobs := make(model.JobList, 0, b.limit)
hosts := make(map[string]int)
for rows.Next() {
var job model.Job
if err := rows.Scan(&job.FeedID, &job.UserID); err != nil {
return nil, fmt.Errorf(`store: unable to fetch job: %v`, err)
if err := rows.Scan(&job.FeedID, &job.UserID, &job.FeedURL); err != nil {
return nil, fmt.Errorf(`store: unable to fetch job record: %v`, err)
}
if b.limitPerHost > 0 {
feedHostname := urllib.Domain(job.FeedURL)
if hosts[feedHostname] >= b.limitPerHost {
slog.Debug("Feed host limit reached for this batch",
slog.String("feed_url", job.FeedURL),
slog.String("feed_hostname", feedHostname),
slog.Int("limit_per_host", b.limitPerHost),
slog.Int("current", hosts[feedHostname]),
)
continue
}
hosts[feedHostname]++
}
jobs = append(jobs, job)
}
if err := rows.Err(); err != nil {
return nil, fmt.Errorf(`store: error iterating on job records: %v`, err)
}
return jobs, nil
}

View file

@ -43,6 +43,7 @@ func (h *handler) refreshCategory(w http.ResponseWriter, r *http.Request) int64
batchBuilder.WithoutDisabledFeeds()
batchBuilder.WithUserID(userID)
batchBuilder.WithCategoryID(categoryID)
batchBuilder.WithLimitPerHost(config.Opts.PollingLimitPerHost())
jobs, err := batchBuilder.FetchJobs()
if err != nil {

View file

@ -47,6 +47,7 @@ func (h *handler) refreshAllFeeds(w http.ResponseWriter, r *http.Request) {
batchBuilder := h.store.NewBatchBuilder()
batchBuilder.WithoutDisabledFeeds()
batchBuilder.WithUserID(userID)
batchBuilder.WithLimitPerHost(config.Opts.PollingLimitPerHost())
jobs, err := batchBuilder.FetchJobs()
if err != nil {

View file

@ -32,6 +32,7 @@ func (w *worker) Run(c <-chan model.Job) {
slog.Int("worker_id", w.id),
slog.Int64("user_id", job.UserID),
slog.Int64("feed_id", job.FeedID),
slog.String("feed_url", job.FeedURL),
)
startTime := time.Now()

View file

@ -1,5 +1,5 @@
.\" Manpage for miniflux.
.TH "MINIFLUX" "1" "June 23, 2025" "\ \&" "\ \&"
.TH "MINIFLUX" "1" "August 8, 2025" "\ \&" "\ \&"
.SH NAME
miniflux \- Minimalist and opinionated feed reader
@ -490,19 +490,32 @@ Refresh interval in minutes for feeds\&.
.br
Default is 60 minutes\&.
.TP
.B POLLING_LIMIT_PER_HOST
Limits the number of concurrent requests to the same hostname when polling feeds\&.
.br
This helps prevent overwhelming a single server during batch processing by the worker pool\&.
.br
Default is 0 (disabled)\&.
.TP
.B POLLING_PARSING_ERROR_LIMIT
The maximum number of parsing errors that the program will try before stopping polling a feed. Once the limit is reached, the user must refresh the feed manually. Set to 0 for unlimited.
The maximum number of parsing errors that the program will try before stopping polling a feed.
.br
Once the limit is reached, the user must refresh the feed manually. Set to 0 for unlimited.
.br
Default is 3\&.
.TP
.B POLLING_SCHEDULER
Scheduler used for polling feeds. Possible values are "round_robin" or "entry_frequency"\&.
Determines the strategy used to schedule feed polling.
.br
The maximum number of feeds polled for a given period is subject to POLLING_FREQUENCY and BATCH_SIZE\&.
Supported values are "round_robin" and "entry_frequency".
.br
When "entry_frequency" is selected, the refresh interval for a given feed is equal to the average updating interval of the last week of the feed\&.
- "round_robin": Feeds are polled in a fixed, rotating order.
.br
The actual number of feeds polled will not exceed the maximum number of feeds that could be polled for a given period\&.
- "entry_frequency": The polling interval for each feed is based on the average update frequency over the past week.
.br
The number of feeds polled in a given period is limited by the POLLING_FREQUENCY and BATCH_SIZE settings.
.br
Regardless of the scheduler used, the total number of polled feeds will not exceed the maximum allowed per polling cycle.
.br
Default is "round_robin"\&.
.TP