feat: add POLLING_LIMIT_PER_HOST to limit concurrent requests per host

Each batch of feeds sent to the worker pool is now guaranteed to contain unique feed URLs. When `POLLING_LIMIT_PER_HOST` is set, an additional limit is applied to the number of concurrent requests per hostname, helping to prevent overloading a single server. Note: additional requests may still be made during a feed refresh, for example to fetch feed icons or when the web scraper is enabled for a particular feed.
2025-09-30 19:22:11 +00:00 · 2025-08-08 12:19:01 -07:00 · 2025-08-08 12:19:01 -07:00 · 34499b887b
commit 34499b887b
parent a4f672b589
13 changed files with 146 additions and 34 deletions
--- a/internal/storage/batch.go
+++ b/internal/storage/batch.go
@ -6,16 +6,19 @@ package storage // import "miniflux.app/v2/internal/storage"
 import (
 	"database/sql"
 	"fmt"
+	"log/slog"
 	"strings"

 	"miniflux.app/v2/internal/model"
+	"miniflux.app/v2/internal/urllib"
 )

 type BatchBuilder struct {
-	db         *sql.DB
-	args       []any
-	conditions []string
-	limit      int
+	db           *sql.DB
+	args         []any
+	conditions   []string
+	limit        int
+	limitPerHost int
 }

 func (s *Storage) NewBatchBuilder() *BatchBuilder {
@ -59,15 +62,27 @@ func (b *BatchBuilder) WithoutDisabledFeeds() *BatchBuilder {
 	return b
 }

+func (b *BatchBuilder) WithLimitPerHost(limit int) *BatchBuilder {
+	if limit > 0 {
+		b.limitPerHost = limit
+	}
+	return b
+}
+
+// FetchJobs retrieves a batch of jobs based on the conditions set in the builder.
+// It ensures that each job is unique by feed URL to avoid making too many concurrent requests to the same website.
+// When limitPerHost is set, it limits the number of jobs per feed hostname to prevent overwhelming a single host.
 func (b *BatchBuilder) FetchJobs() (model.JobList, error) {
-	query := `SELECT id, user_id FROM feeds`
+	query := `SELECT DISTINCT ON (feed_url) id, user_id, feed_url FROM feeds`

 	if len(b.conditions) > 0 {
 		query += " WHERE " + strings.Join(b.conditions, " AND ")
 	}

+	query += " ORDER BY feed_url, next_check_at ASC"
+
 	if b.limit > 0 {
-		query += fmt.Sprintf(" ORDER BY next_check_at ASC LIMIT %d", b.limit)
+		query += fmt.Sprintf(" LIMIT %d", b.limit)
 	}

 	rows, err := b.db.Query(query, b.args...)
@ -77,15 +92,34 @@ func (b *BatchBuilder) FetchJobs() (model.JobList, error) {
 	defer rows.Close()

 	jobs := make(model.JobList, 0, b.limit)
+	hosts := make(map[string]int)

 	for rows.Next() {
 		var job model.Job
-		if err := rows.Scan(&job.FeedID, &job.UserID); err != nil {
-			return nil, fmt.Errorf(`store: unable to fetch job: %v`, err)
+		if err := rows.Scan(&job.FeedID, &job.UserID, &job.FeedURL); err != nil {
+			return nil, fmt.Errorf(`store: unable to fetch job record: %v`, err)
+		}
+
+		if b.limitPerHost > 0 {
+			feedHostname := urllib.Domain(job.FeedURL)
+			if hosts[feedHostname] >= b.limitPerHost {
+				slog.Debug("Feed host limit reached for this batch",
+					slog.String("feed_url", job.FeedURL),
+					slog.String("feed_hostname", feedHostname),
+					slog.Int("limit_per_host", b.limitPerHost),
+					slog.Int("current", hosts[feedHostname]),
+				)
+				continue
+			}
+			hosts[feedHostname]++
 		}

 		jobs = append(jobs, job)
 	}

+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf(`store: error iterating on job records: %v`, err)
+	}
+
 	return jobs, nil
 }