From 84ae1d5dc0a30b6798478e20657e3c98c3463149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Sun, 17 Aug 2025 18:27:57 -0700 Subject: [PATCH] fix(storage): index only the first 500K characters of the article contents to avoid tsvector limits The length of a tsvector (lexemes + positions) must be less than 1 megabyte. We don't need to index the entire content, and we need to keep a buffer for the positions. --- internal/storage/entry.go | 28 +++++++++++++++++----------- internal/storage/entry_test.go | 17 +++++++++-------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/internal/storage/entry.go b/internal/storage/entry.go index ece466f2..10df55a3 100644 --- a/internal/storage/entry.go +++ b/internal/storage/entry.go @@ -69,6 +69,7 @@ func (s *Storage) NewEntryQueryBuilder(userID int64) *EntryQueryBuilder { // UpdateEntryTitleAndContent updates entry title and content. func (s *Storage) UpdateEntryTitleAndContent(entry *model.Entry) error { + truncatedTitle, truncatedContent := truncateTitleAndContentForTSVectorField(entry.Title, entry.Content) query := ` UPDATE entries @@ -86,8 +87,8 @@ func (s *Storage) UpdateEntryTitleAndContent(entry *model.Entry) error { entry.Title, entry.Content, entry.ReadingTime, - truncateStringForTSVectorField(entry.Title), - truncateStringForTSVectorField(entry.Content), + truncatedTitle, + truncatedContent, entry.ID, entry.UserID); err != nil { return fmt.Errorf(`store: unable to update entry #%d: %v`, entry.ID, err) @@ -98,6 +99,7 @@ func (s *Storage) UpdateEntryTitleAndContent(entry *model.Entry) error { // createEntry add a new entry. 
func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error { + truncatedTitle, truncatedContent := truncateTitleAndContentForTSVectorField(entry.Title, entry.Content) query := ` INSERT INTO entries ( @@ -146,8 +148,8 @@ func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error { entry.UserID, entry.FeedID, entry.ReadingTime, - truncateStringForTSVectorField(entry.Title), - truncateStringForTSVectorField(entry.Content), + truncatedTitle, + truncatedContent, pq.Array(entry.Tags), ).Scan( &entry.ID, @@ -175,6 +177,7 @@ func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error { // Note: we do not update the published date because some feeds do not contains any date, // it default to time.Now() which could change the order of items on the history page. func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error { + truncatedTitle, truncatedContent := truncateTitleAndContentForTSVectorField(entry.Title, entry.Content) query := ` UPDATE entries @@ -200,8 +203,8 @@ func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error { entry.Content, entry.Author, entry.ReadingTime, - truncateStringForTSVectorField(entry.Title), - truncateStringForTSVectorField(entry.Content), + truncatedTitle, + truncatedContent, entry.UserID, entry.FeedID, entry.Hash, @@ -702,17 +705,20 @@ func (s *Storage) UnshareEntry(userID int64, entryID int64) (err error) { return } -// truncateStringForTSVectorField truncates a string to fit within the maximum size for a TSVector field in PostgreSQL. -func truncateStringForTSVectorField(s string) string { +func truncateTitleAndContentForTSVectorField(title, content string) (string, string) { // The length of a tsvector (lexemes + positions) must be less than 1 megabyte. - const maxTSVectorSize = 1024 * 1024 + // We don't need to index the entire content, and we need to keep a buffer for the positions. 
+ return truncateStringForTSVectorField(title, 200000), truncateStringForTSVectorField(content, 500000) +} - if len(s) < maxTSVectorSize { +// truncateStringForTSVectorField truncates a string to fewer than maxSize bytes without breaking UTF-8 characters. +func truncateStringForTSVectorField(s string, maxSize int) string { + if len(s) < maxSize { return s } // Truncate to fit under the limit, ensuring we don't break UTF-8 characters - truncated := s[:maxTSVectorSize-1] + truncated := s[:maxSize-1] // Walk backwards to find the last complete UTF-8 character for i := len(truncated) - 1; i >= 0; i-- { diff --git a/internal/storage/entry_test.go b/internal/storage/entry_test.go index 6ee2ef9c..7e982ccf 100644 --- a/internal/storage/entry_test.go +++ b/internal/storage/entry_test.go @@ -9,20 +9,21 @@ import ( ) func TestTruncateStringForTSVectorField(t *testing.T) { + const megabyte = 1024 * 1024 + // Test case 1: Short Chinese text should not be truncated shortText := "这是一个简短的中文测试文本" - result := truncateStringForTSVectorField(shortText) + result := truncateStringForTSVectorField(shortText, megabyte) if result != shortText { t.Errorf("Short text should not be truncated, got %s", result) } // Test case 2: Long Chinese text should be truncated to stay under 1MB // Generate a long Chinese string that would exceed 1MB - const megabyte = 1024 * 1024 chineseChar := "汉" longText := strings.Repeat(chineseChar, megabyte/len(chineseChar)+1000) // Ensure it exceeds 1MB - result = truncateStringForTSVectorField(longText) + result = truncateStringForTSVectorField(longText, megabyte) // Verify the result is under 1MB if len(result) >= megabyte { @@ -36,14 +37,14 @@ func TestTruncateStringForTSVectorField(t *testing.T) { // Test case 3: Text exactly at limit should not be truncated limitText := strings.Repeat("a", megabyte-1) - result = truncateStringForTSVectorField(limitText) + result = truncateStringForTSVectorField(limitText, megabyte) if result != limitText { t.Error("Text under limit should not be 
truncated") } // Test case 4: Mixed Chinese and ASCII text mixedText := strings.Repeat("测试Test汉字", megabyte/20) // Create large mixed text - result = truncateStringForTSVectorField(mixedText) + result = truncateStringForTSVectorField(mixedText, megabyte) if len(result) >= megabyte { t.Errorf("Mixed text should be truncated under 1MB, got %d bytes", len(result)) @@ -56,7 +57,7 @@ func TestTruncateStringForTSVectorField(t *testing.T) { // Test case 5: Large text ending with ASCII characters asciiSuffix := strings.Repeat("a", megabyte-100) + strings.Repeat("测试", 50) + "abcdef" - result = truncateStringForTSVectorField(asciiSuffix) + result = truncateStringForTSVectorField(asciiSuffix, megabyte) if len(result) >= megabyte { t.Errorf("ASCII suffix text should be truncated under 1MB, got %d bytes", len(result)) @@ -69,7 +70,7 @@ func TestTruncateStringForTSVectorField(t *testing.T) { // Test case 6: Large ASCII text to cover ASCII branch in UTF-8 detection largeAscii := strings.Repeat("abcdefghijklmnopqrstuvwxyz", megabyte/26+1000) - result = truncateStringForTSVectorField(largeAscii) + result = truncateStringForTSVectorField(largeAscii, megabyte) if len(result) >= megabyte { t.Errorf("Large ASCII text should be truncated under 1MB, got %d bytes", len(result)) @@ -87,7 +88,7 @@ func TestTruncateStringForTSVectorField(t *testing.T) { for i := range invalidBytes { invalidBytes[i] = 0x80 // Continuation byte without start byte } - result = truncateStringForTSVectorField(string(invalidBytes)) + result = truncateStringForTSVectorField(string(invalidBytes), megabyte) // Should return empty string as fallback if result != "" {