fix(storage): index only the first 500K characters of the article contents to avoid tsvector limits

PostgreSQL requires the length of a tsvector (lexemes + positions) to be less than 1 megabyte. We don't need to index the entire content, so we index only a prefix of it, which leaves headroom within that limit for the position data.
2025-10-15 19:42:07 +00:00 · 2025-08-17 18:27:57 -07:00 · 2025-08-17 18:27:57 -07:00 · 84ae1d5dc0
commit 84ae1d5dc0
parent 5403ca09f6
2 changed files with 26 additions and 19 deletions
--- a/internal/storage/entry_test.go
+++ b/internal/storage/entry_test.go
@ -9,20 +9,21 @@ import (
 )

 func TestTruncateStringForTSVectorField(t *testing.T) {
+	const megabyte = 1024 * 1024
+
 	// Test case 1: Short Chinese text should not be truncated
 	shortText := "这是一个简短的中文测试文本"
-	result := truncateStringForTSVectorField(shortText)
+	result := truncateStringForTSVectorField(shortText, megabyte)
 	if result != shortText {
 		t.Errorf("Short text should not be truncated, got %s", result)
 	}

 	// Test case 2: Long Chinese text should be truncated to stay under 1MB
 	// Generate a long Chinese string that would exceed 1MB
-	const megabyte = 1024 * 1024
 	chineseChar := "汉"
 	longText := strings.Repeat(chineseChar, megabyte/len(chineseChar)+1000) // Ensure it exceeds 1MB

-	result = truncateStringForTSVectorField(longText)
+	result = truncateStringForTSVectorField(longText, megabyte)

 	// Verify the result is under 1MB
 	if len(result) >= megabyte {
@ -36,14 +37,14 @@ func TestTruncateStringForTSVectorField(t *testing.T) {

 	// Test case 3: Text exactly at limit should not be truncated
 	limitText := strings.Repeat("a", megabyte-1)
-	result = truncateStringForTSVectorField(limitText)
+	result = truncateStringForTSVectorField(limitText, megabyte)
 	if result != limitText {
 		t.Error("Text under limit should not be truncated")
 	}

 	// Test case 4: Mixed Chinese and ASCII text
 	mixedText := strings.Repeat("测试Test汉字", megabyte/20) // Create large mixed text
-	result = truncateStringForTSVectorField(mixedText)
+	result = truncateStringForTSVectorField(mixedText, megabyte)

 	if len(result) >= megabyte {
 		t.Errorf("Mixed text should be truncated under 1MB, got %d bytes", len(result))
@ -56,7 +57,7 @@ func TestTruncateStringForTSVectorField(t *testing.T) {

 	// Test case 5: Large text ending with ASCII characters
 	asciiSuffix := strings.Repeat("a", megabyte-100) + strings.Repeat("测试", 50) + "abcdef"
-	result = truncateStringForTSVectorField(asciiSuffix)
+	result = truncateStringForTSVectorField(asciiSuffix, megabyte)

 	if len(result) >= megabyte {
 		t.Errorf("ASCII suffix text should be truncated under 1MB, got %d bytes", len(result))
@ -69,7 +70,7 @@ func TestTruncateStringForTSVectorField(t *testing.T) {

 	// Test case 6: Large ASCII text to cover ASCII branch in UTF-8 detection
 	largeAscii := strings.Repeat("abcdefghijklmnopqrstuvwxyz", megabyte/26+1000)
-	result = truncateStringForTSVectorField(largeAscii)
+	result = truncateStringForTSVectorField(largeAscii, megabyte)

 	if len(result) >= megabyte {
 		t.Errorf("Large ASCII text should be truncated under 1MB, got %d bytes", len(result))
@ -87,7 +88,7 @@ func TestTruncateStringForTSVectorField(t *testing.T) {
 	for i := range invalidBytes {
 		invalidBytes[i] = 0x80 // Continuation byte without start byte
 	}
-	result = truncateStringForTSVectorField(string(invalidBytes))
+	result = truncateStringForTSVectorField(string(invalidBytes), megabyte)

 	// Should return empty string as fallback
 	if result != "" {