// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. // SPDX-License-Identifier: Apache-2.0 package storage import ( "strings" "testing" ) func TestTruncateStringForTSVectorField(t *testing.T) { // Test case 1: Short Chinese text should not be truncated shortText := "这是一个简短的中文测试文本" result := truncateStringForTSVectorField(shortText) if result != shortText { t.Errorf("Short text should not be truncated, got %s", result) } // Test case 2: Long Chinese text should be truncated to stay under 1MB // Generate a long Chinese string that would exceed 1MB const megabyte = 1024 * 1024 chineseChar := "汉" longText := strings.Repeat(chineseChar, megabyte/len(chineseChar)+1000) // Ensure it exceeds 1MB result = truncateStringForTSVectorField(longText) // Verify the result is under 1MB if len(result) >= megabyte { t.Errorf("Truncated text should be under 1MB, got %d bytes", len(result)) } // Verify the result is still valid UTF-8 and doesn't cut in the middle of a character if !strings.HasPrefix(longText, result) { t.Error("Truncated text should be a prefix of original text") } // Test case 3: Text exactly at limit should not be truncated limitText := strings.Repeat("a", megabyte-1) result = truncateStringForTSVectorField(limitText) if result != limitText { t.Error("Text under limit should not be truncated") } // Test case 4: Mixed Chinese and ASCII text mixedText := strings.Repeat("测试Test汉字", megabyte/20) // Create large mixed text result = truncateStringForTSVectorField(mixedText) if len(result) >= megabyte { t.Errorf("Mixed text should be truncated under 1MB, got %d bytes", len(result)) } // Verify no broken UTF-8 sequences if !strings.HasPrefix(mixedText, result) { t.Error("Truncated mixed text should be a valid prefix") } // Test case 5: Large text ending with ASCII characters asciiSuffix := strings.Repeat("a", megabyte-100) + strings.Repeat("测试", 50) + "abcdef" result = truncateStringForTSVectorField(asciiSuffix) if len(result) >= megabyte { t.Errorf("ASCII suffix text should be truncated under 1MB, got %d bytes", len(result)) } // Should end with ASCII character if !strings.HasPrefix(asciiSuffix, result) { t.Error("Truncated ASCII suffix text should be a valid prefix") } // Test case 6: Large ASCII text to cover ASCII branch in UTF-8 detection largeAscii := strings.Repeat("abcdefghijklmnopqrstuvwxyz", megabyte/26+1000) result = truncateStringForTSVectorField(largeAscii) if len(result) >= megabyte { t.Errorf("Large ASCII text should be truncated under 1MB, got %d bytes", len(result)) } // Should be a prefix if !strings.HasPrefix(largeAscii, result) { t.Error("Truncated ASCII text should be a valid prefix") } // Test case 7: Edge case - string that would trigger the fallback // Create a pathological case: all continuation bytes without start bytes // This should trigger the fallback because there are no valid UTF-8 boundaries invalidBytes := make([]byte, megabyte) for i := range invalidBytes { invalidBytes[i] = 0x80 // Continuation byte without start byte } result = truncateStringForTSVectorField(string(invalidBytes)) // Should return empty string as fallback if result != "" { t.Errorf("Invalid UTF-8 continuation bytes should return empty string, got %d bytes", len(result)) } }