mirror of
https://github.com/miniflux/v2.git
synced 2025-08-26 18:21:01 +00:00
fix(storage): index only the first 500K characters of the article contents to avoid tsvector limits
The length of a tsvector (lexemes + positions) must be less than 1 megabyte. We don't need to index the entire content, and we need to keep a buffer for the positions.
This commit is contained in:
parent
5403ca09f6
commit
84ae1d5dc0
2 changed files with 26 additions and 19 deletions
|
@ -69,6 +69,7 @@ func (s *Storage) NewEntryQueryBuilder(userID int64) *EntryQueryBuilder {
|
||||||
|
|
||||||
// UpdateEntryTitleAndContent updates entry title and content.
|
// UpdateEntryTitleAndContent updates entry title and content.
|
||||||
func (s *Storage) UpdateEntryTitleAndContent(entry *model.Entry) error {
|
func (s *Storage) UpdateEntryTitleAndContent(entry *model.Entry) error {
|
||||||
|
truncatedTitle, truncatedContent := truncateTitleAndContentForTSVectorField(entry.Title, entry.Content)
|
||||||
query := `
|
query := `
|
||||||
UPDATE
|
UPDATE
|
||||||
entries
|
entries
|
||||||
|
@ -86,8 +87,8 @@ func (s *Storage) UpdateEntryTitleAndContent(entry *model.Entry) error {
|
||||||
entry.Title,
|
entry.Title,
|
||||||
entry.Content,
|
entry.Content,
|
||||||
entry.ReadingTime,
|
entry.ReadingTime,
|
||||||
truncateStringForTSVectorField(entry.Title),
|
truncatedTitle,
|
||||||
truncateStringForTSVectorField(entry.Content),
|
truncatedContent,
|
||||||
entry.ID,
|
entry.ID,
|
||||||
entry.UserID); err != nil {
|
entry.UserID); err != nil {
|
||||||
return fmt.Errorf(`store: unable to update entry #%d: %v`, entry.ID, err)
|
return fmt.Errorf(`store: unable to update entry #%d: %v`, entry.ID, err)
|
||||||
|
@ -98,6 +99,7 @@ func (s *Storage) UpdateEntryTitleAndContent(entry *model.Entry) error {
|
||||||
|
|
||||||
// createEntry adds a new entry.
|
// createEntry adds a new entry.
|
||||||
func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
|
func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
|
||||||
|
truncatedTitle, truncatedContent := truncateTitleAndContentForTSVectorField(entry.Title, entry.Content)
|
||||||
query := `
|
query := `
|
||||||
INSERT INTO entries
|
INSERT INTO entries
|
||||||
(
|
(
|
||||||
|
@ -146,8 +148,8 @@ func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
|
||||||
entry.UserID,
|
entry.UserID,
|
||||||
entry.FeedID,
|
entry.FeedID,
|
||||||
entry.ReadingTime,
|
entry.ReadingTime,
|
||||||
truncateStringForTSVectorField(entry.Title),
|
truncatedTitle,
|
||||||
truncateStringForTSVectorField(entry.Content),
|
truncatedContent,
|
||||||
pq.Array(entry.Tags),
|
pq.Array(entry.Tags),
|
||||||
).Scan(
|
).Scan(
|
||||||
&entry.ID,
|
&entry.ID,
|
||||||
|
@ -175,6 +177,7 @@ func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
|
||||||
// Note: we do not update the published date because some feeds do not contain any date;
|
// Note: we do not update the published date because some feeds do not contain any date;
|
||||||
// it defaults to time.Now(), which could change the order of items on the history page.
|
// it defaults to time.Now(), which could change the order of items on the history page.
|
||||||
func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error {
|
func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error {
|
||||||
|
truncatedTitle, truncatedContent := truncateTitleAndContentForTSVectorField(entry.Title, entry.Content)
|
||||||
query := `
|
query := `
|
||||||
UPDATE
|
UPDATE
|
||||||
entries
|
entries
|
||||||
|
@ -200,8 +203,8 @@ func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error {
|
||||||
entry.Content,
|
entry.Content,
|
||||||
entry.Author,
|
entry.Author,
|
||||||
entry.ReadingTime,
|
entry.ReadingTime,
|
||||||
truncateStringForTSVectorField(entry.Title),
|
truncatedTitle,
|
||||||
truncateStringForTSVectorField(entry.Content),
|
truncatedContent,
|
||||||
entry.UserID,
|
entry.UserID,
|
||||||
entry.FeedID,
|
entry.FeedID,
|
||||||
entry.Hash,
|
entry.Hash,
|
||||||
|
@ -702,17 +705,20 @@ func (s *Storage) UnshareEntry(userID int64, entryID int64) (err error) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// truncateStringForTSVectorField truncates a string to fit within the maximum size for a TSVector field in PostgreSQL.
|
func truncateTitleAndContentForTSVectorField(title, content string) (string, string) {
|
||||||
func truncateStringForTSVectorField(s string) string {
|
|
||||||
// The length of a tsvector (lexemes + positions) must be less than 1 megabyte.
|
// The length of a tsvector (lexemes + positions) must be less than 1 megabyte.
|
||||||
const maxTSVectorSize = 1024 * 1024
|
// We don't need to index the entire content, and we need to keep a buffer for the positions.
|
||||||
|
return truncateStringForTSVectorField(title, 200000), truncateStringForTSVectorField(content, 500000)
|
||||||
|
}
|
||||||
|
|
||||||
if len(s) < maxTSVectorSize {
|
// truncateStringForTSVectorField truncates a string without breaking UTF-8 characters.
|
||||||
|
func truncateStringForTSVectorField(s string, maxSize int) string {
|
||||||
|
if len(s) < maxSize {
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
// Truncate to fit under the limit, ensuring we don't break UTF-8 characters
|
// Truncate to fit under the limit, ensuring we don't break UTF-8 characters
|
||||||
truncated := s[:maxTSVectorSize-1]
|
truncated := s[:maxSize-1]
|
||||||
|
|
||||||
// Walk backwards to find the last complete UTF-8 character
|
// Walk backwards to find the last complete UTF-8 character
|
||||||
for i := len(truncated) - 1; i >= 0; i-- {
|
for i := len(truncated) - 1; i >= 0; i-- {
|
||||||
|
|
|
@ -9,20 +9,21 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestTruncateStringForTSVectorField(t *testing.T) {
|
func TestTruncateStringForTSVectorField(t *testing.T) {
|
||||||
|
const megabyte = 1024 * 1024
|
||||||
|
|
||||||
// Test case 1: Short Chinese text should not be truncated
|
// Test case 1: Short Chinese text should not be truncated
|
||||||
shortText := "这是一个简短的中文测试文本"
|
shortText := "这是一个简短的中文测试文本"
|
||||||
result := truncateStringForTSVectorField(shortText)
|
result := truncateStringForTSVectorField(shortText, megabyte)
|
||||||
if result != shortText {
|
if result != shortText {
|
||||||
t.Errorf("Short text should not be truncated, got %s", result)
|
t.Errorf("Short text should not be truncated, got %s", result)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test case 2: Long Chinese text should be truncated to stay under 1MB
|
// Test case 2: Long Chinese text should be truncated to stay under 1MB
|
||||||
// Generate a long Chinese string that would exceed 1MB
|
// Generate a long Chinese string that would exceed 1MB
|
||||||
const megabyte = 1024 * 1024
|
|
||||||
chineseChar := "汉"
|
chineseChar := "汉"
|
||||||
longText := strings.Repeat(chineseChar, megabyte/len(chineseChar)+1000) // Ensure it exceeds 1MB
|
longText := strings.Repeat(chineseChar, megabyte/len(chineseChar)+1000) // Ensure it exceeds 1MB
|
||||||
|
|
||||||
result = truncateStringForTSVectorField(longText)
|
result = truncateStringForTSVectorField(longText, megabyte)
|
||||||
|
|
||||||
// Verify the result is under 1MB
|
// Verify the result is under 1MB
|
||||||
if len(result) >= megabyte {
|
if len(result) >= megabyte {
|
||||||
|
@ -36,14 +37,14 @@ func TestTruncateStringForTSVectorField(t *testing.T) {
|
||||||
|
|
||||||
// Test case 3: Text just under the limit (megabyte-1 bytes) should not be truncated
|
// Test case 3: Text just under the limit (megabyte-1 bytes) should not be truncated
|
||||||
limitText := strings.Repeat("a", megabyte-1)
|
limitText := strings.Repeat("a", megabyte-1)
|
||||||
result = truncateStringForTSVectorField(limitText)
|
result = truncateStringForTSVectorField(limitText, megabyte)
|
||||||
if result != limitText {
|
if result != limitText {
|
||||||
t.Error("Text under limit should not be truncated")
|
t.Error("Text under limit should not be truncated")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test case 4: Mixed Chinese and ASCII text
|
// Test case 4: Mixed Chinese and ASCII text
|
||||||
mixedText := strings.Repeat("测试Test汉字", megabyte/20) // Create large mixed text
|
mixedText := strings.Repeat("测试Test汉字", megabyte/20) // Create large mixed text
|
||||||
result = truncateStringForTSVectorField(mixedText)
|
result = truncateStringForTSVectorField(mixedText, megabyte)
|
||||||
|
|
||||||
if len(result) >= megabyte {
|
if len(result) >= megabyte {
|
||||||
t.Errorf("Mixed text should be truncated under 1MB, got %d bytes", len(result))
|
t.Errorf("Mixed text should be truncated under 1MB, got %d bytes", len(result))
|
||||||
|
@ -56,7 +57,7 @@ func TestTruncateStringForTSVectorField(t *testing.T) {
|
||||||
|
|
||||||
// Test case 5: Large text ending with ASCII characters
|
// Test case 5: Large text ending with ASCII characters
|
||||||
asciiSuffix := strings.Repeat("a", megabyte-100) + strings.Repeat("测试", 50) + "abcdef"
|
asciiSuffix := strings.Repeat("a", megabyte-100) + strings.Repeat("测试", 50) + "abcdef"
|
||||||
result = truncateStringForTSVectorField(asciiSuffix)
|
result = truncateStringForTSVectorField(asciiSuffix, megabyte)
|
||||||
|
|
||||||
if len(result) >= megabyte {
|
if len(result) >= megabyte {
|
||||||
t.Errorf("ASCII suffix text should be truncated under 1MB, got %d bytes", len(result))
|
t.Errorf("ASCII suffix text should be truncated under 1MB, got %d bytes", len(result))
|
||||||
|
@ -69,7 +70,7 @@ func TestTruncateStringForTSVectorField(t *testing.T) {
|
||||||
|
|
||||||
// Test case 6: Large ASCII text to cover ASCII branch in UTF-8 detection
|
// Test case 6: Large ASCII text to cover ASCII branch in UTF-8 detection
|
||||||
largeAscii := strings.Repeat("abcdefghijklmnopqrstuvwxyz", megabyte/26+1000)
|
largeAscii := strings.Repeat("abcdefghijklmnopqrstuvwxyz", megabyte/26+1000)
|
||||||
result = truncateStringForTSVectorField(largeAscii)
|
result = truncateStringForTSVectorField(largeAscii, megabyte)
|
||||||
|
|
||||||
if len(result) >= megabyte {
|
if len(result) >= megabyte {
|
||||||
t.Errorf("Large ASCII text should be truncated under 1MB, got %d bytes", len(result))
|
t.Errorf("Large ASCII text should be truncated under 1MB, got %d bytes", len(result))
|
||||||
|
@ -87,7 +88,7 @@ func TestTruncateStringForTSVectorField(t *testing.T) {
|
||||||
for i := range invalidBytes {
|
for i := range invalidBytes {
|
||||||
invalidBytes[i] = 0x80 // Continuation byte without start byte
|
invalidBytes[i] = 0x80 // Continuation byte without start byte
|
||||||
}
|
}
|
||||||
result = truncateStringForTSVectorField(string(invalidBytes))
|
result = truncateStringForTSVectorField(string(invalidBytes), megabyte)
|
||||||
|
|
||||||
// Should return empty string as fallback
|
// Should return empty string as fallback
|
||||||
if result != "" {
|
if result != "" {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue