From f2f60a8f73ec2f1c82a9834452a29930f3302dbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Thu, 6 Feb 2025 21:17:10 -0800 Subject: [PATCH] feat(sanitizer): improve text truncation with better space handling --- internal/reader/sanitizer/truncate.go | 5 ++- internal/reader/sanitizer/truncate_test.go | 50 ++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/internal/reader/sanitizer/truncate.go b/internal/reader/sanitizer/truncate.go index bac2b453..c6afdd75 100644 --- a/internal/reader/sanitizer/truncate.go +++ b/internal/reader/sanitizer/truncate.go @@ -9,8 +9,9 @@ func TruncateHTML(input string, max int) string { text := StripTags(input) text = strings.ReplaceAll(text, "\n", " ") text = strings.ReplaceAll(text, "\t", " ") - text = strings.ReplaceAll(text, " ", " ") - text = strings.TrimSpace(text) + + // Collapse multiple spaces into a single space + text = strings.Join(strings.Fields(text), " ") // Convert to runes to be safe with unicode runes := []rune(text) diff --git a/internal/reader/sanitizer/truncate_test.go b/internal/reader/sanitizer/truncate_test.go index bb50f039..0cd7fcdb 100644 --- a/internal/reader/sanitizer/truncate_test.go +++ b/internal/reader/sanitizer/truncate_test.go @@ -62,3 +62,53 @@ func TestTruncateHTMLWithMultilineTextLowerThanLimit(t *testing.T) { t.Errorf(`Wrong output: %q != %q`, expected, output) } } + +func TestTruncateHTMLWithMultipleSpaces(t *testing.T) { + tests := []struct { + name string + input string + maxLen int + expected string + }{ + { + name: "multiple spaces", + input: "hello world test", + maxLen: 20, + expected: "hello world test", + }, + { + name: "tabs and newlines", + input: "hello\t\tworld\n\ntest", + maxLen: 20, + expected: "hello world test", + }, + { + name: "truncation with unicode", + input: "hello world 你好", + maxLen: 11, + expected: "hello world…", + }, + { + name: "html stripping", + input: "

hello world test

", + maxLen: 20, + expected: "hello world test", + }, + { + name: "no truncation needed", + input: "hello world", + maxLen: 20, + expected: "hello world", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := TruncateHTML(tt.input, tt.maxLen) + if result != tt.expected { + t.Errorf("TruncateHTML(%q, %d) = %q, want %q", + tt.input, tt.maxLen, result, tt.expected) + } + }) + } +}