From 7f54b270791f842546cd9a259894867aa77d76c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= Date: Sat, 15 Feb 2025 14:46:04 -0800 Subject: [PATCH] fix(rss): handle item title with CDATA content correctly Fix regression introduced in commit a3ce03cc --- internal/reader/rss/adapter.go | 2 +- internal/reader/rss/parser_test.go | 335 +++++++++++++++-------------- internal/reader/rss/rss.go | 32 ++- 3 files changed, 205 insertions(+), 164 deletions(-) diff --git a/internal/reader/rss/adapter.go b/internal/reader/rss/adapter.go index be5b23b3..cbde88b3 100644 --- a/internal/reader/rss/adapter.go +++ b/internal/reader/rss/adapter.go @@ -173,7 +173,7 @@ func findFeedAuthor(rssChannel *RSSChannel) string { } func findEntryTitle(rssItem *RSSItem) string { - title := sanitizer.StripTags(rssItem.Title.Inner) + title := rssItem.Title.Content if rssItem.DublinCoreTitle != "" { title = rssItem.DublinCoreTitle diff --git a/internal/reader/rss/parser_test.go b/internal/reader/rss/parser_test.go index 50b3f446..7bee9b29 100644 --- a/internal/reader/rss/parser_test.go +++ b/internal/reader/rss/parser_test.go @@ -311,6 +311,184 @@ func TestParseEntryWithDCTitleOnly(t *testing.T) { } } +func TestParseFeedTitleWithHTMLEntity(t *testing.T) { + data := ` + + + https://example.org/ + Example   Feed + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Title != "Example \u00a0 Feed" { + t.Errorf(`Incorrect title, got: %q`, feed.Title) + } +} + +func TestParseFeedTitleWithUnicodeEntityAndCdata(t *testing.T) { + data := ` + + + https://example.org/ + <![CDATA[Jenny’s Newsletter]]> + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Title != `Jenny’s Newsletter` { + t.Errorf(`Incorrect title, got: %q`, feed.Title) + } +} + +func TestParseItemTitleWithHTMLEntity(t *testing.T) { + data := ` + + + https://example.org/ + Example + + </example> + http://www.example.org/entries/1 + + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "" { + t.Errorf(`Incorrect title, got: %q`, feed.Entries[0].Title) + } +} + +func TestParseItemTitleWithNumericCharacterReference(t *testing.T) { + data := ` + + + https://example.org/ + Example + + Σ ß + http://www.example.org/article.html + + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "Σ ß" { + t.Errorf(`Incorrect title, got: %q`, feed.Entries[0].Title) + } +} + +func TestParseItemTitleWithDoubleEncodedEntities(t *testing.T) { + data := ` + + + https://example.org/ + Example + + &#39;Text&#39; + http://www.example.org/article.html + + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "'Text'" { + t.Errorf(`Incorrect title, got: %q`, feed.Entries[0].Title) + } +} + +func TestParseItemTitleWithWhitespaces(t *testing.T) { + data := ` + + + Example + http://example.org + + + Some Title + + http://www.example.org/entries/1 + + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "Some Title" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseItemTitleWithCDATA(t *testing.T) { + data := ` + + + Example + http://example.org + + <![CDATA[This is a title]]> + http://www.example.org/entries/1 + + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "This is a title" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseItemTitleWithInnerHTML(t *testing.T) { + data := ` + + + Example + http://example.org + + Test: <b>bold</b> + http://www.example.org/entries/1 + + + ` + + feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "Test: bold" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + func TestParseEntryWithoutLink(t *testing.T) { data := ` @@ -997,56 +1175,6 @@ func TestParseEntryWithFeedBurnerLink(t *testing.T) { } } -func TestParseEntryTitleWithWhitespaces(t *testing.T) { - data := ` - - - Example - http://example.org - - - Some Title - - http://www.example.org/entries/1 - Fri, 15 Jul 2005 00:00:00 -0500 - - - ` - - feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) - if err != nil { - t.Fatal(err) - } - - if feed.Entries[0].Title != "Some Title" { - t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) - } -} - -func TestParseEntryTitleWithInnerHTML(t *testing.T) { - data := ` - - - Example - http://example.org - - Test: <b>bold</b> - http://www.example.org/entries/1 - Fri, 15 Jul 2005 00:00:00 -0500 - - - ` - - feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) - if err != nil { - t.Fatal(err) - } - - if feed.Entries[0].Title != "Test: bold" { - t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) - } -} - func TestParseEntryWithEnclosures(t *testing.T) { data := ` @@ -1404,113 +1532,6 @@ func TestParseInvalidXml(t *testing.T) { } } -func TestParseFeedTitleWithHTMLEntity(t *testing.T) { - data := ` - - - https://example.org/ - Example   Feed - - ` - - feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) - if err != nil { - t.Fatal(err) - } - - if feed.Title != "Example \u00a0 Feed" { - t.Errorf(`Incorrect title, got: %q`, feed.Title) - } -} - -func TestParseFeedTitleWithUnicodeEntityAndCdata(t *testing.T) { - data := ` - - - https://example.org/ - <![CDATA[Jenny’s Newsletter]]> - - ` - - feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) - if err != nil { - t.Fatal(err) - } - - if feed.Title != `Jenny’s Newsletter` { - t.Errorf(`Incorrect title, got: %q`, feed.Title) - } -} - -func TestParseItemTitleWithHTMLEntity(t *testing.T) { - data := ` - - - https://example.org/ - Example - - </example> - http://www.example.org/entries/1 - - - ` - - feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) - if err != nil { - t.Fatal(err) - } - - if feed.Entries[0].Title != "" { - t.Errorf(`Incorrect title, got: %q`, feed.Entries[0].Title) - } -} - -func TestParseItemTitleWithNumericCharacterReference(t *testing.T) { - data := ` - - - https://example.org/ - Example - - Σ ß - http://www.example.org/article.html - - - ` - - feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) - if err != nil { - t.Fatal(err) - } - - if feed.Entries[0].Title != "Σ ß" { - t.Errorf(`Incorrect title, got: %q`, feed.Entries[0].Title) - } -} - -func TestParseItemTitleWithDoubleEncodedEntities(t *testing.T) { - data := ` - - - https://example.org/ - Example - - &#39;Text&#39; - http://www.example.org/article.html - - - ` - - feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data))) - if err != nil { - t.Fatal(err) - } - - if feed.Entries[0].Title != "'Text'" { - t.Errorf(`Incorrect title, got: %q`, feed.Entries[0].Title) - } -} - func TestParseFeedLinkWithInvalidCharacterEntity(t *testing.T) { data := ` diff --git a/internal/reader/rss/rss.go b/internal/reader/rss/rss.go index 8ac4982f..5c65c8f6 100644 --- a/internal/reader/rss/rss.go +++ b/internal/reader/rss/rss.go @@ -111,7 +111,7 @@ type RSSImage struct { type RSSItem struct { // Title is the title of the item. - Title RSSTitle `xml:"rss title"` + Title InnerContent `xml:"rss title"` // Link is the URL of the item. Link string `xml:"rss link"` @@ -169,11 +169,6 @@ type RSSItem struct { googleplay.GooglePlayItemElement } -type RSSTitle struct { - Data string `xml:",chardata"` - Inner string `xml:",innerxml"` -} - type RSSAuthor struct { XMLName xml.Name Data string `xml:",chardata"` @@ -203,3 +198,28 @@ type RSSSource struct { URL string `xml:"url,attr"` Name string `xml:",chardata"` } + +type InnerContent struct { + Content string +} + +func (ic *InnerContent) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { + var content strings.Builder + + for { + token, err := d.Token() + if err != nil { + return err + } + + switch t := token.(type) { + case xml.CharData: + content.Write(t) + case xml.EndElement: + if t == start.End() { + ic.Content = strings.TrimSpace(content.String()) + return nil + } + } + } +}