Refactor Atom parser to use an adapter

2025-10-05 19:31:01 +00:00 · 2024-03-15 16:39:32 -07:00 · 2024-03-15 16:39:32 -07:00 · dd4fb660c1
commit dd4fb660c1
parent 2ba893bc79
11 changed files with 795 additions and 500 deletions
--- a/internal/reader/atom/atom_10.go
+++ b/internal/reader/atom/atom_10.go
@ -6,286 +6,199 @@ package atom // import "miniflux.app/v2/internal/reader/atom"
 import (
 	"encoding/xml"
 	"html"
-	"log/slog"
-	"strconv"
 	"strings"
-	"time"

-	"miniflux.app/v2/internal/crypto"
-	"miniflux.app/v2/internal/model"
-	"miniflux.app/v2/internal/reader/date"
 	"miniflux.app/v2/internal/reader/media"
 	"miniflux.app/v2/internal/reader/sanitizer"
-	"miniflux.app/v2/internal/urllib"
 )

+// The "atom:feed" element is the document (i.e., top-level) element of
+// an Atom Feed Document, acting as a container for metadata and data
+// associated with the feed. Its element children consist of metadata
+// elements followed by zero or more atom:entry child elements.
+//
 // Specs:
 // https://tools.ietf.org/html/rfc4287
 // https://validator.w3.org/feed/docs/atom.html
-type atom10Feed struct {
-	XMLName xml.Name      `xml:"http://www.w3.org/2005/Atom feed"`
-	ID      string        `xml:"id"`
-	Title   atom10Text    `xml:"title"`
-	Authors atomAuthors   `xml:"author"`
-	Icon    string        `xml:"icon"`
-	Links   atomLinks     `xml:"link"`
-	Entries []atom10Entry `xml:"entry"`
+type Atom10Feed struct {
+	XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
+
+	// The "atom:id" element conveys a permanent, universally unique
+	// identifier for an entry or feed.
+	//
+	// Its content MUST be an IRI, as defined by [RFC3987].  Note that the
+	// definition of "IRI" excludes relative references.  Though the IRI
+	// might use a dereferencable scheme, Atom Processors MUST NOT assume it
+	// can be dereferenced.
+	//
+	// atom:feed elements MUST contain exactly one atom:id element.
+	ID string `xml:"http://www.w3.org/2005/Atom id"`
+
+	// The "atom:title" element is a Text construct that conveys a human-
+	// readable title for an entry or feed.
+	//
+	// atom:feed elements MUST contain exactly one atom:title element.
+	Title Atom10Text `xml:"http://www.w3.org/2005/Atom title"`
+
+	// The "atom:author" element is a Person construct that indicates the
+	// author of the entry or feed.
+	//
+	// atom:feed elements MUST contain one or more atom:author elements,
+	// unless all of the atom:feed element's child atom:entry elements
+	// contain at least one atom:author element.
+	Authors AtomPersons `xml:"http://www.w3.org/2005/Atom author"`
+
+	// The "atom:icon" element's content is an IRI reference [RFC3987] that
+	// identifies an image that provides iconic visual identification for a
+	// feed.
+	//
+	// atom:feed elements MUST NOT contain more than one atom:icon element.
+	Icon string `xml:"http://www.w3.org/2005/Atom icon"`
+
+	// The "atom:logo" element's content is an IRI reference [RFC3987] that
+	// identifies an image that provides visual identification for a feed.
+	//
+	// atom:feed elements MUST NOT contain more than one atom:logo element.
+	Logo string `xml:"http://www.w3.org/2005/Atom logo"`
+
+	// atom:feed elements SHOULD contain one atom:link element with a rel
+	// attribute value of "self". This is the preferred URI for
+	// retrieving Atom Feed Documents representing this Atom feed.
+	//
+	// atom:feed elements MUST NOT contain more than one atom:link
+	// element with a rel attribute value of "alternate" that has the
+	// same combination of type and hreflang attribute values.
+	Links AtomLinks `xml:"http://www.w3.org/2005/Atom link"`
+
+	// The "atom:category" element conveys information about a category
+	// associated with an entry or feed.  This specification assigns no
+	// meaning to the content (if any) of this element.
+	//
+	// atom:feed elements MAY contain any number of atom:category
+	// elements.
+	Categories AtomCategories `xml:"http://www.w3.org/2005/Atom category"`
+
+	Entries []Atom10Entry `xml:"http://www.w3.org/2005/Atom entry"`
 }

-func (a *atom10Feed) Transform(baseURL string) *model.Feed {
-	var err error
+type Atom10Entry struct {
+	// The "atom:id" element conveys a permanent, universally unique
+	// identifier for an entry or feed.
+	//
+	// Its content MUST be an IRI, as defined by [RFC3987].  Note that the
+	// definition of "IRI" excludes relative references.  Though the IRI
+	// might use a dereferencable scheme, Atom Processors MUST NOT assume it
+	// can be dereferenced.
+	//
+	// atom:entry elements MUST contain exactly one atom:id element.
+	ID string `xml:"http://www.w3.org/2005/Atom id"`

-	feed := new(model.Feed)
+	// The "atom:title" element is a Text construct that conveys a human-
+	// readable title for an entry or feed.
+	//
+	// atom:entry elements MUST contain exactly one atom:title element.
+	Title Atom10Text `xml:"http://www.w3.org/2005/Atom title"`

-	feedURL := a.Links.firstLinkWithRelation("self")
-	feed.FeedURL, err = urllib.AbsoluteURL(baseURL, feedURL)
-	if err != nil {
-		feed.FeedURL = feedURL
-	}
+	// The "atom:published" element is a Date construct indicating an
+	// instant in time associated with an event early in the life cycle of
+	// the entry.
+	Published string `xml:"http://www.w3.org/2005/Atom published"`

-	siteURL := a.Links.originalLink()
-	feed.SiteURL, err = urllib.AbsoluteURL(baseURL, siteURL)
-	if err != nil {
-		feed.SiteURL = siteURL
-	}
+	// The "atom:updated" element is a Date construct indicating the most
+	// recent instant in time when an entry or feed was modified in a way
+	// the publisher considers significant. Therefore, not all
+	// modifications necessarily result in a changed atom:updated value.
+	//
+	// atom:entry elements MUST contain exactly one atom:updated element.
+	Updated string `xml:"http://www.w3.org/2005/Atom updated"`

-	feed.Title = html.UnescapeString(a.Title.String())
-	if feed.Title == "" {
-		feed.Title = feed.SiteURL
-	}
+	// atom:entry elements MUST NOT contain more than one atom:link
+	// element with a rel attribute value of "alternate" that has the
+	// same combination of type and hreflang attribute values.
+	Links AtomLinks `xml:"http://www.w3.org/2005/Atom link"`

-	feed.IconURL = strings.TrimSpace(a.Icon)
+	// atom:entry elements MUST contain an atom:summary element in either
+	// of the following cases:
+	// *  the atom:entry contains an atom:content that has a "src"
+	//    attribute (and is thus empty).
+	// *  the atom:entry contains content that is encoded in Base64;
+	//    i.e., the "type" attribute of atom:content is a MIME media type
+	//    [MIMEREG], but is not an XML media type [RFC3023], does not
+	//    begin with "text/", and does not end with "/xml" or "+xml".
+	//
+	// atom:entry elements MUST NOT contain more than one atom:summary
+	// element.
+	Summary Atom10Text `xml:"http://www.w3.org/2005/Atom summary"`

-	for _, entry := range a.Entries {
-		item := entry.Transform()
-		entryURL, err := urllib.AbsoluteURL(feed.SiteURL, item.URL)
-		if err == nil {
-			item.URL = entryURL
-		}
+	// atom:entry elements MUST NOT contain more than one atom:content
+	// element.
+	Content Atom10Text `xml:"http://www.w3.org/2005/Atom content"`

-		if item.Author == "" {
-			item.Author = a.Authors.String()
-		}
+	// The "atom:author" element is a Person construct that indicates the
+	// author of the entry or feed.
+	//
+	// atom:entry elements MUST contain one or more atom:author elements, unless atom:source or the enclosing atom:feed supplies an author.
+	Authors AtomPersons `xml:"http://www.w3.org/2005/Atom author"`

-		if item.Title == "" {
-			item.Title = sanitizer.TruncateHTML(item.Content, 100)
-		}
+	// The "atom:category" element conveys information about a category
+	// associated with an entry or feed.  This specification assigns no
+	// meaning to the content (if any) of this element.
+	//
+	// atom:entry elements MAY contain any number of atom:category
+	// elements.
+	Categories AtomCategories `xml:"http://www.w3.org/2005/Atom category"`

-		if item.Title == "" {
-			item.Title = item.URL
-		}
-
-		feed.Entries = append(feed.Entries, item)
-	}
-
-	return feed
-}
-
-type atom10Entry struct {
-	ID         string           `xml:"id"`
-	Title      atom10Text       `xml:"title"`
-	Published  string           `xml:"published"`
-	Updated    string           `xml:"updated"`
-	Links      atomLinks        `xml:"link"`
-	Summary    atom10Text       `xml:"summary"`
-	Content    atom10Text       `xml:"http://www.w3.org/2005/Atom content"`
-	Authors    atomAuthors      `xml:"author"`
-	Categories []atom10Category `xml:"category"`
 	media.MediaItemElement
 }

-func (a *atom10Entry) Transform() *model.Entry {
-	entry := model.NewEntry()
-	entry.URL = a.Links.originalLink()
-	entry.Date = a.entryDate()
-	entry.Author = a.Authors.String()
-	entry.Hash = a.entryHash()
-	entry.Content = a.entryContent()
-	entry.Title = a.entryTitle()
-	entry.Enclosures = a.entryEnclosures()
-	entry.CommentsURL = a.entryCommentsURL()
-	entry.Tags = a.entryCategories()
-	return entry
-}
-
-func (a *atom10Entry) entryTitle() string {
-	return html.UnescapeString(a.Title.String())
-}
-
-func (a *atom10Entry) entryContent() string {
-	content := a.Content.String()
-	if content != "" {
-		return content
-	}
-
-	summary := a.Summary.String()
-	if summary != "" {
-		return summary
-	}
-
-	mediaDescription := a.FirstMediaDescription()
-	if mediaDescription != "" {
-		return mediaDescription
-	}
-
-	return ""
-}
-
-// Note: The published date represents the original creation date for YouTube feeds.
-// Example:
-// <published>2019-01-26T08:02:28+00:00</published>
-// <updated>2019-01-29T07:27:27+00:00</updated>
-func (a *atom10Entry) entryDate() time.Time {
-	dateText := a.Published
-	if dateText == "" {
-		dateText = a.Updated
-	}
-
-	if dateText != "" {
-		result, err := date.Parse(dateText)
-		if err != nil {
-			slog.Debug("Unable to parse date from Atom 0.3 feed",
-				slog.String("date", dateText),
-				slog.String("id", a.ID),
-				slog.Any("error", err),
-			)
-			return time.Now()
-		}
-
-		return result
-	}
-
-	return time.Now()
-}
-
-func (a *atom10Entry) entryHash() string {
-	for _, value := range []string{a.ID, a.Links.originalLink()} {
-		if value != "" {
-			return crypto.Hash(value)
-		}
-	}
-
-	return ""
-}
-
-func (a *atom10Entry) entryEnclosures() model.EnclosureList {
-	enclosures := make(model.EnclosureList, 0)
-	duplicates := make(map[string]bool)
-
-	for _, mediaThumbnail := range a.AllMediaThumbnails() {
-		if _, found := duplicates[mediaThumbnail.URL]; !found {
-			duplicates[mediaThumbnail.URL] = true
-			enclosures = append(enclosures, &model.Enclosure{
-				URL:      mediaThumbnail.URL,
-				MimeType: mediaThumbnail.MimeType(),
-				Size:     mediaThumbnail.Size(),
-			})
-		}
-	}
-
-	for _, link := range a.Links {
-		if strings.EqualFold(link.Rel, "enclosure") {
-			if link.URL == "" {
-				continue
-			}
-
-			if _, found := duplicates[link.URL]; !found {
-				duplicates[link.URL] = true
-				length, _ := strconv.ParseInt(link.Length, 10, 0)
-				enclosures = append(enclosures, &model.Enclosure{URL: link.URL, MimeType: link.Type, Size: length})
-			}
-		}
-	}
-
-	for _, mediaContent := range a.AllMediaContents() {
-		if _, found := duplicates[mediaContent.URL]; !found {
-			duplicates[mediaContent.URL] = true
-			enclosures = append(enclosures, &model.Enclosure{
-				URL:      mediaContent.URL,
-				MimeType: mediaContent.MimeType(),
-				Size:     mediaContent.Size(),
-			})
-		}
-	}
-
-	for _, mediaPeerLink := range a.AllMediaPeerLinks() {
-		if _, found := duplicates[mediaPeerLink.URL]; !found {
-			duplicates[mediaPeerLink.URL] = true
-			enclosures = append(enclosures, &model.Enclosure{
-				URL:      mediaPeerLink.URL,
-				MimeType: mediaPeerLink.MimeType(),
-				Size:     mediaPeerLink.Size(),
-			})
-		}
-	}
-
-	return enclosures
-}
-
-func (r *atom10Entry) entryCategories() []string {
-	categoryList := make([]string, 0)
-
-	for _, atomCategory := range r.Categories {
-		if strings.TrimSpace(atomCategory.Label) != "" {
-			categoryList = append(categoryList, strings.TrimSpace(atomCategory.Label))
-		} else {
-			categoryList = append(categoryList, strings.TrimSpace(atomCategory.Term))
-		}
-	}
-
-	return categoryList
-}
-
-// See https://tools.ietf.org/html/rfc4685#section-4
-// If the type attribute of the atom:link is omitted, its value is assumed to be "application/atom+xml".
-// We accept only HTML or XHTML documents for now since the intention is to have the same behavior as RSS.
-func (a *atom10Entry) entryCommentsURL() string {
-	commentsURL := a.Links.firstLinkWithRelationAndType("replies", "text/html", "application/xhtml+xml")
-	if urllib.IsAbsoluteURL(commentsURL) {
-		return commentsURL
-	}
-	return ""
-}
-
-type atom10Text struct {
-	Type             string               `xml:"type,attr"`
-	CharData         string               `xml:",chardata"`
-	InnerXML         string               `xml:",innerxml"`
-	XHTMLRootElement atomXHTMLRootElement `xml:"http://www.w3.org/1999/xhtml div"`
-}
-
-type atom10Category struct {
-	Term  string `xml:"term,attr"`
-	Label string `xml:"label,attr"`
-}
-
+// A Text construct contains human-readable text, usually in small
+// quantities. The content of Text constructs is Language-Sensitive.
+// Specs: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1
 // Text: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.1
 // HTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.2
 // XHTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.3
-func (a *atom10Text) String() string {
+type Atom10Text struct {
+	Type             string               `xml:"type,attr"`
+	CharData         string               `xml:",chardata"`
+	InnerXML         string               `xml:",innerxml"`
+	XHTMLRootElement AtomXHTMLRootElement `xml:"http://www.w3.org/1999/xhtml div"`
+}
+
+func (a *Atom10Text) Body() string {
 	var content string
-	switch {
-	case a.Type == "", a.Type == "text", a.Type == "text/plain":
-		if strings.HasPrefix(strings.TrimSpace(a.InnerXML), `<![CDATA[`) {
-			content = html.EscapeString(a.CharData)
-		} else {
-			content = a.InnerXML
-		}
-	case a.Type == "xhtml":
-		var root = a.XHTMLRootElement
-		if root.XMLName.Local == "div" {
-			content = root.InnerXML
-		} else {
-			content = a.InnerXML
-		}
-	default:
+
+	if strings.EqualFold(a.Type, "xhtml") {
+		content = a.xhtmlContent()
+	} else {
 		content = a.CharData
 	}

 	return strings.TrimSpace(content)
 }

-type atomXHTMLRootElement struct {
+func (a *Atom10Text) Title() string {
+	var content string
+
+	if strings.EqualFold(a.Type, "xhtml") {
+		content = a.xhtmlContent()
+	} else if strings.Contains(a.InnerXML, "<![CDATA[") {
+		content = html.UnescapeString(a.CharData)
+	} else {
+		content = a.CharData
+	}
+
+	content = sanitizer.StripTags(content)
+	return strings.TrimSpace(content)
+}
+
+func (a *Atom10Text) xhtmlContent() string {
+	if a.XHTMLRootElement.XMLName.Local == "div" {
+		return a.XHTMLRootElement.InnerXML
+	}
+	return a.InnerXML
+}
+
+type AtomXHTMLRootElement struct {
 	XMLName  xml.Name `xml:"div"`
 	InnerXML string   `xml:",innerxml"`
 }