1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

Refactor JSON Feed parser to use an adapter

This commit is contained in:
Frédéric Guillot 2024-03-12 22:10:59 -07:00
parent 6bc4b35e38
commit 8429c6b0ab
4 changed files with 487 additions and 253 deletions

View file

@ -3,207 +3,141 @@
package json // import "miniflux.app/v2/internal/reader/json"
import (
"log/slog"
"strings"
"time"
// JSON Feed specs:
// https://www.jsonfeed.org/version/1.1/
// https://www.jsonfeed.org/version/1/
type JSONFeed struct {
// Version is the URL of the version of the format the feed uses.
// This should appear at the very top, though we recognize that not all JSON generators allow for ordering.
Version string `json:"version"`
"miniflux.app/v2/internal/crypto"
"miniflux.app/v2/internal/model"
"miniflux.app/v2/internal/reader/date"
"miniflux.app/v2/internal/reader/sanitizer"
"miniflux.app/v2/internal/urllib"
)
// Title is the name of the feed, which will often correspond to the name of the website.
Title string `json:"title"`
type jsonFeed struct {
Version string `json:"version"`
Title string `json:"title"`
SiteURL string `json:"home_page_url"`
IconURL string `json:"icon"`
FaviconURL string `json:"favicon"`
FeedURL string `json:"feed_url"`
Authors []jsonAuthor `json:"authors"`
Author jsonAuthor `json:"author"`
Items []jsonItem `json:"items"`
// HomePageURL is the URL of the resource that the feed describes.
// This resource may or may not actually be a “home” page, but it should be an HTML page.
HomePageURL string `json:"home_page_url"`
// FeedURL is the URL of the feed, and serves as the unique identifier for the feed.
FeedURL string `json:"feed_url"`
// Description provides more detail, beyond the title, on what the feed is about.
Description string `json:"description"`
// IconURL is the URL of an image for the feed suitable to be used in a timeline, much the way an avatar might be used.
IconURL string `json:"icon"`
// FaviconURL is the URL of an image for the feed suitable to be used in a source list. It should be square and relatively small.
FaviconURL string `json:"favicon"`
// Authors specifies one or more feed authors. The author object has several members.
Authors []JSONAuthor `json:"authors"` // JSON Feed v1.1
// Author specifies the feed author. The author object has several members.
// JSON Feed v1 (deprecated)
Author JSONAuthor `json:"author"`
// Language is the primary language for the feed in the format specified in RFC 5646.
// The value is usually a 2-letter language tag from ISO 639-1, optionally followed by a region tag. (Examples: en or en-US.)
Language string `json:"language"`
// Expired is a boolean value that specifies whether or not the feed is finished.
Expired bool `json:"expired"`
// Items is an array, each representing an individual item in the feed.
Items []JSONItem `json:"items"`
// Hubs describes endpoints that can be used to subscribe to real-time notifications from the publisher of this feed.
Hubs []JSONHub `json:"hubs"`
}
type jsonAuthor struct {
type JSONAuthor struct {
// Author's name.
Name string `json:"name"`
URL string `json:"url"`
// Author's website URL (Blog or micro-blog).
WebsiteURL string `json:"url"`
// Author's avatar URL.
AvatarURL string `json:"avatar"`
}
type jsonItem struct {
ID string `json:"id"`
URL string `json:"url"`
Title string `json:"title"`
Summary string `json:"summary"`
Text string `json:"content_text"`
HTML string `json:"content_html"`
DatePublished string `json:"date_published"`
DateModified string `json:"date_modified"`
Authors []jsonAuthor `json:"authors"`
Author jsonAuthor `json:"author"`
Attachments []jsonAttachment `json:"attachments"`
Tags []string `json:"tags"`
type JSONHub struct {
// Type defines the protocol used to talk with the hub: "rssCloud" or "WebSub".
Type string `json:"type"`
// URL is the location of the hub.
URL string `json:"url"`
}
type jsonAttachment struct {
URL string `json:"url"`
type JSONItem struct {
// Unique identifier for the item.
// Ideally, the id is the full URL of the resource described by the item, since URLs make great unique identifiers.
ID string `json:"id"`
// URL of the resource described by the item.
URL string `json:"url"`
// ExternalURL is the URL of a page elsewhere.
// This is especially useful for linkblogs.
// If url links to where youre talking about a thing, then external_url links to the thing youre talking about.
ExternalURL string `json:"external_url"`
// Title of the item (optional).
// Microblog items in particular may omit titles.
Title string `json:"title"`
// ContentHTML is the HTML body of the item.
ContentHTML string `json:"content_html"`
// ContentText is the text body of the item.
ContentText string `json:"content_text"`
// Summary is a plain text sentence or two describing the item.
Summary string `json:"summary"`
// ImageURL is the URL of the main image for the item.
ImageURL string `json:"image"`
// BannerImageURL is the URL of an image to use as a banner.
BannerImageURL string `json:"banner_image"`
// DatePublished is the date the item was published.
DatePublished string `json:"date_published"`
// DateModified is the date the item was modified.
DateModified string `json:"date_modified"`
// Language is the language of the item.
Language string `json:"language"`
// Authors is an array of JSONAuthor.
Authors []JSONAuthor `json:"authors"`
// Author is a JSONAuthor.
// JSON Feed v1 (deprecated)
Author JSONAuthor `json:"author"`
// Tags is an array of strings.
Tags []string `json:"tags"`
// Attachments is an array of JSONAttachment.
Attachments []JSONAttachment `json:"attachments"`
}
type JSONAttachment struct {
// URL of the attachment.
URL string `json:"url"`
// MIME type of the attachment.
MimeType string `json:"mime_type"`
Title string `json:"title"`
Size int64 `json:"size_in_bytes"`
Duration int `json:"duration_in_seconds"`
}
func (j *jsonFeed) GetAuthor() string {
if len(j.Authors) > 0 {
return (getAuthor(j.Authors[0]))
}
return getAuthor(j.Author)
}
func (j *jsonFeed) Transform(baseURL string) *model.Feed {
var err error
feed := new(model.Feed)
feed.FeedURL, err = urllib.AbsoluteURL(baseURL, j.FeedURL)
if err != nil {
feed.FeedURL = j.FeedURL
}
feed.SiteURL, err = urllib.AbsoluteURL(baseURL, j.SiteURL)
if err != nil {
feed.SiteURL = j.SiteURL
}
feed.IconURL = strings.TrimSpace(j.IconURL)
if feed.IconURL == "" {
feed.IconURL = strings.TrimSpace(j.FaviconURL)
}
feed.Title = strings.TrimSpace(j.Title)
if feed.Title == "" {
feed.Title = feed.SiteURL
}
for _, item := range j.Items {
entry := item.Transform()
entryURL, err := urllib.AbsoluteURL(feed.SiteURL, entry.URL)
if err == nil {
entry.URL = entryURL
}
if entry.Author == "" {
entry.Author = j.GetAuthor()
}
feed.Entries = append(feed.Entries, entry)
}
return feed
}
func (j *jsonItem) GetDate() time.Time {
for _, value := range []string{j.DatePublished, j.DateModified} {
if value != "" {
d, err := date.Parse(value)
if err != nil {
slog.Debug("Unable to parse date from JSON feed",
slog.String("date", value),
slog.String("url", j.URL),
slog.Any("error", err),
)
return time.Now()
}
return d
}
}
return time.Now()
}
func (j *jsonItem) GetAuthor() string {
if len(j.Authors) > 0 {
return getAuthor(j.Authors[0])
}
return getAuthor(j.Author)
}
func (j *jsonItem) GetHash() string {
for _, value := range []string{j.ID, j.URL, j.Text + j.HTML + j.Summary} {
if value != "" {
return crypto.Hash(value)
}
}
return ""
}
func (j *jsonItem) GetTitle() string {
if j.Title != "" {
return j.Title
}
for _, value := range []string{j.Summary, j.Text, j.HTML} {
if value != "" {
return sanitizer.TruncateHTML(value, 100)
}
}
return j.URL
}
func (j *jsonItem) GetContent() string {
for _, value := range []string{j.HTML, j.Text, j.Summary} {
if value != "" {
return value
}
}
return ""
}
func (j *jsonItem) GetEnclosures() model.EnclosureList {
enclosures := make(model.EnclosureList, 0)
for _, attachment := range j.Attachments {
if attachment.URL == "" {
continue
}
enclosures = append(enclosures, &model.Enclosure{
URL: attachment.URL,
MimeType: attachment.MimeType,
Size: attachment.Size,
})
}
return enclosures
}
func (j *jsonItem) Transform() *model.Entry {
entry := model.NewEntry()
entry.URL = j.URL
entry.Date = j.GetDate()
entry.Author = j.GetAuthor()
entry.Hash = j.GetHash()
entry.Content = j.GetContent()
entry.Title = strings.TrimSpace(j.GetTitle())
entry.Enclosures = j.GetEnclosures()
if len(j.Tags) > 0 {
entry.Tags = j.Tags
}
return entry
}
func getAuthor(author jsonAuthor) string {
if author.Name != "" {
return strings.TrimSpace(author.Name)
}
return ""
// Title of the attachment.
Title string `json:"title"`
// Size of the attachment in bytes.
Size int64 `json:"size_in_bytes"`
// Duration of the attachment in seconds.
Duration int `json:"duration_in_seconds"`
}