1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-06-27 16:36:00 +00:00

Simplify feed parser and format detection

- Avoid doing multiple buffer copies
- Move parser and format detection logic to its own package
This commit is contained in:
Frédéric Guillot 2018-10-14 11:46:41 -07:00
parent d5ff4191b6
commit 5870f04260
11 changed files with 229 additions and 221 deletions

10
reader/parser/doc.go Normal file
View file

@ -0,0 +1,10 @@
// Copyright 2018 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
/*
Package parser provides a generic feed parser that abstracts all the different feed formats.
*/
package parser // import "miniflux.app/reader/parser"

51
reader/parser/format.go Normal file
View file

@ -0,0 +1,51 @@
// Copyright 2018 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package parser // import "miniflux.app/reader/parser"
import (
"encoding/xml"
"strings"
"miniflux.app/reader/encoding"
)
// List of feed formats.
//
// Each constant is one of the identifiers that DetectFeedFormat can return.
const (
// FormatRDF is reported when an XML start element named "RDF" is found.
FormatRDF = "rdf"
// FormatRSS is reported when an XML start element named "rss" is found.
FormatRSS = "rss"
// FormatAtom is reported when an XML start element named "feed" is found.
FormatAtom = "atom"
// FormatJSON is reported when the data, once trimmed, starts with "{".
FormatJSON = "json"
// FormatUnknown is reported when none of the other formats matched.
FormatUnknown = "unknown"
)
// DetectFeedFormat inspects a raw feed document and reports which of the
// Format* identifiers it corresponds to.
//
// A document whose first non-whitespace character is "{" is reported as
// FormatJSON. Otherwise the data is tokenized as XML and the first start
// element whose local name is "rss", "feed" or "RDF" decides the result.
// FormatUnknown is returned when the tokenizer stops without having produced
// such an element.
func DetectFeedFormat(data string) string {
	if trimmed := strings.TrimSpace(data); strings.HasPrefix(trimmed, "{") {
		return FormatJSON
	}

	xmlDecoder := xml.NewDecoder(strings.NewReader(data))
	xmlDecoder.CharsetReader = encoding.CharsetReader

	// The error returned by Token is discarded: the scan simply ends as soon
	// as no token is produced.
	for tok, _ := xmlDecoder.Token(); tok != nil; tok, _ = xmlDecoder.Token() {
		start, isStart := tok.(xml.StartElement)
		if !isStart {
			continue
		}

		switch start.Name.Local {
		case "rss":
			return FormatRSS
		case "feed":
			return FormatAtom
		case "RDF":
			return FormatRDF
		}
	}

	return FormatUnknown
}

View file

@ -0,0 +1,70 @@
// Copyright 2018 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package parser // import "miniflux.app/reader/parser"
import (
"testing"
)
// TestDetectRDF checks that a document whose root element is rdf:RDF is
// detected as FormatRDF.
func TestDetectRDF(t *testing.T) {
	const input = `<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://my.netscape.com/rdf/simple/0.9/"></rdf:RDF>`

	if got := DetectFeedFormat(input); got != FormatRDF {
		t.Errorf(`Wrong format detected: %q instead of %q`, got, FormatRDF)
	}
}
// TestDetectRSS checks that a document whose root element is rss is detected
// as FormatRSS.
func TestDetectRSS(t *testing.T) {
	const input = `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`

	if got := DetectFeedFormat(input); got != FormatRSS {
		t.Errorf(`Wrong format detected: %q instead of %q`, got, FormatRSS)
	}
}
// TestDetectAtom checks that a document whose root element is feed is
// detected as FormatAtom.
func TestDetectAtom(t *testing.T) {
	const input = `<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`

	if got := DetectFeedFormat(input); got != FormatAtom {
		t.Errorf(`Wrong format detected: %q instead of %q`, got, FormatAtom)
	}
}
// TestDetectAtomWithISOCharset checks that an Atom document declaring the
// ISO-8859-15 encoding in its XML prolog is still detected as FormatAtom.
func TestDetectAtomWithISOCharset(t *testing.T) {
	const input = `<?xml version="1.0" encoding="ISO-8859-15"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`

	if got := DetectFeedFormat(input); got != FormatAtom {
		t.Errorf(`Wrong format detected: %q instead of %q`, got, FormatAtom)
	}
}
// TestDetectJSON checks that a JSON object preceded by whitespace is detected
// as FormatJSON.
func TestDetectJSON(t *testing.T) {
	const input = `
{
"version" : "https://jsonfeed.org/version/1",
"title" : "Example"
}
`

	if got := DetectFeedFormat(input); got != FormatJSON {
		t.Errorf(`Wrong format detected: %q instead of %q`, got, FormatJSON)
	}
}
// TestDetectUnknown checks that an HTML document is not mistaken for a feed
// and is reported as FormatUnknown.
func TestDetectUnknown(t *testing.T) {
	const input = `
<!DOCTYPE html> <html> </html>
`

	if got := DetectFeedFormat(input); got != FormatUnknown {
		t.Errorf(`Wrong format detected: %q instead of %q`, got, FormatUnknown)
	}
}

58
reader/parser/parser.go Normal file
View file

@ -0,0 +1,58 @@
// Copyright 2018 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package parser // import "miniflux.app/reader/parser"
import (
"strings"
"miniflux.app/errors"
"miniflux.app/logger"
"miniflux.app/model"
"miniflux.app/reader/atom"
"miniflux.app/reader/json"
"miniflux.app/reader/rdf"
"miniflux.app/reader/rss"
)
// ParseFeed detects the format of the given feed document and hands it to the
// matching format-specific parser, returning a normalized feed object.
//
// Characters rejected by stripInvalidXMLCharacters are removed from data
// before both detection and parsing. A localized error is returned when the
// detected format is none of RSS, Atom, RDF or JSON.
func ParseFeed(data string) (*model.Feed, *errors.LocalizedError) {
	cleaned := stripInvalidXMLCharacters(data)
	source := strings.NewReader(cleaned)

	switch format := DetectFeedFormat(cleaned); format {
	case FormatRSS:
		return rss.Parse(source)
	case FormatAtom:
		return atom.Parse(source)
	case FormatRDF:
		return rdf.Parse(source)
	case FormatJSON:
		return json.Parse(source)
	}

	return nil, errors.NewLocalizedError("Unsupported feed format")
}
// stripInvalidXMLCharacters returns input with every rune rejected by
// isInCharacterRange removed. Each removed rune is reported through the debug
// logger.
func stripInvalidXMLCharacters(input string) string {
	keepValid := func(char rune) rune {
		if !isInCharacterRange(char) {
			logger.Debug("Strip invalid XML characters: %U", char)
			// A negative result tells strings.Map to drop the rune.
			return -1
		}
		return char
	}

	return strings.Map(keepValid, input)
}
// isInCharacterRange reports whether r is in the XML Character Range, per
// the Char production of http://www.xml.com/axml/testaxml.htm,
// Section 2.2 Characters:
//
//	Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
//
// The upper bound of the second range is 0xD7FF so that the surrogate block
// (U+D800..U+DFFF) is rejected as the production requires.
func isInCharacterRange(r rune) (inrange bool) {
	return r == 0x09 ||
		r == 0x0A ||
		r == 0x0D ||
		r >= 0x20 && r <= 0xD7FF ||
		r >= 0xE000 && r <= 0xFFFD ||
		r >= 0x10000 && r <= 0x10FFFF
}

View file

@ -0,0 +1,152 @@
// Copyright 2017 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.
package parser // import "miniflux.app/reader/parser"
import (
"testing"
)
// TestParseAtom checks that ParseFeed accepts an Atom document and exposes the
// feed title.
func TestParseAtom(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Feed</title>
<link href="http://example.org/"/>
<updated>2003-12-13T18:30:02Z</updated>
<author>
<name>John Doe</name>
</author>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
<entry>
<title>Atom-Powered Robots Run Amok</title>
<link href="http://example.org/2003/12/13/atom03"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<updated>2003-12-13T18:30:02Z</updated>
<summary>Some text.</summary>
</entry>
</feed>`

	feed, err := ParseFeed(data)
	if err != nil {
		// Stop immediately: feed may be nil when parsing fails, and reading
		// feed.Title below would then panic instead of reporting the error.
		t.Fatal(err)
	}

	if feed.Title != "Example Feed" {
		t.Errorf("Incorrect title, got: %s", feed.Title)
	}
}
// TestParseRSS checks that ParseFeed accepts an RSS 2.0 document and exposes
// the channel title.
func TestParseRSS(t *testing.T) {
	data := `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>Liftoff News</title>
<link>http://liftoff.msfc.nasa.gov/</link>
<item>
<title>Star City</title>
<link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
<description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm"&gt;Star City&lt;/a&gt;.</description>
<pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
<guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
</item>
</channel>
</rss>`

	feed, err := ParseFeed(data)
	if err != nil {
		// Stop immediately: feed may be nil when parsing fails, and reading
		// feed.Title below would then panic instead of reporting the error.
		t.Fatal(err)
	}

	if feed.Title != "Liftoff News" {
		t.Errorf("Incorrect title, got: %s", feed.Title)
	}
}
// TestParseRDF checks that ParseFeed accepts an RDF document and exposes the
// channel title.
func TestParseRDF(t *testing.T) {
	data := `<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/"
>
<channel>
<title>RDF Example</title>
<link>http://example.org/</link>
</channel>
<item>
<title>Title</title>
<link>http://example.org/item</link>
<description>Test</description>
</item>
</rdf:RDF>`

	feed, err := ParseFeed(data)
	if err != nil {
		// Stop immediately: feed may be nil when parsing fails, and reading
		// feed.Title below would then panic instead of reporting the error.
		t.Fatal(err)
	}

	if feed.Title != "RDF Example" {
		t.Errorf("Incorrect title, got: %s", feed.Title)
	}
}
// TestParseJson checks that ParseFeed accepts a JSON Feed document and exposes
// the feed title.
func TestParseJson(t *testing.T) {
	data := `{
"version": "https://jsonfeed.org/version/1",
"title": "My Example Feed",
"home_page_url": "https://example.org/",
"feed_url": "https://example.org/feed.json",
"items": [
{
"id": "2",
"content_text": "This is a second item.",
"url": "https://example.org/second-item"
},
{
"id": "1",
"content_html": "<p>Hello, world!</p>",
"url": "https://example.org/initial-post"
}
]
}`

	feed, err := ParseFeed(data)
	if err != nil {
		// Stop immediately: feed may be nil when parsing fails, and reading
		// feed.Title below would then panic instead of reporting the error.
		t.Fatal(err)
	}

	if feed.Title != "My Example Feed" {
		t.Errorf("Incorrect title, got: %s", feed.Title)
	}
}
// TestParseUnknownFeed checks that ParseFeed rejects an XHTML page, which is
// not one of the supported feed formats.
func TestParseUnknownFeed(t *testing.T) {
	const page = `
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Title of document</title>
</head>
<body>
some content
</body>
</html>
`

	if _, parseErr := ParseFeed(page); parseErr == nil {
		t.Error("ParseFeed must returns an error")
	}
}
// TestParseEmptyFeed checks that ParseFeed reports an error for empty input.
func TestParseEmptyFeed(t *testing.T) {
	if _, parseErr := ParseFeed(""); parseErr == nil {
		t.Error("ParseFeed must returns an error")
	}
}