mirror of
https://github.com/miniflux/v2.git
synced 2025-06-27 16:36:00 +00:00
Simplify feed parser and format detection
- Avoid doing multiple buffer copies - Move parser and format detection logic to its own package
This commit is contained in:
parent
d5ff4191b6
commit
5870f04260
11 changed files with 229 additions and 221 deletions
10
reader/parser/doc.go
Normal file
10
reader/parser/doc.go
Normal file
|
@ -0,0 +1,10 @@
|
|||
// Copyright 2018 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
/*
|
||||
|
||||
Package parser provides a generic feed parser that abstract all different formats.
|
||||
|
||||
*/
|
||||
package parser // import "miniflux.app/reader/parser"
|
51
reader/parser/format.go
Normal file
51
reader/parser/format.go
Normal file
|
@ -0,0 +1,51 @@
|
|||
// Copyright 2018 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package parser // import "miniflux.app/reader/parser"
|
||||
|
||||
import (
|
||||
"encoding/xml"
|
||||
"strings"
|
||||
|
||||
"miniflux.app/reader/encoding"
|
||||
)
|
||||
|
||||
// List of feed formats.
|
||||
const (
|
||||
FormatRDF = "rdf"
|
||||
FormatRSS = "rss"
|
||||
FormatAtom = "atom"
|
||||
FormatJSON = "json"
|
||||
FormatUnknown = "unknown"
|
||||
)
|
||||
|
||||
// DetectFeedFormat tries to guess the feed format from input data.
|
||||
func DetectFeedFormat(data string) string {
|
||||
if strings.HasPrefix(strings.TrimSpace(data), "{") {
|
||||
return FormatJSON
|
||||
}
|
||||
|
||||
decoder := xml.NewDecoder(strings.NewReader(data))
|
||||
decoder.CharsetReader = encoding.CharsetReader
|
||||
|
||||
for {
|
||||
token, _ := decoder.Token()
|
||||
if token == nil {
|
||||
break
|
||||
}
|
||||
|
||||
if element, ok := token.(xml.StartElement); ok {
|
||||
switch element.Name.Local {
|
||||
case "rss":
|
||||
return FormatRSS
|
||||
case "feed":
|
||||
return FormatAtom
|
||||
case "RDF":
|
||||
return FormatRDF
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return FormatUnknown
|
||||
}
|
70
reader/parser/format_test.go
Normal file
70
reader/parser/format_test.go
Normal file
|
@ -0,0 +1,70 @@
|
|||
// Copyright 2018 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package parser // import "miniflux.app/reader/parser"
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestDetectRDF(t *testing.T) {
|
||||
data := `<?xml version="1.0"?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://my.netscape.com/rdf/simple/0.9/"></rdf:RDF>`
|
||||
format := DetectFeedFormat(data)
|
||||
|
||||
if format != FormatRDF {
|
||||
t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatRDF)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetectRSS(t *testing.T) {
|
||||
data := `<?xml version="1.0"?><rss version="2.0"><channel></channel></rss>`
|
||||
format := DetectFeedFormat(data)
|
||||
|
||||
if format != FormatRSS {
|
||||
t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatRSS)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetectAtom(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="utf-8"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
|
||||
format := DetectFeedFormat(data)
|
||||
|
||||
if format != FormatAtom {
|
||||
t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatAtom)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetectAtomWithISOCharset(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="ISO-8859-15"?><feed xmlns="http://www.w3.org/2005/Atom"></feed>`
|
||||
format := DetectFeedFormat(data)
|
||||
|
||||
if format != FormatAtom {
|
||||
t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatAtom)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetectJSON(t *testing.T) {
|
||||
data := `
|
||||
{
|
||||
"version" : "https://jsonfeed.org/version/1",
|
||||
"title" : "Example"
|
||||
}
|
||||
`
|
||||
format := DetectFeedFormat(data)
|
||||
|
||||
if format != FormatJSON {
|
||||
t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatJSON)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetectUnknown(t *testing.T) {
|
||||
data := `
|
||||
<!DOCTYPE html> <html> </html>
|
||||
`
|
||||
format := DetectFeedFormat(data)
|
||||
|
||||
if format != FormatUnknown {
|
||||
t.Errorf(`Wrong format detected: %q instead of %q`, format, FormatUnknown)
|
||||
}
|
||||
}
|
58
reader/parser/parser.go
Normal file
58
reader/parser/parser.go
Normal file
|
@ -0,0 +1,58 @@
|
|||
// Copyright 2018 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package parser // import "miniflux.app/reader/parser"
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"miniflux.app/errors"
|
||||
"miniflux.app/logger"
|
||||
"miniflux.app/model"
|
||||
"miniflux.app/reader/atom"
|
||||
"miniflux.app/reader/json"
|
||||
"miniflux.app/reader/rdf"
|
||||
"miniflux.app/reader/rss"
|
||||
)
|
||||
|
||||
// ParseFeed analyzes the input data and returns a normalized feed object.
|
||||
func ParseFeed(data string) (*model.Feed, *errors.LocalizedError) {
|
||||
data = stripInvalidXMLCharacters(data)
|
||||
|
||||
switch DetectFeedFormat(data) {
|
||||
case FormatAtom:
|
||||
return atom.Parse(strings.NewReader(data))
|
||||
case FormatRSS:
|
||||
return rss.Parse(strings.NewReader(data))
|
||||
case FormatJSON:
|
||||
return json.Parse(strings.NewReader(data))
|
||||
case FormatRDF:
|
||||
return rdf.Parse(strings.NewReader(data))
|
||||
default:
|
||||
return nil, errors.NewLocalizedError("Unsupported feed format")
|
||||
}
|
||||
}
|
||||
|
||||
func stripInvalidXMLCharacters(input string) string {
|
||||
return strings.Map(func(r rune) rune {
|
||||
if isInCharacterRange(r) {
|
||||
return r
|
||||
}
|
||||
|
||||
logger.Debug("Strip invalid XML characters: %U", r)
|
||||
return -1
|
||||
}, input)
|
||||
}
|
||||
|
||||
// Decide whether the given rune is in the XML Character Range, per
|
||||
// the Char production of http://www.xml.com/axml/testaxml.htm,
|
||||
// Section 2.2 Characters.
|
||||
func isInCharacterRange(r rune) (inrange bool) {
|
||||
return r == 0x09 ||
|
||||
r == 0x0A ||
|
||||
r == 0x0D ||
|
||||
r >= 0x20 && r <= 0xDF77 ||
|
||||
r >= 0xE000 && r <= 0xFFFD ||
|
||||
r >= 0x10000 && r <= 0x10FFFF
|
||||
}
|
152
reader/parser/parser_test.go
Normal file
152
reader/parser/parser_test.go
Normal file
|
@ -0,0 +1,152 @@
|
|||
// Copyright 2017 Frédéric Guillot. All rights reserved.
|
||||
// Use of this source code is governed by the Apache 2.0
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package parser // import "miniflux.app/reader/parser"
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseAtom(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
|
||||
<title>Example Feed</title>
|
||||
<link href="http://example.org/"/>
|
||||
<updated>2003-12-13T18:30:02Z</updated>
|
||||
<author>
|
||||
<name>John Doe</name>
|
||||
</author>
|
||||
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
|
||||
|
||||
<entry>
|
||||
<title>Atom-Powered Robots Run Amok</title>
|
||||
<link href="http://example.org/2003/12/13/atom03"/>
|
||||
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
|
||||
<updated>2003-12-13T18:30:02Z</updated>
|
||||
<summary>Some text.</summary>
|
||||
</entry>
|
||||
|
||||
</feed>`
|
||||
|
||||
feed, err := ParseFeed(data)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if feed.Title != "Example Feed" {
|
||||
t.Errorf("Incorrect title, got: %s", feed.Title)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseRSS(t *testing.T) {
|
||||
data := `<?xml version="1.0"?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<title>Liftoff News</title>
|
||||
<link>http://liftoff.msfc.nasa.gov/</link>
|
||||
<item>
|
||||
<title>Star City</title>
|
||||
<link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
|
||||
<description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>.</description>
|
||||
<pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
|
||||
<guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>`
|
||||
|
||||
feed, err := ParseFeed(data)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if feed.Title != "Liftoff News" {
|
||||
t.Errorf("Incorrect title, got: %s", feed.Title)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseRDF(t *testing.T) {
|
||||
data := `<?xml version="1.0" encoding="utf-8"?>
|
||||
<rdf:RDF
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns="http://purl.org/rss/1.0/"
|
||||
>
|
||||
|
||||
<channel>
|
||||
<title>RDF Example</title>
|
||||
<link>http://example.org/</link>
|
||||
</channel>
|
||||
|
||||
<item>
|
||||
<title>Title</title>
|
||||
<link>http://example.org/item</link>
|
||||
<description>Test</description>
|
||||
</item>
|
||||
</rdf:RDF>`
|
||||
|
||||
feed, err := ParseFeed(data)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if feed.Title != "RDF Example" {
|
||||
t.Errorf("Incorrect title, got: %s", feed.Title)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseJson(t *testing.T) {
|
||||
data := `{
|
||||
"version": "https://jsonfeed.org/version/1",
|
||||
"title": "My Example Feed",
|
||||
"home_page_url": "https://example.org/",
|
||||
"feed_url": "https://example.org/feed.json",
|
||||
"items": [
|
||||
{
|
||||
"id": "2",
|
||||
"content_text": "This is a second item.",
|
||||
"url": "https://example.org/second-item"
|
||||
},
|
||||
{
|
||||
"id": "1",
|
||||
"content_html": "<p>Hello, world!</p>",
|
||||
"url": "https://example.org/initial-post"
|
||||
}
|
||||
]
|
||||
}`
|
||||
|
||||
feed, err := ParseFeed(data)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if feed.Title != "My Example Feed" {
|
||||
t.Errorf("Incorrect title, got: %s", feed.Title)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseUnknownFeed(t *testing.T) {
|
||||
data := `
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>Title of document</title>
|
||||
</head>
|
||||
<body>
|
||||
some content
|
||||
</body>
|
||||
</html>
|
||||
`
|
||||
|
||||
_, err := ParseFeed(data)
|
||||
if err == nil {
|
||||
t.Error("ParseFeed must returns an error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseEmptyFeed(t *testing.T) {
|
||||
_, err := ParseFeed("")
|
||||
if err == nil {
|
||||
t.Error("ParseFeed must returns an error")
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue