mirror of
https://github.com/miniflux/v2.git
synced 2025-07-27 17:28:38 +00:00
First commit
This commit is contained in:
commit
8ffb773f43
2121 changed files with 1118910 additions and 0 deletions
101
vendor/github.com/tdewolff/parse/xml/README.md
generated
vendored
Normal file
101
vendor/github.com/tdewolff/parse/xml/README.md
generated
vendored
Normal file
|
@ -0,0 +1,101 @@
|
|||
# XML [![GoDoc](http://godoc.org/github.com/tdewolff/parse/xml?status.svg)](http://godoc.org/github.com/tdewolff/parse/xml) [![Coverage Status](http://gocover.io/_badge/github.com/tdewolff/parse/xml)](http://gocover.io/github.com/tdewolff/parse/xml)
|
||||
|
||||
This package is an XML lexer written in [Go][1]. It follows the specification at [Extensible Markup Language (XML) 1.0 (Fifth Edition)](http://www.w3.org/TR/REC-xml/). The lexer takes an io.Reader and converts it into tokens until the EOF.
|
||||
|
||||
## Installation
|
||||
Run the following command
|
||||
|
||||
go get github.com/tdewolff/parse/xml
|
||||
|
||||
or add the following import and run project with `go get`
|
||||
|
||||
import "github.com/tdewolff/parse/xml"
|
||||
|
||||
## Lexer
|
||||
### Usage
|
||||
The following initializes a new Lexer with io.Reader `r`:
|
||||
``` go
|
||||
l := xml.NewLexer(r)
|
||||
```
|
||||
|
||||
To tokenize until EOF or an error occurs, use:
|
||||
``` go
|
||||
for {
|
||||
tt, data := l.Next()
|
||||
switch tt {
|
||||
case xml.ErrorToken:
|
||||
// error or EOF set in l.Err()
|
||||
return
|
||||
case xml.StartTagToken:
|
||||
// ...
|
||||
for {
|
||||
ttAttr, dataAttr := l.Next()
|
||||
if ttAttr != xml.AttributeToken {
|
||||
// handle StartTagCloseToken/StartTagCloseVoidToken/StartTagClosePIToken
|
||||
break
|
||||
}
|
||||
// ...
|
||||
}
|
||||
case xml.EndTagToken:
|
||||
// ...
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
All tokens:
|
||||
``` go
|
||||
ErrorToken TokenType = iota // extra token when errors occur
CommentToken
DOCTYPEToken
CDATAToken
StartTagToken
StartTagPIToken
StartTagCloseToken
StartTagCloseVoidToken
StartTagClosePIToken
EndTagToken
AttributeToken
TextToken
|
||||
```
|
||||
|
||||
### Examples
|
||||
``` go
|
||||
package main
|
||||
|
||||
import (
	"fmt"
	"io"
	"os"

	"github.com/tdewolff/parse/xml"
)
|
||||
|
||||
// Tokenize XML from stdin.
|
||||
func main() {
|
||||
l := xml.NewLexer(os.Stdin)
|
||||
for {
|
||||
tt, data := l.Next()
|
||||
switch tt {
|
||||
case xml.ErrorToken:
|
||||
if l.Err() != io.EOF {
|
||||
fmt.Println("Error on line", l.Line(), ":", l.Err())
|
||||
}
|
||||
return
|
||||
case xml.StartTagToken:
|
||||
fmt.Println("Tag", string(data))
|
||||
for {
|
||||
ttAttr, dataAttr := l.Next()
|
||||
if ttAttr != xml.AttributeToken {
|
||||
break
|
||||
}
|
||||
|
||||
key := dataAttr
|
||||
val := l.AttrVal()
|
||||
fmt.Println("Attribute", string(key), "=", string(val))
|
||||
}
|
||||
// ...
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
Released under the [MIT license](https://github.com/tdewolff/parse/blob/master/LICENSE.md).
|
||||
|
||||
[1]: http://golang.org/ "Go Language"
|
345
vendor/github.com/tdewolff/parse/xml/lex.go
generated
vendored
Normal file
345
vendor/github.com/tdewolff/parse/xml/lex.go
generated
vendored
Normal file
|
@ -0,0 +1,345 @@
|
|||
// Package xml is an XML1.0 lexer following the specifications at http://www.w3.org/TR/xml/.
|
||||
package xml // import "github.com/tdewolff/parse/xml"
|
||||
|
||||
import (
|
||||
"io"
|
||||
"strconv"
|
||||
|
||||
"github.com/tdewolff/parse"
|
||||
"github.com/tdewolff/parse/buffer"
|
||||
)
|
||||
|
||||
// TokenType determines the type of token, eg. a number or a semicolon.
|
||||
type TokenType uint32
|
||||
|
||||
// TokenType values.
|
||||
const (
|
||||
ErrorToken TokenType = iota // extra token when errors occur
|
||||
CommentToken
|
||||
DOCTYPEToken
|
||||
CDATAToken
|
||||
StartTagToken
|
||||
StartTagPIToken
|
||||
StartTagCloseToken
|
||||
StartTagCloseVoidToken
|
||||
StartTagClosePIToken
|
||||
EndTagToken
|
||||
AttributeToken
|
||||
TextToken
|
||||
)
|
||||
|
||||
// String returns the string representation of a TokenType.
|
||||
func (tt TokenType) String() string {
|
||||
switch tt {
|
||||
case ErrorToken:
|
||||
return "Error"
|
||||
case CommentToken:
|
||||
return "Comment"
|
||||
case DOCTYPEToken:
|
||||
return "DOCTYPE"
|
||||
case CDATAToken:
|
||||
return "CDATA"
|
||||
case StartTagToken:
|
||||
return "StartTag"
|
||||
case StartTagPIToken:
|
||||
return "StartTagPI"
|
||||
case StartTagCloseToken:
|
||||
return "StartTagClose"
|
||||
case StartTagCloseVoidToken:
|
||||
return "StartTagCloseVoid"
|
||||
case StartTagClosePIToken:
|
||||
return "StartTagClosePI"
|
||||
case EndTagToken:
|
||||
return "EndTag"
|
||||
case AttributeToken:
|
||||
return "Attribute"
|
||||
case TextToken:
|
||||
return "Text"
|
||||
}
|
||||
return "Invalid(" + strconv.Itoa(int(tt)) + ")"
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
// Lexer is the state for the lexer.
|
||||
type Lexer struct {
|
||||
r *buffer.Lexer
|
||||
err error
|
||||
|
||||
inTag bool
|
||||
|
||||
text []byte
|
||||
attrVal []byte
|
||||
}
|
||||
|
||||
// NewLexer returns a new Lexer for a given io.Reader.
|
||||
func NewLexer(r io.Reader) *Lexer {
|
||||
return &Lexer{
|
||||
r: buffer.NewLexer(r),
|
||||
}
|
||||
}
|
||||
|
||||
// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
|
||||
func (l *Lexer) Err() error {
|
||||
err := l.r.Err()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return l.err
|
||||
}
|
||||
|
||||
// Restore restores the NULL byte at the end of the buffer.
|
||||
func (l *Lexer) Restore() {
|
||||
l.r.Restore()
|
||||
}
|
||||
|
||||
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
|
||||
func (l *Lexer) Next() (TokenType, []byte) {
|
||||
l.text = nil
|
||||
var c byte
|
||||
if l.inTag {
|
||||
l.attrVal = nil
|
||||
for { // before attribute name state
|
||||
if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' {
|
||||
l.r.Move(1)
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
if c == 0 {
|
||||
l.err = parse.NewErrorLexer("unexpected null character", l.r)
|
||||
return ErrorToken, nil
|
||||
} else if c != '>' && (c != '/' && c != '?' || l.r.Peek(1) != '>') {
|
||||
return AttributeToken, l.shiftAttribute()
|
||||
}
|
||||
start := l.r.Pos()
|
||||
l.inTag = false
|
||||
if c == '/' {
|
||||
l.r.Move(2)
|
||||
l.text = l.r.Lexeme()[start:]
|
||||
return StartTagCloseVoidToken, l.r.Shift()
|
||||
} else if c == '?' {
|
||||
l.r.Move(2)
|
||||
l.text = l.r.Lexeme()[start:]
|
||||
return StartTagClosePIToken, l.r.Shift()
|
||||
} else {
|
||||
l.r.Move(1)
|
||||
l.text = l.r.Lexeme()[start:]
|
||||
return StartTagCloseToken, l.r.Shift()
|
||||
}
|
||||
}
|
||||
|
||||
for {
|
||||
c = l.r.Peek(0)
|
||||
if c == '<' {
|
||||
if l.r.Pos() > 0 {
|
||||
return TextToken, l.r.Shift()
|
||||
}
|
||||
c = l.r.Peek(1)
|
||||
if c == '/' {
|
||||
l.r.Move(2)
|
||||
return EndTagToken, l.shiftEndTag()
|
||||
} else if c == '!' {
|
||||
l.r.Move(2)
|
||||
if l.at('-', '-') {
|
||||
l.r.Move(2)
|
||||
return CommentToken, l.shiftCommentText()
|
||||
} else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
|
||||
l.r.Move(7)
|
||||
return CDATAToken, l.shiftCDATAText()
|
||||
} else if l.at('D', 'O', 'C', 'T', 'Y', 'P', 'E') {
|
||||
l.r.Move(8)
|
||||
return DOCTYPEToken, l.shiftDOCTYPEText()
|
||||
}
|
||||
l.r.Move(-2)
|
||||
} else if c == '?' {
|
||||
l.r.Move(2)
|
||||
l.inTag = true
|
||||
return StartTagPIToken, l.shiftStartTag()
|
||||
}
|
||||
l.r.Move(1)
|
||||
l.inTag = true
|
||||
return StartTagToken, l.shiftStartTag()
|
||||
} else if c == 0 {
|
||||
if l.r.Pos() > 0 {
|
||||
return TextToken, l.r.Shift()
|
||||
}
|
||||
l.err = parse.NewErrorLexer("unexpected null character", l.r)
|
||||
return ErrorToken, nil
|
||||
}
|
||||
l.r.Move(1)
|
||||
}
|
||||
}
|
||||
|
||||
// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
|
||||
func (l *Lexer) Text() []byte {
|
||||
return l.text
|
||||
}
|
||||
|
||||
// AttrVal returns the attribute value when an AttributeToken was returned from Next.
|
||||
func (l *Lexer) AttrVal() []byte {
|
||||
return l.attrVal
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
// The following functions follow the specifications at http://www.w3.org/html/wg/drafts/html/master/syntax.html
|
||||
|
||||
func (l *Lexer) shiftDOCTYPEText() []byte {
|
||||
inString := false
|
||||
inBrackets := false
|
||||
for {
|
||||
c := l.r.Peek(0)
|
||||
if c == '"' {
|
||||
inString = !inString
|
||||
} else if (c == '[' || c == ']') && !inString {
|
||||
inBrackets = (c == '[')
|
||||
} else if c == '>' && !inString && !inBrackets {
|
||||
l.text = l.r.Lexeme()[9:]
|
||||
l.r.Move(1)
|
||||
return l.r.Shift()
|
||||
} else if c == 0 {
|
||||
l.text = l.r.Lexeme()[9:]
|
||||
return l.r.Shift()
|
||||
}
|
||||
l.r.Move(1)
|
||||
}
|
||||
}
|
||||
|
||||
func (l *Lexer) shiftCDATAText() []byte {
|
||||
for {
|
||||
c := l.r.Peek(0)
|
||||
if c == ']' && l.r.Peek(1) == ']' && l.r.Peek(2) == '>' {
|
||||
l.text = l.r.Lexeme()[9:]
|
||||
l.r.Move(3)
|
||||
return l.r.Shift()
|
||||
} else if c == 0 {
|
||||
l.text = l.r.Lexeme()[9:]
|
||||
return l.r.Shift()
|
||||
}
|
||||
l.r.Move(1)
|
||||
}
|
||||
}
|
||||
|
||||
func (l *Lexer) shiftCommentText() []byte {
|
||||
for {
|
||||
c := l.r.Peek(0)
|
||||
if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
|
||||
l.text = l.r.Lexeme()[4:]
|
||||
l.r.Move(3)
|
||||
return l.r.Shift()
|
||||
} else if c == 0 {
|
||||
return l.r.Shift()
|
||||
}
|
||||
l.r.Move(1)
|
||||
}
|
||||
}
|
||||
|
||||
func (l *Lexer) shiftStartTag() []byte {
|
||||
nameStart := l.r.Pos()
|
||||
for {
|
||||
if c := l.r.Peek(0); c == ' ' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 {
|
||||
break
|
||||
}
|
||||
l.r.Move(1)
|
||||
}
|
||||
l.text = l.r.Lexeme()[nameStart:]
|
||||
return l.r.Shift()
|
||||
}
|
||||
|
||||
func (l *Lexer) shiftAttribute() []byte {
|
||||
nameStart := l.r.Pos()
|
||||
var c byte
|
||||
for { // attribute name state
|
||||
if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 {
|
||||
break
|
||||
}
|
||||
l.r.Move(1)
|
||||
}
|
||||
nameEnd := l.r.Pos()
|
||||
for { // after attribute name state
|
||||
if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' {
|
||||
l.r.Move(1)
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
if c == '=' {
|
||||
l.r.Move(1)
|
||||
for { // before attribute value state
|
||||
if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' {
|
||||
l.r.Move(1)
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
attrPos := l.r.Pos()
|
||||
delim := c
|
||||
if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state
|
||||
l.r.Move(1)
|
||||
for {
|
||||
c = l.r.Peek(0)
|
||||
if c == delim {
|
||||
l.r.Move(1)
|
||||
break
|
||||
} else if c == 0 {
|
||||
break
|
||||
}
|
||||
l.r.Move(1)
|
||||
if c == '\t' || c == '\n' || c == '\r' {
|
||||
l.r.Lexeme()[l.r.Pos()-1] = ' '
|
||||
}
|
||||
}
|
||||
} else { // attribute value unquoted state
|
||||
for {
|
||||
if c = l.r.Peek(0); c == ' ' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 {
|
||||
break
|
||||
}
|
||||
l.r.Move(1)
|
||||
}
|
||||
}
|
||||
l.attrVal = l.r.Lexeme()[attrPos:]
|
||||
} else {
|
||||
l.r.Rewind(nameEnd)
|
||||
l.attrVal = nil
|
||||
}
|
||||
l.text = l.r.Lexeme()[nameStart:nameEnd]
|
||||
return l.r.Shift()
|
||||
}
|
||||
|
||||
func (l *Lexer) shiftEndTag() []byte {
|
||||
for {
|
||||
c := l.r.Peek(0)
|
||||
if c == '>' {
|
||||
l.text = l.r.Lexeme()[2:]
|
||||
l.r.Move(1)
|
||||
break
|
||||
} else if c == 0 {
|
||||
l.text = l.r.Lexeme()[2:]
|
||||
break
|
||||
}
|
||||
l.r.Move(1)
|
||||
}
|
||||
|
||||
end := len(l.text)
|
||||
for end > 0 {
|
||||
if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' {
|
||||
end--
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
l.text = l.text[:end]
|
||||
return l.r.Shift()
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
func (l *Lexer) at(b ...byte) bool {
|
||||
for i, c := range b {
|
||||
if l.r.Peek(i) != c {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
193
vendor/github.com/tdewolff/parse/xml/lex_test.go
generated
vendored
Normal file
193
vendor/github.com/tdewolff/parse/xml/lex_test.go
generated
vendored
Normal file
|
@ -0,0 +1,193 @@
|
|||
package xml // import "github.com/tdewolff/parse/xml"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"testing"
|
||||
|
||||
"github.com/tdewolff/parse"
|
||||
"github.com/tdewolff/test"
|
||||
)
|
||||
|
||||
type TTs []TokenType
|
||||
|
||||
func TestTokens(t *testing.T) {
|
||||
var tokenTests = []struct {
|
||||
xml string
|
||||
expected []TokenType
|
||||
}{
|
||||
{"", TTs{}},
|
||||
{"<!-- comment -->", TTs{CommentToken}},
|
||||
{"<!-- comment \n multi \r line -->", TTs{CommentToken}},
|
||||
{"<foo/>", TTs{StartTagToken, StartTagCloseVoidToken}},
|
||||
{"<foo \t\r\n/>", TTs{StartTagToken, StartTagCloseVoidToken}},
|
||||
{"<foo:bar.qux-norf/>", TTs{StartTagToken, StartTagCloseVoidToken}},
|
||||
{"<foo></foo>", TTs{StartTagToken, StartTagCloseToken, EndTagToken}},
|
||||
{"<foo>text</foo>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
|
||||
{"<foo/> text", TTs{StartTagToken, StartTagCloseVoidToken, TextToken}},
|
||||
{"<a> <b> <c>text</c> </b> </a>", TTs{StartTagToken, StartTagCloseToken, TextToken, StartTagToken, StartTagCloseToken, TextToken, StartTagToken, StartTagCloseToken, TextToken, EndTagToken, TextToken, EndTagToken, TextToken, EndTagToken}},
|
||||
{"<foo a='a' b=\"b\" c=c/>", TTs{StartTagToken, AttributeToken, AttributeToken, AttributeToken, StartTagCloseVoidToken}},
|
||||
{"<foo a=\"\"/>", TTs{StartTagToken, AttributeToken, StartTagCloseVoidToken}},
|
||||
{"<foo a-b=\"\"/>", TTs{StartTagToken, AttributeToken, StartTagCloseVoidToken}},
|
||||
{"<foo \nchecked \r\n value\r=\t'=/>\"' />", TTs{StartTagToken, AttributeToken, AttributeToken, StartTagCloseVoidToken}},
|
||||
{"<?xml?>", TTs{StartTagPIToken, StartTagClosePIToken}},
|
||||
{"<?xml a=\"a\" ?>", TTs{StartTagPIToken, AttributeToken, StartTagClosePIToken}},
|
||||
{"<?xml a=a?>", TTs{StartTagPIToken, AttributeToken, StartTagClosePIToken}},
|
||||
{"<![CDATA[ test ]]>", TTs{CDATAToken}},
|
||||
{"<!DOCTYPE>", TTs{DOCTYPEToken}},
|
||||
{"<!DOCTYPE note SYSTEM \"Note.dtd\">", TTs{DOCTYPEToken}},
|
||||
{`<!DOCTYPE note [<!ENTITY nbsp " "><!ENTITY writer "Writer: Donald Duck."><!ENTITY copyright "Copyright:]> W3Schools.">]>`, TTs{DOCTYPEToken}},
|
||||
{"<!foo>", TTs{StartTagToken, StartTagCloseToken}},
|
||||
|
||||
// early endings
|
||||
{"<!-- comment", TTs{CommentToken}},
|
||||
{"<foo", TTs{StartTagToken}},
|
||||
{"</foo", TTs{EndTagToken}},
|
||||
{"<foo x", TTs{StartTagToken, AttributeToken}},
|
||||
{"<foo x=", TTs{StartTagToken, AttributeToken}},
|
||||
{"<foo x='", TTs{StartTagToken, AttributeToken}},
|
||||
{"<foo x=''", TTs{StartTagToken, AttributeToken}},
|
||||
{"<?xml", TTs{StartTagPIToken}},
|
||||
{"<![CDATA[ test", TTs{CDATAToken}},
|
||||
{"<!DOCTYPE note SYSTEM", TTs{DOCTYPEToken}},
|
||||
|
||||
// go fuzz
|
||||
{"</", TTs{EndTagToken}},
|
||||
{"</\n", TTs{EndTagToken}},
|
||||
}
|
||||
for _, tt := range tokenTests {
|
||||
t.Run(tt.xml, func(t *testing.T) {
|
||||
l := NewLexer(bytes.NewBufferString(tt.xml))
|
||||
i := 0
|
||||
for {
|
||||
token, _ := l.Next()
|
||||
if token == ErrorToken {
|
||||
test.T(t, l.Err(), io.EOF)
|
||||
test.T(t, i, len(tt.expected), "when error occurred we must be at the end")
|
||||
break
|
||||
}
|
||||
test.That(t, i < len(tt.expected), "index", i, "must not exceed expected token types size", len(tt.expected))
|
||||
if i < len(tt.expected) {
|
||||
test.T(t, token, tt.expected[i], "token types must match")
|
||||
}
|
||||
i++
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
test.T(t, TokenType(100).String(), "Invalid(100)")
|
||||
}
|
||||
|
||||
func TestTags(t *testing.T) {
|
||||
var tagTests = []struct {
|
||||
xml string
|
||||
expected string
|
||||
}{
|
||||
{"<foo:bar.qux-norf/>", "foo:bar.qux-norf"},
|
||||
{"<?xml?>", "xml"},
|
||||
{"<foo?bar/qux>", "foo?bar/qux"},
|
||||
{"<!DOCTYPE note SYSTEM \"Note.dtd\">", " note SYSTEM \"Note.dtd\""},
|
||||
|
||||
// early endings
|
||||
{"<foo ", "foo"},
|
||||
}
|
||||
for _, tt := range tagTests {
|
||||
t.Run(tt.xml, func(t *testing.T) {
|
||||
l := NewLexer(bytes.NewBufferString(tt.xml))
|
||||
for {
|
||||
token, _ := l.Next()
|
||||
if token == ErrorToken {
|
||||
test.T(t, l.Err(), io.EOF)
|
||||
test.Fail(t, "when error occurred we must be at the end")
|
||||
break
|
||||
} else if token == StartTagToken || token == StartTagPIToken || token == EndTagToken || token == DOCTYPEToken {
|
||||
test.String(t, string(l.Text()), tt.expected, "tags must match")
|
||||
break
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAttributes(t *testing.T) {
|
||||
var attributeTests = []struct {
|
||||
attr string
|
||||
expected []string
|
||||
}{
|
||||
{"<foo a=\"b\" />", []string{"a", "\"b\""}},
|
||||
{"<foo \nchecked \r\n value\r=\t'=/>\"' />", []string{"checked", "", "value", "'=/>\"'"}},
|
||||
{"<foo bar=\" a \n\t\r b \" />", []string{"bar", "\" a b \""}},
|
||||
{"<?xml a=b?>", []string{"a", "b"}},
|
||||
{"<foo /=? >", []string{"/", "?"}},
|
||||
|
||||
// early endings
|
||||
{"<foo x", []string{"x", ""}},
|
||||
{"<foo x=", []string{"x", ""}},
|
||||
{"<foo x='", []string{"x", "'"}},
|
||||
}
|
||||
for _, tt := range attributeTests {
|
||||
t.Run(tt.attr, func(t *testing.T) {
|
||||
l := NewLexer(bytes.NewBufferString(tt.attr))
|
||||
i := 0
|
||||
for {
|
||||
token, _ := l.Next()
|
||||
if token == ErrorToken {
|
||||
test.T(t, l.Err(), io.EOF)
|
||||
test.T(t, i, len(tt.expected), "when error occurred we must be at the end")
|
||||
break
|
||||
} else if token == AttributeToken {
|
||||
test.That(t, i+1 < len(tt.expected), "index", i+1, "must not exceed expected attributes size", len(tt.expected))
|
||||
if i+1 < len(tt.expected) {
|
||||
test.String(t, string(l.Text()), tt.expected[i], "attribute keys must match")
|
||||
test.String(t, string(l.AttrVal()), tt.expected[i+1], "attribute keys must match")
|
||||
i += 2
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestErrors(t *testing.T) {
|
||||
var errorTests = []struct {
|
||||
xml string
|
||||
col int
|
||||
}{
|
||||
{"a\x00b", 2},
|
||||
}
|
||||
for _, tt := range errorTests {
|
||||
t.Run(tt.xml, func(t *testing.T) {
|
||||
l := NewLexer(bytes.NewBufferString(tt.xml))
|
||||
for {
|
||||
token, _ := l.Next()
|
||||
if token == ErrorToken {
|
||||
if tt.col == 0 {
|
||||
test.T(t, l.Err(), io.EOF)
|
||||
} else if perr, ok := l.Err().(*parse.Error); ok {
|
||||
test.T(t, perr.Col, tt.col)
|
||||
} else {
|
||||
test.Fail(t, "bad error:", l.Err())
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
func ExampleNewLexer() {
|
||||
l := NewLexer(bytes.NewBufferString("<span class='user'>John Doe</span>"))
|
||||
out := ""
|
||||
for {
|
||||
tt, data := l.Next()
|
||||
if tt == ErrorToken {
|
||||
break
|
||||
}
|
||||
out += string(data)
|
||||
}
|
||||
fmt.Println(out)
|
||||
// Output: <span class='user'>John Doe</span>
|
||||
}
|
108
vendor/github.com/tdewolff/parse/xml/util.go
generated
vendored
Normal file
108
vendor/github.com/tdewolff/parse/xml/util.go
generated
vendored
Normal file
|
@ -0,0 +1,108 @@
|
|||
package xml // import "github.com/tdewolff/parse/xml"
|
||||
|
||||
import "github.com/tdewolff/parse"
|
||||
|
||||
var (
|
||||
ltEntityBytes = []byte("<")
|
||||
ampEntityBytes = []byte("&")
|
||||
singleQuoteEntityBytes = []byte("'")
|
||||
doubleQuoteEntityBytes = []byte(""")
|
||||
)
|
||||
|
||||
// EscapeAttrVal returns the escape attribute value bytes without quotes.
|
||||
func EscapeAttrVal(buf *[]byte, b []byte) []byte {
|
||||
singles := 0
|
||||
doubles := 0
|
||||
for i, c := range b {
|
||||
if c == '&' {
|
||||
if quote, n := parse.QuoteEntity(b[i:]); n > 0 {
|
||||
if quote == '"' {
|
||||
doubles++
|
||||
} else {
|
||||
singles++
|
||||
}
|
||||
}
|
||||
} else if c == '"' {
|
||||
doubles++
|
||||
} else if c == '\'' {
|
||||
singles++
|
||||
}
|
||||
}
|
||||
|
||||
n := len(b) + 2
|
||||
var quote byte
|
||||
var escapedQuote []byte
|
||||
if doubles > singles {
|
||||
n += singles * 4
|
||||
quote = '\''
|
||||
escapedQuote = singleQuoteEntityBytes
|
||||
} else {
|
||||
n += doubles * 4
|
||||
quote = '"'
|
||||
escapedQuote = doubleQuoteEntityBytes
|
||||
}
|
||||
if n > cap(*buf) {
|
||||
*buf = make([]byte, 0, n) // maximum size, not actual size
|
||||
}
|
||||
t := (*buf)[:n] // maximum size, not actual size
|
||||
t[0] = quote
|
||||
j := 1
|
||||
start := 0
|
||||
for i, c := range b {
|
||||
if c == '&' {
|
||||
if entityQuote, n := parse.QuoteEntity(b[i:]); n > 0 {
|
||||
j += copy(t[j:], b[start:i])
|
||||
if entityQuote != quote {
|
||||
t[j] = entityQuote
|
||||
j++
|
||||
} else {
|
||||
j += copy(t[j:], escapedQuote)
|
||||
}
|
||||
start = i + n
|
||||
}
|
||||
} else if c == quote {
|
||||
j += copy(t[j:], b[start:i])
|
||||
j += copy(t[j:], escapedQuote)
|
||||
start = i + 1
|
||||
}
|
||||
}
|
||||
j += copy(t[j:], b[start:])
|
||||
t[j] = quote
|
||||
return t[:j+1]
|
||||
}
|
||||
|
||||
// EscapeCDATAVal returns the escaped text bytes.
|
||||
func EscapeCDATAVal(buf *[]byte, b []byte) ([]byte, bool) {
|
||||
n := 0
|
||||
for _, c := range b {
|
||||
if c == '<' || c == '&' {
|
||||
if c == '<' {
|
||||
n += 3 // <
|
||||
} else {
|
||||
n += 4 // &
|
||||
}
|
||||
if n > len("<![CDATA[]]>") {
|
||||
return b, false
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(b)+n > cap(*buf) {
|
||||
*buf = make([]byte, 0, len(b)+n)
|
||||
}
|
||||
t := (*buf)[:len(b)+n]
|
||||
j := 0
|
||||
start := 0
|
||||
for i, c := range b {
|
||||
if c == '<' {
|
||||
j += copy(t[j:], b[start:i])
|
||||
j += copy(t[j:], ltEntityBytes)
|
||||
start = i + 1
|
||||
} else if c == '&' {
|
||||
j += copy(t[j:], b[start:i])
|
||||
j += copy(t[j:], ampEntityBytes)
|
||||
start = i + 1
|
||||
}
|
||||
}
|
||||
j += copy(t[j:], b[start:])
|
||||
return t[:j], true
|
||||
}
|
63
vendor/github.com/tdewolff/parse/xml/util_test.go
generated
vendored
Normal file
63
vendor/github.com/tdewolff/parse/xml/util_test.go
generated
vendored
Normal file
|
@ -0,0 +1,63 @@
|
|||
package xml // import "github.com/tdewolff/parse/xml"
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/tdewolff/test"
|
||||
)
|
||||
|
||||
func TestEscapeAttrVal(t *testing.T) {
|
||||
var attrValTests = []struct {
|
||||
attrVal string
|
||||
expected string
|
||||
}{
|
||||
{"xyz", "\"xyz\""},
|
||||
{"", "\"\""},
|
||||
{"x&z", "\"x&z\""},
|
||||
{"x'z", "\"x'z\""},
|
||||
{"x\"z", "'x\"z'"},
|
||||
{"a'b=\"\"", "'a'b=\"\"'"},
|
||||
{"'x'\"'z'", "\"x'"'z\""},
|
||||
{"\"x"'"z\"", "'x\"'\"z'"},
|
||||
{"a'b=\"\"", "'a'b=\"\"'"},
|
||||
}
|
||||
var buf []byte
|
||||
for _, tt := range attrValTests {
|
||||
t.Run(tt.attrVal, func(t *testing.T) {
|
||||
b := []byte(tt.attrVal)
|
||||
if len(b) > 1 && (b[0] == '"' || b[0] == '\'') && b[0] == b[len(b)-1] {
|
||||
b = b[1 : len(b)-1]
|
||||
}
|
||||
val := EscapeAttrVal(&buf, []byte(b))
|
||||
test.String(t, string(val), tt.expected)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestEscapeCDATAVal(t *testing.T) {
|
||||
var CDATAValTests = []struct {
|
||||
CDATAVal string
|
||||
expected string
|
||||
}{
|
||||
{"<![CDATA[<b>]]>", "<b>"},
|
||||
{"<![CDATA[abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz]]>", "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"},
|
||||
{"<![CDATA[ <b> ]]>", " <b> "},
|
||||
{"<![CDATA[<<<<<]]>", "<![CDATA[<<<<<]]>"},
|
||||
{"<![CDATA[&]]>", "&"},
|
||||
{"<![CDATA[&&&&]]>", "<![CDATA[&&&&]]>"},
|
||||
{"<![CDATA[ a ]]>", " a "},
|
||||
{"<![CDATA[]]>", ""},
|
||||
}
|
||||
var buf []byte
|
||||
for _, tt := range CDATAValTests {
|
||||
t.Run(tt.CDATAVal, func(t *testing.T) {
|
||||
b := []byte(tt.CDATAVal[len("<![CDATA[") : len(tt.CDATAVal)-len("]]>")])
|
||||
data, useText := EscapeCDATAVal(&buf, b)
|
||||
text := string(data)
|
||||
if !useText {
|
||||
text = "<![CDATA[" + text + "]]>"
|
||||
}
|
||||
test.String(t, text, tt.expected)
|
||||
})
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue