1
0
Fork 0
mirror of https://github.com/miniflux/v2.git synced 2025-08-26 18:21:01 +00:00

Update dependencies

This commit is contained in:
Frédéric Guillot 2019-09-05 21:39:32 -07:00 committed by Frédéric Guillot
parent 456ebaf423
commit b94160df72
350 changed files with 119448 additions and 25865 deletions

View file

@ -1,556 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/internal/gen"
)
const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
` !"#$%&'()*+,-./0123456789:;<=>?` +
`@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` +
"`abcdefghijklmnopqrstuvwxyz{|}~\u007f"
var encodings = []struct {
name string
mib string
comment string
varName string
replacement byte
mapping string
}{
{
"IBM Code Page 037",
"IBM037",
"",
"CodePage037",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm",
},
{
"IBM Code Page 437",
"PC8CodePage437",
"",
"CodePage437",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
},
{
"IBM Code Page 850",
"PC850Multilingual",
"",
"CodePage850",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
},
{
"IBM Code Page 852",
"PCp852",
"",
"CodePage852",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
},
{
"IBM Code Page 855",
"IBM855",
"",
"CodePage855",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
},
{
"Windows Code Page 858", // PC latin1 with Euro
"IBM00858",
"",
"CodePage858",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
},
{
"IBM Code Page 860",
"IBM860",
"",
"CodePage860",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
},
{
"IBM Code Page 862",
"PC862LatinHebrew",
"",
"CodePage862",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
},
{
"IBM Code Page 863",
"IBM863",
"",
"CodePage863",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
},
{
"IBM Code Page 865",
"IBM865",
"",
"CodePage865",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
},
{
"IBM Code Page 866",
"IBM866",
"",
"CodePage866",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-ibm866.txt",
},
{
"IBM Code Page 1047",
"IBM1047",
"",
"CodePage1047",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm",
},
{
"IBM Code Page 1140",
"IBM01140",
"",
"CodePage1140",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm",
},
{
"ISO 8859-1",
"ISOLatin1",
"",
"ISO8859_1",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
},
{
"ISO 8859-2",
"ISOLatin2",
"",
"ISO8859_2",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
},
{
"ISO 8859-3",
"ISOLatin3",
"",
"ISO8859_3",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
},
{
"ISO 8859-4",
"ISOLatin4",
"",
"ISO8859_4",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
},
{
"ISO 8859-5",
"ISOLatinCyrillic",
"",
"ISO8859_5",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
},
{
"ISO 8859-6",
"ISOLatinArabic",
"",
"ISO8859_6,ISO8859_6E,ISO8859_6I",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
},
{
"ISO 8859-7",
"ISOLatinGreek",
"",
"ISO8859_7",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
},
{
"ISO 8859-8",
"ISOLatinHebrew",
"",
"ISO8859_8,ISO8859_8E,ISO8859_8I",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
},
{
"ISO 8859-9",
"ISOLatin5",
"",
"ISO8859_9",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm",
},
{
"ISO 8859-10",
"ISOLatin6",
"",
"ISO8859_10",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
},
{
"ISO 8859-13",
"ISO885913",
"",
"ISO8859_13",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
},
{
"ISO 8859-14",
"ISO885914",
"",
"ISO8859_14",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
},
{
"ISO 8859-15",
"ISO885915",
"",
"ISO8859_15",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
},
{
"ISO 8859-16",
"ISO885916",
"",
"ISO8859_16",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
},
{
"KOI8-R",
"KOI8R",
"",
"KOI8R",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-koi8-r.txt",
},
{
"KOI8-U",
"KOI8U",
"",
"KOI8U",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-koi8-u.txt",
},
{
"Macintosh",
"Macintosh",
"",
"Macintosh",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-macintosh.txt",
},
{
"Macintosh Cyrillic",
"MacintoshCyrillic",
"",
"MacintoshCyrillic",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
},
{
"Windows 874",
"Windows874",
"",
"Windows874",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-874.txt",
},
{
"Windows 1250",
"Windows1250",
"",
"Windows1250",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1250.txt",
},
{
"Windows 1251",
"Windows1251",
"",
"Windows1251",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1251.txt",
},
{
"Windows 1252",
"Windows1252",
"",
"Windows1252",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1252.txt",
},
{
"Windows 1253",
"Windows1253",
"",
"Windows1253",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1253.txt",
},
{
"Windows 1254",
"Windows1254",
"",
"Windows1254",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1254.txt",
},
{
"Windows 1255",
"Windows1255",
"",
"Windows1255",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1255.txt",
},
{
"Windows 1256",
"Windows1256",
"",
"Windows1256",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1256.txt",
},
{
"Windows 1257",
"Windows1257",
"",
"Windows1257",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1257.txt",
},
{
"Windows 1258",
"Windows1258",
"",
"Windows1258",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1258.txt",
},
{
"X-User-Defined",
"XUserDefined",
"It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
"XUserDefined",
encoding.ASCIISub,
ascii +
"\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
"\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
"\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
"\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
"\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
"\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
"\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
"\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
"\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
"\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
"\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
"\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
"\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
"\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
"\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
"\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
},
}
func getWHATWG(url string) string {
res, err := http.Get(url)
if err != nil {
log.Fatalf("%q: Get: %v", url, err)
}
defer res.Body.Close()
mapping := make([]rune, 128)
for i := range mapping {
mapping[i] = '\ufffd'
}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := 0, 0
if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 128 <= x {
log.Fatalf("code %d is out of range", x)
}
if 0x80 <= y && y < 0xa0 {
// We diverge from the WHATWG spec by mapping control characters
// in the range [0x80, 0xa0) to U+FFFD.
continue
}
mapping[x] = rune(y)
}
return ascii + string(mapping)
}
func getUCM(url string) string {
res, err := http.Get(url)
if err != nil {
log.Fatalf("%q: Get: %v", url, err)
}
defer res.Body.Close()
mapping := make([]rune, 256)
for i := range mapping {
mapping[i] = '\ufffd'
}
charsFound := 0
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
var c byte
var r rune
if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
continue
}
mapping[c] = r
charsFound++
}
if charsFound < 200 {
log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
}
return string(mapping)
}
func main() {
mibs := map[string]bool{}
all := []string{}
w := gen.NewCodeWriter()
defer w.WriteGoFile("tables.go", "charmap")
printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
printf("import (\n")
printf("\t\"golang.org/x/text/encoding\"\n")
printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
printf(")\n\n")
for _, e := range encodings {
varNames := strings.Split(e.varName, ",")
all = append(all, varNames...)
varName := varNames[0]
switch {
case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
e.mapping = getWHATWG(e.mapping)
case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
e.mapping = getUCM(e.mapping)
}
asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
if asciiSuperset {
low = 0x80
}
lvn := 1
if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
lvn = 3
}
lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
printf("// %s is the %s encoding.\n", varName, e.name)
if e.comment != "" {
printf("//\n// %s\n", e.comment)
}
printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n",
varName, lowerVarName, lowerVarName, e.name)
if mibs[e.mib] {
log.Fatalf("MIB type %q declared multiple times.", e.mib)
}
printf("mib: identifier.%s,\n", e.mib)
printf("asciiSuperset: %t,\n", asciiSuperset)
printf("low: 0x%02x,\n", low)
printf("replacement: 0x%02x,\n", e.replacement)
printf("decode: [256]utf8Enc{\n")
i, backMapping := 0, map[rune]byte{}
for _, c := range e.mapping {
if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
backMapping[c] = byte(i)
}
var buf [8]byte
n := utf8.EncodeRune(buf[:], c)
if n > 3 {
panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
}
printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
if i%2 == 1 {
printf("\n")
}
i++
}
printf("},\n")
printf("encode: [256]uint32{\n")
encode := make([]uint32, 0, 256)
for c, i := range backMapping {
encode = append(encode, uint32(i)<<24|uint32(c))
}
sort.Sort(byRune(encode))
for len(encode) < cap(encode) {
encode = append(encode, encode[len(encode)-1])
}
for i, enc := range encode {
printf("0x%08x,", enc)
if i%8 == 7 {
printf("\n")
}
}
printf("},\n}\n")
// Add an estimate of the size of a single Charmap{} struct value, which
// includes two 256 elem arrays of 4 bytes and some extra fields, which
// align to 3 uint64s on 64-bit architectures.
w.Size += 2*4*256 + 3*8
}
// TODO: add proper line breaking.
printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
}
type byRune []uint32
func (b byRune) Len() int { return len(b) }
func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff }
func (b byRune) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

View file

@ -124,7 +124,7 @@ func (e *Encoder) Writer(w io.Writer) io.Writer {
}
// ASCIISub is the ASCII substitute character, as recommended by
// http://unicode.org/reports/tr36/#Text_Comparison
// https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
// Nop is the nop encoding. Its transformed bytes are the same as the source

View file

@ -1,173 +0,0 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bytes"
"encoding/json"
"fmt"
"log"
"strings"
"golang.org/x/text/internal/gen"
)
type group struct {
Encodings []struct {
Labels []string
Name string
}
}
func main() {
gen.Init()
r := gen.Open("https://encoding.spec.whatwg.org", "whatwg", "encodings.json")
var groups []group
if err := json.NewDecoder(r).Decode(&groups); err != nil {
log.Fatalf("Error reading encodings.json: %v", err)
}
w := &bytes.Buffer{}
fmt.Fprintln(w, "type htmlEncoding byte")
fmt.Fprintln(w, "const (")
for i, g := range groups {
for _, e := range g.Encodings {
key := strings.ToLower(e.Name)
name := consts[key]
if name == "" {
log.Fatalf("No const defined for %s.", key)
}
if i == 0 {
fmt.Fprintf(w, "%s htmlEncoding = iota\n", name)
} else {
fmt.Fprintf(w, "%s\n", name)
}
}
}
fmt.Fprintln(w, "numEncodings")
fmt.Fprint(w, ")\n\n")
fmt.Fprintln(w, "var canonical = [numEncodings]string{")
for _, g := range groups {
for _, e := range g.Encodings {
fmt.Fprintf(w, "%q,\n", strings.ToLower(e.Name))
}
}
fmt.Fprint(w, "}\n\n")
fmt.Fprintln(w, "var nameMap = map[string]htmlEncoding{")
for _, g := range groups {
for _, e := range g.Encodings {
for _, l := range e.Labels {
key := strings.ToLower(e.Name)
name := consts[key]
fmt.Fprintf(w, "%q: %s,\n", l, name)
}
}
}
fmt.Fprint(w, "}\n\n")
var tags []string
fmt.Fprintln(w, "var localeMap = []htmlEncoding{")
for _, loc := range locales {
tags = append(tags, loc.tag)
fmt.Fprintf(w, "%s, // %s \n", consts[loc.name], loc.tag)
}
fmt.Fprint(w, "}\n\n")
fmt.Fprintf(w, "const locales = %q\n", strings.Join(tags, " "))
gen.WriteGoFile("tables.go", "htmlindex", w.Bytes())
}
// consts maps canonical encoding name to internal constant.
var consts = map[string]string{
"utf-8": "utf8",
"ibm866": "ibm866",
"iso-8859-2": "iso8859_2",
"iso-8859-3": "iso8859_3",
"iso-8859-4": "iso8859_4",
"iso-8859-5": "iso8859_5",
"iso-8859-6": "iso8859_6",
"iso-8859-7": "iso8859_7",
"iso-8859-8": "iso8859_8",
"iso-8859-8-i": "iso8859_8I",
"iso-8859-10": "iso8859_10",
"iso-8859-13": "iso8859_13",
"iso-8859-14": "iso8859_14",
"iso-8859-15": "iso8859_15",
"iso-8859-16": "iso8859_16",
"koi8-r": "koi8r",
"koi8-u": "koi8u",
"macintosh": "macintosh",
"windows-874": "windows874",
"windows-1250": "windows1250",
"windows-1251": "windows1251",
"windows-1252": "windows1252",
"windows-1253": "windows1253",
"windows-1254": "windows1254",
"windows-1255": "windows1255",
"windows-1256": "windows1256",
"windows-1257": "windows1257",
"windows-1258": "windows1258",
"x-mac-cyrillic": "macintoshCyrillic",
"gbk": "gbk",
"gb18030": "gb18030",
// "hz-gb-2312": "hzgb2312", // Was removed from WhatWG
"big5": "big5",
"euc-jp": "eucjp",
"iso-2022-jp": "iso2022jp",
"shift_jis": "shiftJIS",
"euc-kr": "euckr",
"replacement": "replacement",
"utf-16be": "utf16be",
"utf-16le": "utf16le",
"x-user-defined": "xUserDefined",
}
// locales is taken from
// https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm.
var locales = []struct{ tag, name string }{
// The default value. Explicitly state latin to benefit from the exact
// script option, while still making 1252 the default encoding for languages
// written in Latin script.
{"und_Latn", "windows-1252"},
{"ar", "windows-1256"},
{"ba", "windows-1251"},
{"be", "windows-1251"},
{"bg", "windows-1251"},
{"cs", "windows-1250"},
{"el", "iso-8859-7"},
{"et", "windows-1257"},
{"fa", "windows-1256"},
{"he", "windows-1255"},
{"hr", "windows-1250"},
{"hu", "iso-8859-2"},
{"ja", "shift_jis"},
{"kk", "windows-1251"},
{"ko", "euc-kr"},
{"ku", "windows-1254"},
{"ky", "windows-1251"},
{"lt", "windows-1257"},
{"lv", "windows-1257"},
{"mk", "windows-1251"},
{"pl", "iso-8859-2"},
{"ru", "windows-1251"},
{"sah", "windows-1251"},
{"sk", "windows-1250"},
{"sl", "iso-8859-2"},
{"sr", "windows-1251"},
{"tg", "windows-1251"},
{"th", "windows-874"},
{"tr", "windows-1254"},
{"tt", "windows-1251"},
{"uk", "windows-1251"},
{"vi", "windows-1258"},
{"zh-hans", "gb18030"},
{"zh-hant", "big5"},
}

View file

@ -306,6 +306,7 @@ var nameMap = map[string]htmlEncoding{
"iso-2022-cn": replacement,
"iso-2022-cn-ext": replacement,
"iso-2022-kr": replacement,
"replacement": replacement,
"utf-16be": utf16be,
"utf-16": utf16le,
"utf-16le": utf16le,

View file

@ -1,137 +0,0 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bytes"
"encoding/xml"
"fmt"
"io"
"log"
"strings"
"golang.org/x/text/internal/gen"
)
type registry struct {
XMLName xml.Name `xml:"registry"`
Updated string `xml:"updated"`
Registry []struct {
ID string `xml:"id,attr"`
Record []struct {
Name string `xml:"name"`
Xref []struct {
Type string `xml:"type,attr"`
Data string `xml:"data,attr"`
} `xml:"xref"`
Desc struct {
Data string `xml:",innerxml"`
// Any []struct {
// Data string `xml:",chardata"`
// } `xml:",any"`
// Data string `xml:",chardata"`
} `xml:"description,"`
MIB string `xml:"value"`
Alias []string `xml:"alias"`
MIME string `xml:"preferred_alias"`
} `xml:"record"`
} `xml:"registry"`
}
func main() {
r := gen.OpenIANAFile("assignments/character-sets/character-sets.xml")
reg := &registry{}
if err := xml.NewDecoder(r).Decode(&reg); err != nil && err != io.EOF {
log.Fatalf("Error decoding charset registry: %v", err)
}
if len(reg.Registry) == 0 || reg.Registry[0].ID != "character-sets-1" {
log.Fatalf("Unexpected ID %s", reg.Registry[0].ID)
}
w := &bytes.Buffer{}
fmt.Fprintf(w, "const (\n")
for _, rec := range reg.Registry[0].Record {
constName := ""
for _, a := range rec.Alias {
if strings.HasPrefix(a, "cs") && strings.IndexByte(a, '-') == -1 {
// Some of the constant definitions have comments in them. Strip those.
constName = strings.Title(strings.SplitN(a[2:], "\n", 2)[0])
}
}
if constName == "" {
switch rec.MIB {
case "2085":
constName = "HZGB2312" // Not listed as alias for some reason.
default:
log.Fatalf("No cs alias defined for %s.", rec.MIB)
}
}
if rec.MIME != "" {
rec.MIME = fmt.Sprintf(" (MIME: %s)", rec.MIME)
}
fmt.Fprintf(w, "// %s is the MIB identifier with IANA name %s%s.\n//\n", constName, rec.Name, rec.MIME)
if len(rec.Desc.Data) > 0 {
fmt.Fprint(w, "// ")
d := xml.NewDecoder(strings.NewReader(rec.Desc.Data))
inElem := true
attr := ""
for {
t, err := d.Token()
if err != nil {
if err != io.EOF {
log.Fatal(err)
}
break
}
switch x := t.(type) {
case xml.CharData:
attr = "" // Don't need attribute info.
a := bytes.Split([]byte(x), []byte("\n"))
for i, b := range a {
if b = bytes.TrimSpace(b); len(b) != 0 {
if !inElem && i > 0 {
fmt.Fprint(w, "\n// ")
}
inElem = false
fmt.Fprintf(w, "%s ", string(b))
}
}
case xml.StartElement:
if x.Name.Local == "xref" {
inElem = true
use := false
for _, a := range x.Attr {
if a.Name.Local == "type" {
use = use || a.Value != "person"
}
if a.Name.Local == "data" && use {
attr = a.Value + " "
}
}
}
case xml.EndElement:
inElem = false
fmt.Fprint(w, attr)
}
}
fmt.Fprint(w, "\n")
}
for _, x := range rec.Xref {
switch x.Type {
case "rfc":
fmt.Fprintf(w, "// Reference: %s\n", strings.ToUpper(x.Data))
case "uri":
fmt.Fprintf(w, "// Reference: %s\n", x.Data)
}
}
fmt.Fprintf(w, "%s MIB = %s\n", constName, rec.MIB)
fmt.Fprintln(w)
}
fmt.Fprintln(w, ")")
gen.WriteGoFile("mib.go", "identifier", w.Bytes())
}

View file

@ -34,7 +34,7 @@ package identifier
// - http://www.iana.org/assignments/character-sets/character-sets.xhtml
// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib
// - http://www.ietf.org/rfc/rfc2978.txt
// - http://www.unicode.org/reports/tr22/
// - https://www.unicode.org/reports/tr22/
// - http://www.w3.org/TR/encoding/
// - https://encoding.spec.whatwg.org/
// - https://encoding.spec.whatwg.org/encodings.json

View file

@ -538,8 +538,6 @@ const (
// ISO111ECMACyrillic is the MIB identifier with IANA name ECMA-cyrillic.
//
// ISO registry
// (formerly ECMA
// registry )
ISO111ECMACyrillic MIB = 77
// ISO121Canadian1 is the MIB identifier with IANA name CSA_Z243.4-1985-1.
@ -732,18 +730,18 @@ const (
// ISO885913 is the MIB identifier with IANA name ISO-8859-13.
//
// ISO See http://www.iana.org/assignments/charset-reg/ISO-8859-13 http://www.iana.org/assignments/charset-reg/ISO-8859-13
// ISO See https://www.iana.org/assignments/charset-reg/ISO-8859-13 https://www.iana.org/assignments/charset-reg/ISO-8859-13
ISO885913 MIB = 109
// ISO885914 is the MIB identifier with IANA name ISO-8859-14.
//
// ISO See http://www.iana.org/assignments/charset-reg/ISO-8859-14
// ISO See https://www.iana.org/assignments/charset-reg/ISO-8859-14
ISO885914 MIB = 110
// ISO885915 is the MIB identifier with IANA name ISO-8859-15.
//
// ISO
// Please see: http://www.iana.org/assignments/charset-reg/ISO-8859-15
// Please see: https://www.iana.org/assignments/charset-reg/ISO-8859-15
ISO885915 MIB = 111
// ISO885916 is the MIB identifier with IANA name ISO-8859-16.
@ -754,41 +752,41 @@ const (
// GBK is the MIB identifier with IANA name GBK.
//
// Chinese IT Standardization Technical Committee
// Please see: http://www.iana.org/assignments/charset-reg/GBK
// Please see: https://www.iana.org/assignments/charset-reg/GBK
GBK MIB = 113
// GB18030 is the MIB identifier with IANA name GB18030.
//
// Chinese IT Standardization Technical Committee
// Please see: http://www.iana.org/assignments/charset-reg/GB18030
// Please see: https://www.iana.org/assignments/charset-reg/GB18030
GB18030 MIB = 114
// OSDEBCDICDF0415 is the MIB identifier with IANA name OSD_EBCDIC_DF04_15.
//
// Fujitsu-Siemens standard mainframe EBCDIC encoding
// Please see: http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-15
// Please see: https://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-15
OSDEBCDICDF0415 MIB = 115
// OSDEBCDICDF03IRV is the MIB identifier with IANA name OSD_EBCDIC_DF03_IRV.
//
// Fujitsu-Siemens standard mainframe EBCDIC encoding
// Please see: http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF03-IRV
// Please see: https://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF03-IRV
OSDEBCDICDF03IRV MIB = 116
// OSDEBCDICDF041 is the MIB identifier with IANA name OSD_EBCDIC_DF04_1.
//
// Fujitsu-Siemens standard mainframe EBCDIC encoding
// Please see: http://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-1
// Please see: https://www.iana.org/assignments/charset-reg/OSD-EBCDIC-DF04-1
OSDEBCDICDF041 MIB = 117
// ISO115481 is the MIB identifier with IANA name ISO-11548-1.
//
// See http://www.iana.org/assignments/charset-reg/ISO-11548-1
// See https://www.iana.org/assignments/charset-reg/ISO-11548-1
ISO115481 MIB = 118
// KZ1048 is the MIB identifier with IANA name KZ-1048.
//
// See http://www.iana.org/assignments/charset-reg/KZ-1048
// See https://www.iana.org/assignments/charset-reg/KZ-1048
KZ1048 MIB = 119
// Unicode is the MIB identifier with IANA name ISO-10646-UCS-2.
@ -855,7 +853,7 @@ const (
// SCSU is the MIB identifier with IANA name SCSU.
//
// SCSU See http://www.iana.org/assignments/charset-reg/SCSU
// SCSU See https://www.iana.org/assignments/charset-reg/SCSU
SCSU MIB = 1011
// UTF7 is the MIB identifier with IANA name UTF-7.
@ -884,27 +882,27 @@ const (
// CESU8 is the MIB identifier with IANA name CESU-8.
//
// http://www.unicode.org/unicode/reports/tr26
// https://www.unicode.org/reports/tr26
CESU8 MIB = 1016
// UTF32 is the MIB identifier with IANA name UTF-32.
//
// http://www.unicode.org/unicode/reports/tr19/
// https://www.unicode.org/reports/tr19/
UTF32 MIB = 1017
// UTF32BE is the MIB identifier with IANA name UTF-32BE.
//
// http://www.unicode.org/unicode/reports/tr19/
// https://www.unicode.org/reports/tr19/
UTF32BE MIB = 1018
// UTF32LE is the MIB identifier with IANA name UTF-32LE.
//
// http://www.unicode.org/unicode/reports/tr19/
// https://www.unicode.org/reports/tr19/
UTF32LE MIB = 1019
// BOCU1 is the MIB identifier with IANA name BOCU-1.
//
// http://www.unicode.org/notes/tn6/
// https://www.unicode.org/notes/tn6/
BOCU1 MIB = 1020
// Windows30Latin1 is the MIB identifier with IANA name ISO-8859-1-Windows-3.0-Latin-1.
@ -1461,152 +1459,152 @@ const (
// IBM00858 is the MIB identifier with IANA name IBM00858.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM00858
// IBM See https://www.iana.org/assignments/charset-reg/IBM00858
IBM00858 MIB = 2089
// IBM00924 is the MIB identifier with IANA name IBM00924.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM00924
// IBM See https://www.iana.org/assignments/charset-reg/IBM00924
IBM00924 MIB = 2090
// IBM01140 is the MIB identifier with IANA name IBM01140.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01140
// IBM See https://www.iana.org/assignments/charset-reg/IBM01140
IBM01140 MIB = 2091
// IBM01141 is the MIB identifier with IANA name IBM01141.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01141
// IBM See https://www.iana.org/assignments/charset-reg/IBM01141
IBM01141 MIB = 2092
// IBM01142 is the MIB identifier with IANA name IBM01142.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01142
// IBM See https://www.iana.org/assignments/charset-reg/IBM01142
IBM01142 MIB = 2093
// IBM01143 is the MIB identifier with IANA name IBM01143.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01143
// IBM See https://www.iana.org/assignments/charset-reg/IBM01143
IBM01143 MIB = 2094
// IBM01144 is the MIB identifier with IANA name IBM01144.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01144
// IBM See https://www.iana.org/assignments/charset-reg/IBM01144
IBM01144 MIB = 2095
// IBM01145 is the MIB identifier with IANA name IBM01145.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01145
// IBM See https://www.iana.org/assignments/charset-reg/IBM01145
IBM01145 MIB = 2096
// IBM01146 is the MIB identifier with IANA name IBM01146.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01146
// IBM See https://www.iana.org/assignments/charset-reg/IBM01146
IBM01146 MIB = 2097
// IBM01147 is the MIB identifier with IANA name IBM01147.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01147
// IBM See https://www.iana.org/assignments/charset-reg/IBM01147
IBM01147 MIB = 2098
// IBM01148 is the MIB identifier with IANA name IBM01148.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01148
// IBM See https://www.iana.org/assignments/charset-reg/IBM01148
IBM01148 MIB = 2099
// IBM01149 is the MIB identifier with IANA name IBM01149.
//
// IBM See http://www.iana.org/assignments/charset-reg/IBM01149
// IBM See https://www.iana.org/assignments/charset-reg/IBM01149
IBM01149 MIB = 2100
// Big5HKSCS is the MIB identifier with IANA name Big5-HKSCS.
//
// See http://www.iana.org/assignments/charset-reg/Big5-HKSCS
// See https://www.iana.org/assignments/charset-reg/Big5-HKSCS
Big5HKSCS MIB = 2101
// IBM1047 is the MIB identifier with IANA name IBM1047.
//
// IBM1047 (EBCDIC Latin 1/Open Systems) http://www-1.ibm.com/servers/eserver/iseries/software/globalization/pdf/cp01047z.pdf
// IBM1047 (EBCDIC Latin 1/Open Systems) https://www-1.ibm.com/servers/eserver/iseries/software/globalization/pdf/cp01047z.pdf
IBM1047 MIB = 2102
// PTCP154 is the MIB identifier with IANA name PTCP154.
//
// See http://www.iana.org/assignments/charset-reg/PTCP154
// See https://www.iana.org/assignments/charset-reg/PTCP154
PTCP154 MIB = 2103
// Amiga1251 is the MIB identifier with IANA name Amiga-1251.
//
// See http://www.amiga.ultranet.ru/Amiga-1251.html
// See https://www.amiga.ultranet.ru/Amiga-1251.html
Amiga1251 MIB = 2104
// KOI7switched is the MIB identifier with IANA name KOI7-switched.
//
// See http://www.iana.org/assignments/charset-reg/KOI7-switched
// See https://www.iana.org/assignments/charset-reg/KOI7-switched
KOI7switched MIB = 2105
// BRF is the MIB identifier with IANA name BRF.
//
// See http://www.iana.org/assignments/charset-reg/BRF
// See https://www.iana.org/assignments/charset-reg/BRF
BRF MIB = 2106
// TSCII is the MIB identifier with IANA name TSCII.
//
// See http://www.iana.org/assignments/charset-reg/TSCII
// See https://www.iana.org/assignments/charset-reg/TSCII
TSCII MIB = 2107
// CP51932 is the MIB identifier with IANA name CP51932.
//
// See http://www.iana.org/assignments/charset-reg/CP51932
// See https://www.iana.org/assignments/charset-reg/CP51932
CP51932 MIB = 2108
// Windows874 is the MIB identifier with IANA name windows-874.
//
// See http://www.iana.org/assignments/charset-reg/windows-874
// See https://www.iana.org/assignments/charset-reg/windows-874
Windows874 MIB = 2109
// Windows1250 is the MIB identifier with IANA name windows-1250.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1250
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1250
Windows1250 MIB = 2250
// Windows1251 is the MIB identifier with IANA name windows-1251.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1251
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1251
Windows1251 MIB = 2251
// Windows1252 is the MIB identifier with IANA name windows-1252.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1252
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1252
Windows1252 MIB = 2252
// Windows1253 is the MIB identifier with IANA name windows-1253.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1253
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1253
Windows1253 MIB = 2253
// Windows1254 is the MIB identifier with IANA name windows-1254.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1254
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1254
Windows1254 MIB = 2254
// Windows1255 is the MIB identifier with IANA name windows-1255.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1255
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1255
Windows1255 MIB = 2255
// Windows1256 is the MIB identifier with IANA name windows-1256.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1256
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1256
Windows1256 MIB = 2256
// Windows1257 is the MIB identifier with IANA name windows-1257.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1257
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1257
Windows1257 MIB = 2257
// Windows1258 is the MIB identifier with IANA name windows-1258.
//
// Microsoft http://www.iana.org/assignments/charset-reg/windows-1258
// Microsoft https://www.iana.org/assignments/charset-reg/windows-1258
Windows1258 MIB = 2258
// TIS620 is the MIB identifier with IANA name TIS-620.
@ -1616,6 +1614,6 @@ const (
// CP50220 is the MIB identifier with IANA name CP50220.
//
// See http://www.iana.org/assignments/charset-reg/CP50220
// See https://www.iana.org/assignments/charset-reg/CP50220
CP50220 MIB = 2260
)

View file

@ -1,161 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
// TODO: Emoji extensions?
// http://www.unicode.org/faq/emoji_dingbats.html
// http://www.unicode.org/Public/UNIDATA/EmojiSources.txt
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
type entry struct {
jisCode, table int
}
func main() {
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
fmt.Printf("// Package japanese provides Japanese encodings such as EUC-JP and Shift JIS.\n")
fmt.Printf(`package japanese // import "golang.org/x/text/encoding/japanese"` + "\n\n")
reverse := [65536]entry{}
for i := range reverse {
reverse[i].table = -1
}
tables := []struct {
url string
name string
}{
{"http://encoding.spec.whatwg.org/index-jis0208.txt", "0208"},
{"http://encoding.spec.whatwg.org/index-jis0212.txt", "0212"},
}
for i, table := range tables {
res, err := http.Get(table.url)
if err != nil {
log.Fatalf("%q: Get: %v", table.url, err)
}
defer res.Body.Close()
mapping := [65536]uint16{}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := 0, uint16(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("%q: could not parse %q", table.url, s)
}
if x < 0 || 120*94 <= x {
log.Fatalf("%q: JIS code %d is out of range", table.url, x)
}
mapping[x] = y
if reverse[y].table == -1 {
reverse[y] = entry{jisCode: x, table: i}
}
}
if err := scanner.Err(); err != nil {
log.Fatalf("%q: scanner error: %v", table.url, err)
}
fmt.Printf("// jis%sDecode is the decoding table from JIS %s code to Unicode.\n// It is defined at %s\n",
table.name, table.name, table.url)
fmt.Printf("var jis%sDecode = [...]uint16{\n", table.name)
for i, m := range mapping {
if m != 0 {
fmt.Printf("\t%d: 0x%04X,\n", i, m)
}
}
fmt.Printf("}\n\n")
}
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v.table == -1 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const (\n")
fmt.Printf("\tjis0208 = 1\n")
fmt.Printf("\tjis0212 = 2\n")
fmt.Printf("\tcodeMask = 0x7f\n")
fmt.Printf("\tcodeShift = 7\n")
fmt.Printf("\ttableShift = 14\n")
fmt.Printf(")\n\n")
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to JIS code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("//\n")
fmt.Printf("// The high two bits of the value record whether the JIS code comes from the\n")
fmt.Printf("// JIS0208 table (high bits == 1) or the JIS0212 table (high bits == 2).\n")
fmt.Printf("// The low 14 bits are two 7-bit unsigned integers j1 and j2 that form the\n")
fmt.Printf("// JIS code (94*j1 + j2) within that table.\n")
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x.table == -1 {
continue
}
fmt.Printf("\t%d - %d: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n",
j, v.low, tables[x.table].name, x.jisCode/94, x.jisCode%94)
}
fmt.Printf("}\n\n")
}
}
// interval is a half-open interval [low, high).
type interval struct {
low, high int
}
func (i interval) len() int { return i.high - i.low }
// byDecreasingLength sorts intervals by decreasing length.
type byDecreasingLength []interval
func (b byDecreasingLength) Len() int { return len(b) }
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

View file

@ -1,143 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
func main() {
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
fmt.Printf("// Package korean provides Korean encodings such as EUC-KR.\n")
fmt.Printf(`package korean // import "golang.org/x/text/encoding/korean"` + "\n\n")
res, err := http.Get("http://encoding.spec.whatwg.org/index-euc-kr.txt")
if err != nil {
log.Fatalf("Get: %v", err)
}
defer res.Body.Close()
mapping := [65536]uint16{}
reverse := [65536]uint16{}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := uint16(0), uint16(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 178*(0xc7-0x81)+(0xfe-0xc7)*94+(0xff-0xa1) <= x {
log.Fatalf("EUC-KR code %d is out of range", x)
}
mapping[x] = y
if reverse[y] == 0 {
c0, c1 := uint16(0), uint16(0)
if x < 178*(0xc7-0x81) {
c0 = uint16(x/178) + 0x81
c1 = uint16(x % 178)
switch {
case c1 < 1*26:
c1 += 0x41
case c1 < 2*26:
c1 += 0x47
default:
c1 += 0x4d
}
} else {
x -= 178 * (0xc7 - 0x81)
c0 = uint16(x/94) + 0xc7
c1 = uint16(x%94) + 0xa1
}
reverse[y] = c0<<8 | c1
}
}
if err := scanner.Err(); err != nil {
log.Fatalf("scanner error: %v", err)
}
fmt.Printf("// decode is the decoding table from EUC-KR code to Unicode.\n")
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-euc-kr.txt\n")
fmt.Printf("var decode = [...]uint16{\n")
for i, v := range mapping {
if v != 0 {
fmt.Printf("\t%d: 0x%04X,\n", i, v)
}
}
fmt.Printf("}\n\n")
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v == 0 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to EUC-KR code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x == 0 {
continue
}
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
}
fmt.Printf("}\n\n")
}
}
// interval is a half-open interval [low, high).
type interval struct {
low, high int
}
func (i interval) len() int { return i.high - i.low }
// byDecreasingLength sorts intervals by decreasing length.
type byDecreasingLength []interval
func (b byDecreasingLength) Len() int { return len(b) }
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

View file

@ -1,161 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
func main() {
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
fmt.Printf("// Package simplifiedchinese provides Simplified Chinese encodings such as GBK.\n")
fmt.Printf(`package simplifiedchinese // import "golang.org/x/text/encoding/simplifiedchinese"` + "\n\n")
printGB18030()
printGBK()
}
func printGB18030() {
res, err := http.Get("http://encoding.spec.whatwg.org/index-gb18030.txt")
if err != nil {
log.Fatalf("Get: %v", err)
}
defer res.Body.Close()
fmt.Printf("// gb18030 is the table from http://encoding.spec.whatwg.org/index-gb18030.txt\n")
fmt.Printf("var gb18030 = [...][2]uint16{\n")
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := uint32(0), uint32(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0x10000 && y < 0x10000 {
fmt.Printf("\t{0x%04x, 0x%04x},\n", x, y)
}
}
fmt.Printf("}\n\n")
}
func printGBK() {
res, err := http.Get("http://encoding.spec.whatwg.org/index-gbk.txt")
if err != nil {
log.Fatalf("Get: %v", err)
}
defer res.Body.Close()
mapping := [65536]uint16{}
reverse := [65536]uint16{}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := uint16(0), uint16(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 126*190 <= x {
log.Fatalf("GBK code %d is out of range", x)
}
mapping[x] = y
if reverse[y] == 0 {
c0, c1 := x/190, x%190
if c1 >= 0x3f {
c1++
}
reverse[y] = (0x81+c0)<<8 | (0x40 + c1)
}
}
if err := scanner.Err(); err != nil {
log.Fatalf("scanner error: %v", err)
}
fmt.Printf("// decode is the decoding table from GBK code to Unicode.\n")
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt\n")
fmt.Printf("var decode = [...]uint16{\n")
for i, v := range mapping {
if v != 0 {
fmt.Printf("\t%d: 0x%04X,\n", i, v)
}
}
fmt.Printf("}\n\n")
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v == 0 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to GBK code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x == 0 {
continue
}
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
}
fmt.Printf("}\n\n")
}
}
// interval is a half-open interval [low, high).
type interval struct {
low, high int
}
func (i interval) len() int { return i.high - i.low }
// byDecreasingLength sorts intervals by decreasing length.
type byDecreasingLength []interval
func (b byDecreasingLength) Len() int { return len(b) }
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

View file

@ -1,140 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
func main() {
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
fmt.Printf("// Package traditionalchinese provides Traditional Chinese encodings such as Big5.\n")
fmt.Printf(`package traditionalchinese // import "golang.org/x/text/encoding/traditionalchinese"` + "\n\n")
res, err := http.Get("http://encoding.spec.whatwg.org/index-big5.txt")
if err != nil {
log.Fatalf("Get: %v", err)
}
defer res.Body.Close()
mapping := [65536]uint32{}
reverse := [65536 * 4]uint16{}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := uint16(0), uint32(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 126*157 <= x {
log.Fatalf("Big5 code %d is out of range", x)
}
mapping[x] = y
// The WHATWG spec http://encoding.spec.whatwg.org/#indexes says that
// "The index pointer for code point in index is the first pointer
// corresponding to code point in index", which would normally mean
// that the code below should be guarded by "if reverse[y] == 0", but
// last instead of first seems to match the behavior of
// "iconv -f UTF-8 -t BIG5". For example, U+8005 者 occurs twice in
// http://encoding.spec.whatwg.org/index-big5.txt, as index 2148
// (encoded as "\x8e\xcd") and index 6543 (encoded as "\xaa\xcc")
// and "echo 者 | iconv -f UTF-8 -t BIG5 | xxd" gives "\xaa\xcc".
c0, c1 := x/157, x%157
if c1 < 0x3f {
c1 += 0x40
} else {
c1 += 0x62
}
reverse[y] = (0x81+c0)<<8 | c1
}
if err := scanner.Err(); err != nil {
log.Fatalf("scanner error: %v", err)
}
fmt.Printf("// decode is the decoding table from Big5 code to Unicode.\n")
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-big5.txt\n")
fmt.Printf("var decode = [...]uint32{\n")
for i, v := range mapping {
if v != 0 {
fmt.Printf("\t%d: 0x%08X,\n", i, v)
}
}
fmt.Printf("}\n\n")
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v == 0 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to Big5 code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x == 0 {
continue
}
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
}
fmt.Printf("}\n\n")
}
}
// interval is a half-open interval [low, high).
type interval struct {
low, high int
}
func (i interval) len() int { return i.high - i.low }
// byDecreasingLength sorts intervals by decreasing length.
type byDecreasingLength []interval
func (b byDecreasingLength) Len() int { return len(b) }
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

View file

@ -145,7 +145,7 @@ func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err e
// and consumed in a greater context that implies a certain endianness, use
// IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM.
//
// In the language of http://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM
// In the language of https://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM
// corresponds to "Where the precise type of the data stream is known... the
// BOM should not be used" and ExpectBOM corresponds to "A particular
// protocol... may require use of the BOM".

View file

@ -4,13 +4,13 @@ package language
// This file contains code common to the maketables.go and the package code.
// langAliasType is the type of an alias in langAliasMap.
type langAliasType int8
// AliasType is the type of an alias in AliasMap.
type AliasType int8
const (
langDeprecated langAliasType = iota
langMacro
langLegacy
Deprecated AliasType = iota
Macro
Legacy
langAliasTypeUnknown langAliasType = -1
AliasTypeUnknown AliasType = -1
)

29
vendor/golang.org/x/text/internal/language/compact.go generated vendored Normal file
View file

@ -0,0 +1,29 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
// CompactCoreInfo is a compact integer with the three core tags encoded.
type CompactCoreInfo uint32
// GetCompactCore generates a uint32 value that is guaranteed to be unique for
// different language, region, and script values.
func GetCompactCore(t Tag) (cci CompactCoreInfo, ok bool) {
if t.LangID > langNoIndexOffset {
return 0, false
}
cci |= CompactCoreInfo(t.LangID) << (8 + 12)
cci |= CompactCoreInfo(t.ScriptID) << 12
cci |= CompactCoreInfo(t.RegionID)
return cci, true
}
// Tag generates a tag from c.
func (c CompactCoreInfo) Tag() Tag {
return Tag{
LangID: Language(c >> 20),
RegionID: Region(c & 0x3ff),
ScriptID: Script(c>>12) & 0xff,
}
}

View file

@ -0,0 +1,61 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package compact defines a compact representation of language tags.
//
// Common language tags (at least all for which locale information is defined
// in CLDR) are assigned a unique index. Each Tag is associated with such an
// ID for selecting language-related resources (such as translations) as well
// as one for selecting regional defaults (currency, number formatting, etc.)
//
// It may want to export this functionality at some point, but at this point
// this is only available for use within x/text.
package compact // import "golang.org/x/text/internal/language/compact"
import (
"sort"
"strings"
"golang.org/x/text/internal/language"
)
// ID is an integer identifying a single tag.
type ID uint16
func getCoreIndex(t language.Tag) (id ID, ok bool) {
cci, ok := language.GetCompactCore(t)
if !ok {
return 0, false
}
i := sort.Search(len(coreTags), func(i int) bool {
return cci <= coreTags[i]
})
if i == len(coreTags) || coreTags[i] != cci {
return 0, false
}
return ID(i), true
}
// Parent returns the ID of the parent or the root ID if id is already the root.
func (id ID) Parent() ID {
return parents[id]
}
// Tag converts id to an internal language Tag.
func (id ID) Tag() language.Tag {
if int(id) >= len(coreTags) {
return specialTags[int(id)-len(coreTags)]
}
return coreTags[id].Tag()
}
var specialTags []language.Tag
func init() {
tags := strings.Split(specialTagsStr, " ")
specialTags = make([]language.Tag, len(tags))
for i, t := range tags {
specialTags[i] = language.MustParse(t)
}
}

View file

@ -0,0 +1,260 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go gen_index.go -output tables.go
//go:generate go run gen_parents.go
package compact
// TODO: Remove above NOTE after:
// - verifying that tables are dropped correctly (most notably matcher tables).
import (
"strings"
"golang.org/x/text/internal/language"
)
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
// specific language or locale. All language tag values are guaranteed to be
// well-formed.
type Tag struct {
// NOTE: exported tags will become part of the public API.
language ID
locale ID
full fullTag // always a language.Tag for now.
}
const _und = 0
type fullTag interface {
IsRoot() bool
Parent() language.Tag
}
// Make a compact Tag from a fully specified internal language Tag.
func Make(t language.Tag) (tag Tag) {
if region := t.TypeForKey("rg"); len(region) == 6 && region[2:] == "zzzz" {
if r, err := language.ParseRegion(region[:2]); err == nil {
tFull := t
t, _ = t.SetTypeForKey("rg", "")
// TODO: should we not consider "va" for the language tag?
var exact1, exact2 bool
tag.language, exact1 = FromTag(t)
t.RegionID = r
tag.locale, exact2 = FromTag(t)
if !exact1 || !exact2 {
tag.full = tFull
}
return tag
}
}
lang, ok := FromTag(t)
tag.language = lang
tag.locale = lang
if !ok {
tag.full = t
}
return tag
}
// Tag returns an internal language Tag version of this tag.
func (t Tag) Tag() language.Tag {
if t.full != nil {
return t.full.(language.Tag)
}
tag := t.language.Tag()
if t.language != t.locale {
loc := t.locale.Tag()
tag, _ = tag.SetTypeForKey("rg", strings.ToLower(loc.RegionID.String())+"zzzz")
}
return tag
}
// IsCompact reports whether this tag is fully defined in terms of ID.
func (t *Tag) IsCompact() bool {
return t.full == nil
}
// MayHaveVariants reports whether a tag may have variants. If it returns false
// it is guaranteed the tag does not have variants.
func (t Tag) MayHaveVariants() bool {
return t.full != nil || int(t.language) >= len(coreTags)
}
// MayHaveExtensions reports whether a tag may have extensions. If it returns
// false it is guaranteed the tag does not have them.
func (t Tag) MayHaveExtensions() bool {
return t.full != nil ||
int(t.language) >= len(coreTags) ||
t.language != t.locale
}
// IsRoot returns true if t is equal to language "und".
func (t Tag) IsRoot() bool {
if t.full != nil {
return t.full.IsRoot()
}
return t.language == _und
}
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
// specific language are substituted with fields from the parent language.
// The parent for a language may change for newer versions of CLDR.
func (t Tag) Parent() Tag {
if t.full != nil {
return Make(t.full.Parent())
}
if t.language != t.locale {
// Simulate stripping -u-rg-xxxxxx
return Tag{language: t.language, locale: t.language}
}
// TODO: use parent lookup table once cycle from internal package is
// removed. Probably by internalizing the table and declaring this fast
// enough.
// lang := compactID(internal.Parent(uint16(t.language)))
lang, _ := FromTag(t.language.Tag().Parent())
return Tag{language: lang, locale: lang}
}
// returns token t and the rest of the string.
func nextToken(s string) (t, tail string) {
p := strings.Index(s[1:], "-")
if p == -1 {
return s[1:], ""
}
p++
return s[1:p], s[p:]
}
// LanguageID returns an index, where 0 <= index < NumCompactTags, for tags
// for which data exists in the text repository.The index will change over time
// and should not be stored in persistent storage. If t does not match a compact
// index, exact will be false and the compact index will be returned for the
// first match after repeatedly taking the Parent of t.
func LanguageID(t Tag) (id ID, exact bool) {
return t.language, t.full == nil
}
// RegionalID returns the ID for the regional variant of this tag. This index is
// used to indicate region-specific overrides, such as default currency, default
// calendar and week data, default time cycle, and default measurement system
// and unit preferences.
//
// For instance, the tag en-GB-u-rg-uszzzz specifies British English with US
// settings for currency, number formatting, etc. The CompactIndex for this tag
// will be that for en-GB, while the RegionalID will be the one corresponding to
// en-US.
func RegionalID(t Tag) (id ID, exact bool) {
return t.locale, t.full == nil
}
// LanguageTag returns t stripped of regional variant indicators.
//
// At the moment this means it is stripped of a regional and variant subtag "rg"
// and "va" in the "u" extension.
func (t Tag) LanguageTag() Tag {
if t.full == nil {
return Tag{language: t.language, locale: t.language}
}
tt := t.Tag()
tt.SetTypeForKey("rg", "")
tt.SetTypeForKey("va", "")
return Make(tt)
}
// RegionalTag returns the regional variant of the tag.
//
// At the moment this means that the region is set from the regional subtag
// "rg" in the "u" extension.
func (t Tag) RegionalTag() Tag {
rt := Tag{language: t.locale, locale: t.locale}
if t.full == nil {
return rt
}
b := language.Builder{}
tag := t.Tag()
// tag, _ = tag.SetTypeForKey("rg", "")
b.SetTag(t.locale.Tag())
if v := tag.Variants(); v != "" {
for _, v := range strings.Split(v, "-") {
b.AddVariant(v)
}
}
for _, e := range tag.Extensions() {
b.AddExt(e)
}
return t
}
// FromTag reports closest matching ID for an internal language Tag.
func FromTag(t language.Tag) (id ID, exact bool) {
// TODO: perhaps give more frequent tags a lower index.
// TODO: we could make the indexes stable. This will excluded some
// possibilities for optimization, so don't do this quite yet.
exact = true
b, s, r := t.Raw()
if t.HasString() {
if t.IsPrivateUse() {
// We have no entries for user-defined tags.
return 0, false
}
hasExtra := false
if t.HasVariants() {
if t.HasExtensions() {
build := language.Builder{}
build.SetTag(language.Tag{LangID: b, ScriptID: s, RegionID: r})
build.AddVariant(t.Variants())
exact = false
t = build.Make()
}
hasExtra = true
} else if _, ok := t.Extension('u'); ok {
// TODO: va may mean something else. Consider not considering it.
// Strip all but the 'va' entry.
old := t
variant := t.TypeForKey("va")
t = language.Tag{LangID: b, ScriptID: s, RegionID: r}
if variant != "" {
t, _ = t.SetTypeForKey("va", variant)
hasExtra = true
}
exact = old == t
} else {
exact = false
}
if hasExtra {
// We have some variants.
for i, s := range specialTags {
if s == t {
return ID(i + len(coreTags)), exact
}
}
exact = false
}
}
if x, ok := getCoreIndex(t); ok {
return x, exact
}
exact = false
if r != 0 && s == 0 {
// Deal with cases where an extra script is inserted for the region.
t, _ := t.Maximize()
if x, ok := getCoreIndex(t); ok {
return x, exact
}
}
for t = t.Parent(); t != root; t = t.Parent() {
// No variants specified: just compare core components.
// The key has the form lllssrrr, where l, s, and r are nibbles for
// respectively the langID, scriptID, and regionID.
if x, ok := getCoreIndex(t); ok {
return x, exact
}
}
return 0, exact
}
var root = language.Tag{}

View file

@ -0,0 +1,120 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package compact
// parents maps a compact index of a tag to the compact index of the parent of
// this tag.
var parents = []ID{ // 775 elements
// Entry 0 - 3F
0x0000, 0x0000, 0x0001, 0x0001, 0x0000, 0x0004, 0x0000, 0x0006,
0x0000, 0x0008, 0x0000, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a,
0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a,
0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a,
0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x0000,
0x0000, 0x0028, 0x0000, 0x002a, 0x0000, 0x002c, 0x0000, 0x0000,
0x002f, 0x002e, 0x002e, 0x0000, 0x0033, 0x0000, 0x0035, 0x0000,
0x0037, 0x0000, 0x0039, 0x0000, 0x003b, 0x0000, 0x0000, 0x003e,
// Entry 40 - 7F
0x0000, 0x0040, 0x0040, 0x0000, 0x0043, 0x0043, 0x0000, 0x0046,
0x0000, 0x0048, 0x0000, 0x0000, 0x004b, 0x004a, 0x004a, 0x0000,
0x004f, 0x004f, 0x004f, 0x004f, 0x0000, 0x0054, 0x0054, 0x0000,
0x0057, 0x0000, 0x0059, 0x0000, 0x005b, 0x0000, 0x005d, 0x005d,
0x0000, 0x0060, 0x0000, 0x0062, 0x0000, 0x0064, 0x0000, 0x0066,
0x0066, 0x0000, 0x0069, 0x0000, 0x006b, 0x006b, 0x006b, 0x006b,
0x006b, 0x006b, 0x006b, 0x0000, 0x0073, 0x0000, 0x0075, 0x0000,
0x0077, 0x0000, 0x0000, 0x007a, 0x0000, 0x007c, 0x0000, 0x007e,
// Entry 80 - BF
0x0000, 0x0080, 0x0080, 0x0000, 0x0083, 0x0083, 0x0000, 0x0086,
0x0087, 0x0087, 0x0087, 0x0086, 0x0088, 0x0087, 0x0087, 0x0087,
0x0086, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0088,
0x0087, 0x0087, 0x0087, 0x0087, 0x0088, 0x0087, 0x0088, 0x0087,
0x0087, 0x0088, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
0x0087, 0x0087, 0x0087, 0x0086, 0x0087, 0x0087, 0x0087, 0x0087,
0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0086, 0x0087, 0x0086,
// Entry C0 - FF
0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
0x0088, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
0x0086, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0088, 0x0087,
0x0087, 0x0088, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0086, 0x0086, 0x0087,
0x0087, 0x0086, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0000,
0x00ef, 0x0000, 0x00f1, 0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f2,
0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f1, 0x00f2, 0x00f1, 0x00f1,
// Entry 100 - 13F
0x00f2, 0x00f2, 0x00f1, 0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f1,
0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x0000, 0x010e,
0x0000, 0x0110, 0x0000, 0x0112, 0x0000, 0x0114, 0x0114, 0x0000,
0x0117, 0x0117, 0x0117, 0x0117, 0x0000, 0x011c, 0x0000, 0x011e,
0x0000, 0x0120, 0x0120, 0x0000, 0x0123, 0x0123, 0x0123, 0x0123,
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
// Entry 140 - 17F
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
0x0123, 0x0123, 0x0000, 0x0152, 0x0000, 0x0154, 0x0000, 0x0156,
0x0000, 0x0158, 0x0000, 0x015a, 0x0000, 0x015c, 0x015c, 0x015c,
0x0000, 0x0160, 0x0000, 0x0000, 0x0163, 0x0000, 0x0165, 0x0000,
0x0167, 0x0167, 0x0167, 0x0000, 0x016b, 0x0000, 0x016d, 0x0000,
0x016f, 0x0000, 0x0171, 0x0171, 0x0000, 0x0174, 0x0000, 0x0176,
0x0000, 0x0178, 0x0000, 0x017a, 0x0000, 0x017c, 0x0000, 0x017e,
// Entry 180 - 1BF
0x0000, 0x0000, 0x0000, 0x0182, 0x0000, 0x0184, 0x0184, 0x0184,
0x0184, 0x0000, 0x0000, 0x0000, 0x018b, 0x0000, 0x0000, 0x018e,
0x0000, 0x0000, 0x0191, 0x0000, 0x0000, 0x0000, 0x0195, 0x0000,
0x0197, 0x0000, 0x0000, 0x019a, 0x0000, 0x0000, 0x019d, 0x0000,
0x019f, 0x0000, 0x01a1, 0x0000, 0x01a3, 0x0000, 0x01a5, 0x0000,
0x01a7, 0x0000, 0x01a9, 0x0000, 0x01ab, 0x0000, 0x01ad, 0x0000,
0x01af, 0x0000, 0x01b1, 0x01b1, 0x0000, 0x01b4, 0x0000, 0x01b6,
0x0000, 0x01b8, 0x0000, 0x01ba, 0x0000, 0x01bc, 0x0000, 0x0000,
// Entry 1C0 - 1FF
0x01bf, 0x0000, 0x01c1, 0x0000, 0x01c3, 0x0000, 0x01c5, 0x0000,
0x01c7, 0x0000, 0x01c9, 0x0000, 0x01cb, 0x01cb, 0x01cb, 0x01cb,
0x0000, 0x01d0, 0x0000, 0x01d2, 0x01d2, 0x0000, 0x01d5, 0x0000,
0x01d7, 0x0000, 0x01d9, 0x0000, 0x01db, 0x0000, 0x01dd, 0x0000,
0x01df, 0x01df, 0x0000, 0x01e2, 0x0000, 0x01e4, 0x0000, 0x01e6,
0x0000, 0x01e8, 0x0000, 0x01ea, 0x0000, 0x01ec, 0x0000, 0x01ee,
0x0000, 0x01f0, 0x0000, 0x0000, 0x01f3, 0x0000, 0x01f5, 0x01f5,
0x01f5, 0x0000, 0x01f9, 0x0000, 0x01fb, 0x0000, 0x01fd, 0x0000,
// Entry 200 - 23F
0x01ff, 0x0000, 0x0000, 0x0202, 0x0000, 0x0204, 0x0204, 0x0000,
0x0207, 0x0000, 0x0209, 0x0209, 0x0000, 0x020c, 0x020c, 0x0000,
0x020f, 0x020f, 0x020f, 0x020f, 0x020f, 0x020f, 0x020f, 0x0000,
0x0217, 0x0000, 0x0219, 0x0000, 0x021b, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0221, 0x0000, 0x0000, 0x0224, 0x0000, 0x0226,
0x0226, 0x0000, 0x0229, 0x0000, 0x022b, 0x022b, 0x0000, 0x0000,
0x022f, 0x022e, 0x022e, 0x0000, 0x0000, 0x0234, 0x0000, 0x0236,
0x0000, 0x0238, 0x0000, 0x0244, 0x023a, 0x0244, 0x0244, 0x0244,
// Entry 240 - 27F
0x0244, 0x0244, 0x0244, 0x0244, 0x023a, 0x0244, 0x0244, 0x0000,
0x0247, 0x0247, 0x0247, 0x0000, 0x024b, 0x0000, 0x024d, 0x0000,
0x024f, 0x024f, 0x0000, 0x0252, 0x0000, 0x0254, 0x0254, 0x0254,
0x0254, 0x0254, 0x0254, 0x0000, 0x025b, 0x0000, 0x025d, 0x0000,
0x025f, 0x0000, 0x0261, 0x0000, 0x0263, 0x0000, 0x0265, 0x0000,
0x0000, 0x0268, 0x0268, 0x0268, 0x0000, 0x026c, 0x0000, 0x026e,
0x0000, 0x0270, 0x0000, 0x0000, 0x0000, 0x0274, 0x0273, 0x0273,
0x0000, 0x0278, 0x0000, 0x027a, 0x0000, 0x027c, 0x0000, 0x0000,
// Entry 280 - 2BF
0x0000, 0x0000, 0x0281, 0x0000, 0x0000, 0x0284, 0x0000, 0x0286,
0x0286, 0x0286, 0x0286, 0x0000, 0x028b, 0x028b, 0x028b, 0x0000,
0x028f, 0x028f, 0x028f, 0x028f, 0x028f, 0x0000, 0x0295, 0x0295,
0x0295, 0x0295, 0x0000, 0x0000, 0x0000, 0x0000, 0x029d, 0x029d,
0x029d, 0x0000, 0x02a1, 0x02a1, 0x02a1, 0x02a1, 0x0000, 0x0000,
0x02a7, 0x02a7, 0x02a7, 0x02a7, 0x0000, 0x02ac, 0x0000, 0x02ae,
0x02ae, 0x0000, 0x02b1, 0x0000, 0x02b3, 0x0000, 0x02b5, 0x02b5,
0x0000, 0x0000, 0x02b9, 0x0000, 0x0000, 0x0000, 0x02bd, 0x0000,
// Entry 2C0 - 2FF
0x02bf, 0x02bf, 0x0000, 0x0000, 0x02c3, 0x0000, 0x02c5, 0x0000,
0x02c7, 0x0000, 0x02c9, 0x0000, 0x02cb, 0x0000, 0x02cd, 0x02cd,
0x0000, 0x0000, 0x02d1, 0x0000, 0x02d3, 0x02d0, 0x02d0, 0x0000,
0x0000, 0x02d8, 0x02d7, 0x02d7, 0x0000, 0x0000, 0x02dd, 0x0000,
0x02df, 0x0000, 0x02e1, 0x0000, 0x0000, 0x02e4, 0x0000, 0x02e6,
0x0000, 0x0000, 0x02e9, 0x0000, 0x02eb, 0x0000, 0x02ed, 0x0000,
0x02ef, 0x02ef, 0x0000, 0x0000, 0x02f3, 0x02f2, 0x02f2, 0x0000,
0x02f7, 0x0000, 0x02f9, 0x02f9, 0x02f9, 0x02f9, 0x02f9, 0x0000,
// Entry 300 - 33F
0x02ff, 0x0300, 0x02ff, 0x0000, 0x0303, 0x0051, 0x00e6,
} // Size: 1574 bytes
// Total table size 1574 bytes (1KiB); checksum: 895AAF0B

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,91 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package compact
var (
und = Tag{}
Und Tag = Tag{}
Afrikaans Tag = Tag{language: afIndex, locale: afIndex}
Amharic Tag = Tag{language: amIndex, locale: amIndex}
Arabic Tag = Tag{language: arIndex, locale: arIndex}
ModernStandardArabic Tag = Tag{language: ar001Index, locale: ar001Index}
Azerbaijani Tag = Tag{language: azIndex, locale: azIndex}
Bulgarian Tag = Tag{language: bgIndex, locale: bgIndex}
Bengali Tag = Tag{language: bnIndex, locale: bnIndex}
Catalan Tag = Tag{language: caIndex, locale: caIndex}
Czech Tag = Tag{language: csIndex, locale: csIndex}
Danish Tag = Tag{language: daIndex, locale: daIndex}
German Tag = Tag{language: deIndex, locale: deIndex}
Greek Tag = Tag{language: elIndex, locale: elIndex}
English Tag = Tag{language: enIndex, locale: enIndex}
AmericanEnglish Tag = Tag{language: enUSIndex, locale: enUSIndex}
BritishEnglish Tag = Tag{language: enGBIndex, locale: enGBIndex}
Spanish Tag = Tag{language: esIndex, locale: esIndex}
EuropeanSpanish Tag = Tag{language: esESIndex, locale: esESIndex}
LatinAmericanSpanish Tag = Tag{language: es419Index, locale: es419Index}
Estonian Tag = Tag{language: etIndex, locale: etIndex}
Persian Tag = Tag{language: faIndex, locale: faIndex}
Finnish Tag = Tag{language: fiIndex, locale: fiIndex}
Filipino Tag = Tag{language: filIndex, locale: filIndex}
French Tag = Tag{language: frIndex, locale: frIndex}
CanadianFrench Tag = Tag{language: frCAIndex, locale: frCAIndex}
Gujarati Tag = Tag{language: guIndex, locale: guIndex}
Hebrew Tag = Tag{language: heIndex, locale: heIndex}
Hindi Tag = Tag{language: hiIndex, locale: hiIndex}
Croatian Tag = Tag{language: hrIndex, locale: hrIndex}
Hungarian Tag = Tag{language: huIndex, locale: huIndex}
Armenian Tag = Tag{language: hyIndex, locale: hyIndex}
Indonesian Tag = Tag{language: idIndex, locale: idIndex}
Icelandic Tag = Tag{language: isIndex, locale: isIndex}
Italian Tag = Tag{language: itIndex, locale: itIndex}
Japanese Tag = Tag{language: jaIndex, locale: jaIndex}
Georgian Tag = Tag{language: kaIndex, locale: kaIndex}
Kazakh Tag = Tag{language: kkIndex, locale: kkIndex}
Khmer Tag = Tag{language: kmIndex, locale: kmIndex}
Kannada Tag = Tag{language: knIndex, locale: knIndex}
Korean Tag = Tag{language: koIndex, locale: koIndex}
Kirghiz Tag = Tag{language: kyIndex, locale: kyIndex}
Lao Tag = Tag{language: loIndex, locale: loIndex}
Lithuanian Tag = Tag{language: ltIndex, locale: ltIndex}
Latvian Tag = Tag{language: lvIndex, locale: lvIndex}
Macedonian Tag = Tag{language: mkIndex, locale: mkIndex}
Malayalam Tag = Tag{language: mlIndex, locale: mlIndex}
Mongolian Tag = Tag{language: mnIndex, locale: mnIndex}
Marathi Tag = Tag{language: mrIndex, locale: mrIndex}
Malay Tag = Tag{language: msIndex, locale: msIndex}
Burmese Tag = Tag{language: myIndex, locale: myIndex}
Nepali Tag = Tag{language: neIndex, locale: neIndex}
Dutch Tag = Tag{language: nlIndex, locale: nlIndex}
Norwegian Tag = Tag{language: noIndex, locale: noIndex}
Punjabi Tag = Tag{language: paIndex, locale: paIndex}
Polish Tag = Tag{language: plIndex, locale: plIndex}
Portuguese Tag = Tag{language: ptIndex, locale: ptIndex}
BrazilianPortuguese Tag = Tag{language: ptBRIndex, locale: ptBRIndex}
EuropeanPortuguese Tag = Tag{language: ptPTIndex, locale: ptPTIndex}
Romanian Tag = Tag{language: roIndex, locale: roIndex}
Russian Tag = Tag{language: ruIndex, locale: ruIndex}
Sinhala Tag = Tag{language: siIndex, locale: siIndex}
Slovak Tag = Tag{language: skIndex, locale: skIndex}
Slovenian Tag = Tag{language: slIndex, locale: slIndex}
Albanian Tag = Tag{language: sqIndex, locale: sqIndex}
Serbian Tag = Tag{language: srIndex, locale: srIndex}
SerbianLatin Tag = Tag{language: srLatnIndex, locale: srLatnIndex}
Swedish Tag = Tag{language: svIndex, locale: svIndex}
Swahili Tag = Tag{language: swIndex, locale: swIndex}
Tamil Tag = Tag{language: taIndex, locale: taIndex}
Telugu Tag = Tag{language: teIndex, locale: teIndex}
Thai Tag = Tag{language: thIndex, locale: thIndex}
Turkish Tag = Tag{language: trIndex, locale: trIndex}
Ukrainian Tag = Tag{language: ukIndex, locale: ukIndex}
Urdu Tag = Tag{language: urIndex, locale: urIndex}
Uzbek Tag = Tag{language: uzIndex, locale: uzIndex}
Vietnamese Tag = Tag{language: viIndex, locale: viIndex}
Chinese Tag = Tag{language: zhIndex, locale: zhIndex}
SimplifiedChinese Tag = Tag{language: zhHansIndex, locale: zhHansIndex}
TraditionalChinese Tag = Tag{language: zhHantIndex, locale: zhHantIndex}
Zulu Tag = Tag{language: zuIndex, locale: zuIndex}
)

167
vendor/golang.org/x/text/internal/language/compose.go generated vendored Normal file
View file

@ -0,0 +1,167 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"sort"
"strings"
)
// A Builder allows constructing a Tag from individual components.
// Its main user is Compose in the top-level language package.
type Builder struct {
Tag Tag
private string // the x extension
variants []string
extensions []string
}
// Make returns a new Tag from the current settings.
func (b *Builder) Make() Tag {
t := b.Tag
if len(b.extensions) > 0 || len(b.variants) > 0 {
sort.Sort(sortVariants(b.variants))
sort.Strings(b.extensions)
if b.private != "" {
b.extensions = append(b.extensions, b.private)
}
n := maxCoreSize + tokenLen(b.variants...) + tokenLen(b.extensions...)
buf := make([]byte, n)
p := t.genCoreBytes(buf)
t.pVariant = byte(p)
p += appendTokens(buf[p:], b.variants...)
t.pExt = uint16(p)
p += appendTokens(buf[p:], b.extensions...)
t.str = string(buf[:p])
// We may not always need to remake the string, but when or when not
// to do so is rather tricky.
scan := makeScanner(buf[:p])
t, _ = parse(&scan, "")
return t
} else if b.private != "" {
t.str = b.private
t.RemakeString()
}
return t
}
// SetTag copies all the settings from a given Tag. Any previously set values
// are discarded.
func (b *Builder) SetTag(t Tag) {
b.Tag.LangID = t.LangID
b.Tag.RegionID = t.RegionID
b.Tag.ScriptID = t.ScriptID
// TODO: optimize
b.variants = b.variants[:0]
if variants := t.Variants(); variants != "" {
for _, vr := range strings.Split(variants[1:], "-") {
b.variants = append(b.variants, vr)
}
}
b.extensions, b.private = b.extensions[:0], ""
for _, e := range t.Extensions() {
b.AddExt(e)
}
}
// AddExt adds extension e to the tag. e must be a valid extension as returned
// by Tag.Extension. If the extension already exists, it will be discarded,
// except for a -u extension, where non-existing key-type pairs will added.
func (b *Builder) AddExt(e string) {
if e[0] == 'x' {
if b.private == "" {
b.private = e
}
return
}
for i, s := range b.extensions {
if s[0] == e[0] {
if e[0] == 'u' {
b.extensions[i] += e[1:]
}
return
}
}
b.extensions = append(b.extensions, e)
}
// SetExt sets the extension e to the tag. e must be a valid extension as
// returned by Tag.Extension. If the extension already exists, it will be
// overwritten, except for a -u extension, where the individual key-type pairs
// will be set.
func (b *Builder) SetExt(e string) {
if e[0] == 'x' {
b.private = e
return
}
for i, s := range b.extensions {
if s[0] == e[0] {
if e[0] == 'u' {
b.extensions[i] = e + s[1:]
} else {
b.extensions[i] = e
}
return
}
}
b.extensions = append(b.extensions, e)
}
// AddVariant adds any number of variants.
func (b *Builder) AddVariant(v ...string) {
for _, v := range v {
if v != "" {
b.variants = append(b.variants, v)
}
}
}
// ClearVariants removes any variants previously added, including those
// copied from a Tag in SetTag.
func (b *Builder) ClearVariants() {
b.variants = b.variants[:0]
}
// ClearExtensions removes any extensions previously added, including those
// copied from a Tag in SetTag.
func (b *Builder) ClearExtensions() {
b.private = ""
b.extensions = b.extensions[:0]
}
func tokenLen(token ...string) (n int) {
for _, t := range token {
n += len(t) + 1
}
return
}
func appendTokens(b []byte, token ...string) int {
p := 0
for _, t := range token {
b[p] = '-'
copy(b[p+1:], t)
p += 1 + len(t)
}
return p
}
type sortVariants []string
func (s sortVariants) Len() int {
return len(s)
}
func (s sortVariants) Swap(i, j int) {
s[j], s[i] = s[i], s[j]
}
func (s sortVariants) Less(i, j int) bool {
return variantIndex[s[i]] < variantIndex[s[j]]
}

28
vendor/golang.org/x/text/internal/language/coverage.go generated vendored Normal file
View file

@ -0,0 +1,28 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
// BaseLanguages returns the list of all supported base languages. It generates
// the list by traversing the internal structures.
func BaseLanguages() []Language {
base := make([]Language, 0, NumLanguages)
for i := 0; i < langNoIndexOffset; i++ {
// We included "und" already for the value 0.
if i != nonCanonicalUnd {
base = append(base, Language(i))
}
}
i := langNoIndexOffset
for _, v := range langNoIndex {
for k := 0; k < 8; k++ {
if v&1 == 1 {
base = append(base, Language(i))
}
v >>= 1
i++
}
}
return base
}

596
vendor/golang.org/x/text/internal/language/language.go generated vendored Normal file
View file

@ -0,0 +1,596 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go gen_common.go -output tables.go
package language // import "golang.org/x/text/internal/language"
// TODO: Remove above NOTE after:
// - verifying that tables are dropped correctly (most notably matcher tables).
import (
"errors"
"fmt"
"strings"
)
const (
// maxCoreSize is the maximum size of a BCP 47 tag without variants and
// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
maxCoreSize = 12
// max99thPercentileSize is a somewhat arbitrary buffer size that presumably
// is large enough to hold at least 99% of the BCP 47 tags.
max99thPercentileSize = 32
// maxSimpleUExtensionSize is the maximum size of a -u extension with one
// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
maxSimpleUExtensionSize = 14
)
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
// specific language or locale. All language tag values are guaranteed to be
// well-formed. The zero value of Tag is Und.
type Tag struct {
// TODO: the following fields have the form TagTypeID. This name is chosen
// to allow refactoring the public package without conflicting with its
// Base, Script, and Region methods. Once the transition is fully completed
// the ID can be stripped from the name.
LangID Language
RegionID Region
// TODO: we will soon run out of positions for ScriptID. Idea: instead of
// storing lang, region, and ScriptID codes, store only the compact index and
// have a lookup table from this code to its expansion. This greatly speeds
// up table lookup, speed up common variant cases.
// This will also immediately free up 3 extra bytes. Also, the pVariant
// field can now be moved to the lookup table, as the compact index uniquely
// determines the offset of a possible variant.
ScriptID Script
pVariant byte // offset in str, includes preceding '-'
pExt uint16 // offset of first extension, includes preceding '-'
// str is the string representation of the Tag. It will only be used if the
// tag has variants or extensions.
str string
}
// Make is a convenience wrapper for Parse that omits the error.
// In case of an error, a sensible default is returned.
func Make(s string) Tag {
t, _ := Parse(s)
return t
}
// Raw returns the raw base language, script and region, without making an
// attempt to infer their values.
// TODO: consider removing
func (t Tag) Raw() (b Language, s Script, r Region) {
return t.LangID, t.ScriptID, t.RegionID
}
// equalTags compares language, script and region subtags only.
func (t Tag) equalTags(a Tag) bool {
return t.LangID == a.LangID && t.ScriptID == a.ScriptID && t.RegionID == a.RegionID
}
// IsRoot returns true if t is equal to language "und".
func (t Tag) IsRoot() bool {
if int(t.pVariant) < len(t.str) {
return false
}
return t.equalTags(Und)
}
// IsPrivateUse reports whether the Tag consists solely of an IsPrivateUse use
// tag.
func (t Tag) IsPrivateUse() bool {
return t.str != "" && t.pVariant == 0
}
// RemakeString is used to update t.str in case lang, script or region changed.
// It is assumed that pExt and pVariant still point to the start of the
// respective parts.
func (t *Tag) RemakeString() {
if t.str == "" {
return
}
extra := t.str[t.pVariant:]
if t.pVariant > 0 {
extra = extra[1:]
}
if t.equalTags(Und) && strings.HasPrefix(extra, "x-") {
t.str = extra
t.pVariant = 0
t.pExt = 0
return
}
var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
b := buf[:t.genCoreBytes(buf[:])]
if extra != "" {
diff := len(b) - int(t.pVariant)
b = append(b, '-')
b = append(b, extra...)
t.pVariant = uint8(int(t.pVariant) + diff)
t.pExt = uint16(int(t.pExt) + diff)
} else {
t.pVariant = uint8(len(b))
t.pExt = uint16(len(b))
}
t.str = string(b)
}
// genCoreBytes writes a string for the base languages, script and region tags
// to the given buffer and returns the number of bytes written. It will never
// write more than maxCoreSize bytes.
func (t *Tag) genCoreBytes(buf []byte) int {
n := t.LangID.StringToBuf(buf[:])
if t.ScriptID != 0 {
n += copy(buf[n:], "-")
n += copy(buf[n:], t.ScriptID.String())
}
if t.RegionID != 0 {
n += copy(buf[n:], "-")
n += copy(buf[n:], t.RegionID.String())
}
return n
}
// String returns the canonical string representation of the language tag.
func (t Tag) String() string {
if t.str != "" {
return t.str
}
if t.ScriptID == 0 && t.RegionID == 0 {
return t.LangID.String()
}
buf := [maxCoreSize]byte{}
return string(buf[:t.genCoreBytes(buf[:])])
}
// MarshalText implements encoding.TextMarshaler.
func (t Tag) MarshalText() (text []byte, err error) {
if t.str != "" {
text = append(text, t.str...)
} else if t.ScriptID == 0 && t.RegionID == 0 {
text = append(text, t.LangID.String()...)
} else {
buf := [maxCoreSize]byte{}
text = buf[:t.genCoreBytes(buf[:])]
}
return text, nil
}
// UnmarshalText implements encoding.TextUnmarshaler.
func (t *Tag) UnmarshalText(text []byte) error {
tag, err := Parse(string(text))
*t = tag
return err
}
// Variants returns the part of the tag holding all variants or the empty string
// if there are no variants defined.
func (t Tag) Variants() string {
if t.pVariant == 0 {
return ""
}
return t.str[t.pVariant:t.pExt]
}
// VariantOrPrivateUseTags returns variants or private use tags.
func (t Tag) VariantOrPrivateUseTags() string {
if t.pExt > 0 {
return t.str[t.pVariant:t.pExt]
}
return t.str[t.pVariant:]
}
// HasString reports whether this tag defines more than just the raw
// components.
func (t Tag) HasString() bool {
return t.str != ""
}
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
// specific language are substituted with fields from the parent language.
// The parent for a language may change for newer versions of CLDR.
func (t Tag) Parent() Tag {
if t.str != "" {
// Strip the variants and extensions.
b, s, r := t.Raw()
t = Tag{LangID: b, ScriptID: s, RegionID: r}
if t.RegionID == 0 && t.ScriptID != 0 && t.LangID != 0 {
base, _ := addTags(Tag{LangID: t.LangID})
if base.ScriptID == t.ScriptID {
return Tag{LangID: t.LangID}
}
}
return t
}
if t.LangID != 0 {
if t.RegionID != 0 {
maxScript := t.ScriptID
if maxScript == 0 {
max, _ := addTags(t)
maxScript = max.ScriptID
}
for i := range parents {
if Language(parents[i].lang) == t.LangID && Script(parents[i].maxScript) == maxScript {
for _, r := range parents[i].fromRegion {
if Region(r) == t.RegionID {
return Tag{
LangID: t.LangID,
ScriptID: Script(parents[i].script),
RegionID: Region(parents[i].toRegion),
}
}
}
}
}
// Strip the script if it is the default one.
base, _ := addTags(Tag{LangID: t.LangID})
if base.ScriptID != maxScript {
return Tag{LangID: t.LangID, ScriptID: maxScript}
}
return Tag{LangID: t.LangID}
} else if t.ScriptID != 0 {
// The parent for an base-script pair with a non-default script is
// "und" instead of the base language.
base, _ := addTags(Tag{LangID: t.LangID})
if base.ScriptID != t.ScriptID {
return Und
}
return Tag{LangID: t.LangID}
}
}
return Und
}
// ParseExtension parses s as an extension and returns it on success.
func ParseExtension(s string) (ext string, err error) {
scan := makeScannerString(s)
var end int
if n := len(scan.token); n != 1 {
return "", ErrSyntax
}
scan.toLower(0, len(scan.b))
end = parseExtension(&scan)
if end != len(s) {
return "", ErrSyntax
}
return string(scan.b), nil
}
// HasVariants reports whether t has variants.
func (t Tag) HasVariants() bool {
return uint16(t.pVariant) < t.pExt
}
// HasExtensions reports whether t has extensions.
func (t Tag) HasExtensions() bool {
return int(t.pExt) < len(t.str)
}
// Extension returns the extension of type x for tag t. It will return
// false for ok if t does not have the requested extension. The returned
// extension will be invalid in this case.
func (t Tag) Extension(x byte) (ext string, ok bool) {
for i := int(t.pExt); i < len(t.str)-1; {
var ext string
i, ext = getExtension(t.str, i)
if ext[0] == x {
return ext, true
}
}
return "", false
}
// Extensions returns all extensions of t.
func (t Tag) Extensions() []string {
e := []string{}
for i := int(t.pExt); i < len(t.str)-1; {
var ext string
i, ext = getExtension(t.str, i)
e = append(e, ext)
}
return e
}
// TypeForKey returns the type associated with the given key, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// TypeForKey will traverse the inheritance chain to get the correct value.
func (t Tag) TypeForKey(key string) string {
if start, end, _ := t.findTypeForKey(key); end != start {
return t.str[start:end]
}
return ""
}
var (
errPrivateUse = errors.New("cannot set a key on a private use tag")
errInvalidArguments = errors.New("invalid key or type")
)
// SetTypeForKey returns a new Tag with the key set to type, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// An empty value removes an existing pair with the same key.
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
if t.IsPrivateUse() {
return t, errPrivateUse
}
if len(key) != 2 {
return t, errInvalidArguments
}
// Remove the setting if value is "".
if value == "" {
start, end, _ := t.findTypeForKey(key)
if start != end {
// Remove key tag and leading '-'.
start -= 4
// Remove a possible empty extension.
if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
start -= 2
}
if start == int(t.pVariant) && end == len(t.str) {
t.str = ""
t.pVariant, t.pExt = 0, 0
} else {
t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
}
}
return t, nil
}
if len(value) < 3 || len(value) > 8 {
return t, errInvalidArguments
}
var (
buf [maxCoreSize + maxSimpleUExtensionSize]byte
uStart int // start of the -u extension.
)
// Generate the tag string if needed.
if t.str == "" {
uStart = t.genCoreBytes(buf[:])
buf[uStart] = '-'
uStart++
}
// Create new key-type pair and parse it to verify.
b := buf[uStart:]
copy(b, "u-")
copy(b[2:], key)
b[4] = '-'
b = b[:5+copy(b[5:], value)]
scan := makeScanner(b)
if parseExtensions(&scan); scan.err != nil {
return t, scan.err
}
// Assemble the replacement string.
if t.str == "" {
t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
t.str = string(buf[:uStart+len(b)])
} else {
s := t.str
start, end, hasExt := t.findTypeForKey(key)
if start == end {
if hasExt {
b = b[2:]
}
t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
} else {
t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
}
}
return t, nil
}
// findKeyAndType returns the start and end position for the type corresponding
// to key or the point at which to insert the key-value pair if the type
// wasn't found. The hasExt return value reports whether an -u extension was present.
// Note: the extensions are typically very small and are likely to contain
// only one key-type pair.
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
p := int(t.pExt)
if len(key) != 2 || p == len(t.str) || p == 0 {
return p, p, false
}
s := t.str
// Find the correct extension.
for p++; s[p] != 'u'; p++ {
if s[p] > 'u' {
p--
return p, p, false
}
if p = nextExtension(s, p); p == len(s) {
return len(s), len(s), false
}
}
// Proceed to the hyphen following the extension name.
p++
// curKey is the key currently being processed.
curKey := ""
// Iterate over keys until we get the end of a section.
for {
// p points to the hyphen preceding the current token.
if p3 := p + 3; s[p3] == '-' {
// Found a key.
// Check whether we just processed the key that was requested.
if curKey == key {
return start, p, true
}
// Set to the next key and continue scanning type tokens.
curKey = s[p+1 : p3]
if curKey > key {
return p, p, true
}
// Start of the type token sequence.
start = p + 4
// A type is at least 3 characters long.
p += 7 // 4 + 3
} else {
// Attribute or type, which is at least 3 characters long.
p += 4
}
// p points past the third character of a type or attribute.
max := p + 5 // maximum length of token plus hyphen.
if len(s) < max {
max = len(s)
}
for ; p < max && s[p] != '-'; p++ {
}
// Bail if we have exhausted all tokens or if the next token starts
// a new extension.
if p == len(s) || s[p+2] == '-' {
if curKey == key {
return start, p, true
}
return p, p, true
}
}
}
// ParseBase parses a 2- or 3-letter ISO 639 code.
// It returns a ValueError if s is a well-formed but unknown language identifier
// or another error if another error occurred.
func ParseBase(s string) (Language, error) {
if n := len(s); n < 2 || 3 < n {
return 0, ErrSyntax
}
var buf [3]byte
return getLangID(buf[:copy(buf[:], s)])
}
// ParseScript parses a 4-letter ISO 15924 code.
// It returns a ValueError if s is a well-formed but unknown script identifier
// or another error if another error occurred.
func ParseScript(s string) (Script, error) {
if len(s) != 4 {
return 0, ErrSyntax
}
var buf [4]byte
return getScriptID(script, buf[:copy(buf[:], s)])
}
// EncodeM49 returns the Region for the given UN M.49 code.
// It returns an error if r is not a valid code.
func EncodeM49(r int) (Region, error) {
return getRegionM49(r)
}
// ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
// It returns a ValueError if s is a well-formed but unknown region identifier
// or another error if another error occurred.
func ParseRegion(s string) (Region, error) {
if n := len(s); n < 2 || 3 < n {
return 0, ErrSyntax
}
var buf [3]byte
return getRegionID(buf[:copy(buf[:], s)])
}
// IsCountry returns whether this region is a country or autonomous area. This
// includes non-standard definitions from CLDR.
func (r Region) IsCountry() bool {
if r == 0 || r.IsGroup() || r.IsPrivateUse() && r != _XK {
return false
}
return true
}
// IsGroup returns whether this region defines a collection of regions. This
// includes non-standard definitions from CLDR.
func (r Region) IsGroup() bool {
if r == 0 {
return false
}
return int(regionInclusion[r]) < len(regionContainment)
}
// Contains returns whether Region c is contained by Region r. It returns true
// if c == r.
func (r Region) Contains(c Region) bool {
if r == c {
return true
}
g := regionInclusion[r]
if g >= nRegionGroups {
return false
}
m := regionContainment[g]
d := regionInclusion[c]
b := regionInclusionBits[d]
// A contained country may belong to multiple disjoint groups. Matching any
// of these indicates containment. If the contained region is a group, it
// must strictly be a subset.
if d >= nRegionGroups {
return b&m != 0
}
return b&^m == 0
}
var errNoTLD = errors.New("language: region is not a valid ccTLD")
// TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
// In all other cases it returns either the region itself or an error.
//
// This method may return an error for a region for which there exists a
// canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
// region will already be canonicalized it was obtained from a Tag that was
// obtained using any of the default methods.
func (r Region) TLD() (Region, error) {
// See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
// difference between ISO 3166-1 and IANA ccTLD.
if r == _GB {
r = _UK
}
if (r.typ() & ccTLD) == 0 {
return 0, errNoTLD
}
return r, nil
}
// Canonicalize returns the region or a possible replacement if the region is
// deprecated. It will not return a replacement for deprecated regions that
// are split into multiple regions.
func (r Region) Canonicalize() Region {
if cr := normRegion(r); cr != 0 {
return cr
}
return r
}
// Variant represents a registered variant of a language as defined by BCP 47.
type Variant struct {
ID uint8
str string
}
// ParseVariant parses and returns a Variant. An error is returned if s is not
// a valid variant.
func ParseVariant(s string) (Variant, error) {
s = strings.ToLower(s)
if id, ok := variantIndex[s]; ok {
return Variant{id, s}, nil
}
return Variant{}, NewValueError([]byte(s))
}
// String returns the string representation of the variant.
func (v Variant) String() string {
return v.str
}

View file

@ -17,11 +17,11 @@ import (
// if it could not be found.
func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
if !tag.FixCase(form, key) {
return 0, errSyntax
return 0, ErrSyntax
}
i := idx.Index(key)
if i == -1 {
return 0, mkErrInvalid(key)
return 0, NewValueError(key)
}
return i, nil
}
@ -32,38 +32,45 @@ func searchUint(imap []uint16, key uint16) int {
})
}
type langID uint16
type Language uint16
// getLangID returns the langID of s if s is a canonical subtag
// or langUnknown if s is not a canonical subtag.
func getLangID(s []byte) (langID, error) {
func getLangID(s []byte) (Language, error) {
if len(s) == 2 {
return getLangISO2(s)
}
return getLangISO3(s)
}
// TODO language normalization as well as the AliasMaps could be moved to the
// higher level package, but it is a bit tricky to separate the generation.
func (id Language) Canonicalize() (Language, AliasType) {
return normLang(id)
}
// mapLang returns the mapped langID of id according to mapping m.
func normLang(id langID) (langID, langAliasType) {
k := sort.Search(len(langAliasMap), func(i int) bool {
return langAliasMap[i].from >= uint16(id)
func normLang(id Language) (Language, AliasType) {
k := sort.Search(len(AliasMap), func(i int) bool {
return AliasMap[i].From >= uint16(id)
})
if k < len(langAliasMap) && langAliasMap[k].from == uint16(id) {
return langID(langAliasMap[k].to), langAliasTypes[k]
if k < len(AliasMap) && AliasMap[k].From == uint16(id) {
return Language(AliasMap[k].To), AliasTypes[k]
}
return id, langAliasTypeUnknown
return id, AliasTypeUnknown
}
// getLangISO2 returns the langID for the given 2-letter ISO language code
// or unknownLang if this does not exist.
func getLangISO2(s []byte) (langID, error) {
func getLangISO2(s []byte) (Language, error) {
if !tag.FixCase("zz", s) {
return 0, errSyntax
return 0, ErrSyntax
}
if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
return langID(i), nil
return Language(i), nil
}
return 0, mkErrInvalid(s)
return 0, NewValueError(s)
}
const base = 'z' - 'a' + 1
@ -88,7 +95,7 @@ func intToStr(v uint, s []byte) {
// getLangISO3 returns the langID for the given 3-letter ISO language code
// or unknownLang if this does not exist.
func getLangISO3(s []byte) (langID, error) {
func getLangISO3(s []byte) (Language, error) {
if tag.FixCase("und", s) {
// first try to match canonical 3-letter entries
for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
@ -96,7 +103,7 @@ func getLangISO3(s []byte) (langID, error) {
// We treat "und" as special and always translate it to "unspecified".
// Note that ZZ and Zzzz are private use and are not treated as
// unspecified by default.
id := langID(i)
id := Language(i)
if id == nonCanonicalUnd {
return 0, nil
}
@ -104,26 +111,26 @@ func getLangISO3(s []byte) (langID, error) {
}
}
if i := altLangISO3.Index(s); i != -1 {
return langID(altLangIndex[altLangISO3.Elem(i)[3]]), nil
return Language(altLangIndex[altLangISO3.Elem(i)[3]]), nil
}
n := strToInt(s)
if langNoIndex[n/8]&(1<<(n%8)) != 0 {
return langID(n) + langNoIndexOffset, nil
return Language(n) + langNoIndexOffset, nil
}
// Check for non-canonical uses of ISO3.
for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
return langID(i), nil
return Language(i), nil
}
}
return 0, mkErrInvalid(s)
return 0, NewValueError(s)
}
return 0, errSyntax
return 0, ErrSyntax
}
// stringToBuf writes the string to b and returns the number of bytes
// StringToBuf writes the string to b and returns the number of bytes
// written. cap(b) must be >= 3.
func (id langID) stringToBuf(b []byte) int {
func (id Language) StringToBuf(b []byte) int {
if id >= langNoIndexOffset {
intToStr(uint(id)-langNoIndexOffset, b[:3])
return 3
@ -140,7 +147,7 @@ func (id langID) stringToBuf(b []byte) int {
// String returns the BCP 47 representation of the langID.
// Use b as variable name, instead of id, to ensure the variable
// used is consistent with that of Base in which this type is embedded.
func (b langID) String() string {
func (b Language) String() string {
if b == 0 {
return "und"
} else if b >= langNoIndexOffset {
@ -157,7 +164,7 @@ func (b langID) String() string {
}
// ISO3 returns the ISO 639-3 language code.
func (b langID) ISO3() string {
func (b Language) ISO3() string {
if b == 0 || b >= langNoIndexOffset {
return b.String()
}
@ -173,15 +180,24 @@ func (b langID) ISO3() string {
}
// IsPrivateUse reports whether this language code is reserved for private use.
func (b langID) IsPrivateUse() bool {
func (b Language) IsPrivateUse() bool {
return langPrivateStart <= b && b <= langPrivateEnd
}
type regionID uint16
// SuppressScript returns the script marked as SuppressScript in the IANA
// language tag repository, or 0 if there is no such script.
func (b Language) SuppressScript() Script {
if b < langNoIndexOffset {
return Script(suppressScript[b])
}
return 0
}
type Region uint16
// getRegionID returns the region id for s if s is a valid 2-letter region code
// or unknownRegion.
func getRegionID(s []byte) (regionID, error) {
func getRegionID(s []byte) (Region, error) {
if len(s) == 3 {
if isAlpha(s[0]) {
return getRegionISO3(s)
@ -195,34 +211,34 @@ func getRegionID(s []byte) (regionID, error) {
// getRegionISO2 returns the regionID for the given 2-letter ISO country code
// or unknownRegion if this does not exist.
func getRegionISO2(s []byte) (regionID, error) {
func getRegionISO2(s []byte) (Region, error) {
i, err := findIndex(regionISO, s, "ZZ")
if err != nil {
return 0, err
}
return regionID(i) + isoRegionOffset, nil
return Region(i) + isoRegionOffset, nil
}
// getRegionISO3 returns the regionID for the given 3-letter ISO country code
// or unknownRegion if this does not exist.
func getRegionISO3(s []byte) (regionID, error) {
func getRegionISO3(s []byte) (Region, error) {
if tag.FixCase("ZZZ", s) {
for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
return regionID(i) + isoRegionOffset, nil
return Region(i) + isoRegionOffset, nil
}
}
for i := 0; i < len(altRegionISO3); i += 3 {
if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
return regionID(altRegionIDs[i/3]), nil
return Region(altRegionIDs[i/3]), nil
}
}
return 0, mkErrInvalid(s)
return 0, NewValueError(s)
}
return 0, errSyntax
return 0, ErrSyntax
}
func getRegionM49(n int) (regionID, error) {
func getRegionM49(n int) (Region, error) {
if 0 < n && n <= 999 {
const (
searchBits = 7
@ -236,7 +252,7 @@ func getRegionM49(n int) (regionID, error) {
return buf[i] >= val
})
if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
return regionID(r & regionMask), nil
return Region(r & regionMask), nil
}
}
var e ValueError
@ -247,13 +263,13 @@ func getRegionM49(n int) (regionID, error) {
// normRegion returns a region if r is deprecated or 0 otherwise.
// TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
// TODO: consider mapping split up regions to new most populous one (like CLDR).
func normRegion(r regionID) regionID {
func normRegion(r Region) Region {
m := regionOldMap
k := sort.Search(len(m), func(i int) bool {
return m[i].from >= uint16(r)
return m[i].From >= uint16(r)
})
if k < len(m) && m[k].from == uint16(r) {
return regionID(m[k].to)
if k < len(m) && m[k].From == uint16(r) {
return Region(m[k].To)
}
return 0
}
@ -264,13 +280,13 @@ const (
bcp47Region
)
func (r regionID) typ() byte {
func (r Region) typ() byte {
return regionTypes[r]
}
// String returns the BCP 47 representation for the region.
// It returns "ZZ" for an unspecified region.
func (r regionID) String() string {
func (r Region) String() string {
if r < isoRegionOffset {
if r == 0 {
return "ZZ"
@ -284,7 +300,7 @@ func (r regionID) String() string {
// ISO3 returns the 3-letter ISO code of r.
// Note that not all regions have a 3-letter ISO code.
// In such cases this method returns "ZZZ".
func (r regionID) ISO3() string {
func (r Region) ISO3() string {
if r < isoRegionOffset {
return "ZZZ"
}
@ -301,29 +317,29 @@ func (r regionID) ISO3() string {
// M49 returns the UN M.49 encoding of r, or 0 if this encoding
// is not defined for r.
func (r regionID) M49() int {
func (r Region) M49() int {
return int(m49[r])
}
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
// may include private-use tags that are assigned by CLDR and used in this
// implementation. So IsPrivateUse and IsCountry can be simultaneously true.
func (r regionID) IsPrivateUse() bool {
func (r Region) IsPrivateUse() bool {
return r.typ()&iso3166UserAssigned != 0
}
type scriptID uint8
type Script uint8
// getScriptID returns the script id for string s. It assumes that s
// is of the format [A-Z][a-z]{3}.
func getScriptID(idx tag.Index, s []byte) (scriptID, error) {
func getScriptID(idx tag.Index, s []byte) (Script, error) {
i, err := findIndex(idx, s, "Zzzz")
return scriptID(i), err
return Script(i), err
}
// String returns the script code in title case.
// It returns "Zzzz" for an unspecified script.
func (s scriptID) String() string {
func (s Script) String() string {
if s == 0 {
return "Zzzz"
}
@ -331,7 +347,7 @@ func (s scriptID) String() string {
}
// IsPrivateUse reports whether this script code is reserved for private use.
func (s scriptID) IsPrivateUse() bool {
func (s Script) IsPrivateUse() bool {
return _Qaaa <= s && s <= _Qabx
}
@ -389,7 +405,7 @@ func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
if v < 0 {
return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
}
t.lang = langID(v)
t.LangID = Language(v)
return t, true
}
return t, false

226
vendor/golang.org/x/text/internal/language/match.go generated vendored Normal file
View file

@ -0,0 +1,226 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import "errors"
type scriptRegionFlags uint8
const (
isList = 1 << iota
scriptInFrom
regionInFrom
)
func (t *Tag) setUndefinedLang(id Language) {
if t.LangID == 0 {
t.LangID = id
}
}
func (t *Tag) setUndefinedScript(id Script) {
if t.ScriptID == 0 {
t.ScriptID = id
}
}
func (t *Tag) setUndefinedRegion(id Region) {
if t.RegionID == 0 || t.RegionID.Contains(id) {
t.RegionID = id
}
}
// ErrMissingLikelyTagsData indicates no information was available
// to compute likely values of missing tags.
var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
// addLikelySubtags sets subtags to their most likely value, given the locale.
// In most cases this means setting fields for unknown values, but in some
// cases it may alter a value. It returns an ErrMissingLikelyTagsData error
// if the given locale cannot be expanded.
func (t Tag) addLikelySubtags() (Tag, error) {
id, err := addTags(t)
if err != nil {
return t, err
} else if id.equalTags(t) {
return t, nil
}
id.RemakeString()
return id, nil
}
// specializeRegion attempts to specialize a group region.
func specializeRegion(t *Tag) bool {
if i := regionInclusion[t.RegionID]; i < nRegionGroups {
x := likelyRegionGroup[i]
if Language(x.lang) == t.LangID && Script(x.script) == t.ScriptID {
t.RegionID = Region(x.region)
}
return true
}
return false
}
// Maximize returns a new tag with missing tags filled in.
func (t Tag) Maximize() (Tag, error) {
return addTags(t)
}
func addTags(t Tag) (Tag, error) {
// We leave private use identifiers alone.
if t.IsPrivateUse() {
return t, nil
}
if t.ScriptID != 0 && t.RegionID != 0 {
if t.LangID != 0 {
// already fully specified
specializeRegion(&t)
return t, nil
}
// Search matches for und-script-region. Note that for these cases
// region will never be a group so there is no need to check for this.
list := likelyRegion[t.RegionID : t.RegionID+1]
if x := list[0]; x.flags&isList != 0 {
list = likelyRegionList[x.lang : x.lang+uint16(x.script)]
}
for _, x := range list {
// Deviating from the spec. See match_test.go for details.
if Script(x.script) == t.ScriptID {
t.setUndefinedLang(Language(x.lang))
return t, nil
}
}
}
if t.LangID != 0 {
// Search matches for lang-script and lang-region, where lang != und.
if t.LangID < langNoIndexOffset {
x := likelyLang[t.LangID]
if x.flags&isList != 0 {
list := likelyLangList[x.region : x.region+uint16(x.script)]
if t.ScriptID != 0 {
for _, x := range list {
if Script(x.script) == t.ScriptID && x.flags&scriptInFrom != 0 {
t.setUndefinedRegion(Region(x.region))
return t, nil
}
}
} else if t.RegionID != 0 {
count := 0
goodScript := true
tt := t
for _, x := range list {
// We visit all entries for which the script was not
// defined, including the ones where the region was not
// defined. This allows for proper disambiguation within
// regions.
if x.flags&scriptInFrom == 0 && t.RegionID.Contains(Region(x.region)) {
tt.RegionID = Region(x.region)
tt.setUndefinedScript(Script(x.script))
goodScript = goodScript && tt.ScriptID == Script(x.script)
count++
}
}
if count == 1 {
return tt, nil
}
// Even if we fail to find a unique Region, we might have
// an unambiguous script.
if goodScript {
t.ScriptID = tt.ScriptID
}
}
}
}
} else {
// Search matches for und-script.
if t.ScriptID != 0 {
x := likelyScript[t.ScriptID]
if x.region != 0 {
t.setUndefinedRegion(Region(x.region))
t.setUndefinedLang(Language(x.lang))
return t, nil
}
}
// Search matches for und-region. If und-script-region exists, it would
// have been found earlier.
if t.RegionID != 0 {
if i := regionInclusion[t.RegionID]; i < nRegionGroups {
x := likelyRegionGroup[i]
if x.region != 0 {
t.setUndefinedLang(Language(x.lang))
t.setUndefinedScript(Script(x.script))
t.RegionID = Region(x.region)
}
} else {
x := likelyRegion[t.RegionID]
if x.flags&isList != 0 {
x = likelyRegionList[x.lang]
}
if x.script != 0 && x.flags != scriptInFrom {
t.setUndefinedLang(Language(x.lang))
t.setUndefinedScript(Script(x.script))
return t, nil
}
}
}
}
// Search matches for lang.
if t.LangID < langNoIndexOffset {
x := likelyLang[t.LangID]
if x.flags&isList != 0 {
x = likelyLangList[x.region]
}
if x.region != 0 {
t.setUndefinedScript(Script(x.script))
t.setUndefinedRegion(Region(x.region))
}
specializeRegion(&t)
if t.LangID == 0 {
t.LangID = _en // default language
}
return t, nil
}
return t, ErrMissingLikelyTagsData
}
func (t *Tag) setTagsFrom(id Tag) {
t.LangID = id.LangID
t.ScriptID = id.ScriptID
t.RegionID = id.RegionID
}
// minimize removes the region or script subtags from t such that
// t.addLikelySubtags() == t.minimize().addLikelySubtags().
func (t Tag) minimize() (Tag, error) {
t, err := minimizeTags(t)
if err != nil {
return t, err
}
t.RemakeString()
return t, nil
}
// minimizeTags mimics the behavior of the ICU 51 C implementation.
func minimizeTags(t Tag) (Tag, error) {
if t.equalTags(Und) {
return t, nil
}
max, err := addTags(t)
if err != nil {
return t, err
}
for _, id := range [...]Tag{
{LangID: t.LangID},
{LangID: t.LangID, RegionID: t.RegionID},
{LangID: t.LangID, ScriptID: t.ScriptID},
} {
if x, err := addTags(id); err == nil && max.equalTags(x) {
t.setTagsFrom(id)
break
}
}
return t, nil
}

594
vendor/golang.org/x/text/internal/language/parse.go generated vendored Normal file
View file

@ -0,0 +1,594 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"bytes"
"errors"
"fmt"
"sort"
"golang.org/x/text/internal/tag"
)
// isAlpha returns true if the byte is not a digit.
// b must be an ASCII letter or digit.
func isAlpha(b byte) bool {
return b > '9'
}
// isAlphaNum returns true if the string contains only ASCII letters or digits.
func isAlphaNum(s []byte) bool {
for _, c := range s {
if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
return false
}
}
return true
}
// ErrSyntax is returned by any of the parsing functions when the
// input is not well-formed, according to BCP 47.
// TODO: return the position at which the syntax error occurred?
var ErrSyntax = errors.New("language: tag is not well-formed")
// ErrDuplicateKey is returned when a tag contains the same key twice with
// different values in the -u section.
var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
type ValueError struct {
v [8]byte
}
// NewValueError creates a new ValueError.
func NewValueError(tag []byte) ValueError {
var e ValueError
copy(e.v[:], tag)
return e
}
func (e ValueError) tag() []byte {
n := bytes.IndexByte(e.v[:], 0)
if n == -1 {
n = 8
}
return e.v[:n]
}
// Error implements the error interface.
func (e ValueError) Error() string {
return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
}
// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
return string(e.tag())
}
// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
type scanner struct {
b []byte
bytes [max99thPercentileSize]byte
token []byte
start int // start position of the current token
end int // end position of the current token
next int // next point for scan
err error
done bool
}
func makeScannerString(s string) scanner {
scan := scanner{}
if len(s) <= len(scan.bytes) {
scan.b = scan.bytes[:copy(scan.bytes[:], s)]
} else {
scan.b = []byte(s)
}
scan.init()
return scan
}
// makeScanner returns a scanner using b as the input buffer.
// b is not copied and may be modified by the scanner routines.
func makeScanner(b []byte) scanner {
scan := scanner{b: b}
scan.init()
return scan
}
func (s *scanner) init() {
for i, c := range s.b {
if c == '_' {
s.b[i] = '-'
}
}
s.scan()
}
// restToLower converts the string between start and end to lower case.
func (s *scanner) toLower(start, end int) {
for i := start; i < end; i++ {
c := s.b[i]
if 'A' <= c && c <= 'Z' {
s.b[i] += 'a' - 'A'
}
}
}
func (s *scanner) setError(e error) {
if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
s.err = e
}
}
// resizeRange shrinks or grows the array at position oldStart such that
// a new string of size newSize can fit between oldStart and oldEnd.
// Sets the scan point to after the resized range.
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
s.start = oldStart
if end := oldStart + newSize; end != oldEnd {
diff := end - oldEnd
if end < cap(s.b) {
b := make([]byte, len(s.b)+diff)
copy(b, s.b[:oldStart])
copy(b[end:], s.b[oldEnd:])
s.b = b
} else {
s.b = append(s.b[end:], s.b[oldEnd:]...)
}
s.next = end + (s.next - s.end)
s.end = end
}
}
// replace replaces the current token with repl.
func (s *scanner) replace(repl string) {
s.resizeRange(s.start, s.end, len(repl))
copy(s.b[s.start:], repl)
}
// gobble removes the current token from the input.
// Caller must call scan after calling gobble.
func (s *scanner) gobble(e error) {
s.setError(e)
if s.start == 0 {
s.b = s.b[:+copy(s.b, s.b[s.next:])]
s.end = 0
} else {
s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
s.end = s.start - 1
}
s.next = s.start
}
// deleteRange removes the given range from s.b before the current token.
func (s *scanner) deleteRange(start, end int) {
s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
diff := end - start
s.next -= diff
s.start -= diff
s.end -= diff
}
// scan parses the next token of a BCP 47 string. Tokens that are larger
// than 8 characters or include non-alphanumeric characters result in an error
// and are gobbled and removed from the output.
// It returns the end position of the last token consumed.
func (s *scanner) scan() (end int) {
end = s.end
s.token = nil
for s.start = s.next; s.next < len(s.b); {
i := bytes.IndexByte(s.b[s.next:], '-')
if i == -1 {
s.end = len(s.b)
s.next = len(s.b)
i = s.end - s.start
} else {
s.end = s.next + i
s.next = s.end + 1
}
token := s.b[s.start:s.end]
if i < 1 || i > 8 || !isAlphaNum(token) {
s.gobble(ErrSyntax)
continue
}
s.token = token
return end
}
if n := len(s.b); n > 0 && s.b[n-1] == '-' {
s.setError(ErrSyntax)
s.b = s.b[:len(s.b)-1]
}
s.done = true
return end
}
// acceptMinSize parses multiple tokens of the given size or greater.
// It returns the end position of the last token consumed.
func (s *scanner) acceptMinSize(min int) (end int) {
end = s.end
s.scan()
for ; len(s.token) >= min; s.scan() {
end = s.end
}
return end
}
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
// failed it returns an error and any part of the tag that could be parsed.
// If parsing succeeded but an unknown value was found, it returns
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
func Parse(s string) (t Tag, err error) {
// TODO: consider supporting old-style locale key-value pairs.
if s == "" {
return Und, ErrSyntax
}
if len(s) <= maxAltTaglen {
b := [maxAltTaglen]byte{}
for i, c := range s {
// Generating invalid UTF-8 is okay as it won't match.
if 'A' <= c && c <= 'Z' {
c += 'a' - 'A'
} else if c == '_' {
c = '-'
}
b[i] = byte(c)
}
if t, ok := grandfathered(b); ok {
return t, nil
}
}
scan := makeScannerString(s)
return parse(&scan, s)
}
func parse(scan *scanner, s string) (t Tag, err error) {
t = Und
var end int
if n := len(scan.token); n <= 1 {
scan.toLower(0, len(scan.b))
if n == 0 || scan.token[0] != 'x' {
return t, ErrSyntax
}
end = parseExtensions(scan)
} else if n >= 4 {
return Und, ErrSyntax
} else { // the usual case
t, end = parseTag(scan)
if n := len(scan.token); n == 1 {
t.pExt = uint16(end)
end = parseExtensions(scan)
} else if end < len(scan.b) {
scan.setError(ErrSyntax)
scan.b = scan.b[:end]
}
}
if int(t.pVariant) < len(scan.b) {
if end < len(s) {
s = s[:end]
}
if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
t.str = s
} else {
t.str = string(scan.b)
}
} else {
t.pVariant, t.pExt = 0, 0
}
return t, scan.err
}
// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
func parseTag(scan *scanner) (t Tag, end int) {
var e error
// TODO: set an error if an unknown lang, script or region is encountered.
t.LangID, e = getLangID(scan.token)
scan.setError(e)
scan.replace(t.LangID.String())
langStart := scan.start
end = scan.scan()
for len(scan.token) == 3 && isAlpha(scan.token[0]) {
// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
// to a tag of the form <extlang>.
lang, e := getLangID(scan.token)
if lang != 0 {
t.LangID = lang
copy(scan.b[langStart:], lang.String())
scan.b[langStart+3] = '-'
scan.start = langStart + 4
}
scan.gobble(e)
end = scan.scan()
}
if len(scan.token) == 4 && isAlpha(scan.token[0]) {
t.ScriptID, e = getScriptID(script, scan.token)
if t.ScriptID == 0 {
scan.gobble(e)
}
end = scan.scan()
}
if n := len(scan.token); n >= 2 && n <= 3 {
t.RegionID, e = getRegionID(scan.token)
if t.RegionID == 0 {
scan.gobble(e)
} else {
scan.replace(t.RegionID.String())
}
end = scan.scan()
}
scan.toLower(scan.start, len(scan.b))
t.pVariant = byte(end)
end = parseVariants(scan, end, t)
t.pExt = uint16(end)
return t, end
}
var separator = []byte{'-'}
// parseVariants scans tokens as long as each token is a valid variant string.
// Duplicate variants are removed.
func parseVariants(scan *scanner, end int, t Tag) int {
start := scan.start
varIDBuf := [4]uint8{}
variantBuf := [4][]byte{}
varID := varIDBuf[:0]
variant := variantBuf[:0]
last := -1
needSort := false
for ; len(scan.token) >= 4; scan.scan() {
// TODO: measure the impact of needing this conversion and redesign
// the data structure if there is an issue.
v, ok := variantIndex[string(scan.token)]
if !ok {
// unknown variant
// TODO: allow user-defined variants?
scan.gobble(NewValueError(scan.token))
continue
}
varID = append(varID, v)
variant = append(variant, scan.token)
if !needSort {
if last < int(v) {
last = int(v)
} else {
needSort = true
// There is no legal combinations of more than 7 variants
// (and this is by no means a useful sequence).
const maxVariants = 8
if len(varID) > maxVariants {
break
}
}
}
end = scan.end
}
if needSort {
sort.Sort(variantsSort{varID, variant})
k, l := 0, -1
for i, v := range varID {
w := int(v)
if l == w {
// Remove duplicates.
continue
}
varID[k] = varID[i]
variant[k] = variant[i]
k++
l = w
}
if str := bytes.Join(variant[:k], separator); len(str) == 0 {
end = start - 1
} else {
scan.resizeRange(start, end, len(str))
copy(scan.b[scan.start:], str)
end = scan.end
}
}
return end
}
type variantsSort struct {
i []uint8
v [][]byte
}
func (s variantsSort) Len() int {
return len(s.i)
}
func (s variantsSort) Swap(i, j int) {
s.i[i], s.i[j] = s.i[j], s.i[i]
s.v[i], s.v[j] = s.v[j], s.v[i]
}
func (s variantsSort) Less(i, j int) bool {
return s.i[i] < s.i[j]
}
type bytesSort struct {
b [][]byte
n int // first n bytes to compare
}
func (b bytesSort) Len() int {
return len(b.b)
}
func (b bytesSort) Swap(i, j int) {
b.b[i], b.b[j] = b.b[j], b.b[i]
}
func (b bytesSort) Less(i, j int) bool {
for k := 0; k < b.n; k++ {
if b.b[i][k] == b.b[j][k] {
continue
}
return b.b[i][k] < b.b[j][k]
}
return false
}
// parseExtensions parses and normalizes the extensions in the buffer.
// It returns the last position of scan.b that is part of any extension.
// It also trims scan.b to remove excess parts accordingly.
func parseExtensions(scan *scanner) int {
start := scan.start
exts := [][]byte{}
private := []byte{}
end := scan.end
for len(scan.token) == 1 {
extStart := scan.start
ext := scan.token[0]
end = parseExtension(scan)
extension := scan.b[extStart:end]
if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
scan.setError(ErrSyntax)
end = extStart
continue
} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
scan.b = scan.b[:end]
return end
} else if ext == 'x' {
private = extension
break
}
exts = append(exts, extension)
}
sort.Sort(bytesSort{exts, 1})
if len(private) > 0 {
exts = append(exts, private)
}
scan.b = scan.b[:start]
if len(exts) > 0 {
scan.b = append(scan.b, bytes.Join(exts, separator)...)
} else if start > 0 {
// Strip trailing '-'.
scan.b = scan.b[:start-1]
}
return end
}
// parseExtension parses a single extension and returns the position of
// the extension end.
func parseExtension(scan *scanner) int {
start, end := scan.start, scan.end
switch scan.token[0] {
case 'u':
attrStart := end
scan.scan()
for last := []byte{}; len(scan.token) > 2; scan.scan() {
if bytes.Compare(scan.token, last) != -1 {
// Attributes are unsorted. Start over from scratch.
p := attrStart + 1
scan.next = p
attrs := [][]byte{}
for scan.scan(); len(scan.token) > 2; scan.scan() {
attrs = append(attrs, scan.token)
end = scan.end
}
sort.Sort(bytesSort{attrs, 3})
copy(scan.b[p:], bytes.Join(attrs, separator))
break
}
last = scan.token
end = scan.end
}
var last, key []byte
for attrEnd := end; len(scan.token) == 2; last = key {
key = scan.token
keyEnd := scan.end
end = scan.acceptMinSize(3)
// TODO: check key value validity
if keyEnd == end || bytes.Compare(key, last) != 1 {
// We have an invalid key or the keys are not sorted.
// Start scanning keys from scratch and reorder.
p := attrEnd + 1
scan.next = p
keys := [][]byte{}
for scan.scan(); len(scan.token) == 2; {
keyStart, keyEnd := scan.start, scan.end
end = scan.acceptMinSize(3)
if keyEnd != end {
keys = append(keys, scan.b[keyStart:end])
} else {
scan.setError(ErrSyntax)
end = keyStart
}
}
sort.Stable(bytesSort{keys, 2})
if n := len(keys); n > 0 {
k := 0
for i := 1; i < n; i++ {
if !bytes.Equal(keys[k][:2], keys[i][:2]) {
k++
keys[k] = keys[i]
} else if !bytes.Equal(keys[k], keys[i]) {
scan.setError(ErrDuplicateKey)
}
}
keys = keys[:k+1]
}
reordered := bytes.Join(keys, separator)
if e := p + len(reordered); e < end {
scan.deleteRange(e, end)
end = e
}
copy(scan.b[p:], reordered)
break
}
}
case 't':
scan.scan()
if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
_, end = parseTag(scan)
scan.toLower(start, end)
}
for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
end = scan.acceptMinSize(3)
}
case 'x':
end = scan.acceptMinSize(1)
default:
end = scan.acceptMinSize(2)
}
return end
}
// getExtension returns the name, body and end position of the extension.
func getExtension(s string, p int) (end int, ext string) {
if s[p] == '-' {
p++
}
if s[p] == 'x' {
return len(s), s[p:]
}
end = nextExtension(s, p)
return end, s[p:end]
}
// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p.
// In the fast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
for n := len(s) - 3; p < n; {
if s[p] == '-' {
if s[p+2] == '-' {
return p
}
p += 3
} else {
p++
}
}
return len(s)
}

3431
vendor/golang.org/x/text/internal/language/tables.go generated vendored Normal file

File diff suppressed because it is too large Load diff

48
vendor/golang.org/x/text/internal/language/tags.go generated vendored Normal file
View file

@ -0,0 +1,48 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
// It simplifies safe initialization of Tag values.
func MustParse(s string) Tag {
t, err := Parse(s)
if err != nil {
panic(err)
}
return t
}
// MustParseBase is like ParseBase, but panics if the given base cannot be parsed.
// It simplifies safe initialization of Base values.
func MustParseBase(s string) Language {
b, err := ParseBase(s)
if err != nil {
panic(err)
}
return b
}
// MustParseScript is like ParseScript, but panics if the given script cannot be
// parsed. It simplifies safe initialization of Script values.
func MustParseScript(s string) Script {
scr, err := ParseScript(s)
if err != nil {
panic(err)
}
return scr
}
// MustParseRegion is like ParseRegion, but panics if the given region cannot be
// parsed. It simplifies safe initialization of Region values.
func MustParseRegion(s string) Region {
r, err := ParseRegion(s)
if err != nil {
panic(err)
}
return r
}
// Und is the root language.
var Und Tag

View file

@ -1,16 +0,0 @@
# Copyright 2013 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
CLEANFILES+=maketables
maketables: maketables.go
go build $^
tables: maketables
./maketables > tables.go
gofmt -w -s tables.go
# Build (but do not run) maketables during testing,
# just to make sure it still compiles.
testshort: maketables

View file

@ -7,6 +7,8 @@ package language
import (
"fmt"
"sort"
"golang.org/x/text/internal/language"
)
// The Coverage interface is used to define the level of coverage of an
@ -44,9 +46,9 @@ type allSubtags struct{}
// consecutive range, it simply returns a slice of numbers in increasing order.
// The "undefined" region is not returned.
func (s allSubtags) Regions() []Region {
reg := make([]Region, numRegions)
reg := make([]Region, language.NumRegions)
for i := range reg {
reg[i] = Region{regionID(i + 1)}
reg[i] = Region{language.Region(i + 1)}
}
return reg
}
@ -55,9 +57,9 @@ func (s allSubtags) Regions() []Region {
// consecutive range, it simply returns a slice of numbers in increasing order.
// The "undefined" script is not returned.
func (s allSubtags) Scripts() []Script {
scr := make([]Script, numScripts)
scr := make([]Script, language.NumScripts)
for i := range scr {
scr[i] = Script{scriptID(i + 1)}
scr[i] = Script{language.Script(i + 1)}
}
return scr
}
@ -65,22 +67,10 @@ func (s allSubtags) Scripts() []Script {
// BaseLanguages returns the list of all supported base languages. It generates
// the list by traversing the internal structures.
func (s allSubtags) BaseLanguages() []Base {
base := make([]Base, 0, numLanguages)
for i := 0; i < langNoIndexOffset; i++ {
// We included "und" already for the value 0.
if i != nonCanonicalUnd {
base = append(base, Base{langID(i)})
}
}
i := langNoIndexOffset
for _, v := range langNoIndex {
for k := 0; k < 8; k++ {
if v&1 == 1 {
base = append(base, Base{langID(i)})
}
v >>= 1
i++
}
bs := language.BaseLanguages()
base := make([]Base, len(bs))
for i, b := range bs {
base[i] = Base{b}
}
return base
}
@ -90,7 +80,7 @@ func (s allSubtags) Tags() []Tag {
return nil
}
// coverage is used used by NewCoverage which is used as a convenient way for
// coverage is used by NewCoverage which is used as a convenient way for
// creating Coverage implementations for partially defined data. Very often a
// package will only need to define a subset of slices. coverage provides a
// convenient way to do this. Moreover, packages using NewCoverage, instead of
@ -134,7 +124,7 @@ func (s *coverage) BaseLanguages() []Base {
}
a := make([]Base, len(tags))
for i, t := range tags {
a[i] = Base{langID(t.lang)}
a[i] = Base{language.Language(t.lang())}
}
sort.Sort(bases(a))
k := 0

File diff suppressed because it is too large Load diff

View file

@ -1,20 +0,0 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This file contains code common to the maketables.go and the package code.
// langAliasType is the type of an alias in langAliasMap.
type langAliasType int8
const (
langDeprecated langAliasType = iota
langMacro
langLegacy
langAliasTypeUnknown langAliasType = -1
)

View file

@ -1,162 +0,0 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This file generates derivative tables based on the language package itself.
import (
"bytes"
"flag"
"fmt"
"io/ioutil"
"log"
"reflect"
"sort"
"strings"
"golang.org/x/text/internal/gen"
"golang.org/x/text/language"
"golang.org/x/text/unicode/cldr"
)
var (
test = flag.Bool("test", false,
"test existing tables; can be used to compare web data with package data.")
draft = flag.String("draft",
"contributed",
`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
)
func main() {
gen.Init()
// Read the CLDR zip file.
r := gen.OpenCLDRCoreZip()
defer r.Close()
d := &cldr.Decoder{}
data, err := d.DecodeZip(r)
if err != nil {
log.Fatalf("DecodeZip: %v", err)
}
w := gen.NewCodeWriter()
defer func() {
buf := &bytes.Buffer{}
if _, err = w.WriteGo(buf, "language", ""); err != nil {
log.Fatalf("Error formatting file index.go: %v", err)
}
// Since we're generating a table for our own package we need to rewrite
// doing the equivalent of go fmt -r 'language.b -> b'. Using
// bytes.Replace will do.
out := bytes.Replace(buf.Bytes(), []byte("language."), nil, -1)
if err := ioutil.WriteFile("index.go", out, 0600); err != nil {
log.Fatalf("Could not create file index.go: %v", err)
}
}()
m := map[language.Tag]bool{}
for _, lang := range data.Locales() {
// We include all locales unconditionally to be consistent with en_US.
// We want en_US, even though it has no data associated with it.
// TODO: put any of the languages for which no data exists at the end
// of the index. This allows all components based on ICU to use that
// as the cutoff point.
// if x := data.RawLDML(lang); false ||
// x.LocaleDisplayNames != nil ||
// x.Characters != nil ||
// x.Delimiters != nil ||
// x.Measurement != nil ||
// x.Dates != nil ||
// x.Numbers != nil ||
// x.Units != nil ||
// x.ListPatterns != nil ||
// x.Collations != nil ||
// x.Segmentations != nil ||
// x.Rbnf != nil ||
// x.Annotations != nil ||
// x.Metadata != nil {
// TODO: support POSIX natively, albeit non-standard.
tag := language.Make(strings.Replace(lang, "_POSIX", "-u-va-posix", 1))
m[tag] = true
// }
}
// Include locales for plural rules, which uses a different structure.
for _, plurals := range data.Supplemental().Plurals {
for _, rules := range plurals.PluralRules {
for _, lang := range strings.Split(rules.Locales, " ") {
m[language.Make(lang)] = true
}
}
}
var core, special []language.Tag
for t := range m {
if x := t.Extensions(); len(x) != 0 && fmt.Sprint(x) != "[u-va-posix]" {
log.Fatalf("Unexpected extension %v in %v", x, t)
}
if len(t.Variants()) == 0 && len(t.Extensions()) == 0 {
core = append(core, t)
} else {
special = append(special, t)
}
}
w.WriteComment(`
NumCompactTags is the number of common tags. The maximum tag is
NumCompactTags-1.`)
w.WriteConst("NumCompactTags", len(core)+len(special))
sort.Sort(byAlpha(special))
w.WriteVar("specialTags", special)
// TODO: order by frequency?
sort.Sort(byAlpha(core))
// Size computations are just an estimate.
w.Size += int(reflect.TypeOf(map[uint32]uint16{}).Size())
w.Size += len(core) * 6 // size of uint32 and uint16
fmt.Fprintln(w)
fmt.Fprintln(w, "var coreTags = map[uint32]uint16{")
fmt.Fprintln(w, "0x0: 0, // und")
i := len(special) + 1 // Und and special tags already written.
for _, t := range core {
if t == language.Und {
continue
}
fmt.Fprint(w.Hash, t, i)
b, s, r := t.Raw()
fmt.Fprintf(w, "0x%s%s%s: %d, // %s\n",
getIndex(b, 3), // 3 is enough as it is guaranteed to be a compact number
getIndex(s, 2),
getIndex(r, 3),
i, t)
i++
}
fmt.Fprintln(w, "}")
}
// getIndex prints the subtag type and extracts its index of size nibble.
// If the index is less than n nibbles, the result is prefixed with 0s.
func getIndex(x interface{}, n int) string {
s := fmt.Sprintf("%#v", x) // s is of form Type{typeID: 0x00}
s = s[strings.Index(s, "0x")+2 : len(s)-1]
return strings.Repeat("0", n-len(s)) + s
}
type byAlpha []language.Tag
func (a byAlpha) Len() int { return len(a) }
func (a byAlpha) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a byAlpha) Less(i, j int) bool { return a[i].String() < a[j].String() }

View file

@ -1,783 +0,0 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package language
// NumCompactTags is the number of common tags. The maximum tag is
// NumCompactTags-1.
const NumCompactTags = 768
var specialTags = []Tag{ // 2 elements
0: {lang: 0xd7, region: 0x6e, script: 0x0, pVariant: 0x5, pExt: 0xe, str: "ca-ES-valencia"},
1: {lang: 0x139, region: 0x135, script: 0x0, pVariant: 0x5, pExt: 0x5, str: "en-US-u-va-posix"},
} // Size: 72 bytes
var coreTags = map[uint32]uint16{
0x0: 0, // und
0x01600000: 3, // af
0x016000d2: 4, // af-NA
0x01600161: 5, // af-ZA
0x01c00000: 6, // agq
0x01c00052: 7, // agq-CM
0x02100000: 8, // ak
0x02100080: 9, // ak-GH
0x02700000: 10, // am
0x0270006f: 11, // am-ET
0x03a00000: 12, // ar
0x03a00001: 13, // ar-001
0x03a00023: 14, // ar-AE
0x03a00039: 15, // ar-BH
0x03a00062: 16, // ar-DJ
0x03a00067: 17, // ar-DZ
0x03a0006b: 18, // ar-EG
0x03a0006c: 19, // ar-EH
0x03a0006d: 20, // ar-ER
0x03a00097: 21, // ar-IL
0x03a0009b: 22, // ar-IQ
0x03a000a1: 23, // ar-JO
0x03a000a8: 24, // ar-KM
0x03a000ac: 25, // ar-KW
0x03a000b0: 26, // ar-LB
0x03a000b9: 27, // ar-LY
0x03a000ba: 28, // ar-MA
0x03a000c9: 29, // ar-MR
0x03a000e1: 30, // ar-OM
0x03a000ed: 31, // ar-PS
0x03a000f3: 32, // ar-QA
0x03a00108: 33, // ar-SA
0x03a0010b: 34, // ar-SD
0x03a00115: 35, // ar-SO
0x03a00117: 36, // ar-SS
0x03a0011c: 37, // ar-SY
0x03a00120: 38, // ar-TD
0x03a00128: 39, // ar-TN
0x03a0015e: 40, // ar-YE
0x04000000: 41, // ars
0x04300000: 42, // as
0x04300099: 43, // as-IN
0x04400000: 44, // asa
0x0440012f: 45, // asa-TZ
0x04800000: 46, // ast
0x0480006e: 47, // ast-ES
0x05800000: 48, // az
0x0581f000: 49, // az-Cyrl
0x0581f032: 50, // az-Cyrl-AZ
0x05857000: 51, // az-Latn
0x05857032: 52, // az-Latn-AZ
0x05e00000: 53, // bas
0x05e00052: 54, // bas-CM
0x07100000: 55, // be
0x07100047: 56, // be-BY
0x07500000: 57, // bem
0x07500162: 58, // bem-ZM
0x07900000: 59, // bez
0x0790012f: 60, // bez-TZ
0x07e00000: 61, // bg
0x07e00038: 62, // bg-BG
0x08200000: 63, // bh
0x0a000000: 64, // bm
0x0a0000c3: 65, // bm-ML
0x0a500000: 66, // bn
0x0a500035: 67, // bn-BD
0x0a500099: 68, // bn-IN
0x0a900000: 69, // bo
0x0a900053: 70, // bo-CN
0x0a900099: 71, // bo-IN
0x0b200000: 72, // br
0x0b200078: 73, // br-FR
0x0b500000: 74, // brx
0x0b500099: 75, // brx-IN
0x0b700000: 76, // bs
0x0b71f000: 77, // bs-Cyrl
0x0b71f033: 78, // bs-Cyrl-BA
0x0b757000: 79, // bs-Latn
0x0b757033: 80, // bs-Latn-BA
0x0d700000: 81, // ca
0x0d700022: 82, // ca-AD
0x0d70006e: 83, // ca-ES
0x0d700078: 84, // ca-FR
0x0d70009e: 85, // ca-IT
0x0db00000: 86, // ccp
0x0db00035: 87, // ccp-BD
0x0db00099: 88, // ccp-IN
0x0dc00000: 89, // ce
0x0dc00106: 90, // ce-RU
0x0df00000: 91, // cgg
0x0df00131: 92, // cgg-UG
0x0e500000: 93, // chr
0x0e500135: 94, // chr-US
0x0e900000: 95, // ckb
0x0e90009b: 96, // ckb-IQ
0x0e90009c: 97, // ckb-IR
0x0fa00000: 98, // cs
0x0fa0005e: 99, // cs-CZ
0x0fe00000: 100, // cu
0x0fe00106: 101, // cu-RU
0x10000000: 102, // cy
0x1000007b: 103, // cy-GB
0x10100000: 104, // da
0x10100063: 105, // da-DK
0x10100082: 106, // da-GL
0x10800000: 107, // dav
0x108000a4: 108, // dav-KE
0x10d00000: 109, // de
0x10d0002e: 110, // de-AT
0x10d00036: 111, // de-BE
0x10d0004e: 112, // de-CH
0x10d00060: 113, // de-DE
0x10d0009e: 114, // de-IT
0x10d000b2: 115, // de-LI
0x10d000b7: 116, // de-LU
0x11700000: 117, // dje
0x117000d4: 118, // dje-NE
0x11f00000: 119, // dsb
0x11f00060: 120, // dsb-DE
0x12400000: 121, // dua
0x12400052: 122, // dua-CM
0x12800000: 123, // dv
0x12b00000: 124, // dyo
0x12b00114: 125, // dyo-SN
0x12d00000: 126, // dz
0x12d00043: 127, // dz-BT
0x12f00000: 128, // ebu
0x12f000a4: 129, // ebu-KE
0x13000000: 130, // ee
0x13000080: 131, // ee-GH
0x13000122: 132, // ee-TG
0x13600000: 133, // el
0x1360005d: 134, // el-CY
0x13600087: 135, // el-GR
0x13900000: 136, // en
0x13900001: 137, // en-001
0x1390001a: 138, // en-150
0x13900025: 139, // en-AG
0x13900026: 140, // en-AI
0x1390002d: 141, // en-AS
0x1390002e: 142, // en-AT
0x1390002f: 143, // en-AU
0x13900034: 144, // en-BB
0x13900036: 145, // en-BE
0x1390003a: 146, // en-BI
0x1390003d: 147, // en-BM
0x13900042: 148, // en-BS
0x13900046: 149, // en-BW
0x13900048: 150, // en-BZ
0x13900049: 151, // en-CA
0x1390004a: 152, // en-CC
0x1390004e: 153, // en-CH
0x13900050: 154, // en-CK
0x13900052: 155, // en-CM
0x1390005c: 156, // en-CX
0x1390005d: 157, // en-CY
0x13900060: 158, // en-DE
0x13900061: 159, // en-DG
0x13900063: 160, // en-DK
0x13900064: 161, // en-DM
0x1390006d: 162, // en-ER
0x13900072: 163, // en-FI
0x13900073: 164, // en-FJ
0x13900074: 165, // en-FK
0x13900075: 166, // en-FM
0x1390007b: 167, // en-GB
0x1390007c: 168, // en-GD
0x1390007f: 169, // en-GG
0x13900080: 170, // en-GH
0x13900081: 171, // en-GI
0x13900083: 172, // en-GM
0x1390008a: 173, // en-GU
0x1390008c: 174, // en-GY
0x1390008d: 175, // en-HK
0x13900096: 176, // en-IE
0x13900097: 177, // en-IL
0x13900098: 178, // en-IM
0x13900099: 179, // en-IN
0x1390009a: 180, // en-IO
0x1390009f: 181, // en-JE
0x139000a0: 182, // en-JM
0x139000a4: 183, // en-KE
0x139000a7: 184, // en-KI
0x139000a9: 185, // en-KN
0x139000ad: 186, // en-KY
0x139000b1: 187, // en-LC
0x139000b4: 188, // en-LR
0x139000b5: 189, // en-LS
0x139000bf: 190, // en-MG
0x139000c0: 191, // en-MH
0x139000c6: 192, // en-MO
0x139000c7: 193, // en-MP
0x139000ca: 194, // en-MS
0x139000cb: 195, // en-MT
0x139000cc: 196, // en-MU
0x139000ce: 197, // en-MW
0x139000d0: 198, // en-MY
0x139000d2: 199, // en-NA
0x139000d5: 200, // en-NF
0x139000d6: 201, // en-NG
0x139000d9: 202, // en-NL
0x139000dd: 203, // en-NR
0x139000df: 204, // en-NU
0x139000e0: 205, // en-NZ
0x139000e6: 206, // en-PG
0x139000e7: 207, // en-PH
0x139000e8: 208, // en-PK
0x139000eb: 209, // en-PN
0x139000ec: 210, // en-PR
0x139000f0: 211, // en-PW
0x13900107: 212, // en-RW
0x13900109: 213, // en-SB
0x1390010a: 214, // en-SC
0x1390010b: 215, // en-SD
0x1390010c: 216, // en-SE
0x1390010d: 217, // en-SG
0x1390010e: 218, // en-SH
0x1390010f: 219, // en-SI
0x13900112: 220, // en-SL
0x13900117: 221, // en-SS
0x1390011b: 222, // en-SX
0x1390011d: 223, // en-SZ
0x1390011f: 224, // en-TC
0x13900125: 225, // en-TK
0x13900129: 226, // en-TO
0x1390012c: 227, // en-TT
0x1390012d: 228, // en-TV
0x1390012f: 229, // en-TZ
0x13900131: 230, // en-UG
0x13900133: 231, // en-UM
0x13900135: 232, // en-US
0x13900139: 233, // en-VC
0x1390013c: 234, // en-VG
0x1390013d: 235, // en-VI
0x1390013f: 236, // en-VU
0x13900142: 237, // en-WS
0x13900161: 238, // en-ZA
0x13900162: 239, // en-ZM
0x13900164: 240, // en-ZW
0x13c00000: 241, // eo
0x13c00001: 242, // eo-001
0x13e00000: 243, // es
0x13e0001f: 244, // es-419
0x13e0002c: 245, // es-AR
0x13e0003f: 246, // es-BO
0x13e00041: 247, // es-BR
0x13e00048: 248, // es-BZ
0x13e00051: 249, // es-CL
0x13e00054: 250, // es-CO
0x13e00056: 251, // es-CR
0x13e00059: 252, // es-CU
0x13e00065: 253, // es-DO
0x13e00068: 254, // es-EA
0x13e00069: 255, // es-EC
0x13e0006e: 256, // es-ES
0x13e00086: 257, // es-GQ
0x13e00089: 258, // es-GT
0x13e0008f: 259, // es-HN
0x13e00094: 260, // es-IC
0x13e000cf: 261, // es-MX
0x13e000d8: 262, // es-NI
0x13e000e2: 263, // es-PA
0x13e000e4: 264, // es-PE
0x13e000e7: 265, // es-PH
0x13e000ec: 266, // es-PR
0x13e000f1: 267, // es-PY
0x13e0011a: 268, // es-SV
0x13e00135: 269, // es-US
0x13e00136: 270, // es-UY
0x13e0013b: 271, // es-VE
0x14000000: 272, // et
0x1400006a: 273, // et-EE
0x14500000: 274, // eu
0x1450006e: 275, // eu-ES
0x14600000: 276, // ewo
0x14600052: 277, // ewo-CM
0x14800000: 278, // fa
0x14800024: 279, // fa-AF
0x1480009c: 280, // fa-IR
0x14e00000: 281, // ff
0x14e00052: 282, // ff-CM
0x14e00084: 283, // ff-GN
0x14e000c9: 284, // ff-MR
0x14e00114: 285, // ff-SN
0x15100000: 286, // fi
0x15100072: 287, // fi-FI
0x15300000: 288, // fil
0x153000e7: 289, // fil-PH
0x15800000: 290, // fo
0x15800063: 291, // fo-DK
0x15800076: 292, // fo-FO
0x15e00000: 293, // fr
0x15e00036: 294, // fr-BE
0x15e00037: 295, // fr-BF
0x15e0003a: 296, // fr-BI
0x15e0003b: 297, // fr-BJ
0x15e0003c: 298, // fr-BL
0x15e00049: 299, // fr-CA
0x15e0004b: 300, // fr-CD
0x15e0004c: 301, // fr-CF
0x15e0004d: 302, // fr-CG
0x15e0004e: 303, // fr-CH
0x15e0004f: 304, // fr-CI
0x15e00052: 305, // fr-CM
0x15e00062: 306, // fr-DJ
0x15e00067: 307, // fr-DZ
0x15e00078: 308, // fr-FR
0x15e0007a: 309, // fr-GA
0x15e0007e: 310, // fr-GF
0x15e00084: 311, // fr-GN
0x15e00085: 312, // fr-GP
0x15e00086: 313, // fr-GQ
0x15e00091: 314, // fr-HT
0x15e000a8: 315, // fr-KM
0x15e000b7: 316, // fr-LU
0x15e000ba: 317, // fr-MA
0x15e000bb: 318, // fr-MC
0x15e000be: 319, // fr-MF
0x15e000bf: 320, // fr-MG
0x15e000c3: 321, // fr-ML
0x15e000c8: 322, // fr-MQ
0x15e000c9: 323, // fr-MR
0x15e000cc: 324, // fr-MU
0x15e000d3: 325, // fr-NC
0x15e000d4: 326, // fr-NE
0x15e000e5: 327, // fr-PF
0x15e000ea: 328, // fr-PM
0x15e00102: 329, // fr-RE
0x15e00107: 330, // fr-RW
0x15e0010a: 331, // fr-SC
0x15e00114: 332, // fr-SN
0x15e0011c: 333, // fr-SY
0x15e00120: 334, // fr-TD
0x15e00122: 335, // fr-TG
0x15e00128: 336, // fr-TN
0x15e0013f: 337, // fr-VU
0x15e00140: 338, // fr-WF
0x15e0015f: 339, // fr-YT
0x16900000: 340, // fur
0x1690009e: 341, // fur-IT
0x16d00000: 342, // fy
0x16d000d9: 343, // fy-NL
0x16e00000: 344, // ga
0x16e00096: 345, // ga-IE
0x17e00000: 346, // gd
0x17e0007b: 347, // gd-GB
0x19000000: 348, // gl
0x1900006e: 349, // gl-ES
0x1a300000: 350, // gsw
0x1a30004e: 351, // gsw-CH
0x1a300078: 352, // gsw-FR
0x1a3000b2: 353, // gsw-LI
0x1a400000: 354, // gu
0x1a400099: 355, // gu-IN
0x1a900000: 356, // guw
0x1ab00000: 357, // guz
0x1ab000a4: 358, // guz-KE
0x1ac00000: 359, // gv
0x1ac00098: 360, // gv-IM
0x1b400000: 361, // ha
0x1b400080: 362, // ha-GH
0x1b4000d4: 363, // ha-NE
0x1b4000d6: 364, // ha-NG
0x1b800000: 365, // haw
0x1b800135: 366, // haw-US
0x1bc00000: 367, // he
0x1bc00097: 368, // he-IL
0x1be00000: 369, // hi
0x1be00099: 370, // hi-IN
0x1d100000: 371, // hr
0x1d100033: 372, // hr-BA
0x1d100090: 373, // hr-HR
0x1d200000: 374, // hsb
0x1d200060: 375, // hsb-DE
0x1d500000: 376, // hu
0x1d500092: 377, // hu-HU
0x1d700000: 378, // hy
0x1d700028: 379, // hy-AM
0x1e100000: 380, // id
0x1e100095: 381, // id-ID
0x1e700000: 382, // ig
0x1e7000d6: 383, // ig-NG
0x1ea00000: 384, // ii
0x1ea00053: 385, // ii-CN
0x1f500000: 386, // io
0x1f800000: 387, // is
0x1f80009d: 388, // is-IS
0x1f900000: 389, // it
0x1f90004e: 390, // it-CH
0x1f90009e: 391, // it-IT
0x1f900113: 392, // it-SM
0x1f900138: 393, // it-VA
0x1fa00000: 394, // iu
0x20000000: 395, // ja
0x200000a2: 396, // ja-JP
0x20300000: 397, // jbo
0x20700000: 398, // jgo
0x20700052: 399, // jgo-CM
0x20a00000: 400, // jmc
0x20a0012f: 401, // jmc-TZ
0x20e00000: 402, // jv
0x21000000: 403, // ka
0x2100007d: 404, // ka-GE
0x21200000: 405, // kab
0x21200067: 406, // kab-DZ
0x21600000: 407, // kaj
0x21700000: 408, // kam
0x217000a4: 409, // kam-KE
0x21f00000: 410, // kcg
0x22300000: 411, // kde
0x2230012f: 412, // kde-TZ
0x22700000: 413, // kea
0x2270005a: 414, // kea-CV
0x23400000: 415, // khq
0x234000c3: 416, // khq-ML
0x23900000: 417, // ki
0x239000a4: 418, // ki-KE
0x24200000: 419, // kk
0x242000ae: 420, // kk-KZ
0x24400000: 421, // kkj
0x24400052: 422, // kkj-CM
0x24500000: 423, // kl
0x24500082: 424, // kl-GL
0x24600000: 425, // kln
0x246000a4: 426, // kln-KE
0x24a00000: 427, // km
0x24a000a6: 428, // km-KH
0x25100000: 429, // kn
0x25100099: 430, // kn-IN
0x25400000: 431, // ko
0x254000aa: 432, // ko-KP
0x254000ab: 433, // ko-KR
0x25600000: 434, // kok
0x25600099: 435, // kok-IN
0x26a00000: 436, // ks
0x26a00099: 437, // ks-IN
0x26b00000: 438, // ksb
0x26b0012f: 439, // ksb-TZ
0x26d00000: 440, // ksf
0x26d00052: 441, // ksf-CM
0x26e00000: 442, // ksh
0x26e00060: 443, // ksh-DE
0x27400000: 444, // ku
0x28100000: 445, // kw
0x2810007b: 446, // kw-GB
0x28a00000: 447, // ky
0x28a000a5: 448, // ky-KG
0x29100000: 449, // lag
0x2910012f: 450, // lag-TZ
0x29500000: 451, // lb
0x295000b7: 452, // lb-LU
0x2a300000: 453, // lg
0x2a300131: 454, // lg-UG
0x2af00000: 455, // lkt
0x2af00135: 456, // lkt-US
0x2b500000: 457, // ln
0x2b50002a: 458, // ln-AO
0x2b50004b: 459, // ln-CD
0x2b50004c: 460, // ln-CF
0x2b50004d: 461, // ln-CG
0x2b800000: 462, // lo
0x2b8000af: 463, // lo-LA
0x2bf00000: 464, // lrc
0x2bf0009b: 465, // lrc-IQ
0x2bf0009c: 466, // lrc-IR
0x2c000000: 467, // lt
0x2c0000b6: 468, // lt-LT
0x2c200000: 469, // lu
0x2c20004b: 470, // lu-CD
0x2c400000: 471, // luo
0x2c4000a4: 472, // luo-KE
0x2c500000: 473, // luy
0x2c5000a4: 474, // luy-KE
0x2c700000: 475, // lv
0x2c7000b8: 476, // lv-LV
0x2d100000: 477, // mas
0x2d1000a4: 478, // mas-KE
0x2d10012f: 479, // mas-TZ
0x2e900000: 480, // mer
0x2e9000a4: 481, // mer-KE
0x2ed00000: 482, // mfe
0x2ed000cc: 483, // mfe-MU
0x2f100000: 484, // mg
0x2f1000bf: 485, // mg-MG
0x2f200000: 486, // mgh
0x2f2000d1: 487, // mgh-MZ
0x2f400000: 488, // mgo
0x2f400052: 489, // mgo-CM
0x2ff00000: 490, // mk
0x2ff000c2: 491, // mk-MK
0x30400000: 492, // ml
0x30400099: 493, // ml-IN
0x30b00000: 494, // mn
0x30b000c5: 495, // mn-MN
0x31b00000: 496, // mr
0x31b00099: 497, // mr-IN
0x31f00000: 498, // ms
0x31f0003e: 499, // ms-BN
0x31f000d0: 500, // ms-MY
0x31f0010d: 501, // ms-SG
0x32000000: 502, // mt
0x320000cb: 503, // mt-MT
0x32500000: 504, // mua
0x32500052: 505, // mua-CM
0x33100000: 506, // my
0x331000c4: 507, // my-MM
0x33a00000: 508, // mzn
0x33a0009c: 509, // mzn-IR
0x34100000: 510, // nah
0x34500000: 511, // naq
0x345000d2: 512, // naq-NA
0x34700000: 513, // nb
0x347000da: 514, // nb-NO
0x34700110: 515, // nb-SJ
0x34e00000: 516, // nd
0x34e00164: 517, // nd-ZW
0x35000000: 518, // nds
0x35000060: 519, // nds-DE
0x350000d9: 520, // nds-NL
0x35100000: 521, // ne
0x35100099: 522, // ne-IN
0x351000db: 523, // ne-NP
0x36700000: 524, // nl
0x36700030: 525, // nl-AW
0x36700036: 526, // nl-BE
0x36700040: 527, // nl-BQ
0x3670005b: 528, // nl-CW
0x367000d9: 529, // nl-NL
0x36700116: 530, // nl-SR
0x3670011b: 531, // nl-SX
0x36800000: 532, // nmg
0x36800052: 533, // nmg-CM
0x36a00000: 534, // nn
0x36a000da: 535, // nn-NO
0x36c00000: 536, // nnh
0x36c00052: 537, // nnh-CM
0x36f00000: 538, // no
0x37500000: 539, // nqo
0x37600000: 540, // nr
0x37a00000: 541, // nso
0x38000000: 542, // nus
0x38000117: 543, // nus-SS
0x38700000: 544, // ny
0x38900000: 545, // nyn
0x38900131: 546, // nyn-UG
0x39000000: 547, // om
0x3900006f: 548, // om-ET
0x390000a4: 549, // om-KE
0x39500000: 550, // or
0x39500099: 551, // or-IN
0x39800000: 552, // os
0x3980007d: 553, // os-GE
0x39800106: 554, // os-RU
0x39d00000: 555, // pa
0x39d05000: 556, // pa-Arab
0x39d050e8: 557, // pa-Arab-PK
0x39d33000: 558, // pa-Guru
0x39d33099: 559, // pa-Guru-IN
0x3a100000: 560, // pap
0x3b300000: 561, // pl
0x3b3000e9: 562, // pl-PL
0x3bd00000: 563, // prg
0x3bd00001: 564, // prg-001
0x3be00000: 565, // ps
0x3be00024: 566, // ps-AF
0x3c000000: 567, // pt
0x3c00002a: 568, // pt-AO
0x3c000041: 569, // pt-BR
0x3c00004e: 570, // pt-CH
0x3c00005a: 571, // pt-CV
0x3c000086: 572, // pt-GQ
0x3c00008b: 573, // pt-GW
0x3c0000b7: 574, // pt-LU
0x3c0000c6: 575, // pt-MO
0x3c0000d1: 576, // pt-MZ
0x3c0000ee: 577, // pt-PT
0x3c000118: 578, // pt-ST
0x3c000126: 579, // pt-TL
0x3c400000: 580, // qu
0x3c40003f: 581, // qu-BO
0x3c400069: 582, // qu-EC
0x3c4000e4: 583, // qu-PE
0x3d400000: 584, // rm
0x3d40004e: 585, // rm-CH
0x3d900000: 586, // rn
0x3d90003a: 587, // rn-BI
0x3dc00000: 588, // ro
0x3dc000bc: 589, // ro-MD
0x3dc00104: 590, // ro-RO
0x3de00000: 591, // rof
0x3de0012f: 592, // rof-TZ
0x3e200000: 593, // ru
0x3e200047: 594, // ru-BY
0x3e2000a5: 595, // ru-KG
0x3e2000ae: 596, // ru-KZ
0x3e2000bc: 597, // ru-MD
0x3e200106: 598, // ru-RU
0x3e200130: 599, // ru-UA
0x3e500000: 600, // rw
0x3e500107: 601, // rw-RW
0x3e600000: 602, // rwk
0x3e60012f: 603, // rwk-TZ
0x3eb00000: 604, // sah
0x3eb00106: 605, // sah-RU
0x3ec00000: 606, // saq
0x3ec000a4: 607, // saq-KE
0x3f300000: 608, // sbp
0x3f30012f: 609, // sbp-TZ
0x3fa00000: 610, // sd
0x3fa000e8: 611, // sd-PK
0x3fc00000: 612, // sdh
0x3fd00000: 613, // se
0x3fd00072: 614, // se-FI
0x3fd000da: 615, // se-NO
0x3fd0010c: 616, // se-SE
0x3ff00000: 617, // seh
0x3ff000d1: 618, // seh-MZ
0x40100000: 619, // ses
0x401000c3: 620, // ses-ML
0x40200000: 621, // sg
0x4020004c: 622, // sg-CF
0x40800000: 623, // shi
0x40857000: 624, // shi-Latn
0x408570ba: 625, // shi-Latn-MA
0x408dc000: 626, // shi-Tfng
0x408dc0ba: 627, // shi-Tfng-MA
0x40c00000: 628, // si
0x40c000b3: 629, // si-LK
0x41200000: 630, // sk
0x41200111: 631, // sk-SK
0x41600000: 632, // sl
0x4160010f: 633, // sl-SI
0x41c00000: 634, // sma
0x41d00000: 635, // smi
0x41e00000: 636, // smj
0x41f00000: 637, // smn
0x41f00072: 638, // smn-FI
0x42200000: 639, // sms
0x42300000: 640, // sn
0x42300164: 641, // sn-ZW
0x42900000: 642, // so
0x42900062: 643, // so-DJ
0x4290006f: 644, // so-ET
0x429000a4: 645, // so-KE
0x42900115: 646, // so-SO
0x43100000: 647, // sq
0x43100027: 648, // sq-AL
0x431000c2: 649, // sq-MK
0x4310014d: 650, // sq-XK
0x43200000: 651, // sr
0x4321f000: 652, // sr-Cyrl
0x4321f033: 653, // sr-Cyrl-BA
0x4321f0bd: 654, // sr-Cyrl-ME
0x4321f105: 655, // sr-Cyrl-RS
0x4321f14d: 656, // sr-Cyrl-XK
0x43257000: 657, // sr-Latn
0x43257033: 658, // sr-Latn-BA
0x432570bd: 659, // sr-Latn-ME
0x43257105: 660, // sr-Latn-RS
0x4325714d: 661, // sr-Latn-XK
0x43700000: 662, // ss
0x43a00000: 663, // ssy
0x43b00000: 664, // st
0x44400000: 665, // sv
0x44400031: 666, // sv-AX
0x44400072: 667, // sv-FI
0x4440010c: 668, // sv-SE
0x44500000: 669, // sw
0x4450004b: 670, // sw-CD
0x445000a4: 671, // sw-KE
0x4450012f: 672, // sw-TZ
0x44500131: 673, // sw-UG
0x44e00000: 674, // syr
0x45000000: 675, // ta
0x45000099: 676, // ta-IN
0x450000b3: 677, // ta-LK
0x450000d0: 678, // ta-MY
0x4500010d: 679, // ta-SG
0x46100000: 680, // te
0x46100099: 681, // te-IN
0x46400000: 682, // teo
0x464000a4: 683, // teo-KE
0x46400131: 684, // teo-UG
0x46700000: 685, // tg
0x46700124: 686, // tg-TJ
0x46b00000: 687, // th
0x46b00123: 688, // th-TH
0x46f00000: 689, // ti
0x46f0006d: 690, // ti-ER
0x46f0006f: 691, // ti-ET
0x47100000: 692, // tig
0x47600000: 693, // tk
0x47600127: 694, // tk-TM
0x48000000: 695, // tn
0x48200000: 696, // to
0x48200129: 697, // to-TO
0x48a00000: 698, // tr
0x48a0005d: 699, // tr-CY
0x48a0012b: 700, // tr-TR
0x48e00000: 701, // ts
0x49400000: 702, // tt
0x49400106: 703, // tt-RU
0x4a400000: 704, // twq
0x4a4000d4: 705, // twq-NE
0x4a900000: 706, // tzm
0x4a9000ba: 707, // tzm-MA
0x4ac00000: 708, // ug
0x4ac00053: 709, // ug-CN
0x4ae00000: 710, // uk
0x4ae00130: 711, // uk-UA
0x4b400000: 712, // ur
0x4b400099: 713, // ur-IN
0x4b4000e8: 714, // ur-PK
0x4bc00000: 715, // uz
0x4bc05000: 716, // uz-Arab
0x4bc05024: 717, // uz-Arab-AF
0x4bc1f000: 718, // uz-Cyrl
0x4bc1f137: 719, // uz-Cyrl-UZ
0x4bc57000: 720, // uz-Latn
0x4bc57137: 721, // uz-Latn-UZ
0x4be00000: 722, // vai
0x4be57000: 723, // vai-Latn
0x4be570b4: 724, // vai-Latn-LR
0x4bee3000: 725, // vai-Vaii
0x4bee30b4: 726, // vai-Vaii-LR
0x4c000000: 727, // ve
0x4c300000: 728, // vi
0x4c30013e: 729, // vi-VN
0x4c900000: 730, // vo
0x4c900001: 731, // vo-001
0x4cc00000: 732, // vun
0x4cc0012f: 733, // vun-TZ
0x4ce00000: 734, // wa
0x4cf00000: 735, // wae
0x4cf0004e: 736, // wae-CH
0x4e500000: 737, // wo
0x4e500114: 738, // wo-SN
0x4f200000: 739, // xh
0x4fb00000: 740, // xog
0x4fb00131: 741, // xog-UG
0x50900000: 742, // yav
0x50900052: 743, // yav-CM
0x51200000: 744, // yi
0x51200001: 745, // yi-001
0x51800000: 746, // yo
0x5180003b: 747, // yo-BJ
0x518000d6: 748, // yo-NG
0x51f00000: 749, // yue
0x51f38000: 750, // yue-Hans
0x51f38053: 751, // yue-Hans-CN
0x51f39000: 752, // yue-Hant
0x51f3908d: 753, // yue-Hant-HK
0x52800000: 754, // zgh
0x528000ba: 755, // zgh-MA
0x52900000: 756, // zh
0x52938000: 757, // zh-Hans
0x52938053: 758, // zh-Hans-CN
0x5293808d: 759, // zh-Hans-HK
0x529380c6: 760, // zh-Hans-MO
0x5293810d: 761, // zh-Hans-SG
0x52939000: 762, // zh-Hant
0x5293908d: 763, // zh-Hant-HK
0x529390c6: 764, // zh-Hant-MO
0x5293912e: 765, // zh-Hant-TW
0x52f00000: 766, // zu
0x52f00161: 767, // zu-ZA
}
// Total table size 4676 bytes (4KiB); checksum: 17BE3673

View file

@ -2,8 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go gen_common.go -output tables.go
//go:generate go run gen_index.go
//go:generate go run gen.go -output tables.go
package language
@ -11,47 +10,34 @@ package language
// - verifying that tables are dropped correctly (most notably matcher tables).
import (
"errors"
"fmt"
"strings"
)
const (
// maxCoreSize is the maximum size of a BCP 47 tag without variants and
// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
maxCoreSize = 12
// max99thPercentileSize is a somewhat arbitrary buffer size that presumably
// is large enough to hold at least 99% of the BCP 47 tags.
max99thPercentileSize = 32
// maxSimpleUExtensionSize is the maximum size of a -u extension with one
// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
maxSimpleUExtensionSize = 14
"golang.org/x/text/internal/language"
"golang.org/x/text/internal/language/compact"
)
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
// specific language or locale. All language tag values are guaranteed to be
// well-formed.
type Tag struct {
lang langID
region regionID
// TODO: we will soon run out of positions for script. Idea: instead of
// storing lang, region, and script codes, store only the compact index and
// have a lookup table from this code to its expansion. This greatly speeds
// up table lookup, speed up common variant cases.
// This will also immediately free up 3 extra bytes. Also, the pVariant
// field can now be moved to the lookup table, as the compact index uniquely
// determines the offset of a possible variant.
script scriptID
pVariant byte // offset in str, includes preceding '-'
pExt uint16 // offset of first extension, includes preceding '-'
type Tag compact.Tag
// str is the string representation of the Tag. It will only be used if the
// tag has variants or extensions.
str string
func makeTag(t language.Tag) (tag Tag) {
return Tag(compact.Make(t))
}
func (t *Tag) tag() language.Tag {
return (*compact.Tag)(t).Tag()
}
func (t *Tag) isCompact() bool {
return (*compact.Tag)(t).IsCompact()
}
// TODO: improve performance.
func (t *Tag) lang() language.Language { return t.tag().LangID }
func (t *Tag) region() language.Region { return t.tag().RegionID }
func (t *Tag) script() language.Script { return t.tag().ScriptID }
// Make is a convenience wrapper for Parse that omits the error.
// In case of an error, a sensible default is returned.
func Make(s string) Tag {
@ -68,25 +54,13 @@ func (c CanonType) Make(s string) Tag {
// Raw returns the raw base language, script and region, without making an
// attempt to infer their values.
func (t Tag) Raw() (b Base, s Script, r Region) {
return Base{t.lang}, Script{t.script}, Region{t.region}
}
// equalTags compares language, script and region subtags only.
func (t Tag) equalTags(a Tag) bool {
return t.lang == a.lang && t.script == a.script && t.region == a.region
tt := t.tag()
return Base{tt.LangID}, Script{tt.ScriptID}, Region{tt.RegionID}
}
// IsRoot returns true if t is equal to language "und".
func (t Tag) IsRoot() bool {
if int(t.pVariant) < len(t.str) {
return false
}
return t.equalTags(und)
}
// private reports whether the Tag consists solely of a private use tag.
func (t Tag) private() bool {
return t.str != "" && t.pVariant == 0
return compact.Tag(t).IsRoot()
}
// CanonType can be used to enable or disable various types of canonicalization.
@ -138,73 +112,73 @@ const (
// canonicalize returns the canonicalized equivalent of the tag and
// whether there was any change.
func (t Tag) canonicalize(c CanonType) (Tag, bool) {
func canonicalize(c CanonType, t language.Tag) (language.Tag, bool) {
if c == Raw {
return t, false
}
changed := false
if c&SuppressScript != 0 {
if t.lang < langNoIndexOffset && uint8(t.script) == suppressScript[t.lang] {
t.script = 0
if t.LangID.SuppressScript() == t.ScriptID {
t.ScriptID = 0
changed = true
}
}
if c&canonLang != 0 {
for {
if l, aliasType := normLang(t.lang); l != t.lang {
if l, aliasType := t.LangID.Canonicalize(); l != t.LangID {
switch aliasType {
case langLegacy:
case language.Legacy:
if c&Legacy != 0 {
if t.lang == _sh && t.script == 0 {
t.script = _Latn
if t.LangID == _sh && t.ScriptID == 0 {
t.ScriptID = _Latn
}
t.lang = l
t.LangID = l
changed = true
}
case langMacro:
case language.Macro:
if c&Macro != 0 {
// We deviate here from CLDR. The mapping "nb" -> "no"
// qualifies as a typical Macro language mapping. However,
// for legacy reasons, CLDR maps "no", the macro language
// code for Norwegian, to the dominant variant "nb". This
// change is currently under consideration for CLDR as well.
// See http://unicode.org/cldr/trac/ticket/2698 and also
// http://unicode.org/cldr/trac/ticket/1790 for some of the
// See https://unicode.org/cldr/trac/ticket/2698 and also
// https://unicode.org/cldr/trac/ticket/1790 for some of the
// practical implications. TODO: this check could be removed
// if CLDR adopts this change.
if c&CLDR == 0 || t.lang != _nb {
if c&CLDR == 0 || t.LangID != _nb {
changed = true
t.lang = l
t.LangID = l
}
}
case langDeprecated:
case language.Deprecated:
if c&DeprecatedBase != 0 {
if t.lang == _mo && t.region == 0 {
t.region = _MD
if t.LangID == _mo && t.RegionID == 0 {
t.RegionID = _MD
}
t.lang = l
t.LangID = l
changed = true
// Other canonicalization types may still apply.
continue
}
}
} else if c&Legacy != 0 && t.lang == _no && c&CLDR != 0 {
t.lang = _nb
} else if c&Legacy != 0 && t.LangID == _no && c&CLDR != 0 {
t.LangID = _nb
changed = true
}
break
}
}
if c&DeprecatedScript != 0 {
if t.script == _Qaai {
if t.ScriptID == _Qaai {
changed = true
t.script = _Zinh
t.ScriptID = _Zinh
}
}
if c&DeprecatedRegion != 0 {
if r := normRegion(t.region); r != 0 {
if r := t.RegionID.Canonicalize(); r != t.RegionID {
changed = true
t.region = r
t.RegionID = r
}
}
return t, changed
@ -212,11 +186,20 @@ func (t Tag) canonicalize(c CanonType) (Tag, bool) {
// Canonicalize returns the canonicalized equivalent of the tag.
func (c CanonType) Canonicalize(t Tag) (Tag, error) {
t, changed := t.canonicalize(c)
if changed {
t.remakeString()
// First try fast path.
if t.isCompact() {
if _, changed := canonicalize(c, compact.Tag(t).Tag()); !changed {
return t, nil
}
}
// It is unlikely that one will canonicalize a tag after matching. So do
// a slow but simple approach here.
if tag, changed := canonicalize(c, t.tag()); changed {
tag.RemakeString()
return makeTag(tag), nil
}
return t, nil
}
// Confidence indicates the level of certainty for a given return value.
@ -239,83 +222,21 @@ func (c Confidence) String() string {
return confName[c]
}
// remakeString is used to update t.str in case lang, script or region changed.
// It is assumed that pExt and pVariant still point to the start of the
// respective parts.
func (t *Tag) remakeString() {
if t.str == "" {
return
}
extra := t.str[t.pVariant:]
if t.pVariant > 0 {
extra = extra[1:]
}
if t.equalTags(und) && strings.HasPrefix(extra, "x-") {
t.str = extra
t.pVariant = 0
t.pExt = 0
return
}
var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
b := buf[:t.genCoreBytes(buf[:])]
if extra != "" {
diff := len(b) - int(t.pVariant)
b = append(b, '-')
b = append(b, extra...)
t.pVariant = uint8(int(t.pVariant) + diff)
t.pExt = uint16(int(t.pExt) + diff)
} else {
t.pVariant = uint8(len(b))
t.pExt = uint16(len(b))
}
t.str = string(b)
}
// genCoreBytes writes a string for the base languages, script and region tags
// to the given buffer and returns the number of bytes written. It will never
// write more than maxCoreSize bytes.
func (t *Tag) genCoreBytes(buf []byte) int {
n := t.lang.stringToBuf(buf[:])
if t.script != 0 {
n += copy(buf[n:], "-")
n += copy(buf[n:], t.script.String())
}
if t.region != 0 {
n += copy(buf[n:], "-")
n += copy(buf[n:], t.region.String())
}
return n
}
// String returns the canonical string representation of the language tag.
func (t Tag) String() string {
if t.str != "" {
return t.str
}
if t.script == 0 && t.region == 0 {
return t.lang.String()
}
buf := [maxCoreSize]byte{}
return string(buf[:t.genCoreBytes(buf[:])])
return t.tag().String()
}
// MarshalText implements encoding.TextMarshaler.
func (t Tag) MarshalText() (text []byte, err error) {
if t.str != "" {
text = append(text, t.str...)
} else if t.script == 0 && t.region == 0 {
text = append(text, t.lang.String()...)
} else {
buf := [maxCoreSize]byte{}
text = buf[:t.genCoreBytes(buf[:])]
}
return text, nil
return t.tag().MarshalText()
}
// UnmarshalText implements encoding.TextUnmarshaler.
func (t *Tag) UnmarshalText(text []byte) error {
tag, err := Raw.Parse(string(text))
*t = tag
var tag language.Tag
err := tag.UnmarshalText(text)
*t = makeTag(tag)
return err
}
@ -323,15 +244,16 @@ func (t *Tag) UnmarshalText(text []byte) error {
// unspecified, an attempt will be made to infer it from the context.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Base() (Base, Confidence) {
if t.lang != 0 {
return Base{t.lang}, Exact
if b := t.lang(); b != 0 {
return Base{b}, Exact
}
tt := t.tag()
c := High
if t.script == 0 && !(Region{t.region}).IsCountry() {
if tt.ScriptID == 0 && !tt.RegionID.IsCountry() {
c = Low
}
if tag, err := addTags(t); err == nil && tag.lang != 0 {
return Base{tag.lang}, c
if tag, err := tt.Maximize(); err == nil && tag.LangID != 0 {
return Base{tag.LangID}, c
}
return Base{0}, No
}
@ -344,35 +266,34 @@ func (t Tag) Base() (Base, Confidence) {
// If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)
// as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks
// common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.
// See http://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
// See https://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
// unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified.
// Note that an inferred script is never guaranteed to be the correct one. Latin is
// almost exclusively used for Afrikaans, but Arabic has been used for some texts
// in the past. Also, the script that is commonly used may change over time.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Script() (Script, Confidence) {
if t.script != 0 {
return Script{t.script}, Exact
if scr := t.script(); scr != 0 {
return Script{scr}, Exact
}
sc, c := scriptID(_Zzzz), No
if t.lang < langNoIndexOffset {
if scr := scriptID(suppressScript[t.lang]); scr != 0 {
// Note: it is not always the case that a language with a suppress
// script value is only written in one script (e.g. kk, ms, pa).
if t.region == 0 {
return Script{scriptID(scr)}, High
}
sc, c = scr, High
tt := t.tag()
sc, c := language.Script(_Zzzz), No
if scr := tt.LangID.SuppressScript(); scr != 0 {
// Note: it is not always the case that a language with a suppress
// script value is only written in one script (e.g. kk, ms, pa).
if tt.RegionID == 0 {
return Script{scr}, High
}
sc, c = scr, High
}
if tag, err := addTags(t); err == nil {
if tag.script != sc {
sc, c = tag.script, Low
if tag, err := tt.Maximize(); err == nil {
if tag.ScriptID != sc {
sc, c = tag.ScriptID, Low
}
} else {
t, _ = (Deprecated | Macro).Canonicalize(t)
if tag, err := addTags(t); err == nil && tag.script != sc {
sc, c = tag.script, Low
tt, _ = canonicalize(Deprecated|Macro, tt)
if tag, err := tt.Maximize(); err == nil && tag.ScriptID != sc {
sc, c = tag.ScriptID, Low
}
}
return Script{sc}, c
@ -382,28 +303,31 @@ func (t Tag) Script() (Script, Confidence) {
// infer a most likely candidate from the context.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Region() (Region, Confidence) {
if t.region != 0 {
return Region{t.region}, Exact
if r := t.region(); r != 0 {
return Region{r}, Exact
}
if t, err := addTags(t); err == nil {
return Region{t.region}, Low // TODO: differentiate between high and low.
tt := t.tag()
if tt, err := tt.Maximize(); err == nil {
return Region{tt.RegionID}, Low // TODO: differentiate between high and low.
}
t, _ = (Deprecated | Macro).Canonicalize(t)
if tag, err := addTags(t); err == nil {
return Region{tag.region}, Low
tt, _ = canonicalize(Deprecated|Macro, tt)
if tag, err := tt.Maximize(); err == nil {
return Region{tag.RegionID}, Low
}
return Region{_ZZ}, No // TODO: return world instead of undetermined?
}
// Variant returns the variants specified explicitly for this language tag.
// Variants returns the variants specified explicitly for this language tag.
// or nil if no variant was specified.
func (t Tag) Variants() []Variant {
if !compact.Tag(t).MayHaveVariants() {
return nil
}
v := []Variant{}
if int(t.pVariant) < int(t.pExt) {
for x, str := "", t.str[t.pVariant:t.pExt]; str != ""; {
x, str = nextToken(str)
v = append(v, Variant{x})
}
x, str := "", t.tag().Variants()
for str != "" {
x, str = nextToken(str)
v = append(v, Variant{x})
}
return v
}
@ -411,57 +335,13 @@ func (t Tag) Variants() []Variant {
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
// specific language are substituted with fields from the parent language.
// The parent for a language may change for newer versions of CLDR.
//
// Parent returns a tag for a less specific language that is mutually
// intelligible or Und if there is no such language. This may not be the same as
// simply stripping the last BCP 47 subtag. For instance, the parent of "zh-TW"
// is "zh-Hant", and the parent of "zh-Hant" is "und".
func (t Tag) Parent() Tag {
if t.str != "" {
// Strip the variants and extensions.
t, _ = Raw.Compose(t.Raw())
if t.region == 0 && t.script != 0 && t.lang != 0 {
base, _ := addTags(Tag{lang: t.lang})
if base.script == t.script {
return Tag{lang: t.lang}
}
}
return t
}
if t.lang != 0 {
if t.region != 0 {
maxScript := t.script
if maxScript == 0 {
max, _ := addTags(t)
maxScript = max.script
}
for i := range parents {
if langID(parents[i].lang) == t.lang && scriptID(parents[i].maxScript) == maxScript {
for _, r := range parents[i].fromRegion {
if regionID(r) == t.region {
return Tag{
lang: t.lang,
script: scriptID(parents[i].script),
region: regionID(parents[i].toRegion),
}
}
}
}
}
// Strip the script if it is the default one.
base, _ := addTags(Tag{lang: t.lang})
if base.script != maxScript {
return Tag{lang: t.lang, script: maxScript}
}
return Tag{lang: t.lang}
} else if t.script != 0 {
// The parent for an base-script pair with a non-default script is
// "und" instead of the base language.
base, _ := addTags(Tag{lang: t.lang})
if base.script != t.script {
return und
}
return Tag{lang: t.lang}
}
}
return und
return Tag(compact.Tag(t).Parent())
}
// returns token t and the rest of the string.
@ -487,17 +367,8 @@ func (e Extension) String() string {
// ParseExtension parses s as an extension and returns it on success.
func ParseExtension(s string) (e Extension, err error) {
scan := makeScannerString(s)
var end int
if n := len(scan.token); n != 1 {
return Extension{}, errSyntax
}
scan.toLower(0, len(scan.b))
end = parseExtension(&scan)
if end != len(s) {
return Extension{}, errSyntax
}
return Extension{string(scan.b)}, nil
ext, err := language.ParseExtension(s)
return Extension{ext}, err
}
// Type returns the one-byte extension type of e. It returns 0 for the zero
@ -518,22 +389,20 @@ func (e Extension) Tokens() []string {
// false for ok if t does not have the requested extension. The returned
// extension will be invalid in this case.
func (t Tag) Extension(x byte) (ext Extension, ok bool) {
for i := int(t.pExt); i < len(t.str)-1; {
var ext string
i, ext = getExtension(t.str, i)
if ext[0] == x {
return Extension{ext}, true
}
if !compact.Tag(t).MayHaveExtensions() {
return Extension{}, false
}
return Extension{}, false
e, ok := t.tag().Extension(x)
return Extension{e}, ok
}
// Extensions returns all extensions of t.
func (t Tag) Extensions() []Extension {
if !compact.Tag(t).MayHaveExtensions() {
return nil
}
e := []Extension{}
for i := int(t.pExt); i < len(t.str)-1; {
var ext string
i, ext = getExtension(t.str, i)
for _, ext := range t.tag().Extensions() {
e = append(e, Extension{ext})
}
return e
@ -541,259 +410,105 @@ func (t Tag) Extensions() []Extension {
// TypeForKey returns the type associated with the given key, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// TypeForKey will traverse the inheritance chain to get the correct value.
func (t Tag) TypeForKey(key string) string {
if start, end, _ := t.findTypeForKey(key); end != start {
return t.str[start:end]
if !compact.Tag(t).MayHaveExtensions() {
if key != "rg" && key != "va" {
return ""
}
}
return ""
return t.tag().TypeForKey(key)
}
var (
errPrivateUse = errors.New("cannot set a key on a private use tag")
errInvalidArguments = errors.New("invalid key or type")
)
// SetTypeForKey returns a new Tag with the key set to type, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// An empty value removes an existing pair with the same key.
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
if t.private() {
return t, errPrivateUse
}
if len(key) != 2 {
return t, errInvalidArguments
}
// Remove the setting if value is "".
if value == "" {
start, end, _ := t.findTypeForKey(key)
if start != end {
// Remove key tag and leading '-'.
start -= 4
// Remove a possible empty extension.
if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
start -= 2
}
if start == int(t.pVariant) && end == len(t.str) {
t.str = ""
t.pVariant, t.pExt = 0, 0
} else {
t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
}
}
return t, nil
}
if len(value) < 3 || len(value) > 8 {
return t, errInvalidArguments
}
var (
buf [maxCoreSize + maxSimpleUExtensionSize]byte
uStart int // start of the -u extension.
)
// Generate the tag string if needed.
if t.str == "" {
uStart = t.genCoreBytes(buf[:])
buf[uStart] = '-'
uStart++
}
// Create new key-type pair and parse it to verify.
b := buf[uStart:]
copy(b, "u-")
copy(b[2:], key)
b[4] = '-'
b = b[:5+copy(b[5:], value)]
scan := makeScanner(b)
if parseExtensions(&scan); scan.err != nil {
return t, scan.err
}
// Assemble the replacement string.
if t.str == "" {
t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
t.str = string(buf[:uStart+len(b)])
} else {
s := t.str
start, end, hasExt := t.findTypeForKey(key)
if start == end {
if hasExt {
b = b[2:]
}
t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
} else {
t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
}
}
return t, nil
tt, err := t.tag().SetTypeForKey(key, value)
return makeTag(tt), err
}
// findKeyAndType returns the start and end position for the type corresponding
// to key or the point at which to insert the key-value pair if the type
// wasn't found. The hasExt return value reports whether an -u extension was present.
// Note: the extensions are typically very small and are likely to contain
// only one key-type pair.
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
p := int(t.pExt)
if len(key) != 2 || p == len(t.str) || p == 0 {
return p, p, false
}
s := t.str
// Find the correct extension.
for p++; s[p] != 'u'; p++ {
if s[p] > 'u' {
p--
return p, p, false
}
if p = nextExtension(s, p); p == len(s) {
return len(s), len(s), false
}
}
// Proceed to the hyphen following the extension name.
p++
// curKey is the key currently being processed.
curKey := ""
// Iterate over keys until we get the end of a section.
for {
// p points to the hyphen preceding the current token.
if p3 := p + 3; s[p3] == '-' {
// Found a key.
// Check whether we just processed the key that was requested.
if curKey == key {
return start, p, true
}
// Set to the next key and continue scanning type tokens.
curKey = s[p+1 : p3]
if curKey > key {
return p, p, true
}
// Start of the type token sequence.
start = p + 4
// A type is at least 3 characters long.
p += 7 // 4 + 3
} else {
// Attribute or type, which is at least 3 characters long.
p += 4
}
// p points past the third character of a type or attribute.
max := p + 5 // maximum length of token plus hyphen.
if len(s) < max {
max = len(s)
}
for ; p < max && s[p] != '-'; p++ {
}
// Bail if we have exhausted all tokens or if the next token starts
// a new extension.
if p == len(s) || s[p+2] == '-' {
if curKey == key {
return start, p, true
}
return p, p, true
}
}
}
// NumCompactTags is the number of compact tags. The maximum tag is
// NumCompactTags-1.
const NumCompactTags = compact.NumCompactTags
// CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
// for which data exists in the text repository. The index will change over time
// and should not be stored in persistent storage. Extensions, except for the
// 'va' type of the 'u' extension, are ignored. It will return 0, false if no
// compact tag exists, where 0 is the index for the root language (Und).
func CompactIndex(t Tag) (index int, ok bool) {
// TODO: perhaps give more frequent tags a lower index.
// TODO: we could make the indexes stable. This will excluded some
// possibilities for optimization, so don't do this quite yet.
b, s, r := t.Raw()
if len(t.str) > 0 {
if strings.HasPrefix(t.str, "x-") {
// We have no entries for user-defined tags.
return 0, false
}
if uint16(t.pVariant) != t.pExt {
// There are no tags with variants and an u-va type.
if t.TypeForKey("va") != "" {
return 0, false
}
t, _ = Raw.Compose(b, s, r, t.Variants())
} else if _, ok := t.Extension('u'); ok {
// Strip all but the 'va' entry.
variant := t.TypeForKey("va")
t, _ = Raw.Compose(b, s, r)
t, _ = t.SetTypeForKey("va", variant)
}
if len(t.str) > 0 {
// We have some variants.
for i, s := range specialTags {
if s == t {
return i + 1, true
}
}
return 0, false
}
}
// No variants specified: just compare core components.
// The key has the form lllssrrr, where l, s, and r are nibbles for
// respectively the langID, scriptID, and regionID.
key := uint32(b.langID) << (8 + 12)
key |= uint32(s.scriptID) << 12
key |= uint32(r.regionID)
x, ok := coreTags[key]
return int(x), ok
// for which data exists in the text repository.The index will change over time
// and should not be stored in persistent storage. If t does not match a compact
// index, exact will be false and the compact index will be returned for the
// first match after repeatedly taking the Parent of t.
func CompactIndex(t Tag) (index int, exact bool) {
id, exact := compact.LanguageID(compact.Tag(t))
return int(id), exact
}
var root = language.Tag{}
// Base is an ISO 639 language code, used for encoding the base language
// of a language tag.
type Base struct {
langID
langID language.Language
}
// ParseBase parses a 2- or 3-letter ISO 639 code.
// It returns a ValueError if s is a well-formed but unknown language identifier
// or another error if another error occurred.
func ParseBase(s string) (Base, error) {
if n := len(s); n < 2 || 3 < n {
return Base{}, errSyntax
}
var buf [3]byte
l, err := getLangID(buf[:copy(buf[:], s)])
l, err := language.ParseBase(s)
return Base{l}, err
}
// String returns the BCP 47 representation of the base language.
func (b Base) String() string {
return b.langID.String()
}
// ISO3 returns the ISO 639-3 language code.
func (b Base) ISO3() string {
return b.langID.ISO3()
}
// IsPrivateUse reports whether this language code is reserved for private use.
func (b Base) IsPrivateUse() bool {
return b.langID.IsPrivateUse()
}
// Script is a 4-letter ISO 15924 code for representing scripts.
// It is idiomatically represented in title case.
type Script struct {
scriptID
scriptID language.Script
}
// ParseScript parses a 4-letter ISO 15924 code.
// It returns a ValueError if s is a well-formed but unknown script identifier
// or another error if another error occurred.
func ParseScript(s string) (Script, error) {
if len(s) != 4 {
return Script{}, errSyntax
}
var buf [4]byte
sc, err := getScriptID(script, buf[:copy(buf[:], s)])
sc, err := language.ParseScript(s)
return Script{sc}, err
}
// String returns the script code in title case.
// It returns "Zzzz" for an unspecified script.
func (s Script) String() string {
return s.scriptID.String()
}
// IsPrivateUse reports whether this script code is reserved for private use.
func (s Script) IsPrivateUse() bool {
return s.scriptID.IsPrivateUse()
}
// Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.
type Region struct {
regionID
regionID language.Region
}
// EncodeM49 returns the Region for the given UN M.49 code.
// It returns an error if r is not a valid code.
func EncodeM49(r int) (Region, error) {
rid, err := getRegionM49(r)
rid, err := language.EncodeM49(r)
return Region{rid}, err
}
@ -801,62 +516,54 @@ func EncodeM49(r int) (Region, error) {
// It returns a ValueError if s is a well-formed but unknown region identifier
// or another error if another error occurred.
func ParseRegion(s string) (Region, error) {
if n := len(s); n < 2 || 3 < n {
return Region{}, errSyntax
}
var buf [3]byte
r, err := getRegionID(buf[:copy(buf[:], s)])
r, err := language.ParseRegion(s)
return Region{r}, err
}
// String returns the BCP 47 representation for the region.
// It returns "ZZ" for an unspecified region.
func (r Region) String() string {
return r.regionID.String()
}
// ISO3 returns the 3-letter ISO code of r.
// Note that not all regions have a 3-letter ISO code.
// In such cases this method returns "ZZZ".
func (r Region) ISO3() string {
return r.regionID.ISO3()
}
// M49 returns the UN M.49 encoding of r, or 0 if this encoding
// is not defined for r.
func (r Region) M49() int {
return r.regionID.M49()
}
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
// may include private-use tags that are assigned by CLDR and used in this
// implementation. So IsPrivateUse and IsCountry can be simultaneously true.
func (r Region) IsPrivateUse() bool {
return r.regionID.IsPrivateUse()
}
// IsCountry returns whether this region is a country or autonomous area. This
// includes non-standard definitions from CLDR.
func (r Region) IsCountry() bool {
if r.regionID == 0 || r.IsGroup() || r.IsPrivateUse() && r.regionID != _XK {
return false
}
return true
return r.regionID.IsCountry()
}
// IsGroup returns whether this region defines a collection of regions. This
// includes non-standard definitions from CLDR.
func (r Region) IsGroup() bool {
if r.regionID == 0 {
return false
}
return int(regionInclusion[r.regionID]) < len(regionContainment)
return r.regionID.IsGroup()
}
// Contains returns whether Region c is contained by Region r. It returns true
// if c == r.
func (r Region) Contains(c Region) bool {
return r.regionID.contains(c.regionID)
return r.regionID.Contains(c.regionID)
}
func (r regionID) contains(c regionID) bool {
if r == c {
return true
}
g := regionInclusion[r]
if g >= nRegionGroups {
return false
}
m := regionContainment[g]
d := regionInclusion[c]
b := regionInclusionBits[d]
// A contained country may belong to multiple disjoint groups. Matching any
// of these indicates containment. If the contained region is a group, it
// must strictly be a subset.
if d >= nRegionGroups {
return b&m != 0
}
return b&^m == 0
}
var errNoTLD = errors.New("language: region is not a valid ccTLD")
// TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
// In all other cases it returns either the region itself or an error.
//
@ -865,25 +572,15 @@ var errNoTLD = errors.New("language: region is not a valid ccTLD")
// region will already be canonicalized it was obtained from a Tag that was
// obtained using any of the default methods.
func (r Region) TLD() (Region, error) {
// See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
// difference between ISO 3166-1 and IANA ccTLD.
if r.regionID == _GB {
r = Region{_UK}
}
if (r.typ() & ccTLD) == 0 {
return Region{}, errNoTLD
}
return r, nil
tld, err := r.regionID.TLD()
return Region{tld}, err
}
// Canonicalize returns the region or a possible replacement if the region is
// deprecated. It will not return a replacement for deprecated regions that
// are split into multiple regions.
func (r Region) Canonicalize() Region {
if cr := normRegion(r.regionID); cr != 0 {
return Region{cr}
}
return r
return Region{r.regionID.Canonicalize()}
}
// Variant represents a registered variant of a language as defined by BCP 47.
@ -894,11 +591,8 @@ type Variant struct {
// ParseVariant parses and returns a Variant. An error is returned if s is not
// a valid variant.
func ParseVariant(s string) (Variant, error) {
s = strings.ToLower(s)
if _, ok := variantIndex[s]; ok {
return Variant{s}, nil
}
return Variant{}, mkErrInvalid([]byte(s))
v, err := language.ParseVariant(s)
return Variant{v.String()}, err
}
// String returns the string representation of the variant.

View file

@ -4,7 +4,12 @@
package language
import "errors"
import (
"errors"
"strings"
"golang.org/x/text/internal/language"
)
// A MatchOption configures a Matcher.
type MatchOption func(*matcher)
@ -74,12 +79,13 @@ func NewMatcher(t []Tag, options ...MatchOption) Matcher {
}
func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
var tt language.Tag
match, w, c := m.getBest(want...)
if match != nil {
t, index = match.tag, match.index
tt, index = match.tag, match.index
} else {
// TODO: this should be an option
t = m.default_.tag
tt = m.default_.tag
if m.preferSameScript {
outer:
for _, w := range want {
@ -91,7 +97,7 @@ func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
}
for i, h := range m.supported {
if script.scriptID == h.maxScript {
t, index = h.tag, i
tt, index = h.tag, i
break outer
}
}
@ -99,238 +105,45 @@ func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
}
// TODO: select first language tag based on script.
}
if w.region != 0 && t.region != 0 && t.region.contains(w.region) {
t, _ = Raw.Compose(t, Region{w.region})
if w.RegionID != tt.RegionID && w.RegionID != 0 {
if w.RegionID != 0 && tt.RegionID != 0 && tt.RegionID.Contains(w.RegionID) {
tt.RegionID = w.RegionID
tt.RemakeString()
} else if r := w.RegionID.String(); len(r) == 2 {
// TODO: also filter macro and deprecated.
tt, _ = tt.SetTypeForKey("rg", strings.ToLower(r)+"zzzz")
}
}
// Copy options from the user-provided tag into the result tag. This is hard
// to do after the fact, so we do it here.
// TODO: add in alternative variants to -u-va-.
// TODO: add preferred region to -u-rg-.
if e := w.Extensions(); len(e) > 0 {
t, _ = Raw.Compose(t, e)
}
return t, index, c
}
type scriptRegionFlags uint8
const (
isList = 1 << iota
scriptInFrom
regionInFrom
)
func (t *Tag) setUndefinedLang(id langID) {
if t.lang == 0 {
t.lang = id
}
}
func (t *Tag) setUndefinedScript(id scriptID) {
if t.script == 0 {
t.script = id
}
}
func (t *Tag) setUndefinedRegion(id regionID) {
if t.region == 0 || t.region.contains(id) {
t.region = id
b := language.Builder{}
b.SetTag(tt)
for _, e := range e {
b.AddExt(e)
}
tt = b.Make()
}
return makeTag(tt), index, c
}
// ErrMissingLikelyTagsData indicates no information was available
// to compute likely values of missing tags.
var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
// addLikelySubtags sets subtags to their most likely value, given the locale.
// In most cases this means setting fields for unknown values, but in some
// cases it may alter a value. It returns an ErrMissingLikelyTagsData error
// if the given locale cannot be expanded.
func (t Tag) addLikelySubtags() (Tag, error) {
id, err := addTags(t)
if err != nil {
return t, err
} else if id.equalTags(t) {
return t, nil
}
id.remakeString()
return id, nil
}
// specializeRegion attempts to specialize a group region.
func specializeRegion(t *Tag) bool {
if i := regionInclusion[t.region]; i < nRegionGroups {
x := likelyRegionGroup[i]
if langID(x.lang) == t.lang && scriptID(x.script) == t.script {
t.region = regionID(x.region)
}
return true
}
return false
}
func addTags(t Tag) (Tag, error) {
// We leave private use identifiers alone.
if t.private() {
return t, nil
}
if t.script != 0 && t.region != 0 {
if t.lang != 0 {
// already fully specified
specializeRegion(&t)
return t, nil
}
// Search matches for und-script-region. Note that for these cases
// region will never be a group so there is no need to check for this.
list := likelyRegion[t.region : t.region+1]
if x := list[0]; x.flags&isList != 0 {
list = likelyRegionList[x.lang : x.lang+uint16(x.script)]
}
for _, x := range list {
// Deviating from the spec. See match_test.go for details.
if scriptID(x.script) == t.script {
t.setUndefinedLang(langID(x.lang))
return t, nil
}
}
}
if t.lang != 0 {
// Search matches for lang-script and lang-region, where lang != und.
if t.lang < langNoIndexOffset {
x := likelyLang[t.lang]
if x.flags&isList != 0 {
list := likelyLangList[x.region : x.region+uint16(x.script)]
if t.script != 0 {
for _, x := range list {
if scriptID(x.script) == t.script && x.flags&scriptInFrom != 0 {
t.setUndefinedRegion(regionID(x.region))
return t, nil
}
}
} else if t.region != 0 {
count := 0
goodScript := true
tt := t
for _, x := range list {
// We visit all entries for which the script was not
// defined, including the ones where the region was not
// defined. This allows for proper disambiguation within
// regions.
if x.flags&scriptInFrom == 0 && t.region.contains(regionID(x.region)) {
tt.region = regionID(x.region)
tt.setUndefinedScript(scriptID(x.script))
goodScript = goodScript && tt.script == scriptID(x.script)
count++
}
}
if count == 1 {
return tt, nil
}
// Even if we fail to find a unique Region, we might have
// an unambiguous script.
if goodScript {
t.script = tt.script
}
}
}
}
} else {
// Search matches for und-script.
if t.script != 0 {
x := likelyScript[t.script]
if x.region != 0 {
t.setUndefinedRegion(regionID(x.region))
t.setUndefinedLang(langID(x.lang))
return t, nil
}
}
// Search matches for und-region. If und-script-region exists, it would
// have been found earlier.
if t.region != 0 {
if i := regionInclusion[t.region]; i < nRegionGroups {
x := likelyRegionGroup[i]
if x.region != 0 {
t.setUndefinedLang(langID(x.lang))
t.setUndefinedScript(scriptID(x.script))
t.region = regionID(x.region)
}
} else {
x := likelyRegion[t.region]
if x.flags&isList != 0 {
x = likelyRegionList[x.lang]
}
if x.script != 0 && x.flags != scriptInFrom {
t.setUndefinedLang(langID(x.lang))
t.setUndefinedScript(scriptID(x.script))
return t, nil
}
}
}
}
// Search matches for lang.
if t.lang < langNoIndexOffset {
x := likelyLang[t.lang]
if x.flags&isList != 0 {
x = likelyLangList[x.region]
}
if x.region != 0 {
t.setUndefinedScript(scriptID(x.script))
t.setUndefinedRegion(regionID(x.region))
}
specializeRegion(&t)
if t.lang == 0 {
t.lang = _en // default language
}
return t, nil
}
return t, ErrMissingLikelyTagsData
}
func (t *Tag) setTagsFrom(id Tag) {
t.lang = id.lang
t.script = id.script
t.region = id.region
}
// minimize removes the region or script subtags from t such that
// t.addLikelySubtags() == t.minimize().addLikelySubtags().
func (t Tag) minimize() (Tag, error) {
t, err := minimizeTags(t)
if err != nil {
return t, err
}
t.remakeString()
return t, nil
}
// minimizeTags mimics the behavior of the ICU 51 C implementation.
func minimizeTags(t Tag) (Tag, error) {
if t.equalTags(und) {
return t, nil
}
max, err := addTags(t)
if err != nil {
return t, err
}
for _, id := range [...]Tag{
{lang: t.lang},
{lang: t.lang, region: t.region},
{lang: t.lang, script: t.script},
} {
if x, err := addTags(id); err == nil && max.equalTags(x) {
t.setTagsFrom(id)
break
}
}
return t, nil
}
// func (t *Tag) setTagsFrom(id Tag) {
// t.LangID = id.LangID
// t.ScriptID = id.ScriptID
// t.RegionID = id.RegionID
// }
// Tag Matching
// CLDR defines an algorithm for finding the best match between two sets of language
// tags. The basic algorithm defines how to score a possible match and then find
// the match with the best score
// (see http://www.unicode.org/reports/tr35/#LanguageMatching).
// (see https://www.unicode.org/reports/tr35/#LanguageMatching).
// Using scoring has several disadvantages. The scoring obfuscates the importance of
// the various factors considered, making the algorithm harder to understand. Using
// scoring also requires the full score to be computed for each pair of tags.
@ -441,7 +254,7 @@ func minimizeTags(t Tag) (Tag, error) {
type matcher struct {
default_ *haveTag
supported []*haveTag
index map[langID]*matchHeader
index map[language.Language]*matchHeader
passSettings bool
preferSameScript bool
}
@ -456,7 +269,7 @@ type matchHeader struct {
// haveTag holds a supported Tag and its maximized script and region. The maximized
// or canonicalized language is not stored as it is not needed during matching.
type haveTag struct {
tag Tag
tag language.Tag
// index of this tag in the original list of supported tags.
index int
@ -466,37 +279,37 @@ type haveTag struct {
conf Confidence
// Maximized region and script.
maxRegion regionID
maxScript scriptID
maxRegion language.Region
maxScript language.Script
// altScript may be checked as an alternative match to maxScript. If altScript
// matches, the confidence level for this match is Low. Theoretically there
// could be multiple alternative scripts. This does not occur in practice.
altScript scriptID
altScript language.Script
// nextMax is the index of the next haveTag with the same maximized tags.
nextMax uint16
}
func makeHaveTag(tag Tag, index int) (haveTag, langID) {
func makeHaveTag(tag language.Tag, index int) (haveTag, language.Language) {
max := tag
if tag.lang != 0 || tag.region != 0 || tag.script != 0 {
max, _ = max.canonicalize(All)
max, _ = addTags(max)
max.remakeString()
if tag.LangID != 0 || tag.RegionID != 0 || tag.ScriptID != 0 {
max, _ = canonicalize(All, max)
max, _ = max.Maximize()
max.RemakeString()
}
return haveTag{tag, index, Exact, max.region, max.script, altScript(max.lang, max.script), 0}, max.lang
return haveTag{tag, index, Exact, max.RegionID, max.ScriptID, altScript(max.LangID, max.ScriptID), 0}, max.LangID
}
// altScript returns an alternative script that may match the given script with
// a low confidence. At the moment, the langMatch data allows for at most one
// script to map to another and we rely on this to keep the code simple.
func altScript(l langID, s scriptID) scriptID {
func altScript(l language.Language, s language.Script) language.Script {
for _, alt := range matchScript {
// TODO: also match cases where language is not the same.
if (langID(alt.wantLang) == l || langID(alt.haveLang) == l) &&
scriptID(alt.haveScript) == s {
return scriptID(alt.wantScript)
if (language.Language(alt.wantLang) == l || language.Language(alt.haveLang) == l) &&
language.Script(alt.haveScript) == s {
return language.Script(alt.wantScript)
}
}
return 0
@ -508,7 +321,7 @@ func (h *matchHeader) addIfNew(n haveTag, exact bool) {
h.original = h.original || exact
// Don't add new exact matches.
for _, v := range h.haveTags {
if v.tag.equalsRest(n.tag) {
if equalsRest(v.tag, n.tag) {
return
}
}
@ -517,7 +330,7 @@ func (h *matchHeader) addIfNew(n haveTag, exact bool) {
for i, v := range h.haveTags {
if v.maxScript == n.maxScript &&
v.maxRegion == n.maxRegion &&
v.tag.variantOrPrivateTagStr() == n.tag.variantOrPrivateTagStr() {
v.tag.VariantOrPrivateUseTags() == n.tag.VariantOrPrivateUseTags() {
for h.haveTags[i].nextMax != 0 {
i = int(h.haveTags[i].nextMax)
}
@ -530,7 +343,7 @@ func (h *matchHeader) addIfNew(n haveTag, exact bool) {
// header returns the matchHeader for the given language. It creates one if
// it doesn't already exist.
func (m *matcher) header(l langID) *matchHeader {
func (m *matcher) header(l language.Language) *matchHeader {
if h := m.index[l]; h != nil {
return h
}
@ -554,7 +367,7 @@ func toConf(d uint8) Confidence {
// for a given tag.
func newMatcher(supported []Tag, options []MatchOption) *matcher {
m := &matcher{
index: make(map[langID]*matchHeader),
index: make(map[language.Language]*matchHeader),
preferSameScript: true,
}
for _, o := range options {
@ -567,16 +380,18 @@ func newMatcher(supported []Tag, options []MatchOption) *matcher {
// Add supported languages to the index. Add exact matches first to give
// them precedence.
for i, tag := range supported {
pair, _ := makeHaveTag(tag, i)
m.header(tag.lang).addIfNew(pair, true)
tt := tag.tag()
pair, _ := makeHaveTag(tt, i)
m.header(tt.LangID).addIfNew(pair, true)
m.supported = append(m.supported, &pair)
}
m.default_ = m.header(supported[0].lang).haveTags[0]
m.default_ = m.header(supported[0].lang()).haveTags[0]
// Keep these in two different loops to support the case that two equivalent
// languages are distinguished, such as iw and he.
for i, tag := range supported {
pair, max := makeHaveTag(tag, i)
if max != tag.lang {
tt := tag.tag()
pair, max := makeHaveTag(tt, i)
if max != tt.LangID {
m.header(max).addIfNew(pair, true)
}
}
@ -585,11 +400,11 @@ func newMatcher(supported []Tag, options []MatchOption) *matcher {
// update will only add entries to original indexes, thus not computing any
// transitive relations.
update := func(want, have uint16, conf Confidence) {
if hh := m.index[langID(have)]; hh != nil {
if hh := m.index[language.Language(have)]; hh != nil {
if !hh.original {
return
}
hw := m.header(langID(want))
hw := m.header(language.Language(want))
for _, ht := range hh.haveTags {
v := *ht
if conf < v.conf {
@ -597,7 +412,7 @@ func newMatcher(supported []Tag, options []MatchOption) *matcher {
}
v.nextMax = 0 // this value needs to be recomputed
if v.altScript != 0 {
v.altScript = altScript(langID(want), v.maxScript)
v.altScript = altScript(language.Language(want), v.maxScript)
}
hw.addIfNew(v, conf == Exact && hh.original)
}
@ -618,66 +433,67 @@ func newMatcher(supported []Tag, options []MatchOption) *matcher {
// First we match deprecated equivalents. If they are perfect equivalents
// (their canonicalization simply substitutes a different language code, but
// nothing else), the match confidence is Exact, otherwise it is High.
for i, lm := range langAliasMap {
for i, lm := range language.AliasMap {
// If deprecated codes match and there is no fiddling with the script or
// or region, we consider it an exact match.
conf := Exact
if langAliasTypes[i] != langMacro {
if !isExactEquivalent(langID(lm.from)) {
if language.AliasTypes[i] != language.Macro {
if !isExactEquivalent(language.Language(lm.From)) {
conf = High
}
update(lm.to, lm.from, conf)
update(lm.To, lm.From, conf)
}
update(lm.from, lm.to, conf)
update(lm.From, lm.To, conf)
}
return m
}
// getBest gets the best matching tag in m for any of the given tags, taking into
// account the order of preference of the given tags.
func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
func (m *matcher) getBest(want ...Tag) (got *haveTag, orig language.Tag, c Confidence) {
best := bestMatch{}
for i, w := range want {
var max Tag
for i, ww := range want {
w := ww.tag()
var max language.Tag
// Check for exact match first.
h := m.index[w.lang]
if w.lang != 0 {
h := m.index[w.LangID]
if w.LangID != 0 {
if h == nil {
continue
}
// Base language is defined.
max, _ = w.canonicalize(Legacy | Deprecated | Macro)
max, _ = canonicalize(Legacy|Deprecated|Macro, w)
// A region that is added through canonicalization is stronger than
// a maximized region: set it in the original (e.g. mo -> ro-MD).
if w.region != max.region {
w.region = max.region
if w.RegionID != max.RegionID {
w.RegionID = max.RegionID
}
// TODO: should we do the same for scripts?
// See test case: en, sr, nl ; sh ; sr
max, _ = addTags(max)
max, _ = max.Maximize()
} else {
// Base language is not defined.
if h != nil {
for i := range h.haveTags {
have := h.haveTags[i]
if have.tag.equalsRest(w) {
if equalsRest(have.tag, w) {
return have, w, Exact
}
}
}
if w.script == 0 && w.region == 0 {
if w.ScriptID == 0 && w.RegionID == 0 {
// We skip all tags matching und for approximate matching, including
// private tags.
continue
}
max, _ = addTags(w)
if h = m.index[max.lang]; h == nil {
max, _ = w.Maximize()
if h = m.index[max.LangID]; h == nil {
continue
}
}
pin := true
for _, t := range want[i+1:] {
if w.lang == t.lang {
if w.LangID == t.lang() {
pin = false
break
}
@ -685,11 +501,11 @@ func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
// Check for match based on maximized tag.
for i := range h.haveTags {
have := h.haveTags[i]
best.update(have, w, max.script, max.region, pin)
best.update(have, w, max.ScriptID, max.RegionID, pin)
if best.conf == Exact {
for have.nextMax != 0 {
have = h.haveTags[have.nextMax]
best.update(have, w, max.script, max.region, pin)
best.update(have, w, max.ScriptID, max.RegionID, pin)
}
return best.have, best.want, best.conf
}
@ -697,9 +513,9 @@ func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
}
if best.conf <= No {
if len(want) != 0 {
return nil, want[0], No
return nil, want[0].tag(), No
}
return nil, Tag{}, No
return nil, language.Tag{}, No
}
return best.have, best.want, best.conf
}
@ -707,9 +523,9 @@ func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
// bestMatch accumulates the best match so far.
type bestMatch struct {
have *haveTag
want Tag
want language.Tag
conf Confidence
pinnedRegion regionID
pinnedRegion language.Region
pinLanguage bool
sameRegionGroup bool
// Cached results from applying tie-breaking rules.
@ -734,19 +550,19 @@ type bestMatch struct {
// still prefer a second language over a dialect of the preferred language by
// explicitly specifying dialects, e.g. "en, nl, en-GB". In this case pin should
// be false.
func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion regionID, pin bool) {
func (m *bestMatch) update(have *haveTag, tag language.Tag, maxScript language.Script, maxRegion language.Region, pin bool) {
// Bail if the maximum attainable confidence is below that of the current best match.
c := have.conf
if c < m.conf {
return
}
// Don't change the language once we already have found an exact match.
if m.pinLanguage && tag.lang != m.want.lang {
if m.pinLanguage && tag.LangID != m.want.LangID {
return
}
// Pin the region group if we are comparing tags for the same language.
if tag.lang == m.want.lang && m.sameRegionGroup {
_, sameGroup := regionGroupDist(m.pinnedRegion, have.maxRegion, have.maxScript, m.want.lang)
if tag.LangID == m.want.LangID && m.sameRegionGroup {
_, sameGroup := regionGroupDist(m.pinnedRegion, have.maxRegion, have.maxScript, m.want.LangID)
if !sameGroup {
return
}
@ -756,7 +572,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
// don't pin anything, otherwise pin the language.
m.pinLanguage = pin
}
if have.tag.equalsRest(tag) {
if equalsRest(have.tag, tag) {
} else if have.maxScript != maxScript {
// There is usually very little comprehension between different scripts.
// In a few cases there may still be Low comprehension. This possibility
@ -786,7 +602,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
// Tie-breaker rules:
// We prefer if the pre-maximized language was specified and identical.
origLang := have.tag.lang == tag.lang && tag.lang != 0
origLang := have.tag.LangID == tag.LangID && tag.LangID != 0
if !beaten && m.origLang != origLang {
if m.origLang {
return
@ -795,7 +611,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
}
// We prefer if the pre-maximized region was specified and identical.
origReg := have.tag.region == tag.region && tag.region != 0
origReg := have.tag.RegionID == tag.RegionID && tag.RegionID != 0
if !beaten && m.origReg != origReg {
if m.origReg {
return
@ -803,7 +619,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
beaten = true
}
regGroupDist, sameGroup := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.lang)
regGroupDist, sameGroup := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.LangID)
if !beaten && m.regGroupDist != regGroupDist {
if regGroupDist > m.regGroupDist {
return
@ -811,7 +627,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
beaten = true
}
paradigmReg := isParadigmLocale(tag.lang, have.maxRegion)
paradigmReg := isParadigmLocale(tag.LangID, have.maxRegion)
if !beaten && m.paradigmReg != paradigmReg {
if !paradigmReg {
return
@ -820,7 +636,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
}
// Next we prefer if the pre-maximized script was specified and identical.
origScript := have.tag.script == tag.script && tag.script != 0
origScript := have.tag.ScriptID == tag.ScriptID && tag.ScriptID != 0
if !beaten && m.origScript != origScript {
if m.origScript {
return
@ -843,9 +659,9 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
}
}
func isParadigmLocale(lang langID, r regionID) bool {
func isParadigmLocale(lang language.Language, r language.Region) bool {
for _, e := range paradigmLocales {
if langID(e[0]) == lang && (r == regionID(e[1]) || r == regionID(e[2])) {
if language.Language(e[0]) == lang && (r == language.Region(e[1]) || r == language.Region(e[2])) {
return true
}
}
@ -854,13 +670,13 @@ func isParadigmLocale(lang langID, r regionID) bool {
// regionGroupDist computes the distance between two regions based on their
// CLDR grouping.
func regionGroupDist(a, b regionID, script scriptID, lang langID) (dist uint8, same bool) {
func regionGroupDist(a, b language.Region, script language.Script, lang language.Language) (dist uint8, same bool) {
const defaultDistance = 4
aGroup := uint(regionToGroups[a]) << 1
bGroup := uint(regionToGroups[b]) << 1
for _, ri := range matchRegion {
if langID(ri.lang) == lang && (ri.script == 0 || scriptID(ri.script) == script) {
if language.Language(ri.lang) == lang && (ri.script == 0 || language.Script(ri.script) == script) {
group := uint(1 << (ri.group &^ 0x80))
if 0x80&ri.group == 0 {
if aGroup&bGroup&group != 0 { // Both regions are in the group.
@ -876,31 +692,16 @@ func regionGroupDist(a, b regionID, script scriptID, lang langID) (dist uint8, s
return defaultDistance, true
}
func (t Tag) variants() string {
if t.pVariant == 0 {
return ""
}
return t.str[t.pVariant:t.pExt]
}
// variantOrPrivateTagStr returns variants or private use tags.
func (t Tag) variantOrPrivateTagStr() string {
if t.pExt > 0 {
return t.str[t.pVariant:t.pExt]
}
return t.str[t.pVariant:]
}
// equalsRest compares everything except the language.
func (a Tag) equalsRest(b Tag) bool {
func equalsRest(a, b language.Tag) bool {
// TODO: don't include extensions in this comparison. To do this efficiently,
// though, we should handle private tags separately.
return a.script == b.script && a.region == b.region && a.variantOrPrivateTagStr() == b.variantOrPrivateTagStr()
return a.ScriptID == b.ScriptID && a.RegionID == b.RegionID && a.VariantOrPrivateUseTags() == b.VariantOrPrivateUseTags()
}
// isExactEquivalent returns true if canonicalizing the language will not alter
// the script or region of a tag.
func isExactEquivalent(l langID) bool {
func isExactEquivalent(l language.Language) bool {
for _, o := range notEquivalent {
if o == l {
return false
@ -909,25 +710,26 @@ func isExactEquivalent(l langID) bool {
return true
}
var notEquivalent []langID
var notEquivalent []language.Language
func init() {
// Create a list of all languages for which canonicalization may alter the
// script or region.
for _, lm := range langAliasMap {
tag := Tag{lang: langID(lm.from)}
if tag, _ = tag.canonicalize(All); tag.script != 0 || tag.region != 0 {
notEquivalent = append(notEquivalent, langID(lm.from))
for _, lm := range language.AliasMap {
tag := language.Tag{LangID: language.Language(lm.From)}
if tag, _ = canonicalize(All, tag); tag.ScriptID != 0 || tag.RegionID != 0 {
notEquivalent = append(notEquivalent, language.Language(lm.From))
}
}
// Maximize undefined regions of paradigm locales.
for i, v := range paradigmLocales {
max, _ := addTags(Tag{lang: langID(v[0])})
t := language.Tag{LangID: language.Language(v[0])}
max, _ := t.Maximize()
if v[1] == 0 {
paradigmLocales[i][1] = uint16(max.region)
paradigmLocales[i][1] = uint16(max.RegionID)
}
if v[2] == 0 {
paradigmLocales[i][2] = uint16(max.region)
paradigmLocales[i][2] = uint16(max.RegionID)
}
}
}

View file

@ -5,216 +5,21 @@
package language
import (
"bytes"
"errors"
"fmt"
"sort"
"strconv"
"strings"
"golang.org/x/text/internal/tag"
"golang.org/x/text/internal/language"
)
// isAlpha returns true if the byte is not a digit.
// b must be an ASCII letter or digit.
func isAlpha(b byte) bool {
return b > '9'
}
// isAlphaNum returns true if the string contains only ASCII letters or digits.
func isAlphaNum(s []byte) bool {
for _, c := range s {
if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
return false
}
}
return true
}
// errSyntax is returned by any of the parsing functions when the
// input is not well-formed, according to BCP 47.
// TODO: return the position at which the syntax error occurred?
var errSyntax = errors.New("language: tag is not well-formed")
// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
type ValueError struct {
v [8]byte
}
type ValueError interface {
error
func mkErrInvalid(s []byte) error {
var e ValueError
copy(e.v[:], s)
return e
}
func (e ValueError) tag() []byte {
n := bytes.IndexByte(e.v[:], 0)
if n == -1 {
n = 8
}
return e.v[:n]
}
// Error implements the error interface.
func (e ValueError) Error() string {
return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
}
// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
return string(e.tag())
}
// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
type scanner struct {
b []byte
bytes [max99thPercentileSize]byte
token []byte
start int // start position of the current token
end int // end position of the current token
next int // next point for scan
err error
done bool
}
func makeScannerString(s string) scanner {
scan := scanner{}
if len(s) <= len(scan.bytes) {
scan.b = scan.bytes[:copy(scan.bytes[:], s)]
} else {
scan.b = []byte(s)
}
scan.init()
return scan
}
// makeScanner returns a scanner using b as the input buffer.
// b is not copied and may be modified by the scanner routines.
func makeScanner(b []byte) scanner {
scan := scanner{b: b}
scan.init()
return scan
}
func (s *scanner) init() {
for i, c := range s.b {
if c == '_' {
s.b[i] = '-'
}
}
s.scan()
}
// restToLower converts the string between start and end to lower case.
func (s *scanner) toLower(start, end int) {
for i := start; i < end; i++ {
c := s.b[i]
if 'A' <= c && c <= 'Z' {
s.b[i] += 'a' - 'A'
}
}
}
func (s *scanner) setError(e error) {
if s.err == nil || (e == errSyntax && s.err != errSyntax) {
s.err = e
}
}
// resizeRange shrinks or grows the array at position oldStart such that
// a new string of size newSize can fit between oldStart and oldEnd.
// Sets the scan point to after the resized range.
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
s.start = oldStart
if end := oldStart + newSize; end != oldEnd {
diff := end - oldEnd
if end < cap(s.b) {
b := make([]byte, len(s.b)+diff)
copy(b, s.b[:oldStart])
copy(b[end:], s.b[oldEnd:])
s.b = b
} else {
s.b = append(s.b[end:], s.b[oldEnd:]...)
}
s.next = end + (s.next - s.end)
s.end = end
}
}
// replace replaces the current token with repl.
func (s *scanner) replace(repl string) {
s.resizeRange(s.start, s.end, len(repl))
copy(s.b[s.start:], repl)
}
// gobble removes the current token from the input.
// Caller must call scan after calling gobble.
func (s *scanner) gobble(e error) {
s.setError(e)
if s.start == 0 {
s.b = s.b[:+copy(s.b, s.b[s.next:])]
s.end = 0
} else {
s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
s.end = s.start - 1
}
s.next = s.start
}
// deleteRange removes the given range from s.b before the current token.
func (s *scanner) deleteRange(start, end int) {
s.setError(errSyntax)
s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
diff := end - start
s.next -= diff
s.start -= diff
s.end -= diff
}
// scan parses the next token of a BCP 47 string. Tokens that are larger
// than 8 characters or include non-alphanumeric characters result in an error
// and are gobbled and removed from the output.
// It returns the end position of the last token consumed.
func (s *scanner) scan() (end int) {
end = s.end
s.token = nil
for s.start = s.next; s.next < len(s.b); {
i := bytes.IndexByte(s.b[s.next:], '-')
if i == -1 {
s.end = len(s.b)
s.next = len(s.b)
i = s.end - s.start
} else {
s.end = s.next + i
s.next = s.end + 1
}
token := s.b[s.start:s.end]
if i < 1 || i > 8 || !isAlphaNum(token) {
s.gobble(errSyntax)
continue
}
s.token = token
return end
}
if n := len(s.b); n > 0 && s.b[n-1] == '-' {
s.setError(errSyntax)
s.b = s.b[:len(s.b)-1]
}
s.done = true
return end
}
// acceptMinSize parses multiple tokens of the given size or greater.
// It returns the end position of the last token consumed.
func (s *scanner) acceptMinSize(min int) (end int) {
end = s.end
s.scan()
for ; len(s.token) >= min; s.scan() {
end = s.end
}
return end
// Subtag returns the subtag for which the error occurred.
Subtag() string
}
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
@ -223,7 +28,7 @@ func (s *scanner) acceptMinSize(min int) (end int) {
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// The resulting tag is canonicalized using the default canonicalization type.
func Parse(s string) (t Tag, err error) {
return Default.Parse(s)
@ -235,327 +40,18 @@ func Parse(s string) (t Tag, err error) {
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// The resulting tag is canonicalized using the the canonicalization type c.
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// The resulting tag is canonicalized using the canonicalization type c.
func (c CanonType) Parse(s string) (t Tag, err error) {
// TODO: consider supporting old-style locale key-value pairs.
if s == "" {
return und, errSyntax
tt, err := language.Parse(s)
if err != nil {
return makeTag(tt), err
}
if len(s) <= maxAltTaglen {
b := [maxAltTaglen]byte{}
for i, c := range s {
// Generating invalid UTF-8 is okay as it won't match.
if 'A' <= c && c <= 'Z' {
c += 'a' - 'A'
} else if c == '_' {
c = '-'
}
b[i] = byte(c)
}
if t, ok := grandfathered(b); ok {
return t, nil
}
}
scan := makeScannerString(s)
t, err = parse(&scan, s)
t, changed := t.canonicalize(c)
tt, changed := canonicalize(c, tt)
if changed {
t.remakeString()
tt.RemakeString()
}
return t, err
}
func parse(scan *scanner, s string) (t Tag, err error) {
t = und
var end int
if n := len(scan.token); n <= 1 {
scan.toLower(0, len(scan.b))
if n == 0 || scan.token[0] != 'x' {
return t, errSyntax
}
end = parseExtensions(scan)
} else if n >= 4 {
return und, errSyntax
} else { // the usual case
t, end = parseTag(scan)
if n := len(scan.token); n == 1 {
t.pExt = uint16(end)
end = parseExtensions(scan)
} else if end < len(scan.b) {
scan.setError(errSyntax)
scan.b = scan.b[:end]
}
}
if int(t.pVariant) < len(scan.b) {
if end < len(s) {
s = s[:end]
}
if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
t.str = s
} else {
t.str = string(scan.b)
}
} else {
t.pVariant, t.pExt = 0, 0
}
return t, scan.err
}
// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
func parseTag(scan *scanner) (t Tag, end int) {
var e error
// TODO: set an error if an unknown lang, script or region is encountered.
t.lang, e = getLangID(scan.token)
scan.setError(e)
scan.replace(t.lang.String())
langStart := scan.start
end = scan.scan()
for len(scan.token) == 3 && isAlpha(scan.token[0]) {
// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
// to a tag of the form <extlang>.
lang, e := getLangID(scan.token)
if lang != 0 {
t.lang = lang
copy(scan.b[langStart:], lang.String())
scan.b[langStart+3] = '-'
scan.start = langStart + 4
}
scan.gobble(e)
end = scan.scan()
}
if len(scan.token) == 4 && isAlpha(scan.token[0]) {
t.script, e = getScriptID(script, scan.token)
if t.script == 0 {
scan.gobble(e)
}
end = scan.scan()
}
if n := len(scan.token); n >= 2 && n <= 3 {
t.region, e = getRegionID(scan.token)
if t.region == 0 {
scan.gobble(e)
} else {
scan.replace(t.region.String())
}
end = scan.scan()
}
scan.toLower(scan.start, len(scan.b))
t.pVariant = byte(end)
end = parseVariants(scan, end, t)
t.pExt = uint16(end)
return t, end
}
var separator = []byte{'-'}
// parseVariants scans tokens as long as each token is a valid variant string.
// Duplicate variants are removed.
func parseVariants(scan *scanner, end int, t Tag) int {
start := scan.start
varIDBuf := [4]uint8{}
variantBuf := [4][]byte{}
varID := varIDBuf[:0]
variant := variantBuf[:0]
last := -1
needSort := false
for ; len(scan.token) >= 4; scan.scan() {
// TODO: measure the impact of needing this conversion and redesign
// the data structure if there is an issue.
v, ok := variantIndex[string(scan.token)]
if !ok {
// unknown variant
// TODO: allow user-defined variants?
scan.gobble(mkErrInvalid(scan.token))
continue
}
varID = append(varID, v)
variant = append(variant, scan.token)
if !needSort {
if last < int(v) {
last = int(v)
} else {
needSort = true
// There is no legal combinations of more than 7 variants
// (and this is by no means a useful sequence).
const maxVariants = 8
if len(varID) > maxVariants {
break
}
}
}
end = scan.end
}
if needSort {
sort.Sort(variantsSort{varID, variant})
k, l := 0, -1
for i, v := range varID {
w := int(v)
if l == w {
// Remove duplicates.
continue
}
varID[k] = varID[i]
variant[k] = variant[i]
k++
l = w
}
if str := bytes.Join(variant[:k], separator); len(str) == 0 {
end = start - 1
} else {
scan.resizeRange(start, end, len(str))
copy(scan.b[scan.start:], str)
end = scan.end
}
}
return end
}
type variantsSort struct {
i []uint8
v [][]byte
}
func (s variantsSort) Len() int {
return len(s.i)
}
func (s variantsSort) Swap(i, j int) {
s.i[i], s.i[j] = s.i[j], s.i[i]
s.v[i], s.v[j] = s.v[j], s.v[i]
}
func (s variantsSort) Less(i, j int) bool {
return s.i[i] < s.i[j]
}
type bytesSort [][]byte
func (b bytesSort) Len() int {
return len(b)
}
func (b bytesSort) Swap(i, j int) {
b[i], b[j] = b[j], b[i]
}
func (b bytesSort) Less(i, j int) bool {
return bytes.Compare(b[i], b[j]) == -1
}
// parseExtensions parses and normalizes the extensions in the buffer.
// It returns the last position of scan.b that is part of any extension.
// It also trims scan.b to remove excess parts accordingly.
func parseExtensions(scan *scanner) int {
start := scan.start
exts := [][]byte{}
private := []byte{}
end := scan.end
for len(scan.token) == 1 {
extStart := scan.start
ext := scan.token[0]
end = parseExtension(scan)
extension := scan.b[extStart:end]
if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
scan.setError(errSyntax)
end = extStart
continue
} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
scan.b = scan.b[:end]
return end
} else if ext == 'x' {
private = extension
break
}
exts = append(exts, extension)
}
sort.Sort(bytesSort(exts))
if len(private) > 0 {
exts = append(exts, private)
}
scan.b = scan.b[:start]
if len(exts) > 0 {
scan.b = append(scan.b, bytes.Join(exts, separator)...)
} else if start > 0 {
// Strip trailing '-'.
scan.b = scan.b[:start-1]
}
return end
}
// parseExtension parses a single extension and returns the position of
// the extension end.
func parseExtension(scan *scanner) int {
start, end := scan.start, scan.end
switch scan.token[0] {
case 'u':
attrStart := end
scan.scan()
for last := []byte{}; len(scan.token) > 2; scan.scan() {
if bytes.Compare(scan.token, last) != -1 {
// Attributes are unsorted. Start over from scratch.
p := attrStart + 1
scan.next = p
attrs := [][]byte{}
for scan.scan(); len(scan.token) > 2; scan.scan() {
attrs = append(attrs, scan.token)
end = scan.end
}
sort.Sort(bytesSort(attrs))
copy(scan.b[p:], bytes.Join(attrs, separator))
break
}
last = scan.token
end = scan.end
}
var last, key []byte
for attrEnd := end; len(scan.token) == 2; last = key {
key = scan.token
keyEnd := scan.end
end = scan.acceptMinSize(3)
// TODO: check key value validity
if keyEnd == end || bytes.Compare(key, last) != 1 {
// We have an invalid key or the keys are not sorted.
// Start scanning keys from scratch and reorder.
p := attrEnd + 1
scan.next = p
keys := [][]byte{}
for scan.scan(); len(scan.token) == 2; {
keyStart, keyEnd := scan.start, scan.end
end = scan.acceptMinSize(3)
if keyEnd != end {
keys = append(keys, scan.b[keyStart:end])
} else {
scan.setError(errSyntax)
end = keyStart
}
}
sort.Sort(bytesSort(keys))
reordered := bytes.Join(keys, separator)
if e := p + len(reordered); e < end {
scan.deleteRange(e, end)
end = e
}
copy(scan.b[p:], bytes.Join(keys, separator))
break
}
}
case 't':
scan.scan()
if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
_, end = parseTag(scan)
scan.toLower(start, end)
}
for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
end = scan.acceptMinSize(3)
}
case 'x':
end = scan.acceptMinSize(1)
default:
end = scan.acceptMinSize(2)
}
return end
return makeTag(tt), err
}
// Compose creates a Tag from individual parts, which may be of type Tag, Base,
@ -563,10 +59,11 @@ func parseExtension(scan *scanner) int {
// Base, Script or Region or slice of type Variant or Extension is passed more
// than once, the latter will overwrite the former. Variants and Extensions are
// accumulated, but if two extensions of the same type are passed, the latter
// will replace the former. A Tag overwrites all former values and typically
// only makes sense as the first argument. The resulting tag is returned after
// canonicalizing using the Default CanonType. If one or more errors are
// encountered, one of the errors is returned.
// will replace the former. For -u extensions, though, the key-type pairs are
// added, where later values overwrite older ones. A Tag overwrites all former
// values and typically only makes sense as the first argument. The resulting
// tag is returned after canonicalizing using the Default CanonType. If one or
// more errors are encountered, one of the errors is returned.
func Compose(part ...interface{}) (t Tag, err error) {
return Default.Compose(part...)
}
@ -576,191 +73,63 @@ func Compose(part ...interface{}) (t Tag, err error) {
// Base, Script or Region or slice of type Variant or Extension is passed more
// than once, the latter will overwrite the former. Variants and Extensions are
// accumulated, but if two extensions of the same type are passed, the latter
// will replace the former. A Tag overwrites all former values and typically
// only makes sense as the first argument. The resulting tag is returned after
// canonicalizing using CanonType c. If one or more errors are encountered,
// one of the errors is returned.
// will replace the former. For -u extensions, though, the key-type pairs are
// added, where later values overwrite older ones. A Tag overwrites all former
// values and typically only makes sense as the first argument. The resulting
// tag is returned after canonicalizing using CanonType c. If one or more errors
// are encountered, one of the errors is returned.
func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
var b builder
if err = b.update(part...); err != nil {
var b language.Builder
if err = update(&b, part...); err != nil {
return und, err
}
t, _ = b.tag.canonicalize(c)
if len(b.ext) > 0 || len(b.variant) > 0 {
sort.Sort(sortVariant(b.variant))
sort.Strings(b.ext)
if b.private != "" {
b.ext = append(b.ext, b.private)
}
n := maxCoreSize + tokenLen(b.variant...) + tokenLen(b.ext...)
buf := make([]byte, n)
p := t.genCoreBytes(buf)
t.pVariant = byte(p)
p += appendTokens(buf[p:], b.variant...)
t.pExt = uint16(p)
p += appendTokens(buf[p:], b.ext...)
t.str = string(buf[:p])
} else if b.private != "" {
t.str = b.private
t.remakeString()
}
return
}
type builder struct {
tag Tag
private string // the x extension
ext []string
variant []string
err error
}
func (b *builder) addExt(e string) {
if e == "" {
} else if e[0] == 'x' {
b.private = e
} else {
b.ext = append(b.ext, e)
}
b.Tag, _ = canonicalize(c, b.Tag)
return makeTag(b.Make()), err
}
var errInvalidArgument = errors.New("invalid Extension or Variant")
func (b *builder) update(part ...interface{}) (err error) {
replace := func(l *[]string, s string, eq func(a, b string) bool) bool {
if s == "" {
b.err = errInvalidArgument
return true
}
for i, v := range *l {
if eq(v, s) {
(*l)[i] = s
return true
}
}
return false
}
func update(b *language.Builder, part ...interface{}) (err error) {
for _, x := range part {
switch v := x.(type) {
case Tag:
b.tag.lang = v.lang
b.tag.region = v.region
b.tag.script = v.script
if v.str != "" {
b.variant = nil
for x, s := "", v.str[v.pVariant:v.pExt]; s != ""; {
x, s = nextToken(s)
b.variant = append(b.variant, x)
}
b.ext, b.private = nil, ""
for i, e := int(v.pExt), ""; i < len(v.str); {
i, e = getExtension(v.str, i)
b.addExt(e)
}
}
b.SetTag(v.tag())
case Base:
b.tag.lang = v.langID
b.Tag.LangID = v.langID
case Script:
b.tag.script = v.scriptID
b.Tag.ScriptID = v.scriptID
case Region:
b.tag.region = v.regionID
b.Tag.RegionID = v.regionID
case Variant:
if !replace(&b.variant, v.variant, func(a, b string) bool { return a == b }) {
b.variant = append(b.variant, v.variant)
if v.variant == "" {
err = errInvalidArgument
break
}
b.AddVariant(v.variant)
case Extension:
if !replace(&b.ext, v.s, func(a, b string) bool { return a[0] == b[0] }) {
b.addExt(v.s)
if v.s == "" {
err = errInvalidArgument
break
}
b.SetExt(v.s)
case []Variant:
b.variant = nil
for _, x := range v {
b.update(x)
b.ClearVariants()
for _, v := range v {
b.AddVariant(v.variant)
}
case []Extension:
b.ext, b.private = nil, ""
b.ClearExtensions()
for _, e := range v {
b.update(e)
b.SetExt(e.s)
}
// TODO: support parsing of raw strings based on morphology or just extensions?
case error:
err = v
}
}
return
}
func tokenLen(token ...string) (n int) {
for _, t := range token {
n += len(t) + 1
}
return
}
func appendTokens(b []byte, token ...string) int {
p := 0
for _, t := range token {
b[p] = '-'
copy(b[p+1:], t)
p += 1 + len(t)
}
return p
}
type sortVariant []string
func (s sortVariant) Len() int {
return len(s)
}
func (s sortVariant) Swap(i, j int) {
s[j], s[i] = s[i], s[j]
}
func (s sortVariant) Less(i, j int) bool {
return variantIndex[s[i]] < variantIndex[s[j]]
}
func findExt(list []string, x byte) int {
for i, e := range list {
if e[0] == x {
return i
}
}
return -1
}
// getExtension returns the name, body and end position of the extension.
func getExtension(s string, p int) (end int, ext string) {
if s[p] == '-' {
p++
}
if s[p] == 'x' {
return len(s), s[p:]
}
end = nextExtension(s, p)
return end, s[p:end]
}
// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p.
// In the fast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
for n := len(s) - 3; p < n; {
if s[p] == '-' {
if s[p+2] == '-' {
return p
if v != nil {
err = v
}
p += 3
} else {
p++
}
}
return len(s)
return
}
var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
@ -788,7 +157,7 @@ func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
if !ok {
return nil, nil, err
}
t = Tag{lang: id}
t = makeTag(language.Tag{LangID: id})
}
// Scan the optional weight.
@ -830,9 +199,9 @@ func split(s string, c byte) (head, tail string) {
return strings.TrimSpace(s), ""
}
// Add hack mapping to deal with a small number of cases that that occur
// Add hack mapping to deal with a small number of cases that occur
// in Accept-Language (with reasonable frequency).
var acceptFallback = map[string]langID{
var acceptFallback = map[string]language.Language{
"english": _en,
"deutsch": _de,
"italian": _it,

File diff suppressed because it is too large Load diff

View file

@ -4,6 +4,8 @@
package language
import "golang.org/x/text/internal/language/compact"
// TODO: Various sets of commonly use tags and regions.
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
@ -61,83 +63,83 @@ var (
Und Tag = Tag{}
Afrikaans Tag = Tag{lang: _af} // af
Amharic Tag = Tag{lang: _am} // am
Arabic Tag = Tag{lang: _ar} // ar
ModernStandardArabic Tag = Tag{lang: _ar, region: _001} // ar-001
Azerbaijani Tag = Tag{lang: _az} // az
Bulgarian Tag = Tag{lang: _bg} // bg
Bengali Tag = Tag{lang: _bn} // bn
Catalan Tag = Tag{lang: _ca} // ca
Czech Tag = Tag{lang: _cs} // cs
Danish Tag = Tag{lang: _da} // da
German Tag = Tag{lang: _de} // de
Greek Tag = Tag{lang: _el} // el
English Tag = Tag{lang: _en} // en
AmericanEnglish Tag = Tag{lang: _en, region: _US} // en-US
BritishEnglish Tag = Tag{lang: _en, region: _GB} // en-GB
Spanish Tag = Tag{lang: _es} // es
EuropeanSpanish Tag = Tag{lang: _es, region: _ES} // es-ES
LatinAmericanSpanish Tag = Tag{lang: _es, region: _419} // es-419
Estonian Tag = Tag{lang: _et} // et
Persian Tag = Tag{lang: _fa} // fa
Finnish Tag = Tag{lang: _fi} // fi
Filipino Tag = Tag{lang: _fil} // fil
French Tag = Tag{lang: _fr} // fr
CanadianFrench Tag = Tag{lang: _fr, region: _CA} // fr-CA
Gujarati Tag = Tag{lang: _gu} // gu
Hebrew Tag = Tag{lang: _he} // he
Hindi Tag = Tag{lang: _hi} // hi
Croatian Tag = Tag{lang: _hr} // hr
Hungarian Tag = Tag{lang: _hu} // hu
Armenian Tag = Tag{lang: _hy} // hy
Indonesian Tag = Tag{lang: _id} // id
Icelandic Tag = Tag{lang: _is} // is
Italian Tag = Tag{lang: _it} // it
Japanese Tag = Tag{lang: _ja} // ja
Georgian Tag = Tag{lang: _ka} // ka
Kazakh Tag = Tag{lang: _kk} // kk
Khmer Tag = Tag{lang: _km} // km
Kannada Tag = Tag{lang: _kn} // kn
Korean Tag = Tag{lang: _ko} // ko
Kirghiz Tag = Tag{lang: _ky} // ky
Lao Tag = Tag{lang: _lo} // lo
Lithuanian Tag = Tag{lang: _lt} // lt
Latvian Tag = Tag{lang: _lv} // lv
Macedonian Tag = Tag{lang: _mk} // mk
Malayalam Tag = Tag{lang: _ml} // ml
Mongolian Tag = Tag{lang: _mn} // mn
Marathi Tag = Tag{lang: _mr} // mr
Malay Tag = Tag{lang: _ms} // ms
Burmese Tag = Tag{lang: _my} // my
Nepali Tag = Tag{lang: _ne} // ne
Dutch Tag = Tag{lang: _nl} // nl
Norwegian Tag = Tag{lang: _no} // no
Punjabi Tag = Tag{lang: _pa} // pa
Polish Tag = Tag{lang: _pl} // pl
Portuguese Tag = Tag{lang: _pt} // pt
BrazilianPortuguese Tag = Tag{lang: _pt, region: _BR} // pt-BR
EuropeanPortuguese Tag = Tag{lang: _pt, region: _PT} // pt-PT
Romanian Tag = Tag{lang: _ro} // ro
Russian Tag = Tag{lang: _ru} // ru
Sinhala Tag = Tag{lang: _si} // si
Slovak Tag = Tag{lang: _sk} // sk
Slovenian Tag = Tag{lang: _sl} // sl
Albanian Tag = Tag{lang: _sq} // sq
Serbian Tag = Tag{lang: _sr} // sr
SerbianLatin Tag = Tag{lang: _sr, script: _Latn} // sr-Latn
Swedish Tag = Tag{lang: _sv} // sv
Swahili Tag = Tag{lang: _sw} // sw
Tamil Tag = Tag{lang: _ta} // ta
Telugu Tag = Tag{lang: _te} // te
Thai Tag = Tag{lang: _th} // th
Turkish Tag = Tag{lang: _tr} // tr
Ukrainian Tag = Tag{lang: _uk} // uk
Urdu Tag = Tag{lang: _ur} // ur
Uzbek Tag = Tag{lang: _uz} // uz
Vietnamese Tag = Tag{lang: _vi} // vi
Chinese Tag = Tag{lang: _zh} // zh
SimplifiedChinese Tag = Tag{lang: _zh, script: _Hans} // zh-Hans
TraditionalChinese Tag = Tag{lang: _zh, script: _Hant} // zh-Hant
Zulu Tag = Tag{lang: _zu} // zu
Afrikaans Tag = Tag(compact.Afrikaans)
Amharic Tag = Tag(compact.Amharic)
Arabic Tag = Tag(compact.Arabic)
ModernStandardArabic Tag = Tag(compact.ModernStandardArabic)
Azerbaijani Tag = Tag(compact.Azerbaijani)
Bulgarian Tag = Tag(compact.Bulgarian)
Bengali Tag = Tag(compact.Bengali)
Catalan Tag = Tag(compact.Catalan)
Czech Tag = Tag(compact.Czech)
Danish Tag = Tag(compact.Danish)
German Tag = Tag(compact.German)
Greek Tag = Tag(compact.Greek)
English Tag = Tag(compact.English)
AmericanEnglish Tag = Tag(compact.AmericanEnglish)
BritishEnglish Tag = Tag(compact.BritishEnglish)
Spanish Tag = Tag(compact.Spanish)
EuropeanSpanish Tag = Tag(compact.EuropeanSpanish)
LatinAmericanSpanish Tag = Tag(compact.LatinAmericanSpanish)
Estonian Tag = Tag(compact.Estonian)
Persian Tag = Tag(compact.Persian)
Finnish Tag = Tag(compact.Finnish)
Filipino Tag = Tag(compact.Filipino)
French Tag = Tag(compact.French)
CanadianFrench Tag = Tag(compact.CanadianFrench)
Gujarati Tag = Tag(compact.Gujarati)
Hebrew Tag = Tag(compact.Hebrew)
Hindi Tag = Tag(compact.Hindi)
Croatian Tag = Tag(compact.Croatian)
Hungarian Tag = Tag(compact.Hungarian)
Armenian Tag = Tag(compact.Armenian)
Indonesian Tag = Tag(compact.Indonesian)
Icelandic Tag = Tag(compact.Icelandic)
Italian Tag = Tag(compact.Italian)
Japanese Tag = Tag(compact.Japanese)
Georgian Tag = Tag(compact.Georgian)
Kazakh Tag = Tag(compact.Kazakh)
Khmer Tag = Tag(compact.Khmer)
Kannada Tag = Tag(compact.Kannada)
Korean Tag = Tag(compact.Korean)
Kirghiz Tag = Tag(compact.Kirghiz)
Lao Tag = Tag(compact.Lao)
Lithuanian Tag = Tag(compact.Lithuanian)
Latvian Tag = Tag(compact.Latvian)
Macedonian Tag = Tag(compact.Macedonian)
Malayalam Tag = Tag(compact.Malayalam)
Mongolian Tag = Tag(compact.Mongolian)
Marathi Tag = Tag(compact.Marathi)
Malay Tag = Tag(compact.Malay)
Burmese Tag = Tag(compact.Burmese)
Nepali Tag = Tag(compact.Nepali)
Dutch Tag = Tag(compact.Dutch)
Norwegian Tag = Tag(compact.Norwegian)
Punjabi Tag = Tag(compact.Punjabi)
Polish Tag = Tag(compact.Polish)
Portuguese Tag = Tag(compact.Portuguese)
BrazilianPortuguese Tag = Tag(compact.BrazilianPortuguese)
EuropeanPortuguese Tag = Tag(compact.EuropeanPortuguese)
Romanian Tag = Tag(compact.Romanian)
Russian Tag = Tag(compact.Russian)
Sinhala Tag = Tag(compact.Sinhala)
Slovak Tag = Tag(compact.Slovak)
Slovenian Tag = Tag(compact.Slovenian)
Albanian Tag = Tag(compact.Albanian)
Serbian Tag = Tag(compact.Serbian)
SerbianLatin Tag = Tag(compact.SerbianLatin)
Swedish Tag = Tag(compact.Swedish)
Swahili Tag = Tag(compact.Swahili)
Tamil Tag = Tag(compact.Tamil)
Telugu Tag = Tag(compact.Telugu)
Thai Tag = Tag(compact.Thai)
Turkish Tag = Tag(compact.Turkish)
Ukrainian Tag = Tag(compact.Ukrainian)
Urdu Tag = Tag(compact.Urdu)
Uzbek Tag = Tag(compact.Uzbek)
Vietnamese Tag = Tag(compact.Vietnamese)
Chinese Tag = Tag(compact.Chinese)
SimplifiedChinese Tag = Tag(compact.SimplifiedChinese)
TraditionalChinese Tag = Tag(compact.TraditionalChinese)
Zulu Tag = Tag(compact.Zulu)
)

336
vendor/golang.org/x/text/secure/bidirule/bidirule.go generated vendored Normal file
View file

@ -0,0 +1,336 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package bidirule implements the Bidi Rule defined by RFC 5893.
//
// This package is under development. The API may change without notice and
// without preserving backward compatibility.
package bidirule
import (
"errors"
"unicode/utf8"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/bidi"
)
// This file contains an implementation of RFC 5893: Right-to-Left Scripts for
// Internationalized Domain Names for Applications (IDNA)
//
// A label is an individual component of a domain name. Labels are usually
// shown separated by dots; for example, the domain name "www.example.com" is
// composed of three labels: "www", "example", and "com".
//
// An RTL label is a label that contains at least one character of class R, AL,
// or AN. An LTR label is any label that is not an RTL label.
//
// A "Bidi domain name" is a domain name that contains at least one RTL label.
//
// The following guarantees can be made based on the above:
//
// o In a domain name consisting of only labels that satisfy the rule,
// the requirements of Section 3 are satisfied. Note that even LTR
// labels and pure ASCII labels have to be tested.
//
// o In a domain name consisting of only LDH labels (as defined in the
// Definitions document [RFC5890]) and labels that satisfy the rule,
// the requirements of Section 3 are satisfied as long as a label
// that starts with an ASCII digit does not come after a
// right-to-left label.
//
// No guarantee is given for other combinations.
// ErrInvalid indicates a label is invalid according to the Bidi Rule.
var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
type ruleState uint8
const (
ruleInitial ruleState = iota
ruleLTR
ruleLTRFinal
ruleRTL
ruleRTLFinal
ruleInvalid
)
type ruleTransition struct {
next ruleState
mask uint16
}
var transitions = [...][2]ruleTransition{
// [2.1] The first character must be a character with Bidi property L, R, or
// AL. If it has the R or AL property, it is an RTL label; if it has the L
// property, it is an LTR label.
ruleInitial: {
{ruleLTRFinal, 1 << bidi.L},
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
},
ruleRTL: {
// [2.3] In an RTL label, the end of the label must be a character with
// Bidi property R, AL, EN, or AN, followed by zero or more characters
// with Bidi property NSM.
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},
// [2.2] In an RTL label, only characters with the Bidi properties R,
// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
// We exclude the entries from [2.3]
{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
},
ruleRTLFinal: {
// [2.3] In an RTL label, the end of the label must be a character with
// Bidi property R, AL, EN, or AN, followed by zero or more characters
// with Bidi property NSM.
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},
// [2.2] In an RTL label, only characters with the Bidi properties R,
// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
// We exclude the entries from [2.3] and NSM.
{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
},
ruleLTR: {
// [2.6] In an LTR label, the end of the label must be a character with
// Bidi property L or EN, followed by zero or more characters with Bidi
// property NSM.
{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},
// [2.5] In an LTR label, only characters with the Bidi properties L,
// EN, ES, CS, ET, ON, BN, or NSM are allowed.
// We exclude the entries from [2.6].
{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
},
ruleLTRFinal: {
// [2.6] In an LTR label, the end of the label must be a character with
// Bidi property L or EN, followed by zero or more characters with Bidi
// property NSM.
{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},
// [2.5] In an LTR label, only characters with the Bidi properties L,
// EN, ES, CS, ET, ON, BN, or NSM are allowed.
// We exclude the entries from [2.6].
{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
},
ruleInvalid: {
{ruleInvalid, 0},
{ruleInvalid, 0},
},
}
// [2.4] In an RTL label, if an EN is present, no AN may be present, and
// vice versa.
const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
// From RFC 5893
// An RTL label is a label that contains at least one character of type
// R, AL, or AN.
//
// An LTR label is any label that is not an RTL label.
// Direction reports the direction of the given label as defined by RFC 5893.
// The Bidi Rule does not have to be applied to labels of the category
// LeftToRight.
func Direction(b []byte) bidi.Direction {
for i := 0; i < len(b); {
e, sz := bidi.Lookup(b[i:])
if sz == 0 {
i++
}
c := e.Class()
if c == bidi.R || c == bidi.AL || c == bidi.AN {
return bidi.RightToLeft
}
i += sz
}
return bidi.LeftToRight
}
// DirectionString reports the direction of the given label as defined by RFC
// 5893. The Bidi Rule does not have to be applied to labels of the category
// LeftToRight.
func DirectionString(s string) bidi.Direction {
for i := 0; i < len(s); {
e, sz := bidi.LookupString(s[i:])
if sz == 0 {
i++
continue
}
c := e.Class()
if c == bidi.R || c == bidi.AL || c == bidi.AN {
return bidi.RightToLeft
}
i += sz
}
return bidi.LeftToRight
}
// Valid reports whether b conforms to the BiDi rule.
func Valid(b []byte) bool {
var t Transformer
if n, ok := t.advance(b); !ok || n < len(b) {
return false
}
return t.isFinal()
}
// ValidString reports whether s conforms to the BiDi rule.
func ValidString(s string) bool {
var t Transformer
if n, ok := t.advanceString(s); !ok || n < len(s) {
return false
}
return t.isFinal()
}
// New returns a Transformer that verifies that input adheres to the Bidi Rule.
func New() *Transformer {
return &Transformer{}
}
// Transformer implements transform.Transform.
type Transformer struct {
state ruleState
hasRTL bool
seen uint16
}
// A rule can only be violated for "Bidi Domain names", meaning if one of the
// following categories has been observed.
func (t *Transformer) isRTL() bool {
const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
return t.seen&isRTL != 0
}
// Reset implements transform.Transformer.
func (t *Transformer) Reset() { *t = Transformer{} }
// Transform implements transform.Transformer. This Transformer has state and
// needs to be reset between uses.
func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if len(dst) < len(src) {
src = src[:len(dst)]
atEOF = false
err = transform.ErrShortDst
}
n, err1 := t.Span(src, atEOF)
copy(dst, src[:n])
if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
err = err1
}
return n, n, err
}
// Span returns the first n bytes of src that conform to the Bidi rule.
func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
if t.state == ruleInvalid && t.isRTL() {
return 0, ErrInvalid
}
n, ok := t.advance(src)
switch {
case !ok:
err = ErrInvalid
case n < len(src):
if !atEOF {
err = transform.ErrShortSrc
break
}
err = ErrInvalid
case !t.isFinal():
err = ErrInvalid
}
return n, err
}
// Precomputing the ASCII values decreases running time for the ASCII fast path
// by about 30%.
var asciiTable [128]bidi.Properties
func init() {
for i := range asciiTable {
p, _ := bidi.LookupRune(rune(i))
asciiTable[i] = p
}
}
func (t *Transformer) advance(s []byte) (n int, ok bool) {
var e bidi.Properties
var sz int
for n < len(s) {
if s[n] < utf8.RuneSelf {
e, sz = asciiTable[s[n]], 1
} else {
e, sz = bidi.Lookup(s[n:])
if sz <= 1 {
if sz == 1 {
// We always consider invalid UTF-8 to be invalid, even if
// the string has not yet been determined to be RTL.
// TODO: is this correct?
return n, false
}
return n, true // incomplete UTF-8 encoding
}
}
// TODO: using CompactClass would result in noticeable speedup.
// See unicode/bidi/prop.go:Properties.CompactClass.
c := uint16(1 << e.Class())
t.seen |= c
if t.seen&exclusiveRTL == exclusiveRTL {
t.state = ruleInvalid
return n, false
}
switch tr := transitions[t.state]; {
case tr[0].mask&c != 0:
t.state = tr[0].next
case tr[1].mask&c != 0:
t.state = tr[1].next
default:
t.state = ruleInvalid
if t.isRTL() {
return n, false
}
}
n += sz
}
return n, true
}
func (t *Transformer) advanceString(s string) (n int, ok bool) {
var e bidi.Properties
var sz int
for n < len(s) {
if s[n] < utf8.RuneSelf {
e, sz = asciiTable[s[n]], 1
} else {
e, sz = bidi.LookupString(s[n:])
if sz <= 1 {
if sz == 1 {
return n, false // invalid UTF-8
}
return n, true // incomplete UTF-8 encoding
}
}
// TODO: using CompactClass results in noticeable speedup.
// See unicode/bidi/prop.go:Properties.CompactClass.
c := uint16(1 << e.Class())
t.seen |= c
if t.seen&exclusiveRTL == exclusiveRTL {
t.state = ruleInvalid
return n, false
}
switch tr := transitions[t.state]; {
case tr[0].mask&c != 0:
t.state = tr[0].next
case tr[1].mask&c != 0:
t.state = tr[1].next
default:
t.state = ruleInvalid
if t.isRTL() {
return n, false
}
}
n += sz
}
return n, true
}

View file

@ -0,0 +1,11 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build go1.10
package bidirule
func (t *Transformer) isFinal() bool {
return t.state == ruleLTRFinal || t.state == ruleRTLFinal || t.state == ruleInitial
}

View file

@ -0,0 +1,14 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !go1.10
package bidirule
func (t *Transformer) isFinal() bool {
if !t.isRTL() {
return true
}
return t.state == ruleLTRFinal || t.state == ruleRTLFinal || t.state == ruleInitial
}

View file

@ -78,8 +78,8 @@ type SpanningTransformer interface {
// considering the error err.
//
// A nil error means that all input bytes are known to be identical to the
// output produced by the Transformer. A nil error can be be returned
// regardless of whether atEOF is true. If err is nil, then then n must
// output produced by the Transformer. A nil error can be returned
// regardless of whether atEOF is true. If err is nil, then n must
// equal len(src); the converse is not necessarily true.
//
// ErrEndOfSpan means that the Transformer output may differ from the
@ -493,7 +493,7 @@ func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err erro
return dstL.n, srcL.p, err
}
// Deprecated: use runes.Remove instead.
// Deprecated: Use runes.Remove instead.
func RemoveFunc(f func(r rune) bool) Transformer {
return removeF(f)
}

198
vendor/golang.org/x/text/unicode/bidi/bidi.go generated vendored Normal file
View file

@ -0,0 +1,198 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go gen_trieval.go gen_ranges.go
// Package bidi contains functionality for bidirectional text support.
//
// See https://www.unicode.org/reports/tr9.
//
// NOTE: UNDER CONSTRUCTION. This API may change in backwards incompatible ways
// and without notice.
package bidi // import "golang.org/x/text/unicode/bidi"
// TODO:
// The following functionality would not be hard to implement, but hinges on
// the definition of a Segmenter interface. For now this is up to the user.
// - Iterate over paragraphs
// - Segmenter to iterate over runs directly from a given text.
// Also:
// - Transformer for reordering?
// - Transformer (validator, really) for Bidi Rule.
// This API tries to avoid dealing with embedding levels for now. Under the hood
// these will be computed, but the question is to which extent the user should
// know they exist. We should at some point allow the user to specify an
// embedding hierarchy, though.
// A Direction indicates the overall flow of text.
type Direction int
const (
// LeftToRight indicates the text contains no right-to-left characters and
// that either there are some left-to-right characters or the option
// DefaultDirection(LeftToRight) was passed.
LeftToRight Direction = iota
// RightToLeft indicates the text contains no left-to-right characters and
// that either there are some right-to-left characters or the option
// DefaultDirection(RightToLeft) was passed.
RightToLeft
// Mixed indicates text contains both left-to-right and right-to-left
// characters.
Mixed
// Neutral means that text contains no left-to-right and right-to-left
// characters and that no default direction has been set.
Neutral
)
type options struct{}
// An Option is an option for Bidi processing.
type Option func(*options)
// ICU allows the user to define embedding levels. This may be used, for example,
// to use hierarchical structure of markup languages to define embeddings.
// The following option may be a way to expose this functionality in this API.
// // LevelFunc sets a function that associates nesting levels with the given text.
// // The levels function will be called with monotonically increasing values for p.
// func LevelFunc(levels func(p int) int) Option {
// panic("unimplemented")
// }
// DefaultDirection sets the default direction for a Paragraph. The direction is
// overridden if the text contains directional characters.
func DefaultDirection(d Direction) Option {
panic("unimplemented")
}
// A Paragraph holds a single Paragraph for Bidi processing.
type Paragraph struct {
// buffers
}
// SetBytes configures p for the given paragraph text. It replaces text
// previously set by SetBytes or SetString. If b contains a paragraph separator
// it will only process the first paragraph and report the number of bytes
// consumed from b including this separator. Error may be non-nil if options are
// given.
func (p *Paragraph) SetBytes(b []byte, opts ...Option) (n int, err error) {
panic("unimplemented")
}
// SetString configures p for the given paragraph text. It replaces text
// previously set by SetBytes or SetString. If b contains a paragraph separator
// it will only process the first paragraph and report the number of bytes
// consumed from b including this separator. Error may be non-nil if options are
// given.
func (p *Paragraph) SetString(s string, opts ...Option) (n int, err error) {
panic("unimplemented")
}
// IsLeftToRight reports whether the principle direction of rendering for this
// paragraphs is left-to-right. If this returns false, the principle direction
// of rendering is right-to-left.
func (p *Paragraph) IsLeftToRight() bool {
panic("unimplemented")
}
// Direction returns the direction of the text of this paragraph.
//
// The direction may be LeftToRight, RightToLeft, Mixed, or Neutral.
func (p *Paragraph) Direction() Direction {
panic("unimplemented")
}
// RunAt reports the Run at the given position of the input text.
//
// This method can be used for computing line breaks on paragraphs.
func (p *Paragraph) RunAt(pos int) Run {
panic("unimplemented")
}
// Order computes the visual ordering of all the runs in a Paragraph.
func (p *Paragraph) Order() (Ordering, error) {
panic("unimplemented")
}
// Line computes the visual ordering of runs for a single line starting and
// ending at the given positions in the original text.
func (p *Paragraph) Line(start, end int) (Ordering, error) {
panic("unimplemented")
}
// An Ordering holds the computed visual order of runs of a Paragraph. Calling
// SetBytes or SetString on the originating Paragraph invalidates an Ordering.
// The methods of an Ordering should only be called by one goroutine at a time.
type Ordering struct{}
// Direction reports the directionality of the runs.
//
// The direction may be LeftToRight, RightToLeft, Mixed, or Neutral.
func (o *Ordering) Direction() Direction {
panic("unimplemented")
}
// NumRuns returns the number of runs.
func (o *Ordering) NumRuns() int {
panic("unimplemented")
}
// Run returns the ith run within the ordering.
func (o *Ordering) Run(i int) Run {
panic("unimplemented")
}
// TODO: perhaps with options.
// // Reorder creates a reader that reads the runes in visual order per character.
// // Modifiers remain after the runes they modify.
// func (l *Runs) Reorder() io.Reader {
// panic("unimplemented")
// }
// A Run is a continuous sequence of characters of a single direction.
type Run struct {
}
// String returns the text of the run in its original order.
func (r *Run) String() string {
panic("unimplemented")
}
// Bytes returns the text of the run in its original order.
func (r *Run) Bytes() []byte {
panic("unimplemented")
}
// TODO: methods for
// - Display order
// - headers and footers
// - bracket replacement.
// Direction reports the direction of the run.
func (r *Run) Direction() Direction {
panic("unimplemented")
}
// Position of the Run within the text passed to SetBytes or SetString of the
// originating Paragraph value.
func (r *Run) Pos() (start, end int) {
panic("unimplemented")
}
// AppendReverse reverses the order of characters of in, appends them to out,
// and returns the result. Modifiers will still follow the runes they modify.
// Brackets are replaced with their counterparts.
func AppendReverse(out, in []byte) []byte {
panic("unimplemented")
}
// ReverseString reverses the order of characters in s and returns a new string.
// Modifiers will still follow the runes they modify. Brackets are replaced with
// their counterparts.
func ReverseString(s string) string {
panic("unimplemented")
}

335
vendor/golang.org/x/text/unicode/bidi/bracket.go generated vendored Normal file
View file

@ -0,0 +1,335 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bidi
import (
"container/list"
"fmt"
"sort"
)
// This file contains a port of the reference implementation of the
// Bidi Parentheses Algorithm:
// https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
//
// The implementation in this file covers definitions BD14-BD16 and rule N0
// of UAX#9.
//
// Some preprocessing is done for each rune before data is passed to this
// algorithm:
// - opening and closing brackets are identified
// - a bracket pair type, like '(' and ')' is assigned a unique identifier that
// is identical for the opening and closing bracket. It is left to do these
// mappings.
// - The BPA algorithm requires that bracket characters that are canonical
// equivalents of each other be able to be substituted for each other.
// It is the responsibility of the caller to do this canonicalization.
//
// In implementing BD16, this implementation departs slightly from the "logical"
// algorithm defined in UAX#9. In particular, the stack referenced there
// supports operations that go beyond a "basic" stack. An equivalent
// implementation based on a linked list is used here.
// Bidi_Paired_Bracket_Type
// BD14. An opening paired bracket is a character whose
// Bidi_Paired_Bracket_Type property value is Open.
//
// BD15. A closing paired bracket is a character whose
// Bidi_Paired_Bracket_Type property value is Close.
type bracketType byte
const (
bpNone bracketType = iota
bpOpen
bpClose
)
// bracketPair holds a pair of index values for opening and closing bracket
// location of a bracket pair.
type bracketPair struct {
opener int
closer int
}
func (b *bracketPair) String() string {
return fmt.Sprintf("(%v, %v)", b.opener, b.closer)
}
// bracketPairs is a slice of bracketPairs with a sort.Interface implementation.
type bracketPairs []bracketPair
func (b bracketPairs) Len() int { return len(b) }
func (b bracketPairs) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener }
// resolvePairedBrackets runs the paired bracket part of the UBA algorithm.
//
// For each rune, it takes the indexes into the original string, the class the
// bracket type (in pairTypes) and the bracket identifier (pairValues). It also
// takes the direction type for the start-of-sentence and the embedding level.
//
// The identifiers for bracket types are the rune of the canonicalized opening
// bracket for brackets (open or close) or 0 for runes that are not brackets.
func resolvePairedBrackets(s *isolatingRunSequence) {
p := bracketPairer{
sos: s.sos,
openers: list.New(),
codesIsolatedRun: s.types,
indexes: s.indexes,
}
dirEmbed := L
if s.level&1 != 0 {
dirEmbed = R
}
p.locateBrackets(s.p.pairTypes, s.p.pairValues)
p.resolveBrackets(dirEmbed, s.p.initialTypes)
}
type bracketPairer struct {
sos Class // direction corresponding to start of sequence
// The following is a restatement of BD 16 using non-algorithmic language.
//
// A bracket pair is a pair of characters consisting of an opening
// paired bracket and a closing paired bracket such that the
// Bidi_Paired_Bracket property value of the former equals the latter,
// subject to the following constraints.
// - both characters of a pair occur in the same isolating run sequence
// - the closing character of a pair follows the opening character
// - any bracket character can belong at most to one pair, the earliest possible one
// - any bracket character not part of a pair is treated like an ordinary character
// - pairs may nest properly, but their spans may not overlap otherwise
// Bracket characters with canonical decompositions are supposed to be
// treated as if they had been normalized, to allow normalized and non-
// normalized text to give the same result. In this implementation that step
// is pushed out to the caller. The caller has to ensure that the pairValue
// slices contain the rune of the opening bracket after normalization for
// any opening or closing bracket.
openers *list.List // list of positions for opening brackets
// bracket pair positions sorted by location of opening bracket
pairPositions bracketPairs
codesIsolatedRun []Class // directional bidi codes for an isolated run
indexes []int // array of index values into the original string
}
// matchOpener reports whether characters at given positions form a matching
// bracket pair.
func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool {
return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]]
}
const maxPairingDepth = 63
// locateBrackets locates matching bracket pairs according to BD16.
//
// This implementation uses a linked list instead of a stack, because, while
// elements are added at the front (like a push) they are not generally removed
// in atomic 'pop' operations, reducing the benefit of the stack archetype.
func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) {
// traverse the run
// do that explicitly (not in a for-each) so we can record position
for i, index := range p.indexes {
// look at the bracket type for each character
if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON {
// continue scanning
continue
}
switch pairTypes[index] {
case bpOpen:
// check if maximum pairing depth reached
if p.openers.Len() == maxPairingDepth {
p.openers.Init()
return
}
// remember opener location, most recent first
p.openers.PushFront(i)
case bpClose:
// see if there is a match
count := 0
for elem := p.openers.Front(); elem != nil; elem = elem.Next() {
count++
opener := elem.Value.(int)
if p.matchOpener(pairValues, opener, i) {
// if the opener matches, add nested pair to the ordered list
p.pairPositions = append(p.pairPositions, bracketPair{opener, i})
// remove up to and including matched opener
for ; count > 0; count-- {
p.openers.Remove(p.openers.Front())
}
break
}
}
sort.Sort(p.pairPositions)
// if we get here, the closing bracket matched no openers
// and gets ignored
}
}
}
// Bracket pairs within an isolating run sequence are processed as units so
// that both the opening and the closing paired bracket in a pair resolve to
// the same direction.
//
// N0. Process bracket pairs in an isolating run sequence sequentially in
// the logical order of the text positions of the opening paired brackets
// using the logic given below. Within this scope, bidirectional types EN
// and AN are treated as R.
//
// Identify the bracket pairs in the current isolating run sequence
// according to BD16. For each bracket-pair element in the list of pairs of
// text positions:
//
// a Inspect the bidirectional types of the characters enclosed within the
// bracket pair.
//
// b If any strong type (either L or R) matching the embedding direction is
// found, set the type for both brackets in the pair to match the embedding
// direction.
//
// o [ e ] o -> o e e e o
//
// o [ o e ] -> o e o e e
//
// o [ NI e ] -> o e NI e e
//
// c Otherwise, if a strong type (opposite the embedding direction) is
// found, test for adjacent strong types as follows: 1 First, check
// backwards before the opening paired bracket until the first strong type
// (L, R, or sos) is found. If that first preceding strong type is opposite
// the embedding direction, then set the type for both brackets in the pair
// to that type. 2 Otherwise, set the type for both brackets in the pair to
// the embedding direction.
//
// o [ o ] e -> o o o o e
//
// o [ o NI ] o -> o o o NI o o
//
// e [ o ] o -> e e o e o
//
// e [ o ] e -> e e o e e
//
// e ( o [ o ] NI ) e -> e e o o o o NI e e
//
// d Otherwise, do not set the type for the current bracket pair. Note that
// if the enclosed text contains no strong types the paired brackets will
// both resolve to the same level when resolved individually using rules N1
// and N2.
//
// e ( NI ) o -> e ( NI ) o
// getStrongTypeN0 maps character's directional code to strong type as required
// by rule N0.
//
// TODO: have separate type for "strong" directionality.
func (p *bracketPairer) getStrongTypeN0(index int) Class {
switch p.codesIsolatedRun[index] {
// in the scope of N0, number types are treated as R
case EN, AN, AL, R:
return R
case L:
return L
default:
return ON
}
}
// classifyPairContent reports the strong types contained inside a Bracket Pair,
// assuming the given embedding direction.
//
// It returns ON if no strong type is found. If a single strong type is found,
// it returns this type. Otherwise it returns the embedding direction.
//
// TODO: use separate type for "strong" directionality.
func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class {
dirOpposite := ON
for i := loc.opener + 1; i < loc.closer; i++ {
dir := p.getStrongTypeN0(i)
if dir == ON {
continue
}
if dir == dirEmbed {
return dir // type matching embedding direction found
}
dirOpposite = dir
}
// return ON if no strong type found, or class opposite to dirEmbed
return dirOpposite
}
// classBeforePair determines which strong types are present before a Bracket
// Pair. Return R or L if strong type found, otherwise ON.
func (p *bracketPairer) classBeforePair(loc bracketPair) Class {
for i := loc.opener - 1; i >= 0; i-- {
if dir := p.getStrongTypeN0(i); dir != ON {
return dir
}
}
// no strong types found, return sos
return p.sos
}
// assignBracketType implements rule N0 for a single bracket pair.
func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) {
// rule "N0, a", inspect contents of pair
dirPair := p.classifyPairContent(loc, dirEmbed)
// dirPair is now L, R, or N (no strong type found)
// the following logical tests are performed out of order compared to
// the statement of the rules but yield the same results
if dirPair == ON {
return // case "d" - nothing to do
}
if dirPair != dirEmbed {
// case "c": strong type found, opposite - check before (c.1)
dirPair = p.classBeforePair(loc)
if dirPair == dirEmbed || dirPair == ON {
// no strong opposite type found before - use embedding (c.2)
dirPair = dirEmbed
}
}
// else: case "b", strong type found matching embedding,
// no explicit action needed, as dirPair is already set to embedding
// direction
// set the bracket types to the type found
p.setBracketsToType(loc, dirPair, initialTypes)
}
func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) {
p.codesIsolatedRun[loc.opener] = dirPair
p.codesIsolatedRun[loc.closer] = dirPair
for i := loc.opener + 1; i < loc.closer; i++ {
index := p.indexes[i]
if initialTypes[index] != NSM {
break
}
p.codesIsolatedRun[i] = dirPair
}
for i := loc.closer + 1; i < len(p.indexes); i++ {
index := p.indexes[i]
if initialTypes[index] != NSM {
break
}
p.codesIsolatedRun[i] = dirPair
}
}
// resolveBrackets implements rule N0 for a list of pairs.
func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) {
for _, loc := range p.pairPositions {
p.assignBracketType(loc, dirEmbed, initialTypes)
}
}

1058
vendor/golang.org/x/text/unicode/bidi/core.go generated vendored Normal file

File diff suppressed because it is too large Load diff

206
vendor/golang.org/x/text/unicode/bidi/prop.go generated vendored Normal file
View file

@ -0,0 +1,206 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bidi
import "unicode/utf8"
// Properties provides access to BiDi properties of runes.
type Properties struct {
entry uint8
last uint8
}
var trie = newBidiTrie(0)
// TODO: using this for bidirule reduces the running time by about 5%. Consider
// if this is worth exposing or if we can find a way to speed up the Class
// method.
//
// // CompactClass is like Class, but maps all of the BiDi control classes
// // (LRO, RLO, LRE, RLE, PDF, LRI, RLI, FSI, PDI) to the class Control.
// func (p Properties) CompactClass() Class {
// return Class(p.entry & 0x0F)
// }
// Class returns the Bidi class for p.
func (p Properties) Class() Class {
c := Class(p.entry & 0x0F)
if c == Control {
c = controlByteToClass[p.last&0xF]
}
return c
}
// IsBracket reports whether the rune is a bracket.
func (p Properties) IsBracket() bool { return p.entry&0xF0 != 0 }
// IsOpeningBracket reports whether the rune is an opening bracket.
// IsBracket must return true.
func (p Properties) IsOpeningBracket() bool { return p.entry&openMask != 0 }
// TODO: find a better API and expose.
func (p Properties) reverseBracket(r rune) rune {
return xorMasks[p.entry>>xorMaskShift] ^ r
}
var controlByteToClass = [16]Class{
0xD: LRO, // U+202D LeftToRightOverride,
0xE: RLO, // U+202E RightToLeftOverride,
0xA: LRE, // U+202A LeftToRightEmbedding,
0xB: RLE, // U+202B RightToLeftEmbedding,
0xC: PDF, // U+202C PopDirectionalFormat,
0x6: LRI, // U+2066 LeftToRightIsolate,
0x7: RLI, // U+2067 RightToLeftIsolate,
0x8: FSI, // U+2068 FirstStrongIsolate,
0x9: PDI, // U+2069 PopDirectionalIsolate,
}
// LookupRune returns properties for r.
func LookupRune(r rune) (p Properties, size int) {
var buf [4]byte
n := utf8.EncodeRune(buf[:], r)
return Lookup(buf[:n])
}
// TODO: these lookup methods are based on the generated trie code. The returned
// sizes have slightly different semantics from the generated code, in that it
// always returns size==1 for an illegal UTF-8 byte (instead of the length
// of the maximum invalid subsequence). Most Transformers, like unicode/norm,
// leave invalid UTF-8 untouched, in which case it has performance benefits to
// do so (without changing the semantics). Bidi requires the semantics used here
// for the bidirule implementation to be compatible with the Go semantics.
// They ultimately should perhaps be adopted by all trie implementations, for
// convenience sake.
// This unrolled code also boosts performance of the secure/bidirule package by
// about 30%.
// So, to remove this code:
// - add option to trie generator to define return type.
// - always return 1 byte size for ill-formed UTF-8 runes.
// Lookup returns properties for the first rune in s and the width in bytes of
// its encoding. The size will be 0 if s does not hold enough bytes to complete
// the encoding.
func Lookup(s []byte) (p Properties, sz int) {
c0 := s[0]
switch {
case c0 < 0x80: // is ASCII
return Properties{entry: bidiValues[c0]}, 1
case c0 < 0xC2:
return Properties{}, 1
case c0 < 0xE0: // 2-byte UTF-8
if len(s) < 2 {
return Properties{}, 0
}
i := bidiIndex[c0]
c1 := s[1]
if c1 < 0x80 || 0xC0 <= c1 {
return Properties{}, 1
}
return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
case c0 < 0xF0: // 3-byte UTF-8
if len(s) < 3 {
return Properties{}, 0
}
i := bidiIndex[c0]
c1 := s[1]
if c1 < 0x80 || 0xC0 <= c1 {
return Properties{}, 1
}
o := uint32(i)<<6 + uint32(c1)
i = bidiIndex[o]
c2 := s[2]
if c2 < 0x80 || 0xC0 <= c2 {
return Properties{}, 1
}
return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
case c0 < 0xF8: // 4-byte UTF-8
if len(s) < 4 {
return Properties{}, 0
}
i := bidiIndex[c0]
c1 := s[1]
if c1 < 0x80 || 0xC0 <= c1 {
return Properties{}, 1
}
o := uint32(i)<<6 + uint32(c1)
i = bidiIndex[o]
c2 := s[2]
if c2 < 0x80 || 0xC0 <= c2 {
return Properties{}, 1
}
o = uint32(i)<<6 + uint32(c2)
i = bidiIndex[o]
c3 := s[3]
if c3 < 0x80 || 0xC0 <= c3 {
return Properties{}, 1
}
return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
}
// Illegal rune
return Properties{}, 1
}
// LookupString returns properties for the first rune in s and the width in
// bytes of its encoding. The size will be 0 if s does not hold enough bytes to
// complete the encoding.
func LookupString(s string) (p Properties, sz int) {
c0 := s[0]
switch {
case c0 < 0x80: // is ASCII
return Properties{entry: bidiValues[c0]}, 1
case c0 < 0xC2:
return Properties{}, 1
case c0 < 0xE0: // 2-byte UTF-8
if len(s) < 2 {
return Properties{}, 0
}
i := bidiIndex[c0]
c1 := s[1]
if c1 < 0x80 || 0xC0 <= c1 {
return Properties{}, 1
}
return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
case c0 < 0xF0: // 3-byte UTF-8
if len(s) < 3 {
return Properties{}, 0
}
i := bidiIndex[c0]
c1 := s[1]
if c1 < 0x80 || 0xC0 <= c1 {
return Properties{}, 1
}
o := uint32(i)<<6 + uint32(c1)
i = bidiIndex[o]
c2 := s[2]
if c2 < 0x80 || 0xC0 <= c2 {
return Properties{}, 1
}
return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
case c0 < 0xF8: // 4-byte UTF-8
if len(s) < 4 {
return Properties{}, 0
}
i := bidiIndex[c0]
c1 := s[1]
if c1 < 0x80 || 0xC0 <= c1 {
return Properties{}, 1
}
o := uint32(i)<<6 + uint32(c1)
i = bidiIndex[o]
c2 := s[2]
if c2 < 0x80 || 0xC0 <= c2 {
return Properties{}, 1
}
o = uint32(i)<<6 + uint32(c2)
i = bidiIndex[o]
c3 := s[3]
if c3 < 0x80 || 0xC0 <= c3 {
return Properties{}, 1
}
return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
}
// Illegal rune
return Properties{}, 1
}

1815
vendor/golang.org/x/text/unicode/bidi/tables10.0.0.go generated vendored Normal file

File diff suppressed because it is too large Load diff

1887
vendor/golang.org/x/text/unicode/bidi/tables11.0.0.go generated vendored Normal file

File diff suppressed because it is too large Load diff

1781
vendor/golang.org/x/text/unicode/bidi/tables9.0.0.go generated vendored Normal file

File diff suppressed because it is too large Load diff

60
vendor/golang.org/x/text/unicode/bidi/trieval.go generated vendored Normal file
View file

@ -0,0 +1,60 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package bidi
// Class is the Unicode BiDi class. Each rune has a single class.
type Class uint
const (
L Class = iota // LeftToRight
R // RightToLeft
EN // EuropeanNumber
ES // EuropeanSeparator
ET // EuropeanTerminator
AN // ArabicNumber
CS // CommonSeparator
B // ParagraphSeparator
S // SegmentSeparator
WS // WhiteSpace
ON // OtherNeutral
BN // BoundaryNeutral
NSM // NonspacingMark
AL // ArabicLetter
Control // Control LRO - PDI
numClass
LRO // LeftToRightOverride
RLO // RightToLeftOverride
LRE // LeftToRightEmbedding
RLE // RightToLeftEmbedding
PDF // PopDirectionalFormat
LRI // LeftToRightIsolate
RLI // RightToLeftIsolate
FSI // FirstStrongIsolate
PDI // PopDirectionalIsolate
unknownClass = ^Class(0)
)
var controlToClass = map[rune]Class{
0x202D: LRO, // LeftToRightOverride,
0x202E: RLO, // RightToLeftOverride,
0x202A: LRE, // LeftToRightEmbedding,
0x202B: RLE, // RightToLeftEmbedding,
0x202C: PDF, // PopDirectionalFormat,
0x2066: LRI, // LeftToRightIsolate,
0x2067: RLI, // RightToLeftIsolate,
0x2068: FSI, // FirstStrongIsolate,
0x2069: PDI, // PopDirectionalIsolate,
}
// A trie entry has the following bits:
// 7..5 XOR mask for brackets
// 4 1: Bracket open, 0: Bracket close
// 3..0 Class type
const (
openMask = 0x10
xorMaskShift = 5
)

512
vendor/golang.org/x/text/unicode/norm/composition.go generated vendored Normal file
View file

@ -0,0 +1,512 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import "unicode/utf8"
const (
maxNonStarters = 30
// The maximum number of characters needed for a buffer is
// maxNonStarters + 1 for the starter + 1 for the GCJ
maxBufferSize = maxNonStarters + 2
maxNFCExpansion = 3 // NFC(0x1D160)
maxNFKCExpansion = 18 // NFKC(0xFDFA)
maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
)
// ssState is used for reporting the segment state after inserting a rune.
// It is returned by streamSafe.next.
type ssState int
const (
// Indicates a rune was successfully added to the segment.
ssSuccess ssState = iota
// Indicates a rune starts a new segment and should not be added.
ssStarter
// Indicates a rune caused a segment overflow and a CGJ should be inserted.
ssOverflow
)
// streamSafe implements the policy of when a CGJ should be inserted.
type streamSafe uint8
// first inserts the first rune of a segment. It is a faster version of next if
// it is known p represents the first rune in a segment.
func (ss *streamSafe) first(p Properties) {
*ss = streamSafe(p.nTrailingNonStarters())
}
// insert returns a ssState value to indicate whether a rune represented by p
// can be inserted.
func (ss *streamSafe) next(p Properties) ssState {
if *ss > maxNonStarters {
panic("streamSafe was not reset")
}
n := p.nLeadingNonStarters()
if *ss += streamSafe(n); *ss > maxNonStarters {
*ss = 0
return ssOverflow
}
// The Stream-Safe Text Processing prescribes that the counting can stop
// as soon as a starter is encountered. However, there are some starters,
// like Jamo V and T, that can combine with other runes, leaving their
// successive non-starters appended to the previous, possibly causing an
// overflow. We will therefore consider any rune with a non-zero nLead to
// be a non-starter. Note that it always hold that if nLead > 0 then
// nLead == nTrail.
if n == 0 {
*ss = streamSafe(p.nTrailingNonStarters())
return ssStarter
}
return ssSuccess
}
// backwards is used for checking for overflow and segment starts
// when traversing a string backwards. Users do not need to call first
// for the first rune. The state of the streamSafe retains the count of
// the non-starters loaded.
func (ss *streamSafe) backwards(p Properties) ssState {
if *ss > maxNonStarters {
panic("streamSafe was not reset")
}
c := *ss + streamSafe(p.nTrailingNonStarters())
if c > maxNonStarters {
return ssOverflow
}
*ss = c
if p.nLeadingNonStarters() == 0 {
return ssStarter
}
return ssSuccess
}
func (ss streamSafe) isMax() bool {
return ss == maxNonStarters
}
// GraphemeJoiner is inserted after maxNonStarters non-starter runes.
const GraphemeJoiner = "\u034F"
// reorderBuffer is used to normalize a single segment. Characters inserted with
// insert are decomposed and reordered based on CCC. The compose method can
// be used to recombine characters. Note that the byte buffer does not hold
// the UTF-8 characters in order. Only the rune array is maintained in sorted
// order. flush writes the resulting segment to a byte array.
type reorderBuffer struct {
rune [maxBufferSize]Properties // Per character info.
byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos.
nbyte uint8 // Number or bytes.
ss streamSafe // For limiting length of non-starter sequence.
nrune int // Number of runeInfos.
f formInfo
src input
nsrc int
tmpBytes input
out []byte
flushF func(*reorderBuffer) bool
}
func (rb *reorderBuffer) init(f Form, src []byte) {
rb.f = *formTable[f]
rb.src.setBytes(src)
rb.nsrc = len(src)
rb.ss = 0
}
func (rb *reorderBuffer) initString(f Form, src string) {
rb.f = *formTable[f]
rb.src.setString(src)
rb.nsrc = len(src)
rb.ss = 0
}
func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) {
rb.out = out
rb.flushF = f
}
// reset discards all characters from the buffer.
func (rb *reorderBuffer) reset() {
rb.nrune = 0
rb.nbyte = 0
}
func (rb *reorderBuffer) doFlush() bool {
if rb.f.composing {
rb.compose()
}
res := rb.flushF(rb)
rb.reset()
return res
}
// appendFlush appends the normalized segment to rb.out.
func appendFlush(rb *reorderBuffer) bool {
for i := 0; i < rb.nrune; i++ {
start := rb.rune[i].pos
end := start + rb.rune[i].size
rb.out = append(rb.out, rb.byte[start:end]...)
}
return true
}
// flush appends the normalized segment to out and resets rb.
func (rb *reorderBuffer) flush(out []byte) []byte {
for i := 0; i < rb.nrune; i++ {
start := rb.rune[i].pos
end := start + rb.rune[i].size
out = append(out, rb.byte[start:end]...)
}
rb.reset()
return out
}
// flushCopy copies the normalized segment to buf and resets rb.
// It returns the number of bytes written to buf.
func (rb *reorderBuffer) flushCopy(buf []byte) int {
p := 0
for i := 0; i < rb.nrune; i++ {
runep := rb.rune[i]
p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size])
}
rb.reset()
return p
}
// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
// It returns false if the buffer is not large enough to hold the rune.
// It is used internally by insert and insertString only.
func (rb *reorderBuffer) insertOrdered(info Properties) {
n := rb.nrune
b := rb.rune[:]
cc := info.ccc
if cc > 0 {
// Find insertion position + move elements to make room.
for ; n > 0; n-- {
if b[n-1].ccc <= cc {
break
}
b[n] = b[n-1]
}
}
rb.nrune += 1
pos := uint8(rb.nbyte)
rb.nbyte += utf8.UTFMax
info.pos = pos
b[n] = info
}
// insertErr is an error code returned by insert. Using this type instead
// of error improves performance up to 20% for many of the benchmarks.
type insertErr int
const (
iSuccess insertErr = -iota
iShortDst
iShortSrc
)
// insertFlush inserts the given rune in the buffer ordered by CCC.
// If a decomposition with multiple segments are encountered, they leading
// ones are flushed.
// It returns a non-zero error code if the rune was not inserted.
func (rb *reorderBuffer) insertFlush(src input, i int, info Properties) insertErr {
if rune := src.hangul(i); rune != 0 {
rb.decomposeHangul(rune)
return iSuccess
}
if info.hasDecomposition() {
return rb.insertDecomposed(info.Decomposition())
}
rb.insertSingle(src, i, info)
return iSuccess
}
// insertUnsafe inserts the given rune in the buffer ordered by CCC.
// It is assumed there is sufficient space to hold the runes. It is the
// responsibility of the caller to ensure this. This can be done by checking
// the state returned by the streamSafe type.
func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) {
if rune := src.hangul(i); rune != 0 {
rb.decomposeHangul(rune)
}
if info.hasDecomposition() {
// TODO: inline.
rb.insertDecomposed(info.Decomposition())
} else {
rb.insertSingle(src, i, info)
}
}
// insertDecomposed inserts an entry in to the reorderBuffer for each rune
// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
// It flushes the buffer on each new segment start.
func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr {
rb.tmpBytes.setBytes(dcomp)
// As the streamSafe accounting already handles the counting for modifiers,
// we don't have to call next. However, we do need to keep the accounting
// intact when flushing the buffer.
for i := 0; i < len(dcomp); {
info := rb.f.info(rb.tmpBytes, i)
if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() {
return iShortDst
}
i += copy(rb.byte[rb.nbyte:], dcomp[i:i+int(info.size)])
rb.insertOrdered(info)
}
return iSuccess
}
// insertSingle inserts an entry in the reorderBuffer for the rune at
// position i. info is the runeInfo for the rune at position i.
func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) {
src.copySlice(rb.byte[rb.nbyte:], i, i+int(info.size))
rb.insertOrdered(info)
}
// insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb.
func (rb *reorderBuffer) insertCGJ() {
rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))})
}
// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
func (rb *reorderBuffer) appendRune(r rune) {
bn := rb.nbyte
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
rb.nbyte += utf8.UTFMax
rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)}
rb.nrune++
}
// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
func (rb *reorderBuffer) assignRune(pos int, r rune) {
bn := rb.rune[pos].pos
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
rb.rune[pos] = Properties{pos: bn, size: uint8(sz)}
}
// runeAt returns the rune at position n. It is used for Hangul and recomposition.
func (rb *reorderBuffer) runeAt(n int) rune {
inf := rb.rune[n]
r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
return r
}
// bytesAt returns the UTF-8 encoding of the rune at position n.
// It is used for Hangul and recomposition.
func (rb *reorderBuffer) bytesAt(n int) []byte {
inf := rb.rune[n]
return rb.byte[inf.pos : int(inf.pos)+int(inf.size)]
}
// For Hangul we combine algorithmically, instead of using tables.
const (
hangulBase = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
hangulBase0 = 0xEA
hangulBase1 = 0xB0
hangulBase2 = 0x80
hangulEnd = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
hangulEnd0 = 0xED
hangulEnd1 = 0x9E
hangulEnd2 = 0xA4
jamoLBase = 0x1100 // UTF-8(jamoLBase) -> E1 84 00
jamoLBase0 = 0xE1
jamoLBase1 = 0x84
jamoLEnd = 0x1113
jamoVBase = 0x1161
jamoVEnd = 0x1176
jamoTBase = 0x11A7
jamoTEnd = 0x11C3
jamoTCount = 28
jamoVCount = 21
jamoVTCount = 21 * 28
jamoLVTCount = 19 * 21 * 28
)
const hangulUTF8Size = 3
func isHangul(b []byte) bool {
if len(b) < hangulUTF8Size {
return false
}
b0 := b[0]
if b0 < hangulBase0 {
return false
}
b1 := b[1]
switch {
case b0 == hangulBase0:
return b1 >= hangulBase1
case b0 < hangulEnd0:
return true
case b0 > hangulEnd0:
return false
case b1 < hangulEnd1:
return true
}
return b1 == hangulEnd1 && b[2] < hangulEnd2
}
func isHangulString(b string) bool {
if len(b) < hangulUTF8Size {
return false
}
b0 := b[0]
if b0 < hangulBase0 {
return false
}
b1 := b[1]
switch {
case b0 == hangulBase0:
return b1 >= hangulBase1
case b0 < hangulEnd0:
return true
case b0 > hangulEnd0:
return false
case b1 < hangulEnd1:
return true
}
return b1 == hangulEnd1 && b[2] < hangulEnd2
}
// Caller must ensure len(b) >= 2.
func isJamoVT(b []byte) bool {
// True if (rune & 0xff00) == jamoLBase
return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1
}
func isHangulWithoutJamoT(b []byte) bool {
c, _ := utf8.DecodeRune(b)
c -= hangulBase
return c < jamoLVTCount && c%jamoTCount == 0
}
// decomposeHangul writes the decomposed Hangul to buf and returns the number
// of bytes written. len(buf) should be at least 9.
func decomposeHangul(buf []byte, r rune) int {
const JamoUTF8Len = 3
r -= hangulBase
x := r % jamoTCount
r /= jamoTCount
utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
if x != 0 {
utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
return 3 * JamoUTF8Len
}
return 2 * JamoUTF8Len
}
// decomposeHangul algorithmically decomposes a Hangul rune into
// its Jamo components.
// See https://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
func (rb *reorderBuffer) decomposeHangul(r rune) {
r -= hangulBase
x := r % jamoTCount
r /= jamoTCount
rb.appendRune(jamoLBase + r/jamoVCount)
rb.appendRune(jamoVBase + r%jamoVCount)
if x != 0 {
rb.appendRune(jamoTBase + x)
}
}
// combineHangul algorithmically combines Jamo character components into Hangul.
// See https://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
func (rb *reorderBuffer) combineHangul(s, i, k int) {
b := rb.rune[:]
bn := rb.nrune
for ; i < bn; i++ {
cccB := b[k-1].ccc
cccC := b[i].ccc
if cccB == 0 {
s = k - 1
}
if s != k-1 && cccB >= cccC {
// b[i] is blocked by greater-equal cccX below it
b[k] = b[i]
k++
} else {
l := rb.runeAt(s) // also used to compare to hangulBase
v := rb.runeAt(i) // also used to compare to jamoT
switch {
case jamoLBase <= l && l < jamoLEnd &&
jamoVBase <= v && v < jamoVEnd:
// 11xx plus 116x to LV
rb.assignRune(s, hangulBase+
(l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount)
case hangulBase <= l && l < hangulEnd &&
jamoTBase < v && v < jamoTEnd &&
((l-hangulBase)%jamoTCount) == 0:
// ACxx plus 11Ax to LVT
rb.assignRune(s, l+v-jamoTBase)
default:
b[k] = b[i]
k++
}
}
}
rb.nrune = k
}
// compose recombines the runes in the buffer.
// It should only be used to recompose a single segment, as it will not
// handle alternations between Hangul and non-Hangul characters correctly.
func (rb *reorderBuffer) compose() {
// Lazily load the map used by the combine func below, but do
// it outside of the loop.
recompMapOnce.Do(buildRecompMap)
// UAX #15, section X5 , including Corrigendum #5
// "In any character sequence beginning with starter S, a character C is
// blocked from S if and only if there is some character B between S
// and C, and either B is a starter or it has the same or higher
// combining class as C."
bn := rb.nrune
if bn == 0 {
return
}
k := 1
b := rb.rune[:]
for s, i := 0, 1; i < bn; i++ {
if isJamoVT(rb.bytesAt(i)) {
// Redo from start in Hangul mode. Necessary to support
// U+320E..U+321E in NFKC mode.
rb.combineHangul(s, i, k)
return
}
ii := b[i]
// We can only use combineForward as a filter if we later
// get the info for the combined character. This is more
// expensive than using the filter. Using combinesBackward()
// is safe.
if ii.combinesBackward() {
cccB := b[k-1].ccc
cccC := ii.ccc
blocked := false // b[i] blocked by starter or greater or equal CCC?
if cccB == 0 {
s = k - 1
} else {
blocked = s != k-1 && cccB >= cccC
}
if !blocked {
combined := combine(rb.runeAt(s), rb.runeAt(i))
if combined != 0 {
rb.assignRune(s, combined)
continue
}
}
}
b[k] = b[i]
k++
}
rb.nrune = k
}

278
vendor/golang.org/x/text/unicode/norm/forminfo.go generated vendored Normal file
View file

@ -0,0 +1,278 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import "encoding/binary"
// This file contains Form-specific logic and wrappers for data in tables.go.
// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie. Each trie maps
// a rune to a uint16. The values take two forms. For v >= 0x8000:
// bits
// 15: 1 (inverse of NFD_QC bit of qcInfo)
// 13..7: qcInfo (see below). isYesD is always true (no decompostion).
// 6..0: ccc (compressed CCC value).
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
// <header> <decomp_byte>* [<tccc> [<lccc>]]
// The header contains the number of bytes in the decomposition (excluding this
// length byte). The two most significant bits of this length byte correspond
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
// The byte sequence is followed by a trailing and leading CCC if the values
// for these are not zero. The value of v determines which ccc are appended
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCC
// there is an additional leading ccc. The value of tccc itself is the
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
// are the number of trailing non-starters.
const (
qcInfoMask = 0x3F // to clear all but the relevant bits in a qcInfo
headerLenMask = 0x3F // extract the length value from the header byte
headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
// Properties provides access to normalization properties of a rune.
type Properties struct {
pos uint8 // start position in reorderBuffer; used in composition.go
size uint8 // length of UTF-8 encoding of this rune
ccc uint8 // leading canonical combining class (ccc if not decomposition)
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
nLead uint8 // number of leading non-starters.
flags qcInfo // quick check flags
index uint16
}
// functions dispatchable per form
type lookupFunc func(b input, i int) Properties
// formInfo holds Form-specific functions and tables.
type formInfo struct {
form Form
composing, compatibility bool // form type
info lookupFunc
nextMain iterFunc
}
var formTable = []*formInfo{{
form: NFC,
composing: true,
compatibility: false,
info: lookupInfoNFC,
nextMain: nextComposed,
}, {
form: NFD,
composing: false,
compatibility: false,
info: lookupInfoNFC,
nextMain: nextDecomposed,
}, {
form: NFKC,
composing: true,
compatibility: true,
info: lookupInfoNFKC,
nextMain: nextComposed,
}, {
form: NFKD,
composing: false,
compatibility: true,
info: lookupInfoNFKC,
nextMain: nextDecomposed,
}}
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
// unexpected behavior for the user. For example, in NFD, there is a boundary
// after 'a'. However, 'a' might combine with modifiers, so from the application's
// perspective it is not a good boundary. We will therefore always use the
// boundaries for the combining variants.
// BoundaryBefore returns true if this rune starts a new segment and
// cannot combine with any rune on the left.
func (p Properties) BoundaryBefore() bool {
if p.ccc == 0 && !p.combinesBackward() {
return true
}
// We assume that the CCC of the first character in a decomposition
// is always non-zero if different from info.ccc and that we can return
// false at this point. This is verified by maketables.
return false
}
// BoundaryAfter returns true if runes cannot combine with or otherwise
// interact with this or previous runes.
func (p Properties) BoundaryAfter() bool {
// TODO: loosen these conditions.
return p.isInert()
}
// We pack quick check data in 4 bits:
// 5: Combines forward (0 == false, 1 == true)
// 4..3: NFC_QC Yes(00), No (10), or Maybe (11)
// 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition.
// 1..0: Number of trailing non-starters.
//
// When all 4 bits are zero, the character is inert, meaning it is never
// influenced by normalization.
type qcInfo uint8
func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 }
func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
func (p Properties) isInert() bool {
return p.flags&qcInfoMask == 0 && p.ccc == 0
}
func (p Properties) multiSegment() bool {
return p.index >= firstMulti && p.index < endMulti
}
func (p Properties) nLeadingNonStarters() uint8 {
return p.nLead
}
func (p Properties) nTrailingNonStarters() uint8 {
return uint8(p.flags & 0x03)
}
// Decomposition returns the decomposition for the underlying rune
// or nil if there is none.
func (p Properties) Decomposition() []byte {
// TODO: create the decomposition for Hangul?
if p.index == 0 {
return nil
}
i := p.index
n := decomps[i] & headerLenMask
i++
return decomps[i : i+uint16(n)]
}
// Size returns the length of UTF-8 encoding of the rune.
func (p Properties) Size() int {
return int(p.size)
}
// CCC returns the canonical combining class of the underlying rune.
func (p Properties) CCC() uint8 {
if p.index >= firstCCCZeroExcept {
return 0
}
return ccc[p.ccc]
}
// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
func (p Properties) LeadCCC() uint8 {
return ccc[p.ccc]
}
// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
func (p Properties) TrailCCC() uint8 {
return ccc[p.tccc]
}
func buildRecompMap() {
recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
var buf [8]byte
for i := 0; i < len(recompMapPacked); i += 8 {
copy(buf[:], recompMapPacked[i:i+8])
key := binary.BigEndian.Uint32(buf[:4])
val := binary.BigEndian.Uint32(buf[4:])
recompMap[key] = rune(val)
}
}
// Recomposition
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
// This clips off the bits of three entries, but we know this will not
// result in a collision. In the unlikely event that changes to
// UnicodeData.txt introduce collisions, the compiler will catch it.
// Note that the recomposition map for NFC and NFKC are identical.
// combine returns the combined rune or 0 if it doesn't exist.
//
// The caller is responsible for calling
// recompMapOnce.Do(buildRecompMap) sometime before this is called.
func combine(a, b rune) rune {
key := uint32(uint16(a))<<16 + uint32(uint16(b))
if recompMap == nil {
panic("caller error") // see func comment
}
return recompMap[key]
}
func lookupInfoNFC(b input, i int) Properties {
v, sz := b.charinfoNFC(i)
return compInfo(v, sz)
}
func lookupInfoNFKC(b input, i int) Properties {
v, sz := b.charinfoNFKC(i)
return compInfo(v, sz)
}
// Properties returns properties for the first rune in s.
func (f Form) Properties(s []byte) Properties {
if f == NFC || f == NFD {
return compInfo(nfcData.lookup(s))
}
return compInfo(nfkcData.lookup(s))
}
// PropertiesString returns properties for the first rune in s.
func (f Form) PropertiesString(s string) Properties {
if f == NFC || f == NFD {
return compInfo(nfcData.lookupString(s))
}
return compInfo(nfkcData.lookupString(s))
}
// compInfo converts the information contained in v and sz
// to a Properties. See the comment at the top of the file
// for more information on the format.
func compInfo(v uint16, sz int) Properties {
if v == 0 {
return Properties{size: uint8(sz)}
} else if v >= 0x8000 {
p := Properties{
size: uint8(sz),
ccc: uint8(v),
tccc: uint8(v),
flags: qcInfo(v >> 8),
}
if p.ccc > 0 || p.combinesBackward() {
p.nLead = uint8(p.flags & 0x3)
}
return p
}
// has decomposition
h := decomps[v]
f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
p := Properties{size: uint8(sz), flags: f, index: v}
if v >= firstCCC {
v += uint16(h&headerLenMask) + 1
c := decomps[v]
p.tccc = c >> 2
p.flags |= qcInfo(c & 0x3)
if v >= firstLeadingCCC {
p.nLead = c & 0x3
if v >= firstStarterWithNLead {
// We were tricked. Remove the decomposition.
p.flags &= 0x03
p.index = 0
return p
}
p.ccc = decomps[v+1]
}
}
return p
}

109
vendor/golang.org/x/text/unicode/norm/input.go generated vendored Normal file
View file

@ -0,0 +1,109 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import "unicode/utf8"
type input struct {
str string
bytes []byte
}
func inputBytes(str []byte) input {
return input{bytes: str}
}
func inputString(str string) input {
return input{str: str}
}
func (in *input) setBytes(str []byte) {
in.str = ""
in.bytes = str
}
func (in *input) setString(str string) {
in.str = str
in.bytes = nil
}
func (in *input) _byte(p int) byte {
if in.bytes == nil {
return in.str[p]
}
return in.bytes[p]
}
func (in *input) skipASCII(p, max int) int {
if in.bytes == nil {
for ; p < max && in.str[p] < utf8.RuneSelf; p++ {
}
} else {
for ; p < max && in.bytes[p] < utf8.RuneSelf; p++ {
}
}
return p
}
func (in *input) skipContinuationBytes(p int) int {
if in.bytes == nil {
for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ {
}
} else {
for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ {
}
}
return p
}
func (in *input) appendSlice(buf []byte, b, e int) []byte {
if in.bytes != nil {
return append(buf, in.bytes[b:e]...)
}
for i := b; i < e; i++ {
buf = append(buf, in.str[i])
}
return buf
}
func (in *input) copySlice(buf []byte, b, e int) int {
if in.bytes == nil {
return copy(buf, in.str[b:e])
}
return copy(buf, in.bytes[b:e])
}
func (in *input) charinfoNFC(p int) (uint16, int) {
if in.bytes == nil {
return nfcData.lookupString(in.str[p:])
}
return nfcData.lookup(in.bytes[p:])
}
func (in *input) charinfoNFKC(p int) (uint16, int) {
if in.bytes == nil {
return nfkcData.lookupString(in.str[p:])
}
return nfkcData.lookup(in.bytes[p:])
}
func (in *input) hangul(p int) (r rune) {
var size int
if in.bytes == nil {
if !isHangulString(in.str[p:]) {
return 0
}
r, size = utf8.DecodeRuneInString(in.str[p:])
} else {
if !isHangul(in.bytes[p:]) {
return 0
}
r, size = utf8.DecodeRune(in.bytes[p:])
}
if size != hangulUTF8Size {
return 0
}
return r
}

458
vendor/golang.org/x/text/unicode/norm/iter.go generated vendored Normal file
View file

@ -0,0 +1,458 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import (
"fmt"
"unicode/utf8"
)
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
const MaxSegmentSize = maxByteBufferSize
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
type Iter struct {
rb reorderBuffer
buf [maxByteBufferSize]byte
info Properties // first character saved from previous iteration
next iterFunc // implementation of next depends on form
asciiF iterFunc
p int // current position in input source
multiSeg []byte // remainder of multi-segment decomposition
}
type iterFunc func(*Iter) []byte
// Init initializes i to iterate over src after normalizing it to Form f.
func (i *Iter) Init(f Form, src []byte) {
i.p = 0
if len(src) == 0 {
i.setDone()
i.rb.nsrc = 0
return
}
i.multiSeg = nil
i.rb.init(f, src)
i.next = i.rb.f.nextMain
i.asciiF = nextASCIIBytes
i.info = i.rb.f.info(i.rb.src, i.p)
i.rb.ss.first(i.info)
}
// InitString initializes i to iterate over src after normalizing it to Form f.
func (i *Iter) InitString(f Form, src string) {
i.p = 0
if len(src) == 0 {
i.setDone()
i.rb.nsrc = 0
return
}
i.multiSeg = nil
i.rb.initString(f, src)
i.next = i.rb.f.nextMain
i.asciiF = nextASCIIString
i.info = i.rb.f.info(i.rb.src, i.p)
i.rb.ss.first(i.info)
}
// Seek sets the segment to be returned by the next call to Next to start
// at position p. It is the responsibility of the caller to set p to the
// start of a segment.
func (i *Iter) Seek(offset int64, whence int) (int64, error) {
var abs int64
switch whence {
case 0:
abs = offset
case 1:
abs = int64(i.p) + offset
case 2:
abs = int64(i.rb.nsrc) + offset
default:
return 0, fmt.Errorf("norm: invalid whence")
}
if abs < 0 {
return 0, fmt.Errorf("norm: negative position")
}
if int(abs) >= i.rb.nsrc {
i.setDone()
return int64(i.p), nil
}
i.p = int(abs)
i.multiSeg = nil
i.next = i.rb.f.nextMain
i.info = i.rb.f.info(i.rb.src, i.p)
i.rb.ss.first(i.info)
return abs, nil
}
// returnSlice returns a slice of the underlying input type as a byte slice.
// If the underlying is of type []byte, it will simply return a slice.
// If the underlying is of type string, it will copy the slice to the buffer
// and return that.
func (i *Iter) returnSlice(a, b int) []byte {
if i.rb.src.bytes == nil {
return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
}
return i.rb.src.bytes[a:b]
}
// Pos returns the byte position at which the next call to Next will commence processing.
func (i *Iter) Pos() int {
return i.p
}
func (i *Iter) setDone() {
i.next = nextDone
i.p = i.rb.nsrc
}
// Done returns true if there is no more input to process.
func (i *Iter) Done() bool {
return i.p >= i.rb.nsrc
}
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
// For any input a and b for which f(a) == f(b), subsequent calls
// to Next will return the same segments.
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
// Although not guaranteed, n will typically be the smallest possible n.
func (i *Iter) Next() []byte {
return i.next(i)
}
func nextASCIIBytes(i *Iter) []byte {
p := i.p + 1
if p >= i.rb.nsrc {
p0 := i.p
i.setDone()
return i.rb.src.bytes[p0:p]
}
if i.rb.src.bytes[p] < utf8.RuneSelf {
p0 := i.p
i.p = p
return i.rb.src.bytes[p0:p]
}
i.info = i.rb.f.info(i.rb.src, i.p)
i.next = i.rb.f.nextMain
return i.next(i)
}
func nextASCIIString(i *Iter) []byte {
p := i.p + 1
if p >= i.rb.nsrc {
i.buf[0] = i.rb.src.str[i.p]
i.setDone()
return i.buf[:1]
}
if i.rb.src.str[p] < utf8.RuneSelf {
i.buf[0] = i.rb.src.str[i.p]
i.p = p
return i.buf[:1]
}
i.info = i.rb.f.info(i.rb.src, i.p)
i.next = i.rb.f.nextMain
return i.next(i)
}
func nextHangul(i *Iter) []byte {
p := i.p
next := p + hangulUTF8Size
if next >= i.rb.nsrc {
i.setDone()
} else if i.rb.src.hangul(next) == 0 {
i.rb.ss.next(i.info)
i.info = i.rb.f.info(i.rb.src, i.p)
i.next = i.rb.f.nextMain
return i.next(i)
}
i.p = next
return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
}
func nextDone(i *Iter) []byte {
return nil
}
// nextMulti is used for iterating over multi-segment decompositions
// for decomposing normal forms.
func nextMulti(i *Iter) []byte {
j := 0
d := i.multiSeg
// skip first rune
for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
}
for j < len(d) {
info := i.rb.f.info(input{bytes: d}, j)
if info.BoundaryBefore() {
i.multiSeg = d[j:]
return d[:j]
}
j += int(info.size)
}
// treat last segment as normal decomposition
i.next = i.rb.f.nextMain
return i.next(i)
}
// nextMultiNorm is used for iterating over multi-segment decompositions
// for composing normal forms.
func nextMultiNorm(i *Iter) []byte {
j := 0
d := i.multiSeg
for j < len(d) {
info := i.rb.f.info(input{bytes: d}, j)
if info.BoundaryBefore() {
i.rb.compose()
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
i.rb.insertUnsafe(input{bytes: d}, j, info)
i.multiSeg = d[j+int(info.size):]
return seg
}
i.rb.insertUnsafe(input{bytes: d}, j, info)
j += int(info.size)
}
i.multiSeg = nil
i.next = nextComposed
return doNormComposed(i)
}
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
func nextDecomposed(i *Iter) (next []byte) {
outp := 0
inCopyStart, outCopyStart := i.p, 0
for {
if sz := int(i.info.size); sz <= 1 {
i.rb.ss = 0
p := i.p
i.p++ // ASCII or illegal byte. Either way, advance by 1.
if i.p >= i.rb.nsrc {
i.setDone()
return i.returnSlice(p, i.p)
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
i.next = i.asciiF
return i.returnSlice(p, i.p)
}
outp++
} else if d := i.info.Decomposition(); d != nil {
// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
// Case 1: there is a leftover to copy. In this case the decomposition
// must begin with a modifier and should always be appended.
// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
p := outp + len(d)
if outp > 0 {
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
// TODO: this condition should not be possible, but we leave it
// in for defensive purposes.
if p > len(i.buf) {
return i.buf[:outp]
}
} else if i.info.multiSegment() {
// outp must be 0 as multi-segment decompositions always
// start a new segment.
if i.multiSeg == nil {
i.multiSeg = d
i.next = nextMulti
return nextMulti(i)
}
// We are in the last segment. Treat as normal decomposition.
d = i.multiSeg
i.multiSeg = nil
p = len(d)
}
prevCC := i.info.tccc
if i.p += sz; i.p >= i.rb.nsrc {
i.setDone()
i.info = Properties{} // Force BoundaryBefore to succeed.
} else {
i.info = i.rb.f.info(i.rb.src, i.p)
}
switch i.rb.ss.next(i.info) {
case ssOverflow:
i.next = nextCGJDecompose
fallthrough
case ssStarter:
if outp > 0 {
copy(i.buf[outp:], d)
return i.buf[:p]
}
return d
}
copy(i.buf[outp:], d)
outp = p
inCopyStart, outCopyStart = i.p, outp
if i.info.ccc < prevCC {
goto doNorm
}
continue
} else if r := i.rb.src.hangul(i.p); r != 0 {
outp = decomposeHangul(i.buf[:], r)
i.p += hangulUTF8Size
inCopyStart, outCopyStart = i.p, outp
if i.p >= i.rb.nsrc {
i.setDone()
break
} else if i.rb.src.hangul(i.p) != 0 {
i.next = nextHangul
return i.buf[:outp]
}
} else {
p := outp + sz
if p > len(i.buf) {
break
}
outp = p
i.p += sz
}
if i.p >= i.rb.nsrc {
i.setDone()
break
}
prevCC := i.info.tccc
i.info = i.rb.f.info(i.rb.src, i.p)
if v := i.rb.ss.next(i.info); v == ssStarter {
break
} else if v == ssOverflow {
i.next = nextCGJDecompose
break
}
if i.info.ccc < prevCC {
goto doNorm
}
}
if outCopyStart == 0 {
return i.returnSlice(inCopyStart, i.p)
} else if inCopyStart < i.p {
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
}
return i.buf[:outp]
doNorm:
// Insert what we have decomposed so far in the reorderBuffer.
// As we will only reorder, there will always be enough room.
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
i.rb.insertDecomposed(i.buf[0:outp])
return doNormDecomposed(i)
}
func doNormDecomposed(i *Iter) []byte {
for {
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
i.setDone()
break
}
i.info = i.rb.f.info(i.rb.src, i.p)
if i.info.ccc == 0 {
break
}
if s := i.rb.ss.next(i.info); s == ssOverflow {
i.next = nextCGJDecompose
break
}
}
// new segment or too many combining characters: exit normalization
return i.buf[:i.rb.flushCopy(i.buf[:])]
}
func nextCGJDecompose(i *Iter) []byte {
i.rb.ss = 0
i.rb.insertCGJ()
i.next = nextDecomposed
i.rb.ss.first(i.info)
buf := doNormDecomposed(i)
return buf
}
// nextComposed is the implementation of Next for forms NFC and NFKC.
func nextComposed(i *Iter) []byte {
outp, startp := 0, i.p
var prevCC uint8
for {
if !i.info.isYesC() {
goto doNorm
}
prevCC = i.info.tccc
sz := int(i.info.size)
if sz == 0 {
sz = 1 // illegal rune: copy byte-by-byte
}
p := outp + sz
if p > len(i.buf) {
break
}
outp = p
i.p += sz
if i.p >= i.rb.nsrc {
i.setDone()
break
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
i.rb.ss = 0
i.next = i.asciiF
break
}
i.info = i.rb.f.info(i.rb.src, i.p)
if v := i.rb.ss.next(i.info); v == ssStarter {
break
} else if v == ssOverflow {
i.next = nextCGJCompose
break
}
if i.info.ccc < prevCC {
goto doNorm
}
}
return i.returnSlice(startp, i.p)
doNorm:
// reset to start position
i.p = startp
i.info = i.rb.f.info(i.rb.src, i.p)
i.rb.ss.first(i.info)
if i.info.multiSegment() {
d := i.info.Decomposition()
info := i.rb.f.info(input{bytes: d}, 0)
i.rb.insertUnsafe(input{bytes: d}, 0, info)
i.multiSeg = d[int(info.size):]
i.next = nextMultiNorm
return nextMultiNorm(i)
}
i.rb.ss.first(i.info)
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
return doNormComposed(i)
}
func doNormComposed(i *Iter) []byte {
// First rune should already be inserted.
for {
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
i.setDone()
break
}
i.info = i.rb.f.info(i.rb.src, i.p)
if s := i.rb.ss.next(i.info); s == ssStarter {
break
} else if s == ssOverflow {
i.next = nextCGJCompose
break
}
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
}
i.rb.compose()
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
return seg
}
func nextCGJCompose(i *Iter) []byte {
i.rb.ss = 0 // instead of first
i.rb.insertCGJ()
i.next = nextComposed
// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
// If we ever change that, insert a check here.
i.rb.ss.first(i.info)
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
return doNormComposed(i)
}

609
vendor/golang.org/x/text/unicode/norm/normalize.go generated vendored Normal file
View file

@ -0,0 +1,609 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Note: the file data_test.go that is generated should not be checked in.
//go:generate go run maketables.go triegen.go
//go:generate go test -tags test
// Package norm contains types and functions for normalizing Unicode strings.
package norm // import "golang.org/x/text/unicode/norm"
import (
"unicode/utf8"
"golang.org/x/text/transform"
)
// A Form denotes a canonical representation of Unicode code points.
// The Unicode-defined normalization and equivalence forms are:
//
// NFC Unicode Normalization Form C
// NFD Unicode Normalization Form D
// NFKC Unicode Normalization Form KC
// NFKD Unicode Normalization Form KD
//
// For a Form f, this documentation uses the notation f(x) to mean
// the bytes or string x converted to the given form.
// A position n in x is called a boundary if conversion to the form can
// proceed independently on both sides:
// f(x) == append(f(x[0:n]), f(x[n:])...)
//
// References: https://unicode.org/reports/tr15/ and
// https://unicode.org/notes/tn5/.
type Form int
const (
NFC Form = iota
NFD
NFKC
NFKD
)
// Bytes returns f(b). May return b if f(b) = b.
func (f Form) Bytes(b []byte) []byte {
src := inputBytes(b)
ft := formTable[f]
n, ok := ft.quickSpan(src, 0, len(b), true)
if ok {
return b
}
out := make([]byte, n, len(b))
copy(out, b[0:n])
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
return doAppendInner(&rb, n)
}
// String returns f(s).
func (f Form) String(s string) string {
src := inputString(s)
ft := formTable[f]
n, ok := ft.quickSpan(src, 0, len(s), true)
if ok {
return s
}
out := make([]byte, n, len(s))
copy(out, s[0:n])
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
return string(doAppendInner(&rb, n))
}
// IsNormal returns true if b == f(b).
func (f Form) IsNormal(b []byte) bool {
src := inputBytes(b)
ft := formTable[f]
bp, ok := ft.quickSpan(src, 0, len(b), true)
if ok {
return true
}
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
rb.setFlusher(nil, cmpNormalBytes)
for bp < len(b) {
rb.out = b[bp:]
if bp = decomposeSegment(&rb, bp, true); bp < 0 {
return false
}
bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
}
return true
}
func cmpNormalBytes(rb *reorderBuffer) bool {
b := rb.out
for i := 0; i < rb.nrune; i++ {
info := rb.rune[i]
if int(info.size) > len(b) {
return false
}
p := info.pos
pe := p + info.size
for ; p < pe; p++ {
if b[0] != rb.byte[p] {
return false
}
b = b[1:]
}
}
return true
}
// IsNormalString returns true if s == f(s).
func (f Form) IsNormalString(s string) bool {
src := inputString(s)
ft := formTable[f]
bp, ok := ft.quickSpan(src, 0, len(s), true)
if ok {
return true
}
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
rb.setFlusher(nil, func(rb *reorderBuffer) bool {
for i := 0; i < rb.nrune; i++ {
info := rb.rune[i]
if bp+int(info.size) > len(s) {
return false
}
p := info.pos
pe := p + info.size
for ; p < pe; p++ {
if s[bp] != rb.byte[p] {
return false
}
bp++
}
}
return true
})
for bp < len(s) {
if bp = decomposeSegment(&rb, bp, true); bp < 0 {
return false
}
bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
}
return true
}
// patchTail fixes a case where a rune may be incorrectly normalized
// if it is followed by illegal continuation bytes. It returns the
// patched buffer and whether the decomposition is still in progress.
func patchTail(rb *reorderBuffer) bool {
info, p := lastRuneStart(&rb.f, rb.out)
if p == -1 || info.size == 0 {
return true
}
end := p + int(info.size)
extra := len(rb.out) - end
if extra > 0 {
// Potentially allocating memory. However, this only
// happens with ill-formed UTF-8.
x := make([]byte, 0)
x = append(x, rb.out[len(rb.out)-extra:]...)
rb.out = rb.out[:end]
decomposeToLastBoundary(rb)
rb.doFlush()
rb.out = append(rb.out, x...)
return false
}
buf := rb.out[p:]
rb.out = rb.out[:p]
decomposeToLastBoundary(rb)
if s := rb.ss.next(info); s == ssStarter {
rb.doFlush()
rb.ss.first(info)
} else if s == ssOverflow {
rb.doFlush()
rb.insertCGJ()
rb.ss = 0
}
rb.insertUnsafe(inputBytes(buf), 0, info)
return true
}
func appendQuick(rb *reorderBuffer, i int) int {
if rb.nsrc == i {
return i
}
end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
rb.out = rb.src.appendSlice(rb.out, i, end)
return end
}
// Append returns f(append(out, b...)).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) Append(out []byte, src ...byte) []byte {
return f.doAppend(out, inputBytes(src), len(src))
}
func (f Form) doAppend(out []byte, src input, n int) []byte {
if n == 0 {
return out
}
ft := formTable[f]
// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
if len(out) == 0 {
p, _ := ft.quickSpan(src, 0, n, true)
out = src.appendSlice(out, 0, p)
if p == n {
return out
}
rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
return doAppendInner(&rb, p)
}
rb := reorderBuffer{f: *ft, src: src, nsrc: n}
return doAppend(&rb, out, 0)
}
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
rb.setFlusher(out, appendFlush)
src, n := rb.src, rb.nsrc
doMerge := len(out) > 0
if q := src.skipContinuationBytes(p); q > p {
// Move leading non-starters to destination.
rb.out = src.appendSlice(rb.out, p, q)
p = q
doMerge = patchTail(rb)
}
fd := &rb.f
if doMerge {
var info Properties
if p < n {
info = fd.info(src, p)
if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
if p == 0 {
decomposeToLastBoundary(rb)
}
p = decomposeSegment(rb, p, true)
}
}
if info.size == 0 {
rb.doFlush()
// Append incomplete UTF-8 encoding.
return src.appendSlice(rb.out, p, n)
}
if rb.nrune > 0 {
return doAppendInner(rb, p)
}
}
p = appendQuick(rb, p)
return doAppendInner(rb, p)
}
func doAppendInner(rb *reorderBuffer, p int) []byte {
for n := rb.nsrc; p < n; {
p = decomposeSegment(rb, p, true)
p = appendQuick(rb, p)
}
return rb.out
}
// AppendString returns f(append(out, []byte(s))).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) AppendString(out []byte, src string) []byte {
return f.doAppend(out, inputString(src), len(src))
}
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpan(b []byte) int {
n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
return n
}
// Span implements transform.SpanningTransformer. It returns a boundary n such
// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
if n < len(b) {
if !ok {
err = transform.ErrEndOfSpan
} else {
err = transform.ErrShortSrc
}
}
return n, err
}
// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
if n < len(s) {
if !ok {
err = transform.ErrEndOfSpan
} else {
err = transform.ErrShortSrc
}
}
return n, err
}
// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
// whether any non-normalized parts were found. If atEOF is false, n will
// not point past the last segment if this segment might be become
// non-normalized by appending other runes.
func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
var lastCC uint8
ss := streamSafe(0)
lastSegStart := i
for n = end; i < n; {
if j := src.skipASCII(i, n); i != j {
i = j
lastSegStart = i - 1
lastCC = 0
ss = 0
continue
}
info := f.info(src, i)
if info.size == 0 {
if atEOF {
// include incomplete runes
return n, true
}
return lastSegStart, true
}
// This block needs to be before the next, because it is possible to
// have an overflow for runes that are starters (e.g. with U+FF9E).
switch ss.next(info) {
case ssStarter:
lastSegStart = i
case ssOverflow:
return lastSegStart, false
case ssSuccess:
if lastCC > info.ccc {
return lastSegStart, false
}
}
if f.composing {
if !info.isYesC() {
break
}
} else {
if !info.isYesD() {
break
}
}
lastCC = info.ccc
i += int(info.size)
}
if i == n {
if !atEOF {
n = lastSegStart
}
return n, true
}
return lastSegStart, false
}
// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpanString(s string) int {
n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
return n
}
// FirstBoundary returns the position i of the first boundary in b
// or -1 if b contains no boundary.
func (f Form) FirstBoundary(b []byte) int {
return f.firstBoundary(inputBytes(b), len(b))
}
func (f Form) firstBoundary(src input, nsrc int) int {
i := src.skipContinuationBytes(0)
if i >= nsrc {
return -1
}
fd := formTable[f]
ss := streamSafe(0)
// We should call ss.first here, but we can't as the first rune is
// skipped already. This means FirstBoundary can't really determine
// CGJ insertion points correctly. Luckily it doesn't have to.
for {
info := fd.info(src, i)
if info.size == 0 {
return -1
}
if s := ss.next(info); s != ssSuccess {
return i
}
i += int(info.size)
if i >= nsrc {
if !info.BoundaryAfter() && !ss.isMax() {
return -1
}
return nsrc
}
}
}
// FirstBoundaryInString returns the position i of the first boundary in s
// or -1 if s contains no boundary.
func (f Form) FirstBoundaryInString(s string) int {
return f.firstBoundary(inputString(s), len(s))
}
// NextBoundary reports the index of the boundary between the first and next
// segment in b or -1 if atEOF is false and there are not enough bytes to
// determine this boundary.
func (f Form) NextBoundary(b []byte, atEOF bool) int {
return f.nextBoundary(inputBytes(b), len(b), atEOF)
}
// NextBoundaryInString reports the index of the boundary between the first and
// next segment in b or -1 if atEOF is false and there are not enough bytes to
// determine this boundary.
func (f Form) NextBoundaryInString(s string, atEOF bool) int {
return f.nextBoundary(inputString(s), len(s), atEOF)
}
func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
if nsrc == 0 {
if atEOF {
return 0
}
return -1
}
fd := formTable[f]
info := fd.info(src, 0)
if info.size == 0 {
if atEOF {
return 1
}
return -1
}
ss := streamSafe(0)
ss.first(info)
for i := int(info.size); i < nsrc; i += int(info.size) {
info = fd.info(src, i)
if info.size == 0 {
if atEOF {
return i
}
return -1
}
// TODO: Using streamSafe to determine the boundary isn't the same as
// using BoundaryBefore. Determine which should be used.
if s := ss.next(info); s != ssSuccess {
return i
}
}
if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
return -1
}
return nsrc
}
// LastBoundary returns the position i of the last boundary in b
// or -1 if b contains no boundary.
func (f Form) LastBoundary(b []byte) int {
return lastBoundary(formTable[f], b)
}
func lastBoundary(fd *formInfo, b []byte) int {
i := len(b)
info, p := lastRuneStart(fd, b)
if p == -1 {
return -1
}
if info.size == 0 { // ends with incomplete rune
if p == 0 { // starts with incomplete rune
return -1
}
i = p
info, p = lastRuneStart(fd, b[:i])
if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
return i
}
}
if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
return i
}
if info.BoundaryAfter() {
return i
}
ss := streamSafe(0)
v := ss.backwards(info)
for i = p; i >= 0 && v != ssStarter; i = p {
info, p = lastRuneStart(fd, b[:i])
if v = ss.backwards(info); v == ssOverflow {
break
}
if p+int(info.size) != i {
if p == -1 { // no boundary found
return -1
}
return i // boundary after an illegal UTF-8 encoding
}
}
return i
}
// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
// Force one character to be consumed.
info := rb.f.info(rb.src, sp)
if info.size == 0 {
return 0
}
if s := rb.ss.next(info); s == ssStarter {
// TODO: this could be removed if we don't support merging.
if rb.nrune > 0 {
goto end
}
} else if s == ssOverflow {
rb.insertCGJ()
goto end
}
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
return int(err)
}
for {
sp += int(info.size)
if sp >= rb.nsrc {
if !atEOF && !info.BoundaryAfter() {
return int(iShortSrc)
}
break
}
info = rb.f.info(rb.src, sp)
if info.size == 0 {
if !atEOF {
return int(iShortSrc)
}
break
}
if s := rb.ss.next(info); s == ssStarter {
break
} else if s == ssOverflow {
rb.insertCGJ()
break
}
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
return int(err)
}
}
end:
if !rb.doFlush() {
return int(iShortDst)
}
return sp
}
// lastRuneStart returns the runeInfo and position of the last
// rune in buf or the zero runeInfo and -1 if no rune was found.
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
p := len(buf) - 1
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
}
if p < 0 {
return Properties{}, -1
}
return fd.info(inputBytes(buf), p), p
}
// decomposeToLastBoundary finds an open segment at the end of the buffer
// and scans it into rb. Returns the buffer minus the last segment.
func decomposeToLastBoundary(rb *reorderBuffer) {
fd := &rb.f
info, i := lastRuneStart(fd, rb.out)
if int(info.size) != len(rb.out)-i {
// illegal trailing continuation bytes
return
}
if info.BoundaryAfter() {
return
}
var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
padd := 0
ss := streamSafe(0)
p := len(rb.out)
for {
add[padd] = info
v := ss.backwards(info)
if v == ssOverflow {
// Note that if we have an overflow, it the string we are appending to
// is not correctly normalized. In this case the behavior is undefined.
break
}
padd++
p -= int(info.size)
if v == ssStarter || p < 0 {
break
}
info, i = lastRuneStart(fd, rb.out[:p])
if int(info.size) != p-i {
break
}
}
rb.ss = ss
// Copy bytes for insertion as we may need to overwrite rb.out.
var buf [maxBufferSize * utf8.UTFMax]byte
cp := buf[:copy(buf[:], rb.out[p:])]
rb.out = rb.out[:p]
for padd--; padd >= 0; padd-- {
info = add[padd]
rb.insertUnsafe(inputBytes(cp), 0, info)
cp = cp[info.size:]
}
}

125
vendor/golang.org/x/text/unicode/norm/readwriter.go generated vendored Normal file
View file

@ -0,0 +1,125 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import "io"
type normWriter struct {
rb reorderBuffer
w io.Writer
buf []byte
}
// Write implements the standard write interface. If the last characters are
// not at a normalization boundary, the bytes will be buffered for the next
// write. The remaining bytes will be written on close.
func (w *normWriter) Write(data []byte) (n int, err error) {
// Process data in pieces to keep w.buf size bounded.
const chunk = 4000
for len(data) > 0 {
// Normalize into w.buf.
m := len(data)
if m > chunk {
m = chunk
}
w.rb.src = inputBytes(data[:m])
w.rb.nsrc = m
w.buf = doAppend(&w.rb, w.buf, 0)
data = data[m:]
n += m
// Write out complete prefix, save remainder.
// Note that lastBoundary looks back at most 31 runes.
i := lastBoundary(&w.rb.f, w.buf)
if i == -1 {
i = 0
}
if i > 0 {
if _, err = w.w.Write(w.buf[:i]); err != nil {
break
}
bn := copy(w.buf, w.buf[i:])
w.buf = w.buf[:bn]
}
}
return n, err
}
// Close forces data that remains in the buffer to be written.
func (w *normWriter) Close() error {
if len(w.buf) > 0 {
_, err := w.w.Write(w.buf)
if err != nil {
return err
}
}
return nil
}
// Writer returns a new writer that implements Write(b)
// by writing f(b) to w. The returned writer may use an
// internal buffer to maintain state across Write calls.
// Calling its Close method writes any buffered data to w.
func (f Form) Writer(w io.Writer) io.WriteCloser {
wr := &normWriter{rb: reorderBuffer{}, w: w}
wr.rb.init(f, nil)
return wr
}
type normReader struct {
rb reorderBuffer
r io.Reader
inbuf []byte
outbuf []byte
bufStart int
lastBoundary int
err error
}
// Read implements the standard read interface.
func (r *normReader) Read(p []byte) (int, error) {
for {
if r.lastBoundary-r.bufStart > 0 {
n := copy(p, r.outbuf[r.bufStart:r.lastBoundary])
r.bufStart += n
if r.lastBoundary-r.bufStart > 0 {
return n, nil
}
return n, r.err
}
if r.err != nil {
return 0, r.err
}
outn := copy(r.outbuf, r.outbuf[r.lastBoundary:])
r.outbuf = r.outbuf[0:outn]
r.bufStart = 0
n, err := r.r.Read(r.inbuf)
r.rb.src = inputBytes(r.inbuf[0:n])
r.rb.nsrc, r.err = n, err
if n > 0 {
r.outbuf = doAppend(&r.rb, r.outbuf, 0)
}
if err == io.EOF {
r.lastBoundary = len(r.outbuf)
} else {
r.lastBoundary = lastBoundary(&r.rb.f, r.outbuf)
if r.lastBoundary == -1 {
r.lastBoundary = 0
}
}
}
}
// Reader returns a new reader that implements Read
// by reading data from r and returning f(data).
func (f Form) Reader(r io.Reader) io.Reader {
const chunk = 4000
buf := make([]byte, chunk)
rr := &normReader{rb: reorderBuffer{}, r: r, inbuf: buf}
rr.rb.init(f, buf)
return rr
}

7657
vendor/golang.org/x/text/unicode/norm/tables10.0.0.go generated vendored Normal file

File diff suppressed because it is too large Load diff

7693
vendor/golang.org/x/text/unicode/norm/tables11.0.0.go generated vendored Normal file

File diff suppressed because it is too large Load diff

7637
vendor/golang.org/x/text/unicode/norm/tables9.0.0.go generated vendored Normal file

File diff suppressed because it is too large Load diff

88
vendor/golang.org/x/text/unicode/norm/transform.go generated vendored Normal file
View file

@ -0,0 +1,88 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import (
"unicode/utf8"
"golang.org/x/text/transform"
)
// Reset implements the Reset method of the transform.Transformer interface.
func (Form) Reset() {}
// Transform implements the Transform method of the transform.Transformer
// interface. It may need to write segments of up to MaxSegmentSize at once.
// Users should either catch ErrShortDst and allow dst to grow or have dst be at
// least of size MaxTransformChunkSize to be guaranteed of progress.
func (f Form) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
// Cap the maximum number of src bytes to check.
b := src
eof := atEOF
if ns := len(dst); ns < len(b) {
err = transform.ErrShortDst
eof = false
b = b[:ns]
}
i, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), eof)
n := copy(dst, b[:i])
if !ok {
nDst, nSrc, err = f.transform(dst[n:], src[n:], atEOF)
return nDst + n, nSrc + n, err
}
if err == nil && n < len(src) && !atEOF {
err = transform.ErrShortSrc
}
return n, n, err
}
func flushTransform(rb *reorderBuffer) bool {
// Write out (must fully fit in dst, or else it is an ErrShortDst).
if len(rb.out) < rb.nrune*utf8.UTFMax {
return false
}
rb.out = rb.out[rb.flushCopy(rb.out):]
return true
}
var errs = []error{nil, transform.ErrShortDst, transform.ErrShortSrc}
// transform implements the transform.Transformer interface. It is only called
// when quickSpan does not pass for a given string.
func (f Form) transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
// TODO: get rid of reorderBuffer. See CL 23460044.
rb := reorderBuffer{}
rb.init(f, src)
for {
// Load segment into reorder buffer.
rb.setFlusher(dst[nDst:], flushTransform)
end := decomposeSegment(&rb, nSrc, atEOF)
if end < 0 {
return nDst, nSrc, errs[-end]
}
nDst = len(dst) - len(rb.out)
nSrc = end
// Next quickSpan.
end = rb.nsrc
eof := atEOF
if n := nSrc + len(dst) - nDst; n < end {
err = transform.ErrShortDst
end = n
eof = false
}
end, ok := rb.f.quickSpan(rb.src, nSrc, end, eof)
n := copy(dst[nDst:], rb.src.bytes[nSrc:end])
nSrc += n
nDst += n
if ok {
if err == nil && n < rb.nsrc && !atEOF {
err = transform.ErrShortSrc
}
return nDst, nSrc, err
}
}
}

54
vendor/golang.org/x/text/unicode/norm/trie.go generated vendored Normal file
View file

@ -0,0 +1,54 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
type valueRange struct {
value uint16 // header: value:stride
lo, hi byte // header: lo:n
}
type sparseBlocks struct {
values []valueRange
offset []uint16
}
var nfcSparse = sparseBlocks{
values: nfcSparseValues[:],
offset: nfcSparseOffset[:],
}
var nfkcSparse = sparseBlocks{
values: nfkcSparseValues[:],
offset: nfkcSparseOffset[:],
}
var (
nfcData = newNfcTrie(0)
nfkcData = newNfkcTrie(0)
)
// lookupValue determines the type of block n and looks up the value for b.
// For n < t.cutoff, the block is a simple lookup table. Otherwise, the block
// is a list of ranges with an accompanying value. Given a matching range r,
// the value for b is by r.value + (b - r.lo) * stride.
func (t *sparseBlocks) lookup(n uint32, b byte) uint16 {
offset := t.offset[n]
header := t.values[offset]
lo := offset + 1
hi := lo + uint16(header.lo)
for lo < hi {
m := lo + (hi-lo)/2
r := t.values[m]
if r.lo <= b && b <= r.hi {
return r.value + uint16(b-r.lo)*header.value
}
if b < r.lo {
hi = m
} else {
lo = m + 1
}
}
return 0
}