mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
unicode: add case folding tables
R=r, r CC=golang-dev https://golang.org/cl/4571074
This commit is contained in:
parent
6e9b1a78ff
commit
fc77e82617
4 changed files with 919 additions and 107 deletions
|
|
@ -24,15 +24,18 @@ import (
|
|||
func main() {
|
||||
flag.Parse()
|
||||
loadChars() // always needed
|
||||
loadCasefold()
|
||||
printCategories()
|
||||
printScriptOrProperty(false)
|
||||
printScriptOrProperty(true)
|
||||
printCases()
|
||||
printLatinProperties()
|
||||
printCasefold()
|
||||
printSizes()
|
||||
}
|
||||
|
||||
// Flags selecting where the Unicode data files are fetched from.
var (
	// dataURL is the location of UnicodeData.txt; empty means derive it from url.
	dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")

	// casefoldingURL is the location of CaseFolding.txt; empty means derive it from url.
	casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")

	// url is the directory used to build the defaults of the per-file flags.
	url = flag.String("url", "http://www.unicode.org/Public/6.0.0/ucd/", "URL of Unicode database directory")
)
|
||||
|
|
@ -119,6 +122,8 @@ type Char struct {
|
|||
upperCase int
|
||||
lowerCase int
|
||||
titleCase int
|
||||
foldCase int // simple case folding
|
||||
caseOrbit int // next in simple case folding orbit
|
||||
}
|
||||
|
||||
// Scripts.txt has form:
|
||||
|
|
@ -308,8 +313,53 @@ func loadChars() {
|
|||
resp.Body.Close()
|
||||
}
|
||||
|
||||
func loadCasefold() {
|
||||
if *casefoldingURL == "" {
|
||||
flag.Set("casefolding", *url+"CaseFolding.txt")
|
||||
}
|
||||
resp, err := http.Get(*casefoldingURL)
|
||||
if err != nil {
|
||||
logger.Fatal(err)
|
||||
}
|
||||
if resp.StatusCode != 200 {
|
||||
logger.Fatal("bad GET status for CaseFolding.txt", resp.Status)
|
||||
}
|
||||
input := bufio.NewReader(resp.Body)
|
||||
for {
|
||||
line, err := input.ReadString('\n')
|
||||
if err != nil {
|
||||
if err == os.EOF {
|
||||
break
|
||||
}
|
||||
logger.Fatal(err)
|
||||
}
|
||||
if line[0] == '#' {
|
||||
continue
|
||||
}
|
||||
field := strings.Split(line, "; ", -1)
|
||||
if len(field) != 4 {
|
||||
logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4)
|
||||
}
|
||||
kind := field[1]
|
||||
if kind != "C" && kind != "S" {
|
||||
// Only care about 'common' and 'simple' foldings.
|
||||
continue
|
||||
}
|
||||
p1, err := strconv.Btoui64(field[0], 16)
|
||||
if err != nil {
|
||||
logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
|
||||
}
|
||||
p2, err := strconv.Btoui64(field[2], 16)
|
||||
if err != nil {
|
||||
logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
|
||||
}
|
||||
chars[p1].foldCase = int(p2)
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
const progHeader = `// Generated by running
|
||||
// maketables --tables=%s --data=%s
|
||||
// maketables --tables=%s --data=%s --casefolding=%s
|
||||
// DO NOT EDIT
|
||||
|
||||
package unicode
|
||||
|
|
@ -330,7 +380,7 @@ func printCategories() {
|
|||
fullCategoryTest(list)
|
||||
return
|
||||
}
|
||||
fmt.Printf(progHeader, *tablelist, *dataURL)
|
||||
fmt.Printf(progHeader, *tablelist, *dataURL, *casefoldingURL)
|
||||
|
||||
fmt.Println("// Version is the Unicode edition from which the tables are derived.")
|
||||
fmt.Printf("const Version = %q\n\n", version())
|
||||
|
|
@ -837,13 +887,13 @@ func printCases() {
|
|||
}
|
||||
fmt.Printf(
|
||||
"// Generated by running\n"+
|
||||
"// maketables --data=%s\n"+
|
||||
"// maketables --data=%s --casefolding=%s\n"+
|
||||
"// DO NOT EDIT\n\n"+
|
||||
"// CaseRanges is the table describing case mappings for all letters with\n"+
|
||||
"// non-self mappings.\n"+
|
||||
"var CaseRanges = _CaseRanges\n"+
|
||||
"var _CaseRanges = []CaseRange {\n",
|
||||
*dataURL)
|
||||
*dataURL, *casefoldingURL)
|
||||
|
||||
var startState *caseState // the start of a run; nil for not active
|
||||
var prevState = &caseState{} // the state of the previous character
|
||||
|
|
@ -946,13 +996,246 @@ func printLatinProperties() {
|
|||
if code == ' ' {
|
||||
property = "pZ | pp"
|
||||
}
|
||||
fmt.Printf("\t0x%.2X: %s, // %q\n", code, property, code)
|
||||
fmt.Printf("\t0x%02X: %s, // %q\n", code, property, code)
|
||||
}
|
||||
fmt.Println("}")
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
|
||||
var range16Count = 0 // Number of entries in the 16-bit range tables.
|
||||
var range32Count = 0 // Number of entries in the 32-bit range tables.
|
||||
func printCasefold() {
|
||||
// Build list of case-folding groups attached to each canonical folded char (typically lower case).
|
||||
var caseOrbit = make([][]int, MaxChar+1)
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
if c.foldCase == 0 {
|
||||
continue
|
||||
}
|
||||
orb := caseOrbit[c.foldCase]
|
||||
if orb == nil {
|
||||
orb = append(orb, c.foldCase)
|
||||
}
|
||||
caseOrbit[c.foldCase] = append(orb, i)
|
||||
}
|
||||
|
||||
// Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
f := c.foldCase
|
||||
if f == 0 {
|
||||
f = i
|
||||
}
|
||||
orb := caseOrbit[f]
|
||||
if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
|
||||
// Default assumption of [upper, lower] is wrong.
|
||||
caseOrbit[i] = []int{i}
|
||||
}
|
||||
}
|
||||
|
||||
// Delete the groups for which assuming [lower, upper] is right.
|
||||
for i, orb := range caseOrbit {
|
||||
if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
|
||||
caseOrbit[i] = nil
|
||||
}
|
||||
}
|
||||
|
||||
// Record orbit information in chars.
|
||||
for _, orb := range caseOrbit {
|
||||
if orb == nil {
|
||||
continue
|
||||
}
|
||||
sort.SortInts(orb)
|
||||
c := orb[len(orb)-1]
|
||||
for _, d := range orb {
|
||||
chars[c].caseOrbit = d
|
||||
c = d
|
||||
}
|
||||
}
|
||||
|
||||
printCaseOrbit()
|
||||
|
||||
// Tables of category and script folding exceptions: code points
|
||||
// that must be added when interpreting a particular category/script
|
||||
// in a case-folding context.
|
||||
cat := make(map[string]map[int]bool)
|
||||
for name := range category {
|
||||
if x := foldExceptions(inCategory(name)); len(x) > 0 {
|
||||
cat[name] = x
|
||||
}
|
||||
}
|
||||
|
||||
scr := make(map[string]map[int]bool)
|
||||
for name := range scripts {
|
||||
if x := foldExceptions(inScript(name)); len(x) > 0 {
|
||||
cat[name] = x
|
||||
}
|
||||
}
|
||||
|
||||
printCatFold("FoldCategory", cat)
|
||||
printCatFold("FoldScript", scr)
|
||||
}
|
||||
|
||||
// inCategory returns a list of all the runes in the category.
|
||||
func inCategory(name string) []int {
|
||||
var x []int
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
|
||||
x = append(x, i)
|
||||
}
|
||||
}
|
||||
return x
|
||||
}
|
||||
|
||||
// inScript returns a list of all the runes in the script.
|
||||
func inScript(name string) []int {
|
||||
var x []int
|
||||
for _, s := range scripts[name] {
|
||||
for c := s.lo; c <= s.hi; c++ {
|
||||
x = append(x, int(c))
|
||||
}
|
||||
}
|
||||
return x
|
||||
}
|
||||
|
||||
// foldExceptions returns a list of all the runes fold-equivalent
|
||||
// to runes in class but not in class themselves.
|
||||
func foldExceptions(class []int) map[int]bool {
|
||||
// Create map containing class and all fold-equivalent chars.
|
||||
m := make(map[int]bool)
|
||||
for _, r := range class {
|
||||
c := &chars[r]
|
||||
if c.caseOrbit == 0 {
|
||||
// Just upper and lower.
|
||||
if u := c.upperCase; u != 0 {
|
||||
m[u] = true
|
||||
}
|
||||
if l := c.lowerCase; l != 0 {
|
||||
m[l] = true
|
||||
}
|
||||
m[r] = true
|
||||
continue
|
||||
}
|
||||
// Otherwise walk orbit.
|
||||
r0 := r
|
||||
for {
|
||||
m[r] = true
|
||||
r = chars[r].caseOrbit
|
||||
if r == r0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remove class itself.
|
||||
for _, r := range class {
|
||||
m[r] = false, false
|
||||
}
|
||||
|
||||
// What's left is the exceptions.
|
||||
return m
|
||||
}
|
||||
|
||||
// comment holds the doc comment emitted above each generated fold table,
// keyed by the table's variable name. Every value ends with a newline.
var comment = map[string]string{
	"FoldCategory": `// FoldCategory maps a category name to a table of
// code points outside the category that are equivalent under
// simple case folding to code points inside the category.
// If there is no entry for a category name, there are no such points.
`,

	"FoldScript": `// FoldScript maps a script name to a table of
// code points outside the script that are equivalent under
// simple case folding to code points inside the script.
// If there is no entry for a script name, there are no such points.
`,
}
|
||||
|
||||
func printCaseOrbit() {
|
||||
if *test {
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
f := c.caseOrbit
|
||||
if f == 0 {
|
||||
if c.lowerCase != i && c.lowerCase != 0 {
|
||||
f = c.lowerCase
|
||||
} else if c.upperCase != i && c.upperCase != 0 {
|
||||
f = c.upperCase
|
||||
} else {
|
||||
f = i
|
||||
}
|
||||
}
|
||||
if g := unicode.SimpleFold(i); g != f {
|
||||
fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
fmt.Printf("var caseOrbit = []foldPair{\n")
|
||||
for i := range chars {
|
||||
c := &chars[i]
|
||||
if c.caseOrbit != 0 {
|
||||
fmt.Printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
|
||||
foldPairCount++
|
||||
}
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
}
|
||||
|
||||
func printCatFold(name string, m map[string]map[int]bool) {
|
||||
if *test {
|
||||
var pkgMap map[string]*unicode.RangeTable
|
||||
if name == "FoldCategory" {
|
||||
pkgMap = unicode.FoldCategory
|
||||
} else {
|
||||
pkgMap = unicode.FoldScript
|
||||
}
|
||||
if len(pkgMap) != len(m) {
|
||||
fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
|
||||
return
|
||||
}
|
||||
for k, v := range m {
|
||||
t, ok := pkgMap[k]
|
||||
if !ok {
|
||||
fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
|
||||
continue
|
||||
}
|
||||
n := 0
|
||||
for _, r := range t.R16 {
|
||||
for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) {
|
||||
if !v[c] {
|
||||
fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
|
||||
}
|
||||
n++
|
||||
}
|
||||
}
|
||||
for _, r := range t.R32 {
|
||||
for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) {
|
||||
if !v[c] {
|
||||
fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
|
||||
}
|
||||
n++
|
||||
}
|
||||
}
|
||||
if n != len(v) {
|
||||
fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
fmt.Print(comment[name])
|
||||
fmt.Printf("var %s = map[string]*RangeTable{\n", name)
|
||||
for name := range m {
|
||||
fmt.Printf("\t%q: fold%s,\n", name, name)
|
||||
}
|
||||
fmt.Printf("}\n\n")
|
||||
for name, class := range m {
|
||||
dumpRange(
|
||||
fmt.Sprintf("var fold%s = &RangeTable{\n", name),
|
||||
func(code int) bool { return class[code] })
|
||||
}
|
||||
}
|
||||
|
||||
// Counters accumulated while tables are printed and reported by printSizes.
var (
	range16Count  int // Number of entries in the 16-bit range tables.
	range32Count  int // Number of entries in the 32-bit range tables.
	foldPairCount int // Number of fold pairs in the exception tables.
)
|
||||
|
||||
func printSizes() {
|
||||
if *test {
|
||||
|
|
@ -963,4 +1246,6 @@ func printSizes() {
|
|||
range16Bytes := range16Count * 3 * 2
|
||||
range32Bytes := range32Count * 3 * 4
|
||||
fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
|
||||
fmt.Println()
|
||||
fmt.Printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue