mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
unicode: make the tables smaller.
By splitting the ranges into 16-bit values and 32-bit values, we can shrink each of about 3000 entries by 48 bits, saving about 16KB, at the cost of a little more complexity in the code.

R=iant, bradfitz, rsc, r
CC=golang-dev
https://golang.org/cl/4547066
This commit is contained in:
parent
2c6a2a9773
commit
0de328edd6
4 changed files with 4481 additions and 3969 deletions
|
|
@ -28,6 +28,7 @@ func main() {
|
|||
printScriptOrProperty(false)
|
||||
printScriptOrProperty(true)
|
||||
printCases()
|
||||
printSizes()
|
||||
}
|
||||
|
||||
var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
|
||||
|
|
@ -278,16 +279,16 @@ func loadChars() {
|
|||
switch parseCategory(line[0 : len(line)-1]) {
|
||||
case SNormal:
|
||||
if first != 0 {
|
||||
logger.Fatalf("bad state normal at U+%04X", lastChar)
|
||||
logger.Fatalf("bad state normal at %U", lastChar)
|
||||
}
|
||||
case SFirst:
|
||||
if first != 0 {
|
||||
logger.Fatalf("bad state first at U+%04X", lastChar)
|
||||
logger.Fatalf("bad state first at %U", lastChar)
|
||||
}
|
||||
first = lastChar
|
||||
case SLast:
|
||||
if first == 0 {
|
||||
logger.Fatalf("bad state last at U+%04X", lastChar)
|
||||
logger.Fatalf("bad state last at %U", lastChar)
|
||||
}
|
||||
for i := first + 1; i <= lastChar; i++ {
|
||||
chars[i] = chars[first]
|
||||
|
|
@ -299,6 +300,15 @@ func loadChars() {
|
|||
resp.Body.Close()
|
||||
}
|
||||
|
||||
const progHeader = `// Generated by running
|
||||
// maketables --tables=%s --data=%s
|
||||
// DO NOT EDIT
|
||||
|
||||
package unicode
|
||||
|
||||
`
|
||||
|
||||
|
||||
func printCategories() {
|
||||
if *tablelist == "" {
|
||||
return
|
||||
|
|
@ -312,20 +322,14 @@ func printCategories() {
|
|||
fullCategoryTest(list)
|
||||
return
|
||||
}
|
||||
fmt.Printf(
|
||||
"// Generated by running\n"+
|
||||
"// maketables --tables=%s --data=%s\n"+
|
||||
"// DO NOT EDIT\n\n"+
|
||||
"package unicode\n\n",
|
||||
*tablelist,
|
||||
*dataURL)
|
||||
fmt.Printf(progHeader, *tablelist, *dataURL)
|
||||
|
||||
fmt.Println("// Version is the Unicode edition from which the tables are derived.")
|
||||
fmt.Printf("const Version = %q\n\n", version())
|
||||
|
||||
if *tablelist == "all" {
|
||||
fmt.Println("// Categories is the set of Unicode data tables.")
|
||||
fmt.Println("var Categories = map[string] []Range {")
|
||||
fmt.Println("var Categories = map[string] *RangeTable {")
|
||||
for k := range category {
|
||||
fmt.Printf("\t%q: %s,\n", k, k)
|
||||
}
|
||||
|
|
@ -364,12 +368,12 @@ func printCategories() {
|
|||
ndecl++
|
||||
if name == "letter" { // special case
|
||||
dumpRange(
|
||||
"var letter = []Range {\n",
|
||||
"var letter = &RangeTable{\n",
|
||||
letterOp)
|
||||
continue
|
||||
}
|
||||
dumpRange(
|
||||
fmt.Sprintf("var _%s = []Range {\n", name),
|
||||
fmt.Sprintf("var _%s = &RangeTable{\n", name),
|
||||
func(code int) bool { return chars[code].category == name })
|
||||
}
|
||||
decl.Sort()
|
||||
|
|
@ -382,12 +386,15 @@ func printCategories() {
|
|||
|
||||
type Op func(code int) bool
|
||||
|
||||
const format = "\t{0x%04x, 0x%04x, %d},\n"
|
||||
const format = "\t\t{0x%04x, 0x%04x, %d},\n"
|
||||
|
||||
func dumpRange(header string, inCategory Op) {
|
||||
fmt.Print(header)
|
||||
next := 0
|
||||
fmt.Print("\tR16: []Range16{\n")
|
||||
// one Range for each iteration
|
||||
count := &range16Count
|
||||
size := 16
|
||||
for {
|
||||
// look for start of range
|
||||
for next < len(chars) && !inCategory(next) {
|
||||
|
|
@ -427,10 +434,18 @@ func dumpRange(header string, inCategory Op) {
|
|||
break
|
||||
}
|
||||
}
|
||||
if size == 16 && (lo >= 1<<16 || hi >= 1<<16) {
|
||||
fmt.Print("\t},\n")
|
||||
fmt.Print("\tR32: []Range32{\n")
|
||||
size = 32
|
||||
count = &range32Count
|
||||
}
|
||||
fmt.Printf(format, lo, hi, stride)
|
||||
*count++
|
||||
// next range: start looking where this range ends
|
||||
next = hi + 1
|
||||
}
|
||||
fmt.Print("\t},\n")
|
||||
fmt.Print("}\n\n")
|
||||
}
|
||||
|
||||
|
|
@ -454,12 +469,12 @@ func fullCategoryTest(list []string) {
|
|||
}
|
||||
}
|
||||
|
||||
func verifyRange(name string, inCategory Op, table []unicode.Range) {
|
||||
func verifyRange(name string, inCategory Op, table *unicode.RangeTable) {
|
||||
for i := range chars {
|
||||
web := inCategory(i)
|
||||
pkg := unicode.Is(table, i)
|
||||
if web != pkg {
|
||||
fmt.Fprintf(os.Stderr, "%s: U+%04X: web=%t pkg=%t\n", name, i, web, pkg)
|
||||
fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -497,22 +512,22 @@ func parseScript(line string, scripts map[string][]Script) {
|
|||
}
|
||||
|
||||
// The script tables have a lot of adjacent elements. Fold them together.
|
||||
func foldAdjacent(r []Script) []unicode.Range {
|
||||
s := make([]unicode.Range, 0, len(r))
|
||||
func foldAdjacent(r []Script) []unicode.Range32 {
|
||||
s := make([]unicode.Range32, 0, len(r))
|
||||
j := 0
|
||||
for i := 0; i < len(r); i++ {
|
||||
if j > 0 && int(r[i].lo) == s[j-1].Hi+1 {
|
||||
s[j-1].Hi = int(r[i].hi)
|
||||
if j > 0 && r[i].lo == s[j-1].Hi+1 {
|
||||
s[j-1].Hi = r[i].hi
|
||||
} else {
|
||||
s = s[0 : j+1]
|
||||
s[j] = unicode.Range{int(r[i].lo), int(r[i].hi), 1}
|
||||
s[j] = unicode.Range32{uint32(r[i].lo), uint32(r[i].hi), 1}
|
||||
j++
|
||||
}
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts map[string][]Script) {
|
||||
func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
|
||||
for _, name := range list {
|
||||
if _, ok := scripts[name]; !ok {
|
||||
logger.Fatal("unknown script", name)
|
||||
|
|
@ -524,7 +539,7 @@ func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts
|
|||
for _, script := range scripts[name] {
|
||||
for r := script.lo; r <= script.hi; r++ {
|
||||
if !unicode.Is(installed[name], int(r)) {
|
||||
fmt.Fprintf(os.Stderr, "U+%04X: not in script %s\n", r, name)
|
||||
fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -589,10 +604,10 @@ func printScriptOrProperty(doProps bool) {
|
|||
if flaglist == "all" {
|
||||
if doProps {
|
||||
fmt.Println("// Properties is the set of Unicode property tables.")
|
||||
fmt.Println("var Properties = map[string] []Range {")
|
||||
fmt.Println("var Properties = map[string] *RangeTable{")
|
||||
} else {
|
||||
fmt.Println("// Scripts is the set of Unicode script tables.")
|
||||
fmt.Println("var Scripts = map[string] []Range {")
|
||||
fmt.Println("var Scripts = map[string] *RangeTable{")
|
||||
}
|
||||
for k := range table {
|
||||
fmt.Printf("\t%q: %s,\n", k, k)
|
||||
|
|
@ -613,11 +628,22 @@ func printScriptOrProperty(doProps bool) {
|
|||
name, name, name, name)
|
||||
}
|
||||
ndecl++
|
||||
fmt.Printf("var _%s = []Range {\n", name)
|
||||
fmt.Printf("var _%s = &RangeTable {\n", name)
|
||||
fmt.Print("\tR16: []Range16{\n")
|
||||
ranges := foldAdjacent(table[name])
|
||||
size := 16
|
||||
count := &range16Count
|
||||
for _, s := range ranges {
|
||||
if size == 16 && (s.Lo >= 1<<16 || s.Hi >= 1<<16) {
|
||||
fmt.Print("\t},\n")
|
||||
fmt.Print("\tR32: []Range32{\n")
|
||||
size = 32
|
||||
count = &range32Count
|
||||
}
|
||||
*count++
|
||||
fmt.Printf(format, s.Lo, s.Hi, s.Stride)
|
||||
}
|
||||
fmt.Print("\t},\n")
|
||||
fmt.Print("}\n\n")
|
||||
}
|
||||
decl.Sort()
|
||||
|
|
@ -808,7 +834,7 @@ func printCaseRange(lo, hi *caseState) {
|
|||
fmt.Printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
|
||||
lo.point, hi.point)
|
||||
case hi.point > lo.point && lo.isLowerUpper():
|
||||
logger.Fatalf("LowerUpper sequence: should not happen: U+%04X. If it's real, need to fix To()", lo.point)
|
||||
logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point)
|
||||
fmt.Printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
|
||||
lo.point, hi.point)
|
||||
default:
|
||||
|
|
@ -831,17 +857,28 @@ func fullCaseTest() {
|
|||
lower := unicode.ToLower(i)
|
||||
want := caseIt(i, c.lowerCase)
|
||||
if lower != want {
|
||||
fmt.Fprintf(os.Stderr, "lower U+%04X should be U+%04X is U+%04X\n", i, want, lower)
|
||||
fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
|
||||
}
|
||||
upper := unicode.ToUpper(i)
|
||||
want = caseIt(i, c.upperCase)
|
||||
if upper != want {
|
||||
fmt.Fprintf(os.Stderr, "upper U+%04X should be U+%04X is U+%04X\n", i, want, upper)
|
||||
fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
|
||||
}
|
||||
title := unicode.ToTitle(i)
|
||||
want = caseIt(i, c.titleCase)
|
||||
if title != want {
|
||||
fmt.Fprintf(os.Stderr, "title U+%04X should be U+%04X is U+%04X\n", i, want, title)
|
||||
fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var range16Count = 0 // Number of entries in the 16-bit range tables.
|
||||
var range32Count = 0 // Number of entries in the 32-bit range tables.
|
||||
|
||||
// printSizes writes a summary of the generated range tables to standard
// output: a blank line, then two "//"-prefixed lines so the summary reads as
// Go comments. The first line reports range16Count, range32Count and their
// sum; the second reports the corresponding byte totals.
func printSizes() {
|
||||
fmt.Println()
|
||||
fmt.Printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count)
|
||||
// Each entry is costed at 3*2 bytes (16-bit) or 3*4 bytes (32-bit);
// presumably three fields of 2 or 4 bytes each — confirm against the
// Range16/Range32 definitions.
range16Bytes := range16Count * 3 * 2
|
||||
range32Bytes := range32Count * 3 * 4
|
||||
fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue