unicode: make the tables smaller.

By splitting the ranges into 16-bit values and 32-bit values,
we can reduce about 3000 entries by 48 bits per entry, or about
16KB, at the cost of a little more complexity in the code.

R=iant, bradfitz, rsc, r
CC=golang-dev
https://golang.org/cl/4547066
This commit is contained in:
Rob Pike 2011-05-31 09:58:07 +10:00
parent 2c6a2a9773
commit 0de328edd6
4 changed files with 4481 additions and 3969 deletions

View file

@ -28,6 +28,7 @@ func main() {
printScriptOrProperty(false)
printScriptOrProperty(true)
printCases()
printSizes()
}
var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
@ -278,16 +279,16 @@ func loadChars() {
switch parseCategory(line[0 : len(line)-1]) {
case SNormal:
if first != 0 {
logger.Fatalf("bad state normal at U+%04X", lastChar)
logger.Fatalf("bad state normal at %U", lastChar)
}
case SFirst:
if first != 0 {
logger.Fatalf("bad state first at U+%04X", lastChar)
logger.Fatalf("bad state first at %U", lastChar)
}
first = lastChar
case SLast:
if first == 0 {
logger.Fatalf("bad state last at U+%04X", lastChar)
logger.Fatalf("bad state last at %U", lastChar)
}
for i := first + 1; i <= lastChar; i++ {
chars[i] = chars[first]
@ -299,6 +300,15 @@ func loadChars() {
resp.Body.Close()
}
// progHeader is the Printf format string for the preamble of the generated
// source file. It carries two %s verbs; printCategories fills them with
// *tablelist and *dataURL, in that order.
const progHeader = `// Generated by running
// maketables --tables=%s --data=%s
// DO NOT EDIT
package unicode
`
func printCategories() {
if *tablelist == "" {
return
@ -312,20 +322,14 @@ func printCategories() {
fullCategoryTest(list)
return
}
fmt.Printf(
"// Generated by running\n"+
"// maketables --tables=%s --data=%s\n"+
"// DO NOT EDIT\n\n"+
"package unicode\n\n",
*tablelist,
*dataURL)
fmt.Printf(progHeader, *tablelist, *dataURL)
fmt.Println("// Version is the Unicode edition from which the tables are derived.")
fmt.Printf("const Version = %q\n\n", version())
if *tablelist == "all" {
fmt.Println("// Categories is the set of Unicode data tables.")
fmt.Println("var Categories = map[string] []Range {")
fmt.Println("var Categories = map[string] *RangeTable {")
for k := range category {
fmt.Printf("\t%q: %s,\n", k, k)
}
@ -364,12 +368,12 @@ func printCategories() {
ndecl++
if name == "letter" { // special case
dumpRange(
"var letter = []Range {\n",
"var letter = &RangeTable{\n",
letterOp)
continue
}
dumpRange(
fmt.Sprintf("var _%s = []Range {\n", name),
fmt.Sprintf("var _%s = &RangeTable{\n", name),
func(code int) bool { return chars[code].category == name })
}
decl.Sort()
@ -382,12 +386,15 @@ func printCategories() {
// Op is a predicate over code points: it reports whether the code point
// code belongs to the set being dumped or verified.
type Op func(code int) bool
const format = "\t{0x%04x, 0x%04x, %d},\n"
const format = "\t\t{0x%04x, 0x%04x, %d},\n"
func dumpRange(header string, inCategory Op) {
fmt.Print(header)
next := 0
fmt.Print("\tR16: []Range16{\n")
// one Range for each iteration
count := &range16Count
size := 16
for {
// look for start of range
for next < len(chars) && !inCategory(next) {
@ -427,10 +434,18 @@ func dumpRange(header string, inCategory Op) {
break
}
}
if size == 16 && (lo >= 1<<16 || hi >= 1<<16) {
fmt.Print("\t},\n")
fmt.Print("\tR32: []Range32{\n")
size = 32
count = &range32Count
}
fmt.Printf(format, lo, hi, stride)
*count++
// next range: start looking where this range ends
next = hi + 1
}
fmt.Print("\t},\n")
fmt.Print("}\n\n")
}
@ -454,12 +469,12 @@ func fullCategoryTest(list []string) {
}
}
func verifyRange(name string, inCategory Op, table []unicode.Range) {
func verifyRange(name string, inCategory Op, table *unicode.RangeTable) {
for i := range chars {
web := inCategory(i)
pkg := unicode.Is(table, i)
if web != pkg {
fmt.Fprintf(os.Stderr, "%s: U+%04X: web=%t pkg=%t\n", name, i, web, pkg)
fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
}
}
}
@ -497,22 +512,22 @@ func parseScript(line string, scripts map[string][]Script) {
}
// The script tables have a lot of adjacent elements. Fold them together.
func foldAdjacent(r []Script) []unicode.Range {
s := make([]unicode.Range, 0, len(r))
func foldAdjacent(r []Script) []unicode.Range32 {
s := make([]unicode.Range32, 0, len(r))
j := 0
for i := 0; i < len(r); i++ {
if j > 0 && int(r[i].lo) == s[j-1].Hi+1 {
s[j-1].Hi = int(r[i].hi)
if j > 0 && r[i].lo == s[j-1].Hi+1 {
s[j-1].Hi = r[i].hi
} else {
s = s[0 : j+1]
s[j] = unicode.Range{int(r[i].lo), int(r[i].hi), 1}
s[j] = unicode.Range32{uint32(r[i].lo), uint32(r[i].hi), 1}
j++
}
}
return s
}
func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts map[string][]Script) {
func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
for _, name := range list {
if _, ok := scripts[name]; !ok {
logger.Fatal("unknown script", name)
@ -524,7 +539,7 @@ func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts
for _, script := range scripts[name] {
for r := script.lo; r <= script.hi; r++ {
if !unicode.Is(installed[name], int(r)) {
fmt.Fprintf(os.Stderr, "U+%04X: not in script %s\n", r, name)
fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
}
}
}
@ -589,10 +604,10 @@ func printScriptOrProperty(doProps bool) {
if flaglist == "all" {
if doProps {
fmt.Println("// Properties is the set of Unicode property tables.")
fmt.Println("var Properties = map[string] []Range {")
fmt.Println("var Properties = map[string] *RangeTable{")
} else {
fmt.Println("// Scripts is the set of Unicode script tables.")
fmt.Println("var Scripts = map[string] []Range {")
fmt.Println("var Scripts = map[string] *RangeTable{")
}
for k := range table {
fmt.Printf("\t%q: %s,\n", k, k)
@ -613,11 +628,22 @@ func printScriptOrProperty(doProps bool) {
name, name, name, name)
}
ndecl++
fmt.Printf("var _%s = []Range {\n", name)
fmt.Printf("var _%s = &RangeTable {\n", name)
fmt.Print("\tR16: []Range16{\n")
ranges := foldAdjacent(table[name])
size := 16
count := &range16Count
for _, s := range ranges {
if size == 16 && (s.Lo >= 1<<16 || s.Hi >= 1<<16) {
fmt.Print("\t},\n")
fmt.Print("\tR32: []Range32{\n")
size = 32
count = &range32Count
}
*count++
fmt.Printf(format, s.Lo, s.Hi, s.Stride)
}
fmt.Print("\t},\n")
fmt.Print("}\n\n")
}
decl.Sort()
@ -808,7 +834,7 @@ func printCaseRange(lo, hi *caseState) {
fmt.Printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
lo.point, hi.point)
case hi.point > lo.point && lo.isLowerUpper():
logger.Fatalf("LowerUpper sequence: should not happen: U+%04X. If it's real, need to fix To()", lo.point)
logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point)
fmt.Printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
lo.point, hi.point)
default:
@ -831,17 +857,28 @@ func fullCaseTest() {
lower := unicode.ToLower(i)
want := caseIt(i, c.lowerCase)
if lower != want {
fmt.Fprintf(os.Stderr, "lower U+%04X should be U+%04X is U+%04X\n", i, want, lower)
fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
}
upper := unicode.ToUpper(i)
want = caseIt(i, c.upperCase)
if upper != want {
fmt.Fprintf(os.Stderr, "upper U+%04X should be U+%04X is U+%04X\n", i, want, upper)
fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
}
title := unicode.ToTitle(i)
want = caseIt(i, c.titleCase)
if title != want {
fmt.Fprintf(os.Stderr, "title U+%04X should be U+%04X is U+%04X\n", i, want, title)
fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
}
}
}
// Running totals of the range entries emitted so far, split by element
// width. The table-dumping code increments them as it prints each entry;
// printSizes reports them.
var (
	range16Count int // Number of entries in the 16-bit range tables.
	range32Count int // Number of entries in the 32-bit range tables.
)

// printSizes writes a blank line followed by two comment lines to standard
// output: the number of 16-bit and 32-bit range entries emitted, and the
// number of bytes those entries occupy.
func printSizes() {
	const (
		fieldsPerRange = 3                  // each entry is printed as {lo, hi, stride}
		bytesPerRange16 = fieldsPerRange * 2 // three 16-bit fields
		bytesPerRange32 = fieldsPerRange * 4 // three 32-bit fields
	)

	totalEntries := range16Count + range32Count
	size16 := range16Count * bytesPerRange16
	size32 := range32Count * bytesPerRange32
	totalSize := size16 + size32

	fmt.Println()
	fmt.Printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, totalEntries)
	fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", size16, size32, totalSize)
}