mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
first cut at case mapping tables and library.
next cut will do the optimization for alternating sequences.

R=rsc
DELTA=1658 (1620 added, 9 deleted, 29 changed)
OCL=34072
CL=34075
This commit is contained in:
parent
30dcb13420
commit
22c2b476a8
5 changed files with 1646 additions and 35 deletions
|
|
@ -21,6 +21,14 @@ import (
|
|||
"unicode";
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse();
|
||||
loadChars(); // always needed
|
||||
printCategories();
|
||||
printScripts();
|
||||
printCases();
|
||||
}
|
||||
|
||||
var dataUrl = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt");
|
||||
var url = flag.String("url",
|
||||
"http://www.unicode.org/Public/5.1.0/ucd/",
|
||||
|
|
@ -31,6 +39,9 @@ var tablelist = flag.String("tables",
|
|||
// scriptlist is the --scripts flag; it defaults to "all".
var scriptlist = flag.String("scripts",
	"all",
	"comma-separated list of which script tables to generate")

// cases is the --cases flag; it defaults to true.
var cases = flag.Bool("cases",
	true,
	"generate case tables")

// test is the --test flag; it defaults to false.
var test = flag.Bool("test",
	false,
	"test existing tables; can be used to compare web data with package data")
|
||||
|
|
@ -44,7 +55,7 @@ var category = map[string] bool{ "letter":true } // Nd Lu etc. letter is a speci
|
|||
// 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
|
||||
// 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
|
||||
// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation
|
||||
// The fields
|
||||
// The fields:
|
||||
const (
|
||||
FCodePoint = iota;
|
||||
FName;
|
||||
|
|
@ -87,11 +98,11 @@ var fieldName = []string{
|
|||
// This contains only the properties we're interested in.
|
||||
type Char struct {
	field     []string // debugging only; could be deleted if we take out char.dump()
	codePoint uint32   // if zero, this index is not a valid code point.
	category  string
	// The case mappings are plain ints so that they can be subtracted from
	// an int table index without conversion.
	upperCase int
	lowerCase int
	titleCase int
}
|
||||
|
||||
// Scripts.txt has form:
|
||||
|
|
@ -104,26 +115,21 @@ type Script struct {
|
|||
script string;
|
||||
}
|
||||
|
||||
func main() {
|
||||
flag.Parse();
|
||||
printCategories();
|
||||
printScripts();
|
||||
}
|
||||
|
||||
var chars = make([]Char, MaxChar)
|
||||
var chars = make([]Char, MaxChar+1)
|
||||
var scripts = make(map[string] []Script)
|
||||
|
||||
var lastChar uint32 = 0;
|
||||
|
||||
// In UnicodeData.txt, some ranges are marked like this:
|
||||
// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
|
||||
// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
|
||||
// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
|
||||
// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
|
||||
// parseCategory returns a state variable indicating the weirdness.
|
||||
// State is the type of the value parseCategory returns for each line.
type State int

const (
	SNormal State = iota // known to be zero for the type
	SFirst               // presumably a "<..., First>" range marker line; confirm in parseCategory
	SLast                // presumably a "<..., Last>" range marker line; confirm in parseCategory
	SMissing
)
|
||||
|
||||
func parseCategory(line string) (state State) {
|
||||
|
|
@ -139,7 +145,7 @@ func parseCategory(line string) (state State) {
|
|||
if point == 0 {
|
||||
return // not interesting and we use 0 as unset
|
||||
}
|
||||
if point >= MaxChar {
|
||||
if point > MaxChar {
|
||||
return;
|
||||
}
|
||||
char := &chars[point];
|
||||
|
|
@ -189,7 +195,7 @@ func (char *Char) letter(u, l, t string) {
|
|||
char.titleCase = char.letterValue(t, "T");
|
||||
}
|
||||
|
||||
func (char *Char) letterValue(s string, cas string) uint32 {
|
||||
func (char *Char) letterValue(s string, cas string) int {
|
||||
if s == "" {
|
||||
return 0
|
||||
}
|
||||
|
|
@ -198,7 +204,7 @@ func (char *Char) letterValue(s string, cas string) uint32 {
|
|||
char.dump(cas);
|
||||
die.Logf("U+%04x: bad letter(%s): %s", char.codePoint, s, err)
|
||||
}
|
||||
return uint32(v)
|
||||
return int(v)
|
||||
}
|
||||
|
||||
func allCategories() []string {
|
||||
|
|
@ -242,10 +248,7 @@ func letterOp(code int) bool {
|
|||
return false
|
||||
}
|
||||
|
||||
func printCategories() {
|
||||
if *tablelist == "" {
|
||||
return
|
||||
}
|
||||
func loadChars() {
|
||||
if *dataUrl == "" {
|
||||
flag.Set("data", *url + "UnicodeData.txt");
|
||||
}
|
||||
|
|
@ -288,6 +291,12 @@ func printCategories() {
|
|||
}
|
||||
}
|
||||
resp.Body.Close();
|
||||
}
|
||||
|
||||
func printCategories() {
|
||||
if *tablelist == "" {
|
||||
return
|
||||
}
|
||||
// Find out which categories to dump
|
||||
list := strings.Split(*tablelist, ",", 0);
|
||||
if *tablelist == "all" {
|
||||
|
|
@ -299,11 +308,11 @@ func printCategories() {
|
|||
}
|
||||
fmt.Printf(
|
||||
"// Generated by running\n"
|
||||
"// maketables --tables=%s --url=%s\n"
|
||||
"// maketables --tables=%s --data=%s\n"
|
||||
"// DO NOT EDIT\n\n"
|
||||
"package unicode\n\n",
|
||||
*tablelist,
|
||||
*url
|
||||
*dataUrl
|
||||
);
|
||||
|
||||
fmt.Println("// Version is the Unicode edition from which the tables are derived.");
|
||||
|
|
@ -496,6 +505,9 @@ func parseScript(line string) {
|
|||
}
|
||||
|
||||
func printScripts() {
|
||||
if *scriptlist == "" {
|
||||
return
|
||||
}
|
||||
var err os.Error;
|
||||
scriptRe, err = regexp.Compile(`([0-9A-F]+)(\.\.[0-9A-F]+)? +; ([A-Za-z_]+)`);
|
||||
if err != nil {
|
||||
|
|
@ -604,3 +616,148 @@ func fullScriptTest(list []string) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Case classifications stored in caseState._case.
const (
	CaseUpper = 1 << iota
	CaseLower
	CaseTitle
	CaseNone    = 0  // must be zero
	CaseMissing = -1 // character not present; not a valid case state
)

// caseState describes one code point: its case classification and the
// offsets from the code point to its upper, lower and title forms.
type caseState struct {
	point        int
	_case        int // one of the Case* constants ("case" itself is a keyword)
	deltaToUpper int
	deltaToLower int
	deltaToTitle int
}

// Is d a continuation of the state of c?
//
// The two states may be passed in either order. They are adjacent only when
// their points are consecutive, they share the same case, that case is neither
// CaseNone nor CaseMissing, and all three deltas are equal.
func (c *caseState) adjacent(d *caseState) bool {
	// Order the pair so that c is the lower code point.
	if d.point < c.point {
		c, d = d, c
	}
	switch {
	case d.point != c.point+1:
		return false
	case d._case != c._case:
		return false
	case c._case == CaseNone:
		return false
	case c._case == CaseMissing:
		return false
	case d.deltaToUpper != c.deltaToUpper:
		return false
	case d.deltaToLower != c.deltaToLower:
		return false
	case d.deltaToTitle != c.deltaToTitle:
		return false
	}
	return true
}
|
||||
|
||||
func getCaseState(i int) (c *caseState) {
|
||||
c = &caseState{ point: i, _case: CaseNone };
|
||||
ch := &chars[i];
|
||||
switch int(ch.codePoint) {
|
||||
case 0:
|
||||
c._case = CaseMissing; // Will get NUL wrong but that doesn't matter
|
||||
return;
|
||||
case ch.upperCase:
|
||||
c._case = CaseUpper;
|
||||
case ch.lowerCase:
|
||||
c._case = CaseLower;
|
||||
case ch.titleCase:
|
||||
c._case = CaseTitle;
|
||||
}
|
||||
if ch.upperCase != 0 {
|
||||
c.deltaToUpper = ch.upperCase - i
|
||||
}
|
||||
if ch.lowerCase != 0 {
|
||||
c.deltaToLower = ch.lowerCase - i
|
||||
}
|
||||
if ch.titleCase != 0 {
|
||||
c.deltaToTitle = ch.titleCase - i
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
func printCases() {
|
||||
if !*cases {
|
||||
return
|
||||
}
|
||||
if *test {
|
||||
fullCaseTest();
|
||||
return
|
||||
}
|
||||
fmt.Printf(
|
||||
"// Generated by running\n"
|
||||
"// maketables --data=%s\n"
|
||||
"// DO NOT EDIT\n\n"
|
||||
"// CaseRanges is the table describing case mappings for all letters with\n"
|
||||
"// non-self mappings.\n"
|
||||
"var CaseRanges = _CaseRanges\n"
|
||||
"var _CaseRanges = []CaseRange {\n",
|
||||
*dataUrl
|
||||
);
|
||||
|
||||
var startState *caseState; // the start of a run; nil for not active
|
||||
var prevState = &caseState{}; // the state of the previous character
|
||||
for i, c := range chars {
|
||||
state := getCaseState(i);
|
||||
if state.adjacent(prevState) {
|
||||
prevState = state;
|
||||
continue;
|
||||
}
|
||||
// end of run (possibly)
|
||||
printCaseRange(startState, prevState);
|
||||
startState = nil;
|
||||
if state._case != CaseMissing && state._case != CaseNone {
|
||||
startState = state;
|
||||
}
|
||||
prevState = state;
|
||||
}
|
||||
fmt.Printf("}\n");
|
||||
}
|
||||
|
||||
func printCaseRange(lo, hi *caseState) {
|
||||
if lo == nil {
|
||||
return
|
||||
}
|
||||
if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
|
||||
// character represents itself in all cases - no need to mention it
|
||||
return
|
||||
}
|
||||
fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
|
||||
lo.point, hi.point,
|
||||
lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
|
||||
}
|
||||
|
||||
// If the cased value in the Char is 0, it means use the rune itself.
// caseIt returns cased when it is non-zero and r otherwise.
func caseIt(r, cased int) int {
	if cased == 0 {
		return r
	}
	return cased
}
|
||||
|
||||
func fullCaseTest() {
|
||||
for i, c := range chars {
|
||||
lower := unicode.ToLower(i);
|
||||
want := caseIt(i, c.lowerCase);
|
||||
if lower != want {
|
||||
fmt.Fprintf(os.Stderr, "lower U+%04X should be U+%04X is U+%04X\n", i, want, lower);
|
||||
}
|
||||
upper := unicode.ToUpper(i);
|
||||
want = caseIt(i, c.upperCase);
|
||||
if upper != want {
|
||||
fmt.Fprintf(os.Stderr, "upper U+%04X should be U+%04X is U+%04X\n", i, want, upper);
|
||||
}
|
||||
title := unicode.ToTitle(i);
|
||||
want = caseIt(i, c.titleCase);
|
||||
if title != want {
|
||||
fmt.Fprintf(os.Stderr, "title U+%04X should be U+%04X is U+%04X\n", i, want, title);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue