introduce non-strict mode in xml parser,

good enough to parse some html. in reader, add "comment" tag to collect comment text. do not allocate during Unmarshal unless pointer is nil. R=r DELTA=441 (416 added, 1 deleted, 24 changed) OCL=35586 CL=35594
2025-12-08 06:10:04 +00:00 · 2009-10-11 23:51:46 -07:00 · 2009-10-11 23:51:46 -07:00 · d0aac0ace1
commit d0aac0ace1
parent fcdba72d2a
2 changed files with 437 additions and 22 deletions
--- a/src/pkg/xml/read.go
+++ b/src/pkg/xml/read.go
@ -162,14 +162,20 @@ func (p *Parser) unmarshal(val reflect.Value, start *StartElement) os.Error {
 	}
 	if pv, ok := val.(*reflect.PtrValue); ok {
-		zv := reflect.MakeZero(pv.Type().(*reflect.PtrType).Elem());
+		if pv.Get() == 0 {
-		pv.PointTo(zv);
+			zv := reflect.MakeZero(pv.Type().(*reflect.PtrType).Elem());
-		val = zv;
+			pv.PointTo(zv);
 			val = zv;
 		} else {
 			val = pv.Elem();
 		}
 	}
 	var (
 		data []byte;
 		saveData reflect.Value;
 		comment []byte;
 		saveComment reflect.Value;
 		sv *reflect.StructValue;
 		styp *reflect.StructType;
 	)
@ -251,7 +257,7 @@ func (p *Parser) unmarshal(val reflect.Value, start *StartElement) os.Error {
 		}
 		// Assign attributes.
-		// Also, determine whether we need to save character data.
+		// Also, determine whether we need to save character data or comments.
 		for i, n := 0, typ.NumField(); i < n; i++ {
 			f := typ.Field(i);
 			switch f.Tag {
@ -271,6 +277,11 @@ func (p *Parser) unmarshal(val reflect.Value, start *StartElement) os.Error {
 				}
 				strv.Set(val);
 			case "comment":
 				if saveComment == nil {
 					saveComment = sv.FieldByIndex(f.Index);
 				}
 			case "chardata":
 				if saveData == nil {
 					saveData = sv.FieldByIndex(f.Index);
@ -326,17 +337,27 @@ Loop:
 			if saveData != nil {
 				data = bytes.Add(data, t);
 			}
 		case Comment:
 			if saveComment != nil {
 				comment = bytes.Add(comment, t);
 			}
 		}
 	}
-	// Save accumulated character data
+	// Save accumulated character data and comments
-	if saveData != nil {
+	switch t := saveData.(type) {
-		switch t := saveData.(type) {
+	case *reflect.StringValue:
-		case *reflect.StringValue:
+		t.Set(string(data));
-			t.Set(string(data));
+	case *reflect.SliceValue:
-		case *reflect.SliceValue:
+		t.Set(reflect.NewValue(data).(*reflect.SliceValue));
-			t.Set(reflect.NewValue(data).(*reflect.SliceValue));
+	}
-		}
+
 	switch t := saveComment.(type) {
 	case *reflect.StringValue:
 		t.Set(string(comment));
 	case *reflect.SliceValue:
 		t.Set(reflect.NewValue(comment).(*reflect.SliceValue));
 	}
 	return nil;
--- a/src/pkg/xml/xml.go
+++ b/src/pkg/xml/xml.go
@ -111,12 +111,49 @@ type readByter interface {
 // A Parser represents an XML parser reading a particular input stream.
 // The parser assumes that its input is encoded in UTF-8.
 type Parser struct {
 	// Strict defaults to true, enforcing the requirements
 	// of the XML specification.
 	// If set to false, the parser allows input containing common
 	// mistakes:
 	//	* If an element is missing an end tag, the parser invents
 	//	  end tags as necessary to keep the return values from Token
 	//	  properly balanced.
 	//	* In attribute values and character data, unknown or malformed
 	//	  character entities (sequences beginning with &) are left alone.
 	//
 	// Setting:
 	//
 	//	p.Strict = false;
 	//	p.AutoClose = HTMLAutoClose;
 	//	p.Entity = HTMLEntity
 	//
 	// creates a parser that can handle typical HTML.
 	Strict	bool;
 	// When Strict == false, AutoClose indicates a set of elements to
 	// consider closed immediately after they are opened, regardless
 	// of whether an end element is present.
 	AutoClose	[]string;
 	// Entity can be used to map non-standard entity names to string replacements.
 	// The parser behaves as if these standard mappings are present in the map,
 	// regardless of the actual map content:
 	//
 	//	"lt": "<",
 	//	"gt": ">",
 	//	"amp": "&",
 	//	"pos": "'",
 	//	"quot": `"`,
 	//
 	Entity	map[string]string;
 	r		readByter;
 	buf		bytes.Buffer;
 	stk		*stack;
 	free		*stack;
 	needClose	bool;
 	toClose		Name;
 	nextToken	Token;
 	nextByte	int;
 	ns		map[string]string;
 	err		os.Error;
@ -130,6 +167,7 @@ func NewParser(r io.Reader) *Parser {
 		ns: make(map[string]string),
 		nextByte: -1,
 		line: 1,
 		Strict: true,
 	};
 	// Get efficient byte at a time reader.
@ -169,9 +207,19 @@ func NewParser(r io.Reader) *Parser {
 // it uses the prefix as the Space rather than report an error.
 //
 func (p *Parser) Token() (t Token, err os.Error) {
-	if t, err = p.RawToken(); err != nil {
+	if p.nextToken != nil {
 		t = p.nextToken;
 		p.nextToken = nil;
 	} else if t, err = p.RawToken(); err != nil {
 		return;
 	}
 	if !p.Strict {
 		if t1, ok := p.autoClose(t); ok {
 			p.nextToken = t;
 			t = t1;
 		}
 	}
 	switch t1 := t.(type) {
 	case StartElement:
 		// In XML name spaces, the translations listed in the
@ -201,7 +249,7 @@ func (p *Parser) Token() (t Token, err os.Error) {
 	case EndElement:
 		p.translate(&t1.Name, true);
-		if !p.popElement(t1.Name) {
+		if !p.popElement(&t1) {
 			return nil, p.err;
 		}
 		t = t1;
@ -286,13 +334,20 @@ func (p *Parser) pushNs(local string, url string, ok bool) {
 // After popping the element, apply any undo records from
 // the stack to restore the name translations that existed
 // before we saw this element.
-func (p *Parser) popElement(name Name) bool {
+func (p *Parser) popElement(t *EndElement) bool {
 	s := p.pop();
 	name := t.Name;
 	switch {
 	case s == nil || s.kind != stkStart:
 		p.err = SyntaxError("unexpected end element </" + name.Local + ">");
 		return false;
 	case s.name.Local != name.Local:
 		if !p.Strict {
 			p.needClose = true;
 			p.toClose = t.Name;
 			t.Name = s.name;
 			return true;
 		}
 		p.err = SyntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">");
 		return false;
 	case s.name.Space != name.Space:
@ -311,6 +366,27 @@ func (p *Parser) popElement(name Name) bool {
 	return true;
 }
 // If the top element on the stack is autoclosing and
 // t is not the end tag, invent the end tag.
 func (p *Parser) autoClose(t Token) (Token, bool) {
 	if p.stk == nil || p.stk.kind != stkStart {
 		return nil, false;
 	}
 	name := strings.ToLower(p.stk.name.Local);
 	for _, s := range p.AutoClose {
 		if strings.ToLower(s) == name {
 			// This one should be auto closed if t doesn't close it.
 			et, ok := t.(EndElement);
 			if !ok || et.Name.Local != name {
 				return EndElement{p.stk.name}, true;
 			}
 			break;
 		}
 	}
 	return nil, false;
 }
 // RawToken is like Token but does not verify that
 // start and end elements match and does not translate
 // name space prefixes to their corresponding URLs.
@ -645,21 +721,38 @@ Input:
 			// Parsers are required to recognize lt, gt, amp, apos, and quot
 			// even if they have not been declared.  That's all we allow.
 			var i int;
 		CharLoop:
 			for i = 0; i < len(p.tmp); i++ {
 				p.tmp[i], p.err = p.r.ReadByte();
 				if p.err != nil {
 					return nil;
 				}
-				if p.tmp[i] == ';' {
+				c := p.tmp[i];
 				if c == ';' {
 					break;
 				}
 				if 'a' <= c && c <= 'z' ||
 					'A' <= c && c <= 'Z' ||
 					'0' <= c && c <= '9' ||
 					c == '_' || c == '#' {
 					continue;
 				}
 				p.ungetc(c);
 				break;
 			}
 			s := string(p.tmp[0:i]);
 			if i >= len(p.tmp) {
 				if !p.Strict {
 					b0, b1 = 0, 0;
 					p.buf.WriteByte('&');
 					p.buf.Write(p.tmp[0:i]);
 					continue Input;
 				}
 				p.err = SyntaxError("character entity expression &" + s + "... too long");
 				return nil;
 			}
-			rune := -1;
+			var haveText bool;
 			var text string;
 			if i >= 2 && s[0] == '#' {
 				var n uint64;
 				var err os.Error;
@ -669,19 +762,28 @@ Input:
 					n, err = strconv.Btoui64(s[1:len(s)], 10);
 				}
 				if err == nil && n <= unicode.MaxRune {
-					rune = int(n);
+					text = string(n);
 					haveText = true;
 				}
 			} else {
 				if r, ok := entity[s]; ok {
-					rune = r;
+					text = string(r);
 					haveText = true;
 				} else {
 					text, haveText = p.Entity[s];
 				}
 			}
-			if rune < 0 {
+			if !haveText {
 				if !p.Strict {
 					b0, b1 = 0, 0;
 					p.buf.WriteByte('&');
 					p.buf.Write(p.tmp[0:i]);
 					continue Input;
 				}
 				p.err = SyntaxError("invalid character entity &" + s + ";");
 				return nil;
 			}
-			i = utf8.EncodeRune(rune, &p.tmp);
+			p.buf.Write(strings.Bytes(text));
 			p.buf.Write(p.tmp[0:i]);
 			b0, b1 = 0, 0;
 			continue Input;
 		}
@ -764,6 +866,7 @@ func (p *Parser) name() (s string, ok bool) {
 func isNameByte(c byte) bool {
 	return 'A' <= c && c <= 'Z' ||
 		'a' <= c && c <= 'z' ||
 		'0' <= c && c <= '9' ||
 		c == '_' || c == ':' || c == '.' || c == '-';
 }
@ -1079,3 +1182,294 @@ var second = []unicode.Range{
 	unicode.Range{0x309D, 0x309E, 1},
 	unicode.Range{0x30FC, 0x30FE, 1},
 }
 // HTMLEntity is an entity map containing translations for the
 // standard HTML entity characters.
 var HTMLEntity = htmlEntity
 var htmlEntity = map[string]string {
 /*
 	hget http://www.w3.org/TR/html4/sgml/entities.html |
 	ssam '
 		,y /\&gt;/ x/\&lt;(.|\n)+/ s/\n/ /g
 		,x v/^\&lt;!ENTITY/d
 		,s/\&lt;!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/	"\1": "\\u\2",/g
 	'
 */
 	"nbsp": "\u00A0",
 	"iexcl": "\u00A1",
 	"cent": "\u00A2",
 	"pound": "\u00A3",
 	"curren": "\u00A4",
 	"yen": "\u00A5",
 	"brvbar": "\u00A6",
 	"sect": "\u00A7",
 	"uml": "\u00A8",
 	"copy": "\u00A9",
 	"ordf": "\u00AA",
 	"laquo": "\u00AB",
 	"not": "\u00AC",
 	"shy": "\u00AD",
 	"reg": "\u00AE",
 	"macr": "\u00AF",
 	"deg": "\u00B0",
 	"plusmn": "\u00B1",
 	"sup2": "\u00B2",
 	"sup3": "\u00B3",
 	"acute": "\u00B4",
 	"micro": "\u00B5",
 	"para": "\u00B6",
 	"middot": "\u00B7",
 	"cedil": "\u00B8",
 	"sup1": "\u00B9",
 	"ordm": "\u00BA",
 	"raquo": "\u00BB",
 	"frac14": "\u00BC",
 	"frac12": "\u00BD",
 	"frac34": "\u00BE",
 	"iquest": "\u00BF",
 	"Agrave": "\u00C0",
 	"Aacute": "\u00C1",
 	"Acirc": "\u00C2",
 	"Atilde": "\u00C3",
 	"Auml": "\u00C4",
 	"Aring": "\u00C5",
 	"AElig": "\u00C6",
 	"Ccedil": "\u00C7",
 	"Egrave": "\u00C8",
 	"Eacute": "\u00C9",
 	"Ecirc": "\u00CA",
 	"Euml": "\u00CB",
 	"Igrave": "\u00CC",
 	"Iacute": "\u00CD",
 	"Icirc": "\u00CE",
 	"Iuml": "\u00CF",
 	"ETH": "\u00D0",
 	"Ntilde": "\u00D1",
 	"Ograve": "\u00D2",
 	"Oacute": "\u00D3",
 	"Ocirc": "\u00D4",
 	"Otilde": "\u00D5",
 	"Ouml": "\u00D6",
 	"times": "\u00D7",
 	"Oslash": "\u00D8",
 	"Ugrave": "\u00D9",
 	"Uacute": "\u00DA",
 	"Ucirc": "\u00DB",
 	"Uuml": "\u00DC",
 	"Yacute": "\u00DD",
 	"THORN": "\u00DE",
 	"szlig": "\u00DF",
 	"agrave": "\u00E0",
 	"aacute": "\u00E1",
 	"acirc": "\u00E2",
 	"atilde": "\u00E3",
 	"auml": "\u00E4",
 	"aring": "\u00E5",
 	"aelig": "\u00E6",
 	"ccedil": "\u00E7",
 	"egrave": "\u00E8",
 	"eacute": "\u00E9",
 	"ecirc": "\u00EA",
 	"euml": "\u00EB",
 	"igrave": "\u00EC",
 	"iacute": "\u00ED",
 	"icirc": "\u00EE",
 	"iuml": "\u00EF",
 	"eth": "\u00F0",
 	"ntilde": "\u00F1",
 	"ograve": "\u00F2",
 	"oacute": "\u00F3",
 	"ocirc": "\u00F4",
 	"otilde": "\u00F5",
 	"ouml": "\u00F6",
 	"divide": "\u00F7",
 	"oslash": "\u00F8",
 	"ugrave": "\u00F9",
 	"uacute": "\u00FA",
 	"ucirc": "\u00FB",
 	"uuml": "\u00FC",
 	"yacute": "\u00FD",
 	"thorn": "\u00FE",
 	"yuml": "\u00FF",
 	"fnof": "\u0192",
 	"Alpha": "\u0391",
 	"Beta": "\u0392",
 	"Gamma": "\u0393",
 	"Delta": "\u0394",
 	"Epsilon": "\u0395",
 	"Zeta": "\u0396",
 	"Eta": "\u0397",
 	"Theta": "\u0398",
 	"Iota": "\u0399",
 	"Kappa": "\u039A",
 	"Lambda": "\u039B",
 	"Mu": "\u039C",
 	"Nu": "\u039D",
 	"Xi": "\u039E",
 	"Omicron": "\u039F",
 	"Pi": "\u03A0",
 	"Rho": "\u03A1",
 	"Sigma": "\u03A3",
 	"Tau": "\u03A4",
 	"Upsilon": "\u03A5",
 	"Phi": "\u03A6",
 	"Chi": "\u03A7",
 	"Psi": "\u03A8",
 	"Omega": "\u03A9",
 	"alpha": "\u03B1",
 	"beta": "\u03B2",
 	"gamma": "\u03B3",
 	"delta": "\u03B4",
 	"epsilon": "\u03B5",
 	"zeta": "\u03B6",
 	"eta": "\u03B7",
 	"theta": "\u03B8",
 	"iota": "\u03B9",
 	"kappa": "\u03BA",
 	"lambda": "\u03BB",
 	"mu": "\u03BC",
 	"nu": "\u03BD",
 	"xi": "\u03BE",
 	"omicron": "\u03BF",
 	"pi": "\u03C0",
 	"rho": "\u03C1",
 	"sigmaf": "\u03C2",
 	"sigma": "\u03C3",
 	"tau": "\u03C4",
 	"upsilon": "\u03C5",
 	"phi": "\u03C6",
 	"chi": "\u03C7",
 	"psi": "\u03C8",
 	"omega": "\u03C9",
 	"thetasym": "\u03D1",
 	"upsih": "\u03D2",
 	"piv": "\u03D6",
 	"bull": "\u2022",
 	"hellip": "\u2026",
 	"prime": "\u2032",
 	"Prime": "\u2033",
 	"oline": "\u203E",
 	"frasl": "\u2044",
 	"weierp": "\u2118",
 	"image": "\u2111",
 	"real": "\u211C",
 	"trade": "\u2122",
 	"alefsym": "\u2135",
 	"larr": "\u2190",
 	"uarr": "\u2191",
 	"rarr": "\u2192",
 	"darr": "\u2193",
 	"harr": "\u2194",
 	"crarr": "\u21B5",
 	"lArr": "\u21D0",
 	"uArr": "\u21D1",
 	"rArr": "\u21D2",
 	"dArr": "\u21D3",
 	"hArr": "\u21D4",
 	"forall": "\u2200",
 	"part": "\u2202",
 	"exist": "\u2203",
 	"empty": "\u2205",
 	"nabla": "\u2207",
 	"isin": "\u2208",
 	"notin": "\u2209",
 	"ni": "\u220B",
 	"prod": "\u220F",
 	"sum": "\u2211",
 	"minus": "\u2212",
 	"lowast": "\u2217",
 	"radic": "\u221A",
 	"prop": "\u221D",
 	"infin": "\u221E",
 	"ang": "\u2220",
 	"and": "\u2227",
 	"or": "\u2228",
 	"cap": "\u2229",
 	"cup": "\u222A",
 	"int": "\u222B",
 	"there4": "\u2234",
 	"sim": "\u223C",
 	"cong": "\u2245",
 	"asymp": "\u2248",
 	"ne": "\u2260",
 	"equiv": "\u2261",
 	"le": "\u2264",
 	"ge": "\u2265",
 	"sub": "\u2282",
 	"sup": "\u2283",
 	"nsub": "\u2284",
 	"sube": "\u2286",
 	"supe": "\u2287",
 	"oplus": "\u2295",
 	"otimes": "\u2297",
 	"perp": "\u22A5",
 	"sdot": "\u22C5",
 	"lceil": "\u2308",
 	"rceil": "\u2309",
 	"lfloor": "\u230A",
 	"rfloor": "\u230B",
 	"lang": "\u2329",
 	"rang": "\u232A",
 	"loz": "\u25CA",
 	"spades": "\u2660",
 	"clubs": "\u2663",
 	"hearts": "\u2665",
 	"diams": "\u2666",
 	"quot": "\u0022",
 	"amp": "\u0026",
 	"lt": "\u003C",
 	"gt": "\u003E",
 	"OElig": "\u0152",
 	"oelig": "\u0153",
 	"Scaron": "\u0160",
 	"scaron": "\u0161",
 	"Yuml": "\u0178",
 	"circ": "\u02C6",
 	"tilde": "\u02DC",
 	"ensp": "\u2002",
 	"emsp": "\u2003",
 	"thinsp": "\u2009",
 	"zwnj": "\u200C",
 	"zwj": "\u200D",
 	"lrm": "\u200E",
 	"rlm": "\u200F",
 	"ndash": "\u2013",
 	"mdash": "\u2014",
 	"lsquo": "\u2018",
 	"rsquo": "\u2019",
 	"sbquo": "\u201A",
 	"ldquo": "\u201C",
 	"rdquo": "\u201D",
 	"bdquo": "\u201E",
 	"dagger": "\u2020",
 	"Dagger": "\u2021",
 	"permil": "\u2030",
 	"lsaquo": "\u2039",
 	"rsaquo": "\u203A",
 	"euro": "\u20AC",
 }
 // HTMLAutoClose is the set of HTML elements that
 // should be considered to close automatically.
 var HTMLAutoClose = htmlAutoClose
 var htmlAutoClose = []string {
 /*
 	hget http://www.w3.org/TR/html4/loose.dtd |
 	9 sed -n 's/<!ELEMENT (.*) - O EMPTY.+/	"\1",/p' | tr A-Z a-z
 */
 	"basefont",
 	"br",
 	"area",
 	"link",
 	"img",
 	"param",
 	"hr",
 	"input",
 	"col     ",
 	"frame",
 	"isindex",
 	"base",
 	"meta",
 }