// Copyright 2009 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Package xml implements a simple XML 1.0 parser that // understands XML name spaces. package xml // References: // Annotated XML spec: https://www.xml.com/axml/testaxml.htm // XML name spaces: https://www.w3.org/TR/REC-xml-names/ // TODO(rsc): // Test error handling. import ( "bufio" "bytes" "errors" "fmt" "io" "strconv" "strings" "unicode" "unicode/utf8" ) // A SyntaxError represents a syntax error in the XML input stream. type SyntaxError struct { Msg string Line int } func (e *SyntaxError) Error() string { return "XML syntax error on line " + strconv.Itoa(e.Line) + ": " + e.Msg } // A Name represents an XML name (Local) annotated // with a name space identifier (Space). // In tokens returned by Decoder.Token, the Space identifier // is given as a canonical URL, not the short prefix used // in the document being parsed. type Name struct { Space, Local string } // An Attr represents an attribute in an XML element (Name=Value). type Attr struct { Name Name Value string } // A Token is an interface holding one of the token types: // StartElement, EndElement, CharData, Comment, ProcInst, or Directive. type Token interface{} // A StartElement represents an XML start element. type StartElement struct { Name Name Attr []Attr } // Copy creates a new copy of StartElement. func (e StartElement) Copy() StartElement { attrs := make([]Attr, len(e.Attr)) copy(attrs, e.Attr) e.Attr = attrs return e } // End returns the corresponding XML end element. func (e StartElement) End() EndElement { return EndElement{e.Name} } // An EndElement represents an XML end element. type EndElement struct { Name Name } // A CharData represents XML character data (raw text), // in which XML escape sequences have been replaced by // the characters they represent. type CharData []byte func makeCopy(b []byte) []byte { b1 := make([]byte, len(b)) copy(b1, b) return b1 } // Copy creates a new copy of CharData. func (c CharData) Copy() CharData { return CharData(makeCopy(c)) } // A Comment represents an XML comment of the form . // The bytes do not include the comment markers. type Comment []byte // Copy creates a new copy of Comment. func (c Comment) Copy() Comment { return Comment(makeCopy(c)) } // A ProcInst represents an XML processing instruction of the form type ProcInst struct { Target string Inst []byte } // Copy creates a new copy of ProcInst. func (p ProcInst) Copy() ProcInst { p.Inst = makeCopy(p.Inst) return p } // A Directive represents an XML directive of the form . // The bytes do not include the markers. type Directive []byte // Copy creates a new copy of Directive. func (d Directive) Copy() Directive { return Directive(makeCopy(d)) } // CopyToken returns a copy of a Token. func CopyToken(t Token) Token { switch v := t.(type) { case CharData: return v.Copy() case Comment: return v.Copy() case Directive: return v.Copy() case ProcInst: return v.Copy() case StartElement: return v.Copy() } return t } // A TokenReader is anything that can decode a stream of XML tokens, including a // Decoder. // // When Token encounters an error or end-of-file condition after successfully // reading a token, it returns the token. It may return the (non-nil) error from // the same call or return the error (and a nil token) from a subsequent call. // An instance of this general case is that a TokenReader returning a non-nil // token at the end of the token stream may return either io.EOF or a nil error. // The next Read should return nil, io.EOF. // // Implementations of Token are discouraged from returning a nil token with a // nil error. Callers should treat a return of nil, nil as indicating that // nothing happened; in particular it does not indicate EOF. type TokenReader interface { Token() (Token, error) } // A Decoder represents an XML parser reading a particular input stream. // The parser assumes that its input is encoded in UTF-8. type Decoder struct { // Strict defaults to true, enforcing the requirements // of the XML specification. // If set to false, the parser allows input containing common // mistakes: // * If an element is missing an end tag, the parser invents // end tags as necessary to keep the return values from Token // properly balanced. // * In attribute values and character data, unknown or malformed // character entities (sequences beginning with &) are left alone. // // Setting: // // d.Strict = false // d.AutoClose = xml.HTMLAutoClose // d.Entity = xml.HTMLEntity // // creates a parser that can handle typical HTML. // // Strict mode does not enforce the requirements of the XML name spaces TR. // In particular it does not reject name space tags using undefined prefixes. // Such tags are recorded with the unknown prefix as the name space URL. Strict bool // When Strict == false, AutoClose indicates a set of elements to // consider closed immediately after they are opened, regardless // of whether an end element is present. AutoClose []string // Entity can be used to map non-standard entity names to string replacements. // The parser behaves as if these standard mappings are present in the map, // regardless of the actual map content: // // "lt": "<", // "gt": ">", // "amp": "&", // "apos": "'", // "quot": `"`, Entity map[string]string // CharsetReader, if non-nil, defines a function to generate // charset-conversion readers, converting from the provided // non-UTF-8 charset into UTF-8. If CharsetReader is nil or // returns an error, parsing stops with an error. One of the // CharsetReader's result values must be non-nil. CharsetReader func(charset string, input io.Reader) (io.Reader, error) // DefaultSpace sets the default name space used for unadorned tags, // as if the entire XML stream were wrapped in an element containing // the attribute xmlns="DefaultSpace". DefaultSpace string r io.ByteReader t TokenReader buf bytes.Buffer saved *bytes.Buffer stk *stack free *stack needClose bool toClose Name nextToken Token nextByte int ns map[string]string err error line int offset int64 unmarshalDepth int } // NewDecoder creates a new XML parser reading from r. // If r does not implement io.ByteReader, NewDecoder will // do its own buffering. func NewDecoder(r io.Reader) *Decoder { d := &Decoder{ ns: make(map[string]string), nextByte: -1, line: 1, Strict: true, } d.switchToReader(r) return d } // NewTokenDecoder creates a new XML parser using an underlying token stream. func NewTokenDecoder(t TokenReader) *Decoder { // Is it already a Decoder? if d, ok := t.(*Decoder); ok { return d } d := &Decoder{ ns: make(map[string]string), t: t, nextByte: -1, line: 1, Strict: true, } return d } // Token returns the next XML token in the input stream. // At the end of the input stream, Token returns nil, io.EOF. // // Slices of bytes in the returned token data refer to the // parser's internal buffer and remain valid only until the next // call to Token. To acquire a copy of the bytes, call CopyToken // or the token's Copy method. // // Token expands self-closing elements such as
// into separate start and end elements returned by successive calls. // // Token guarantees that the StartElement and EndElement // tokens it returns are properly nested and matched: // if Token encounters an unexpected end element // or EOF before all expected end elements, // it will return an error. // // Token implements XML name spaces as described by // https://www.w3.org/TR/REC-xml-names/. Each of the // Name structures contained in the Token has the Space // set to the URL identifying its name space when known. // If Token encounters an unrecognized name space prefix, // it uses the prefix as the Space rather than report an error. func (d *Decoder) Token() (Token, error) { var t Token var err error if d.stk != nil && d.stk.kind == stkEOF { return nil, io.EOF } if d.nextToken != nil { t = d.nextToken d.nextToken = nil } else if t, err = d.rawToken(); err != nil { if err == io.EOF && d.stk != nil && d.stk.kind != stkEOF { err = d.syntaxError("unexpected EOF") } return t, err } if !d.Strict { if t1, ok := d.autoClose(t); ok { d.nextToken = t t = t1 } } switch t1 := t.(type) { case StartElement: // In XML name spaces, the translations listed in the // attributes apply to the element name and // to the other attribute names, so process // the translations first. for _, a := range t1.Attr { if a.Name.Space == xmlnsPrefix { v, ok := d.ns[a.Name.Local] d.pushNs(a.Name.Local, v, ok) d.ns[a.Name.Local] = a.Value } if a.Name.Space == "" && a.Name.Local == xmlnsPrefix { // Default space for untagged names v, ok := d.ns[""] d.pushNs("", v, ok) d.ns[""] = a.Value } } d.translate(&t1.Name, true) for i := range t1.Attr { d.translate(&t1.Attr[i].Name, false) } d.pushElement(t1.Name) t = t1 case EndElement: d.translate(&t1.Name, true) if !d.popElement(&t1) { return nil, d.err } t = t1 } return t, err } const ( xmlURL = "http://www.w3.org/XML/1998/namespace" xmlnsPrefix = "xmlns" xmlPrefix = "xml" ) // Apply name space translation to name n. // The default name space (for Space=="") // applies only to element names, not to attribute names. func (d *Decoder) translate(n *Name, isElementName bool) { switch { case n.Space == xmlnsPrefix: return case n.Space == "" && !isElementName: return case n.Space == xmlPrefix: n.Space = xmlURL case n.Space == "" && n.Local == xmlnsPrefix: return } if v, ok := d.ns[n.Space]; ok { n.Space = v } else if n.Space == "" { n.Space = d.DefaultSpace } } func (d *Decoder) switchToReader(r io.Reader) { // Get efficient byte at a time reader. // Assume that if reader has its own // ReadByte, it's efficient enough. // Otherwise, use bufio. if rb, ok := r.(io.ByteReader); ok { d.r = rb } else { d.r = bufio.NewReader(r) } } // Parsing state - stack holds old name space translations // and the current set of open elements. The translations to pop when // ending a given tag are *below* it on the stack, which is // more work but forced on us by XML. type stack struct { next *stack kind int name Name ok bool } const ( stkStart = iota stkNs stkEOF ) func (d *Decoder) push(kind int) *stack { s := d.free if s != nil { d.free = s.next } else { s = new(stack) } s.next = d.stk s.kind = kind d.stk = s return s } func (d *Decoder) pop() *stack { s := d.stk if s != nil { d.stk = s.next s.next = d.free d.free = s } return s } // Record that after the current element is finished // (that element is already pushed on the stack) // Token should return EOF until popEOF is called. func (d *Decoder) pushEOF() { // Walk down stack to find Start. // It might not be the top, because there might be stkNs // entries above it. start := d.stk for start.kind != stkStart { start = start.next } // The stkNs entries below a start are associated with that // element too; skip over them. for start.next != nil && start.next.kind == stkNs { start = start.next } s := d.free if s != nil { d.free = s.next } else { s = new(stack) } s.kind = stkEOF s.next = start.next start.next = s } // Undo a pushEOF. // The element must have been finished, so the EOF should be at the top of the stack. func (d *Decoder) popEOF() bool { if d.stk == nil || d.stk.kind != stkEOF { return false } d.pop() return true } // Record that we are starting an element with the given name. func (d *Decoder) pushElement(name Name) { s := d.push(stkStart) s.name = name } // Record that we are changing the value of ns[local]. // The old value is url, ok. func (d *Decoder) pushNs(local string, url string, ok bool) { s := d.push(stkNs) s.name.Local = local s.name.Space = url s.ok = ok } // Creates a SyntaxError with the current line number. func (d *Decoder) syntaxError(msg string) error { return &SyntaxError{Msg: msg, Line: d.line} } // Record that we are ending an element with the given name. // The name must match the record at the top of the stack, // which must be a pushElement record. // After popping the element, apply any undo records from // the stack to restore the name translations that existed // before we saw this element. func (d *Decoder) popElement(t *EndElement) bool { s := d.pop() name := t.Name switch { case s == nil || s.kind != stkStart: d.err = d.syntaxError("unexpected end element ") return false case s.name.Local != name.Local: if !d.Strict { d.needClose = true d.toClose = t.Name t.Name = s.name return true } d.err = d.syntaxError("element <" + s.name.Local + "> closed by ") return false case s.name.Space != name.Space: d.err = d.syntaxError("element <" + s.name.Local + "> in space " + s.name.Space + "closed by in space " + name.Space) return false } // Pop stack until a Start or EOF is on the top, undoing the // translations that were associated with the element we just closed. for d.stk != nil && d.stk.kind != stkStart && d.stk.kind != stkEOF { s := d.pop() if s.ok { d.ns[s.name.Local] = s.name.Space } else { delete(d.ns, s.name.Local) } } return true } // If the top element on the stack is autoclosing and // t is not the end tag, invent the end tag. func (d *Decoder) autoClose(t Token) (Token, bool) { if d.stk == nil || d.stk.kind != stkStart { return nil, false } name := strings.ToLower(d.stk.name.Local) for _, s := range d.AutoClose { if strings.ToLower(s) == name { // This one should be auto closed if t doesn't close it. et, ok := t.(EndElement) if !ok || et.Name.Local != name { return EndElement{d.stk.name}, true } break } } return nil, false } var errRawToken = errors.New("xml: cannot use RawToken from UnmarshalXML method") // RawToken is like Token but does not verify that // start and end elements match and does not translate // name space prefixes to their corresponding URLs. func (d *Decoder) RawToken() (Token, error) { if d.unmarshalDepth > 0 { return nil, errRawToken } return d.rawToken() } func (d *Decoder) rawToken() (Token, error) { if d.t != nil { return d.t.Token() } if d.err != nil { return nil, d.err } if d.needClose { // The last element we read was self-closing and // we returned just the StartElement half. // Return the EndElement half now. d.needClose = false return EndElement{d.toClose}, nil } b, ok := d.getc() if !ok { return nil, d.err } if b != '<' { // Text section. d.ungetc(b) data := d.text(-1, false) if data == nil { return nil, d.err } return CharData(data), nil } if b, ok = d.mustgetc(); !ok { return nil, d.err } switch b { case '/': // ' { d.err = d.syntaxError("invalid characters between ") return nil, d.err } return EndElement{name}, nil case '?': // ' { break } b0 = b } data := d.buf.Bytes() data = data[0 : len(data)-2] // chop ?> if target == "xml" { content := string(data) ver := procInst("version", content) if ver != "" && ver != "1.0" { d.err = fmt.Errorf("xml: unsupported version %q; only version 1.0 is supported", ver) return nil, d.err } enc := procInst("encoding", content) if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") { if d.CharsetReader == nil { d.err = fmt.Errorf("xml: encoding %q declared but Decoder.CharsetReader is nil", enc) return nil, d.err } newr, err := d.CharsetReader(enc, d.r.(io.Reader)) if err != nil { d.err = fmt.Errorf("xml: opening charset %q: %v", enc, err) return nil, d.err } if newr == nil { panic("CharsetReader returned a nil Reader for charset " + enc) } d.switchToReader(newr) } } return ProcInst{target, data}, nil case '!': // ' { d.err = d.syntaxError( `invalid sequence "--" not allowed in comments`) return nil, d.err } break } b0, b1 = b1, b } data := d.buf.Bytes() data = data[0 : len(data)-3] // chop --> return Comment(data), nil case '[': // . data := d.text(-1, true) if data == nil { return nil, d.err } return CharData(data), nil } // Probably a directive: , , etc. // We don't care, but accumulate for caller. Quoted angle // brackets do not count for nesting. d.buf.Reset() d.buf.WriteByte(b) inquote := uint8(0) depth := 0 for { if b, ok = d.mustgetc(); !ok { return nil, d.err } if inquote == 0 && b == '>' && depth == 0 { break } HandleB: d.buf.WriteByte(b) switch { case b == inquote: inquote = 0 case inquote != 0: // in quotes, no special action case b == '\'' || b == '"': inquote = b case b == '>' && inquote == 0: depth-- case b == '<' && inquote == 0: // Look for