/*
 * Copyright (C)2005-2019 Haxe Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package haxe.xml;

using StringTools;

/**
	States of the character-driven state machine in `Parser.doParse`.
**/
private enum abstract S(Int) {
	/** Skip whitespace, then switch to the pending `next` state. **/
	var IGNORE_SPACES;
	/** Start of content: either `<` or the first character of text. **/
	var BEGIN;
	/** Just after `<`: decide between element, close tag, `<!...` and `<?...`. **/
	var BEGIN_NODE;
	/** Reading an element name. **/
	var TAG_NAME;
	/** Inside an open tag, between attributes. **/
	var BODY;
	/** Reading an attribute name. **/
	var ATTRIB_NAME;
	/** Expecting `=` after an attribute name. **/
	var EQUALS;
	/** Expecting the opening quote of an attribute value. **/
	var ATTVAL_BEGIN;
	/** Reading an attribute value up to the matching quote. **/
	var ATTRIB_VAL;
	/** Open tag finished with `>`: children are parsed recursively. **/
	var CHILDS;
	/** Reading the name of a closing tag. **/
	var CLOSE;
	/** After `/` in a self-closing tag: expecting `>`. **/
	var WAIT_END;
	/** After a closing tag name: expecting `>`, then return to the caller. **/
	var WAIT_END_RET;
	/** Reading character data. **/
	var PCDATA;
	/** Reading a `<?...?>` processing instruction. **/
	var HEADER;
	/** Reading a `<!--...-->` comment. **/
	var COMMENT;
	/** Reading a `<!DOCTYPE...>` declaration. **/
	var DOCTYPE;
	/** Reading a `<![CDATA[...]]>` section. **/
	var CDATA;
	/** Reading an entity or character reference after `&`. **/
	var ESCAPE;
}

/**
	Error raised by `Parser` when the input cannot be parsed.

	The line number and the column are derived from `position` when the
	exception is constructed.
**/
class XmlParserException {
	/**
		the XML parsing error message
	**/
	public var message:String;

	/**
		the line number at which the XML parsing error occurred
	**/
	public var lineNumber:Int;

	/**
		the character position in the reported line at which the parsing error occurred
	**/
	public var positionAtLine:Int;

	/**
		the character position in the XML string at which the parsing error occurred
	**/
	public var position:Int;

	/**
		the invalid XML string
	**/
	public var xml:String;

	/**
		Creates the exception and computes `lineNumber` (1-based) and
		`positionAtLine` by scanning `xml` from its start up to `position`.

		`'\n'` starts a new line; `'\r'` is not counted as a column.
	**/
	public function new(message:String, xml:String, position:Int) {
		this.xml = xml;
		this.message = message;
		this.position = position;
		lineNumber = 1;
		positionAtLine = 0;

		// The parser may report a position past the end of the input;
		// never read character codes beyond the string.
		var limit = position < xml.length ? position : xml.length;
		for (i in 0...limit) {
			var c = xml.fastCodeAt(i);
			if (c == '\n'.code) {
				lineNumber++;
				positionAtLine = 0;
			} else if (c != '\r'.code) {
				positionAtLine++;
			}
		}
	}

	/**
		Returns the class name followed by the message, line and column.
	**/
	public function toString():String {
		return Type.getClassName(Type.getClass(this)) + ": " + message + " at line " + lineNumber + " char " + positionAtLine;
	}
}

/**
	Hand-written XML parser producing an `Xml` tree.
**/
class Parser {
	/** Named entities that are replaced by their character. **/
	static var escapes = {
		var h = new haxe.ds.StringMap();
		h.set("lt", "<");
		h.set("gt", ">");
		h.set("amp", "&");
		h.set("quot", '"');
		h.set("apos", "'");
		h;
	}

	/**
		Parses the String into an XML Document. Set strict parsing to true in order to enable a strict check of XML attributes and entities.

		@throws haxe.xml.XmlParserException
	**/
	static public function parse(str:String, strict = false) {
		var doc = Xml.createDocument();
		doParse(str, strict, 0, doc);
		return doc;
	}

	/**
		Parses the content of `str` starting at index `p` and appends the
		resulting nodes to `parent`.

		The function calls itself for the children of every element. A nested
		call returns the index of the `>` that ends the closing tag of
		`parent`; the outermost call returns once the end of `str` is reached.

		@param str the complete XML source
		@param strict when true, unescaped `<`/`>` in attribute values and
			unknown or malformed entities raise an error instead of being kept
		@param p index of the first character to read
		@param parent node receiving the parsed children
		@return the index at which parsing stopped
		@throws haxe.xml.XmlParserException on malformed input
	**/
	static function doParse(str:String, strict:Bool, p:Int = 0, ?parent:Xml):Int {
		var xml:Xml = null;
		var state = S.BEGIN;
		var next = S.BEGIN;
		var aname = null;
		var start = 0;
		var nsubs = 0;
		var nbrackets = 0;
		var buf = new StringBuf();
		// need extra state because next is in use
		var escapeNext = S.BEGIN;
		var attrValQuote = -1;
		inline function addChild(xml:Xml) {
			parent.addChild(xml);
			nsubs++;
		}
		while (p < str.length) {
			var c = str.unsafeCodeAt(p);
			switch (state) {
				case S.IGNORE_SPACES:
					switch (c) {
						case '\n'.code, '\r'.code, '\t'.code, ' '.code:
						default:
							// re-dispatch the same character in the pending state
							state = next;
							continue;
					}
				case S.BEGIN:
					switch (c) {
						case '<'.code:
							state = S.IGNORE_SPACES;
							next = S.BEGIN_NODE;
						default:
							start = p;
							state = S.PCDATA;
							continue;
					}
				case S.PCDATA:
					if (c == '<'.code) {
						buf.addSub(str, start, p - start);
						var child = Xml.createPCData(buf.toString());
						buf = new StringBuf();
						addChild(child);
						state = S.IGNORE_SPACES;
						next = S.BEGIN_NODE;
					} else if (c == '&'.code) {
						// flush the text seen so far, the entity is appended by ESCAPE
						buf.addSub(str, start, p - start);
						state = S.ESCAPE;
						escapeNext = S.PCDATA;
						start = p + 1;
					}
				case S.CDATA:
					if (c == ']'.code && str.fastCodeAt(p + 1) == ']'.code && str.fastCodeAt(p + 2) == '>'.code) {
						var child = Xml.createCData(str.substr(start, p - start));
						addChild(child);
						p += 2;
						state = S.BEGIN;
					}
				case S.BEGIN_NODE:
					switch (c) {
						case '!'.code:
							if (str.fastCodeAt(p + 1) == '['.code) {
								p += 2;
								if (str.substr(p, 6).toUpperCase() != "CDATA[")
									throw new XmlParserException("Expected <![CDATA[", str, p);
								p += 5;
								state = S.CDATA;
								start = p + 1;
							} else if (str.fastCodeAt(p + 1) == 'D'.code || str.fastCodeAt(p + 1) == 'd'.code) {
								if (str.substr(p + 2, 6).toUpperCase() != "OCTYPE")
									throw new XmlParserException("Expected <!DOCTYPE", str, p);
								p += 8;
								state = S.DOCTYPE;
								start = p + 1;
							} else if (str.fastCodeAt(p + 1) != '-'.code || str.fastCodeAt(p + 2) != '-'.code)
								throw new XmlParserException("Expected <!--", str, p);
							else {
								p += 2;
								state = S.COMMENT;
								start = p + 1;
							}
						case '?'.code:
							state = S.HEADER;
							start = p;
						case '/'.code:
							if (parent == null)
								throw new XmlParserException("Expected node name", str, p);
							start = p + 1;
							state = S.IGNORE_SPACES;
							next = S.CLOSE;
						default:
							state = S.TAG_NAME;
							start = p;
							continue;
					}
				case S.TAG_NAME:
					if (!isValidChar(c)) {
						if (p == start)
							throw new XmlParserException("Expected node name", str, p);
						xml = Xml.createElement(str.substr(start, p - start));
						addChild(xml);
						state = S.IGNORE_SPACES;
						next = S.BODY;
						continue;
					}
				case S.BODY:
					switch (c) {
						case '/'.code:
							state = S.WAIT_END;
						case '>'.code:
							state = S.CHILDS;
						default:
							state = S.ATTRIB_NAME;
							start = p;
							continue;
					}
				case S.ATTRIB_NAME:
					if (!isValidChar(c)) {
						if (start == p)
							throw new XmlParserException("Expected attribute name", str, p);
						aname = str.substr(start, p - start);
						if (xml.exists(aname))
							throw new XmlParserException("Duplicate attribute [" + aname + "]", str, p);
						state = S.IGNORE_SPACES;
						next = S.EQUALS;
						continue;
					}
				case S.EQUALS:
					switch (c) {
						case '='.code:
							state = S.IGNORE_SPACES;
							next = S.ATTVAL_BEGIN;
						default:
							throw new XmlParserException("Expected =", str, p);
					}
				case S.ATTVAL_BEGIN:
					switch (c) {
						case '"'.code | '\''.code:
							buf = new StringBuf();
							state = S.ATTRIB_VAL;
							start = p + 1;
							// the value ends at the same kind of quote that opened it
							attrValQuote = c;
						default:
							throw new XmlParserException("Expected \"", str, p);
					}
				case S.ATTRIB_VAL:
					switch (c) {
						case '&'.code:
							buf.addSub(str, start, p - start);
							state = S.ESCAPE;
							escapeNext = S.ATTRIB_VAL;
							start = p + 1;
						case '>'.code | '<'.code if (strict):
							// HTML allows these in attributes values
							throw new XmlParserException("Invalid unescaped " + String.fromCharCode(c) + " in attribute value", str, p);
						case _ if (c == attrValQuote):
							buf.addSub(str, start, p - start);
							var val = buf.toString();
							buf = new StringBuf();
							xml.set(aname, val);
							state = S.IGNORE_SPACES;
							next = S.BODY;
					}
				case S.CHILDS:
					// the nested call consumes everything up to the `>` of the closing tag
					p = doParse(str, strict, p, xml);
					start = p;
					state = S.BEGIN;
				case S.WAIT_END:
					switch (c) {
						case '>'.code:
							state = S.BEGIN;
						default:
							throw new XmlParserException("Expected >", str, p);
					}
				case S.WAIT_END_RET:
					switch (c) {
						case '>'.code:
							// an element without any child gets an empty text node
							if (nsubs == 0)
								parent.addChild(Xml.createPCData(""));
							return p;
						default:
							throw new XmlParserException("Expected >", str, p);
					}
				case S.CLOSE:
					if (!isValidChar(c)) {
						if (start == p)
							throw new XmlParserException("Expected node name", str, p);

						var v = str.substr(start, p - start);
						if (parent == null || parent.nodeType != Element) {
							throw new XmlParserException('Unexpected </$v>, tag is not open', str, p);
						}
						if (v != parent.nodeName)
							throw new XmlParserException("Expected </" + parent.nodeName + ">", str, p);

						state = S.IGNORE_SPACES;
						next = S.WAIT_END_RET;
						continue;
					}
				case S.COMMENT:
					if (c == '-'.code && str.fastCodeAt(p + 1) == '-'.code && str.fastCodeAt(p + 2) == '>'.code) {
						addChild(Xml.createComment(str.substr(start, p - start)));
						p += 2;
						state = S.BEGIN;
					}
				case S.DOCTYPE:
					// a `>` only ends the declaration outside of `[...]`
					if (c == '['.code)
						nbrackets++;
					else if (c == ']'.code)
						nbrackets--;
					else if (c == '>'.code && nbrackets == 0) {
						addChild(Xml.createDocType(str.substr(start, p - start)));
						state = S.BEGIN;
					}
				case S.HEADER:
					if (c == '?'.code && str.fastCodeAt(p + 1) == '>'.code) {
						p++;
						var str = str.substr(start + 1, p - start - 2);
						addChild(Xml.createProcessingInstruction(str));
						state = S.BEGIN;
					}
				case S.ESCAPE:
					if (c == ';'.code) {
						var s = str.substr(start, p - start);
						if (s.fastCodeAt(0) == '#'.code) {
							// "#x41" is parsed as "0x41", "#65" as "65"
							var code:Null<Int> = s.fastCodeAt(1) == 'x'.code
								? Std.parseInt("0" + s.substr(1, s.length - 1))
								: Std.parseInt(s.substr(1, s.length - 1));
							if (code == null || code < 0) {
								// not a number: treat it like an undefined entity
								if (strict)
									throw new XmlParserException("Invalid character reference: &" + s + ";", str, p);
								buf.add('&$s;');
							} else {
								var c:Int = code;
								#if !(target.unicode)
								if (c >= 128) {
									// UTF8-encode it
									if (c <= 0x7FF) {
										buf.addChar(0xC0 | (c >> 6));
										buf.addChar(0x80 | (c & 63));
									} else if (c <= 0xFFFF) {
										buf.addChar(0xE0 | (c >> 12));
										buf.addChar(0x80 | ((c >> 6) & 63));
										buf.addChar(0x80 | (c & 63));
									} else if (c <= 0x10FFFF) {
										buf.addChar(0xF0 | (c >> 18));
										buf.addChar(0x80 | ((c >> 12) & 63));
										buf.addChar(0x80 | ((c >> 6) & 63));
										buf.addChar(0x80 | (c & 63));
									} else
										throw new XmlParserException("Cannot encode UTF8-char " + c, str, p);
								} else
								#end
								buf.addChar(c);
							}
						} else if (!escapes.exists(s)) {
							if (strict)
								throw new XmlParserException("Undefined entity: " + s, str, p);
							buf.add('&$s;');
						} else {
							buf.add(escapes.get(s));
						}
						start = p + 1;
						state = escapeNext;
					} else if (!isValidChar(c) && c != "#".code) {
						if (strict)
							throw new XmlParserException("Invalid character in entity: " + String.fromCharCode(c), str, p);
						// not an entity after all: keep the text and re-read
						// the current character in the previous state
						buf.addChar("&".code);
						buf.addSub(str, start, p - start);
						p--;
						start = p + 1;
						state = escapeNext;
					}
			}
			++p;
		}

		if (state == S.BEGIN) {
			start = p;
			state = S.PCDATA;
		}

		if (state == S.PCDATA) {
			if (parent.nodeType == Element) {
				throw new XmlParserException("Unclosed node <" + parent.nodeName + ">", str, p);
			}
			if (p != start || nsubs == 0) {
				buf.addSub(str, start, p - start);
				addChild(Xml.createPCData(buf.toString()));
			}
			return p;
		}

		// NOTE(review): this lenient path does not check for an unclosed
		// element the way the PCDATA path above does; confirm this is intended.
		if (!strict && state == S.ESCAPE && escapeNext == S.PCDATA) {
			buf.addChar("&".code);
			buf.addSub(str, start, p - start);
			addChild(Xml.createPCData(buf.toString()));
			return p;
		}

		throw new XmlParserException("Unexpected end", str, p);
	}

	/**
		Tells whether `c` may appear in a node, attribute or entity name:
		ASCII letters and digits, `:`, `.`, `_` and `-`.
	**/
	static inline function isValidChar(c) {
		return (c >= 'a'.code && c <= 'z'.code) || (c >= 'A'.code && c <= 'Z'.code) || (c >= '0'.code && c <= '9'.code) || c == ':'.code || c == '.'.code
			|| c == '_'.code || c == '-'.code;
	}
}