/* ************************************************************************
#
#  DivConq
#
#  http://divconq.com/
#
#  Copyright:
#    Copyright 2014 eTimeline, LLC. All rights reserved.
#
#  License:
#    See the license.txt file in the project's top-level directory for details.
#
#  Authors:
#    * Andy White
#
************************************************************************ */
package divconq.xml;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Map;
import java.util.Stack;

import divconq.lang.op.OperationResult;
import divconq.util.IOUtil;

/**
 * Quick and Dirty XML parser. This parser is, like the SAX parser, an event
 * based parser, but with much less functionality. Based off of QDParser
 * by Kevin Twidle see http://twicom.com/
 *
 * The parser is a character-at-a-time state machine: the current state is
 * held in a local "mode" variable and states to return to are kept on a
 * stack of modes.
 */
public class XmlParser {
	/**
	 * Pops the mode to return to from the mode stack.
	 *
	 * @param st
	 *            stack of saved modes
	 * @return the most recently saved mode, or PRE when the stack is empty
	 */
	private static int popMode(Stack<Integer> st) {
		if (st.empty())
			return PRE;

		return st.pop().intValue();
	}

	/**
	 * Tests whether a buffer ends with two consecutive occurrences of a
	 * character, without copying the buffer into a String first.
	 *
	 * @param buf
	 *            the buffer to inspect
	 * @param ch
	 *            the character expected in the last two positions
	 * @return true when the buffer holds at least two characters and the
	 *         last two are both ch
	 */
	private static boolean endsWithPair(CharSequence buf, char ch) {
		int len = buf.length();

		return len >= 2 && buf.charAt(len - 1) == ch && buf.charAt(len - 2) == ch;
	}

	// Parser states. The numeric values are only compared for equality.
	private final static int
		TEXT = 1,
		ENTITY = 2,
		OPEN_TAG = 3,
		CLOSE_TAG = 4,
		START_TAG = 5,
		ATTRIBUTE_LVALUE = 6,
		ATTRIBUTE_EQUAL = 9,
		ATTRIBUTE_RVALUE = 10,
		QUOTE = 7,
		IN_TAG = 8,
		SINGLE_TAG = 12,
		COMMENT = 13,
		IGNORE = 14,
		PRE = 15,
		CDATA = 16,
		OPEN_INSTRUCTION = 17;

	/**
	 * Parses XML from a reader, reporting document, element and text events
	 * to the given handler as they are recognized.
	 *
	 * Problems are not thrown: syntax errors are recorded on the returned
	 * OperationResult via errorTr (codes 234 through 242), and an IOException
	 * from the reader is caught and recorded via error. Parsing stops at the
	 * first error, including any error the handler records on the result.
	 * Parsing also stops, after calling endDocument, as soon as the root
	 * element is closed. The reader is always closed before returning.
	 *
	 * @param doc
	 *            the IParseHandler that will be given the different elements
	 *            of the XML
	 * @param reader
	 *            the Reader to get the source XML from
	 * @return the OperationResult created for this parse, carrying any
	 *         errors that were recorded
	 */
	public static OperationResult parse(IParseHandler doc, Reader reader) {
		OperationResult or = new OperationResult();

		try {
			Stack<Integer> st = new Stack<Integer>();
			int depth = 0;
			int mode = PRE;
			int c = 0;
			int quotec = '"';

			// sb collects the current token (text, tag name, attribute name
			// or value); etag collects the name of the entity being read
			StringBuilder sb = new StringBuilder();
			StringBuilder etag = new StringBuilder();

			String tagName = null;
			String lvalue = null;
			String rvalue = null;

			Map<String, String> attrs = null;

			doc.startDocument(or);

			if (or.hasErrors())
				return or;

			int line = 1, col = 0;
			boolean eol = false;		// true when the previous character was \r

			// TODO add support for surrogate pair, set String Builder 32
			while ((c = reader.read()) != -1) {
				// We need to map \r, \r\n, and \n to \n
				// See XML spec section 2.11
				if (eol) {
					eol = false;

					// the \n of a \r\n pair was already reported for the \r
					if (c == '\n')
						continue;

					// anything else (including another \r) falls through and
					// is handled like any other character
				}

				if (c == '\n') {
					line++;
					col = 0;
				}
				else if (c == '\r') {
					eol = true;
					c = '\n';
					line++;
					col = 0;
				}
				else {
					col++;
				}

				if (mode == TEXT) {
					// We are between tags collecting text.
					if (c == '<') {
						st.push(mode);
						mode = START_TAG;

						// flush the text collected so far
						if (sb.length() > 0) {
							doc.text(or, sb.toString(), false, line, col);

							if (or.hasErrors())
								return or;

							sb.setLength(0);
						}
					}
					else if (c == '&') {
						st.push(mode);
						mode = ENTITY;
						etag.setLength(0);
					}
					else {
						sb.append((char) c);
					}
				}
				else if (mode == CLOSE_TAG) {
					// we are processing a closing tag: e.g. </foo>
					if (c == '>') {
						mode = popMode(st);
						tagName = sb.toString();
						sb.setLength(0);
						depth--;

						doc.endElement(or, tagName);

						if (or.hasErrors())
							return or;

						// closing the root element ends the document
						if (depth == 0) {
							doc.endDocument(or);
							return or;
						}
					}
					else {
						sb.append((char) c);
					}
				}
				else if (mode == CDATA) {
					// we are processing CDATA, looking for the closing ]]>
					if (c == '>' && endsWithPair(sb, ']')) {
						// drop the trailing ]] before reporting the text
						sb.setLength(sb.length() - 2);

						doc.text(or, sb.toString(), true, line, col);

						if (or.hasErrors())
							return or;

						sb.setLength(0);
						mode = popMode(st);
					}
					else {
						sb.append((char) c);
					}
				}
				else if (mode == COMMENT) {
					// we are processing a comment. We are inside
					// the <!-- .... --> looking for the -->.
					// The comment content is collected only to detect the
					// terminator and is then discarded.
					if (c == '>' && endsWithPair(sb, '-')) {
						sb.setLength(0);
						mode = popMode(st);
					}
					else {
						sb.append((char) c);
					}
				}
				else if (mode == PRE) {
					// We are outside the root tag element, everything
					// except the start of a tag is skipped
					if (c == '<') {
						st.push(TEXT);
						mode = START_TAG;
					}
				}
				else if (mode == IGNORE) {
					// We are inside one of these <? ... ?>
					// or one of these <!DOCTYPE ... >
					if (c == '>') {
						mode = popMode(st);

						if (mode == TEXT)
							mode = PRE;
					}
				}
				else if (mode == START_TAG) {
					// we have just seen a < and
					// are wondering what we are looking at
					// <foo>, </foo>, <!-- ... --->, etc.
					mode = popMode(st);

					if (c == '/') {
						st.push(mode);
						mode = CLOSE_TAG;
					}
					else if (c == '?') {
						mode = IGNORE;
					}
					else if (c == '!') {
						st.push(mode);
						mode = OPEN_INSTRUCTION;
						tagName = null;
						attrs = new Hashtable<String, String>();
						sb.append((char) c);
					}
					else if (c == '_' || Character.isLetter(c)) {
						st.push(mode);
						mode = OPEN_TAG;
						tagName = null;
						attrs = new Hashtable<String, String>();
						sb.append((char) c);
					}
					else {
						or.errorTr(242, line, col, (char) c);
						return or;
					}
				}
				else if (mode == ENTITY) {
					// we are processing an entity, e.g. &lt;, &#187;, etc.
					if (c == ';') {
						mode = popMode(st);
						String cent = etag.toString();
						etag.setLength(0);

						// APW Just keep the entity - it is not decoded here,
						// the reference is copied through unchanged
						sb.append('&');
						sb.append(cent);
						sb.append(';');
					}
					else {
						etag.append((char) c);
					}
				}
				else if (mode == SINGLE_TAG) {
					// we have just seen something like this:
					// <foo a="b"/
					// and are looking for the final >.
					if (tagName == null)
						tagName = sb.toString();

					if (c != '>') {
						or.errorTr(241, line, col, tagName);
						return or;
					}

					doc.element(or, tagName, attrs, line, col);

					if (or.hasErrors())
						return or;

					// a self-closing root element ends the document
					if (depth == 0) {
						doc.endDocument(or);
						return or;
					}

					sb.setLength(0);
					// let go of the map handed to the handler; START_TAG
					// installs a fresh one before the next element
					attrs = new HashMap<String, String>();
					tagName = null;
					mode = popMode(st);
				}
				else if (mode == OPEN_INSTRUCTION) {
					// we are processing <!... >.
					// We already have the first character
					if (c == '>') {
						or.errorTr(241, line, col, sb.toString());
						return or;
					}
					else if (c == '-' && "!-".contentEquals(sb)) {
						// sb still holds "!-" when the comment starts
						mode = COMMENT;
					}
					else if (c == '[' && "![CDATA".contentEquals(sb)) {
						mode = CDATA;
						sb.setLength(0);
					}
					else if (c == 'E' && "!DOCTYP".contentEquals(sb)) {
						sb.setLength(0);
						mode = IGNORE;
					}
					else if (Character.isWhitespace((char) c)) {
						or.errorTr(240, line, col, sb.toString());
						return or;
					}
					else {
						// We have a character to add to the instruction
						// Check for length
						if (sb.length() > 9) {
							or.errorTr(239, line, col, sb.toString());
							return or;
						}

						// Check for validity
						if (c == '-' || c == '[' || Character.isLetter(c)) {
							sb.append((char) c);
						}
						else {
							// NOTE(review): c is passed as an int here, unlike
							// error 242 which passes (char) c - confirm which
							// form the message for 238 expects
							or.errorTr(238, line, col, c, sb.toString());
							return or;
						}
					}
				}
				else if (mode == OPEN_TAG) {
					// we are processing something
					// like this <foo ... >.
					// We already have the first character
					if (c == '>') {
						if (tagName == null)
							tagName = sb.toString();

						sb.setLength(0);
						depth++;

						doc.startElement(or, tagName, attrs, line, col);

						if (or.hasErrors())
							return or;

						tagName = null;
						attrs = new HashMap<String, String>();
						mode = popMode(st);
					}
					else if (c == '/') {
						mode = SINGLE_TAG;
					}
					else if (Character.isWhitespace((char) c)) {
						tagName = sb.toString();
						sb.setLength(0);
						mode = IN_TAG;
					}
					else {
						// We have a character to add to the name
						// Check for validity
						if (Character.isLetterOrDigit(c) || c == '_' || c == '-' || c == '.' || c == ':') {
							sb.append((char) c);
						}
						else {
							// NOTE(review): c is passed as an int here, unlike
							// error 242 which passes (char) c - confirm which
							// form the message for 237 expects
							or.errorTr(237, line, col, c);
							return or;
						}
					}
				}
				else if (mode == QUOTE) {
					// We are processing the quoted right-hand side
					// of an element's attribute.
					if (c == quotec) {
						rvalue = sb.toString();
						sb.setLength(0);
						attrs.put(lvalue, rvalue);
						mode = IN_TAG;
					}
					// See the XML spec, section 3.3.3
					// on normalization processing.
					else if (" \r\n\t".indexOf(c) >= 0) {
						sb.append(' ');
					}
					else if (c == '&') {
						st.push(mode);
						mode = ENTITY;
						etag.setLength(0);
					}
					else {
						sb.append((char) c);
					}
				}
				else if (mode == ATTRIBUTE_RVALUE) {
					// we have seen the = and are looking for the opening quote
					if (c == '"' || c == '\'') {
						quotec = c;
						mode = QUOTE;
					}
					else if (Character.isWhitespace((char) c)) {
						// whitespace between = and the quote is skipped
					}
					else {
						or.errorTr(236, line, col);
						return or;
					}
				}
				else if (mode == ATTRIBUTE_LVALUE) {
					// we are collecting an attribute name
					if (Character.isWhitespace((char) c)) {
						lvalue = sb.toString();
						sb.setLength(0);
						mode = ATTRIBUTE_EQUAL;
					}
					else if (c == '=') {
						lvalue = sb.toString();
						sb.setLength(0);
						mode = ATTRIBUTE_RVALUE;
					}
					else {
						sb.append((char) c);
					}
				}
				else if (mode == ATTRIBUTE_EQUAL) {
					// we have the attribute name and are looking for the =
					if (c == '=') {
						mode = ATTRIBUTE_RVALUE;
					}
					else if (Character.isWhitespace((char) c)) {
						// whitespace between the name and = is skipped
					}
					else {
						or.errorTr(235, line, col);
						return or;
					}
				}
				else if (mode == IN_TAG) {
					// we are inside a start tag, after the tag name
					if (c == '>') {
						mode = popMode(st);

						doc.startElement(or, tagName, attrs, line, col);

						if (or.hasErrors())
							return or;

						depth++;
						tagName = null;
						attrs = new HashMap<String, String>();
					}
					else if (c == '/') {
						mode = SINGLE_TAG;
					}
					else if (Character.isWhitespace((char) c)) {
						// whitespace between attributes is skipped
					}
					else {
						mode = ATTRIBUTE_LVALUE;
						sb.append((char) c);
					}
				}
			}

			// input ended in the middle of something
			if (mode != PRE)
				or.errorTr(234, line, col);

			return or;
		}
		catch (IOException x) {
			or.error("Erroring reading XML: " + x);
		}
		finally {
			IOUtil.closeQuietly(reader);
		}

		return or;
	}
}