XmlParser.java example

Explorer
divconq-master
/* ************************************************************************
#
#  DivConq
#
#  http://divconq.com/
#
#  Copyright:
#    Copyright 2014 eTimeline, LLC. All rights reserved.
#
#  License:
#    See the license.txt file in the project's top-level directory for details.
#
#  Authors:
#    * Andy White
#
************************************************************************ */
package divconq.xml;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Map;
import java.util.Stack;

import divconq.lang.op.OperationResult;
import divconq.util.IOUtil;

/**
 * Quick and dirty XML parser. Like a SAX parser it is event based, but it
 * offers much less functionality. Based on QDParser by Kevin Twidle; see
 * http://twicom.com/
 */
public class XmlParser {
	/*
	 * States of the parser's state machine. The numeric values carry no
	 * meaning of their own; they only need to be distinct.
	 */
	private final static int TEXT = 1, ENTITY = 2, OPEN_TAG = 3, CLOSE_TAG = 4,
			START_TAG = 5, ATTRIBUTE_LVALUE = 6, ATTRIBUTE_EQUAL = 9,
			ATTRIBUTE_RVALUE = 10, QUOTE = 7, IN_TAG = 8, SINGLE_TAG = 12,
			COMMENT = 13, IGNORE = 14, PRE = 15, CDATA = 16,
			OPEN_INSTRUCTION = 17;

	/**
	 * Pops the state the parser should resume once a nested construct ends.
	 * 
	 * @param st
	 *            stack of saved states
	 * @return the most recently saved state, or {@code PRE} when nothing is saved
	 */
	private static int popMode(Stack<Integer> st) {
		if (!st.empty())
			return st.pop().intValue();
		return PRE;
	}

	/**
	 * Parses XML from a reader, reporting what it finds to the given handler.
	 * 
	 * The input is consumed one character at a time by a small state machine.
	 * Line endings (\r, \r\n and \n) are normalized to \n before any other
	 * processing. Entities are not decoded: they are passed on verbatim,
	 * including the leading '&amp;' and trailing ';'. Parsing stops as soon as
	 * the root element has been closed; anything after it is not read. After
	 * every handler callback the result is checked and parsing is abandoned if
	 * an error has been recorded. The reader is always handed to
	 * {@code IOUtil.closeQuietly} before this method returns.
	 * 
	 * @param doc
	 *            the handler that will be given the different elements of
	 *            the XML
	 * @param reader
	 *            the Reader to get the source XML from
	 * @return the operation result; malformed XML and read failures are
	 *         recorded in it as errors rather than thrown
	 */
	public static OperationResult parse(IParseHandler doc, Reader reader) {
		OperationResult or = new OperationResult();
		
		try {
			// states to return to once the current construct is finished
			Stack<Integer> st = new Stack<Integer>();
			// number of currently open elements
			int depth = 0;
			int mode = PRE;
			int c = 0;
			// the quote character that opened the current attribute value
			int quotec = '"';
			// general purpose accumulator: text, names, attribute values
			StringBuilder sb = new StringBuilder();
			// accumulator for the name of the entity being read
			StringBuilder etag = new StringBuilder();
			String tagName = null;
			String lvalue = null;
			String rvalue = null;
			Map<String, String> attrs = null;
			
			doc.startDocument(or);
			
			if (or.hasErrors())
				return or;
			
			int line = 1, col = 0;
			// true when the previous character was a \r
			boolean eol = false;
	
			// TODO add support for surrogate pair, set String Builder 32
			while ((c = reader.read()) != -1) {
				// We need to map \r, \r\n, and \n to \n
				// See XML spec section 2.11
				if (eol) {
					eol = false;
					
					// second half of a \r\n pair, already reported as \n
					if (c == '\n')
						continue;
				}
				
				// Any other character following a \r is handled normally, so
				// that \r\r counts as two line ends and columns keep advancing.
				if (c == '\r') {
					eol = true;
					c = '\n';
					line++;
					col = 0;
				} 
				else if (c == '\n') {
					line++;
					col = 0;
				} 
				else {
					col++;
				}
	
				if (mode == TEXT) {
					// We are between tags collecting text.
					if (c == '<') {
						st.push(mode);
						mode = START_TAG;
						
						// flush the text collected so far
						if (sb.length() > 0) {
							doc.text(or, sb.toString(), false, line, col);
							
							if (or.hasErrors())
								return or;
							
							sb.setLength(0);
						}
					} 
					else if (c == '&') {
						st.push(mode);
						mode = ENTITY;
						etag.setLength(0);
					} 
					else
						sb.append((char) c);
	
				} 
				else if (mode == CLOSE_TAG) {
					// we are processing a closing tag: e.g. </foo>
					if (c == '>') {
						mode = popMode(st);
						tagName = sb.toString();
						sb.setLength(0);
						depth--;
						
						doc.endElement(or, tagName);
						
						if (or.hasErrors())
							return or;
						
						// the root element has been closed, we are done
						if (depth == 0) {
							doc.endDocument(or);
							return or;
						}
					} 
					else {
						sb.append((char) c);
					}
	
				} 
				else if (mode == CDATA) {
					// we are processing CDATA, looking for the closing ]]>
					if (c == '>' && sb.toString().endsWith("]]")) {
						// drop the "]]" that belongs to the terminator
						sb.setLength(sb.length() - 2);
						
						doc.text(or, sb.toString(), true, line, col);
						
						if (or.hasErrors())
							return or;
						
						sb.setLength(0);
						mode = popMode(st);
					} 
					else
						sb.append((char) c);
	
				} 
				else if (mode == COMMENT) {
					// we are processing a comment. We are inside
					// the <!-- .... --> looking for the -->.
					// The comment content itself is discarded.
					if (c == '>' && sb.toString().endsWith("--")) {
						sb.setLength(0);
						mode = popMode(st);
					} 
					else
						sb.append((char) c);
	
				} 
				else if (mode == PRE) {
					// We are outside the root tag element,
					// everything up to the next < is skipped
					if (c == '<') {
						mode = TEXT;
						st.push(mode);
						mode = START_TAG;
					}
	
				} 
				else if (mode == IGNORE) {
					// We are inside one of these <? ... ?>
					// or one of these <!DOCTYPE ... >
					if (c == '>') {
						mode = popMode(st);
						
						// Only fall back to skipping content while no element is
						// open; inside an element we must keep collecting text.
						if (mode == TEXT && depth == 0)
							mode = PRE;
					}
	
				} 
				else if (mode == START_TAG) {
					// we have just seen a < and
					// are wondering what we are looking at
					// <foo>, </foo>, <!-- ... --->, etc.
					mode = popMode(st);
					if (c == '/') {
						st.push(mode);
						mode = CLOSE_TAG;
					} 
					else if (c == '?') {
						// remember where to resume once the instruction ends
						st.push(mode);
						mode = IGNORE;
					} 
					else if (c == '!') {
						st.push(mode);
						mode = OPEN_INSTRUCTION;
						tagName = null;
						attrs = new Hashtable<String, String>();
						sb.append((char) c);
					} 
					else if (c == '_' || Character.isLetter(c)) {
						st.push(mode);
						mode = OPEN_TAG;
						tagName = null;
						// every element starts out with a fresh attribute map
						attrs = new Hashtable<String, String>();
						sb.append((char) c);
					} 
					else {
						or.errorTr(242, line, col, (char) c);
						return or;
					}
	
				} 
				else if (mode == ENTITY) {
					// we are processing an entity, e.g. &lt;, &raquo;, etc.
					if (c == ';') {
						mode = popMode(st);
						String cent = etag.toString();
						etag.setLength(0);
						
						// APW Just keep the entity: it is not decoded here but
						// copied through unchanged, delimiters included
						sb.append('&');
						sb.append(cent);
						sb.append(';');
					} 
					else {
						etag.append((char) c);
					}
	
				} 
				else if (mode == SINGLE_TAG) {
					// we have just seen something like this:
					// <foo a="b"/
					// and are looking for the final >.
					
					// coming straight from the tag name, it is still in the buffer
					if (tagName == null)
						tagName = sb.toString();
					
					if (c != '>') {
						or.errorTr(241, line, col, tagName);
						return or;
					}
					
					doc.element(or, tagName, attrs, line, col);
					
					if (or.hasErrors())
						return or;
					
					// a self closing root element is the whole document
					if (depth == 0) {
						doc.endDocument(or);
						return or;
					}
					
					sb.setLength(0);
					attrs = new HashMap<String, String>();
					tagName = null;
					mode = popMode(st);
	
				} 
				else if (mode == OPEN_INSTRUCTION) {
					// we are processing <!... >.
					// We already have the first character
					if (c == '>') {
						or.errorTr(241, line, col, sb.toString());
						return or;
					} 
					else if (c == '-' && sb.toString().equals("!-")) {
						// Forget the "!-" prefix, otherwise its dash could be
						// mistaken for part of the closing "--" (as in <!--->).
						sb.setLength(0);
						mode = COMMENT;
					} 
					else if (c == '[' && sb.toString().equals("![CDATA")) {
						mode = CDATA;
						sb.setLength(0);
					} 
					else if (c == 'E' && sb.toString().equals("!DOCTYP")) {
						sb.setLength(0);
						mode = IGNORE;
					} 
					else if (Character.isWhitespace((char) c)) {
						or.errorTr(240, line, col, sb.toString());
						return or;
					} 
					else {
						// We have a character to add to the instruction
						// Check for length
						if (sb.length() > 9) {
							or.errorTr(239, line, col, sb.toString());
							return or;
						}
						
						// Check for validity
						if (c == '-' || c == '[' || Character.isLetter(c))
							sb.append((char) c);
						else {
							or.errorTr(238, line, col, c, sb.toString());
							return or;
						}
					}
				} 
				else if (mode == OPEN_TAG) {
					// we are processing something
					// like this <foo ... >.
					// We already have the first character
					if (c == '>') {
						if (tagName == null)
							tagName = sb.toString();
						
						sb.setLength(0);
						depth++;
						
						doc.startElement(or, tagName, attrs, line, col);
						
						if (or.hasErrors())
							return or;
						
						tagName = null;
						attrs = new HashMap<String, String>();
						mode = popMode(st);
					} 
					else if (c == '/') {
						mode = SINGLE_TAG;
					} 
					else if (Character.isWhitespace((char) c)) {
						// the name is complete, attributes may follow
						tagName = sb.toString();
						sb.setLength(0);
						mode = IN_TAG;
					} 
					else {
						// We have a character to add to the name
						// Check for validity
						if (Character.isLetterOrDigit(c) || c == '_' || c == '-'
								|| c == '.' || c == ':')
							sb.append((char) c);
						else {
							or.errorTr(237, line, col, c);
							return or;
						}
					}
	
				}
				else if (mode == QUOTE) {
					// We are processing the quoted right-hand side
					// of an element's attribute.
					if (c == quotec) {
						rvalue = sb.toString();
						sb.setLength(0);
						attrs.put(lvalue, rvalue);
						mode = IN_TAG;
						// See section the XML spec, section 3.3.3
						// on normalization processing.
					} 
					else if (" \r\n\u0009".indexOf(c) >= 0) {
						// white space inside a value becomes a plain space
						sb.append(' ');
					} 
					else if (c == '&') {
						st.push(mode);
						mode = ENTITY;
						etag.setLength(0);
					} 
					else {
						sb.append((char) c);
					}
	
				} 
				else if (mode == ATTRIBUTE_RVALUE) {
					// we have seen the = and are waiting for the opening quote
					if (c == '"' || c == '\'') {
						quotec = c;
						mode = QUOTE;
					} 
					else if (Character.isWhitespace((char) c)) {
						;
					} 
					else {
						or.errorTr(236, line, col);
						return or;
					}
	
				} 
				else if (mode == ATTRIBUTE_LVALUE) {
					// we are collecting the name of an attribute
					if (Character.isWhitespace((char) c)) {
						lvalue = sb.toString();
						sb.setLength(0);
						mode = ATTRIBUTE_EQUAL;
					} 
					else if (c == '=') {
						lvalue = sb.toString();
						sb.setLength(0);
						mode = ATTRIBUTE_RVALUE;
					} 
					else {
						sb.append((char) c);
					}
	
				} 
				else if (mode == ATTRIBUTE_EQUAL) {
					// the attribute name is complete, we are waiting for the =
					if (c == '=') {
						mode = ATTRIBUTE_RVALUE;
					} 
					else if (Character.isWhitespace((char) c)) {
						;
					} 
					else {
						or.errorTr(235, line, col);
						return or;
					}
	
				} 
				else if (mode == IN_TAG) {
					// we are inside an opening tag, after its name
					if (c == '>') {
						mode = popMode(st);
						
						doc.startElement(or, tagName, attrs, line, col);
						
						if (or.hasErrors())
							return or;
						
						depth++;
						tagName = null;
						attrs = new HashMap<String, String>();
					} 
					else if (c == '/') {
						mode = SINGLE_TAG;
					} 
					else if (Character.isWhitespace((char) c)) {
						;
					} 
					else {
						// first character of an attribute name
						mode = ATTRIBUTE_LVALUE;
						sb.append((char) c);
					}
				}
			}
			
			// The input ended while a construct was still open
			if (mode != PRE)
				or.errorTr(234, line, col);
			
			return or;
		}
		catch (IOException x) {
			or.error("Erroring reading XML: " + x);
		}
		finally {
			IOUtil.closeQuietly(reader);
		}
		
		return or;
	}
}