// XmlParser.java example
//
// Explorer
// LEADT-master
/*
 * XmlParser.java
 *
 * Copyright (C) 2005-2006 Tommi Laukkanen
 * http://www.substanceofcode.com
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

// Expand to define testing define
//#define DNOTEST
// Expand to define logging define
//#define DNOLOGGING
package com.substanceofcode.utils;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Vector;

import com.substanceofcode.utils.CauseException;
import com.substanceofcode.utils.CauseMemoryException;

//#ifdef DLOGGING
import net.sf.jlogmicro.util.logging.Logger;
import net.sf.jlogmicro.util.logging.Level;
//#endif
/**
 * Simple and lightweight XML parser without complete error handling.
 *
 * @author Tommi Laukkanen
 */
public class XmlParser {
    
    /** Current XML element name (e.g. for the tag <title> the name is title). */
    final protected StringBuffer m_currentElementName = new StringBuffer();
	/**
	 * Raw characters of the current tag as accumulated by parseStream,
	 * starting at '<' and including the element name.  Attribute values
	 * and namespace declarations are looked up in this buffer.
	 */
    final protected StringBuffer m_currentElementData = new StringBuffer();
    // Set by parseStream: false when the current tag data ends with "/>"
    // (e.g. <br/>), true otherwise.  getText returns "" when this is false.
    protected boolean m_currentElementContainsText = false;
	//#ifdef DTEST
    boolean m_debugTrace = false;  // True to add extra trace
	//#endif
	// True once the encoding has been set explicitly: by a BOM, by a
	// prologue with an encoding attribute, or by a meta tag for HTMLParser.
	protected boolean m_encoding_set = false; // The flag for if encoding set
    // When true, internal errors in get text and get attribute make those
    // methods return null instead of throwing CauseException.
    protected boolean m_acceptErrors = true;  // Allow some errors 
    // Encoding the characters were read with.  Replaced in the constructor
    // by the value reported by the encoding stream reader.
    protected String m_fileEncoding = "ISO8859_1";  // See EncodingUtil
    // Encoding the text is converted to.  While empty, no conversion is
    // applied to element text or attribute values.
    protected String m_docEncoding = "";  // See EncodingUtil
    // Encoding assumed when neither a BOM nor a prologue encoding is found.
    protected String m_defEncoding = "UTF-8";  // Default doc encoding
    protected EncodingUtil m_encodingUtil = null;
    // Reader used by parse/getText when isModEncoding() is true.
    protected EncodingStreamReader m_encodingStreamReader;
	// Reader used by parse/getText when isModEncoding() is false.
	protected InputStreamReader m_inputStream;
    // Namespace prefixes for which "prefix:" is kept as part of the element
    // name.  NOTE: parseStream only consults the first three entries.
    private String [] m_namespaces = null;
    // True until the prologue or the first element has been read.
    private boolean m_getPrologue = true;
	//#ifdef DLOGGING
    private Logger logger = Logger.getLogger("XmlParser");
    final private boolean fineLoggable = logger.isLoggable(Level.FINE);
    final private boolean finerLoggable = logger.isLoggable(Level.FINER);
    final private boolean finestLoggable = logger.isLoggable(Level.FINEST);
	//#endif
    
    /** Return values of the parse function. */
    // No further element was found before the end of the input.
    public static final int END_DOCUMENT = 0;
    // An element start tag was read.
    public static final int ELEMENT = 1;
    // The "<?xml " prologue was read.
    public static final int PROLOGUE = 2;
    
    /**
     * Creates a new instance of XmlParser reading from the given stream.
     * The stream is wrapped in a new EncodingUtil and passed on to the
     * constructor taking an EncodingUtil.
     *
     * @param inputStream stream to read the XML document from
     */
    public XmlParser(InputStream inputStream) {
		this(new EncodingUtil(inputStream));
    }

    /** Creates a new instance of XmlParser */
    public XmlParser(EncodingUtil encodingUtil) {
		this.m_encodingUtil = encodingUtil;
		m_encodingStreamReader =
			m_encodingUtil.getEncodingStreamReader();
		m_fileEncoding = m_encodingStreamReader.getFileEncoding();
		m_inputStream = m_encodingStreamReader.getInputStream();
    }

    /**
     * Parse forward to the next element start tag or to the XML prologue.
     *
     * Characters are read from <code>is</code> one at a time.  The tag
     * currently being read is collected in m_currentElementData and its
     * name in m_currentElementName.  Tags whose first character after '<'
     * is '/', '?' or '!' never get an element name and are therefore
     * skipped, except that a tag starting with "<?xml " is reported as the
     * prologue while the prologue is still being looked for.
     *
     * When the prologue or the first element is reached, the document
     * encoding is resolved through m_encodingUtil (BOM, prologue encoding
     * attribute, or m_defEncoding) and stored in m_docEncoding.
     *
     * @param is reader to take the characters from
     * @return PROLOGUE when the XML prologue was read, ELEMENT when an
     *         element start tag was read, END_DOCUMENT when the end of the
     *         input was reached first
     * @throws IOException if reading from <code>is</code> fails
     * @throws CauseMemoryException if an OutOfMemoryError occurs
     * @throws CauseException on any other internal error
     */
    protected int parseStream(InputStreamReader is)
	throws IOException, CauseMemoryException, CauseException {
		
		boolean parsingElementName = false;
		boolean elementFound = false;
		boolean elementStart = false;
		boolean parsingElementData = false;
		boolean prologueFound = false;
				
        char c;
        int inputCharacter = is.read();
		try {
			while ((inputCharacter != -1) && !elementFound) {
                c = (char)inputCharacter;
				
				if (elementStart) {
					switch (c) {
						case '/':
							parsingElementName = false;
							break;
						// If we get ? or ! after '<' this is not an
						// element, it's a comment or prologue.
						case '?':
						case '!':
							if(m_currentElementData.charAt(m_currentElementData.length()-1)=='<') {
								parsingElementName = false;
								// If we find <? and we're looking for the prologue,
								// set flag.
								if (m_getPrologue && (c == '?')) {
									prologueFound = true;
								}
							}
							break;
						default:
							break;
					}
				}
				if(parsingElementName) {
					// Determine if we have found the end of the element
					// name and thus started element data.
					switch (c) {
						case ':':
							// For specified namespace, put it into element name
							if ((m_namespaces != null) &&
								(((m_namespaces.length >= 1) &&
								 m_namespaces[0].equals(
									m_currentElementName.toString())) ||
								((m_namespaces.length >= 2) &&
								m_namespaces[1].equals(
									m_currentElementName.toString())) ||
								((m_namespaces.length >= 3) &&
								m_namespaces[2].equals(
									m_currentElementName.toString())))) {
								m_currentElementName.append(c);
								break;
							}
							// Don't break after ':' (above) if not a part of
							// namespace as it is the end of the element
							// name.
						case ' ':
						case '/':
							parsingElementName = false;
							parsingElementData = true;
							break;
						// Finding '>' is the end of an element name,
						// but we process it below.
						case '>':
							break;
						default:
							m_currentElementName.append(c);
							break;
					}
				}              
				// We found the beginning of a tag, so we start an element
				// name.
				if(c=='<') {
					elementStart = true;
					parsingElementName = true;
					parsingElementData = true;
					m_currentElementName.setLength(0);
					m_currentElementData.setLength(0);
				}            
				// If parsing element data, add to it.
				if(parsingElementData) {
					m_currentElementData.append(c);
				}
				// If we find end tag '>' can also be the
				// end of the prologue so we check.
				if(c=='>') {
					if(m_currentElementName.length()>0) {
						elementFound = true;
						parsingElementName = false;
						// If we find XML without a prologue, need
						// to treat as default UTF-8 encoding for XML.
						if (m_getPrologue) {
							m_getPrologue = false;
							m_encodingStreamReader.setGetPrologue(false);
							// If BOM is present, use its definition for
							// default
							if (m_encodingStreamReader.isUtfDoc()) {
								if (m_encodingStreamReader.isUtf16Doc()) {
									m_encodingUtil.getEncoding(m_fileEncoding,
											"UTF-16");
								} else {
									m_encodingUtil.getEncoding(m_fileEncoding,
											"UTF-8");
								}
								m_encoding_set = true;
							} else {
								m_encodingUtil.getEncoding(m_fileEncoding,
										m_defEncoding);
							}
							m_docEncoding = m_encodingUtil.getDocEncoding();
						}
					} else if (m_getPrologue && prologueFound) {
						// If we are looking for the prologue, now
						// we have read the end of it, so we can
						// get the encoding specified (or null which
						// defaults to UTF-8).
						// Only process actual prologues.  <?xmlstylesheet
						// is not what we want.
						if (m_currentElementData.toString().
							startsWith("<?xml ")) {
							m_getPrologue = false;
							m_encodingStreamReader.setGetPrologue(false);
							//#ifdef DLOGGING
							if (finestLoggable) {logger.finest("m_currentElementData.length()=" + m_currentElementData.length());}
							//#endif
							String cencoding = getAttributeValue("encoding");
							if (cencoding == null) {
								//#ifdef DLOGGING
								if (finestLoggable) {logger.finest("Prologue cencoding,m_defEncoding=" + cencoding + "," + m_defEncoding);}
								//#endif
								cencoding = m_defEncoding;
							} else {
								m_encoding_set = true;
							}
							m_encodingUtil.getEncoding(m_fileEncoding,
									cencoding);
							// Get doc encoding.  The encoding to translate
							// the bytes into.
							m_docEncoding = m_encodingUtil.getDocEncoding();
							return PROLOGUE;
						}
					}
				}    

				// If we have not found an element, keep parsing.
				// Otherwise, we get out of the loop.
				if(!elementFound){
                    inputCharacter = is.read();
				}
				//#ifdef DTEST
				//#ifdef DLOGGING
				if (m_debugTrace) {
					logger.finest("c=" + c);
					logger.finest("m_currentElementName=" + m_currentElementName);
					logger.finest("m_currentElementData=" + m_currentElementData);
					logger.finest("m_currentElementContainsText=" + m_currentElementContainsText);
					logger.finest("parsingElementName=" + parsingElementName);
					logger.finest("parsingElementData=" + parsingElementData);
					logger.finest("prologueFound=" + prologueFound);
					logger.finest("elementFound=" + elementFound);
					logger.finest("elementStart=" + elementStart);
				}
				//#endif
				//#endif
			}
			
			// Determine if we actually have element data or a tag
			// that ends without data/text (e.g. <br/> has no text).
			// The buffer can hold fewer than two characters when the input
			// is empty or contains no '<' at all; in that case there is no
			// element and therefore no text, and the method must fall
			// through to END_DOCUMENT instead of failing on charAt.
			final int dataLength = m_currentElementData.length();
			if (dataLength < 2) {
				m_currentElementContainsText = false;
			} else if( m_currentElementData.charAt( dataLength-2 )=='/' &&
				m_currentElementData.charAt( dataLength-1 )=='>' ) {
				m_currentElementContainsText = false;
			} else {
				m_currentElementContainsText = true;
			}
			//#ifdef DLOGGING
			if (finerLoggable) {logger.finer("m_currentElementContainsText,m_currentElementData=" + m_currentElementContainsText + "," + m_currentElementData);}
			//#endif
			
		} catch (IOException e) {
//#ifdef DLOGGING
			logger.severe("parse read error ", e);
//#endif
			System.out.println("parse read error " + e + " " + e.getMessage());
			e.printStackTrace();
			throw e;
		} catch (OutOfMemoryError e) {
//#ifdef DLOGGING
			logger.severe("Out of memory parse read error ", e);
//#endif
			System.out.println("Out of memory parse read error " + e + " " + e.getMessage());
			e.printStackTrace();
			CauseMemoryException ce = new CauseMemoryException(
					"Parse read error. Out of memory.", e);
			throw ce;
		} catch (Throwable e) {
//#ifdef DLOGGING
			logger.severe("Internal error parse read error ", e);
//#endif
			System.out.println("Internal error parse read error " + e + " " + e.getMessage());
			e.printStackTrace();
			CauseException ce = new CauseException(
					"Internal error parse read error. ", e);
			throw ce;
		}
		if( inputCharacter == -1 ) {
			return END_DOCUMENT;
		} else {
			return ELEMENT;
		}
    }
    
    /**
     * Parse next element.  Reads through the encoding stream reader when
     * it reports a modified encoding, otherwise through the plain input
     * reader.
     *
     * @return END_DOCUMENT, ELEMENT or PROLOGUE as returned by parseStream
     */
    public int parse()
	throws IOException, CauseMemoryException, CauseException {
		final InputStreamReader reader;
		if (m_encodingStreamReader.isModEncoding()) {
			reader = m_encodingStreamReader;
		} else {
			reader = m_inputStream;
		}
		return parseStream(reader);
	}
		
    /**
     * Get element name.
     *
     * @return the current contents of the element name buffer as a String
     */
    public String getName() {
		final String elementName = m_currentElementName.toString();
		//#ifdef DLOGGING
		if (finerLoggable) {logger.finer("m_currentElementName=" + elementName);}
		//#endif
        return elementName;
    }
    
    /**
     * Get element text including inner xml.
     *
     * Reads from <code>is</code> until the text read so far ends with the
     * end-tag prefix of the current element ("&lt;/" followed by the
     * element name) or until the end of the input.  The end-tag prefix,
     * CDATA markers and (optionally) entities are then removed/replaced
     * and the text is converted from the file encoding to the document
     * encoding when a document encoding is known.
     *
     * The current element name is left untouched, so getName() still
     * returns the same name after this call.
     *
     * @param is reader to take the characters from
     * @param convXmlEnts passed to EncodingUtil.replaceAlphaEntities; when
     *        true, XML entities are additionally replaced through
     *        EncodingUtil.replaceXmlEntities
     * @return the element text; "" if the element has no text or no name;
     *         null if an internal error (including a read failure) occurs
     *         while m_acceptErrors is true
     * @throws CauseMemoryException if an OutOfMemoryError occurs
     * @throws CauseException on any other error while m_acceptErrors is
     *         false
     */
    private String getTextStream(InputStreamReader is,
								 final boolean convXmlEnts)
	throws IOException, CauseMemoryException, CauseException {
        
		if(!m_currentElementContainsText) {
			return "";
		}
		boolean endParsing = false;
		
		String text = "";
		try {
			if (m_currentElementName.length() == 0) {
				return "";
			}
			// Build the end-tag prefix in a separate string.  Inserting
			// into m_currentElementName itself would corrupt the name
			// returned by getName().
			final String endCurrentElement =
				"</" + m_currentElementName.toString();
			// Cheap pre-check: the last three characters of the end-tag
			// prefix.  The prefix is always at least three characters long
			// ("</" plus a non-empty name), so this also works for one and
			// two character element names such as <p> or <id>.
			final int endLen = endCurrentElement.length();
			final char endTailChars[] = new char[3];
			endCurrentElement.getChars(endLen - 3, endLen, endTailChars, 0);

			StringBuffer textBuffer = new StringBuffer();
			int inputCharacter;
			char c;
			char lastChars[] = {' ', ' ', ' '};
			
			// NOTE: the read happens before endParsing is tested, so one
			// character following the end-tag prefix (typically the
			// closing '>') is consumed and discarded.  Keep this order;
			// otherwise that '>' would be seen by the next parse.
			while (((inputCharacter = is.read()) != -1) &&
					!endParsing) {
				c = (char)inputCharacter;
				lastChars[0] = lastChars[1];
				lastChars[1] = lastChars[2];
				lastChars[2] = c;

				textBuffer.append(c);
				if( lastChars[0] == endTailChars[0] &&
					lastChars[1] == endTailChars[1] &&
					lastChars[2] == endTailChars[2]) {
					if( textBuffer.toString().endsWith(endCurrentElement)) {
						endParsing = true;
					}
				}
			}

			if (m_docEncoding.length() == 0) {
				text = textBuffer.toString();
			} else {
				try {
					// We read the bytes in as ISO8859_1, so we must get them
					// out as that and then encode as they should be.
					if (m_fileEncoding.length() == 0) {
						text = new String(textBuffer.toString().getBytes(),
										  m_docEncoding);
					} else {
						text = new String(textBuffer.toString().getBytes(
									m_fileEncoding), m_docEncoding);
					}
				} catch (IOException e) {
					// Conversion failed (e.g. unsupported encoding); fall
					// back to the unconverted text.
					//#ifdef DLOGGING
					logger.severe("getTextStream Could not convert string from,to" + m_fileEncoding + "," + m_docEncoding, e);
					//#endif
					System.out.println("getTextStream Could not convert string " +
							"from,to=" + m_fileEncoding + "," + m_docEncoding +
							" " + e + " " + e.getMessage());
					e.printStackTrace();
					text = textBuffer.toString();
				}
			}
			textBuffer = null;
			text = StringUtil.replace(text, endCurrentElement, "");
			
			/** Handle some entities and encoded characters */
			text = StringUtil.replace(text, "<![CDATA[", "");
			text = StringUtil.replace(text, "]]>", "");
			if (text.indexOf('&') >= 0) {
				text = EncodingUtil.replaceAlphaEntities(convXmlEnts, text);
				if (convXmlEnts) {
					text = EncodingUtil.replaceXmlEntities(text);
				}
				// No need to convert from UTF-8 to Unicode using replace
				// umlauts now because it is done with new String...,encoding.

				// Replace numeric entities, including those for curly
				// single and double quotes.
				text = EncodingUtil.replaceNumEntity(text);
			}

			// Replace special chars like left quote, etc.
			text = m_encodingUtil.replaceSpChars(text);
			
		} catch (OutOfMemoryError t) {
			CauseMemoryException ce = new CauseMemoryException(
					"Unable to read text. Out of memory.", t);
//#ifdef DLOGGING
			logger.severe(ce.getMessage(), ce);
//#endif
			System.out.println("getTextStream Could not read a char run time." + t +
					           " " + t.getMessage());
			t.printStackTrace();
			throw ce;
		} catch (Throwable t) {
			// This also covers IOException raised by is.read().
			CauseException ce = new CauseException("Unable to read text. " +
					"Internal error.", t);
//#ifdef DLOGGING
			logger.severe(ce.getMessage(), ce);
//#endif
			System.out.println("getTextStream Could not read a char run time." + t +
					           " " + t.getMessage());
			t.printStackTrace();
			if (m_acceptErrors) {
				return null;
			} else {
				throw ce;
			}
		}
		//#ifdef DLOGGING
		if (finerLoggable) {logger.finer("text=" + text);}
		//#endif
		return text;
    }

    /**
     * Get element text including inner xml, with XML entities converted.
     * Save some time by using the normal m_inputStream when we
     * know that we are not reading UTF-8/16.
     *
     * @return the element text as produced by getTextStream
     */
    public String getText()
	throws IOException, CauseMemoryException, CauseException {
		final boolean useEncodingReader =
			m_encodingStreamReader.isModEncoding();
		final InputStreamReader reader =
			useEncodingReader ? m_encodingStreamReader : m_inputStream;
		return getTextStream(reader, true);
	}

    /**
     * Get element text including inner xml.
     * Save some time by using the normal m_inputStream when we
     * know that we are not reading UTF-8/16.
     *
     * @param convXmlEnts passed through to getTextStream to control
     *        conversion of XML entities
     * @return the element text as produced by getTextStream
     */
    public String getText(final boolean convXmlEnts)
	throws IOException, CauseMemoryException, CauseException {
		final boolean useEncodingReader =
			m_encodingStreamReader.isModEncoding();
		final InputStreamReader reader =
			useEncodingReader ? m_encodingStreamReader : m_inputStream;
		return getTextStream(reader, convXmlEnts);
	}

    /** 
     * Get attribute value from current element.
     *
     * The attribute is looked up in the raw data of the current tag.  An
     * occurrence of the name that is preceded by whitespace and directly
     * followed by '=' is preferred, so that the name is not mistaken for
     * part of another attribute name or value; if there is no such
     * occurrence the first occurrence of the name is used.  The value is
     * expected to start two characters after the name (name, '=', quote).
     * Values in single quotes and in double quotes are both supported.
     * When a document encoding is known, the value is converted from the
     * file encoding to the document encoding.
     *
     * @param attributeName name of the attribute to look up
     * @return the attribute value; null if the attribute or its closing
     *         quote is not found, or if an internal error occurs while
     *         m_acceptErrors is true
     * @throws CauseMemoryException if an OutOfMemoryError occurs
     * @throws CauseException on any other error while m_acceptErrors is
     *         false
     */
    public String getAttributeValue(String attributeName)
	throws IOException, CauseMemoryException, CauseException {
        
		try {
			/** Check whether the element contains given attribute */
			final String ccurrentElementData = m_currentElementData.toString();
			final int attributeStartIndex = findAttributeStart(
					ccurrentElementData, attributeName);
			if( attributeStartIndex<0 ) {
				return null;
			}
			
			/** Calculate actual value start index */
			final int valueStartIndex =
				attributeStartIndex + attributeName.length() + 2;
			
			/** Check the attribute value end index */
			int valueEndIndex = -1;
			// A value opened with a single quote ends at the next single
			// quote.  Searching for a double quote first would run into
			// the value of a later double-quoted attribute.
			if ((valueStartIndex <= ccurrentElementData.length()) &&
				(ccurrentElementData.charAt(valueStartIndex - 1) == '\'')) {
				valueEndIndex = ccurrentElementData.indexOf('\'',
						valueStartIndex);
			}
			if( valueEndIndex<0 ) {
				valueEndIndex = ccurrentElementData.indexOf('\"',
						valueStartIndex);
			}
			if( valueEndIndex<0 ) {
				/** Check using windows quote account for other unexplained
				    quotes */
				if ((valueStartIndex + 1) < ccurrentElementData.length()) {
					String beginQuote = ccurrentElementData.substring(
							valueStartIndex - 1, valueStartIndex);
					valueEndIndex = ccurrentElementData.indexOf(beginQuote,
							valueStartIndex);
				}
				if( valueEndIndex<0 ) {
					return null;
				}
			}
			
			/** Parse value */
			String value = ccurrentElementData.substring(valueStartIndex, valueEndIndex);
			if (m_docEncoding.length() != 0) {
				// We read the bytes in as ISO8859_1, so we must get them
				// out as that and then encode as they should be.
				if (m_fileEncoding.length() == 0) {
					value = new String(value.getBytes(),
									  m_docEncoding);
				} else {
					value = new String(value.getBytes(
								m_fileEncoding), m_docEncoding);
				}
			}
			//#ifdef DLOGGING
			if (finerLoggable) {logger.finer("attribute value=" + value);}
			//#endif
					
			return value;
		} catch (OutOfMemoryError e) {
//#ifdef DLOGGING
			logger.severe("Out of memory parse attribute error ", e);
//#endif
			System.out.println("Out of memory parse attribute error " + e + " " + e.getMessage());
			e.printStackTrace();
			CauseMemoryException ce = new CauseMemoryException(
					"Parse attribute read error. Out of memory.", e);
			throw ce;
		} catch (Throwable t) {
//#ifdef DLOGGING
			logger.severe("getAttributeValue error.", t);
//#endif
			System.out.println("getAttributeValue error." + t + " " +
					           t.getMessage());
			if (m_acceptErrors) {
				return null;
			} else {
				CauseException ce = new CauseException(
						"Parse attribute read error. Internal error.", t);
				throw ce;
			}
		}
    }

    /**
     * Find where the given attribute name starts in the tag data.
     * Prefers an occurrence preceded by whitespace (or at index 0) and
     * directly followed by '='; falls back to the first plain occurrence.
     *
     * @return index of the attribute name, or -1 if the name does not occur
     */
    private static int findAttributeStart(String data, String attributeName) {
		final int firstIndex = data.indexOf(attributeName);
		final int nameLength = attributeName.length();
		// Nothing to refine when the name is absent.  An empty name
		// matches at every index, so it is not scanned either.
		if ((firstIndex < 0) || (nameLength == 0)) {
			return firstIndex;
		}
		final int dataLength = data.length();
		int index = firstIndex;
		while (index >= 0) {
			final int afterName = index + nameLength;
			if ((afterName < dataLength) &&
				(data.charAt(afterName) == '=')) {
				if (index == 0) {
					return index;
				}
				final char previous = data.charAt(index - 1);
				if ((previous == ' ') || (previous == '\t') ||
					(previous == '\n') || (previous == '\r')) {
					return index;
				}
			}
			index = data.indexOf(attributeName, index + 1);
		}
		return firstIndex;
    }
    
    /** 
     * Get namespaces declared with "xmlns:" in the current element.
     *
     * Each declaration is expected in the form xmlns:prefix="url" or
     * xmlns:prefix='url', with the quote directly after the '='.
     *
     * @return a two dimension array with two rows: row 0 holds the
     *         namespace prefixes and row 1 the matching URLs, in the order
     *         found.  An empty (0 by 0) array is returned if no namespace
     *         is declared or an error occurs.
     */
    public String [][] parseNamespaces() {
        
		try {
			final String ccurrentElementData = m_currentElementData.toString();
			final int dataLength = ccurrentElementData.length();
			Vector vnamespaces = new Vector();
			Vector vnamesurls = new Vector();
			int nspos = 0;
			while ((nspos = ccurrentElementData.indexOf("xmlns:", nspos)) >= 0) {
				// Skip over "xmlns:" so the next search moves forward.
				nspos+= 6;
				int eqpos = ccurrentElementData.indexOf('=', nspos);
				if (eqpos < 0) {
					continue;
				}
				String xmlns = ccurrentElementData.substring(nspos, eqpos);
				// The URL ends at the same kind of quote that opened it.
				// Single quotes are honoured; anything else is treated as
				// a double quote.
				char quote = '\"';
				if (((eqpos + 1) < dataLength) &&
					(ccurrentElementData.charAt(eqpos + 1) == '\'')) {
					quote = '\'';
				}
				int qpos = ccurrentElementData.indexOf(quote, eqpos + 2);
				if (qpos < 0) {
					continue;
				}
				String url = ccurrentElementData.substring(eqpos + 2, qpos);
				//#ifdef DLOGGING
				if (finerLoggable) {logger.finer("xmlns,url=" + xmlns + "," + url);}
				//#endif
				vnamespaces.addElement(xmlns);
				vnamesurls.addElement(url);
			}
			if (vnamespaces.size() == 0) {
				return new String[0][0];
			}
			int vlen = vnamespaces.size();
			String [][] ns = new String[2][vlen];
			for (int ic = 0; ic < vlen; ic++) {
				ns[0][ic] = (String)vnamespaces.elementAt(ic);
				ns[1][ic] = (String)vnamesurls.elementAt(ic);
			}
			return ns;
		} catch (Throwable t) {
//#ifdef DLOGGING
			logger.severe("parseNamespaces error.", t);
//#endif
			System.out.println("parseNamespaces error." + t + " " +
					           t.getMessage());
			return new String[0][0];
		}
    }
    
    /**
     * Set the namespace prefixes that are kept as part of element names.
     *
     * @param namespaces the prefixes; stored as given, not copied
     */
    public void setNamespaces(String [] namespaces) {
        m_namespaces = namespaces;
    }

    /**
     * Get the namespace prefixes set with setNamespaces.
     *
     * @return the stored array itself (may be null)
     */
    public String [] getNamespaces() {
        return m_namespaces;
    }

    /**
     * Set the document encoding used when converting text and attribute
     * values.
     *
     * @param docEncoding the encoding name
     */
    public void setDocEncoding(String docEncoding) {
        m_docEncoding = docEncoding;
    }

    /**
     * Get the document encoding.
     *
     * @return the current document encoding name
     */
    public String getDocEncoding() {
        return m_docEncoding;
    }

    /**
     * @return the value of isWindows() reported by the EncodingUtil
     */
    public boolean isWindows() {
        final boolean windows = m_encodingUtil.isWindows();
        return windows;
    }

    /**
     * @return the value of isUtf() reported by the EncodingUtil
     */
    public boolean isUtf() {
        final boolean utf = m_encodingUtil.isUtf();
        return utf;
    }

    /**
     * Get the EncodingUtil this parser was created with.
     *
     * @return the encoding helper used by this parser
     */
    public EncodingUtil getEncodingUtil() {
        return m_encodingUtil;
    }

}