/* * XmlParser.java * * Copyright (C) 2005-2008 Tommi Laukkanen * http://www.substanceofcode.com * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.substanceofcode.utils; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; /** * Simple and lightweight XML parser without complete error handling. * * @author Tommi Laukkanen (tlaukkanen at gmail dot com) */ public class XmlParser { private CustomInputStream m_inputStream = null; private static String encoding = "UTF-8"; /** Current XML element name (eg. <title> = title) */ private String m_currentElementName = ""; private String m_currentElementData = ""; private boolean m_currentElementContainsText = false; /** Enumerations for parse function */ public static final int END_DOCUMENT = 0; public static final int ELEMENT = 1; /** * Creates a new instance of XmlParser * @param inputStream Stream containing XML document. */ public XmlParser(CustomInputStream inputStream) { m_inputStream = inputStream; } public XmlParser(String xmlDocument) { InputStream is = new ByteArrayInputStream(xmlDocument.getBytes()); CustomInputStream cis = new CustomInputStream(is); m_inputStream = cis; } /** * Parse next element * @return Element type or end-of-document. * @throws java.io.IOException */ public int parse() throws IOException { StringBuffer inputBuffer = new StringBuffer(); boolean parsingElementName = false; boolean elementFound = false; boolean elementStart = false; boolean parsingElementData = false; int inputCharacter; char c; inputCharacter = m_inputStream.read(); while (inputCharacter != -1 && elementFound==false) { c = (char)inputCharacter; if(c=='/' && elementStart==true) { parsingElementName = false; } else if(elementStart==true && (c=='?' || c=='!')) { if(m_currentElementData.charAt(m_currentElementData.length()-1)=='<') { parsingElementName = false; } } if(parsingElementName==true) { if(c==' ' || c=='/' ) { parsingElementName = false; parsingElementData = true; } else if(c!='>') { m_currentElementName += c; } } if(c=='<') { elementStart = true; parsingElementName = true; parsingElementData = true; m_currentElementName = ""; m_currentElementData = ""; } if(parsingElementData==true) { m_currentElementData += c; } if(c=='>') { if(m_currentElementName.length()>0) { elementFound = true; parsingElementName = false; } } if(!elementFound){ inputCharacter = m_inputStream.read(); } } if( m_currentElementData.charAt( m_currentElementData.length()-2 )=='/' && m_currentElementData.charAt( m_currentElementData.length()-1 )=='>' ) { m_currentElementContainsText = false; } else { m_currentElementContainsText = true; } if( inputCharacter==-1 ) { return END_DOCUMENT; } else { return ELEMENT; } } /** Get element name */ public String getName() { return m_currentElementName; } /** Get element text including inner xml */ public String getText() throws IOException { Log.debug("Getting text for element '" + m_currentElementName + "'"); if(m_currentElementContainsText==false) { return ""; } boolean endParsing = false; String endElementName = ""; String text; StringBuffer textBuffer = new StringBuffer(); int inputCharacter; char c; char lastChars[] = new char[3]; lastChars[0] = ' '; lastChars[1] = ' '; lastChars[2] = ' '; char elementNameChars[] = new char[2]; elementNameChars[0] = m_currentElementName.charAt( m_currentElementName.length()-2 ); elementNameChars[1] = m_currentElementName.charAt( m_currentElementName.length()-1 ); while ((inputCharacter = m_inputStream.read()) != -1 && endParsing==false) { c = (char)inputCharacter; lastChars[0] = lastChars[1]; lastChars[1] = c; //System.out.print(c); textBuffer.append(c); if( lastChars[0] == elementNameChars[0] && lastChars[1] == elementNameChars[1]) { if( textBuffer.toString().endsWith("</" + m_currentElementName)) { endParsing = true; } } } if (encoding.equals("")) { text = textBuffer.toString(); } else { try { text = new String(textBuffer.toString().getBytes(), encoding); } catch (UnsupportedEncodingException e) { Log.add("Couldn't use UTF-8 encoding"); try { text = new String(textBuffer.toString().getBytes(), "UTF8"); encoding = "UTF8"; } catch (UnsupportedEncodingException e2) { Log.add("Couldn't use UTF8 encoding"); text = textBuffer.toString(); encoding = ""; } } } text = textBuffer.toString(); text = StringUtil.replace(text, "</" + m_currentElementName, ""); /** Handle some entities and encoded characters */ //Log.add("GetText() before: " + text); text = decodeCharacters(text); //Log.add("GetText() after : " + text); return text; } /** * Get attribute value from current element */ public String getAttributeValue(String attributeName) { /** Check whatever the element contains given attribute */ int attributeStartIndex = m_currentElementData.indexOf(attributeName); if( attributeStartIndex<0 ) { return null; } /** Calculate actual value start index */ int valueStartIndex = attributeStartIndex + attributeName.length() + 2; /** Check the attribute value end index */ int valueEndIndex = m_currentElementData.indexOf("\"", valueStartIndex); if( valueEndIndex<0 ) { return null; } /** Parse value */ String value = m_currentElementData.substring(valueStartIndex, valueEndIndex); value = decodeCharacters(value); return value; } private String decodeCharacters(String text) { text = StringUtil.replace(text, "<", "<"); text = StringUtil.replace(text, ">", ">"); text = StringUtil.replace(text, " ", " "); text = StringUtil.replace(text, """, "\""); text = StringUtil.replace(text, "&", "&"); text = StringUtil.replace(text, "ä", "ä"); text = StringUtil.replace(text, "ö", "ö"); text = StringUtil.replace(text, "ä", "ä"); text = StringUtil.replace(text, "ö", "ö"); text = StringUtil.replace(text, "â??", "'"); text = StringUtil.replace(text, "’", "'"); text = StringUtil.replace(text, "‘", "'"); text = StringUtil.replace(text, "“", "\""); text = StringUtil.replace(text, "”", "\""); text = StringUtil.replace(text, "'", "\""); text = StringUtil.replace(text, "€", "€"); text = StringUtil.replace(text, String.valueOf((char)226) + String.valueOf((char)128) + String.valueOf((char)153), "'"); text = StringUtil.replace(text, String.valueOf((char)226) + String.valueOf((char)128) + String.valueOf((char)166), "..."); text = StringUtil.replace(text, String.valueOf((char)226) + String.valueOf((char)128) + String.valueOf((char)156), "\""); text = StringUtil.replace(text, String.valueOf((char)226) + String.valueOf((char)128) + String.valueOf((char)157), "\""); boolean foundEscape = text.indexOf("&#")>=0; int startIndex = 0; while(foundEscape) { int entityStart = text.indexOf("&#"); int entityEnd = text.indexOf(";",entityStart); if(entityStart>0 && entityEnd>0) { String character = text.substring(entityStart+2, entityEnd); try { int charValue = 0; charValue = Integer.parseInt(character); if(charValue>0) { text = StringUtil.replace(text, "&#" + charValue + ";", String.valueOf((char)charValue)); } } catch(Exception ex) { // Do nothing... } } startIndex++; if(startIndex<text.length()) { foundEscape = text.indexOf("&#", startIndex)>=0; } else { foundEscape = false; } } return text; } }