SloppyXMLParser.java example

Explorer

Slipstream-Mod-Manager-master
- src
  - main
    - java
      - net
        vhati
        ftldat
        FTLDat.java
        FileChannelRegionInputStream.java
        modmanager
        FTLModManager.java
        cli
        SlipstreamCLI.java
        core
        AutoUpdateInfo.java
        ComparableVersion.java
        DelayedDeleteHook.java
        EOLWriter.java
        EmptyAwareSAXHandlerFactory.java
        FTLUtilities.java
        HashObserver.java
        HashThread.java
        ModDB.java
        ModFileInfo.java
        ModInfo.java
        ModPatchObserver.java
        ModPatchThread.java
        ModUtilities.java
        ModsInfo.java
        ModsScanObserver.java
        ModsScanThread.java
        Report.java
        SlipstreamConfig.java
        SloppyXMLOutputProcessor.java
        SloppyXMLParser.java
        XMLPatcher.java
        json
        JacksonAutoUpdateReader.java
        JacksonCatalogReader.java
        JacksonCatalogWriter.java
        URLFetcher.java
        scraper
        ForumScraper.java
        ui
        ClipboardMenuMouseListener.java
        DatExtractDialog.java
        FieldEditorPanel.java
        InertPanel.java
        ManagerFrame.java
        ManagerInitThread.java
        ModInfoArea.java
        ModPatchDialog.java
        ModXMLSandbox.java
        Nerfable.java
        ProgressDialog.java
        RegexDocument.java
        SlipstreamConfigDialog.java
        Statusbar.java
        StatusbarMouseListener.java
        table
        ChecklistTableModel.java
        ChecklistTablePanel.java
        ListState.java
        Reorderable.java
        TableRowTransferHandler.java
        tree
        ChecklistTreeCellRenderer.java
        ChecklistTreeManager.java
        ChecklistTreePanel.java
        ChecklistTreePathFilter.java
        ChecklistTreeSelectionModel.java
        ChildrenEnumeration.java
        PreorderEnumeration.java
        TreeState.java
        TreeTransferHandler.java
        TristateButtonModel.java
        TristateCheckBox.java
        xml
        JDOMModMetadataReader.java

package net.vhati.modmanager.core;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.xml.sax.SAXParseException;

import org.jdom2.Attribute;
import org.jdom2.AttributeType;
import org.jdom2.CDATA;
import org.jdom2.Comment;
import org.jdom2.Content;
import org.jdom2.DefaultJDOMFactory;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.IllegalAddException;
import org.jdom2.JDOMFactory;
import org.jdom2.Namespace;
import org.jdom2.Parent;
import org.jdom2.Text;
import org.jdom2.input.JDOMParseException;


/**
 * A scraper for malformed XML.
 *
 * Sloppiness:
 *   Any closing tag, regardless of its name, closes the parent tag.
 *   <!-- <!-- blah --> is valid.
 *     The example above will become two comments. Any extra dashes will
 *     be discarded.
 *   --> can occur alone (discarded).
 *   An attribute name can start right after the quote from a prior value.
 *   Namespace prefixes for nodes and attributes are unique.
 *     (Each prefix will be used as the namespace's URI).
 *   Unrecognized named entities (&...;) and lone ampersands are accepted
 *     as literal text. (Those ampersands will be escaped if outputted).
 *
 * The text must have \n line endings.
 *
 * If a line/column aware JDOMFactory is passed to the constructor,
 *   that factory will receive locations for Elements (start tags).
 *   That will be the 1-based line/col of the end character,
 *   plus 1 col.
 *
 * If parsing fails, the thrown JDOMParseException has getter methods
 * to report the nearest upcoming non-whitespace character, from where
 * the parser gave up.
 *
 * Only use this as a last resort, after a real parser fails.
 *
 * @see org.jdom2.input.JDOMParseException
 * @see org.jdom2.located.LocatedJDOMFactory
 */
public class SloppyXMLParser {

	private Pattern declPtn = Pattern.compile( "(\\s*)<[?]xml [^?]*[?]>" );
	private Pattern emptyCommentPtn = Pattern.compile( "(\\s*)<!---->" );
	private Pattern commentPtn = Pattern.compile( "(?s)(\\s*)<!--((?:.(?!-->))*.)-->" );
	private Pattern emptyCDATAPtn = Pattern.compile( "(\\s*)<!\\[CDATA\\[\\]\\]>" );
	private Pattern cdataPtn = Pattern.compile( "(?s)(\\s*)<!\\[CDATA\\[((?:.(?!\\]\\]>))*.)\\]\\]>" );
	private Pattern sTagPtn = Pattern.compile( "(\\s*)<(?:([\\w.-]+):)?([\\w.-]+)((?: [^>]+?)??)\\s*(/?)>" );
	private Pattern eTagPtn = Pattern.compile( "([^<]*)</\\s*([^>]+)>" );
	private Pattern endSpacePtn = Pattern.compile( "\\s+$" );
	private Pattern strayCharsPtn = Pattern.compile( "(\\s*)(?:-->|[-.>,])" );

	private Pattern attrPtn = Pattern.compile( "\\s*(?:([\\w.-]+):)?([\\w.-]+)\\s*=\\s*(\"[^\"]*\"|'[^']*')" );
	private Pattern entityPtn = Pattern.compile( "&(?:(?:#([0-9]+))|(?:#x([0-9A-Fa-f]+))|([^;]+));" );

	private Pattern breakPtn = Pattern.compile( "\n" );

	private List<Pattern> chunkPtns = new ArrayList<Pattern>();
	private Map<String,String> entityMap = new HashMap<String,String>();

	private JDOMFactory factory;

	private int pos = -1;


	public SloppyXMLParser() {
		this( null );
	}

	public SloppyXMLParser( JDOMFactory factory ) {
		if ( factory == null ) factory = new DefaultJDOMFactory();
		this.factory = factory;

		chunkPtns.add( declPtn );
		chunkPtns.add( emptyCommentPtn );
		chunkPtns.add( commentPtn );
		chunkPtns.add( emptyCDATAPtn );
		chunkPtns.add( cdataPtn );
		chunkPtns.add( sTagPtn );
		chunkPtns.add( eTagPtn );
		chunkPtns.add( endSpacePtn );
		chunkPtns.add( strayCharsPtn );

		entityMap.put( "lt", "<" );
		entityMap.put( "gt", ">" );
		entityMap.put( "amp", "&" );
		entityMap.put( "apos", "'" );
		entityMap.put( "quot", "\"" );
	}


	public Document build( CharSequence s ) throws JDOMParseException {
		Element rootNode = factory.element( "wrapper" );
		Document doc = factory.document( rootNode );

		Parent parentNode = rootNode;
		int sLen = s.length();
		int lastPos = -1;
		pos = 0;
		int[] lastLineAndCol = new int[] {0, 0};  // Counts \n's and chars after the last \n.
		String tmp = null;
		Matcher m = declPtn.matcher( s );

		try {
			while ( pos > lastPos && pos < sLen ) {
				m.region( pos, sLen );
				boolean matchedChunk = false;

				for ( Pattern chunkPtn : chunkPtns ) {
					m.usePattern( chunkPtn );
					if ( !m.lookingAt() ) continue;

					if ( chunkPtn == declPtn ) {
						// Don't care.
						addLineAndCol( lastLineAndCol, m.group(0) );
					}
					else if ( chunkPtn == emptyCommentPtn ) {
						String whitespace = m.group( 1 );
						if ( whitespace.length() > 0 )
							factory.addContent( parentNode, factory.text( whitespace ) );

						addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
					}
					else if ( chunkPtn == commentPtn ) {
						String whitespace = m.group( 1 );
						if ( whitespace.length() > 0 )
							factory.addContent( parentNode, factory.text( whitespace ) );

						tmp = m.group( 2 );
						if ( tmp.length() == 0 ) {
							factory.addContent( parentNode, factory.comment( "" ) );
						}
						else {
							Matcher splicedMatcher = Pattern.compile( "(\\s*)<!--" ).matcher( tmp );
							int commentStart = 0;
							while ( splicedMatcher.find() ) {
								if ( splicedMatcher.start() - commentStart > 0 ) {
									String splicedChunk = tmp.substring( commentStart, splicedMatcher.start() );
									splicedChunk = splicedChunk.replaceAll( "^-+|(?<=-)-+|-+$", "" );
									if ( splicedChunk.startsWith( " " ) ) splicedChunk += " ";
									Comment commentNode = factory.comment( splicedChunk );
									factory.addContent( parentNode, commentNode );
								}
								if ( splicedMatcher.group(1).length() > 0 ) {
									// Whitespace between comments.
									factory.addContent( parentNode, factory.text( splicedMatcher.group(1) ) );
								}
								commentStart = splicedMatcher.end();
							}
							if ( commentStart < tmp.length() ) {
								String finalChunk = tmp.substring( commentStart );
								finalChunk = finalChunk.replaceAll( "^-+|(?<=-)-+|-+$", "" );
								Comment commentNode = factory.comment( finalChunk );
								factory.addContent( parentNode, commentNode );
							}
						}

						addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
					}
					else if ( chunkPtn == emptyCDATAPtn ) {
						String whitespace = m.group( 1 );
						if ( whitespace.length() > 0 )
							factory.addContent( parentNode, factory.text( whitespace ) );

						addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
					}
					else if ( chunkPtn == cdataPtn ) {
						String whitespace = m.group( 1 );
						if ( whitespace.length() > 0 )
							factory.addContent( parentNode, factory.text( whitespace ) );

						CDATA cdataNode = factory.cdata( m.group(2) );
						factory.addContent( parentNode, cdataNode );

						addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
					}
					else if ( chunkPtn == sTagPtn ) {
						String whitespace = m.group( 1 );
						if ( whitespace.length() > 0 )
							factory.addContent( parentNode, factory.text( whitespace ) );

						String nodePrefix = m.group( 2 );  // Might be null.
						String nodeName = m.group( 3 );
						String attrString = m.group( 4 );
						boolean selfClosing = ( m.group( 5 ).length() > 0 );

						addLineAndCol( lastLineAndCol, s, m.start(), m.end() );

						Element tagNode;
						if ( nodePrefix != null ) {
							Namespace nodeNS = Namespace.getNamespace( nodePrefix, nodePrefix );  // URI? *shrug*
							factory.addNamespaceDeclaration( rootNode, nodeNS );
							tagNode = factory.element( lastLineAndCol[0]+1, lastLineAndCol[1]+1+1, nodeName, nodeNS );
						} else {
							tagNode = factory.element( lastLineAndCol[0]+1, lastLineAndCol[1]+1+1, nodeName );
						}

						if ( attrString.length() > 0 ) {
							Matcher am = attrPtn.matcher( attrString );
							while ( am.lookingAt() ) {
								String attrPrefix = am.group( 1 );  // Might be null.
								String attrName = am.group( 2 );
								String attrValue = am.group( 3 );
								attrValue = attrValue.substring( 1, attrValue.length()-1 );
								attrValue = unescape( attrValue );

								if ( attrPrefix != null ) {
									if ( attrPrefix.equals( "xmlns" ) ) {
										// This is a pseudo attribute declaring a namespace prefix.
										// Move it to the root node.
										Namespace attrNS = Namespace.getNamespace( attrName, attrName );  // URI? *shrug*
										factory.addNamespaceDeclaration( rootNode, attrNS );
									}
									else {
										Namespace attrNS = Namespace.getNamespace( attrPrefix, attrPrefix );  // URI? *shrug*
										factory.addNamespaceDeclaration( rootNode, attrNS );
										Attribute attrObj = factory.attribute( attrName, attrValue, AttributeType.UNDECLARED, attrNS );
										factory.setAttribute( tagNode, attrObj );
									}
								} else if ( attrName.equals("xmlns") ) {
									// New default namespace URI within this node.
									Namespace attrNS = Namespace.getNamespace( attrValue );
									factory.addNamespaceDeclaration( tagNode, attrNS );
								} else {
									// Normal attribute.
									Attribute attrObj = factory.attribute( attrName, attrValue, AttributeType.UNDECLARED, Namespace.NO_NAMESPACE );
									factory.setAttribute( tagNode, attrObj );
								}
								am.region( am.end(), am.regionEnd() );
							}
							if ( am.regionStart() < attrString.length() ) {
								int nonspacePos = findNextNonspace( s, pos );
								int errorPos = ( (nonspacePos != -1) ? nonspacePos : pos );

								int[] lineAndCol = getLineAndCol( s, errorPos );
								int lineNum = lineAndCol[0];
								int colNum = lineAndCol[1];

								SAXParseException cause = new SAXParseException( String.format( "At line %d, column %d: Strange attributes.", lineNum, colNum ), null, null, lineNum, colNum );
								throw new JDOMParseException( String.format( "Error on line %d: %s", lineNum, cause.getMessage() ), cause );
							}
						}

						factory.addContent( parentNode, tagNode );
						if ( !selfClosing ) parentNode = tagNode;
					}
					else if ( chunkPtn == eTagPtn ) {
						String interimText = m.group( 1 );
						interimText = unescape( interimText );

						factory.addContent( parentNode, factory.text( interimText ) );
						parentNode = parentNode.getParent();

						addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
					}
					else if ( chunkPtn == endSpacePtn ) {
						// This is the end of the document.
					}
					else if ( chunkPtn == strayCharsPtn ) {
						// Non-space junk between an end tag and a start tag.

						String whitespace = m.group( 1 );
						if ( whitespace.length() > 0 )
							factory.addContent( parentNode, factory.text( whitespace ) );

						addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
					}

					matchedChunk = true;
					lastPos = pos;
					pos = m.end();
					break;
				}

				if ( !matchedChunk ) {
					int nonspacePos = findNextNonspace( s, pos );
					int errorPos = ( (nonspacePos != -1) ? nonspacePos : pos );

					int[] lineAndCol = getLineAndCol( s, errorPos );
					int lineNum = lineAndCol[0];
					int colNum = lineAndCol[1];

					SAXParseException cause = new SAXParseException( String.format( "At line %d, column %d: Unexpected characters.", lineNum, colNum ), null, null, lineNum, colNum );
					throw new JDOMParseException( String.format( "Error on line %d: %s", lineNum, cause.getMessage() ), cause );
				}
			}

			if ( rootNode.getChildren().size() == 1 ) {
				// No need for the wrapper, promote its only child to root.

				Element newRoot = rootNode.getChildren().get( 0 );
				newRoot.detach();
				for ( Namespace ns : rootNode.getAdditionalNamespaces() ) {
					factory.addNamespaceDeclaration( newRoot, ns );
				}
				factory.setRoot( doc, newRoot );
			}

		}
		catch( IllegalAddException e ) {
			int nonspacePos = findNextNonspace( s, pos );
			int errorPos = ( (nonspacePos != -1) ? nonspacePos : pos );

			int[] lineAndCol = getLineAndCol( s, errorPos );
			int lineNum = lineAndCol[0];
			int colNum = lineAndCol[1];

			String hint = "";
			if ( e.getMessage() != null && e.getMessage().indexOf( "not allowed at the document root" ) != -1 ) {
				hint = " (There's likely an extraneous closing tag before this point.)";
			}
			SAXParseException cause = new SAXParseException( String.format( "At line %d, column %d: %s%s", lineNum, colNum, e.getMessage(), hint ), null, null, lineNum, colNum, e );
			throw new JDOMParseException( String.format( "Error on line %d: %s", lineNum, cause.getMessage() ), cause );
		}

		return doc;
	}


	/**
	 * Unescapes standard named entities and numeric character references.
	 * This applies to attributes and element values.
	 *
	 * They are: lt, gt, quot, apos, amp, #1234, #x1a2b.
	 */
	public String unescape( String s ) {
		StringBuffer buf = new StringBuffer( s.length() );
		Matcher m = entityPtn.matcher( s );
		String decRef;
		String hexRef;
		int charCode;
		String entName;
		String entity;

		while ( m.find() ) {
			decRef = m.group( 1 );
			hexRef = m.group( 2 );
			entName = m.group( 3 );
			if ( (decRef != null) ) {
				// Decimal character reference.
				charCode = Integer.parseInt( decRef );
				entity = Character.toString( (char)charCode );
			}
			else if ( (hexRef != null) ) {
				// Hex character reference.
				charCode = Integer.parseInt( hexRef, 16 );
				entity = Character.toString( (char)charCode );
			}
			else {
				entity = entityMap.get( entName );
				if ( entity == null ) {
					// Unknown entity, repeat it as-is.
					entity = "&"+ entName +";";
				}
			}
			m.appendReplacement( buf, entity );
		}
		m.appendTail( buf );

		return buf.toString();
	}


	/**
	 * Returns the position of the next non whitespace character after pos.
	 *
	 * Returns -1 if there isn't one.
	 */
	public int findNextNonspace( CharSequence s, int pos ) {
		Matcher nonspaceMatcher = Pattern.compile( "\\S" ).matcher( s );
		if ( nonspaceMatcher.find( pos ) )
			return nonspaceMatcher.start();

		return -1;
	}


	/**
	 * Increments an ongoing tally of lines and the col on the current line.
	 *
	 * @param lastLineAndCol the current tally to increment (0-based)
	 * @param s a string to check for \n's
	 * @param start a start index in the string to search from (inclusive)
	 * @param start an end index in the string (exclusive)
	 */
	private void addLineAndCol( int[] lastLineAndCol, CharSequence s, int start, int end ) {
		if ( s.length() == 0 || start == end ) return;

		Matcher breakMatcher = breakPtn.matcher( s );
		breakMatcher.region( start, end );
		int breakCount = 0;
		int lastBreakPos = -1;
		while ( breakMatcher.find() ) {
			lastBreakPos = breakMatcher.start();
			breakCount++;
		}
		if ( lastBreakPos == -1 ) {
			// Same line, a few more chars in. Increment col.
			lastLineAndCol[1] += end-1 - start;
		} else {
			// On a new line now, reset the col.
			lastLineAndCol[0] += breakCount;
			lastLineAndCol[1] = end-1 - lastBreakPos;
		}
	}

	private void addLineAndCol( int[] lastLineAndCol, CharSequence s ) {
		addLineAndCol( lastLineAndCol, s, 0, s.length() );
	}


	/**
	 * Returns lineNum and colNum for a position in text.
	 * The first line is line 1.
	 * Line breaks start a new line as col 0.
	 * The first char of each line, after the break is col 1.
	 *
	 * @param pos a 0-based offset
	 * @return 1-based ints for line and col (the first char is line 1, col 1)
	 * @see org.jdom2.input.JDOMParseException
	 */
	public int[] getLineAndCol( CharSequence s, int pos ) {
		pos = Math.min( pos, s.length() );

		Matcher breakMatcher = breakPtn.matcher( s );
		breakMatcher.region( 0, pos+1 );  // Include pos itself in case it's a break.
		int breakCount = 0;
		int lastBreakPos = -1;
		while ( breakMatcher.find() ) {
			lastBreakPos = breakMatcher.start();
			breakCount++;
		}
		int colNum;
		if ( lastBreakPos == -1 )
			colNum = pos+1;  // Pretend ^ was column 0, as a \n would.
		else
			colNum = pos - lastBreakPos;

		return new int[] { breakCount+1, colNum };
	}


	/**
	 * Returns the last character offset this parser was looking at.
	 *
	 * Usually this will be a patch of whitespace prior to unrecognized chars.
	 * This method is a fallback when an unexpected exception doesn't provide
	 * line info.
	 *
	 * @see findNextNonspace(CharSequence s, int pos)
	 */
	public int getLastPosition() {
		return pos;
	}
}