package net.vhati.modmanager.core;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.xml.sax.SAXParseException;
import org.jdom2.Attribute;
import org.jdom2.AttributeType;
import org.jdom2.CDATA;
import org.jdom2.Comment;
import org.jdom2.Content;
import org.jdom2.DefaultJDOMFactory;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.IllegalAddException;
import org.jdom2.JDOMFactory;
import org.jdom2.Namespace;
import org.jdom2.Parent;
import org.jdom2.Text;
import org.jdom2.input.JDOMParseException;
/**
* A scraper for malformed XML.
*
* Sloppiness:
* Any closing tag, regardless of its name, closes the parent tag.
* <!-- <!-- blah --> is valid.
* The example above will become two comments. Any extra dashes will
* be discarded.
* --> can occur alone (discarded).
* An attribute name can start right after the quote from a prior value.
* Namespace prefixes for nodes and attributes are unique.
* (Each prefix will be used as the namespace's URI).
* Unrecognized named entities (&...;) and lone ampersands are accepted
* as literal text. (Those ampersands will be escaped if outputted).
*
* The text must have \n line endings.
*
* If a line/column aware JDOMFactory is passed to the constructor,
* that factory will receive locations for Elements (start tags).
* That will be the 1-based line/col of the end character,
* plus 1 col.
*
* If parsing fails, the thrown JDOMParseException has getter methods
* to report the nearest upcoming non-whitespace character, from where
* the parser gave up.
*
* Only use this as a last resort, after a real parser fails.
*
* @see org.jdom2.input.JDOMParseException
* @see org.jdom2.located.LocatedJDOMFactory
*/
public class SloppyXMLParser {
// Chunk patterns. build() tries them at the current position, in the order
// the constructor registers them. Except for eTagPtn and endSpacePtn,
// group 1 captures any whitespace leading up to the chunk.

// An <?xml ... ?> declaration.
private final Pattern declPtn = Pattern.compile( "(\\s*)<[?]xml [^?]*[?]>" );
// A comment with nothing in it.
private final Pattern emptyCommentPtn = Pattern.compile( "(\\s*)<!---->" );
// A comment; group 2 is everything up to the first "-->".
private final Pattern commentPtn = Pattern.compile( "(?s)(\\s*)<!--((?:.(?!-->))*.)-->" );
// A CDATA section with nothing in it.
private final Pattern emptyCDATAPtn = Pattern.compile( "(\\s*)<!\\[CDATA\\[\\]\\]>" );
// A CDATA section; group 2 is everything up to the first "]]>".
private final Pattern cdataPtn = Pattern.compile( "(?s)(\\s*)<!\\[CDATA\\[((?:.(?!\\]\\]>))*.)\\]\\]>" );
// A start tag: optional prefix (2), name (3), raw attributes (4), and "/" if self-closing (5).
private final Pattern sTagPtn = Pattern.compile( "(\\s*)<(?:([\\w.-]+):)?([\\w.-]+)((?: [^>]+?)??)\\s*(/?)>" );
// Text (1) followed by an end tag, whose name (2) is not checked.
private final Pattern eTagPtn = Pattern.compile( "([^<]*)</\\s*([^>]+)>" );
// Whitespace running to the end of the text.
private final Pattern endSpacePtn = Pattern.compile( "\\s+$" );
// A stray "-->" or a single stray punctuation character.
private final Pattern strayCharsPtn = Pattern.compile( "(\\s*)(?:-->|[-.>,])" );

// One attribute: optional prefix (1), name (2), and quoted value (3, quotes included).
private final Pattern attrPtn = Pattern.compile( "\\s*(?:([\\w.-]+):)?([\\w.-]+)\\s*=\\s*(\"[^\"]*\"|'[^']*')" );

// An entity reference: decimal digits (1), hex digits (2), or a name (3).
// A name can't contain whitespace or another ampersand. Otherwise a lone
// '&' would match all the way to the next ';' and swallow any genuine
// reference in between (e.g. the "&amp;" in "Tom & Jerry &amp; Co").
private final Pattern entityPtn = Pattern.compile( "&(?:(?:#([0-9]+))|(?:#x([0-9A-Fa-f]+))|([^;&\\s]+));" );

// A single line break.
private final Pattern breakPtn = Pattern.compile( "\n" );

// The chunk patterns, in the order they're tried.
private final List<Pattern> chunkPtns = new ArrayList<Pattern>();
// Named entities (without '&' and ';') mapped to their replacement text.
private final Map<String,String> entityMap = new HashMap<String,String>();

// Constructs every node that build() creates.
private final JDOMFactory factory;

// The offset build() is currently looking at, or -1 before the first build().
private int pos = -1;
/**
 * Creates a parser that constructs its nodes with a DefaultJDOMFactory.
 */
public SloppyXMLParser() {
  this( new DefaultJDOMFactory() );
}
public SloppyXMLParser( JDOMFactory factory ) {
if ( factory == null ) factory = new DefaultJDOMFactory();
this.factory = factory;
chunkPtns.add( declPtn );
chunkPtns.add( emptyCommentPtn );
chunkPtns.add( commentPtn );
chunkPtns.add( emptyCDATAPtn );
chunkPtns.add( cdataPtn );
chunkPtns.add( sTagPtn );
chunkPtns.add( eTagPtn );
chunkPtns.add( endSpacePtn );
chunkPtns.add( strayCharsPtn );
entityMap.put( "lt", "<" );
entityMap.put( "gt", ">" );
entityMap.put( "amp", "&" );
entityMap.put( "apos", "'" );
entityMap.put( "quot", "\"" );
}
/**
 * Scrapes possibly-malformed XML text into a Document.
 *
 * The text is consumed one chunk at a time. At each position, the
 * registered chunk patterns are tried in order, and the first one to
 * match decides what gets added beneath the current parent node.
 * A start tag that isn't self-closing becomes the new parent;
 * any end tag steps back up to the parent's parent.
 *
 * Everything is initially collected under a temporary "wrapper" root
 * element. If the wrapper ends up with exactly one child element, that
 * child is promoted to be the Document's root, and it receives the
 * namespace declarations that were collected on the wrapper.
 *
 * @param s  the text to parse
 * @return a Document constructed by this parser's factory
 * @throws JDOMParseException if no chunk pattern matches at some position, if a start tag's attributes can't be completely parsed, or if adding a node raises an IllegalAddException
 */
public Document build( CharSequence s ) throws JDOMParseException {
  Element rootNode = factory.element( "wrapper" );
  Document doc = factory.document( rootNode );

  Parent parentNode = rootNode;
  int sLen = s.length();
  int lastPos = -1;
  pos = 0;
  int[] lastLineAndCol = new int[] {0, 0};  // Counts \n's and chars after the last \n.

  Matcher m = declPtn.matcher( s );

  try {
    // The pos > lastPos test guards against a chunk that consumes nothing.
    while ( pos > lastPos && pos < sLen ) {
      m.region( pos, sLen );
      boolean matchedChunk = false;

      for ( Pattern chunkPtn : chunkPtns ) {
        m.usePattern( chunkPtn );
        if ( !m.lookingAt() ) continue;

        if ( chunkPtn == declPtn ) {
          // Don't care.
          addLineAndCol( lastLineAndCol, m.group(0) );
        }
        else if ( chunkPtn == emptyCommentPtn ) {
          // Only the leading whitespace is kept; no Comment is added.
          addWhitespace( parentNode, m.group( 1 ) );
          addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
        }
        else if ( chunkPtn == commentPtn ) {
          addWhitespace( parentNode, m.group( 1 ) );
          addSplicedComments( parentNode, m.group( 2 ) );
          addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
        }
        else if ( chunkPtn == emptyCDATAPtn ) {
          // Only the leading whitespace is kept; no CDATA is added.
          addWhitespace( parentNode, m.group( 1 ) );
          addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
        }
        else if ( chunkPtn == cdataPtn ) {
          addWhitespace( parentNode, m.group( 1 ) );
          CDATA cdataNode = factory.cdata( m.group(2) );
          factory.addContent( parentNode, cdataNode );
          addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
        }
        else if ( chunkPtn == sTagPtn ) {
          addWhitespace( parentNode, m.group( 1 ) );

          String nodePrefix = m.group( 2 );  // Might be null.
          String nodeName = m.group( 3 );
          String attrString = m.group( 4 );
          boolean selfClosing = ( m.group( 5 ).length() > 0 );

          // Tally through the end of the tag first, so the factory
          // receives a location derived from the tag's last char.
          addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
          int tagLine = lastLineAndCol[0]+1;
          int tagCol = lastLineAndCol[1]+1+1;

          Element tagNode;
          if ( nodePrefix != null ) {
            Namespace nodeNS = Namespace.getNamespace( nodePrefix, nodePrefix );  // URI? *shrug*
            factory.addNamespaceDeclaration( rootNode, nodeNS );
            tagNode = factory.element( tagLine, tagCol, nodeName, nodeNS );
          } else {
            tagNode = factory.element( tagLine, tagCol, nodeName );
          }

          addAttributes( s, rootNode, tagNode, attrString );

          factory.addContent( parentNode, tagNode );
          if ( !selfClosing ) parentNode = tagNode;
        }
        else if ( chunkPtn == eTagPtn ) {
          // The end tag's name is ignored; it simply closes the current parent.
          String interimText = unescape( m.group( 1 ) );
          factory.addContent( parentNode, factory.text( interimText ) );
          parentNode = parentNode.getParent();
          addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
        }
        else if ( chunkPtn == endSpacePtn ) {
          // This is the end of the document.
        }
        else if ( chunkPtn == strayCharsPtn ) {
          // Non-space junk between an end tag and a start tag.
          addWhitespace( parentNode, m.group( 1 ) );
          addLineAndCol( lastLineAndCol, s, m.start(), m.end() );
        }

        matchedChunk = true;
        lastPos = pos;
        pos = m.end();
        break;
      }

      if ( !matchedChunk ) {
        throw createParseException( s, "Unexpected characters.", null );
      }
    }

    if ( rootNode.getChildren().size() == 1 ) {
      // No need for the wrapper, promote its only child to root.
      Element newRoot = rootNode.getChildren().get( 0 );
      newRoot.detach();
      for ( Namespace ns : rootNode.getAdditionalNamespaces() ) {
        factory.addNamespaceDeclaration( newRoot, ns );
      }
      factory.setRoot( doc, newRoot );
    }
  }
  catch ( IllegalAddException e ) {
    String hint = "";
    if ( e.getMessage() != null && e.getMessage().indexOf( "not allowed at the document root" ) != -1 ) {
      hint = " (There's likely an extraneous closing tag before this point.)";
    }
    throw createParseException( s, e.getMessage() + hint, e );
  }

  return doc;
}

// Finds the start of a comment nested inside another comment's text,
// along with any whitespace leading up to it (group 1).
private final Pattern splicedCommentPtn = Pattern.compile( "(\\s*)<!--" );

// Finds dashes a comment can't keep: leading ones, trailing ones,
// and any that directly follow another dash.
private final Pattern extraDashesPtn = Pattern.compile( "^-+|(?<=-)-+|-+$" );

/**
 * Adds a Text node for a chunk's leading whitespace, unless it's empty.
 */
private void addWhitespace( Parent parentNode, String whitespace ) {
  if ( whitespace.length() > 0 )
    factory.addContent( parentNode, factory.text( whitespace ) );
}

/**
 * Adds the text found inside a comment as one or more Comment nodes,
 * splitting wherever another comment start occurs within the text.
 */
private void addSplicedComments( Parent parentNode, String commentText ) {
  if ( commentText.length() == 0 ) {
    factory.addContent( parentNode, factory.comment( "" ) );
    return;
  }

  Matcher splicedMatcher = splicedCommentPtn.matcher( commentText );
  int commentStart = 0;
  while ( splicedMatcher.find() ) {
    if ( splicedMatcher.start() - commentStart > 0 ) {
      String splicedChunk = commentText.substring( commentStart, splicedMatcher.start() );
      splicedChunk = extraDashesPtn.matcher( splicedChunk ).replaceAll( "" );
      // The chunk's trailing whitespace went to the matcher's group 1,
      // so a chunk that began with a space gets one back on the end.
      if ( splicedChunk.startsWith( " " ) ) splicedChunk += " ";
      Comment commentNode = factory.comment( splicedChunk );
      factory.addContent( parentNode, commentNode );
    }
    if ( splicedMatcher.group(1).length() > 0 ) {
      // Whitespace between comments.
      factory.addContent( parentNode, factory.text( splicedMatcher.group(1) ) );
    }
    commentStart = splicedMatcher.end();
  }
  if ( commentStart < commentText.length() ) {
    String finalChunk = commentText.substring( commentStart );
    finalChunk = extraDashesPtn.matcher( finalChunk ).replaceAll( "" );
    Comment commentNode = factory.comment( finalChunk );
    factory.addContent( parentNode, commentNode );
  }
}

/**
 * Parses a start tag's raw attribute string, applying each attribute or
 * namespace declaration to the tag (or to the root, for prefixes).
 *
 * @throws JDOMParseException if part of the string isn't a recognizable attribute
 */
private void addAttributes( CharSequence s, Element rootNode, Element tagNode, String attrString ) throws JDOMParseException {
  if ( attrString.length() == 0 ) return;

  Matcher am = attrPtn.matcher( attrString );
  while ( am.lookingAt() ) {
    String attrPrefix = am.group( 1 );  // Might be null.
    String attrName = am.group( 2 );
    String attrValue = am.group( 3 );
    attrValue = attrValue.substring( 1, attrValue.length()-1 );  // Drop the quotes.
    attrValue = unescape( attrValue );

    if ( attrPrefix != null ) {
      if ( attrPrefix.equals( "xmlns" ) ) {
        // This is a pseudo attribute declaring a namespace prefix.
        // Move it to the root node.
        Namespace attrNS = Namespace.getNamespace( attrName, attrName );  // URI? *shrug*
        factory.addNamespaceDeclaration( rootNode, attrNS );
      }
      else {
        Namespace attrNS = Namespace.getNamespace( attrPrefix, attrPrefix );  // URI? *shrug*
        factory.addNamespaceDeclaration( rootNode, attrNS );

        Attribute attrObj = factory.attribute( attrName, attrValue, AttributeType.UNDECLARED, attrNS );
        factory.setAttribute( tagNode, attrObj );
      }
    } else if ( attrName.equals("xmlns") ) {
      // New default namespace URI within this node.
      Namespace attrNS = Namespace.getNamespace( attrValue );
      factory.addNamespaceDeclaration( tagNode, attrNS );
    } else {
      // Normal attribute.
      Attribute attrObj = factory.attribute( attrName, attrValue, AttributeType.UNDECLARED, Namespace.NO_NAMESPACE );
      factory.setAttribute( tagNode, attrObj );
    }
    // Continue from where this attribute ended.
    am.region( am.end(), am.regionEnd() );
  }

  if ( am.regionStart() < attrString.length() ) {
    // Something was left over that didn't look like an attribute.
    throw createParseException( s, "Strange attributes.", null );
  }
}

/**
 * Creates an exception describing a failure near the current position.
 *
 * The reported line/col is that of the next non-whitespace character at
 * or after the current position, or of the position itself if only
 * whitespace remains.
 *
 * @param s  the text being parsed
 * @param detail  what went wrong
 * @param origin  the exception that prompted this one, or null
 */
private JDOMParseException createParseException( CharSequence s, String detail, Exception origin ) {
  int nonspacePos = findNextNonspace( s, pos );
  int errorPos = ( (nonspacePos != -1) ? nonspacePos : pos );
  int[] lineAndCol = getLineAndCol( s, errorPos );
  int lineNum = lineAndCol[0];
  int colNum = lineAndCol[1];

  String saxMessage = String.format( "At line %d, column %d: %s", lineNum, colNum, detail );
  SAXParseException cause;
  if ( origin != null ) {
    cause = new SAXParseException( saxMessage, null, null, lineNum, colNum, origin );
  } else {
    cause = new SAXParseException( saxMessage, null, null, lineNum, colNum );
  }
  return new JDOMParseException( String.format( "Error on line %d: %s", lineNum, cause.getMessage() ), cause );
}
/**
 * Unescapes standard named entities and numeric character references.
 * This applies to attributes and element values.
 *
 * They are: lt, gt, quot, apos, amp, #1234, #x1a2b.
 *
 * A numeric reference may name any valid Unicode code point, including
 * those that need a surrogate pair. Unknown named entities, and numeric
 * references that aren't valid code points, are repeated as-is.
 *
 * @param s  the text to unescape
 * @return the text, with every recognized reference replaced
 */
public String unescape( String s ) {
  StringBuffer buf = new StringBuffer( s.length() );
  Matcher m = entityPtn.matcher( s );

  while ( m.find() ) {
    String decRef = m.group( 1 );
    String hexRef = m.group( 2 );
    String entName = m.group( 3 );
    String entity = null;

    if ( decRef != null || hexRef != null ) {
      // Decimal or hex character reference.
      try {
        int codePoint = ( decRef != null ) ? Integer.parseInt( decRef ) : Integer.parseInt( hexRef, 16 );
        if ( Character.isValidCodePoint( codePoint ) ) {
          // toChars() yields a surrogate pair when one char isn't enough.
          entity = new String( Character.toChars( codePoint ) );
        }
      }
      catch ( NumberFormatException ignored ) {
        // Too many digits to fit an int, so it can't be a code point.
        // Fall through and repeat the reference as-is.
      }
    }
    else {
      entity = entityMap.get( entName );
    }

    if ( entity == null ) {
      // Unknown entity, repeat it as-is.
      entity = m.group( 0 );
    }

    // Quote the replacement, or else any '$' or '\' in it
    // would be interpreted by appendReplacement().
    m.appendReplacement( buf, Matcher.quoteReplacement( entity ) );
  }
  m.appendTail( buf );

  return buf.toString();
}
/**
 * Returns the position of the first non-whitespace character at or
 * after pos.
 *
 * Whitespace means a space, tab, line feed, vertical tab, form feed,
 * or carriage return.
 *
 * @param s  the text to search
 * @param pos  a 0-based offset to start searching from (inclusive)
 * @return the 0-based offset of that character, or -1 if there isn't one
 * @throws IndexOutOfBoundsException if pos is negative or greater than the text's length
 */
public int findNextNonspace( CharSequence s, int pos ) {
  int limit = s.length();
  if ( pos < 0 || pos > limit ) {
    throw new IndexOutOfBoundsException( "Illegal start index" );
  }

  for ( int i = pos; i < limit; i++ ) {
    switch ( s.charAt( i ) ) {
      case ' ':
      case '\t':
      case '\n':
      case 0x0B:  // Vertical tab.
      case '\f':
      case '\r':
        break;    // Still whitespace, keep scanning.
      default:
        return i;
    }
  }
  return -1;
}
/**
 * Advances a running tally of lines, and of the col on the current line,
 * past a span of text.
 *
 * If the span contains no \n, the col grows by (end-1 - start).
 * Otherwise the line count grows by the number of \n's in the span, and
 * the col becomes the distance from the last \n to the span's final char.
 * Nothing changes if the text is empty or the span is.
 *
 * NOTE(review): In the same-line case, the col grows by one less than
 * the span's length, so the col may lag when several spans share a
 * line - confirm whether that's intended.
 *
 * @param lastLineAndCol the current tally to increment (0-based)
 * @param s a string to check for \n's
 * @param start a start index in the string to search from (inclusive)
 * @param end an end index in the string (exclusive)
 */
private void addLineAndCol( int[] lastLineAndCol, CharSequence s, int start, int end ) {
  boolean nothingToScan = ( s.length() == 0 || start == end );
  if ( nothingToScan ) return;

  Matcher breaks = breakPtn.matcher( s );
  breaks.region( start, end );

  int newlineCount = 0;
  int finalBreakPos = -1;
  for ( ; breaks.find(); newlineCount++ ) {
    finalBreakPos = breaks.start();
  }

  int finalCharPos = end - 1;
  if ( newlineCount > 0 ) {
    // On a new line now, reset the col.
    lastLineAndCol[0] += newlineCount;
    lastLineAndCol[1] = finalCharPos - finalBreakPos;
  } else {
    // Same line, a few more chars in. Increment col.
    lastLineAndCol[1] += finalCharPos - start;
  }
}
/**
 * Advances a running tally of lines, and of the col on the current line,
 * past an entire string.
 *
 * @param lastLineAndCol the current tally to increment
 * @param s a string to check for \n's, from its first char to its last
 */
private void addLineAndCol( int[] lastLineAndCol, CharSequence s ) {
  int wholeLength = s.length();
  addLineAndCol( lastLineAndCol, s, 0, wholeLength );
}
/**
 * Returns lineNum and colNum for a position in text.
 * The first line is line 1.
 * Line breaks start a new line as col 0.
 * The first char of each line, after the break is col 1.
 *
 * A position at or beyond the end of the text is treated as the
 * position just past the final character.
 *
 * @param s  the text that pos is an offset into
 * @param pos a 0-based offset
 * @return 1-based ints for line and col (the first char is line 1, col 1)
 * @throws IndexOutOfBoundsException if pos is less than -1
 * @see org.jdom2.input.JDOMParseException
 */
public int[] getLineAndCol( CharSequence s, int pos ) {
  int sLen = s.length();
  pos = Math.min( pos, sLen );

  Matcher breakMatcher = breakPtn.matcher( s );
  // Include pos itself in case it's a break, but never reach past the
  // end of the text (a region can't extend beyond it).
  breakMatcher.region( 0, Math.min( pos+1, sLen ) );

  int breakCount = 0;
  int lastBreakPos = -1;
  while ( breakMatcher.find() ) {
    lastBreakPos = breakMatcher.start();
    breakCount++;
  }

  int colNum;
  if ( lastBreakPos == -1 )
    colNum = pos+1;  // Pretend ^ was column 0, as a \n would.
  else
    colNum = pos - lastBreakPos;

  return new int[] { breakCount+1, colNum };
}
/**
 * Returns the last character offset this parser was looking at.
 *
 * That is the offset just past the last chunk build() consumed, which
 * is also where it would have been looking when it gave up.
 * Usually this will be a patch of whitespace prior to unrecognized chars.
 * This method is a fallback when an unexpected exception doesn't provide
 * line info.
 *
 * @return a 0-based offset, or -1 if build() has not been called yet
 * @see #findNextNonspace(CharSequence, int)
 */
public int getLastPosition() {
  return this.pos;
}
}