/** * */ package org.voyanttools.trombone.input.extract; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Set; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.html.HtmlParser; import org.apache.tika.parser.xml.XMLParser; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** * @author sgs * */ public class XmlOrHtmlTikaParser extends AbstractParser { private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( MediaType.application("xml"), MediaType.text("html"), MediaType.application("xhtml+xml")))); /** * */ public XmlOrHtmlTikaParser() { } /* (non-Javadoc) * @see org.apache.tika.parser.Parser#getSupportedTypes(org.apache.tika.parser.ParseContext) */ @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } /* (non-Javadoc) * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, org.apache.tika.parser.ParseContext) */ @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { Detector detector = new DefaultDetector(); MediaType mediaType = detector.detect(stream, metadata); if (mediaType==MediaType.TEXT_HTML) { new HtmlParser().parse(stream, handler, metadata, context); } else { new XMLParser().parse(stream, handler, metadata, context); } } }