/**
 *  mmParser
 *  Copyright 2010 by Marc Nause, marc.nause@gmx.de, Braunschweig, Germany
 *  First released 27.12.2010 at http://yacy.net
 *
 *  $LastChangedDate$
 *  $LastChangedRevision$
 *  $LastChangedBy$
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.document.parser;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import javax.xml.XMLConstants;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

// this is a new implementation of this parser idiom using multiple documents as result set

/**
 * Parser for FreeMind mind map files (extension {@code mm}).
 *
 * <p>The XML input is read with SAX; the {@code TEXT} attribute of every
 * {@code node} element is collected in document order and the collected
 * texts are concatenated into the content of a single {@link Document}.
 */
public class mmParser extends AbstractParser implements Parser {

    /** SAX feature: resolve external general entities. */
    private static final String FEATURE_EXTERNAL_GENERAL_ENTITIES =
            "http://xml.org/sax/features/external-general-entities";

    /** SAX feature: resolve external parameter entities. */
    private static final String FEATURE_EXTERNAL_PARAMETER_ENTITIES =
            "http://xml.org/sax/features/external-parameter-entities";

    /** Xerces-specific feature: load the external DTD even when not validating. */
    private static final String FEATURE_LOAD_EXTERNAL_DTD =
            "http://apache.org/xml/features/nonvalidating/load-external-dtd";

    /** One SAX parser per thread; created lazily by {@link #getParser()}. */
    private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<>();

    /**
     * Registers the file extension {@code mm} and the MIME types
     * {@code application/freemind} and {@code application/x-freemind}.
     */
    public mmParser() {
        super("FreeMind Parser");
        this.SUPPORTED_EXTENSIONS.add("mm");
        this.SUPPORTED_MIME_TYPES.add("application/freemind");
        this.SUPPORTED_MIME_TYPES.add("application/x-freemind");
    }

    /**
     * Returns the SAX parser bound to the current thread, creating and
     * caching it on first use.
     *
     * <p>The parser is created from a factory with secure processing enabled
     * and external entity resolution disabled, because the parsed documents
     * come from arbitrary remote sources (XXE protection).
     *
     * @return the SAX parser of the calling thread
     * @throws SAXException if the parser cannot be configured or created
     */
    private static SAXParser getParser() throws SAXException {
        SAXParser parser = tlSax.get();
        if (parser == null) {
            try {
                final SAXParserFactory factory = SAXParserFactory.newInstance();
                factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
                factory.setFeature(FEATURE_EXTERNAL_GENERAL_ENTITIES, false);
                factory.setFeature(FEATURE_EXTERNAL_PARAMETER_ENTITIES, false);
                try {
                    // Xerces-specific; other implementations may not know it,
                    // so a failure here must not prevent parser creation.
                    factory.setFeature(FEATURE_LOAD_EXTERNAL_DTD, false);
                } catch (final ParserConfigurationException | SAXException e) {
                    AbstractParser.log.warn("mmParser: cannot disable external DTD loading: " + e.getMessage());
                }
                parser = factory.newSAXParser();
            } catch (final ParserConfigurationException e) {
                throw new SAXException(e.getMessage(), e);
            }
            tlSax.set(parser);
        }
        return parser;
    }

    /**
     * Parses a FreeMind document and returns exactly one {@link Document}.
     *
     * <p>The content is the text of all nodes, each followed by {@code ". "},
     * encoded via {@link UTF8#getBytes(String)}. The text of the first node
     * (or the empty string if there is none) is handed to the document
     * wrapped by {@code singleList(...)} (presumably as its title - confirm
     * against the {@code Document} constructor).
     *
     * <p>If reading or parsing fails, a warning is logged and a document with
     * empty content is returned; no exception is raised for that case here.
     *
     * @param location       URL of the parsed resource, passed on to the document
     * @param mimeType       MIME type, passed on to the document
     * @param charset        not used by this parser; the document is always
     *                       declared as UTF-8
     * @param scraper        not used by this parser
     * @param timezoneOffset not used by this parser
     * @param source         stream delivering the XML; it is not closed here
     * @return an array holding the single resulting document
     */
    @Override
    public Document[] parse(
            final DigestURL location,
            final String mimeType,
            final String charset,
            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source) throws Parser.Failure, InterruptedException {
        String rootElementText = "";
        byte[] content = new byte[0];

        try {
            final FreeMindHandler freeMindHandler = new FreeMindHandler();
            getParser().parse(source, freeMindHandler);

            final List<String> nodeTextList = freeMindHandler.getNodeText();
            rootElementText = nodeTextList.isEmpty() ? "" : nodeTextList.get(0);

            final StringBuilder sb = new StringBuilder();
            for (final String nodeText : nodeTextList) {
                sb.append(nodeText).append(". ");
            }
            content = UTF8.getBytes(sb.toString());
        } catch (final SAXException | IOException ex) {
            // best effort: keep the (empty) defaults and still return a document
            AbstractParser.log.warn(ex.getMessage());
        }

        return new Document[]{new Document(
            location,
            mimeType,
            StandardCharsets.UTF_8.name(),
            this,
            null,
            null,
            singleList(rootElementText),
            null,
            null,
            null,
            null,
            0.0d, 0.0d,
            content,
            null,
            null,
            null,
            false,
            new Date())};
    }

    /**
     * SAX handler collecting the {@code TEXT} attribute values of all
     * {@code node} elements in the order in which they are encountered.
     */
    private static class FreeMindHandler extends DefaultHandler {

        private final List<String> nodeText = new ArrayList<>();

        /**
         * Records the {@code TEXT} attribute of a {@code node} element;
         * elements with another name or without that attribute are ignored.
         */
        @Override
        public void startElement(final String uri, final String localName,
                final String qName, final Attributes attributes) {
            if (!"node".equals(qName)) {
                return;
            }
            final String textValue = attributes.getValue("TEXT");
            if (textValue != null) {
                this.nodeText.add(textValue);
            }
        }

        /**
         * @return the collected node texts (the live internal list, not a copy)
         */
        protected List<String> getNodeText() {
            return this.nodeText;
        }
    }
}