/**
 *  RSSReader
 *  Copyright 2007 by Michael Peter Christen
 *  First released 16.7.2007 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.cora.document.feed;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import net.yacy.cora.document.feed.RSSMessage.Token;

import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that reads an RSS or Atom document into an {@link RSSFeed}.
 * <p>
 * The whole document is parsed inside the public constructor; afterwards the
 * result is available through {@link #getFeed()} and the detected format
 * through {@link #getType()}.
 */
public class RSSReader extends DefaultHandler {

    /** Feed formats this reader distinguishes. Note: {@code rdf} is declared but never assigned by this class. */
    public enum Type { rss, atom, rdf, none }

    /** Byte arrays shorter than this are rejected by {@link #parse(int, byte[])} without parsing. */
    private static final int MIN_FEED_LENGTH = 100;

    /**
     * Resolves every external entity (including external DTDs) to an empty
     * document, so that parsing never loads external resources.
     * see: http://www.ibm.com/developerworks/xml/library/x-tipcfsx/index.html
     */
    private static final EntityResolver IGNORE_EXTERNAL_ENTITIES = new EntityResolver() {
        @Override
        public InputSource resolveEntity(final String publicId, final String systemId) {
            return new InputSource(new StringReader(""));
        }
    };

    /** One parser per thread; created lazily by {@link #getParser()}. */
    private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

    // class variables
    /** The message currently being filled: the channel description or the current item/entry. */
    private RSSMessage item;
    /** Collects character data of the element currently being read. */
    private final StringBuilder buffer;
    private boolean parsingChannel;
    private boolean parsingItem;
    private final RSSFeed theChannel;
    private Type type;

    /**
     * Creates an empty reader; no parsing is done.
     *
     * @param maxsize passed to the {@link RSSFeed} constructor
     */
    private RSSReader(final int maxsize) {
        this.theChannel = new RSSFeed(maxsize);
        this.buffer = new StringBuilder(300);
        this.item = null;
        this.parsingChannel = false;
        this.parsingItem = false;
        this.type = Type.none;
    }

    /**
     * Returns the SAX parser bound to the calling thread, creating it on first use.
     *
     * @return the thread's parser, never null
     * @throws SAXException if the parser cannot be created or configured
     */
    private static SAXParser getParser() throws SAXException {
        SAXParser parser = tlSax.get();
        if (parser == null) {
            try {
                parser = SAXParserFactory.newInstance().newSAXParser();
            } catch (final ParserConfigurationException e) {
                throw new SAXException(e.getMessage(), e);
            }
            tlSax.set(parser);
        }
        return parser;
    }

    /**
     * Parses the given stream as a feed. The stream is wrapped into a
     * {@link BufferedInputStream} unless it is already buffered or backed by
     * a byte array. The stream is not closed here.
     *
     * @param maxsize passed to the {@link RSSFeed} constructor
     * @param stream the XML source
     * @throws IOException on read errors, or when the document cannot be parsed as a feed;
     *         a parser failure is attached as the cause
     */
    public RSSReader(final int maxsize, InputStream stream) throws IOException {
        this(maxsize);
        if (!(stream instanceof ByteArrayInputStream) && !(stream instanceof BufferedInputStream)) {
            stream = new BufferedInputStream(stream);
        }
        try {
            final SAXParser saxParser = getParser();
            // do not look at external dtd.
            // SAXParser.parse(stream, handler) installs the handler itself as the
            // entity resolver, so the resolver set here is only a safety net; the
            // effective suppression is the resolveEntity override of this class.
            saxParser.getXMLReader().setEntityResolver(IGNORE_EXTERNAL_ENTITIES);
            saxParser.parse(stream, this);
        } catch (final SAXException e) {
            // keep the original message for callers, but do not lose the cause
            throw new IOException(e.getMessage(), e);
        }
    }

    /**
     * @return the feed format detected while parsing, {@link Type#none} if none was recognized
     */
    public Type getType() {
        return this.type;
    }

    /**
     * Parses a feed from a byte array.
     *
     * @param maxsize passed to the {@link RSSFeed} constructor
     * @param a the raw XML document
     * @return the reader holding the parsed feed, or null if {@code a} is null or shorter than 100 bytes
     * @throws IOException if parsing fails for any reason; the original exception is the cause
     */
    public static RSSReader parse(final int maxsize, final byte[] a) throws IOException {
        // check integrity of array
        if (a == null || a.length < MIN_FEED_LENGTH) {
            // returning null instead of throwing an IOException is expected in most
            // calling methods where a fail is checked against null
            return null;
        }

        // make input stream
        final ByteArrayInputStream bais = new ByteArrayInputStream(a);

        // parse stream
        final RSSReader reader;
        try {
            reader = new RSSReader(maxsize, bais);
        } catch (final Exception e) {
            throw new IOException("parse exception: " + e.getMessage(), e);
        }
        try {
            bais.close();
        } catch (final IOException ignored) {
            // nothing useful can be done if closing an in-memory stream fails
        }
        return reader;
    }

    /**
     * Answers every external entity request with an empty document.
     * <p>
     * This override is what actually prevents loading of external DTDs:
     * {@code SAXParser.parse(InputStream, DefaultHandler)} registers the
     * handler as the entity resolver of the underlying reader, and the
     * inherited implementation would return null (= use default resolution).
     */
    @Override
    public InputSource resolveEntity(final String publicId, final String systemId) throws IOException, SAXException {
        return IGNORE_EXTERNAL_ENTITIES.resolveEntity(publicId, systemId);
    }

    /**
     * Tracks the document structure: opens the channel description on
     * {@code channel}/{@code feed}, opens a new message on {@code item}/{@code entry}
     * and picks up the {@code href} of Atom {@code link} elements inside an entry.
     */
    @Override
    public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
        if ("channel".equals(tag)) {
            this.type = Type.rss;
            this.item = new RSSMessage();
            this.parsingChannel = true;
        } else if ("feed".equals(tag)) {
            this.type = Type.atom;
            this.item = new RSSMessage();
            this.parsingChannel = true;
        } else if ("item".equals(tag) || "entry".equals(tag)) {
            if (this.parsingChannel) {
                // the channel ends with the first item not with the channel close tag
                this.theChannel.setChannel(this.item);
                this.parsingChannel = false;
            }
            // text collected before this point does not belong to the new message
            this.buffer.setLength(0);
            this.item = new RSSMessage();
            this.parsingItem = true;
        } else if (this.parsingItem && this.type == Type.atom && "link".equals(tag)) {
            // atom link handling (rss link is handled in endElement)
            final String rel = atts.getValue("rel");
            if (rel == null || rel.equals("alternate")) {
                final String url = atts.getValue("href");
                if (url != null && url.length() > 0) {
                    this.item.setValue(Token.link, url);
                }
            }
        } else if ("rss".equals(tag)) {
            this.type = Type.rss;
        }
    }

    /**
     * Stores the collected text of a closing element in the current message,
     * and finishes the channel description or the current message on the
     * respective closing tags.
     *
     * @throws SAXException if an element closes outside of a channel or item
     *         while no feed format has been recognized
     */
    @Override
    public void endElement(final String uri, final String name, final String tag) throws SAXException {
        if (tag == null) return;
        if ("channel".equals(tag) || "feed".equals(tag)) {
            if (this.parsingChannel) this.theChannel.setChannel(this.item);
            this.parsingChannel = false;
        } else if ("item".equals(tag) || "entry".equals(tag)) {
            this.theChannel.addMessage(this.item);
            this.parsingItem = false;
            // drop trailing text of this message so it cannot leak into the next one
            this.buffer.setLength(0);
        } else if (this.parsingItem) {
            final String value = this.buffer.toString().trim();
            this.buffer.setLength(0);
            // empty values are skipped; this also keeps an atom link set in startElement
            if (RSSMessage.tags.contains(tag) && value.length() > 0) {
                this.item.setValue(RSSMessage.valueOfNick(tag), value);
            }
        } else if (this.parsingChannel) {
            final String value = this.buffer.toString().trim();
            this.buffer.setLength(0);
            if (RSSMessage.tags.contains(tag)) {
                this.item.setValue(RSSMessage.valueOfNick(tag), value);
            }
        } else if (this.type == Type.none) {
            // give up if we don't know the feed format
            throw new SAXException("response incomplete or unknown feed format");
        }
    }

    /**
     * Collects character data, but only while inside a channel description or a message.
     */
    @Override
    public void characters(final char ch[], final int start, final int length) {
        if (this.parsingItem || this.parsingChannel) {
            this.buffer.append(ch, start, length);
        }
    }

    /**
     * @return the feed that was filled during parsing
     */
    public RSSFeed getFeed() {
        return this.theChannel;
    }
}