/**
* RSSReader
* Copyright 2007 by Michael Peter Christen
* First released 16.7.2007 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.document.feed;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.feed.RSSMessage.Token;
import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX content handler that reads an RSS or Atom document from a stream and
 * collects the channel description and all items/entries into an
 * {@link RSSFeed}.
 *
 * <p>The whole document is parsed inside the public constructor; afterwards the
 * result is available through {@link #getFeed()} and the detected feed flavour
 * through {@link #getType()}.</p>
 *
 * <p>The underlying {@link SAXParser} is cached per thread. A reader instance
 * itself holds mutable parse state and is only used by the thread that
 * constructs it.</p>
 */
public class RSSReader extends DefaultHandler {

    /**
     * Feed flavour detected while parsing. Within this class only {@code rss}
     * and {@code atom} are ever assigned; {@code none} is the initial value.
     */
    public enum Type { rss, atom, rdf, none }

    /** One SAX parser per thread; parsers are reused across reader instances. */
    private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

    /** Message currently being filled: the channel description or the current item/entry. */
    private RSSMessage item;

    /** Collects character data of the element that is currently open. */
    private final StringBuilder buffer;

    /** State flags: inside the channel header / inside an item or entry. */
    private boolean parsingChannel, parsingItem;

    /** The feed that receives the channel description and all parsed messages. */
    private final RSSFeed theChannel;

    /** Feed flavour detected so far. */
    private Type type;

    /**
     * Initializes the parse state without reading anything.
     *
     * @param maxsize value handed to the {@link RSSFeed} constructor
     */
    private RSSReader(final int maxsize) {
        this.theChannel = new RSSFeed(maxsize);
        this.buffer = new StringBuilder(300);
        this.item = null;
        this.parsingChannel = false;
        this.parsingItem = false;
        this.type = Type.none;
    }

    /**
     * Returns the SAX parser cached for the calling thread, creating it on
     * first use.
     *
     * @return the parser bound to the current thread
     * @throws SAXException if a new parser cannot be created or configured
     */
    private static SAXParser getParser() throws SAXException {
        SAXParser parser = tlSax.get();
        if (parser == null) {
            try {
                parser = SAXParserFactory.newInstance().newSAXParser();
            } catch (final ParserConfigurationException e) {
                throw new SAXException(e.getMessage(), e);
            }
            tlSax.set(parser);
        }
        return parser;
    }

    /**
     * Parses the given stream completely.
     *
     * @param maxsize value handed to the {@link RSSFeed} constructor
     * @param stream the feed document; it is wrapped into a
     *        {@link BufferedInputStream} unless it is already buffered or
     *        backed by a byte array. The stream is not closed here.
     * @throws IOException if reading fails or the parser reports an error
     *         (this includes the "unknown feed format" abort raised in
     *         {@link #endElement(String, String, String)}); a SAX failure is
     *         attached as the cause
     */
    public RSSReader(final int maxsize, InputStream stream) throws IOException {
        this(maxsize);
        if (!(stream instanceof ByteArrayInputStream) && !(stream instanceof BufferedInputStream)) {
            stream = new BufferedInputStream(stream);
        }
        try {
            final SAXParser saxParser = getParser();
            // SAXParser.parse(InputStream, DefaultHandler) registers the handler as the
            // entity resolver of the underlying XMLReader, replacing any resolver that was
            // set on the reader beforehand. The protection against loading external DTDs
            // therefore lives in the resolveEntity override of this class.
            saxParser.parse(stream, this);
        } catch (final SAXException e) {
            // keep the original exception as cause so that the stack trace is not lost
            throw new IOException(e.getMessage(), e);
        }
    }

    /**
     * Answers every request for an external entity (including an external DTD)
     * with an empty document, so that the parser never opens the referenced
     * system identifier. The inherited implementation returns {@code null},
     * which tells the parser to load the resource itself.
     * See: http://www.ibm.com/developerworks/xml/library/x-tipcfsx/index.html
     *
     * @param publicId public identifier of the entity, may be {@code null}; ignored
     * @param systemId system identifier of the entity; ignored
     * @return an input source with empty content
     */
    @Override
    public InputSource resolveEntity(final String publicId, final String systemId) {
        return new InputSource(new StringReader(""));
    }

    /**
     * @return the feed flavour detected during parsing, {@link Type#none} if
     *         no known root/channel element was seen
     */
    public Type getType() {
        return this.type;
    }

    /**
     * Parses a feed that is held completely in memory.
     *
     * @param maxsize value handed to the {@link RSSFeed} constructor
     * @param a the raw feed document
     * @return the reader holding the parsed feed, or {@code null} if
     *         {@code a} is {@code null} or shorter than 100 bytes
     * @throws IOException if parsing fails for any reason; the original
     *         exception is attached as the cause
     */
    public static RSSReader parse(final int maxsize, final byte[] a) throws IOException {
        // check integrity of array
        if (a == null || a.length < 100) {
            // returning null instead of throwing an IOException is expected in most
            // calling methods where a fail is checked against null
            return null;
        }

        // make input stream
        final ByteArrayInputStream bais = new ByteArrayInputStream(a);

        // parse stream
        RSSReader reader = null;
        try {
            reader = new RSSReader(maxsize, bais);
        } catch (final Exception e) {
            throw new IOException("parse exception: " + e.getMessage(), e);
        }
        try {
            bais.close();
        } catch (final IOException ignored) {
            // nothing to do: the parse result is already complete at this point
        }
        return reader;
    }

    /**
     * Tracks the feed structure: opens the channel description on
     * {@code channel}/{@code feed}, opens a new message on
     * {@code item}/{@code entry}, and picks up the {@code href} of Atom
     * {@code link} elements inside an entry.
     */
    @Override
    public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
        if ("channel".equals(tag)) {
            this.type = Type.rss;
            this.item = new RSSMessage();
            this.parsingChannel = true;
        } else if ("feed".equals(tag)) {
            this.type = Type.atom;
            this.item = new RSSMessage();
            this.parsingChannel = true;
        } else if ("item".equals(tag) || "entry".equals(tag)) {
            if (this.parsingChannel) {
                // the channel ends with the first item not with the channel close tag
                this.theChannel.setChannel(this.item);
                this.parsingChannel = false;
            }
            this.item = new RSSMessage();
            this.parsingItem = true;
        } else if (this.parsingItem && this.type == Type.atom && "link".equals(tag) && (atts.getValue("rel") == null || atts.getValue("rel").equals("alternate"))) {
            // atom link handling (rss link is handled in endElement):
            // only links without a rel attribute or with rel="alternate" are taken
            final String url = atts.getValue("href");
            if (url != null && url.length() > 0) this.item.setValue(Token.link, url);
        } else if ("rss".equals(tag)) {
            this.type = Type.rss;
        }
    }

    /**
     * Closes the channel description or the current message, or stores the
     * collected text of a known child element in the current message.
     *
     * @throws SAXException if an element ends outside of a channel/item while
     *         no known feed format has been detected yet
     */
    @Override
    public void endElement(final String uri, final String name, final String tag) throws SAXException {
        if (tag == null) return;
        if ("channel".equals(tag) || "feed".equals(tag)) {
            // a feed without any item: the channel description is still pending
            if (this.parsingChannel) this.theChannel.setChannel(this.item);
            this.parsingChannel = false;
        } else if ("item".equals(tag) || "entry".equals(tag)) {
            this.theChannel.addMessage(this.item);
            this.parsingItem = false;
        } else if (this.parsingItem) {
            final String value = this.buffer.toString().trim();
            this.buffer.setLength(0);
            // inside an item only non-empty values are stored
            if (RSSMessage.tags.contains(tag) && value.length() > 0) this.item.setValue(RSSMessage.valueOfNick(tag), value);
        } else if (this.parsingChannel) {
            final String value = this.buffer.toString().trim();
            this.buffer.setLength(0);
            // inside the channel header a known tag is stored even if its value is empty
            if (RSSMessage.tags.contains(tag)) this.item.setValue(RSSMessage.valueOfNick(tag), value);
        } else if (this.type == Type.none) {
            // give up if we don't know the feed format
            throw new SAXException("response incomplete or unknown feed format");
        }
    }

    /**
     * Appends character data to the text buffer while inside a channel header
     * or an item; text outside of these is dropped.
     */
    @Override
    public void characters(final char ch[], final int start, final int length) {
        if (this.parsingItem || this.parsingChannel) {
            this.buffer.append(ch, start, length);
        }
    }

    /**
     * @return the feed filled during parsing
     */
    public RSSFeed getFeed() {
        return this.theChannel;
    }
}