package org.myrobotlab.service;

import java.net.MalformedURLException;
import java.net.URL;

import org.myrobotlab.document.Document;
import org.myrobotlab.document.connector.AbstractConnector;
import org.myrobotlab.document.connector.ConnectorState;
import org.myrobotlab.document.transformer.ConnectorConfig;
import org.myrobotlab.framework.ServiceType;

import it.sauronsoftware.feed4j.FeedIOException;
import it.sauronsoftware.feed4j.FeedParser;
import it.sauronsoftware.feed4j.FeedXMLParseException;
import it.sauronsoftware.feed4j.UnsupportedFeedException;
import it.sauronsoftware.feed4j.bean.Feed;
import it.sauronsoftware.feed4j.bean.FeedHeader;
import it.sauronsoftware.feed4j.bean.FeedItem;

/**
 * RSSConnector - crawls an RSS feed at the configured url and emits one
 * Document per feed item to any attached document listeners.
 */
public class RSSConnector extends AbstractConnector {

  private static final long serialVersionUID = 1L;

  private String rssUrl = "http://www.myrobotlab.org/rss.xml";
  private boolean interrupted = false;

  public RSSConnector(String reservedKey) {
    super(reservedKey);
  }

  @Override
  public void setConfig(ConnectorConfig config) {
    // TODO: map the connector config onto this connector (e.g. the feed url).
    log.info("setConfig not yet implemented");
  }

  @Override
  public void startCrawling() {
    // TODO : make this cooler. :) for now.. fire and forget
    // reset the interrupt flag so a previously stopped connector can crawl again
    interrupted = false;
    this.state = ConnectorState.RUNNING;

    URL url;
    try {
      url = new URL(rssUrl);
    } catch (MalformedURLException e) {
      log.error("Invalid RSS url {}", rssUrl, e);
      this.state = ConnectorState.STOPPED;
      return;
    }

    Feed feed;
    try {
      feed = FeedParser.parse(url);
    } catch (FeedIOException | FeedXMLParseException | UnsupportedFeedException e) {
      log.error("Error fetching/parsing feed {}", rssUrl, e);
      this.state = ConnectorState.STOPPED;
      return;
    }

    FeedHeader header = feed.getHeader();
    int items = feed.getItemCount();
    for (int i = 0; i < items; i++) {
      if (interrupted) {
        this.state = ConnectorState.INTERRUPTED;
        // TODO: clean up after yourself!
        return;
      }
      FeedItem item = feed.getItem(i);
      // create an id for this document as being url # item offset
      Document feedItem = new Document(url + "#" + i);
      // feed-level metadata, repeated on every document
      feedItem.setField("rss_title", header.getTitle());
      feedItem.setField("rss_link", header.getLink());
      feedItem.setField("rss_description", header.getDescription());
      feedItem.setField("rss_language", header.getLanguage());
      feedItem.setField("rss_date", header.getPubDate());
      // item-level fields
      feedItem.setField("title", item.getTitle());
      feedItem.setField("link", item.getLink());
      feedItem.setField("description", item.getDescriptionAsText());
      feedItem.setField("date", item.getPubDate());
      feedItem.setField("html", item.getDescriptionAsHTML());
      feed(feedItem);
    }
    // flush the last partial batch of documents if we are batching.
    flush();
    this.state = ConnectorState.STOPPED;
  }

  public String getRssUrl() {
    return rssUrl;
  }

  public void setRssUrl(String rssUrl) {
    this.rssUrl = rssUrl;
  }

  public static void main(String[] args) throws Exception {
    RSSConnector connector = (RSSConnector) Runtime.start("rss", "RSSConnector");
    Solr solr = (Solr) Runtime.start("solr", "Solr");
    solr.setSolrUrl("http://www.skizatch.org:8983/solr/graph");
    connector.addDocumentListener(solr);
    connector.startCrawling();
  }

  @Override
  public void stopCrawling() {
    // interrupt the current crawl gently.
    interrupted = true;
  }
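  /**
   * Illustrative sketch, not part of the original service: a convenience
   * helper showing how several feeds could be crawled back to back by
   * reusing the single-url crawl above. The name and varargs signature are
   * assumptions for illustration only; startCrawling() is synchronous here,
   * so the feeds are processed in sequence.
   */
  public void crawlFeeds(String... urls) {
    for (String url : urls) {
      setRssUrl(url);
      startCrawling();
      if (interrupted) {
        // stopCrawling() was requested mid-crawl; abandon the remaining feeds
        return;
      }
    }
  }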
  /**
   * This static method returns all the details of the class without it having
   * to be constructed. It has description, categories, dependencies, and peer
   * definitions.
   * 
   * @return ServiceType - returns all the data
   */
  static public ServiceType getMetaData() {
    ServiceType meta = new ServiceType(RSSConnector.class.getCanonicalName());
    meta.addDescription("This service will crawl an RSS feed at the given url and break the page apart into Documents");
    meta.addCategory("data");
    return meta;
  }

}