package org.myrobotlab.service;

import java.net.MalformedURLException;
import java.net.URL;

import org.myrobotlab.document.Document;
import org.myrobotlab.document.connector.AbstractConnector;
import org.myrobotlab.document.connector.ConnectorState;
import org.myrobotlab.document.transformer.ConnectorConfig;
import org.myrobotlab.framework.ServiceType;

import it.sauronsoftware.feed4j.FeedIOException;
import it.sauronsoftware.feed4j.FeedParser;
import it.sauronsoftware.feed4j.FeedXMLParseException;
import it.sauronsoftware.feed4j.UnsupportedFeedException;
import it.sauronsoftware.feed4j.bean.Feed;
import it.sauronsoftware.feed4j.bean.FeedHeader;
import it.sauronsoftware.feed4j.bean.FeedItem;
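
/**
 * RSSConnector - crawls an rss feed at a configurable url, converts each feed
 * item into a Document, and publishes the documents to any attached document
 * listeners (for example a Solr service for indexing).
 */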
public class RSSConnector extends AbstractConnector {

  private static final long serialVersionUID = 1L;

  // the feed to crawl; defaults to the MyRobotLab site feed.
  private String rssUrl = "http://www.myrobotlab.org/rss.xml";
  // set by stopCrawling() to request a graceful stop of the current crawl.
  private boolean interrupted = false;

  public RSSConnector(String reservedKey) {
    super(reservedKey);
  }

  @Override
  public void setConfig(ConnectorConfig config) {
    // TODO: apply the connector configuration (e.g. the feed url).
    log.info("setConfig is not yet implemented for RSSConnector");
  }
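
  /**
   * Crawl the configured rss feed once. Fetches and parses the feed, converts
   * each feed item into a Document, publishes the documents to attached
   * listeners, and flushes any partial batch when the feed is exhausted.
   */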
  @Override
  public void startCrawling() {
    // TODO : make this cooler. :) for now.. fire and forget
    // reset the interrupt flag so a previous stopCrawling() call does not
    // abort this crawl immediately.
    interrupted = false;
    this.state = ConnectorState.RUNNING;
    URL url;
    try {
      url = new URL(rssUrl);
    } catch (MalformedURLException e) {
      log.error("malformed rss url {}", rssUrl, e);
      // don't leave the connector stuck in RUNNING on failure.
      this.state = ConnectorState.STOPPED;
      return;
    }
    Feed feed;
    try {
      feed = FeedParser.parse(url);
    } catch (FeedIOException | FeedXMLParseException | UnsupportedFeedException e) {
      log.error("error fetching or parsing the feed at {}", url, e);
      this.state = ConnectorState.STOPPED;
      return;
    }
    FeedHeader header = feed.getHeader();
    int items = feed.getItemCount();
    for (int i = 0; i < items; i++) {
      if (interrupted) {
        state = ConnectorState.INTERRUPTED;
        // TODO: clean up after yourself!
        return;
      }
      FeedItem item = feed.getItem(i);
      // the document id is the feed url plus the item offset.
      Document feedItem = new Document(url + "#" + i);
      // feed-level (header) metadata, repeated on every document.
      feedItem.setField("rss_title", header.getTitle());
      feedItem.setField("rss_link", header.getLink());
      feedItem.setField("rss_description", header.getDescription());
      feedItem.setField("rss_language", header.getLanguage());
      feedItem.setField("rss_date", header.getPubDate());
      // item-level fields.
      feedItem.setField("title", item.getTitle());
      feedItem.setField("link", item.getLink());
      feedItem.setField("description", item.getDescriptionAsText());
      feedItem.setField("date", item.getPubDate());
      feedItem.setField("html", item.getDescriptionAsHTML());
      // hand the document to the connector pipeline / attached listeners.
      feed(feedItem);
    }
    // flush the last partial batch of documents if we are batching.
    flush();
    this.state = ConnectorState.STOPPED;
  }

  public String getRssUrl() {
    return rssUrl;
  }

  public void setRssUrl(String rssUrl) {
    this.rssUrl = rssUrl;
  }
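
  /**
   * Example usage: start an RSSConnector and a Solr service, wire the
   * connector's documents into Solr, and crawl the default feed. The Solr url
   * below is only an example endpoint.
   */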
  public static void main(String[] args) throws Exception {
    RSSConnector connector = (RSSConnector) Runtime.start("rss", "RSSConnector");
    Solr solr = (Solr) Runtime.start("solr", "Solr");
    // example Solr endpoint - point this at your own Solr core.
    solr.setSolrUrl("http://www.skizatch.org:8983/solr/graph");
    connector.addDocumentListener(solr);
    connector.startCrawling();
  }
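
  /**
   * Request a graceful stop of the current crawl. The crawl loop checks the
   * interrupted flag between feed items and stops at the next opportunity.
   */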
  @Override
  public void stopCrawling() {
    // interrupt the current crawl gently.
    interrupted = true;
  }

  /**
   * This static method returns all the details of the class without it having
   * to be constructed. The metadata includes the description, categories,
   * dependencies, and peer definitions.
   *
   * @return ServiceType - the metadata for this service
   */
  static public ServiceType getMetaData() {
    ServiceType meta = new ServiceType(RSSConnector.class.getCanonicalName());
    meta.addDescription("Crawls an rss feed at the configured url and converts each feed item into a Document.");
    meta.addCategory("data");
    return meta;
  }
}