package ru.exorg.miner;

import java.util.ArrayList;
import java.util.List;
import java.util.TreeMap;

import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.log4j.Logger;
import org.webharvest.definition.ScraperConfiguration;
import org.webharvest.exception.HttpException;
import org.webharvest.runtime.Scraper;
import org.webharvest.runtime.ScraperContext;

import ru.exorg.core.service.*;

// ================================================================================

/**
 * Base class for miners that run one or more Web-Harvest scraper configurations
 * and hand the resulting scraper context to a subclass for processing.
 *
 * <p>Typical usage: configure the instance through the setters
 * ({@link #setConfig(String)}, {@link #setProxy(String)}, ...), then call
 * {@link #run()}. Subclasses implement {@link #handle(ScraperContext)}.
 */
public abstract class Miner {
    // NOTE(review): the logger category is Main, not Miner. Kept as in the original;
    // confirm whether this is intentional before changing it.
    private final Logger log;

    /** Scraper configuration files processed by {@link #run()}, in order. */
    private String[] configFiles;

    /** Value applied to both the socket timeout and the connection timeout of the HTTP client. */
    private int httpTimeout;

    /** Maximum number of scraper executions attempted per configuration file. */
    private int maxRetries;

    /** Proxy host, or {@code null} when no proxy is configured. */
    private String proxyHost;

    /** Proxy port; only meaningful when {@link #proxyHost} is not {@code null}. */
    private int proxyPort;

    protected DataProvider dataProvider;

    /** Sorted key/value container for the fields of one mined item. */
    protected static class Vars extends TreeMap<String, String> {
    }

    /**
     * Normalizes whitespace in a mined string.
     *
     * <p>The string is trimmed, each run of spaces/tabs is collapsed to a single
     * space, and each run of line breaks (together with any line breaks or spaces
     * directly following it) is collapsed to a single line feed.
     *
     * @param str the raw text; must not be {@code null}
     * @return the normalized text
     */
    protected static String beautify(final String str) {
        return str
                .trim()
                .replaceAll("^[\r\n \t]*", "")
                .replaceAll("[\r\n \t]*$", "")
                .replaceAll("[ \t]+", " ")
                .replaceAll("[\r\n]+[\r\n ]*", "\n");
    }

    /**
     * Parses a mined item of the form {@code [key]value[key]value...} into a map.
     *
     * <p>The input is split on every {@code [} and {@code ]}; anything before the
     * first bracket is ignored. Values are passed through {@link #beautify(String)}.
     * Values whose raw length is 0 or 1 are skipped. If a key occurs more than
     * once, the last value wins.
     *
     * @param info the raw item text; must not be {@code null}
     * @return the parsed fields, never {@code null}
     */
    protected static Vars parseMinedItem(final String info) {
        final Vars vars = new Vars();
        final String[] fields = info.split("[\\[\\]]");

        // fields[0] is the text before the first '['; keys sit at odd indices,
        // each followed by its value.
        for (int i = 1; i + 1 < fields.length; i += 2) {
            if (fields[i + 1].length() > 1) {
                vars.put(fields[i], beautify(fields[i + 1]));
            }
        }

        return vars;
    }

    /**
     * Creates a miner with no configuration files, an HTTP timeout of 10000
     * and at most 5 attempts per configuration file.
     */
    public Miner() {
        this.log = Logger.getLogger(Main.class);
        this.configFiles = new String[0];
        this.httpTimeout = 10000;
        this.maxRetries = 5;
    }

    /**
     * Sets the data provider made available to subclasses.
     *
     * @param p the data provider
     */
    public final void setDataProvider(final DataProvider p) {
        this.dataProvider = p;
    }

    /**
     * Sets the scraper configuration files to process.
     *
     * <p>Entries are trimmed and blank entries are dropped, so {@code null}, an
     * empty string or stray separators ({@code "a;;b"}) do not produce an empty
     * file name that would make {@link #run()} fail.
     *
     * @param config semicolon-separated list of configuration file paths; may be {@code null}
     */
    public final void setConfig(final String config) {
        final List<String> files = new ArrayList<String>();

        if (config != null) {
            for (String entry : config.split(";")) {
                final String file = entry.trim();
                if (file.length() > 0) {
                    files.add(file);
                }
            }
        }

        this.configFiles = files.toArray(new String[0]);
    }

    /**
     * Sets or clears the HTTP proxy.
     *
     * <p>The proxy settings are only updated once the whole value has been
     * validated, so a malformed value leaves the previous settings untouched.
     *
     * @param proxy proxy in {@code host:port} form; {@code null} or a blank string disables the proxy
     * @throws IllegalArgumentException if the value is not blank and has no host or no port
     * @throws NumberFormatException if the port is not a valid integer
     */
    public final void setProxy(final String proxy) {
        if (proxy == null || proxy.trim().length() == 0) {
            this.proxyHost = null;
            return;
        }

        final String[] proxyCfg = proxy.trim().split(":");
        if (proxyCfg.length < 2 || proxyCfg[0].trim().length() == 0) {
            throw new IllegalArgumentException("Proxy must be given as host:port, got: " + proxy);
        }

        // Parse before assigning so host and port are always updated together.
        final int port = Integer.parseInt(proxyCfg[1].trim());
        this.proxyHost = proxyCfg[0].trim();
        this.proxyPort = port;
    }

    /**
     * Sets the timeout applied to the HTTP client's socket and connection timeouts.
     *
     * @param timeout the timeout value passed to the HTTP client
     */
    public final void setHttpTimeout(int timeout) {
        this.httpTimeout = timeout;
    }

    /**
     * Sets the maximum number of scraper executions attempted per configuration file.
     *
     * @param retries the attempt limit; with a value of 0 or less no configuration is executed
     */
    public final void setMaxRetries(int retries) {
        this.maxRetries = retries;
    }

    /**
     * Processes every configured scraper configuration in order.
     *
     * <p>Each configuration is executed until it succeeds or the attempt limit is
     * reached; on success the scraper context is passed to
     * {@link #handle(ScraperContext)}. Any exception other than an
     * {@link HttpException} during execution is logged and stops the processing of
     * the remaining configurations.
     */
    public final void run() {
        try {
            for (String configFile : this.configFiles) {
                this.log.warn("Working on " + configFile);

                final Scraper scraper = executeWithRetries(configFile);
                if (scraper != null) {
                    handle(scraper.getContext());
                } else {
                    final String message = "Not handling " + configFile + ". Retry count exceeded.";
                    this.log.warn(message);
                    System.out.println(message);
                }
            }
        } catch (Exception e) {
            System.out.println("Exception was caught. See debug.log for details");
            this.log.warn(e.getMessage());
            this.log.warn(ru.exorg.core.util.Log.getCallStack(e));
        }
    }

    /**
     * Executes one configuration, retrying on HTTP errors.
     *
     * @param configFile path of the scraper configuration
     * @return the scraper that executed successfully, or {@code null} if every attempt failed
     * @throws Exception if the scraper cannot be created or fails with a non-HTTP error
     */
    private Scraper executeWithRetries(final String configFile) throws Exception {
        int retriesLeft = this.maxRetries;

        while (retriesLeft > 0) {
            // A fresh scraper is built for every attempt, as a failed one is not reused.
            final Scraper scraper = createScraper(configFile);
            try {
                scraper.execute();
                return scraper;
            } catch (HttpException e) {
                // Decrement first so the message reports the attempts that really remain.
                retriesLeft--;
                final String message = "HTTP error occured. Retries left " + String.valueOf(retriesLeft);
                this.log.warn(message, e);
                System.out.println(message);
            }
        }

        return null;
    }

    /**
     * Builds a scraper for the given configuration and applies proxy, timeout
     * and cookie settings to its HTTP client.
     *
     * @param configFile path of the scraper configuration
     * @return a configured, not yet executed scraper
     * @throws Exception if the configuration cannot be loaded
     */
    private Scraper createScraper(final String configFile) throws Exception {
        final ScraperConfiguration config = new ScraperConfiguration(configFile);
        final Scraper scraper = new Scraper(config, ".");

        if (this.proxyHost != null) {
            scraper.getHttpClientManager().setHttpProxy(this.proxyHost, this.proxyPort);
        }

        scraper.getHttpClientManager().getHttpClient().getParams().setSoTimeout(this.httpTimeout);
        scraper.getHttpClientManager().getHttpClient().getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
        scraper.getHttpClientManager().getHttpClient().getHttpConnectionManager().getParams()
                .setConnectionTimeout(this.httpTimeout);
        scraper.setDebug(false);

        return scraper;
    }

    /**
     * Processes the context of a successfully executed scraper.
     *
     * @param sc the context of the executed scraper
     * @throws Exception if processing fails; {@link #run()} logs it and stops
     */
    protected abstract void handle(final ScraperContext sc) throws Exception;
}