package sagan.search.support;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Executors;

import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import org.springframework.stereotype.Component;

import com.soulgalore.crawler.core.CrawlerConfiguration;
import com.soulgalore.crawler.core.HTMLPageResponse;
import com.soulgalore.crawler.core.PageURL;
import com.soulgalore.crawler.core.PageURLParser;
import com.soulgalore.crawler.core.impl.AhrefPageURLParser;
import com.soulgalore.crawler.core.impl.DefaultCrawler;
import com.soulgalore.crawler.core.impl.HTTPClientResponseFetcher;

/**
 * Crawls a site starting from a given URL, following links up to a configurable
 * depth, and hands the parsed body of every successfully fetched HTML page to a
 * {@link DocumentProcessor}.
 */
@Component
public class CrawlerService {

    // Single pooled connection; pages are fetched one at a time per client.
    private HttpClient httpClient() {
        PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();
        connectionManager.setMaxTotal(1);
        return HttpClientBuilder.create()
                .setConnectionManager(connectionManager)
                .setDefaultRequestConfig(RequestConfig.custom()
                        .setConnectTimeout(30000)
                        .setSocketTimeout(3000)
                        .build())
                .build();
    }

    public void crawl(String url, int linkDepth, DocumentProcessor processor) {
        CrawlerConfiguration apiConfig = new CrawlerConfiguration.Builder()
                .setStartUrl(url)
                .setMaxLevels(linkDepth)
                .setVerifyUrls(false)
                .build();
        DefaultCrawler crawler = new DefaultCrawler(
                new ResponseFetcher(processor),
                Executors.newFixedThreadPool(10),
                new CompositeURLParser(new FramePageURLParser(), new AhrefPageURLParser()));
        crawler.getUrls(apiConfig);
        crawler.shutdown();
    }

    /**
     * Fetches pages via the configured {@link HttpClient} and forwards the body of
     * every successful {@code text/*} response to the {@link DocumentProcessor}
     * before handing the response back to the crawler.
     */
    private class ResponseFetcher extends HTTPClientResponseFetcher {

        private final DocumentProcessor processor;

        public ResponseFetcher(DocumentProcessor processor) {
            super(httpClient());
            this.processor = processor;
        }

        @Override
        public HTMLPageResponse get(PageURL url, boolean fetchBody, Map<String, String> requestHeaders,
                boolean followRedirectsToNewDomain) {
            HTMLPageResponse response = super.get(url, fetchBody, requestHeaders, followRedirectsToNewDomain);
            if (response.getResponseCode() == 200 && response.getResponseType().startsWith("text")) {
                processor.process(response.getBody());
            }
            return response;
        }
    }

    /**
     * Aggregates the URLs discovered by several {@link PageURLParser}s into a
     * single set.
     */
    private static class CompositeURLParser implements PageURLParser {

        private final PageURLParser[] parsers;

        private CompositeURLParser(PageURLParser... parsers) {
            this.parsers = parsers;
        }

        @Override
        public Set<PageURL> get(HTMLPageResponse theResponse) {
            Set<PageURL> urls = new HashSet<>();
            for (PageURLParser parser : parsers) {
                urls.addAll(parser.get(theResponse));
            }
            return urls;
        }
    }

    /**
     * Extracts absolute {@code frame} "src" URLs so that pages embedded in
     * framesets are crawled as well.
     */
    private static class FramePageURLParser implements PageURLParser {

        private static final String FRAME = "frame[src]";
        private static final String ABS_SRC = "abs:src";

        @Override
        public Set<PageURL> get(HTMLPageResponse theResponse) {
            String url = theResponse.getUrl();
            Set<PageURL> urls = new HashSet<>();
            // only populate if we have a valid response, else return empty set
            if (theResponse.getResponseCode() == HttpStatus.SC_OK) {
                urls = fetch(FRAME, ABS_SRC, theResponse.getBody(), url);
            }
            return urls;
        }

        private Set<PageURL> fetch(String query, String attributeKey, Document doc, String url) {
            Set<PageURL> urls = new HashSet<>();
            Elements elements = doc.select(query);
            for (Element src : elements) {
                if (src.attr(attributeKey).isEmpty())
                    continue;
                urls.add(new PageURL(src.attr(attributeKey), url));
            }
            return urls;
        }
    }
}
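
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original source). It assumes
// DocumentProcessor is a single-method callback receiving the parsed jsoup
// Document of each successfully fetched page; the start URL and two-level
// depth below are arbitrary example values.
//
//     @Autowired
//     private CrawlerService crawlerService;
//
//     void indexDocs() {
//         crawlerService.crawl("https://docs.spring.io/spring-framework/docs/current/", 2,
//                 document -> System.out.println("crawled: " + document.title()));
//     }
// ---------------------------------------------------------------------------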