package com.virjar.dungproxy.client.webmagic;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.annotation.ThreadSafe;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Sets;
import com.virjar.dungproxy.client.httpclient.CrawlerHttpClient;
import com.virjar.dungproxy.client.ippool.config.ProxyConstant;
import com.virjar.dungproxy.client.util.PoolUtil;
import com.virjar.dungproxy.client.util.ReflectUtil;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
/**
* The http downloader based on HttpClient.
*
* 为webMagic实现的downloader,如果有其他定制需求,参考本类实现即可<br/>
* <br/>
* 本类现在加入了很多新特性,不过使用方式和webmagic原生仍然兼容,如果是webmagic项目且需要定制downloader,建议在本类基础上做修改。
* <ul>
* <li>代理IP池特性(对接dungproxy)</li>
* <li>代理IP上下线特性</li>
* <li>webmagic版本兼容(因为相对于webmagic,本类使用属于外来者,其代码逻辑不能跟随webmagic版本变动而变动)</li>
* <li>放开downloader获取原生httpclient的口子,getHttpClient变成public方法,方便使用者模拟登录</li>
* <li>多用户在线支持,使用multiUserCookieStore,天生支持多个用户并发的登录爬取数据</li>
* <li>用户URL关系维护,它会自动记录产生的新URL是哪个user爬取到的,而且下次调度到一个新URL的时候,会自动获取到新URL是哪个账户爬取到的,然后使用对应账户的cookie信息</li>
* </ul>
*
* <pre>
* public static void main(String[] args) {
* Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft")
* .setDownloader(new DungProxyDownloader()).thread(5).run();
* }
* </pre>
*
* <pre>
* 如果自己实现代理池到httpclient的织入:
* CloseableHttpClient closeableHttpClient =
* HttpClientBuilder.create().setRetryHandler(new DunProxyHttpRequestRetryHandler())
* .setRoutePlanner(new ProxyBindRoutPlanner()).build();
* </pre>
*
* @author code4crafter@gmail.com <br>
* @author virjar
* @since 0.0.1
*/
@ThreadSafe
public class DungProxyDownloader extends AbstractDownloader {
private Logger logger = LoggerFactory.getLogger(getClass());
private final Map<String, CrawlerHttpClient> httpClients = new HashMap<>();
// 自动代理替换了这里
private DungProxyHttpClientGenerator httpClientGenerator = new DungProxyHttpClientGenerator();
/**
* 设置为public,这样用户就可以获取到原生httpclient,虽然打破了封装,但是用户确实有这样的需求
*
* @param site site
* @param proxy proxy
* @return CrawlerHttpClient, 本身继承自CloseableHttpClient, 兼容CloseableHttpClient所有方法
*/
public CrawlerHttpClient getHttpClient(Site site, Proxy proxy) {
if (site == null) {
return httpClientGenerator.getClient(null, proxy);
}
String domain = site.getDomain();
CrawlerHttpClient httpClient = httpClients.get(domain);
if (httpClient == null) {
synchronized (this) {
httpClient = httpClients.get(domain);
if (httpClient == null) {
httpClient = httpClientGenerator.getClient(site, proxy);
httpClients.put(domain, httpClient);
}
}
}
return httpClient;
}
@Override
public Page download(Request request, Task task) {
Site site = null;
if (task != null) {
site = task.getSite();
}
Set<Integer> acceptStatCode;
String charset = null;
Map<String, String> headers = null;
if (site != null) {
acceptStatCode = site.getAcceptStatCode();
charset = site.getCharset();
headers = site.getHeaders();
} else {
acceptStatCode = Sets.newHashSet(200);// 使用guava等价替换 WMCollections.newHashSet(200);
}
logger.info("downloading page {}", request.getUrl());
CloseableHttpResponse httpResponse = null;
int statusCode = 0;
HttpClientContext httpClientContext = null;
HttpUriRequest httpUriRequest = null;
try {
HttpHost proxyHost = null;
Proxy proxy = null; // TODO
if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
Object proxyObject = ReflectUtil.invoke(site, "getHttpProxyFromPool", new Class[] {}, new Object[] {});// site.getHttpProxyFromPool();
// 在0.6.x下面,返回类型是Proxy,所以虽然编译器报警,但是也只能忽略语法检查,因为不同版本的webMagic会走不同的分之
if (proxyObject instanceof HttpHost) {// 0.5.x的用法
proxyHost = (HttpHost) proxyObject;
} else if (proxyObject instanceof Proxy) {// 0.6.x的用法
proxy = (Proxy) proxyObject;
proxyHost = proxy.getHttpHost();
}
} else if (site != null && site.getHttpProxy() != null) {
proxyHost = site.getHttpProxy();
}
httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
httpClientContext = HttpClientContext.adapt(new BasicHttpContext());
// 扩展功能,支持多用户隔离,默认使用的是crawlerHttpClient,crawlerHttpClient默认则使用multiUserCookieStore
if (request.getExtra(ProxyConstant.DUNGPROXY_USER_KEY) != null) {
PoolUtil.bindUserKey(httpClientContext, request.getExtra(ProxyConstant.DUNGPROXY_USER_KEY).toString());
}
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest, httpClientContext);
statusCode = httpResponse.getStatusLine().getStatusCode();
request.putExtra(Request.STATUS_CODE, statusCode);
if (statusAccept(acceptStatCode, statusCode)) {
Page page = handleResponse(request, charset, httpResponse, task);
if (needOfflineProxy(page)) {
PoolUtil.offline(httpClientContext);
return addToCycleRetry(request, site);
}
onSuccess(request);
return page;
} else {
logger.warn("get page {} error, status code {} ", request.getUrl(), statusCode);
if (needOfflineProxy(statusCode)) {
PoolUtil.offline(httpClientContext);// webMagic对状态码的拦截可能出现在这里,所以也要在这里下线IP
return addToCycleRetry(request, site);
}
return null;
}
} catch (IOException e) {
if (needOfflineProxy(e)) {
logger.warn("发生异常:{},IP下线");
PoolUtil.offline(httpClientContext);// 由IP异常导致,直接重试
return addToCycleRetry(request, site);
}
if (isLastRetry(request, site)) {// 移动异常日志位置,只记录最终失败的。中途失败不算失败
logger.warn("download page {} error", request.getUrl(), e);
}
if (site != null && site.getCycleRetryTimes() > 0) {
return addToCycleRetry(request, site);
}
onError(request);
return null;
} finally {
request.putExtra(Request.STATUS_CODE, statusCode);
if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY),
(Integer) request.getExtra(Request.STATUS_CODE));
}
try {
// 先释放链接,在consume,consume本身会释放链接,但是可能提前抛错导致链接释放失败
if (httpUriRequest != null) {
try {
httpUriRequest.abort();
} catch (UnsupportedOperationException unsupportedOperationException) {
logger.error("can not abort connection", unsupportedOperationException);
}
}
if (httpResponse != null) {
// ensure the connection is released back to pool
EntityUtils.consume(httpResponse.getEntity());
}
} catch (IOException e) {
logger.warn("close response fail", e);
}
}
}
/**
* 判断当前请求是不是最后的重试,流程等同于 addToCycleRetry
*
* @see us.codecraft.webmagic.downloader.AbstractDownloader#addToCycleRetry(us.codecraft.webmagic.Request,
* us.codecraft.webmagic.Site)
* @param request request
* @param site site
* @return 是否是最后一次重试
*/
protected boolean isLastRetry(Request request, Site site) {
Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
if (cycleTriedTimesObject == null) {
return false;
} else {
int cycleTriedTimes = (Integer) cycleTriedTimesObject;
cycleTriedTimes++;
if (cycleTriedTimes >= site.getCycleRetryTimes()) {
return true;
}
}
return false;
}
/**
* 默认封禁403和401两个状态码的IP
*
* @param page 爬取结果
* @return 是否需要封禁这个IP
*/
protected boolean needOfflineProxy(Page page) {
Integer statusCode = (Integer) page.getRequest().getExtra(Request.STATUS_CODE);
if (statusCode == null) {
return false;// 不知道状态码
}
return statusCode == 401 || statusCode == 403;// 401和403两个状态强制下线IP
}
protected boolean needOfflineProxy(IOException e) {
return false;
}
protected boolean needOfflineProxy(int statusCode) {
return statusCode == 401 || statusCode == 403;
}
@Override
public void setThread(int thread) {
httpClientGenerator.setPoolSize(thread);
}
protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
return acceptStatCode.contains(statusCode);
}
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers,
HttpHost proxy) {
RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
if (headers != null) {
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
}
}
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
.setConnectionRequestTimeout(site.getTimeOut()).setSocketTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut()).setCookieSpec(CookieSpecs.BEST_MATCH);
if (proxy != null) {
requestConfigBuilder.setProxy(proxy);
request.putExtra(Request.PROXY, proxy);
}
requestBuilder.setConfig(requestConfigBuilder.build());
return requestBuilder.build();
}
protected RequestBuilder selectRequestMethod(Request request) {
String method = request.getMethod();
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
// default get
return RequestBuilder.get();
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
RequestBuilder requestBuilder = RequestBuilder.post();
NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair");
if (nameValuePair != null && nameValuePair.length > 0) {
requestBuilder.addParameters(nameValuePair);
}
return requestBuilder;
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
return RequestBuilder.head();
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
return RequestBuilder.put();
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
return RequestBuilder.delete();
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
return RequestBuilder.trace();
}
throw new IllegalArgumentException("Illegal HTTP Method " + method);
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task)
throws IOException {
String content = getContent(charset, httpResponse);
Page page = new UserSessionPage();
page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
return page;
}
protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
if (charset == null) {
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
if (htmlCharset != null) {
return new String(contentBytes, htmlCharset);
} else {
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()",
Charset.defaultCharset());
return new String(contentBytes);
}
} else {
return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
}
}
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
String charset = null;
// charset
// 1、encoding in http header Content-Type
Header contentType = httpResponse.getEntity().getContentType();
if (contentType != null) {// contentType可能为空
charset = UrlUtils.getCharset(contentType.getValue());
}
if (StringUtils.isNotBlank(charset)) {
logger.debug("Auto get charset: {}", charset);
return charset;
}
// use default charset to decode first time
Charset defaultCharset = Charset.defaultCharset();
String content = new String(contentBytes, defaultCharset.name());
// 2、charset in meta
if (StringUtils.isNotEmpty(content)) {
Document document = Jsoup.parse(content);
Elements links = document.select("meta");
for (Element link : links) {
// 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
String metaContent = link.attr("content");
String metaCharset = link.attr("charset");
if (metaContent.contains("charset")) {
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
charset = metaContent.split("=")[1];
break;
}
// 2.2、html5 <meta charset="UTF-8" />
else if (StringUtils.isNotEmpty(metaCharset)) {
charset = metaCharset;
break;
}
}
}
logger.debug("Auto get charset: {}", charset);
// 3、todo use tools as cpdetector for content decode
return charset;
}
}