package com.virjar.dungproxy.client.samples.webmagic.successrate; import java.io.IOException; import java.nio.charset.Charset; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.Header; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; import org.apache.http.client.protocol.HttpClientContext; import org.apache.http.protocol.BasicHttpContext; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.collect.Maps; import com.google.common.collect.Queues; import com.google.common.collect.Sets; import com.virjar.dungproxy.client.httpclient.CrawlerHttpClient; import com.virjar.dungproxy.client.ippool.config.ProxyConstant; import com.virjar.dungproxy.client.util.PoolUtil; import com.virjar.dungproxy.client.util.ReflectUtil; import com.virjar.dungproxy.client.webmagic.DungProxyHttpClientGenerator; import com.virjar.dungproxy.client.webmagic.UserSessionPage; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; /** * Created by virjar on 17/2/22. */ public class SuccessRateTestDownloader extends AbstractDownloader { private AtomicLong totalTimes = new AtomicLong(0); // 计算最近一百次的使失败率 private static final int ratio = 100; private double successRate = 0.0; private Logger logger = LoggerFactory.getLogger(getClass()); private final Map<String, CrawlerHttpClient> httpClients = Maps.newHashMap();// 直接new在1.5以下会出问题,在1.7会有波浪线提示 private ConcurrentLinkedQueue<Double> concurrentLinkedDeque = Queues.<Double> newConcurrentLinkedQueue(); // 自动代理替换了这里 private DungProxyHttpClientGenerator httpClientGenerator = new DungProxyHttpClientGenerator(); public SuccessRateTestDownloader() { // 在点击关闭程序的时候,再次输出所有的失败率报告 Runtime.getRuntime().addShutdownHook(new Thread() { @Override public void run() { StringBuilder sb = new StringBuilder("整体的失败率变化为:"); Double d; while ((d = concurrentLinkedDeque.poll()) != null) { sb.append(d); sb.append(","); } System.out.println(sb.toString()); } }); } /** * 设置为public,这样用户就可以获取到原生httpclient,虽然打破了封装,但是用户确实有这样的需求 * * @param site site * @param proxy proxy * @return CrawlerHttpClient,本身继承自CloseableHttpClient,兼容CloseableHttpClient所有方法 */ public CrawlerHttpClient getHttpClient(Site site, Proxy proxy) { if (site == null) { return httpClientGenerator.getClient(null, proxy); } String domain = site.getDomain(); CrawlerHttpClient httpClient = httpClients.get(domain); if (httpClient == null) { synchronized (this) { httpClient = httpClients.get(domain); if (httpClient == null) { httpClient = httpClientGenerator.getClient(site, proxy); httpClients.put(domain, httpClient); } } } return httpClient; } @Override public Page download(Request request, Task task) { Site site = null; if (task != null) { site = task.getSite(); } Set<Integer> acceptStatCode; String charset = null; Map<String, String> headers = null; if (site != null) { acceptStatCode = site.getAcceptStatCode(); charset = site.getCharset(); headers = site.getHeaders(); } else { acceptStatCode = Sets.newHashSet(200);// 使用guava等价替换 WMCollections.newHashSet(200); } logger.info("downloading page {}", request.getUrl()); CloseableHttpResponse httpResponse = null; int statusCode = 0; HttpClientContext httpClientContext = null; boolean isSuccess = true; try { HttpHost proxyHost = null; Proxy proxy = null; // TODO if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { Object proxyObject = ReflectUtil.invoke(site, "getHttpProxyFromPool", new Class[] {}, new Object[] {});// site.getHttpProxyFromPool(); // 在0.6.x下面,返回类型是Proxy,所以虽然编译器报警,但是也只能忽略语法检查,因为不同版本的webMagic会走不同的分之 if (proxyObject instanceof HttpHost) {// 0.5.x的用法 proxyHost = (HttpHost) proxyObject; } else if (proxyObject instanceof Proxy) {// 0.6.x的用法 proxy = (Proxy) proxyObject; proxyHost = proxy.getHttpHost(); } } else if (site != null && site.getHttpProxy() != null) { proxyHost = site.getHttpProxy(); } HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost); httpClientContext = HttpClientContext.adapt(new BasicHttpContext()); // 扩展功能,支持多用户隔离,默认使用的是crawlerHttpClient,crawlerHttpClient默认则使用multiUserCookieStore if (request.getExtra(ProxyConstant.DUNGPROXY_USER_KEY) != null) { PoolUtil.bindUserKey(httpClientContext, request.getExtra(ProxyConstant.DUNGPROXY_USER_KEY).toString()); } if (totalTimes.getAndIncrement() % ratio == 0) { // 采样 System.out.println("当前失败率为:" + successRate); concurrentLinkedDeque.add(successRate); } httpResponse = getHttpClient(site, proxy).execute(httpUriRequest, httpClientContext); statusCode = httpResponse.getStatusLine().getStatusCode(); request.putExtra(Request.STATUS_CODE, statusCode); if (statusAccept(acceptStatCode, statusCode)) { Page page = handleResponse(request, charset, httpResponse, task); if (needOfflineProxy(page)) { PoolUtil.offline(httpClientContext); return addToCycleRetry(request, site); } onSuccess(request); return page; } else { logger.warn("get page {} error, status code {} ", request.getUrl(), statusCode); if (needOfflineProxy(statusCode)) { PoolUtil.offline(httpClientContext);// webMagic对状态码的拦截可能出现在这里,所以也要在这里下线IP return addToCycleRetry(request, site); } return null; } } catch (IOException e) { isSuccess = false; if (needOfflineProxy(e)) { logger.warn("发生异常:{},IP下线"); PoolUtil.offline(httpClientContext);// 由IP异常导致,直接重试 return addToCycleRetry(request, site); } if (isLastRetry(request, site)) {// 移动异常日志位置,只记录最终失败的。中途失败不算失败 logger.warn("download page {} error", request.getUrl(), e); } if (site != null && site.getCycleRetryTimes() > 0) { return addToCycleRetry(request, site); } onError(request); return null; } finally { synchronized (SuccessRateTestDownloader.class) {// 算错了,算成成功率率 successRate = (successRate * (ratio - 1) + (isSuccess ? 1 : 0)) / ratio; } request.putExtra(Request.STATUS_CODE, statusCode); if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY), (Integer) request.getExtra(Request.STATUS_CODE)); } try { if (httpResponse != null) { // ensure the connection is released back to pool EntityUtils.consume(httpResponse.getEntity()); } } catch (IOException e) { logger.warn("close response fail", e); } } } /** * 判断当前请求是不是最后的重试,流程等同于 addToCycleRetry * * @see us.codecraft.webmagic.downloader.AbstractDownloader#addToCycleRetry(us.codecraft.webmagic.Request, * us.codecraft.webmagic.Site) * @param request request * @param site site * @return 是否是最后一次重试 */ protected boolean isLastRetry(Request request, Site site) { Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); if (cycleTriedTimesObject == null) { return false; } else { int cycleTriedTimes = (Integer) cycleTriedTimesObject; cycleTriedTimes++; if (cycleTriedTimes >= site.getCycleRetryTimes()) { return true; } } return false; } /** * 默认封禁403和401两个状态码的IP * * @param page 爬取结果 * @return 是否需要封禁这个IP */ protected boolean needOfflineProxy(Page page) { Integer statusCode = (Integer) page.getRequest().getExtra(Request.STATUS_CODE); if (statusCode == null) { return false;// 不知道状态码 } return statusCode == 401 || statusCode == 403;// 401和403两个状态强制下线IP } protected boolean needOfflineProxy(IOException e) { return false; } protected boolean needOfflineProxy(int statusCode) { return statusCode == 401 || statusCode == 403; } @Override public void setThread(int thread) { httpClientGenerator.setPoolSize(thread); } protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) { return acceptStatCode.contains(statusCode); } protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers, HttpHost proxy) { RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl()); if (headers != null) { for (Map.Entry<String, String> headerEntry : headers.entrySet()) { requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); } } RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() .setConnectionRequestTimeout(site.getTimeOut()).setSocketTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut()).setCookieSpec(CookieSpecs.BEST_MATCH); if (proxy != null) { requestConfigBuilder.setProxy(proxy); request.putExtra(Request.PROXY, proxy); } requestBuilder.setConfig(requestConfigBuilder.build()); return requestBuilder.build(); } protected RequestBuilder selectRequestMethod(Request request) { String method = request.getMethod(); if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { // default get return RequestBuilder.get(); } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { RequestBuilder requestBuilder = RequestBuilder.post(); NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair"); if (nameValuePair != null && nameValuePair.length > 0) { requestBuilder.addParameters(nameValuePair); } return requestBuilder; } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { return RequestBuilder.head(); } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { return RequestBuilder.put(); } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) { return RequestBuilder.delete(); } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) { return RequestBuilder.trace(); } throw new IllegalArgumentException("Illegal HTTP Method " + method); } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { String content = getContent(charset, httpResponse); Page page = new UserSessionPage(); page.setRawText(content); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); return page; } protected String getContent(String charset, HttpResponse httpResponse) throws IOException { if (charset == null) { byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); String htmlCharset = getHtmlCharset(httpResponse, contentBytes); if (htmlCharset != null) { return new String(contentBytes, htmlCharset); } else { logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset()); return new String(contentBytes); } } else { return IOUtils.toString(httpResponse.getEntity().getContent(), charset); } } protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { String charset = null; // charset // 1、encoding in http header Content-Type Header contentType = httpResponse.getEntity().getContentType(); if (contentType != null) {// contentType可能为空 charset = UrlUtils.getCharset(contentType.getValue()); } if (StringUtils.isNotBlank(charset)) { logger.debug("Auto get charset: {}", charset); return charset; } // use default charset to decode first time Charset defaultCharset = Charset.defaultCharset(); String content = new String(contentBytes, defaultCharset.name()); // 2、charset in meta if (StringUtils.isNotEmpty(content)) { Document document = Jsoup.parse(content); Elements links = document.select("meta"); for (Element link : links) { // 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> String metaContent = link.attr("content"); String metaCharset = link.attr("charset"); if (metaContent.contains("charset")) { metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); charset = metaContent.split("=")[1]; break; } // 2.2、html5 <meta charset="UTF-8" /> else if (StringUtils.isNotEmpty(metaCharset)) { charset = metaCharset; break; } } } logger.debug("Auto get charset: {}", charset); // 3、todo use tools as cpdetector for content decode return charset; } }