package com.virjar.dungproxy.client.ippool; import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.Random; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.alibaba.fastjson.JSONObject; import com.google.common.base.Function; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.virjar.dungproxy.client.ippool.config.DomainContext; import com.virjar.dungproxy.client.ippool.strategy.ResourceFacade; import com.virjar.dungproxy.client.model.AvProxy; import com.virjar.dungproxy.client.model.AvProxyVO; import com.virjar.dungproxy.client.ningclient.concurrent.NamedThreadFactory; /** * Created by virjar on 16/9/29. */ public class DomainPool { private String domain; // 数据引入器,默认引入我们现在的服务器数据,可以扩展,改为其他数据来源 private ResourceFacade resourceFacade; // 系统稳定的时候需要保持的资源 private int coreSize = 50; private List<String> testUrls = Lists.newArrayList(); private Random random = new Random(System.currentTimeMillis()); private SmartProxyQueue smartProxyQueue; /** * 被下线的IP */ private List<AvProxy> removedProxies = Lists.newArrayList(); /** * 被暂时封禁的IP */ private List<AvProxy> blockedProxies = Lists.newLinkedList(); // 备选的代理资源,他是通过IP下载器下载的的一批初始化IP,但是没有经过可用性测试 private ConcurrentLinkedQueue<AvProxyVO> candidateProxies = new ConcurrentLinkedQueue<>(); private AtomicBoolean isCandidateProxiesDownloading = new AtomicBoolean(false); private volatile long lastIpImportTimeStamp = 0L; private static final Logger logger = LoggerFactory.getLogger(DomainPool.class); private AtomicInteger refreshTaskNumber = new AtomicInteger(0); /** * 刷新任务线程池,5个活跃线程,最多25个线程,5分钟内如果有空转,则对线程资源进行回收,超过25个线程放到队列里面执行,discardPolicy永远不会触发<br/> * 如果一个系统变成了需要25个线程来刷新IP资源,那么基本也不能做爬虫了。线程池是静态的,所有域名IP池的刷新任务使用同一个刷新线程池。线程池里面的线程都是守护线程,主线程(用户线程)结束后刷新任务立即结束 */ private static ThreadPoolExecutor threadPool = new ThreadPoolExecutor(5, 25, 5, TimeUnit.MINUTES, new LinkedBlockingDeque<Runnable>(), new NamedThreadFactory("ip-refresh"), new ThreadPoolExecutor.DiscardPolicy()); private DomainContext domainContext; public DomainContext getDomainContext() { return domainContext; } public DomainPool(String domain, DomainContext domainContext) { this.domain = domain; this.resourceFacade = domainContext.getResourceFacade(); this.domainContext = domainContext; this.coreSize = domainContext.getCoreSize(); smartProxyQueue = new SmartProxyQueue(domainContext.getSmartProxyQueueRatio(), domainContext.getUseInterval()); // 全局默认代理 cloud proxy for (AvProxyVO cloudProxy : domainContext.getDungProxyContext().getCloudProxies()) { if (cloudProxy.getPartnerSize() == null || cloudProxy.getPartnerSize() < 1 || !cloudProxy.getCloud()) { addAvailable(cloudProxy.toModel(this)); } else { List<? extends AvProxy> avProxies = cloudProxy.toPartnerModels(domainContext); for (AvProxy avProxy : avProxies) { addAvailable(avProxy); } } } // domain区分的局部默认代理 for (AvProxyVO defaultProxy : domainContext.getDefaultProxy()) { if (defaultProxy.getPartnerSize() == null || defaultProxy.getPartnerSize() < 1 || !defaultProxy.getCloud()) { addAvailable(defaultProxy.toModel(this)); } else { List<? extends AvProxy> avProxies = defaultProxy.toPartnerModels(domainContext); for (AvProxy avProxy : avProxies) { addAvailable(avProxy); } } } } public void addAvailable(Collection<AvProxy> avProxyList) { for (AvProxy avProxy : avProxyList) { avProxy.setDomainPool(this);// 注意考虑对象懒加载问题 smartProxyQueue.addWithScore(avProxy); } } public void addAvailable(AvProxy avProxy) { avProxy.setDomainPool(this); smartProxyQueue.addWithScore(avProxy); } public List<AvProxy> availableProxy() { return Lists.newArrayList(smartProxyQueue.values()); } public AvProxy bind(String url) { if (StringUtils.isNotEmpty(url)) {// post的话,URL不会传递下来 if (testUrls.size() < 10) { testUrls.add(url); } else { testUrls.set(random.nextInt(10), url); } } if (needFresh()) { refresh();// 在新线程刷新 } // 当只有两个IP轮询的时候,放弃局部轮询,而是采用全部轮询的方式 return smartProxyQueue.getAndAdjustPriority((smartProxyQueue.availableSize() * smartProxyQueue.getRatio()) <= 2, domainContext.getDungProxyContext().isWaitIfNoAvailableProxy()); } /** * 当前IP池是否需要下载新的IP资源。 * * @return 是否 */ public boolean needFresh() { if (smartProxyQueue.availableSize() < coreSize) { smartProxyQueue.recoveryBlockedProxy(); } return smartProxyQueue.availableSize() < coreSize; } /** * 动态调整IP刷新线程数量 * * @return 现在可以运行的线程数量 */ private int expectedRefreshTaskNumber() { if (smartProxyQueue.availableSize() >= coreSize) { return 0; } int threadNumber = (coreSize - smartProxyQueue.availableSize()) * 10 / coreSize; if (threadNumber == 0) { threadNumber = 1; } // logger.info("IP池可用IP数量:{} 当前准备进行刷新工作的线程数量:{}", smartProxyQueue.availableSize(), threadNumber); return threadNumber; } public void feedBack() { resourceFacade.feedBack(domain, Lists.transform(Lists.newArrayList(smartProxyQueue.values()), new Function<AvProxy, AvProxyVO>() { @Override public AvProxyVO apply(AvProxy input) { return AvProxyVO.fromModel(input); } }), Lists.transform(removedProxies, new Function<AvProxy, AvProxyVO>() { @Override public AvProxyVO apply(AvProxy input) { return AvProxyVO.fromModel(input); } })); removedProxies.clear(); } /** * 本方法会启动线程异步刷线,所以不需要自己建立线程环境了,他不会检查可用IP是否足量,但是如果IP本身量太大的话,本调用也几乎无效(会检查是否需要下载IP,这个检查会失败), */ public void refresh() { if (testUrls.size() == 0) { return;// 数据还没有进来,不refresh } int expectedThreadNumber = expectedRefreshTaskNumber(); if (refreshTaskNumber.get() > expectedThreadNumber) { // logger.info("当前刷新线程数:{} 大于调度线程数:{} 取消本次IP资源刷新任务", refreshTaskNumber.get(), expectedThreadNumber); return; } if (refreshTaskNumber.incrementAndGet() <= expectedThreadNumber) { threadPool.execute(new RefreshThread()); } } private class RefreshThread implements Runnable { @Override public void run() { try { logger.info("IP资源刷新开始,当前刷新线程数量:{}...", refreshTaskNumber.get()); doRefresh(); logger.info("IP资源刷新结束..."); } finally { refreshTaskNumber.decrementAndGet(); } } } private void checkAndExtendCandidateResource() { // 两分钟内下载过IP,则取消IP下载,因为一次IP下载本身可能需要十几秒 if (System.currentTimeMillis() - lastIpImportTimeStamp < 120000) { return; } // 候选IP足量,取消IP下载 if (candidateProxies.size() + smartProxyQueue.availableSize() > (coreSize * 1.5)) { return; } // download new proxies // 同一个时刻只能有一个线程进行IP下载 if (isCandidateProxiesDownloading.compareAndSet(false, true)) { try { List<AvProxyVO> avProxies = resourceFacade.importProxy(domain, testUrls.get(random.nextInt(testUrls.size())), coreSize); logger.info("在线IP刷新,当前下载到的IP数目为:{}", avProxies.size()); candidateProxies.addAll(avProxies); } finally { lastIpImportTimeStamp = System.currentTimeMillis(); isCandidateProxiesDownloading.set(false); } } } private void doRefresh() { checkAndExtendCandidateResource(); AvProxyVO avProxy; // PreHeater preHeater = dungProxyContext.getPreHeater(); while ((avProxy = candidateProxies.poll()) != null) { if (domainContext.getProxyChecker().available(avProxy, testUrls.get(random.nextInt(testUrls.size())))) { avProxy.setAvgScore(0.5);// 设置默认值。让他处于次级缓存的中间。 addAvailable(avProxy.toModel(domainContext)); logger.info("IP池{}当前可用IP数目:{}", domain, smartProxyQueue.availableSize()); } } } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; DomainPool that = (DomainPool) o; return domain.equals(that.domain); } @Override public int hashCode() { return (domain + "?/").hashCode(); } public synchronized void block(AvProxy avProxy, long duration) { Iterator<AvProxy> iterator = blockedProxies.iterator(); while (iterator.hasNext()) { AvProxy next = iterator.next(); if (next.getResueTime() <= System.currentTimeMillis()) { iterator.remove(); addAvailable(next); } else { break; } } Preconditions.checkArgument(duration > 0, " block duration must greater than zero"); smartProxyQueue.offline(avProxy); avProxy.setResueTime(System.currentTimeMillis() + duration); blockedProxies.add(avProxy); logger.info("IP:{}暂时封禁,封禁时间:{} 毫秒", JSONObject.toJSONString(AvProxyVO.fromModel(avProxy)), duration); } public void offline(AvProxy avProxy) { smartProxyQueue.offline(avProxy); removedProxies.add(avProxy); if (avProxy.getReferCount() != 0) { logger.warn("IP offline {}", JSONObject.toJSONString(AvProxyVO.fromModel(avProxy))); } } public String getDomain() { return domain; } public ResourceFacade getResourceFacade() { return resourceFacade; } public void adjustPriority(AvProxy avProxy) { smartProxyQueue.adjustPriority(avProxy); } public int getCoreSize() { return coreSize; } public void setCoreSize(int coreSize) { this.coreSize = coreSize; } public List<String> getTestUrls() { return testUrls; } public SmartProxyQueue getSmartProxyQueue() { return smartProxyQueue; } public boolean isRefreshing() { return refreshTaskNumber.get() > 0; } }