package com.virjar.dungproxy.client.samples.webmagic;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.ConcurrentMap;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.NameValuePair;
import org.apache.http.client.protocol.HttpClientContext;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.google.common.collect.Maps;
import com.virjar.dungproxy.client.httpclient.CrawlerHttpClient;
import com.virjar.dungproxy.client.httpclient.HeaderBuilder;
import com.virjar.dungproxy.client.httpclient.NameValuePairBuilder;
import com.virjar.dungproxy.client.ippool.IpPoolHolder;
import com.virjar.dungproxy.client.ippool.config.DungProxyContext;
import com.virjar.dungproxy.client.ippool.config.ProxyConstant;
import com.virjar.dungproxy.client.util.CommonUtil;
import com.virjar.dungproxy.client.util.PoolUtil;
import com.virjar.dungproxy.client.webmagic.DungProxyDownloader;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.Scheduler;
/**
* Created by virjar on 17/2/17.<br/>
* 测试对多用户登录的支持
*/
public class MultiUserLoginTest implements PageProcessor {
private static Site site = Site.me()// .setHttpProxy(new HttpHost("127.0.0.1",8888))
.setRetryTimes(3) // 就我的经验,这个重试一般用处不大,他是httpclient内部重试
.setTimeOut(30000)// 在使用代理的情况下,这个需要设置,可以考虑调大线程数目
.setSleepTime(0)// 使用代理了之后,代理会通过切换IP来防止反扒。同时,使用代理本身qps降低了,所以这个可以小一些
.setCycleRetryTimes(3)// 这个重试会换IP重试,是setRetryTimes的上一层的重试,不要怕三次重试解决一切问题。。
.setUseGzip(true);
private static CrawlerHttpClient httpclient = null;
private static Random random = new Random();
private static Map<String, OSChinaUser> userMap = Maps.newConcurrentMap();
static {
OSChinaUser osChinaUser = new OSChinaUser();
osChinaUser.account = "test1@virjar.com";
osChinaUser.password = "ylUbg73Qs";
osChinaUser.userName = "维佳1号";
userMap.put(osChinaUser.account, osChinaUser);
osChinaUser = new OSChinaUser();
osChinaUser.account = "test2@virjar.com";
osChinaUser.password = "ylUbg73Qs";
osChinaUser.userName = "维佳2号";
userMap.put(osChinaUser.account, osChinaUser);
osChinaUser = new OSChinaUser();
osChinaUser.account = "test3@virjar.com";
osChinaUser.password = "ylUbg73Qs";
osChinaUser.userName = "维佳3号";
userMap.put(osChinaUser.account, osChinaUser);
}
public static void main(String[] args) {
// STEP 0 默认的代理池,可选方案,代理池的具体使用参考文档。这里禁用dungproxy的代理池功能
ProxyConstant.CLIENT_CONFIG_FILE_NAME = "proxyclient_oschina.properties";
DungProxyContext dungProxyContext = DungProxyContext.create().setPoolEnabled(false);
IpPoolHolder.init(dungProxyContext);
// STEP 1 获取原生的httpclient执行登录操作
DungProxyDownloader dungProxyDownloader = new DungProxyDownloader();
CrawlerHttpClient httpClient = dungProxyDownloader.getHttpClient(site, null);
MultiUserLoginTest.httpclient = httpClient;
// STEP 2 装配webMagic
Spider spider = Spider.create(new MultiUserLoginTest()).setScheduler(new Scheduler() {
// 重写Scheduler,为了让种子使用随机的方式调度URL任务,否则大部分URL都是第一个登录的账户的。
// 这个调度器让任务根据用户来做均衡。不是这个始终是demo,实际上的用法视情况而定
// 请注意这个没有重写消重器,因为涉及多个账户数据,可能同一个URL的数据内容也不同,所以消重规则应该视情况而定
private ConcurrentMap<String, ConcurrentLinkedDeque<Request>> data = Maps.newConcurrentMap();
private ConcurrentLinkedDeque<Request> createOrGet(Request request) {
Object extra = request.getExtra(ProxyConstant.DUNGPROXY_USER_KEY);
if (extra == null) {
extra = "default_user_account";
}
ConcurrentLinkedDeque<Request> requests = data.get(extra.toString());
if (requests == null) {
synchronized (MultiUserLoginTest.class) {
requests = data.get(extra.toString());
if (requests == null) {
data.put(extra.toString(), new ConcurrentLinkedDeque<Request>());
requests = data.get(extra.toString());
}
}
}
return requests;
}
@Override
public void push(Request request, Task task) {
createOrGet(request).add(request);
}
@Override
public Request poll(Task task) {
int i = random.nextInt(data.size());
for (ConcurrentLinkedDeque<Request> queque : data.values()) {
if (i == 0) {
Request request = queque.poll();
if (request != null) {
return request;
}
}
i--;
}
for (ConcurrentLinkedDeque<Request> queque : data.values()) {
Request request = queque.poll();
if (request != null) {
return request;
}
}
return null;
}
}).setDownloader(dungProxyDownloader).thread(1);
// STEP 3 登录操作
for (OSChinaUser user : userMap.values()) {
if (login(httpClient, user)) {// 登录成功后为这个用户添加一个种子
Request request = new Request("https://www.oschina.net/?nocache=" + System.currentTimeMillis());
request.putExtra(ProxyConstant.DUNGPROXY_USER_KEY, user.account);// 绑定账户到种子上面
spider.addRequest(request);
CommonUtil.sleep(10);// 为了让时间戳不同,避免URL被webmagic消重机制给干掉,实际上关于URL消重应该好好考虑如何实现了
}
}
// STEP 4开启爬虫
spider.run();
}
private static class OSChinaUser {
String account;
String password;
String userName;
}
private static boolean login(CrawlerHttpClient crawlerHttpClient, OSChinaUser user) {
if (user == null) {
return false;
}
HttpClientContext httpClientContext = HttpClientContext.create();
PoolUtil.bindUserKey(httpClientContext, user.account);// 绑定账户到这个请求
PoolUtil.disableDungProxy(httpClientContext);// 暂时禁用代理功能。如果你的代理比较快,可以不禁用
// 构造登录表单参数
List<NameValuePair> params = NameValuePairBuilder.create().addParam("email", user.account)
.addParam("pwd", DigestUtils.sha1Hex(user.password)).addParam("verifyCode").addParam("save_login", "1")
.build();
Header[] headers = HeaderBuilder.create().defaultCommonHeader().withRefer("https://www.oschina.net/home/login")
.buildArray();
// 登录操作
crawlerHttpClient.post("https://www.oschina.net/action/user/hash_login", params, headers, httpClientContext);
String s = crawlerHttpClient.get("https://www.oschina.net/?nocache=" + System.currentTimeMillis(),
httpClientContext);
if (StringUtils.contains(s, "/action/user/logout")) {// 有退出登录的链接,代表登录成功
Elements select = Jsoup.parse(s).select(".user-info span[class=name]");
String loginUserName = select.first().text();
System.out.println(loginUserName + "登录成功");
return true;
}
return false;
}
@Override
public void process(Page page) {
List<String> allLinks = page.getHtml().links().all();
// 绑定在URL种子里面的账户
Object account = page.getRequest().getExtra(ProxyConstant.DUNGPROXY_USER_KEY);
String urlUser = null;
if (account != null) {
urlUser = account.toString();
}
// xsoup有bug,,,,
// String user = page.getHtml().css(".user-info span[class=name]").xpath("text()").get();
Element first = page.getHtml().getDocument().select(".user-info span[class=name").first();
String userName = "不知道姓名";
if (first != null) {
userName = first.ownText().trim();// 这是显示名称,URL种子里面保存的是账户
if (urlUser != null && !StringUtils.equalsIgnoreCase(userMap.get(urlUser).userName, userName)) {
// 这句话永远不能打印出来,如果打印出来,就证明多用户的功能支持不正常
System.out.println("这个链接是:" + urlUser + " 抓取到的,但是返回的网页却是:" + userName + " 的名字,cookie紊乱");
}
} else {
// cookie失效,重新登录
if (StringUtils.contains(page.getRawText(), "https://www.oschina.net/home/reg")) {
if (account != null) {
String userText = account.toString();
System.out.println("用户" + userText + ":session失效,重新登录...");
login(httpclient, userMap.get(userText));
}
}
}
System.out.println("当前页面是:《" + userName + "》 的数据");
for (String url : allLinks) {
if (StringUtils.contains(url, "my.oschina.net")) {// 控制爬虫在自己家目录爬取,实际上根据自己需要控制爬虫边缘
page.addTargetRequest(url);
}
}
}
@Override
public Site getSite() {
return site;
}
}