package com.yichao.woo.univs;
import com.fasterxml.jackson.databind.JavaType;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.HttpClients;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawls university data from Renren.
 *
 * <p>{@link #start()} downloads the list at {@link #ALL_UNIVS}, extracts the
 * {@code "provs"} JSON array from it, and then, for every university of every
 * province, requests {@link #GET_DEPT} and {@link #GET_DORM} and collects the
 * text of each {@code <option>} element in the responses. The result is written
 * to one UTF-8 text file per province, named {@code <province name>.txt}, in the
 * current working directory.
 */
public class UnivDataCrawler {

    public static final String GET_DEPT = "http://www.renren.com/GetDep.do?id=";
    public static final String GET_DORM = "http://www.renren.com/GetDorm.do?id=";
    public static final String ALL_UNIVS = "http://s.xnimg.cn/a13819/allunivlist.js";

    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /** Captures the text content of one {@code <option>} element. */
    private static final Pattern OPTION_PATTERN = Pattern.compile("<option[^>]*>([^<]*)</option>");

    /** Matches from {@code "provs":} up to and including the first following {@code ]}}. */
    private static final Pattern PROVS_PATTERN = Pattern.compile("\"provs\":(.*?)]}");

    /** Prefix of the {@link #PROVS_PATTERN} match that precedes the JSON array. */
    private static final String PROVS_PREFIX = "\"provs\":";

    /** Decimal numeric character reference such as {@code &#20013;} (at most 7 digits, so it fits an int). */
    private static final Pattern DEC_REFERENCE = Pattern.compile("&#(\\d{1,7});");

    /** Literal backslash-u escape followed by exactly four hex digits. */
    private static final Pattern HEX_ESCAPE = Pattern.compile("\\\\u([0-9a-fA-F]{4})");

    /*
     * This Cookie value may need to be updated in order to crawl data from Renren.
     * Without it, the response is the HTML document of the home page.
     *
     * NOTE(review): these are hard-coded session credentials; prefer supplying a
     * fresh value through UnivDataCrawler(String) instead of editing this constant.
     * The original literal started with a stray "Cookie: " inside the header value,
     * which has been removed here.
     */
    private static final String DEFAULT_COOKIE =
            "anonymid=hz69qrm3-8j3kgb; _r01_=1; JSESSIONID=abc0ZuMsiOs7t-QCUcLGu; depovince=GW; "
            + "XNESSESSIONID=e71b1dc153f0; wp=0; jebecookies=2fe87010-a7df-4631-b681-d7a9105b59bd|||||; "
            + "ick_login=7386442e-00ab-4548-a2eb-88226ca710bf; _de=B7F46AAA715F528B5BF7EAB63017E04A; "
            + "p=806dfd7f8ba7025b101e5ea89d3258252; ap=247748802; first_login_flag=1; "
            + "t=50085a807668da0514e80f8f39dab6822; societyguester=50085a807668da0514e80f8f39dab6822; "
            + "id=247748802; xnsid=bf40569b; loginfrom=syshome; ln_uact=tmac1ro@163.com; "
            + "ln_hurl=http://hdn.xnimg.cn/photos/hdn121/20140622/0015/main_HkP8_5418000014db1986.jpg; "
            + "WebOnLineNotice_247748802=1; ver=rewrite";

    // The four File fields below are not referenced anywhere else in this class.
    private File provinceSql = new File("province.sql");
    private File collegeSql = new File("college.sql");
    private File departmentSql = new File("department.sql");
    private File dormSql = new File("dorm.sql");

    /** One client for all requests, instead of a new (never shut down) client per request. */
    private final HttpClient httpClient = HttpClients.createDefault();

    /** Value sent in the {@code Cookie} request header. */
    private final String cookie;

    /** Creates a crawler that sends the built-in default cookie. */
    public UnivDataCrawler() {
        this(DEFAULT_COOKIE);
    }

    /**
     * Creates a crawler that sends the given cookie.
     *
     * @param cookie value of the {@code Cookie} request header (without the header name)
     * @throws NullPointerException if {@code cookie} is null
     */
    public UnivDataCrawler(String cookie) {
        if (cookie == null) {
            throw new NullPointerException("cookie");
        }
        this.cookie = cookie;
    }

    /**
     * Runs the whole crawl and writes one text file per province.
     *
     * @throws IOException if a request fails, the university list cannot be located
     *                     or parsed, or an output file cannot be written
     */
    public void start() throws IOException {
        List<Province> provinces = getProvinces(getCnUnivsJsonString());
        for (Province province : provinces) {
            province.setName(convertFromHex(province.getName()));
        }
        for (Province province : provinces) {
            for (Univ univ : province.getUnivs()) {
                String id = univ.getId();
                collectOptionNames(GET_DEPT + id, univ.getDepartments());
                collectOptionNames(GET_DORM + id, univ.getDorms());
            }
            for (Univ univ : province.getUnivs()) {
                // Drop the first collected entry of each list; guard against responses
                // that contained no <option> at all, which used to throw here.
                if (!univ.getDepartments().isEmpty()) {
                    univ.getDepartments().remove(0);
                }
                if (!univ.getDorms().isEmpty()) {
                    univ.getDorms().remove(0);
                }
            }
            writeProvinceFile(province);
        }
        System.out.println("Data Ready");
    }

    /** Fetches {@code url} and appends the decoded text of every {@code <option>} to {@code target}. */
    private void collectOptionNames(String url, List<String> target) throws IOException {
        String response = getResponseString(issueHttpRequest(url));
        Matcher matcher = OPTION_PATTERN.matcher(response);
        while (matcher.find()) {
            target.add(convertFromDec(matcher.group(1)));
        }
    }

    /** Writes the province header and all of its universities to {@code <province name>.txt}. */
    private void writeProvinceFile(Province province) throws IOException {
        Path univsOfOneProvince = Paths.get(province.getName() + ".txt");
        // try-with-resources closes the file even when a write fails part-way.
        try (BufferedWriter writer = Files.newBufferedWriter(univsOfOneProvince, UTF_8)) {
            writer.write(province.getId() + " " + province.getName() + "\r\n");
            writer.write("\r\n");
            for (Univ univ : province.getUnivs()) {
                writer.write(univ.getId() + " " + univ.getName() + "\r\n\r\n");
                writer.write("院系:\r\n");
                for (String dept : univ.getDepartments()) {
                    writer.write(dept + "\r\n");
                }
                writer.write("\r\n寝室:\r\n");
                for (String dorm : univ.getDorms()) {
                    writer.write(dorm + "\r\n");
                }
                writer.write("\r\n\r\n");
            }
        }
    }

    /**
     * Replaces decimal numeric character references ({@code &#NNNN;}) with the
     * characters they denote.
     *
     * <p>The previous implementation searched for the empty string, which always
     * matches at index 0, so it failed on ordinary input. References that do not
     * denote a valid Unicode code point are left unchanged.
     *
     * @param code text possibly containing {@code &#NNNN;} references; may be null
     * @return the decoded text, or null if {@code code} is null
     */
    private String convertFromDec(String code) {
        if (code == null) {
            return null;
        }
        Matcher matcher = DEC_REFERENCE.matcher(code);
        StringBuffer decoded = new StringBuffer(code.length());
        while (matcher.find()) {
            int codePoint = Integer.parseInt(matcher.group(1), 10);
            // Character.toChars also handles code points above U+FFFF, which a (char) cast would truncate.
            String replacement = Character.isValidCodePoint(codePoint)
                    ? new String(Character.toChars(codePoint))
                    : matcher.group();
            matcher.appendReplacement(decoded, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(decoded);
        return decoded.toString();
    }

    /**
     * Replaces literal backslash-u escapes (a backslash, {@code u} and four hex
     * digits) with the UTF-16 code unit they denote.
     *
     * <p>Sequences that are not followed by exactly four hex digits are left
     * unchanged instead of raising an exception.
     *
     * @param code text possibly containing such escapes; may be null
     * @return the decoded text, or null if {@code code} is null
     */
    private String convertFromHex(String code) {
        if (code == null) {
            return null;
        }
        Matcher matcher = HEX_ESCAPE.matcher(code);
        StringBuffer decoded = new StringBuffer(code.length());
        while (matcher.find()) {
            char unit = (char) Integer.parseInt(matcher.group(1), 16);
            matcher.appendReplacement(decoded, Matcher.quoteReplacement(Character.toString(unit)));
        }
        matcher.appendTail(decoded);
        return decoded.toString();
    }

    /**
     * Writes {@code cnUnivs} to {@code allUniv.txt}, split on {@code '}'} with each
     * piece followed by {@code "}"} and a CRLF. Not called from within this class.
     */
    private void writeCnUnivsToText(String cnUnivs) throws IOException {
        Path path = Paths.get("allUniv.txt");
        try (BufferedWriter bufferedWriter = Files.newBufferedWriter(path, UTF_8)) {
            for (String piece : cnUnivs.split("}")) {
                bufferedWriter.write(piece);
                bufferedWriter.write("}");
                bufferedWriter.write("\r\n");
            }
        }
    }

    /** Deserializes the JSON array text into a list of {@code Province} objects. */
    private List<Province> getProvinces(String cnUnivs) throws IOException {
        ObjectMapper objectMapper = new ObjectMapper();
        JavaType type = objectMapper.getTypeFactory().constructCollectionType(List.class, Province.class);
        return objectMapper.readValue(cnUnivs, type);
    }

    /**
     * Downloads {@link #ALL_UNIVS} and returns the text that follows the
     * {@code "provs":} key in the matched region.
     */
    private String getCnUnivsJsonString() throws IOException {
        String responseString = getResponseString(issueHttpRequest(ALL_UNIVS));
        // NOTE(review): the matched text still ends with "}" after the array's "]";
        // this presumably relies on the JSON parser ignoring trailing content - confirm.
        return getCnUnivs(responseString).substring(PROVS_PREFIX.length());
    }

    /**
     * Returns the first region of {@code responseString} matched by {@link #PROVS_PATTERN}.
     *
     * @throws IOException if the response contains no such region
     */
    private String getCnUnivs(String responseString) throws IOException {
        Matcher matcher = PROVS_PATTERN.matcher(responseString);
        if (!matcher.find()) {
            throw new IOException("No \"provs\" array found in the response from " + ALL_UNIVS);
        }
        return matcher.group(0);
    }

    /**
     * Reads the entity body as UTF-8 text and returns its lines concatenated
     * without line separators.
     *
     * @param httpEntity response entity; null yields an empty string
     * @return the body text with line breaks removed
     */
    private String getResponseString(HttpEntity httpEntity) throws IOException {
        if (httpEntity == null) {
            return "";
        }
        StringBuilder result = new StringBuilder();
        // Closing the content stream also hands the connection back to the shared client.
        try (BufferedReader bufferedReader =
                     new BufferedReader(new InputStreamReader(httpEntity.getContent(), UTF_8))) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                result.append(line);
            }
        }
        return result.toString();
    }

    /**
     * Sends a GET request for {@code url} with the configured cookie and returns the
     * response entity. The caller must consume or close the entity's content stream.
     */
    private HttpEntity issueHttpRequest(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Cookie", cookie);
        HttpResponse response = httpClient.execute(httpGet);
        return response.getEntity();
    }

    /**
     * Runs the crawler.
     *
     * @param args optional; when present, {@code args[0]} is used as the Cookie header value
     */
    public static void main(String[] args) throws IOException {
        UnivDataCrawler crawler = args.length > 0 ? new UnivDataCrawler(args[0]) : new UnivDataCrawler();
        crawler.start();
    }
}