package com.mite8.Insight.movie_great_wall;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import com.mite8.jx.gz.dn.utils.DefineDn;
import com.mite8.utils.CutDoubleValue;
import com.mite8.utils.DefineOut;
import com.mite8.utils.TransferTime;
import com.mite8.utils.ansj_util.LoadDictionary;
import com.mite8.utils.ansj_util.ResultFilter;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.springframework.jdbc.core.JdbcTemplate;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
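/**
 * Author: blogc
 * Time: 2016/11/23.
 * Email: blogchong#qq.com
 * WeChat official account: 数据虫巢 (ID: blogchong)
 * Desc: Entry point for crawling Douban movie reviews. Parses the movie-level data: number of
 * review pages, movie name, director, lead actors, genre, region, release date, record time,
 * total review count, and the number of 1/2/3/4/5-star ratings.
 */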
public class OptMovie {

    private static final Logger logger = Logger.getLogger(OptMovie.class.getName());

    /**
     * Douban subject id. Not read inside this class (every method takes its own {@code id}
     * parameter); kept public and mutable for existing external users.
     */
    public static String id = "6982558";

    /** Common prefix of every Douban subject URL built by this class. */
    private static final String SUBJECT_URL_PREFIX = "https://movie.douban.com/subject/";

    /** Step of the {@code start} query parameter between two consecutive review list pages. */
    private static final int REVIEWS_PER_PAGE = 20;

    /** Value stored under {@code m_page} when the review pager cannot be parsed. */
    private static final int DEFAULT_REVIEW_PAGES = 87;

    /** Value returned by {@link #AnalysisNum2(String)} when the page cannot be fetched or parsed. */
    private static final int DEFAULT_SHORT_COMMENT_LOOPS = 24054;

    /**
     * Crawl entry point: collects the movie-level data, stores it, then walks every review list
     * page of the movie and hands each page URL to {@code AnalysisList.analysisList}.
     *
     * <p>If the movie-level data could not be parsed completely (so {@code m_page} is missing or
     * zero), an error is logged and no list page is visited.
     *
     * <p>The short-comment crawl (the {@code /comments} pages, sized via
     * {@link #AnalysisNum2(String)} and handled by {@code AnalysisListShort}) is currently
     * disabled. When it was enabled, the {@code start} offset advanced by 22 after the first
     * page and by 20 afterwards.
     *
     * @param id           Douban subject id of the movie
     * @param jdbcTemplate JDBC template passed through to the storage and list-analysis helpers
     */
    public static void optMovie(String id, JdbcTemplate jdbcTemplate) {
        long beginTime = TransferTime.dateToLong(new Date()) / 1000;
        String updateTime = TransferTime.dateToString(new Date(), DefineOut.timeFormat);

        // Segmentation dictionaries.
        Forest emotionForest = LoadDictionary.dicSegMap.get("emotion");
        Forest starForest = LoadDictionary.dicSegMap.get("movie");

        // Part-of-speech (nature) filters matching the dictionaries above.
        ResultFilter resultFilterEmotion = new ResultFilter();
        resultFilterEmotion.addNatureFilterByNature("emotion");
        ResultFilter resultFilterStar = new ResultFilter();
        resultFilterStar.addNatureFilterByNature("movie");

        JSONObject globalInfo = AnalysisNum(id, updateTime);
        int type = CheckAndStore.checkNameAndPTime(jdbcTemplate, id);
        CheckAndStore.storeData(jdbcTemplate, globalInfo, type);

        // AnalysisNum returns a partial object when parsing fails; the opt* accessors turn a
        // missing key into 0 / "" so that case reaches the error branch below instead of throwing.
        int num = globalInfo.optInt("m_page", 0);
        String m_name = globalInfo.optString("m_name");
        String m_url = globalInfo.optString("m_url");

        int count = 0;
        if (num <= 0) {
            logger.severe("ERROR - optMovie, Please check!");
        } else {
            for (int i = 0; i < num; i++) {
                // URL of the i-th review list page.
                String listUrl = SUBJECT_URL_PREFIX + id + "/reviews?start=" + i * REVIEWS_PER_PAGE;
                AnalysisList.analysisList(emotionForest, resultFilterEmotion,
                        starForest, resultFilterStar,
                        jdbcTemplate, listUrl, updateTime, id, m_name, m_url);
                count++;
                logger.info("INFO - COMMENTS, count[" + count + "] list_url:" + listUrl);
            }
        }

        long endTime = TransferTime.dateToLong(new Date()) / 1000;
        logger.info("TAKE - COMMENTS TIME: " + (endTime - beginTime) + "s Notes: " + count);
    }

    /**
     * Determines the loop count for the short-comment crawl by fetching the first short-comment
     * page and applying {@code MovieUtils.patternDP} to the text of the active tab
     * ({@code li[class=is-active]}).
     *
     * @param id Douban subject id of the movie
     * @return the parsed number, or 24054 when the page cannot be fetched or the pattern does
     *         not match
     */
    public static int AnalysisNum2(String id) {
        // e.g. https://movie.douban.com/subject/6982558/comments?start=0&limit=20&sort=new_score&status=P
        String url = SUBJECT_URL_PREFIX + id + "/comments?start=0&limit=20&sort=new_score&status=P";
        try {
            Page page = fetchPage(url);
            String activeTabText = page.select("li[class=is-active]").text();
            return firstGroupAsInt(MovieUtils.patternDP.matcher(activeTabText),
                    DEFAULT_SHORT_COMMENT_LOOPS);
        } catch (Exception e) {
            logger.log(Level.SEVERE, "[ERROR] - AnalysisNum2: url[" + url + "] - Detail: " + e, e);
            return DEFAULT_SHORT_COMMENT_LOOPS;
        }
    }

    /**
     * Fetches the movie's subject page and its first review page and extracts the movie-level
     * data into a JSON object.
     *
     * <p>Keys written, in this order: {@code update_time}, {@code m_id}, {@code m_name},
     * {@code m_year}, {@code m_dy} (director), {@code m_bj} (screenwriter, always empty),
     * {@code m_zy} (cast), {@code m_type}, {@code m_area} (always empty), {@code m_time},
     * {@code m_score}, {@code m_num}, {@code m_1_num}..{@code m_5_num}, {@code m_c_score},
     * {@code m_c_num}, {@code m_c_1_num}..{@code m_c_5_num}, {@code m_url}, {@code m_page}.
     *
     * <p>Any exception aborts the extraction: it is logged and the object is returned with only
     * the keys written up to that point, so callers must not assume every key is present.
     *
     * @param id          Douban subject id of the movie
     * @param update_time timestamp string stored under {@code update_time}
     * @return the (possibly partial) movie-level data; never {@code null}
     */
    public static JSONObject AnalysisNum(String id, String update_time) {
        JSONObject jsonObject = new JSONObject();
        try {
            jsonObject.put("update_time", update_time);
            jsonObject.put("m_id", id);

            String subjectUrl = SUBJECT_URL_PREFIX + id + "/";
            String reviewsUrl = SUBJECT_URL_PREFIX + id + "/reviews";
            Page subjectPage = fetchPage(subjectUrl);
            Page reviewsPage = fetchPage(reviewsUrl);

            // Name.
            jsonObject.put("m_name", subjectPage.select("span[property=v:itemreviewed]").text());

            // Year: keep the raw text unless patternKH extracts a group from it.
            String m_year = subjectPage.select("span[class=year]").text();
            Matcher yearMatcher = MovieUtils.patternKH.matcher(m_year);
            if (yearMatcher.find()) {
                m_year = yearMatcher.group(1);
            }
            jsonObject.put("m_year", m_year);

            // Director.
            jsonObject.put("m_dy", subjectPage.select("a[rel=v:directedBy]").text());
            // Screenwriter: not parsed.
            jsonObject.put("m_bj", "");
            // Cast.
            jsonObject.put("m_zy",
                    subjectPage.select("span[class=actor]").select("span[class=attrs]").text());
            // Genre.
            jsonObject.put("m_type", subjectPage.select("span[property=v:genre]").text());
            // Region: not parsed.
            jsonObject.put("m_area", "");
            // Release date.
            jsonObject.put("m_time", subjectPage.select("span[property=v:initialReleaseDate]").text());

            // Overall score and number of voters. A non-numeric text here throws and aborts
            // the extraction via the catch below.
            double m_score = Double.parseDouble(
                    subjectPage.select("strong[class=ll rating_num]").text());
            jsonObject.put("m_score", m_score);
            int m_num = Integer.parseInt(subjectPage.select("span[property=v:votes]").text());
            jsonObject.put("m_num", m_num);

            // Votes per star, derived from the percentage shown for each star level.
            // NOTE(review): the i-th rating_per element is stored as the (i+1)-star count;
            // confirm the page really lists the percentages in ascending star order.
            int[] starVotes = new int[5];
            Elements ratingPercents = subjectPage.select("span[class=rating_per]");
            if (ratingPercents.size() == 5) {
                for (int i = 0; i < 5; i++) {
                    Matcher percentMatcher =
                            MovieUtils.patternScore.matcher(ratingPercents.get(i).text());
                    if (percentMatcher.find()) {
                        starVotes[i] =
                                (int) (Double.parseDouble(percentMatcher.group(1)) / 100 * m_num);
                    }
                }
            }
            jsonObject.put("m_1_num", starVotes[0]);
            jsonObject.put("m_2_num", starVotes[1]);
            jsonObject.put("m_3_num", starVotes[2]);
            jsonObject.put("m_4_num", starVotes[3]);
            // The last bucket absorbs the truncation error so the five buckets sum to m_num.
            jsonObject.put("m_5_num",
                    m_num - (starVotes[0] + starVotes[1] + starVotes[2] + starVotes[3]));

            // Review counts: total and per star, read from the rating filter links.
            int m_c_num = firstGroupAsInt(
                    MovieUtils.patternScoreCommentsAll.matcher(
                            reviewsPage.select("a[href=?rating=]").text()),
                    0);
            int[] reviewStars = new int[5];
            for (int star = 1; star <= 5; star++) {
                String linkText = reviewsPage.select("a[href=?rating=" + star + "]").text();
                reviewStars[star - 1] = firstGroupAsInt(
                        MovieUtils.patternScoreCommentsEach.matcher(linkText), 0);
            }

            // Review score on a 10-point scale, computed from the review star counts.
            double m_c_score = CutDoubleValue.cutDoubleValue(weightedTenPointScore(reviewStars), 2);
            jsonObject.put("m_c_score", m_c_score);
            jsonObject.put("m_c_num", m_c_num);
            jsonObject.put("m_c_1_num", reviewStars[0]);
            jsonObject.put("m_c_2_num", reviewStars[1]);
            jsonObject.put("m_c_3_num", reviewStars[2]);
            jsonObject.put("m_c_4_num", reviewStars[3]);
            jsonObject.put("m_c_5_num", reviewStars[4]);
            jsonObject.put("m_url", subjectUrl);

            // Number of review pages, parsed from the pager markup (HTML, not just its text).
            String pagerHtml = reviewsPage.select("span[class=thispage]").toString();
            int m_page = firstGroupAsInt(
                    MovieUtils.patternPage.matcher(pagerHtml), DEFAULT_REVIEW_PAGES);
            jsonObject.put("m_page", m_page);
        } catch (Exception e) {
            logger.log(Level.SEVERE, "[ERROR] - AnalysisNum. Detail: " + e, e);
        }
        return jsonObject;
    }

    /** Issues a GET for {@code url} with the shared cookie and user agent and wraps the response. */
    private static Page fetchPage(String url) throws Exception {
        CrawlDatum crawlDatum = new CrawlDatum(url).putMetaData("method", "GET");
        HttpRequest request = new HttpRequest(crawlDatum.getUrl());
        request.setMethod(crawlDatum.getMetaData("method"));
        String outputData = crawlDatum.getMetaData("outputData");
        if (outputData != null) {
            request.setOutputData(outputData.getBytes(StandardCharsets.UTF_8));
        }
        request.setCookie(MovieUtils.cookie);
        request.setUserAgent(MovieUtils.agent);
        HttpResponse response = request.getResponse();
        return new Page(crawlDatum, response);
    }

    /** Returns capture group 1 of the first match parsed as an int, or {@code fallback} if none. */
    private static int firstGroupAsInt(Matcher matcher, int fallback) {
        return matcher.find() ? Integer.parseInt(matcher.group(1)) : fallback;
    }

    /**
     * Averages star counts ({@code starCounts[i]} holds the count for {@code i + 1} stars) and
     * doubles the result to a 10-point scale; returns 0 when there are no counts at all.
     */
    private static double weightedTenPointScore(int[] starCounts) {
        long weighted = 0;
        long total = 0;
        for (int i = 0; i < starCounts.length; i++) {
            weighted += (long) (i + 1) * starCounts[i];
            total += starCounts[i];
        }
        if (total == 0) {
            return 0.0;
        }
        return (double) weighted * 2 / (double) total;
    }
}