package com.mite8.wechat.wpweixin_com; import cn.edu.hfut.dmic.webcollector.model.CrawlDatum; import cn.edu.hfut.dmic.webcollector.model.Page; import cn.edu.hfut.dmic.webcollector.net.HttpRequest; import cn.edu.hfut.dmic.webcollector.net.HttpResponse; import com.mite8.Insight.movie_great_wall.MovieUtils; import com.mite8.utils.DefineOut; import com.mite8.utils.TransferTime; import net.sf.json.JSONArray; import net.sf.json.JSONObject; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.stereotype.Service; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Random; import java.util.logging.Logger; import java.util.regex.Pattern; /** * Author: blogchong * Time: 2016/11/23. * Email: blogchong#qq.com * 公众号:数据虫巢 ID:blogchong // * Desc: http://data.wpweixin.com/微果酱 */ @Service public class WpweixinService { @Autowired private JdbcTemplate jdbcTemplate; private static final String cookie = "_ga=GA1.2.1523211651.1483093510; _gat=1; Hm_lvt_ecf82f2f1ced060ee9a095a6f21792e9=1483459256,1483495005,1483529353,1483600935; Hm_lpvt_ecf82f2f1ced060ee9a095a6f21792e9=1483600935; SID=wxw9iboy6ttof4rjs9nwg6sm; Hm_lvt_753c2243909f3929f565db606d75dae6=1483459257,1483495006,1483529354,1483600938; Hm_lpvt_753c2243909f3929f565db606d75dae6=1483601009"; private static final Logger logger = Logger.getLogger(WpweixinService.class.getName()); private static Pattern patternID = Pattern.compile("\" data-sku=\"(\\d+)\" href="); private static int sleepTime = 2000; public void getWechatData(){ optWechatData(jdbcTemplate); } // public static void main(String[] args) { // optWechatData(null); // } //操作遍历入口 public static void optWechatData4(JdbcTemplate jdbcTemplate, String rankingDate){ long beginTime = TransferTime.dateToLong(new Date())/1000; //12-22号,刚跑完财富,暂停 // String[] typeListName = {"科技","创业","汽车","楼市","职场","教育","学术","政务","企业","文化","百科","幽默"}; String[] typeListName = {"时尚","健康","乐活","情感","美食","旅行","体娱","美体","文摘","时事","民生","财富","科技","创业","汽车","楼市","职场","教育","学术","政务","企业","文化","百科","幽默"}; // String rankingDate = "2016-12-22"; Random random = new Random(); String updateTime = TransferTime.dateToString(new Date(), DefineOut.timeFormat); int count_all = 0; // String accountType = "时尚"; for (String accountType: typeListName) { //初始总数,默认是 int total = 100; int page_flag = 0; int count_type = 0; for (int i = 0; i < total; i += 50) { String url = "http://data.wpweixin.com/v1/mp/account/rank/daily?" + "rankingDate=" + rankingDate + "&accountType=" + accountType + "&offset=" + i + "&size=50"; try { CrawlDatum crawlDatum_in = new CrawlDatum(url).putMetaData("method", "GET"); HttpRequest request_in = new HttpRequest(crawlDatum_in.getUrl()); request_in.setMethod(crawlDatum_in.getMetaData("method")); String outputData_in = crawlDatum_in.getMetaData("outputData"); if (outputData_in != null) { request_in.setOutputData(outputData_in.getBytes("utf-8")); } request_in.setCookie(cookie); request_in.setUserAgent(MovieUtils.agent); HttpResponse httpResponse_in = request_in.getResponse(); Page page_in = new Page(crawlDatum_in, httpResponse_in); String json_str = page_in.getHtml(); if (count_all!=0) { int sleep_time_in = random.nextInt(2000) + 3000; logger.info("[INFO-SLEEP] rankingDate[" + rankingDate + "] accountType[" + accountType + "] total[" + total + "] count_all[" + count_all + "] count_type[" + count_type + "] offset[" + i + "] It's time to sleep[" + sleep_time_in + "]."); Thread.sleep(sleep_time_in); } try { JSONObject jsonObject = JSONObject.fromObject(json_str); //解析数据,并且入库 JSONObject jsonObjectData = jsonObject.getJSONObject("data"); int totalTmp = jsonObjectData.getInt("total"); if (total == 100 && totalTmp != 0) { total = totalTmp; } JSONArray jsonArray = jsonObjectData.getJSONArray("list"); if (jsonArray != null && jsonArray.size() == 0) { logger.info("[ERROR-JAR] rankingDate[" + rankingDate + "] accountType[" + accountType + "]" + " total[" + total + "] count_all[" + count_all + "] " + "count_type[" + count_type + "] offset[" + i + "] url[" + url + "] error[jsonArray=null or jsonArray.size()==0]"); } else { //进行正常解析并且入库 List<JSONObject> list = new ArrayList<>(); for (int j = 0; j < jsonArray.size(); j++) { try { JSONObject jsonObject1 = jsonArray.getJSONObject(j); if (jsonObject1.size() >= 24) { list.add(jsonObject1); count_all++; count_type++; } } catch (Exception e) { logger.info("[ERROR-IN_JAR] rankingDate[" + rankingDate + "] accountType[" + accountType + "]" + " total[" + total + "] count_all[" + count_all + "] " + "count_type[" + count_type + "] offset[" + i + "] note[" + j + "] url[" + url + "] error[" + e + "]"); } } //进行入库操作 storeListData2(jdbcTemplate, updateTime, rankingDate, list); page_flag++; // logger.info("[INFO-TYPE_PAGE] accountType[" + accountType + "]" + // " total[" + total + "] count_all[" + count_all + "] " + // "count_type[" + count_type + "] offset[" + i + "] page_flag[" + page_flag + "] url[" + url + "]"); } } catch (Exception e) { logger.info("[ERROR-ANA_HTML] rankingDate[" + rankingDate + "] accountType[" + accountType + "]" + " total[" + total + "] count_all[" + count_all + "] " + "count_type[" + count_type + "] offset[" + i + "] url[" + url + "] error[" + e + "]"); } } catch (Exception e) { logger.info("[ERROR-ANA_URL] rankingDate[" + rankingDate + "] accountType[" + accountType + "]" + " total[" + total + "] count_all[" + count_all + "] " + "count_type[" + count_type + "] offset[" + i + "] url[" + url + "] error[" + e + "]"); } } logger.info("[INFO-TYPE_OVER] rankingDate[" + rankingDate + "] accountType[" + accountType + "]" + " total[" + total + "] count_all[" + count_all + "] " + "count_type[" + count_type + "]"); try { int sleep_time_out = random.nextInt(10000) + 20000; logger.info("[INFO-SLEEP_OUT] rankingDate[" + rankingDate + "] accountType[" + accountType + "] total[" + total + "] count_all[" + count_all + "] count_type[" + count_type + "] It's time to sleep[" + sleep_time_out + "]."); Thread.sleep(sleep_time_out); }catch (Exception e){ logger.info("[ERROR-SLEEP_OUT]"); } } long endTime = TransferTime.dateToLong(new Date())/1000; logger.info("[INFO-END] rankingDate["+rankingDate+"] times[" + (endTime - beginTime) + "s] count_all:["+count_all+"]"); } public static void optWechatData(JdbcTemplate jdbcTemplate){ String rankingDate = ""; for (int j = 12; j > 0; j--) { int max = 30; if (j==12){ max = 16; } else if (j==2){ max = 28; }else if (j==1 || j==3 || j==5 || j==7 || j==8 || j==10){ max = 31; } for (int i = max; i > 0; i--) { String month = "12"; String day = "01"; if (j<10){ month = "0"+j; } else { month = j+""; } if (i<10){ day = "0"+i; }else { day = i+""; } rankingDate = "2016-"+month+"-"+day; optWechatData4(jdbcTemplate, rankingDate); } } } //操作遍历入口 public static void optWechatData2(JdbcTemplate jdbcTemplate,String rankingDate){ long beginTime = TransferTime.dateToLong(new Date())/1000; String[] typeListName = {"时尚","健康","乐活","情感","美食","旅行","体娱","美体","文摘","时事","民生","财富","科技","创业","汽车","楼市","职场","教育","学术","政务","企业","文化","百科","幽默"}; Random random = new Random(); String updateTime = TransferTime.dateToString(new Date(), DefineOut.timeFormat); int count_all = 0; //遍历类别 for (String accountType: typeListName){ //初始总数,默认是 int total = 100; int page_flag = 0; int count_type = 0; for (int i = 0; i < total; i+=50) { String url = "http://data.wpweixin.com/v1/mp/account/rank/daily?" + "rankingDate="+rankingDate+"&accountType=" + accountType + "&offset="+i+"&size=50"; try { CrawlDatum crawlDatum_in = new CrawlDatum(url).putMetaData("method", "GET"); HttpRequest request_in = new HttpRequest(crawlDatum_in.getUrl()); request_in.setMethod(crawlDatum_in.getMetaData("method")); String outputData_in = crawlDatum_in.getMetaData("outputData"); if (outputData_in != null) { request_in.setOutputData(outputData_in.getBytes("utf-8")); } request_in.setCookie(cookie); request_in.setUserAgent(MovieUtils.agent); HttpResponse httpResponse_in = request_in.getResponse(); Page page_in = new Page(crawlDatum_in, httpResponse_in); String json_str = page_in.getHtml(); try { JSONObject jsonObject = JSONObject.fromObject(json_str); //解析数据,并且入库 JSONObject jsonObjectData = jsonObject.getJSONObject("data"); int totalTmp = jsonObjectData.getInt("total"); if (total == 100 && totalTmp != 0){ total = totalTmp; } JSONArray jsonArray = jsonObjectData.getJSONArray("list"); if (jsonArray!= null && jsonArray.size() == 0) { logger.info("[ERROR-JAR] rankingDate["+rankingDate+"] accountType["+accountType+"]" + " total["+total+"] count_all["+count_all+"] " + "count_type["+count_type+"] offset["+i+"] url["+url+"] error[jsonArray=null or jsonArray.size()==0]"); } else { //进行正常解析并且入库 List<JSONObject> list = new ArrayList<>(); for (int j = 0; j < jsonArray.size(); j++) { try { JSONObject jsonObject1 = jsonArray.getJSONObject(j); if (jsonObject1.size() == 24) { list.add(jsonObject1); count_all++; count_type++; // if (count_all % 3000 == 0 && count_all!=0) { // int sleep_time_in = random.nextInt(1500) + 500; // logger.info("[INFO-SLEEP] rankingDate["+rankingDate+"] accountType["+accountType+"] total["+total+"] count_all["+count_all+"] count_type["+count_type+"] offset["+i+"] It's time to sleep[" + sleep_time_in + "]."); // Thread.sleep(sleep_time_in); // } } } catch (Exception e) { logger.info("[ERROR-IN_JAR] rankingDate["+rankingDate+"] accountType["+accountType+"]" + " total["+total+"] count_all["+count_all+"] " + "count_type["+count_type+"] offset["+i+"] note["+j+"] url["+url+"] error["+e+"]"); } } //进行入库操作 storeListData(jdbcTemplate, updateTime, rankingDate, list); page_flag++; // logger.info("[INFO-TYPE_PAGE] accountType["+accountType+"]" + // " total["+total+"] count_all["+count_all+"] " + // "count_type["+count_type+"] offset["+i+"] page_flag["+page_flag+"] url["+url+"]"); } } catch (Exception e) { logger.info("[ERROR-ANA_HTML] rankingDate["+rankingDate+"] accountType["+accountType+"]" + " total["+total+"] count_all["+count_all+"] " + "count_type["+count_type+"] offset["+i+"] url["+url+"] error["+e+"]"); } }catch (Exception e){ logger.info("[ERROR-ANA_URL] rankingDate["+rankingDate+"] accountType["+accountType+"]" + " total["+total+"] count_all["+count_all+"] " + "count_type["+count_type+"] offset["+i+"] url["+url+"] error["+e+"]"); } } logger.info("[INFO-TYPE_OVER] rankingDate["+rankingDate+"] accountType["+accountType+"]" + " total["+total+"] count_all["+count_all+"] " + "count_type["+count_type+"]"); } long endTime = TransferTime.dateToLong(new Date())/1000; logger.info("[INFO-END] rankingDate["+rankingDate+"] times[" + (endTime - beginTime) + "s] count_all:["+count_all+"]"); try { int sleep_time_in = random.nextInt(5*1000) + 10*1000; logger.info("[INFO-SLEEP_OUT] rankingDate[" + rankingDate + "] It's time to sleep[" + sleep_time_in + "]."); Thread.sleep(sleep_time_in); } catch (Exception e){ logger.info("[ERROR-SLEEP] I need sleep! error["+e+"]"); } } public static void storeListData(JdbcTemplate jdbcTemplate,String update_time, String rankingDate, List<JSONObject> list){ jdbcTemplate.update("set names utf8mb4"); String queryInsertKeyWordsTable="insert into wechat_flow_list" + "(update_time,rankingDate,stat_id,account_id," + "weixin_id,nick_name,account_type,idx1_article_num," + "idx2_article_num,orig_article_num,total_article_num,idx1_read_num," + "idx2_read_num,orig_read_num,total_read_num,idx1_like_num," + "idx2_like_num,orig_like_num,total_like_num,max_read_num," + "max_like_num,ave_like_rate,score,main_ranking," + "type_ranking,stat_time) values"; String queryInsertKeyWordsTableTmp = queryInsertKeyWordsTable; boolean flagKeyWords = false; for (int j = 0; j < list.size(); j++){ try { JSONObject jsonObject = list.get(j); String stat_id = jsonObject.getString("stat_id"); String account_id = jsonObject.getString("account_id"); String weixin_id = jsonObject.getString("weixin_id"); String nick_name = jsonObject.getString("nick_name").replaceAll("\"|'", ""); String account_type = jsonObject.getString("account_type").replaceAll("\"|'", ""); String idx1_article_num = jsonObject.getString("idx1_article_num"); String idx2_article_num = jsonObject.getString("idx2_article_num"); String orig_article_num = jsonObject.getString("orig_article_num"); String total_article_num = jsonObject.getString("total_article_num"); String idx1_read_num = jsonObject.getString("idx1_read_num"); String idx2_read_num = jsonObject.getString("idx2_read_num"); String orig_read_num = jsonObject.getString("orig_read_num"); String total_read_num = jsonObject.getString("total_read_num"); String idx1_like_num = jsonObject.getString("idx1_like_num"); String idx2_like_num = jsonObject.getString("idx2_like_num"); String orig_like_num = jsonObject.getString("orig_like_num"); String total_like_num = jsonObject.getString("total_like_num"); String max_read_num = jsonObject.getString("max_read_num"); String max_like_num = jsonObject.getString("max_like_num"); String ave_like_rate = jsonObject.getString("ave_like_rate"); String score = jsonObject.getString("score"); String main_ranking = jsonObject.getString("main_ranking"); String type_ranking = jsonObject.getString("type_ranking"); String stat_time = jsonObject.getString("stat_time"); if (!flagKeyWords) { queryInsertKeyWordsTable = queryInsertKeyWordsTable + "(\"" + update_time + "\", \"" + rankingDate + "\", \"" + stat_id + "\", \"" + account_id + "\",\"" + weixin_id + "\", \"" + nick_name + "\", \"" + account_type + "\", \"" + idx1_article_num + "\", \"" + idx2_article_num + "\", \"" + orig_article_num + "\", \"" + total_article_num + "\", \"" + idx1_read_num + "\", \"" + idx2_read_num + "\", \"" + orig_read_num + "\", \"" + total_read_num + "\", \"" + idx1_like_num + "\", \"" + idx2_like_num + "\", \"" + orig_like_num + "\", \"" + total_like_num + "\", \"" + max_read_num + "\", \"" + max_like_num + "\", \"" + ave_like_rate + "\", \"" + score + "\", \"" + main_ranking + "\", \"" + type_ranking + "\", \"" + stat_time + "\")"; flagKeyWords = true; } else { queryInsertKeyWordsTable = queryInsertKeyWordsTable + "," + "(\"" + update_time + "\", \"" + rankingDate + "\", \"" + stat_id + "\", \"" + account_id + "\",\"" + weixin_id + "\", \"" + nick_name + "\", \"" + account_type + "\", \"" + idx1_article_num + "\", \"" + idx2_article_num + "\", \"" + orig_article_num + "\", \"" + total_article_num + "\", \"" + idx1_read_num + "\", \"" + idx2_read_num + "\", \"" + orig_read_num + "\", \"" + total_read_num + "\", \"" + idx1_like_num + "\", \"" + idx2_like_num + "\", \"" + orig_like_num + "\", \"" + total_like_num + "\", \"" + max_read_num + "\", \"" + max_like_num + "\", \"" + ave_like_rate + "\", \"" + score + "\", \"" + main_ranking + "\", \"" + type_ranking + "\", \"" + stat_time + "\")"; } }catch (Exception e){ logger.info("[ERROR-IN_STORE] error["+e+"]"); } } if (!queryInsertKeyWordsTableTmp.equals(queryInsertKeyWordsTable)) { jdbcTemplate.update(queryInsertKeyWordsTable); }else { logger.info("[ERROR-OUT_STORE] error[no one good!]"); } } public static void storeListData2(JdbcTemplate jdbcTemplate,String update_time, String rankingDate, List<JSONObject> list){ jdbcTemplate.update("set names utf8mb4"); String queryInsertKeyWordsTable="insert into wechat_flow_list2" + "(update_time,rankingDate,stat_id,account_id," + "weixin_id,nick_name,account_type,idx1_article_num," + "idx2_article_num,orig_article_num,total_article_num,idx1_read_num," + "idx2_read_num,orig_read_num,total_read_num,idx1_like_num," + "idx2_like_num,orig_like_num,total_like_num,max_read_num," + "max_like_num,ave_like_rate,score,main_ranking," + "type_ranking,stat_time) values"; String queryInsertKeyWordsTableTmp = queryInsertKeyWordsTable; boolean flagKeyWords = false; for (int j = 0; j < list.size(); j++){ try { JSONObject jsonObject = list.get(j); String stat_id = jsonObject.getString("stat_id"); String account_id = jsonObject.getString("account_id"); String weixin_id = jsonObject.getString("weixin_id"); String nick_name = jsonObject.getString("nick_name").replaceAll("\"|'", ""); String account_type = jsonObject.getString("account_type").replaceAll("\"|'", ""); String idx1_article_num = jsonObject.getString("idx1_article_num"); String idx2_article_num = jsonObject.getString("idx2_article_num"); String orig_article_num = jsonObject.getString("orig_article_num"); String total_article_num = jsonObject.getString("total_article_num"); String idx1_read_num = jsonObject.getString("idx1_read_num"); String idx2_read_num = jsonObject.getString("idx2_read_num"); String orig_read_num = jsonObject.getString("orig_read_num"); String total_read_num = jsonObject.getString("total_read_num"); String idx1_like_num = jsonObject.getString("idx1_like_num"); String idx2_like_num = jsonObject.getString("idx2_like_num"); String orig_like_num = jsonObject.getString("orig_like_num"); String total_like_num = jsonObject.getString("total_like_num"); String max_read_num = jsonObject.getString("max_read_num"); String max_like_num = jsonObject.getString("max_like_num"); String ave_like_rate = jsonObject.getString("ave_like_rate"); String score = jsonObject.getString("score"); String main_ranking = jsonObject.getString("main_ranking"); String type_ranking = jsonObject.getString("type_ranking"); String stat_time = jsonObject.getString("stat_time"); if (!flagKeyWords) { queryInsertKeyWordsTable = queryInsertKeyWordsTable + "(\"" + update_time + "\", \"" + rankingDate + "\", \"" + stat_id + "\", \"" + account_id + "\",\"" + weixin_id + "\", \"" + nick_name + "\", \"" + account_type + "\", \"" + idx1_article_num + "\", \"" + idx2_article_num + "\", \"" + orig_article_num + "\", \"" + total_article_num + "\", \"" + idx1_read_num + "\", \"" + idx2_read_num + "\", \"" + orig_read_num + "\", \"" + total_read_num + "\", \"" + idx1_like_num + "\", \"" + idx2_like_num + "\", \"" + orig_like_num + "\", \"" + total_like_num + "\", \"" + max_read_num + "\", \"" + max_like_num + "\", \"" + ave_like_rate + "\", \"" + score + "\", \"" + main_ranking + "\", \"" + type_ranking + "\", \"" + stat_time + "\")"; flagKeyWords = true; } else { queryInsertKeyWordsTable = queryInsertKeyWordsTable + "," + "(\"" + update_time + "\", \"" + rankingDate + "\", \"" + stat_id + "\", \"" + account_id + "\",\"" + weixin_id + "\", \"" + nick_name + "\", \"" + account_type + "\", \"" + idx1_article_num + "\", \"" + idx2_article_num + "\", \"" + orig_article_num + "\", \"" + total_article_num + "\", \"" + idx1_read_num + "\", \"" + idx2_read_num + "\", \"" + orig_read_num + "\", \"" + total_read_num + "\", \"" + idx1_like_num + "\", \"" + idx2_like_num + "\", \"" + orig_like_num + "\", \"" + total_like_num + "\", \"" + max_read_num + "\", \"" + max_like_num + "\", \"" + ave_like_rate + "\", \"" + score + "\", \"" + main_ranking + "\", \"" + type_ranking + "\", \"" + stat_time + "\")"; } }catch (Exception e){ logger.info("[ERROR-IN_STORE] error["+e+"]"); } } if (!queryInsertKeyWordsTableTmp.equals(queryInsertKeyWordsTable)) { jdbcTemplate.update(queryInsertKeyWordsTable); }else { logger.info("[ERROR-OUT_STORE] rankingDate["+rankingDate+"] error[no one good!]"); } } }