package com.mite8.Insight.jd_wumai;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import com.mite8.Insight.movie_great_wall.MovieUtils;
import com.mite8.utils.DefineOut;
import com.mite8.utils.TransferTime;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.stereotype.Service;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Author: blogchong
 * Time: 2016/11/23.
 * Email: blogchong#qq.com
 * WeChat official account: "Shuju Chongchao" (ID: blogchong)
 * Desc: Crawls JD.com product comments.
 *
 * <p>The crawl walks a fixed number of JD search-result pages (the keyword in the
 * URL is the percent-encoded form of "smog mask"), extracts the product SKU ids
 * found on each page, pages through the comment endpoint of every product and
 * inserts the comments into the {@code insight_jd_comments} table.
 */
@Service
public class OptJDcomments {

    @Autowired
    private JdbcTemplate jdbcTemplate;

    private static final Logger logger = Logger.getLogger(OptJDcomments.class.getName());

    /** Captures the numeric SKU id from the markup of an anchor element. */
    private static final Pattern patternID = Pattern.compile("\" data-sku=\"(\\d+)\" href=");

    /** Number of consecutive failed/empty attempts after which a product is abandoned. */
    private static final int S_NUM = 5;

    /** Base back-off in milliseconds after a failed comment request. */
    private static final int sleepTime = 2000;

    /** Number of search-result pages visited; also the fallback of {@link #AnalysisNum(String)}. */
    private static final int LIST_PAGE_COUNT = 63;

    // Sample search URL:
    // https://search.jd.com/Search?keyword=%E9%9B%BE%E9%9C%BE%E5%8F%A3%E7%BD%A9&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E9%9B%BE%E9%9C%BE%E5%8F%A3%E7%BD%A9&psort=4&page=1&s=1&click=0
    private static final String LIST_URL_PREFIX =
            "https://search.jd.com/Search?keyword=%E9%9B%BE%E9%9C%BE%E5%8F%A3%E7%BD%A9&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E9%9B%BE%E9%9C%BE%E5%8F%A3%E7%BD%A9&psort=4&page=";

    private static final String LIST_URL_SUFFIX = "&s=61&click=0";

    private static final String INSERT_PREFIX = "insert into insight_jd_comments"
            + "(update_time,p_id,id,content,creationTime,referenceTime,"
            + "referenceName,userProvince,productColor,"
            + "referenceMonth,referenceDay,referenceHours) values";

    /** One placeholder group per inserted row: update_time plus the eleven {@link #ROW_KEYS}. */
    private static final String ROW_PLACEHOLDERS = "(?,?,?,?,?,?,?,?,?,?,?,?)";

    /** JSON keys of a stored row, in the same order as the columns that follow update_time. */
    private static final String[] ROW_KEYS = {
        "p_id", "id", "content", "creationTime", "referenceTime",
        "referenceName", "userProvince", "productColor",
        "referenceMonth", "referenceDay", "referenceHours"
    };

    /** Runs the crawl with the injected {@link JdbcTemplate}. */
    public void getJDComments() {
        optJDComments(jdbcTemplate);
    }

    /**
     * Entry point of the crawl loop: visits every search-result page, and for each
     * product id found there downloads and stores its comments.
     *
     * <p>A failure on one search page is logged and the crawl moves on to the next
     * page. If the thread is interrupted, the interrupt flag is restored and the
     * crawl stops.
     *
     * @param jdbcTemplate template used to persist the comments
     */
    public static void optJDComments(JdbcTemplate jdbcTemplate) {
        long beginTime = TransferTime.dateToLong(new Date()) / 1000;
        String updateTime = TransferTime.dateToString(new Date(), DefineOut.timeFormat);
        Random random = new Random();
        int count = 0;
        int countAll = 0;

        for (int i = 0; i < LIST_PAGE_COUNT; i++) {
            // Only odd page numbers are requested: 1, 3, 5, ...
            String listUrl = LIST_URL_PREFIX + ((i + 1) * 2 - 1) + LIST_URL_SUFFIX;
            try {
                Page page = fetchPage(listUrl, MovieUtils.jd_cookie);
                for (String id : extractProductIds(page)) {
                    countAll = crawlProductComments(jdbcTemplate, updateTime, random, i, id, countAll);
                }
            } catch (InterruptedException e) {
                // Preserve the interrupt for the caller and stop crawling.
                Thread.currentThread().interrupt();
                logger.info("[INFO] Interrupted, stopping crawl at list_url:" + listUrl);
                break;
            } catch (Exception e) {
                logger.log(Level.WARNING, "[ERROR] list_url[" + listUrl + "] crawl error", e);
            }
            count++;
            logger.info("INFO - COMMENTS, count["+count+"] list_url:"+listUrl);
        }

        long endTime = TransferTime.dateToLong(new Date()) / 1000;
        logger.info("TAKE - COMMENTS TIME: " + (endTime - beginTime) + "s Notes: " + count);
    }

    /**
     * Inserts the given comment rows with a single multi-row INSERT statement.
     *
     * <p>Values are bound as statement parameters, so quotes or backslashes inside
     * scraped text cannot break or alter the SQL. An empty or {@code null} list is
     * a no-op.
     *
     * @param jdbcTemplate template used to run the statements
     * @param update_time  value written to the {@code update_time} column of every row
     * @param list         rows to insert; each must contain every key in {@link #ROW_KEYS}
     */
    public static void storeListData(JdbcTemplate jdbcTemplate, String update_time, List<JSONObject> list) {
        if (list == null || list.isEmpty()) {
            return;
        }
        // NOTE(review): with a pooled DataSource this may run on a different
        // connection than the INSERT below - confirm against the pool configuration.
        jdbcTemplate.update("set names utf8mb4");

        StringBuilder sql = new StringBuilder(INSERT_PREFIX);
        List<Object> args = new ArrayList<>(list.size() * (ROW_KEYS.length + 1));
        for (int j = 0; j < list.size(); j++) {
            JSONObject row = list.get(j);
            if (j > 0) {
                sql.append(',');
            }
            sql.append(ROW_PLACEHOLDERS);
            args.add(update_time);
            for (String key : ROW_KEYS) {
                args.add(row.getString(key));
            }
        }
        jdbcTemplate.update(sql.toString(), args.toArray());
    }

    /**
     * Gets the loop count: reads a number from the {@code li[class=is-active]} element
     * of the given page using {@code MovieUtils.patternDP}.
     *
     * <p>NOTE(review): this sends {@code MovieUtils.cookie} rather than
     * {@code MovieUtils.jd_cookie}, and the original comment referenced a
     * movie.douban.com URL - presumably carried over from another crawler; confirm
     * before using it for JD pages.
     *
     * @param url page to inspect
     * @return the parsed number, or 63 when the request or parsing fails
     */
    public static int AnalysisNum(String url) {
        int num = LIST_PAGE_COUNT;
        try {
            Page page = fetchPage(url, MovieUtils.cookie);
            String numStr = page.select("li[class=is-active]").text();
            Matcher matcher = MovieUtils.patternDP.matcher(numStr);
            if (matcher.find()) {
                num = Integer.parseInt(matcher.group(1));
            }
        } catch (Exception e) {
            logger.info("[ERROR] - analysisList: url["+ url +"] - Detail: " + e);
        }
        return num;
    }

    /** Issues a GET request for {@code url} with the given cookie and wraps the response in a Page. */
    private static Page fetchPage(String url, String cookie) throws Exception {
        CrawlDatum crawlDatum = new CrawlDatum(url).putMetaData("method", "GET");
        HttpRequest request = new HttpRequest(crawlDatum.getUrl());
        request.setMethod(crawlDatum.getMetaData("method"));
        String outputData = crawlDatum.getMetaData("outputData");
        if (outputData != null) {
            request.setOutputData(outputData.getBytes(StandardCharsets.UTF_8));
        }
        request.setCookie(cookie);
        request.setUserAgent(MovieUtils.agent);
        HttpResponse httpResponse = request.getResponse();
        return new Page(crawlDatum, httpResponse);
    }

    /** Collects, in document order, the SKU ids of anchors inside {@code div[class=p-operate]}. */
    private static List<String> extractProductIds(Page page) {
        List<String> ids = new ArrayList<>();
        Elements operateBlocks = page.select("div[class=p-operate]");
        for (Element block : operateBlocks) {
            for (Element anchor : block.select("a")) {
                Matcher matcher = patternID.matcher(anchor.toString());
                if (matcher.find()) {
                    ids.add(matcher.group(1));
                }
            }
        }
        return ids;
    }

    /**
     * Pages through the comments of one product and stores them, giving up after
     * {@link #S_NUM} consecutive failed or empty attempts.
     *
     * @return the updated running total of stored comments
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    private static int crawlProductComments(JdbcTemplate jdbcTemplate, String updateTime, Random random,
                                            int listPageIndex, String id, int countAll)
            throws InterruptedException {
        int countId = 0;
        int failures = 0;
        int commentPage = 0;
        boolean keepGoing = true;

        while (keepGoing) {
            try {
                // Pause whenever the running total is a multiple of 1000 (including 0).
                if (countAll % 1000 == 0) {
                    int pause = random.nextInt(3000) + 1000;
                    logger.info("[INFO] It's time to sleep["+pause+"].");
                    Thread.sleep(pause);
                }

                String cUrl = "https://sclub.jd.com/comment/productPageComments.action?productId=" + id
                        + "&score=0&sortType=3&page=" + commentPage + "&pageSize=10&isShadowSku=0";
                String json = fetchPage(cUrl, MovieUtils.jd_cookie).getHtml();

                try {
                    // Parse the payload and persist it.
                    JSONArray comments = JSONObject.fromObject(json).getJSONArray("comments");
                    if (failures < S_NUM && comments.size() == 0) {
                        // Empty page: count the attempt and retry the same page.
                        failures++;
                    } else if (failures < S_NUM && comments.size() != 0) {
                        // Normal case: convert every comment and insert the batch.
                        List<JSONObject> rows = new ArrayList<>();
                        for (int j = 0; j < comments.size(); j++) {
                            try {
                                rows.add(toRow(id, comments.getJSONObject(j)));
                                countId++;
                                countAll++;
                            } catch (Exception e) {
                                logger.info("[ERROR] c_id[" + id + "] c_url[" + cUrl + "] comments get error: " + e);
                            }
                        }
                        storeListData(jdbcTemplate, updateTime, rows);
                        logger.info("[INFO] count page["+listPageIndex+"] page_flag["+commentPage+"] productId[" + id + "] count_all["+countAll+"] count_id[" + countId + "]");
                        commentPage++;
                        failures = 0;
                    } else {
                        keepGoing = false;
                    }
                } catch (Exception e) {
                    // Unparseable payload or failed insert: skip this comment page.
                    logger.info("[ERROR] c_url[" + cUrl + "] comments get error: " + e);
                    failures++;
                    commentPage++;
                    if (failures >= S_NUM) {
                        keepGoing = false;
                    }
                }
            } catch (InterruptedException e) {
                // Never treat an interrupt as a retryable request failure.
                throw e;
            } catch (Exception e) {
                // Request failed: back off, then retry the same comment page.
                int sleep = sleepTime + random.nextInt(3000);
                logger.info("[ERROR] id[" + id + "] sleep["+sleep+"] comments get error: " + e);
                failures++;
                if (failures >= S_NUM) {
                    keepGoing = false;
                }
                Thread.sleep(sleep);
            }
        }
        return countAll;
    }

    /**
     * Converts one raw comment object into the row shape expected by
     * {@link #storeListData(JdbcTemplate, String, List)}; the month, day and hour
     * fields are split out of {@code referenceTime} on " ", "-" and ":".
     */
    private static JSONObject toRow(String productId, JSONObject comment) {
        JSONObject row = new JSONObject();
        row.put("p_id", productId);
        row.put("id", comment.getString("id"));
        row.put("content", comment.getString("content"));
        row.put("creationTime", comment.getString("creationTime"));
        String referenceTime = comment.getString("referenceTime");
        row.put("referenceTime", referenceTime);
        row.put("referenceName", comment.getString("referenceName"));
        row.put("userProvince", comment.getString("userProvince"));
        row.put("productColor", comment.getString("productColor"));
        String[] dateAndTime = referenceTime.split(" ");
        row.put("referenceMonth", dateAndTime[0].split("-")[1]);
        row.put("referenceDay", dateAndTime[0]);
        row.put("referenceHours", dateAndTime[1].split(":")[0]);
        return row;
    }
}