package com.mite8.utils.ansj_util; import com.mite8.utils.DefineOut; import com.mite8.utils.MapSort; import org.ansj.domain.Result; import org.ansj.recognition.impl.FilterRecognition; import org.ansj.splitWord.analysis.SeeDicAnalysis; import org.nlpcn.commons.lang.tire.domain.Forest; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.jdbc.core.RowMapper; import org.springframework.stereotype.Service; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.sql.ResultSet; import java.sql.SQLException; import java.util.*; import java.util.logging.Logger; /** * Author: blogchong * Time: 2016/10/31. * Email: blogchong#qq.com * 公众号:数据虫巢 ID:blogchong * Desc: 更新文档频服务 */ @Service public class UpdateDFService { @Autowired private JdbcTemplate jdbcTemplate; private static final Logger logger = Logger.getLogger(UpdateDFService.class.getName()); public void updateDFService() { String query = "SELECT title,contents,comments FROM wechat_kol_content_all"; List<String> listData = jdbcTemplate.query(query, new RowMapper<String>() { @Override public String mapRow(ResultSet resultSet, int i) throws SQLException { return resultSet.getString("title").trim() + " " + resultSet.getString("contents").trim() + " " + resultSet.getString("comments").trim(); } }); //分词相关 Forest segForest = null; Map<String, String> synonForest = null; List<String> stopForest = null; if (LoadDynamicDictionary.dicSegMap.get(DefineOut.DEFAULT_DIC) != null) { segForest = LoadDynamicDictionary.dicSegMap.get(DefineOut.DEFAULT_DIC); } else { segForest = LoadDictionary.dicSegMap.get(DefineOut.DEFAULT_DIC); } if (LoadDynamicDictionary.dicSynonMap.get(DefineOut.DEFAULT_DIC) != null) { synonForest = LoadDynamicDictionary.dicSynonMap.get(DefineOut.DEFAULT_DIC); } else { synonForest = LoadDictionary.dicSynonMap.get(DefineOut.DEFAULT_DIC); } if (LoadDynamicDictionary.dicStopMap.get(DefineOut.DEFAULT_DIC) != null) { stopForest = LoadDynamicDictionary.dicStopMap.get(DefineOut.DEFAULT_DIC); } else { stopForest = LoadDictionary.dicStopMap.get(DefineOut.DEFAULT_DIC); } FilterRecognition filter = new FilterRecognition(); filter.insertStopWords(stopForest); filter = WordNatureFilter.wordNatureFilter(filter); int countAll = 0; Map<String, Integer> mapDF = new HashMap<>(); Map<String, String> mapNature = new HashMap<>(); for (String topic: listData){ Set<String> setWord = new HashSet<>(); Result result = ResultFilter.resultFilterByRule(SeeDicAnalysis.parse(segForest, topic).recognition(filter)); result = ResultFilter.resultFilterBySynon(result, synonForest, DefineOut.DEFAULT_DIC); //遍历词汇 for (int i = 0; i < result.size(); i++) { String word = result.get(i).getName(); String nature = result.get(i).getNatureStr(); mapNature.put(word, nature); setWord.add(word); } for (String word: setWord){ if (mapDF.containsKey(word)){ mapDF.put(word, mapDF.get(word) + 1); } else { mapDF.put(word, 1); } } countAll++; } //进行df文件落地 boolean result = storeDF(MapSort.sortByValue(mapDF), mapNature, countAll); if (result) { logger.info("TASK-UPDATE DYNAMIC DF END!"); } else { logger.info("TASK-UPDATE DYNAMIC DF ERROR!"); } } //落地操作 public static boolean storeDF(Map<String, Integer> mapDF, Map<String, String> mapNature, int num){ //判断父目录是否存在,不存在则进行 boolean fileFlag = mkDirDF(); BufferedWriter bwDF = null; BufferedWriter bwNum = null; if (fileFlag) { try { bwDF = new BufferedWriter(new BufferedWriter(new OutputStreamWriter (new FileOutputStream(new File("./dic/df/df.dic")), "UTF-8"))); bwNum = new BufferedWriter(new BufferedWriter(new OutputStreamWriter (new FileOutputStream(new File("./dic/df/num.dic")), "UTF-8"))); for (String word: mapDF.keySet()){ bwDF.write(word + "\t" + mapDF.get(word) + "\t" + mapNature.get(word) + "\n"); } bwNum.write(num + "\n"); bwDF.close(); bwNum.close(); } catch (Exception e) { logger.info("DF ERROR: can not find file! " + e); } } return fileFlag; } //对于父目录进行判断 public static boolean mkDirDF(){ File file =new File("./dic/df"); File file2 =new File("./dic"); boolean flagDic = false; boolean flagDic2 = false; //如果文件夹不存在则创建 if (!file .exists() && !file .isDirectory()){ flagDic = file2 .mkdir(); flagDic2 = file .mkdir(); } else{ flagDic = true; flagDic2 = true; } return (flagDic && flagDic2); } }