package com.mite8.utils.off_line_util;

import com.mite8.utils.MapSort;
import com.mite8.utils.ansj_util.WordNatureFilter;
import org.ansj.domain.Result;
import org.ansj.recognition.impl.FilterRecognition;
import org.ansj.splitWord.analysis.DicAnalysis;

import java.io.*;
import java.util.*;

/**
 * Author: blogchong
 * Time: 2016/10/9.
 * Email: blogchong#qq.com
 * WeChat official account: "Data Bug Nest" (ID: blogchong)
 * Desc: See NLP - builds the document-frequency table used as the IDF dictionary for topics.
 *
 * <p>Example usage:
 * <pre>
 *   SeeTopicIDF seeNlp = new SeeTopicIDF();
 *   seeNlp.getReIDF("see/topic_content.txt", "/path/to/see_idf.log");
 * </pre>
 */
public class SeeTopicIDF {

    /** Classpath location of the stop-word dictionary handed to {@code LoadStopWordDic}. */
    private static final String STOP_WORD_DIC = "library/stop_words.dic";

    /** Charset used for both the input corpus and the output file. */
    private static final String ENCODING = "UTF-8";

    /** Number of tab-separated columns a valid input line must have. */
    private static final int EXPECTED_COLUMNS = 4;

    /** Only terms whose nature string equals this value are counted. */
    private static final String USER_DEFINE_NATURE = "userDefine";

    /**
     * Counts, for every term, the number of documents that contain it, then writes the
     * table to {@code outPath}.
     *
     * <p>Each input line must consist of exactly four tab-separated columns; column 1 is used
     * as the title and column 3 as the body. Lines with any other column count are skipped and
     * their first column is printed to standard output. Note that {@code String.split} drops
     * trailing empty columns, so a line whose last column is empty is skipped as well.
     *
     * @param srcPath classpath-relative resource name of the input corpus
     * @param outPath file system path of the output file
     * @throws FileNotFoundException if {@code srcPath} cannot be found on the classpath
     * @throws IOException           if reading the corpus fails part-way through
     * @throws Exception             anything raised by the stop-word loader or the analyzer
     */
    public void getReIDF(String srcPath, String outPath) throws Exception {
        // Stop-word filter, further configured by the project's word-nature filter.
        FilterRecognition filter = new FilterRecognition();
        LoadStopWordDic loadStopWordDic = new LoadStopWordDic();
        List<String> stopWords = loadStopWordDic.loadStopWordDic(STOP_WORD_DIC);
        filter.insertStopWords(stopWords);
        filter = WordNatureFilter.wordNatureFilter(filter);

        // Read the resource as a stream instead of converting its URL to a File: URL paths are
        // percent-encoded (breaks on spaces / non-ASCII directories) and a File cannot address
        // a resource that lives inside a JAR.
        InputStream source = getClass().getClassLoader().getResourceAsStream(srcPath);
        if (source == null) {
            throw new FileNotFoundException("Resource not found on classpath: " + srcPath);
        }

        int count = 0;
        Map<String, Integer> map = new HashMap<>();

        // Decode explicitly as UTF-8; the platform default charset would garble the text on
        // systems whose default is not UTF-8.
        try (InputStream in = source; Scanner scanner = new Scanner(in, ENCODING)) {
            while (scanner.hasNextLine()) {
                String[] notes = scanner.nextLine().split("\t");
                if (notes.length != EXPECTED_COLUMNS) {
                    // Malformed line: report its first column and move on.
                    System.out.println(notes[0]);
                    continue;
                }

                count++;
                String title = notes[1];
                String body = notes[3];

                // A term is counted once per document, however often it occurs in it.
                for (String term : extractUserDefinedTerms(title + "," + body, filter)) {
                    Integer seen = map.get(term);
                    map.put(term, seen == null ? 1 : seen + 1);
                }
            }

            // Scanner swallows read errors and simply reports "no more lines"; surface them so
            // that a truncated read does not silently produce partial counts.
            IOException readError = scanner.ioException();
            if (readError != null) {
                throw readError;
            }
            System.out.println("The num of topic is : " + count);
        }

        // Order the table by value (exact ordering is defined by MapSort.sortByValue).
        map = MapSort.sortByValue(map);
        writeFile(map, outPath);
    }

    /**
     * Segments {@code text} and returns the distinct, trimmed terms whose nature is
     * {@code "userDefine"}.
     *
     * <p>An earlier revision instead kept every term of length >= 2 that was not a number;
     * that rule was replaced by the nature check below.
     */
    private static Set<String> extractUserDefinedTerms(String text, FilterRecognition filter) {
        Result result = DicAnalysis.parse(text).recognition(filter);
        Set<String> terms = new HashSet<>();
        for (int i = 0; i < result.size(); i++) {
            // Constant-first comparison so a term without a nature string is skipped
            // rather than raising a NullPointerException.
            if (USER_DEFINE_NATURE.equals(result.get(i).getNatureStr())) {
                terms.add(result.get(i).getName().trim());
            }
        }
        return terms;
    }

    /**
     * Writes the map to {@code path} as UTF-8 text, one {@code key<TAB>value} pair per line,
     * in the map's iteration order.
     *
     * <p>Best effort: any failure is reported on standard error and not propagated.
     *
     * @param map  entries to write
     * @param path destination file; an existing file is overwritten
     */
    public static void writeFile(Map<String, Integer> map, String path) {
        // try-with-resources closes (and flushes) the writer even when a write fails.
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(path)), ENCODING))) {
            for (Map.Entry<String, Integer> entry : map.entrySet()) {
                bw.write(entry.getKey() + "\t" + entry.getValue());
                bw.newLine();
            }
        } catch (Exception e) {
            // Deliberately broad: callers rely on this method never throwing.
            System.err.println("write errors :" + e);
        }
    }
}