package com.mite8.utils.off_line_util;
import com.mite8.utils.DefineOut;
import com.mite8.utils.MapSort;
import com.mite8.utils.ansj_util.WordNatureFilter;
import org.ansj.domain.Result;
import org.ansj.recognition.impl.FilterRecognition;
import org.ansj.splitWord.analysis.DicAnalysis;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.*;
import java.util.regex.Pattern;
/**
 * Author: blogchong
 * Time: 2016/10/9.
 * Email: blogchong#qq.com
 * WeChat official account: 数据虫巢 (ID: blogchong)
 * Desc: See NLP - extracts keywords for each topic using a TF-IDF model.
 *
 * <p>Usage example:
 * <pre>
 * String srcPath = "see/topic_content.txt";
 * String idfPath = "see/see_idf.log";
 * String outPath = "C:\\Data\\桌面空间\\工作文档\\分词\\实例\\see_tf_idf.log";
 * SeeTopicTFIDF seeNlp = new SeeTopicTFIDF();
 * seeNlp.getTFIDF(srcPath, outPath, idfPath);
 * </pre>
 */
public class SeeTopicTFIDF {

    /** Maximum number of keywords kept per document (matches the "_20" output file names). */
    private static final int TOP_KEYWORDS = 20;

    /**
     * Tokens matching this pattern are dropped before counting.
     * NOTE(review): the '.' is not escaped, so this rejects every token that merely
     * starts with (optionally negative) digits, not only integers/decimals. The pattern
     * is kept unchanged because the IDF table may have been built with the same filter
     * - confirm before tightening it to "-?[0-9]+(\\.[0-9]+)?".
     */
    private static final Pattern NUMERIC_TOKEN = Pattern.compile("-?[0-9]+.*[0-9]*");

    /** Classpath location of the stop-word dictionary. */
    private static final String STOP_WORD_DIC = "library/stop_words.dic";

    /** Default output file for the accumulated TF-IDF score of every kept keyword. */
    private static final String DEFAULT_GLOBAL_SCORE_PATH =
            "C:\\Data\\桌面空间\\工作文档\\分词\\实例\\global_score_words_20.log";

    /** Default output file for the number of documents in which each keyword was kept. */
    private static final String DEFAULT_GLOBAL_NUM_PATH =
            "C:\\Data\\桌面空间\\工作文档\\分词\\实例\\global_num_words_20.log";

    /** Text written in place of the keyword list when a document yields no keyword at all. */
    private static final String NO_KEYWORDS = "null";

    /**
     * Computes the TF-IDF keywords of every document and writes the global statistics
     * to the default score/count files.
     *
     * @param srcPath classpath-relative path of the tab-separated document file
     * @param outPath file receiving one "id TAB keywords" line per document
     * @param idfPath path of the document-frequency table, passed to
     *                {@code LoadStopWordDic.loadIdfFile}
     * @throws Exception if a dictionary cannot be loaded or the source resource is missing
     */
    public void getTFIDF(String srcPath, String outPath, String idfPath) throws Exception {
        getTFIDF(srcPath, outPath, idfPath, DEFAULT_GLOBAL_SCORE_PATH, DEFAULT_GLOBAL_NUM_PATH);
    }

    /**
     * Computes the TF-IDF keywords of every document in {@code srcPath}.
     *
     * <p>Each input line must have exactly four tab-separated fields; field 0 is the id,
     * field 1 the title and field 3 the body. For lines with any other field count only
     * the first field is printed to standard output and the line is skipped.
     *
     * @param srcPath      classpath-relative path of the document file (read as UTF-8)
     * @param outPath      file receiving one "id TAB keywords" line per document
     * @param idfPath      path of the document-frequency table
     * @param allScorePath file receiving "keyword TAB summed score" lines
     * @param allNumPath   file receiving "keyword TAB document count" lines
     * @throws FileNotFoundException if {@code srcPath} is not on the classpath
     * @throws Exception             if a dictionary cannot be loaded
     */
    public void getTFIDF(String srcPath, String outPath, String idfPath,
                         String allScorePath, String allNumPath) throws Exception {
        // Stop-word filter, extended by the project's word-nature filter.
        FilterRecognition filter = new FilterRecognition();
        LoadStopWordDic dicLoader = new LoadStopWordDic();
        List<String> stopWords = dicLoader.loadStopWordDic(STOP_WORD_DIC);
        filter.insertStopWords(stopWords);
        filter = WordNatureFilter.wordNatureFilter(filter);

        Map<String, Integer> idfMap = dicLoader.loadIdfFile(idfPath);

        // Stream the resource instead of converting its URL to a File: URL.getFile()
        // stays percent-encoded, which breaks for paths with spaces or non-ASCII names.
        InputStream source = getClass().getClassLoader().getResourceAsStream(srcPath);
        if (source == null) {
            throw new FileNotFoundException("classpath resource not found: " + srcPath);
        }

        // Four-decimal rounding; created once and pinned to '.' as the decimal separator
        // so that Double.parseDouble can read the formatted text in every default locale.
        DecimalFormat fourDecimals =
                new DecimalFormat("#.0000", DecimalFormatSymbols.getInstance(Locale.ROOT));

        List<String> outputLines = new ArrayList<String>();
        Map<String, Double> globalScores = new HashMap<String, Double>();
        Map<String, Integer> globalCounts = new HashMap<String, Integer>();
        int documentCount = 0;

        // NOTE(review): the input is assumed to be UTF-8, the encoding this class uses
        // for everything it writes - confirm against the resource file.
        try (InputStream in = source;
             Scanner scanner = new Scanner(in, StandardCharsets.UTF_8.name())) {
            while (scanner.hasNextLine()) {
                String[] fields = scanner.nextLine().split("\t");
                if (fields.length != 4) {
                    System.out.println(fields[0]);
                    continue;
                }
                documentCount++;
                Result tokens = DicAnalysis.parse(fields[1] + "," + fields[3]).recognition(filter);
                Map<String, Integer> termCounts = countTerms(tokens);
                Map<String, Double> ranked =
                        MapSort.sortByValue2(scoreTerms(termCounts, idfMap, fourDecimals));
                String keywords = collectTopKeywords(ranked, globalScores, globalCounts);
                outputLines.add(fields[0] + "\t" + keywords);
            }
            System.out.println("The num of topic is : " + documentCount);
            // Scanner hides read failures; surface them so a truncated run is visible.
            IOException readFailure = scanner.ioException();
            if (readFailure != null) {
                throw readFailure;
            }
        } catch (IOException e) {
            // Best effort: report the problem and still write what was collected.
            e.printStackTrace();
        }

        writeFile(outputLines, outPath);
        writeFile(MapSort.sortByValue2(globalScores), allScorePath);
        writeFile2(MapSort.sortByValue(globalCounts), allNumPath);
    }

    /** Counts the tokens that have at least two characters and do not match NUMERIC_TOKEN. */
    private static Map<String, Integer> countTerms(Result tokens) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (int i = 0; i < tokens.size(); i++) {
            String word = tokens.get(i).getName().trim();
            if (word.length() < 2 || NUMERIC_TOKEN.matcher(word).matches()) {
                continue;
            }
            Integer seen = counts.get(word);
            counts.put(word, seen == null ? 1 : seen + 1);
        }
        return counts;
    }

    /**
     * Turns per-document term counts into TF-IDF scores rounded to four decimals.
     * tf = count / total counted tokens; idf = ln(DefineOut.TopicNum / (df + 1)),
     * where df is taken from {@code idfMap} and defaults to 1 for unknown terms.
     */
    private static Map<String, Double> scoreTerms(Map<String, Integer> termCounts,
                                                  Map<String, Integer> idfMap,
                                                  DecimalFormat rounding) {
        int totalTerms = 0;
        for (int occurrences : termCounts.values()) {
            totalTerms += occurrences;
        }
        Map<String, Double> scores = new HashMap<String, Double>();
        for (Map.Entry<String, Integer> entry : termCounts.entrySet()) {
            double tf = (double) entry.getValue() / (double) totalTerms;
            Integer known = idfMap.get(entry.getKey());
            int docFrequency = known != null ? known : 1;
            double idf = Math.log((double) DefineOut.TopicNum / ((double) docFrequency + 1));
            scores.put(entry.getKey(), Double.parseDouble(rounding.format(tf * idf)));
        }
        return scores;
    }

    /**
     * Joins the first TOP_KEYWORDS entries of {@code ranked} as "word[score] | word[score]"
     * and adds each of them to the global score and count maps.
     */
    private static String collectTopKeywords(Map<String, Double> ranked,
                                             Map<String, Double> globalScores,
                                             Map<String, Integer> globalCounts) {
        StringBuilder joined = new StringBuilder();
        int kept = 0;
        for (Map.Entry<String, Double> entry : ranked.entrySet()) {
            if (kept >= TOP_KEYWORDS) {
                break;
            }
            String word = entry.getKey();
            Double score = entry.getValue();
            if (kept > 0) {
                joined.append(" | ");
            }
            joined.append(word).append("[").append(score).append("]");
            kept++;

            Double scoreSoFar = globalScores.get(word);
            globalScores.put(word, scoreSoFar == null ? score : Double.valueOf(scoreSoFar + score));
            Integer countSoFar = globalCounts.get(word);
            globalCounts.put(word, countSoFar == null ? 1 : countSoFar + 1);
        }
        // Keeps the text earlier versions produced for documents without any keyword.
        return kept == 0 ? NO_KEYWORDS : joined.toString();
    }

    /**
     * Writes every entry of {@code map} as a "key TAB value" line to {@code path} (UTF-8).
     * Failures are reported on standard error and otherwise ignored.
     *
     * @param map  entries to write, in the map's iteration order
     * @param path destination file; overwritten if it exists
     */
    public static void writeFile(Map<String, Double> map, String path) {
        writeEntries(map, path);
    }

    /**
     * Integer-valued counterpart of {@link #writeFile(Map, String)}; it needs its own
     * name because both signatures have the same erasure.
     *
     * @param map  entries to write, in the map's iteration order
     * @param path destination file; overwritten if it exists
     */
    public static void writeFile2(Map<String, Integer> map, String path) {
        writeEntries(map, path);
    }

    /**
     * Writes every element of {@code list} as one line to {@code path} (UTF-8).
     * Failures are reported on standard error and otherwise ignored.
     *
     * @param list lines to write
     * @param path destination file; overwritten if it exists
     */
    public static void writeFile(List<String> list, String path) {
        // try-with-resources closes the writer even when a write fails midway.
        try (BufferedWriter writer = openUtf8Writer(path)) {
            for (String line : list) {
                writer.write(line);
                writer.newLine();
            }
        } catch (Exception e) {
            System.err.println("write errors :" + e);
        }
    }

    /** Shared implementation of the two map writers. */
    private static void writeEntries(Map<String, ? extends Number> map, String path) {
        try (BufferedWriter writer = openUtf8Writer(path)) {
            for (Map.Entry<String, ? extends Number> entry : map.entrySet()) {
                writer.write(entry.getKey() + "\t" + entry.getValue());
                writer.newLine();
            }
        } catch (Exception e) {
            System.err.println("write errors :" + e);
        }
    }

    /** Opens {@code path} for writing UTF-8 text, truncating an existing file. */
    private static BufferedWriter openUtf8Writer(String path) throws IOException {
        return new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(path)), StandardCharsets.UTF_8));
    }
}