package com.mite8.utils.ansj_util;
import com.mite8.utils.DefineOut;
import org.ansj.library.DATDictionary;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.Value;
import org.nlpcn.commons.lang.tire.library.Library;
import org.nlpcn.commons.lang.util.StringUtil;
import org.springframework.stereotype.Service;
import java.io.*;
import java.util.*;
import java.util.logging.Logger;
import static org.ansj.util.MyStaticValue.LIBRARYLOG;
/**
* Author: blogchong
* Time: 2016/10/14.
* Email: blogchong#qq.com
* 公众号:数据虫巢 ID:blogchong
* Desc: 字典的自动加载。
*/
@Service
public class LoadDictionary {
private static final Logger logger = Logger.getLogger(LoadDictionary.class.getName());
private static String DEFAULT_FREQ_STR = "1000";
public static Map<String, Forest> dicSegMap = new HashMap<>();
public static Map<String, Forest> dicAmbigMap = new HashMap<>();
public static Map<String, List<String>> dicStopMap = new HashMap<>();
public static Map<String, Map<String, String>> dicSynonMap = new HashMap<>();
public static Map<String, String> dicConfigMap = new HashMap<>();
public static Map<String, Set<String>> dicSpeMap = new HashMap<>();
//文档频文件,存储文档频
public static Map<String, Integer> dicDfMap = new HashMap<>();
public static int DF_NUM = 0;
public LoadDictionary() throws Exception{
initDicConfig();
init();
}
//进行字典配置读取
public static void initDicConfig() throws Exception{
InputStream input = LoadDictionary.class.getResourceAsStream("/dic.properties");
BufferedReader br = new BufferedReader(new InputStreamReader(input));
String temp = null;
try {
while ((temp = br.readLine()) != null) {
String[] notes = temp.split("=");
if (notes.length == 2) {
dicConfigMap.put(notes[0], notes[1]);
}
System.out.println("###config: " + temp);
}
br.close();
} catch (IOException e) {
System.err.println("READ CONFIG ERROR: " + e);
e.printStackTrace();
}
}
//初始化各个自定义字典
private static void init() throws Exception{
//初始化分词字典
initLoad(dicConfigMap.get(DefineOut.dic_seg), DefineOut.DIC_FLAG_SEG);
//初始化歧义字典
initLoad(dicConfigMap.get(DefineOut.dic_ambig), DefineOut.DIC_FLAG_AMBIG);
//初始化停用词字典
initLoad(dicConfigMap.get(DefineOut.dic_stop), DefineOut.DIC_FLAG_STOP);
//初始化同义词字典
initLoad(dicConfigMap.get(DefineOut.dic_synon), DefineOut.DIC_FLAG_SYNON);
//初始化特俗字典
initLoad(dicConfigMap.get(DefineOut.dic_spe), DefineOut.DIC_FLAG_SPE);
//初始化DF字典
loadDFDic();
}
//读取文档频相关的文件
public static void loadDFDic(){
String dfFile = "/dic/df/df.dic";
String numFile = "/dic/df/num.dic";
InputStream inputNum = LoadDictionary.class.getResourceAsStream(numFile);
BufferedReader brNum = new BufferedReader(new InputStreamReader(inputNum));
String temp = null;
try {
while ((temp = brNum.readLine()) != null) {
//获取到resource中的每行
DF_NUM = Integer.parseInt(temp);
}
brNum.close();
} catch (IOException e) {
System.err.println("READ DF-NUM ERROR: " + e);
e.printStackTrace();
}
InputStream input = LoadDictionary.class.getResourceAsStream(dfFile);
BufferedReader br = new BufferedReader(new InputStreamReader(input));
int count = 0;
try {
while ((temp = br.readLine()) != null) {
//获取到resource中的每行
String[] notes = temp.split("\\t");
if(notes.length == 3) {
try {
String word = notes[0];
int df = Integer.parseInt(notes[1]);
dicDfMap.put(word, df);
count++;
} catch (Exception e) {
logger.info("DF bad word: [" + notes[0] + "][" + notes[1] + "][" + notes[2] + "] ERROR:" + e);
}
}
}
br.close();
} catch (IOException e) {
System.err.println("READ DF ERROR: " + e);
e.printStackTrace();
}
}
//初始化公共函数
private static void initLoad(String typePaths, String type) throws Exception{
if (typePaths != null && typePaths.length() > 1) {
String[] paths = typePaths.split(";");
for (String path: paths) {
String[] pathTmp = path.split("/");
String natureTmp = pathTmp[pathTmp.length - 1];
String[] natureTmp2 = natureTmp.split("\\.");
if (natureTmp2.length == 2) {
String nature = natureTmp2[0];
if (type.equals(DefineOut.DIC_FLAG_SEG)) {
dicSegMap.put(nature, loadSegDic(path, nature));
} else if (type.equals(DefineOut.DIC_FLAG_AMBIG)) {
dicAmbigMap.put(nature, loadAmbigDic(path));
} else if (type.equals(DefineOut.DIC_FLAG_STOP)) {
dicStopMap.put(nature, loadStopDic(path));
} else if (type.equals(DefineOut.DIC_FLAG_SYNON)) {
dicSynonMap.put(nature, loadSynonDic(path));
} else if (type.equals(DefineOut.DIC_FLAG_SPE)) {
dicSpeMap.put(nature, loadSpeDic(path));
}
}
}
}
}
//加载分词字典
private static Forest loadSegDic(String path, String nature) throws Exception{
Forest forest = new Forest();
InputStream input = LoadDictionary.class.getResourceAsStream(path);
BufferedReader br = new BufferedReader(new InputStreamReader(input));
String temp = null;
String[] strs;
Value value;
int count = 0;
try {
while ((temp = br.readLine()) != null) {
//获取到resource中的每行
if (StringUtil.isNotBlank(temp)) {
temp = StringUtil.trim(temp);
strs = temp.split("\t");
strs[0] = strs[0].toLowerCase();
// 如何核心辞典存在那么就放弃
if (MyStaticValue.isSkipUserDefine && DATDictionary.getId(strs[0]) > 0) {
continue;
}
if (strs.length != 3) {
value = new Value(strs[0], nature, DEFAULT_FREQ_STR);
} else {
value = new Value(strs[0], strs[1], strs[2]);
}
Library.insertWord(forest, value);
count++;
}
}
br.close();
} catch (IOException e) {
System.err.println("READ DIC-SEG ERROR: " + e);
e.printStackTrace();
}
logger.info("###[DIC-SEG]###The dic of Seg[" + path + "] is loaded, the num of dic is: " + count);
return forest;
}
//加歧义词字典
private static Forest loadAmbigDic(String path) throws Exception{
Forest forest = new Forest();
InputStream input = LoadDictionary.class.getResourceAsStream(path);
BufferedReader br = new BufferedReader(new InputStreamReader(input));
String temp = null;
int count = 0;
try {
while ((temp = br.readLine()) != null) {
if (StringUtil.isNotBlank(temp)) {
temp = StringUtil.trim(temp);
String[] split = temp.split("\t");
StringBuilder sb = new StringBuilder();
if (split.length % 2 != 0) {
LIBRARYLOG.error("init ambiguity error in line :" + temp + " format err !");
}
for (int i = 0; i < split.length; i += 2) {
sb.append(split[i]);
}
forest.addBranch(sb.toString(), split);
count++;
}
}
br.close();
} catch (IOException e) {
System.err.println("READ DIC-AMBIG ERROR: " + e);
e.printStackTrace();
}
logger.info("###[DIC-AMBIG]###The dic of Ambig[" + path + "] is loaded, the num of dic is: " + count);
return forest;
}
//加载停用词字典
private static List<String> loadStopDic(String path) throws Exception{
List<String> list = new ArrayList<String>();
InputStream input = LoadDictionary.class.getResourceAsStream(path);
BufferedReader br = new BufferedReader(new InputStreamReader(input));
String temp = null;
int count = 0;
try {
while ((temp = br.readLine()) != null) {
list.add(temp);
count++;
}
br.close();
} catch (IOException e) {
System.err.println("READ DIC-STOP ERROR: " + e);
e.printStackTrace();
}
logger.info("###[DIC-STOP]###The dic of Stop[" + path + "] is loaded, the num of dic is: " + count);
return list;
}
//加载同义词字典
private static Map<String, String> loadSynonDic(String path) throws Exception{
Map<String, String> map = new HashMap<String, String>();
InputStream input = LoadDictionary.class.getResourceAsStream(path);
BufferedReader br = new BufferedReader(new InputStreamReader(input));
String temp = null;
int count = 0;
try {
while ((temp = br.readLine()) != null) {
String[] words = temp.split("\\t");
if (words.length >= 2) {
for (int i = 1; i < words.length; i++) {
map.put(words[i].toLowerCase(), words[0].toLowerCase());
}
count++;
}
}
br.close();
} catch (IOException e) {
System.err.println("READ DIC-SYNON ERROR: " + e);
e.printStackTrace();
}
logger.info("###[DIC-SYNON]###The dic of Synon[" + path + "] is loaded, the num of dic is: " + count);
return map;
}
//加载特俗字典
private static Set<String> loadSpeDic(String path) throws Exception{
Set<String> set = new HashSet<>();
InputStream input = LoadDictionary.class.getResourceAsStream(path);
BufferedReader br = new BufferedReader(new InputStreamReader(input));
String temp = null;
int count = 0;
try {
while ((temp = br.readLine()) != null) {
if (temp.length() != 0) {
set.add(temp);
count++;
}
}
br.close();
} catch (IOException e) {
System.err.println("READ DIC-SPE ERROR: " + e);
e.printStackTrace();
}
logger.info("###[DIC-SPE]###The dic of Spe[" + path + "] is loaded, the num of dic is: " + count);
return set;
}
}