package com.mite8.utils.ansj_util;

import com.mite8.utils.DefineOut;
import org.ansj.library.DATDictionary;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.Value;
import org.nlpcn.commons.lang.tire.library.Library;
import org.nlpcn.commons.lang.util.StringUtil;
import org.springframework.stereotype.Service;

import java.io.*;
import java.util.*;
import java.util.logging.Logger;

import static org.ansj.util.MyStaticValue.LIBRARYLOG;

/**
 * Author: blogchong
 * Time: 2016/10/14.
 * Email: blogchong#qq.com
 * WeChat official account: Shuju Chongchao (ID: blogchong)
 * Desc: Loads external dynamic dictionaries.
 *
 * <p>Dictionaries are read from the files listed in {@code ./config/dic.properties} and cached
 * in the public static maps below, keyed by the dictionary file's base name (the part of the
 * file name before the single {@code .}). Loading happens once when an instance is constructed
 * and again on every call to {@link #loadDynamicDictionary()}.
 *
 * <p>The caches are plain {@link HashMap}s that are mutated in place during a reload; no
 * synchronization is performed by this class.
 *
 * <p>NOTE(review): all files are decoded with the platform default charset because the readers
 * are created without an explicit charset - confirm this matches the dictionary encoding.
 */
@Service
public class LoadDynamicDictionary {

    private static final Logger logger = Logger.getLogger(LoadDynamicDictionary.class.getName());

    /** Frequency assigned to segmentation words whose line does not carry exactly three columns. */
    private static final String DEFAULT_FREQ_STR = "1000";

    /** Segmentation dictionaries, keyed by dictionary base name. */
    public static Map<String, Forest> dicSegMap = new HashMap<>();
    /** Ambiguity dictionaries, keyed by dictionary base name. */
    public static Map<String, Forest> dicAmbigMap = new HashMap<>();
    /** Stop-word lists, keyed by dictionary base name. */
    public static Map<String, List<String>> dicStopMap = new HashMap<>();
    /** Synonym maps (lower-cased synonym -> lower-cased canonical word), keyed by dictionary base name. */
    public static Map<String, Map<String, String>> dicSynonMap = new HashMap<>();
    /** Raw key/value pairs read from {@code ./config/dic.properties}. */
    public static Map<String, String> dicConfigMap = new HashMap<>();
    /** Special-word sets, keyed by dictionary base name. */
    public static Map<String, Set<String>> dicSpeMap = new HashMap<>();

    /** Document-frequency table (word -> document frequency), filled by {@link #loadDFDic()}. */
    public static Map<String, Integer> dicDfMap = new HashMap<>();
    /** Last integer read from {@code /dic/df/num.dic}; filled by {@link #loadDFDic()}. */
    public static int DF_NUM = 0;

    /** Reads the dictionary configuration and loads every configured dictionary. */
    public LoadDynamicDictionary() {
        initDicConfig();
        init();
    }

    /** Manually triggers a reload of the external dictionaries: clear, re-read config, re-load. */
    public void loadDynamicDictionary() {
        clearDic();
        initDicConfig();
        init();
        logger.info("TASK-UPDATE DYNAMIC DIC END!");
    }

    /**
     * Clears the cached dynamic dictionaries and the cached configuration.
     *
     * <p>The document-frequency data ({@link #dicDfMap}, {@link #DF_NUM}) is left untouched; it
     * is only populated by {@link #loadDFDic()}, which the reload path does not call.
     */
    public static void clearDic() {
        dicSegMap.clear();
        dicAmbigMap.clear();
        dicStopMap.clear();
        dicSynonMap.clear();
        dicConfigMap.clear();
        // Previously dicStopMap was cleared twice and dicSpeMap never, so special-dictionary
        // entries removed from the configuration survived a reload.
        dicSpeMap.clear();
    }

    /**
     * Reads {@code ./config/dic.properties} into {@link #dicConfigMap}.
     *
     * <p>Every line is echoed to stdout. Only lines that split on {@code =} into exactly two
     * parts are stored; all other lines are ignored. A missing file or a read error is reported
     * on stderr and leaves whatever was read so far in the map.
     */
    public static void initDicConfig() {
        try (BufferedReader br = openReader("./config/dic.properties")) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] notes = line.split("=");
                if (notes.length == 2) {
                    dicConfigMap.put(notes[0], notes[1]);
                }
                System.out.println("###dynamic-config: " + line);
            }
        } catch (FileNotFoundException e) {
            System.err.println("READ DYNAMIC CONFIG ERROR, NO THIS CONFIG: " + e);
        } catch (IOException e) {
            System.err.println("READ DYNAMIC CONFIG ERROR: " + e);
            e.printStackTrace();
        }
    }

    /** Loads every custom dictionary type from the paths configured in {@link #dicConfigMap}. */
    private static void init() {
        // Segmentation dictionaries
        initLoad(dicConfigMap.get(DefineOut.dic_seg), DefineOut.DIC_FLAG_SEG);
        // Ambiguity dictionaries
        initLoad(dicConfigMap.get(DefineOut.dic_ambig), DefineOut.DIC_FLAG_AMBIG);
        // Stop-word dictionaries
        initLoad(dicConfigMap.get(DefineOut.dic_stop), DefineOut.DIC_FLAG_STOP);
        // Synonym dictionaries
        initLoad(dicConfigMap.get(DefineOut.dic_synon), DefineOut.DIC_FLAG_SYNON);
        // Special dictionaries
        initLoad(dicConfigMap.get(DefineOut.dic_spe), DefineOut.DIC_FLAG_SPE);
    }

    /**
     * Reads the document-frequency resources from the classpath.
     *
     * <p>{@code /dic/df/num.dic} supplies {@link #DF_NUM} (the last parsable integer line wins).
     * {@code /dic/df/df.dic} supplies {@link #dicDfMap}: only tab-separated lines with exactly
     * three columns are used, taking column 0 as the word and column 1 as its frequency.
     *
     * <p>A missing resource, an unreadable stream, or an unparsable number is reported and
     * skipped instead of aborting the load.
     */
    public static void loadDFDic() {
        String dfFile = "/dic/df/df.dic";
        String numFile = "/dic/df/num.dic";

        InputStream inputNum = LoadDictionary.class.getResourceAsStream(numFile);
        if (inputNum == null) {
            // getResourceAsStream returns null when the resource is absent; wrapping that in a
            // reader would throw a NullPointerException.
            System.err.println("READ DF-NUM ERROR: resource not found: " + numFile);
        } else {
            try (BufferedReader brNum = new BufferedReader(new InputStreamReader(inputNum))) {
                String line;
                while ((line = brNum.readLine()) != null) {
                    String trimmed = line.trim();
                    if (trimmed.isEmpty()) {
                        // A blank (e.g. trailing) line must not abort the load.
                        continue;
                    }
                    try {
                        DF_NUM = Integer.parseInt(trimmed);
                    } catch (NumberFormatException e) {
                        logger.info("DF-NUM bad line: [" + line + "] ERROR:" + e);
                    }
                }
            } catch (IOException e) {
                System.err.println("READ DF-NUM ERROR: " + e);
                e.printStackTrace();
            }
        }

        InputStream input = LoadDictionary.class.getResourceAsStream(dfFile);
        if (input == null) {
            System.err.println("READ DF ERROR: resource not found: " + dfFile);
            return;
        }
        int count = 0;
        try (BufferedReader br = new BufferedReader(new InputStreamReader(input))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] notes = line.split("\\t");
                if (notes.length != 3) {
                    continue;
                }
                try {
                    dicDfMap.put(notes[0], Integer.parseInt(notes[1]));
                    count++;
                } catch (NumberFormatException e) {
                    logger.info("DF bad word: [" + notes[0] + "][" + notes[1] + "][" + notes[2] + "] ERROR:" + e);
                }
            }
        } catch (IOException e) {
            System.err.println("READ DF ERROR: " + e);
            e.printStackTrace();
        }
        logger.info("###[DIC-DF]###The dic of DF[" + dfFile + "] is loaded, the num of dic is: " + count);
    }

    /**
     * Shared loading routine: loads each file in a {@code ;}-separated path list into the cache
     * selected by {@code type}.
     *
     * <p>The cache key is the file's base name. Files whose name does not split on {@code .}
     * into exactly two parts are ignored. Files that cannot be opened (the loader returned
     * {@code null}) are not registered, so the caches never hold {@code null} values.
     *
     * @param typePaths {@code ;}-separated dictionary file paths; ignored when {@code null} or
     *                  shorter than two characters
     * @param type      one of the {@code DefineOut.DIC_FLAG_*} values selecting the loader
     */
    private static void initLoad(String typePaths, String type) {
        if (typePaths == null || typePaths.length() <= 1) {
            return;
        }
        for (String path : typePaths.split(";")) {
            String[] segments = path.split("/");
            if (segments.length == 0) {
                // A path made only of '/' splits into an empty array.
                continue;
            }
            String[] nameParts = segments[segments.length - 1].split("\\.");
            if (nameParts.length != 2) {
                continue;
            }
            String nature = nameParts[0];

            if (type.equals(DefineOut.DIC_FLAG_SEG)) {
                Forest forest = loadSegDic(path, nature);
                if (forest != null) {
                    dicSegMap.put(nature, forest);
                }
            } else if (type.equals(DefineOut.DIC_FLAG_AMBIG)) {
                Forest forest = loadAmbigDic(path);
                if (forest != null) {
                    dicAmbigMap.put(nature, forest);
                }
            } else if (type.equals(DefineOut.DIC_FLAG_STOP)) {
                List<String> stopWords = loadStopDic(path);
                if (stopWords != null) {
                    dicStopMap.put(nature, stopWords);
                }
            } else if (type.equals(DefineOut.DIC_FLAG_SYNON)) {
                Map<String, String> synonyms = loadSynonDic(path);
                if (synonyms != null) {
                    dicSynonMap.put(nature, synonyms);
                }
            } else if (type.equals(DefineOut.DIC_FLAG_SPE)) {
                Set<String> specials = loadSpeDic(path);
                if (specials != null) {
                    dicSpeMap.put(nature, specials);
                }
            }
        }
    }

    /**
     * Opens a dictionary file for line-by-line reading.
     *
     * @param path file-system path of the file
     * @return a buffered reader over the file, decoded with the platform default charset
     * @throws FileNotFoundException if the file cannot be opened for reading
     */
    private static BufferedReader openReader(String path) throws FileNotFoundException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(new File(path))));
    }

    /**
     * Loads a segmentation dictionary.
     *
     * <p>Blank lines are skipped. Each remaining line is trimmed and split on tabs; column 0 is
     * lower-cased and used as the word. A line with exactly three columns supplies its own
     * nature and frequency; any other line gets {@code nature} and the default frequency.
     *
     * @param path   file-system path of the dictionary
     * @param nature nature assigned to words whose line does not have exactly three columns
     * @return the populated forest (possibly partial after a read error), or {@code null} when
     *         the file cannot be opened
     */
    private static Forest loadSegDic(String path, String nature) {
        Forest forest = new Forest();
        int count = 0;
        try (BufferedReader br = openReader(path)) {
            String line;
            while ((line = br.readLine()) != null) {
                if (!StringUtil.isNotBlank(line)) {
                    continue;
                }
                String[] strs = StringUtil.trim(line).split("\t");
                strs[0] = strs[0].toLowerCase();
                // Skip the word when it already exists in the core dictionary.
                if (MyStaticValue.isSkipUserDefine && DATDictionary.getId(strs[0]) > 0) {
                    continue;
                }
                Value value;
                if (strs.length != 3) {
                    value = new Value(strs[0], nature, DEFAULT_FREQ_STR);
                } else {
                    value = new Value(strs[0], strs[1], strs[2]);
                }
                Library.insertWord(forest, value);
                count++;
            }
        } catch (FileNotFoundException e) {
            System.err.println("READ DYNAMIC SEG ERROR, NO THIS FILE. PATH[" + path + "] ERROR: " + e);
            return null;
        } catch (IOException e) {
            System.err.println("READ DIC-SEG-DYNAMIC ERROR: " + e);
            e.printStackTrace();
        }
        logger.info("###[DIC-SEG-DYNAMIC]###The dic of Seg[" + path + "] is loaded, the num of dic is: " + count);
        return forest;
    }

    /**
     * Loads an ambiguity dictionary.
     *
     * <p>Blank lines are skipped. Each remaining line is trimmed and split on tabs; the
     * even-indexed columns are concatenated to form the branch key and the full column array is
     * stored as the branch parameters. Lines with an odd number of columns are reported as
     * malformed and skipped.
     *
     * @param path file-system path of the dictionary
     * @return the populated forest (possibly partial after a read error), or {@code null} when
     *         the file cannot be opened
     */
    private static Forest loadAmbigDic(String path) {
        Forest forest = new Forest();
        int count = 0;
        try (BufferedReader br = openReader(path)) {
            String line;
            while ((line = br.readLine()) != null) {
                if (!StringUtil.isNotBlank(line)) {
                    continue;
                }
                String trimmed = StringUtil.trim(line);
                String[] split = trimmed.split("\t");
                if (split.length % 2 != 0) {
                    LIBRARYLOG.error("init dynamic ambiguity error in line :" + trimmed + " format err !");
                    // Do not register a line that was just reported as malformed.
                    continue;
                }
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < split.length; i += 2) {
                    sb.append(split[i]);
                }
                forest.addBranch(sb.toString(), split);
                count++;
            }
        } catch (FileNotFoundException e) {
            System.err.println("READ DYNAMIC AMBIG ERROR, NO THIS FILE. PATH[" + path + "] ERROR: " + e);
            return null;
        } catch (IOException e) {
            System.err.println("READ DIC-AMBIG-DYNAMIC ERROR: " + e);
            e.printStackTrace();
        }
        logger.info("###[DIC-AMBIG-DYNAMIC]###The dic of Ambig[" + path + "] is loaded, the num of dic is: " + count);
        return forest;
    }

    /**
     * Loads a stop-word dictionary; every line of the file (including empty ones) becomes one
     * list entry, unmodified.
     *
     * @param path file-system path of the dictionary
     * @return the stop words in file order (possibly partial after a read error), or
     *         {@code null} when the file cannot be opened
     */
    private static List<String> loadStopDic(String path) {
        List<String> list = new ArrayList<String>();
        int count = 0;
        try (BufferedReader br = openReader(path)) {
            String line;
            while ((line = br.readLine()) != null) {
                list.add(line);
                count++;
            }
        } catch (FileNotFoundException e) {
            System.err.println("READ DYNAMIC STOP ERROR, NO THIS FILE. PATH[" + path + "] ERROR: " + e);
            return null;
        } catch (IOException e) {
            System.err.println("READ DIC-STOP-DYNAMIC ERROR: " + e);
            e.printStackTrace();
        }
        logger.info("###[DIC-STOP-DYNAMIC]###The dic of Stop[" + path + "] is loaded, the num of dic is: " + count);
        return list;
    }

    /**
     * Loads a synonym dictionary.
     *
     * <p>Each line is split on tabs; lines with fewer than two columns are ignored. Column 0 is
     * the canonical word and every further column is mapped to it. Both sides are lower-cased.
     *
     * @param path file-system path of the dictionary
     * @return map from synonym to canonical word (possibly partial after a read error), or
     *         {@code null} when the file cannot be opened
     */
    private static Map<String, String> loadSynonDic(String path) {
        Map<String, String> map = new HashMap<String, String>();
        int count = 0;
        try (BufferedReader br = openReader(path)) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] words = line.split("\\t");
                if (words.length < 2) {
                    continue;
                }
                String canonical = words[0].toLowerCase();
                for (int i = 1; i < words.length; i++) {
                    map.put(words[i].toLowerCase(), canonical);
                }
                count++;
            }
        } catch (FileNotFoundException e) {
            System.err.println("READ DYNAMIC SYNON ERROR, NO THIS FILE. PATH[" + path + "] ERROR: " + e);
            return null;
        } catch (IOException e) {
            System.err.println("READ DIC-SYNON-DYNAMIC ERROR: " + e);
            e.printStackTrace();
        }
        logger.info("###[DIC-SYNON-DYNAMIC]###The dic of Synon[" + path + "] is loaded, the num of dic is: " + count);
        return map;
    }

    /**
     * Loads a special-word dictionary; every non-empty line becomes one set entry, unmodified.
     *
     * @param path file-system path of the dictionary
     * @return the set of words (possibly partial after a read error), or {@code null} when the
     *         file cannot be opened
     */
    private static Set<String> loadSpeDic(String path) {
        Set<String> set = new HashSet<>();
        int count = 0;
        try (BufferedReader br = openReader(path)) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.length() != 0) {
                    set.add(line);
                    count++;
                }
            }
        } catch (FileNotFoundException e) {
            System.err.println("READ DYNAMIC SPE ERROR, NO THIS FILE. PATH[" + path + "] ERROR: " + e);
            return null;
        } catch (IOException e) {
            System.err.println("READ DIC-SPE-DYNAMIC ERROR: " + e);
            e.printStackTrace();
        }
        logger.info("###[DIC-SPE-DYNAMIC]###The dic of Spe[" + path + "] is loaded, the num of dic is: " + count);
        return set;
    }
}