LoadDictionary.java example

Explorer

mite8-com-master
- mite-bigdata-jsp
  - src
    - main
      - java
        com
        mite8
        Application.java
        Insight
        controller
        JDController.java
        MovieController.java
        jd_wumai
        OptJDcomments.java
        movie_great_wall
        AnalysisList.java
        AnalysisListShort.java
        CheckAndStore.java
        MovieService.java
        MovieTagOffLineService.java
        MovieUtils.java
        OptMovie.java
        config
        MyWebAppConfigurer.java
        controller
        SegController.java
        mite
        json
        MiteGovJxGzDnJsonController.java
        MiteInsightJsonController.java
        MiteOtherJsonController.java
        jsp
        MiteGovJxGzDnJspController.java
        MiteInsightController.java
        MiteOtherController.java
        entity
        DataTopicEntity.java
        GovEmotionEntity.java
        GovFinanceEntity.java
        jx
        gz
        dn
        controller
        HireController.java
        PoliticsController.java
        PraiseController.java
        ResumeController.java
        entity
        EmotionEntity.java
        service
        ask_politics
        AnalysisPoliticsService.java
        AskPoliticsService.java
        utils
        AnalysisDetail.java
        AnalysisList.java
        CheckAndStore.java
        OptPolitics.java
        hire
        HireService.java
        utils
        AnalysisDetail.java
        AnalysisList.java
        CheckAndStore.java
        OptHire.java
        public_praise
        PublicPraiseService.java
        utils
        AnalysisList.java
        CheckAndStore.java
        OptPraise.java
        resume
        ResumeService.java
        utils
        AnalysisDetail.java
        AnalysisList.java
        CheckAndStore.java
        OptResume.java
        utils
        DefineDn.java
        LoadEmotionDictionary.java
        service
        BigdataService.java
        DataTopicService.java
        GovJxGzDnService.java
        JDCommentsService.java
        OtherService.java
        SegService.java
        TypeService.java
        utils
        CleanStr.java
        CollectionsSort.java
        CutDoubleValue.java
        DefineOut.java
        GetAddrHostUtils.java
        MapSort.java
        PageUtils.java
        TransferTime.java
        ansj_util
        AnsjUtils.java
        LoadDictionary.java
        LoadDynamicDictionary.java
        ResultFilter.java
        SegBrandSpeOpt.java
        UpdateDFService.java
        WordNatureFilter.java
        mite_restful
        MiteGovUtils.java
        off_line_util
        JudgeWordDic.java
        LoadStopWordDic.java
        MergeWordDic.java
        SeeTopicIDF.java
        SeeTopicTFIDF.java
        UserMapForest.java
        wechat
        wpweixin_com
        WechatController.java
        WpweixinService.java
        org
        ansj
        splitWord
        analysis
        SeeDicAnalysis.java

package com.mite8.utils.ansj_util;

import com.mite8.utils.DefineOut;
import org.ansj.library.DATDictionary;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.Value;
import org.nlpcn.commons.lang.tire.library.Library;
import org.nlpcn.commons.lang.util.StringUtil;
import org.springframework.stereotype.Service;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.logging.Logger;
import static org.ansj.util.MyStaticValue.LIBRARYLOG;

/**
 * Author: blogchong
 * Time:  2016/10/14.
 * Email: blogchong#qq.com
 * 公众号：数据虫巢 ID:blogchong
 * Desc:  字典的自动加载。
 */

@Service
public class LoadDictionary {

    private static final Logger logger = Logger.getLogger(LoadDictionary.class.getName());

    private static String DEFAULT_FREQ_STR = "1000";

    public static Map<String, Forest> dicSegMap = new HashMap<>();
    public static Map<String, Forest> dicAmbigMap = new HashMap<>();
    public static Map<String, List<String>> dicStopMap = new HashMap<>();
    public static Map<String, Map<String, String>> dicSynonMap = new HashMap<>();
    public static Map<String, String> dicConfigMap = new HashMap<>();
    public static Map<String, Set<String>> dicSpeMap = new HashMap<>();

    //文档频文件,存储文档频
    public static Map<String, Integer> dicDfMap = new HashMap<>();
    public static int DF_NUM = 0;

    public LoadDictionary() throws Exception{
        initDicConfig();
        init();
    }

    //进行字典配置读取
    public static void initDicConfig() throws Exception{

        InputStream  input = LoadDictionary.class.getResourceAsStream("/dic.properties");
        BufferedReader br = new BufferedReader(new InputStreamReader(input));
        String temp = null;
        try {
            while ((temp = br.readLine()) != null) {
                String[] notes = temp.split("=");
                if (notes.length == 2) {
                    dicConfigMap.put(notes[0], notes[1]);
                }
                System.out.println("###config: " + temp);
            }
            br.close();
        } catch (IOException e) {
            System.err.println("READ CONFIG ERROR: " + e);
            e.printStackTrace();
        }

    }

    //初始化各个自定义字典
    private static void init() throws Exception{

        //初始化分词字典
        initLoad(dicConfigMap.get(DefineOut.dic_seg), DefineOut.DIC_FLAG_SEG);
        //初始化歧义字典
        initLoad(dicConfigMap.get(DefineOut.dic_ambig), DefineOut.DIC_FLAG_AMBIG);
        //初始化停用词字典
        initLoad(dicConfigMap.get(DefineOut.dic_stop), DefineOut.DIC_FLAG_STOP);
        //初始化同义词字典
        initLoad(dicConfigMap.get(DefineOut.dic_synon), DefineOut.DIC_FLAG_SYNON);
        //初始化特俗字典
        initLoad(dicConfigMap.get(DefineOut.dic_spe), DefineOut.DIC_FLAG_SPE);
        //初始化DF字典
        loadDFDic();

    }

    //读取文档频相关的文件
    public static void loadDFDic(){
        String dfFile = "/dic/df/df.dic";
        String numFile = "/dic/df/num.dic";

        InputStream  inputNum = LoadDictionary.class.getResourceAsStream(numFile);
        BufferedReader brNum = new BufferedReader(new InputStreamReader(inputNum));
        String temp = null;
        try {
            while ((temp = brNum.readLine()) != null) {
                //获取到resource中的每行
                DF_NUM = Integer.parseInt(temp);
            }
            brNum.close();
        } catch (IOException e) {
            System.err.println("READ DF-NUM ERROR: " + e);
            e.printStackTrace();
        }

        InputStream  input = LoadDictionary.class.getResourceAsStream(dfFile);
        BufferedReader br = new BufferedReader(new InputStreamReader(input));
        int count = 0;
        try {
            while ((temp = br.readLine()) != null) {
                //获取到resource中的每行
                String[] notes = temp.split("\\t");
                if(notes.length == 3) {

                    try {
                        String word = notes[0];
                        int df = Integer.parseInt(notes[1]);
                        dicDfMap.put(word, df);
                        count++;
                    } catch (Exception e) {
                        logger.info("DF bad word: [" + notes[0] + "][" + notes[1] + "][" + notes[2] + "] ERROR:" + e);
                    }
                }
            }
            br.close();
        } catch (IOException e) {
            System.err.println("READ DF ERROR: " + e);
            e.printStackTrace();
        }

    }

    //初始化公共函数
    private static void initLoad(String typePaths, String type) throws Exception{

        if (typePaths != null && typePaths.length() > 1) {
            String[] paths = typePaths.split(";");
            for (String path: paths) {
                String[] pathTmp = path.split("/");
                String natureTmp = pathTmp[pathTmp.length - 1];
                String[] natureTmp2 = natureTmp.split("\\.");
                if (natureTmp2.length == 2) {
                    String nature = natureTmp2[0];
                    if (type.equals(DefineOut.DIC_FLAG_SEG)) {
                        dicSegMap.put(nature, loadSegDic(path, nature));
                    } else if (type.equals(DefineOut.DIC_FLAG_AMBIG)) {
                        dicAmbigMap.put(nature, loadAmbigDic(path));
                    } else if (type.equals(DefineOut.DIC_FLAG_STOP)) {
                        dicStopMap.put(nature, loadStopDic(path));
                    } else if (type.equals(DefineOut.DIC_FLAG_SYNON)) {
                        dicSynonMap.put(nature, loadSynonDic(path));
                    } else if (type.equals(DefineOut.DIC_FLAG_SPE)) {
                        dicSpeMap.put(nature, loadSpeDic(path));
                    }

                }
            }
        }
    }

    //加载分词字典
    private static Forest loadSegDic(String path, String nature) throws Exception{

        Forest forest = new Forest();

        InputStream  input = LoadDictionary.class.getResourceAsStream(path);
        BufferedReader br = new BufferedReader(new InputStreamReader(input));
        String temp = null;
        String[] strs;
        Value value;
        int count = 0;
        try {
            while ((temp = br.readLine()) != null) {
                //获取到resource中的每行
                if (StringUtil.isNotBlank(temp)) {
                    temp = StringUtil.trim(temp);
                    strs = temp.split("\t");
                    strs[0] = strs[0].toLowerCase();
                    // 如何核心辞典存在那么就放弃
                    if (MyStaticValue.isSkipUserDefine && DATDictionary.getId(strs[0]) > 0) {
                        continue;
                    }
                    if (strs.length != 3) {
                        value = new Value(strs[0], nature, DEFAULT_FREQ_STR);
                    } else {
                        value = new Value(strs[0], strs[1], strs[2]);
                    }
                    Library.insertWord(forest, value);
                    count++;
                }
            }
            br.close();
        } catch (IOException e) {
            System.err.println("READ DIC-SEG ERROR: " + e);
            e.printStackTrace();
        }

        logger.info("###[DIC-SEG]###The dic of Seg[" + path + "] is loaded, the num of dic is: " + count);

        return forest;
    }

    //加歧义词字典
    private static Forest loadAmbigDic(String path) throws Exception{

        Forest forest = new Forest();

        InputStream  input = LoadDictionary.class.getResourceAsStream(path);
        BufferedReader br = new BufferedReader(new InputStreamReader(input));
        String temp = null;
        int count = 0;
        try {
            while ((temp = br.readLine()) != null) {
                if (StringUtil.isNotBlank(temp)) {
                    temp = StringUtil.trim(temp);
                    String[] split = temp.split("\t");
                    StringBuilder sb = new StringBuilder();
                    if (split.length % 2 != 0) {
                        LIBRARYLOG.error("init ambiguity  error in line :" + temp + " format err !");
                    }
                    for (int i = 0; i < split.length; i += 2) {
                        sb.append(split[i]);
                    }
                    forest.addBranch(sb.toString(), split);
                    count++;
                }
            }
            br.close();
        } catch (IOException e) {
            System.err.println("READ DIC-AMBIG ERROR: " + e);
            e.printStackTrace();
        }

        logger.info("###[DIC-AMBIG]###The dic of Ambig[" + path + "] is loaded, the num of dic is: " + count);
        return forest;
    }

    //加载停用词字典
    private static List<String> loadStopDic(String path) throws Exception{

        List<String> list = new ArrayList<String>();

        InputStream  input = LoadDictionary.class.getResourceAsStream(path);
        BufferedReader br = new BufferedReader(new InputStreamReader(input));
        String temp = null;
        int count = 0;
        try {
            while ((temp = br.readLine()) != null) {
                list.add(temp);
                count++;
            }
            br.close();
        } catch (IOException e) {
            System.err.println("READ DIC-STOP ERROR: " + e);
            e.printStackTrace();
        }

        logger.info("###[DIC-STOP]###The dic of Stop[" + path + "] is loaded, the num of dic is: " + count);
        return list;
    }

    //加载同义词字典
    private static Map<String, String> loadSynonDic(String path) throws Exception{

        Map<String, String> map = new HashMap<String, String>();

        InputStream  input = LoadDictionary.class.getResourceAsStream(path);
        BufferedReader br = new BufferedReader(new InputStreamReader(input));
        String temp = null;
        int count = 0;
        try {
            while ((temp = br.readLine()) != null) {
                String[] words = temp.split("\\t");
                if (words.length >= 2) {
                    for (int i = 1; i < words.length; i++) {
                        map.put(words[i].toLowerCase(), words[0].toLowerCase());
                    }
                    count++;
                }
            }
            br.close();
        } catch (IOException e) {
            System.err.println("READ DIC-SYNON ERROR: " + e);
            e.printStackTrace();
        }

        logger.info("###[DIC-SYNON]###The dic of Synon[" + path + "] is loaded, the num of dic is: " + count);
        return map;
    }

    //加载特俗字典
    private static Set<String> loadSpeDic(String path) throws Exception{

        Set<String> set = new HashSet<>();

        InputStream  input = LoadDictionary.class.getResourceAsStream(path);
        BufferedReader br = new BufferedReader(new InputStreamReader(input));
        String temp = null;
        int count = 0;
        try {
            while ((temp = br.readLine()) != null) {
                if (temp.length() != 0) {
                    set.add(temp);
                    count++;
                }
            }
            br.close();
        } catch (IOException e) {
            System.err.println("READ DIC-SPE ERROR: " + e);
            e.printStackTrace();
        }

        logger.info("###[DIC-SPE]###The dic of Spe[" + path + "] is loaded, the num of dic is: " + count);
        return set;
    }
}