package com.mite8.utils.off_line_util;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Author: blogchong
 * Time: 2016/10/9.
 * Email: blogchong#qq.com
 * WeChat official account: blogchong
 * Desc: From a brand-word dictionary, extracts the entries that consist solely
 * of English letters and writes them (lower-cased, de-duplicated) to a file.
 *
 * <p>Example usage:
 * <pre>{@code
 * JudgeWordDic mergeWordDic = new JudgeWordDic();
 * mergeWordDic.mergeWordDic("dic/seg_words/brand.dic", "/path/to/brand_merge_only_en.dic");
 * }</pre>
 */
public class JudgeWordDic {

    /** Matches a word made up exclusively of ASCII letters; compiled once instead of per line. */
    private static final Pattern ENGLISH_ONLY = Pattern.compile("[a-zA-Z]+");

    /**
     * Reads the dictionary at {@code srcDicPath} from the classpath, keeps every line that
     * (after trimming) consists only of ASCII letters, lower-cases and de-duplicates those
     * words, and writes the result to {@code outPutPath} via {@link #writeFile(Set, String)}.
     *
     * <p>Failures are reported on {@code System.err} rather than thrown: if the resource
     * cannot be found or cannot be read completely, nothing is written.
     *
     * @param srcDicPath classpath-relative path of the source dictionary (one word per line, UTF-8)
     * @param outPutPath file-system path of the output file
     */
    public void mergeWordDic(String srcDicPath, String outPutPath) {
        Set<String> set = new HashSet<String>();
        ClassLoader classLoader = getClass().getClassLoader();

        // Open the resource as a stream instead of converting its URL to a File:
        // URL.getFile() yields a URL-encoded path (breaks on spaces / non-ASCII
        // characters) and cannot address a resource packaged inside a jar.
        InputStream in = classLoader.getResourceAsStream(srcDicPath);
        if (in == null) {
            System.err.println("dictionary resource not found :" + srcDicPath);
            return;
        }

        // Decode explicitly as UTF-8 so the result does not depend on the platform charset.
        // Closing the scanner also closes the underlying stream.
        try (Scanner scanner = new Scanner(in, StandardCharsets.UTF_8.name())) {
            while (scanner.hasNextLine()) {
                String dicWord = scanner.nextLine().trim();
                if (ENGLISH_ONLY.matcher(dicWord).matches()) {
                    // Locale.ROOT keeps lower-casing stable under locales such as Turkish.
                    set.add(dicWord.toLowerCase(Locale.ROOT));
                }
            }
            // Scanner swallows read errors; surface them so a truncated read
            // is not silently written out as if it were the full dictionary.
            IOException readError = scanner.ioException();
            if (readError != null) {
                readError.printStackTrace();
                return;
            }
        }

        writeFile(set, outPutPath);
    }

    /**
     * Writes every word of {@code set} to {@code outPath}, one word per line, encoded as UTF-8.
     * An existing file is overwritten. The number of written words is printed to
     * {@code System.err} on success; any failure is printed to {@code System.err} instead
     * of being thrown.
     *
     * @param set     words to store; iteration order of the set determines line order
     * @param outPath file-system path of the output file
     */
    public static void writeFile(Set<String> set, String outPath) {
        int count = 0;
        // try-with-resources guarantees the writer (and the file handle) is released
        // even when a write fails part-way through.
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(outPath)), StandardCharsets.UTF_8))) {
            for (String word : set) {
                bw.write(word);
                bw.newLine();
                count++;
            }
        } catch (IOException | RuntimeException e) {
            // RuntimeException is included so null arguments are still reported, not thrown.
            System.err.println("write errors :" + e);
            return;
        }
        System.err.println("The num of merge words is :" + count);
    }
}