package org.ansj.splitWord.analysis;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.recognition.arrimpl.NumRecognition;
import org.ansj.recognition.arrimpl.UserDefineRecognition;
import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.impl.GetWordsImpl;
import org.ansj.util.Graph;
import org.ansj.util.MyStaticValue;
import org.ansj.util.NameFix;
import org.ansj.util.TermUtil;
import org.nlpcn.commons.lang.tire.GetWord;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.util.WordAlert;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static org.ansj.library.DATDictionary.IN_SYSTEM;
import static org.ansj.library.DATDictionary.status;
/**
* Author: blogchong
* Time: 2016/10/13.
* Email: blogchong#qq.com
* 公众号:数据虫巢 ID:blogchong
* Desc: 重构dic分词器,支持按业务动态的更换字典。
*/
public class SeeDicAnalysis extends Analysis {

    /** Optional user ambiguity dictionary; its matches are inserted ahead of regular analysis. */
    private Forest ambiguityForest = null;

    /** User-defined dictionaries consulted by recognition; swappable per business scenario. */
    protected Forest[] forests = null;

    /**
     * Segments an entire string and wraps the resulting terms in a {@link Result}.
     *
     * @param temp text to segment
     * @return the segmentation result
     */
    public Result parseStr(String temp) {
        return new Result(analysisStr(temp));
    }

    /**
     * Segments one whole string; entries from the user-configured ambiguity
     * dictionary take priority over the regular analysis.
     *
     * @param temp text to segment
     * @return terms covering the input
     */
    private List<Term> analysisStr(String temp) {
        Graph gp = new Graph(temp);
        int startOffe = 0;
        if (this.ambiguityForest != null) {
            // Pre-pass with the ambiguity dictionary: matched spans become terms
            // directly, and only the gaps between matches go through analysis().
            GetWord gw = new GetWord(this.ambiguityForest, gp.chars);
            String[] params = null;
            while ((gw.getFrontWords()) != null) {
                if (gw.offe > startOffe) {
                    // Analyse the unmatched text between the previous match and this one.
                    analysis(gp, startOffe, gw.offe);
                }
                // params holds alternating word/nature pairs: [word0, nature0, word1, nature1, ...]
                params = gw.getParams();
                startOffe = gw.offe;
                for (int i = 0; i < params.length; i += 2) {
                    gp.addTerm(new Term(params[i], startOffe, new TermNatures(new TermNature(params[i + 1], 1))));
                    startOffe += params[i].length();
                }
            }
        }
        // Analyse whatever remains after the last ambiguity match.
        // NOTE(review): the guard compares against chars.length - 1 while the call
        // uses chars.length as the end offset — presumably the last slot is a
        // sentinel; confirm against Graph's chars layout.
        if (startOffe < gp.chars.length - 1) {
            analysis(gp, startOffe, gp.chars.length);
        }
        List<Term> result = this.getResult(gp);
        return result;
    }

    /**
     * Runs the base segmentation over chars[startOffe, endOffe), adding candidate
     * terms to the graph. Characters are dispatched on status(): runs of status-4
     * chars are normalized via WordAlert.alertEnglish, runs of status-5 chars via
     * WordAlert.alertNumber, status-0 chars become single unknown terms
     * (surrogate-pair aware), and everything else is matched against the core
     * dictionary via gwi.
     *
     * @param gp        graph collecting candidate terms
     * @param startOffe inclusive start offset into gp.chars
     * @param endOffe   exclusive end offset into gp.chars
     */
    private void analysis(Graph gp, int startOffe, int endOffe) {
        int start = 0;
        int end = 0;
        char[] chars = gp.chars;
        String str = null;
        char c = 0;
        for (int i = startOffe; i < endOffe; i++) {
            switch (status(chars[i])) {
            case 0:
                // Unclassified character: keep a surrogate pair together as one
                // term, otherwise emit the single char with an unknown nature.
                if (Character.isHighSurrogate(chars[i]) && (i + 1) < endOffe && Character.isLowSurrogate(chars[i + 1])) {
                    str = new String(Arrays.copyOfRange(chars, i, i + 2));
                    gp.addTerm(new Term(str, i, TermNatures.NULL));
                    i++;
                } else {
                    gp.addTerm(new Term(String.valueOf(chars[i]), i, TermNatures.NULL));
                }
                break;
            case 4:
                // Run of status-4 (English/Latin) chars: collect the whole run,
                // normalize it, and emit a single EN term.
                start = i;
                end = 1;
                while (++i < endOffe && status(chars[i]) == 4) {
                    end++;
                }
                str = WordAlert.alertEnglish(chars, start, end);
                gp.addTerm(new Term(str, start, TermNatures.EN));
                i--; // step back so the outer i++ lands on the first char after the run
                break;
            case 5:
                // Run of status-5 (numeric) chars: same pattern, emitted as an M term.
                start = i;
                end = 1;
                while (++i < endOffe && status(chars[i]) == 5) {
                    end++;
                }
                str = WordAlert.alertNumber(chars, start, end);
                gp.addTerm(new Term(str, start, TermNatures.M));
                i--; // step back so the outer i++ lands on the first char after the run
                break;
            default:
                // Dictionary characters: extend the run while chars are flagged in
                // IN_SYSTEM (presumably the core-dictionary char table — confirm),
                // then enumerate every dictionary word inside the run as a candidate.
                start = i;
                end = i;
                c = chars[start];
                while (IN_SYSTEM[c] > 0) {
                    end++;
                    if (++i >= endOffe)
                        break;
                    c = chars[i];
                }
                if (start == end) {
                    // Zero-length run: the char is not in the table at all.
                    gp.addTerm(new Term(String.valueOf(c), i, TermNatures.NULL));
                    continue;
                }
                gwi.setChars(chars, start, end);
                while ((str = gwi.allWords()) != null) {
                    gp.addTerm(new Term(str, gwi.offe, gwi.getItem()));
                }
                /**
                 * If the char that ended the run belongs to another case (still in
                 * IN_SYSTEM, status > 3, or a high surrogate), step back so the
                 * outer loop revisits it; otherwise add it as an unknown term.
                 */
                if (IN_SYSTEM[c] > 0 || status(c) > 3 || Character.isHighSurrogate(chars[i])) {
                    i -= 1;
                } else {
                    gp.addTerm(new Term(String.valueOf(c), i, TermNatures.NULL));
                }
                break;
            }
        }
    }

    /**
     * Dictionary-word enumerator used by analysis() to list every known word
     * inside a run of dictionary characters.
     */
    private GetWordsImpl gwi = new GetWordsImpl();

    /**
     * Builds the final term list from the graph: user-dictionary recognition,
     * path walking, optional number and person-name recognition, then collection
     * of the surviving terms.
     *
     * @param graph the populated segmentation graph
     * @return the final, ordered term list
     */
    @Override
    protected List<Term> getResult(final Graph graph) {
        Merger merger = new Merger() {
            @Override
            public List<Term> merger() {
                // User-defined dictionary recognition (before path walking).
                userDefineRecognition(graph, forests);
                graph.walkPath();
                // User-defined dictionary recognition (again, after walking).
                userDefineRecognition(graph, forests);
                // Number recognition, when enabled and the graph saw digits.
                if (MyStaticValue.isNumRecognition && graph.hasNum) {
                    new NumRecognition().recognition(graph.terms);
                }
                // Person-name recognition, when enabled and candidates exist.
                if (graph.hasPerson && MyStaticValue.isNameRecognition) {
                    // Asian person names.
                    new AsianPersonRecognition().recognition(graph.terms);
                    graph.walkPathByScore();
                    NameFix.nameAmbiguity(graph.terms);
                    // Foreign (transliterated) person names.
                    new ForeignPersonRecognition().recognition(graph.terms);
                    graph.walkPathByScore();
                }
                return getResult();
            }

            // Applies the user dictionaries in REPLACE mode, then re-scores the path.
            private void userDefineRecognition(final Graph graph, Forest... forests) {
                new UserDefineRecognition(TermUtil.InsertTermType.REPLACE, forests).recognition(graph.terms);
                graph.rmLittlePath();
                graph.walkPathByScore();
                graph.rmLittlePath();
            }

            // Collects the non-null terms. The last slot of graph.terms is skipped —
            // NOTE(review): presumably a sentinel; confirm against Graph.terms layout.
            private List<Term> getResult() {
                List<Term> result = new ArrayList<Term>();
                int length = graph.terms.length - 1;
                for (int i = 0; i < length; i++) {
                    if (graph.terms[i] != null) {
                        result.add(graph.terms[i]);
                    }
                }
                setRealName(graph, result);
                return result;
            }
        };
        return merger.merger();
    }

    /**
     * Creates an analyzer backed by the given user dictionary.
     *
     * @param forest main user dictionary
     */
    public SeeDicAnalysis(Forest forest) {
        this.forests = new Forest[] { forest };
    }

    /**
     * Creates an analyzer backed by the given user dictionary plus an ambiguity
     * dictionary applied before regular analysis.
     *
     * @param forest          main user dictionary
     * @param ambiguityForest ambiguity dictionary
     */
    public SeeDicAnalysis(Forest forest, Forest ambiguityForest) {
        this.forests = new Forest[] { forest };
        this.ambiguityForest = ambiguityForest;
    }

    /**
     * One-shot parse with the given user dictionary.
     *
     * @param forest user dictionary
     * @param str    text to segment
     * @return the segmentation result
     */
    public static Result parse(Forest forest, String str) {
        return new SeeDicAnalysis(forest).parseStr(str);
    }

    /**
     * One-shot parse with a user dictionary and an ambiguity dictionary.
     *
     * @param forest          user dictionary
     * @param ambiguityForest ambiguity dictionary
     * @param str             text to segment
     * @return the segmentation result
     */
    public static Result parse(Forest forest, Forest ambiguityForest, String str) {
        return new SeeDicAnalysis(forest, ambiguityForest).parseStr(str);
    }
}