package de.dfki.nlp.diseasener;
import com.google.common.base.Charsets;
import com.google.common.io.LineProcessor;
import com.google.common.io.Resources;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Stream;
/**
 * Dictionary-based recognizer for disease mentions.
 *
 * <p>On construction, a dictionary resource is read line by line, each line is
 * cleaned and filtered, and the surviving names are loaded into an Aho-Corasick
 * {@link Trie}. {@link #extractFromText(String)} then reports every dictionary
 * hit found in a given text.
 *
 * <p>Created by philippe on 1/31/17.
 */
public class DiseasesNer {

    /** Characters removed from every dictionary line: double quote, parentheses and hash. */
    private static final Pattern STRIPPED_CHARS = Pattern.compile("[\"()#]");

    /** Matches lines consisting only of upper-case ASCII letters, digits and whitespace. */
    private static final Pattern UPPERCASE_ONLY = Pattern.compile("^[A-Z0-9\\s]*$");

    /** A cleaned line must be strictly longer than this to be used as a keyword. */
    private static final int MIN_LENGTH_EXCLUSIVE = 5;

    private final Trie trie;

    /**
     * Creates a recognizer backed by the default dictionary resource {@code diseases.dict}.
     */
    public DiseasesNer() {
        this("diseases.dict");
    }

    /**
     * Creates a recognizer backed by the given dictionary resource.
     *
     * @param file name of the dictionary resource, resolved via
     *             {@code Resources.getResource(String)}; one candidate name per line
     * @throws IllegalArgumentException if reading the dictionary fails with an
     *                                  {@link IOException}
     */
    public DiseasesNer(String file) {
        // Load first so that a missing/unreadable dictionary fails before any trie work.
        Set<String> names = loadNames(file);

        Trie.TrieBuilder builder = Trie.builder()
                .ignoreCase()
                .onlyWholeWords()
                .ignoreOverlaps();
        names.forEach(builder::addKeyword);
        this.trie = builder.build();
    }

    /**
     * Runs the dictionary trie over {@code text} and wraps every emit in a
     * {@link DiseaseMention}.
     *
     * @param text the text to scan
     * @return a stream with one mention per emit, carrying the emit's start and end
     *         offsets and the matched slice of {@code text}
     */
    public Stream<DiseaseMention> extractFromText(String text) {
        Collection<Emit> emits = trie.parseText(text);
        // The "+ 1" turns the emit's end offset into the exclusive bound substring expects.
        return emits.stream().map(a -> new DiseaseMention(a.getStart(), a.getEnd(), text.substring(a.getStart(), a.getEnd() + 1)));
    }

    /**
     * Reads the dictionary resource and returns the distinct names that pass the filter.
     *
     * <p>Each line has the characters in {@link #STRIPPED_CHARS} removed; the result is
     * kept only if it is longer than {@link #MIN_LENGTH_EXCLUSIVE} characters and does
     * not match {@link #UPPERCASE_ONLY}.
     */
    private static Set<String> loadNames(String file) {
        try {
            return Resources.readLines(Resources.getResource(file), Charsets.UTF_8,
                    new LineProcessor<Set<String>>() {
                        private final Set<String> collected = new HashSet<>();

                        @Override
                        public boolean processLine(String line) {
                            // One precompiled pass replaces four per-line String.replaceAll calls.
                            String cleaned = STRIPPED_CHARS.matcher(line).replaceAll("");
                            if (cleaned.length() > MIN_LENGTH_EXCLUSIVE
                                    && !UPPERCASE_ONLY.matcher(cleaned).matches()) {
                                collected.add(cleaned);
                            }
                            return true; // never stop early; every line is considered
                        }

                        @Override
                        public Set<String> getResult() {
                            return collected;
                        }
                    });
        } catch (IOException e) {
            throw new IllegalArgumentException("Can't initialize DiseasesNer", e);
        }
    }
}