/*
* Copyright (c) 2012. Humboldt-Universität zu Berlin, Dept. of Computer Science and Dept.
* of Wissensmanagement in der Bioinformatik
* -------------------------------
*
* THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC
* LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
* CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
*
* http://www.opensource.org/licenses/cpl1.0
*/
package de.berlin.hu.uima.ae.normalizer;
import de.berlin.hu.chemspot.ChemSpotConfiguration;
import de.berlin.hu.chemspot.ChemSpotConfiguration.Component;
import de.berlin.hu.util.Constants;
import de.berlin.hu.util.Constants.ChemicalID;
//import groovyNormalizerBeans.NameNormalizer;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.u_compare.shared.semantic.NamedEntity;
import org.uimafit.util.JCasUtil;
import uk.ac.cam.ch.wwmm.opsin.NameToInchi;
import uk.ac.cam.ch.wwmm.opsin.NameToStructureException;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
/**
* User: Tim Rocktaeschel
* Date: 8/16/12
* Time: 3:28 PM
*/
public class Normalizer extends JCasAnnotator_ImplBase {
// Dictionary of chemical identifiers keyed by lower-cased chemical name (see readIdsFile);
// each value holds the tab-separated identifier columns of one dictionary line.
// Static, so it is shared by every Normalizer instance; getIds() hands out this very map.
private static Map<String,String[]> ids = new HashMap<String,String[]>();
// Identifiers read from zip entries whose name contains "normalized"; only filled by
// initialize() when the CHEMHITS component is enabled.
private static Map<String,String[]> normalizedIds = new HashMap<String,String[]>();
// OPSIN name-to-InChI converter; remains null unless the OPSIN component is enabled
// and its construction in initialize() succeeds. process() checks for null before use.
private NameToInchi nameToInChi;
// Name of the UIMA configuration parameter that holds the path to the ids file.
private static final String PATH_TO_IDS = "PathToIDs";
//private NameNormalizer nameNormalizer = null;
// Filled by loadFDAData(): drug name (second column) -> id (first column).
private Map<String, String> fdaIds = null;
// Filled by loadFDAData(): id (first column) -> date (third column).
private Map<String, String> fdaDates = null;
/**
 * Loads the FDA drug table from a classpath resource.
 *
 * <p>Every line is expected to hold at least three tab-separated columns:
 * id, drug name and date. The method (re)creates {@code fdaIds}
 * (drug name -> id) and {@code fdaDates} (id -> date). Lines with fewer than
 * three columns, such as blank lines, are skipped instead of aborting the load.
 *
 * @param pathToFile resource path handed to {@code Class.getResourceAsStream}
 * @throws IOException if the resource does not exist or cannot be read
 */
private void loadFDAData(String pathToFile) throws IOException {
    fdaIds = new HashMap<String, String>();
    fdaDates = new HashMap<String, String>();

    InputStream stream = this.getClass().getResourceAsStream(pathToFile);
    if (stream == null) {
        // A missing resource is reported as null; without this check it would
        // surface as a NullPointerException that IOException handlers never see.
        throw new IOException("FDA resource not found on classpath: " + pathToFile);
    }

    BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            String[] data = line.split("\t");
            if (data.length < 3) {
                // Blank or truncated line: nothing usable to store.
                continue;
            }
            String id = data[0];
            String drug = data[1];
            String date = data[2];
            fdaIds.put(drug, id);
            fdaDates.put(id, date);
        }
    } finally {
        // Close even when reading fails so the underlying stream is released.
        reader.close();
    }
}
/**
 * Counts the three-character prefixes and suffixes of all dictionary names in
 * {@code ids} and writes them, most frequent first, to {@code prefixes.txt},
 * {@code suffixes.txt} and {@code suffixes-filtered.txt} (suffixes consisting
 * of lower-case ASCII letters only) in the current working directory.
 *
 * <p>One leading "(" and one trailing ")" are stripped from a name before its
 * prefix and suffix are taken. Names shorter than three characters after
 * stripping contribute nothing.
 *
 * @throws IOException if one of the output files cannot be written
 */
private void writePrefixSuffixLists() throws IOException {
    final int prefixLength = 3;
    final int suffixLength = 3;
    Map<String, Integer> prefixes = new HashMap<String, Integer>();
    Map<String, Integer> suffixes = new HashMap<String, Integer>();

    System.out.println("Writing prefix and suffix lists...");

    for (String name : ids.keySet()) {
        String chemical = name;
        if (chemical.startsWith("(")) {
            chemical = chemical.substring(1);
        }
        if (chemical.endsWith(")")) {
            chemical = chemical.substring(0, chemical.length() - 1);
        }
        if (chemical.length() >= prefixLength) {
            incrementCount(prefixes, chemical.substring(0, prefixLength));
        }
        if (chemical.length() >= suffixLength) {
            incrementCount(suffixes, chemical.substring(chemical.length() - suffixLength));
        }
    }

    List<String> prefixList = keysByDescendingCount(prefixes);
    List<String> suffixList = keysByDescendingCount(suffixes);

    writeCounts("prefixes.txt", prefixList, prefixes, null);
    writeCounts("suffixes.txt", suffixList, suffixes, null);
    writeCounts("suffixes-filtered.txt", suffixList, suffixes, String.format("[a-z]{%d}", suffixLength));

    System.out.println("Done.");
}

/** Adds one to the counter stored under {@code key}, starting at zero for new keys. */
private static void incrementCount(Map<String, Integer> counts, String key) {
    Integer current = counts.get(key);
    counts.put(key, current == null ? 1 : current + 1);
}

/** Returns the keys of {@code counts} ordered from the highest to the lowest count. */
private static List<String> keysByDescendingCount(final Map<String, Integer> counts) {
    List<String> keys = new ArrayList<String>(counts.keySet());
    Comparator<String> byCount = new Comparator<String>() {
        public int compare(String o1, String o2) {
            // compareTo instead of subtraction: cannot overflow.
            return counts.get(o1).compareTo(counts.get(o2));
        }
    };
    Collections.sort(keys, Collections.reverseOrder(byCount));
    return keys;
}

/**
 * Writes one "key TAB count" line per key to {@code fileName}; when
 * {@code filterRegex} is non-null only keys fully matching it are written.
 * The writer is closed even if writing fails.
 */
private static void writeCounts(String fileName, List<String> keys, Map<String, Integer> counts, String filterRegex) throws IOException {
    BufferedWriter writer = new BufferedWriter(new FileWriter(fileName));
    try {
        for (String key : keys) {
            if (filterRegex == null || key.matches(filterRegex)) {
                writer.write(String.format("%s\t%d%n", key, counts.get(key)));
            }
        }
    } finally {
        writer.close();
    }
}
/**
 * Parses a tab-separated identifier dictionary.
 *
 * <p>Each line has the form {@code name TAB id1 TAB id2 ...}. The name is
 * lower-cased and used as the map key; everything after the first tab is
 * split on tabs and stored as the value. If a name occurs more than once the
 * last line wins. Lines that contain no tab at all (for example blank lines)
 * carry no identifiers and are skipped.
 *
 * <p>The stream is read to its end but not closed; closing it is left to the
 * caller.
 *
 * @param in stream to read the dictionary from
 * @return map from lower-cased chemical name to its identifier columns
 * @throws IOException if reading from {@code in} fails
 */
public static Map<String, String[]> readIdsFile(InputStream in) throws IOException {
    Map<String, String[]> result = new HashMap<String, String[]>();
    // Only needed by the disabled ChemHits block below:
    //Map<String, List<String>> normalizedChems = new HashMap<String, List<String>>();
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));

    String line;
    while ((line = reader.readLine()) != null) {
        int splitAt = line.indexOf('\t');
        if (splitAt < 0) {
            // Without a separator substring(0, -1) would throw; there is
            // nothing to map the name to, so the line is ignored.
            continue;
        }
        String chem = line.substring(0, splitAt).toLowerCase();
        String identifiers = line.substring(splitAt + 1);
        result.put(chem, identifiers.split("\t"));

        /*nameNormalizer.setName(chem);
        String normalizedChem = nameNormalizer.getNormName();
        if (!normalizedChems.containsKey(normalizedChem)) {
            normalizedChems.put(normalizedChem, new ArrayList<String>());
        } else {
            String equal = Arrays.equals(identifiers.split("\t"), ids.get(normalizedChem)) ? "equal" : "not equal";
            System.out.println("conflict for '" + chem + "' and " + normalizedChems.get(normalizedChem) + ". Ids are " + equal + ".");
            if ("not equal".equals(equal)) {
                System.out.println(" " + identifiers.replaceAll("\t", ", ") + " vs." + Arrays.toString(ids.get(normalizedChem)).replace('[', ' ').replace(']', ' '));
            }
        }
        normalizedChems.get(normalizedChem).add(chem);
        result.put(normalizedChem, identifiers.split("\t"));*/
    }
    return result;
}
/**
 * Loads an identifier dictionary from a file on disk.
 *
 * <p>If the path ends with {@code .zip}, every entry of the archive is parsed
 * with {@link #readIdsFile(InputStream)} and the results are merged (later
 * entries overwrite earlier ones for the same name). Otherwise the file itself
 * is parsed. All streams and the archive are closed before returning, also
 * when parsing fails.
 *
 * @param file path to a plain dictionary file or a zip archive of such files
 * @return map from lower-cased chemical name to its identifier columns
 * @throws IOException if the file cannot be opened or read
 */
public static Map<String, String[]> loadIdsFromFile(String file) throws IOException {
    // Named "loaded" so it does not shadow the static ids dictionary.
    Map<String, String[]> loaded = new HashMap<String, String[]>();

    if (file.endsWith(".zip")) {
        ZipFile zipFile = new ZipFile(file);
        try {
            Enumeration<? extends ZipEntry> entries = zipFile.entries();
            while (entries.hasMoreElements()) {
                InputStream in = zipFile.getInputStream(entries.nextElement());
                try {
                    loaded.putAll(readIdsFile(in));
                } finally {
                    in.close();
                }
            }
        } finally {
            // The archive holds a file handle of its own; release it as well.
            zipFile.close();
        }
    } else {
        InputStream in = new FileInputStream(file);
        try {
            loaded.putAll(readIdsFile(in));
        } finally {
            in.close();
        }
    }
    return loaded;
}
/**
 * Writes an identifier dictionary as a tab-separated file.
 *
 * <p>One line is written per map entry: the name followed by one column for
 * every {@code ChemicalID} constant, in declaration order. A column is left
 * empty when the entry's array is too short to contain it or holds
 * {@code null} at that position.
 *
 * @param pathToFile file to create or overwrite
 * @param ids        map from chemical name to identifier array, indexed by
 *                   {@code ChemicalID} ordinal
 * @throws IOException if the file cannot be written
 */
public static void writeIDs(String pathToFile, Map<String, String[]> ids) throws IOException {
    BufferedWriter writer = new BufferedWriter(new FileWriter(pathToFile));
    try {
        for (Map.Entry<String, String[]> entry : ids.entrySet()) {
            String[] chemIds = entry.getValue();
            StringBuilder row = new StringBuilder(entry.getKey());
            for (ChemicalID type : ChemicalID.values()) {
                String id = type.ordinal() < chemIds.length ? chemIds[type.ordinal()] : null;
                row.append('\t');
                if (id != null) {
                    row.append(id);
                }
            }
            writer.write(row.toString());
            writer.newLine();
        }
    } finally {
        // Close (and thereby flush) even if a write fails part-way through.
        writer.close();
    }
}
/**
 * Gives access to the shared identifier dictionary.
 *
 * @return the static {@code ids} map itself (not a copy)
 */
public static Map<String, String[]> getIds() {
    return Normalizer.ids;
}
/**
 * Prepares the normalizer: loads the FDA table, the identifier dictionaries
 * named by the {@code PathToIDs} configuration parameter and, if the OPSIN
 * component is enabled, the OPSIN name-to-InChI converter.
 *
 * <p>For a zip archive, entries whose name contains "normalized" go into
 * {@code normalizedIds} (only when CHEMHITS is enabled); all other entries go
 * into {@code ids} (only when NORMALIZER is enabled). Any other file is
 * loaded through {@link #loadIdsFromFile(String)} and replaces {@code ids}.
 *
 * <p>Failures to load the FDA table or to start OPSIN are printed and
 * otherwise ignored, so the annotator still runs without them.
 *
 * @param aContext UIMA context supplying the configuration parameters
 * @throws ResourceInitializationException if the {@code PathToIDs} parameter
 *         is missing or the ids file cannot be read
 */
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);
    System.out.println("Initializing normalizer...");

    try {
        loadFDAData("/resources/fda/approved_drugs.tsv");
    } catch (IOException e) {
        // Best effort: FDA ids are optional extras.
        e.printStackTrace();
    }

    /*if (ChemSpotConfiguration.useComponent(Component.CHEMHITS)) {
        System.out.println(" Initializing ChemHits...");
        nameNormalizer = new NameNormalizer();
    }*/

    Object idsParameter = aContext.getConfigParameterValue(PATH_TO_IDS);
    if (idsParameter == null) {
        // Report a configuration error through the declared exception type
        // instead of letting toString() fail with a NullPointerException.
        throw new ResourceInitializationException(
                new IllegalArgumentException("Missing configuration parameter '" + PATH_TO_IDS + "'"));
    }
    String idsFile = idsParameter.toString();

    try {
        if (idsFile.endsWith(".zip")) {
            ZipFile zipFile = new ZipFile(idsFile);
            try {
                Enumeration<? extends ZipEntry> entries = zipFile.entries();
                while (entries.hasMoreElements()) {
                    ZipEntry entry = entries.nextElement();
                    InputStream in = zipFile.getInputStream(entry);
                    try {
                        if (entry.getName().contains("normalized")) {
                            if (ChemSpotConfiguration.useComponent(Component.CHEMHITS)) {
                                System.out.print(" Loading ChemHits normalized ids... ");
                                normalizedIds.putAll(readIdsFile(in));
                                System.out.println("Done.");
                            }
                        } else if (ChemSpotConfiguration.useComponent(Component.NORMALIZER)) {
                            System.out.print(" Loading ids... ");
                            ids.putAll(readIdsFile(in));
                            System.out.println("Done.");
                        }
                    } finally {
                        in.close();
                    }
                }
            } finally {
                // Release the archive's file handle once all entries are processed.
                zipFile.close();
            }
        } else {
            ids = loadIdsFromFile(idsFile);
        }
    } catch (IOException e) {
        throw new ResourceInitializationException(e);
    }

    if (ChemSpotConfiguration.useComponent(Component.OPSIN)) {
        try {
            //initializing OPSIN
            nameToInChi = new NameToInchi();
        } catch (NameToStructureException e) {
            // Best effort: process() works without OPSIN when nameToInChi stays null.
            e.printStackTrace();
        }
    }

    /*try {
        writePrefixSuffixLists();
    } catch (IOException e) {
        e.printStackTrace();
    }*/
}
/**
 * Looks up the dictionary entry that most closely resembles a chemical name.
 *
 * <p>Every key is scored against {@code chemical} with the Dice coefficient
 * over character bigrams. If the best score exceeds 0.7 that key's
 * identifiers are returned. Otherwise the longest key that occurs as a
 * substring of {@code chemical} is used, provided it is longer than three
 * characters.
 *
 * <p>A dot is printed to standard output after every 10000 keys as a
 * progress indicator.
 *
 * @param chemical name to look up
 * @param ids      dictionary to search
 * @return identifiers of the best match, or {@code null} if nothing qualifies
 */
private String[] getBestMatch(String chemical, Map<String, String[]> ids) {
    String bestMatch = null;
    float bestScore = 0;
    String longestSubstringMatch = null;
    int processed = 0;

    // NOTE(review): the original contained an empty
    // "if (Math.abs(chemical.length() - key.length()) < 3)" here, presumably a
    // disabled length pre-filter; it had no effect and was dropped.
    for (String key : ids.keySet()) {
        float score = StringComparator.diceCoefficient(StringComparator.getNGrams(chemical, 2), StringComparator.getNGrams(key, 2));
        if (score > bestScore) {
            bestMatch = key;
            bestScore = score;
        }
        // Track the longest contained key directly; on equal length the first
        // one encountered is kept.
        if (chemical.contains(key)
                && (longestSubstringMatch == null || key.length() > longestSubstringMatch.length())) {
            longestSubstringMatch = key;
        }
        if (++processed % 10000 == 0) {
            System.out.print(".");
        }
    }

    if (bestScore > 0.7) {
        return ids.get(bestMatch);
    }
    if (longestSubstringMatch != null && longestSubstringMatch.length() > 3) {
        // Fixed: the fallback must return the substring match, not the
        // low-scoring (possibly null) bigram match.
        return ids.get(longestSubstringMatch);
    }
    return null;
}
// Statistics counters. All are static, so they accumulate over every Normalizer
// instance and every process() call; nothing in this file resets them.
// The chemHits* counters are read by printChemHitsStatistic(); the code in
// process() that incremented them is currently commented out.
private static int chemHitsDifferent = 0;
private static int chemHitsEqual = 0;
// Not referenced anywhere else in this file.
private static int chemHitsIdFound = 0;
private static int chemHitsIdFoundExclusively = 0;
private static int chemHitsIdNotFoundExclusively = 0;
private static int chemHitsIdFoundBoth = 0;
private static int chemHitsIdNotFound = 0;
private static int chemHitsdifferentIdFound = 0;
// Number of entities seen by process() whose source is not the gold standard.
private static int nE = 0;
// Number of those entities that got a dictionary hit or an OPSIN InChI.
private static int nN = 0;
// Number of those entities whose lower-cased text is a key of fdaIds.
private static int fda = 0;
// Entities (any source) whose text matches "[a-zA-Z]+".
private static int one = 0;
// Entities (any source) whose text matches "[a-zA-Z]+ [a-zA-Z]+".
private static int two = 0;
// Entities (any source) whose text matches "[a-zA-Z0-9]+( [a-zA-Z0-9]+)?".
private static int twoAll = 0;
/**
 * Attaches identifiers to every {@code NamedEntity} in the CAS that does not
 * come from the gold standard.
 *
 * <p>For each such entity the lower-cased covered text is looked up in the
 * {@code ids} dictionary, an InChI is requested from OPSIN (when available)
 * to fill a missing InChI slot, and FDA id and date are added when the text
 * is a key of {@code fdaIds}. The resulting array is stored on the entity as
 * {@code Arrays.toString(...)}; entities without any identifier get a
 * {@code null} id. Gold-standard entities are only counted in the word-shape
 * statistics and otherwise left untouched.
 *
 * @param jCas the CAS whose named entities are normalized
 * @throws AnalysisEngineProcessException declared by the UIMA interface
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    final int inchiIndex = ChemicalID.INCH.ordinal();
    final int idCount = ChemicalID.values().length;

    Iterator<NamedEntity> entities = JCasUtil.iterator(jCas, NamedEntity.class);
    while (entities.hasNext()) {
        NamedEntity entity = entities.next();
        String coveredText = entity.getCoveredText();

        // Word-shape statistics are collected for all entities, gold standard included.
        if (coveredText.matches("[a-zA-Z]+")) {
            one++;
        }
        if (coveredText.matches("[a-zA-Z]+ [a-zA-Z]+")) {
            two++;
        }
        if (coveredText.matches("[a-zA-Z0-9]+( [a-zA-Z0-9]+)?")) {
            twoAll++;
        }

        if (Constants.GOLDSTANDARD.equals(entity.getSource())) {
            // NOTE(review): the original additionally contained a branch that cloned
            // gold-standard entities with source "Test" and added them to the indexes,
            // but it sat inside the non-gold-standard guard and could never run.
            // It was removed; behavior is unchanged.
            continue;
        }

        nE++;
        String lowerCaseText = coveredText.toLowerCase();
        // OPSIN is only consulted for entities that are actually normalized.
        String inchi = nameToInChi != null ? nameToInChi.parseToStdInchi(coveredText) : null;
        String[] normalized = ids.get(lowerCaseText);

        /*if (nameNormalizer != null) {
            nameNormalizer.setName(entity.getCoveredText());
            String chemHitsnormalizedString = nameNormalizer.getNormName();
            if (entity.getCoveredText().replace("-", " ").equalsIgnoreCase(chemHitsnormalizedString.replace("-", " "))) {
                chemHitsEqual++;
            } else {
                chemHitsDifferent++;
            }
            //System.out.println(entity.getCoveredText() + " - > " + chemHitsnormalizedString);
            String[] chemhitsNormalized = normalizedIds.get(chemHitsnormalizedString);
            if (normalized != null && (normalized[Constants.ChemicalID.CHEB.ordinal()] == null || normalized[Constants.ChemicalID.CHEB.ordinal()].isEmpty())) {
                normalized = null;
            }
            if (chemhitsNormalized != null && (chemhitsNormalized[Constants.ChemicalID.CHEB.ordinal()] == null || chemhitsNormalized[Constants.ChemicalID.CHEB.ordinal()].isEmpty())) {
                chemhitsNormalized = null;
            }
            if (normalized == null && chemhitsNormalized == null) {
                chemHitsIdNotFound++;
            } else if (normalized == null && chemhitsNormalized != null) {
                chemHitsIdFoundExclusively++;
            } else if (normalized != null && chemhitsNormalized == null) {
                chemHitsIdNotFoundExclusively++;
            } else if (normalized != null && chemhitsNormalized != null) {
                chemHitsIdFoundBoth++;
            }
            if (normalized != null && chemhitsNormalized != null) {
                if (chemhitsNormalized.length != normalized.length) {
                    chemHitsdifferentIdFound++;
                } else {
                    for (int i = 0; i < chemhitsNormalized.length; i++) {
                        if (
                            (normalized[i] != null && !normalized[i].equals(chemhitsNormalized[i]))
                            || (chemhitsNormalized[i] != null && !chemhitsNormalized[i].equals(normalized[i]))
                        ) {
                            chemHitsdifferentIdFound++;
                            break;
                        }
                    }
                }
            }
            if (normalized == null && chemhitsNormalized != null) {
                normalized = chemhitsNormalized;
                System.out.println("replacing id with the one found by ChemHits: " + entity.getCoveredText() + " -> " + chemHitsnormalizedString);
            }
        }*/
        /*if (normalized == null) {
            normalized = getBestMatch(entity.getCoveredText().toLowerCase(), ids);
        }*/

        //FIXME: use a UIMA field instead of a String here
        if (normalized != null) {
            // Entity is contained in the dictionary.
            if (normalized.length > inchiIndex) {
                String knownInchi = normalized[inchiIndex];
                // Null-safe: a slot may be null as well as empty.
                if ((knownInchi == null || knownInchi.isEmpty()) && inchi != null) {
                    // As before, this writes into the array held by the shared dictionary.
                    normalized[inchiIndex] = inchi;
                }
            } else if (inchi != null) {
                // Dictionary line was too short to have an InChI column: extend a copy.
                normalized = Arrays.copyOf(normalized, inchiIndex + 1);
                normalized[inchiIndex] = inchi;
            }
            nN++;
        } else if (inchi != null) {
            // Unknown to the dictionary, but OPSIN could derive a structure.
            normalized = new String[inchiIndex + 1];
            normalized[inchiIndex] = inchi;
            nN++;
        }

        if (fdaIds != null && fdaIds.containsKey(lowerCaseText)) {
            fda++;
            //System.out.println(entity.getCoveredText().toLowerCase());
            // Always work on a full-width copy so the shared dictionary array is
            // not modified by the FDA columns.
            normalized = normalized == null ? new String[idCount] : Arrays.copyOf(normalized, idCount);
            String fdaId = fdaIds.get(lowerCaseText);
            normalized[ChemicalID.FDA.ordinal()] = fdaId;
            if (fdaDates.containsKey(fdaId)) {
                normalized[ChemicalID.FDA_DATE.ordinal()] = fdaDates.get(fdaId);
            }
        }

        entity.setId(normalized != null ? Arrays.toString(normalized) : null);
    }
    //if (nameNormalizer != null) printChemHitsStatistic();
    //System.out.println(fda);
    //System.out.println(nN + "/" + nE);
    //System.out.printf("not normalized: %d, all: %d, one word: %d, two words: %d, two words all: %d%n", nN, nE, one, two, twoAll);
}
/**
 * Prints the ChemHits comparison counters to standard output: how many terms
 * changed under ChemHits normalization, and how often ids were found by
 * ChemHits only, by ChemSpot only, by neither or by both. Percentages are
 * reported as 0 when their denominator is 0.
 */
private void printChemHitsStatistic() {
    int comparedTerms = chemHitsDifferent + chemHitsEqual;
    float changedPercent = 0;
    if (comparedTerms > 0) {
        changedPercent = (float)chemHitsDifferent / (float)comparedTerms * 100;
    }
    System.out.printf(
            "%nChemHits statistics:%n identifed %d new terms after normalization (of %d / %.2f %%)%n",
            chemHitsDifferent,
            comparedTerms,
            changedPercent);

    float differingPercent = 0;
    if (chemHitsIdFoundBoth > 0) {
        differingPercent = (float)chemHitsdifferentIdFound / (float)chemHitsIdFoundBoth * 100;
    }
    System.out.printf(
            " found only by ChemHits: %d, only by ChemSpot: %d, by neither: %d, by both: %d (%d of those differently / %.2f %%)%n%n",
            chemHitsIdFoundExclusively,
            chemHitsIdNotFoundExclusively,
            chemHitsIdNotFound,
            chemHitsIdFoundBoth,
            chemHitsdifferentIdFound,
            differingPercent);
}
}