package de.berlin.hu.uima.cr.chemdner;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.u_compare.shared.semantic.NamedEntity;
import de.berlin.hu.chemspot.Mention;
import de.berlin.hu.types.PubmedDocument;
import de.berlin.hu.util.Constants;
import de.berlin.hu.util.Constants.ChemicalType;
/**
 * UIMA collection reader for a CHEMDNER corpus directory.
 *
 * <p>The directory named by the {@link #PARAM_INPUTDIR} configuration parameter is expected to
 * contain:
 * <ul>
 *   <li>{@code chemdner_abstracts.txt} (required): one record per line with at least three
 *       tab-separated columns. Column 1 is the document id; column 2 is emitted as a CAS with
 *       key {@code <id>:T} (presumably the title); column 3 is emitted as a second CAS with key
 *       {@code <id>:A} (the abstract).</li>
 *   <li>{@code chemdner_annotations.txt} (optional): one gold-standard mention per line with at
 *       least six tab-separated columns: document id, section, begin offset, end offset,
 *       mention text and chemical type.</li>
 * </ul>
 *
 * <p>Every record of the documents file therefore yields two CASes. Each CAS carries a
 * {@link PubmedDocument} spanning the whole text and one {@link NamedEntity} per gold-standard
 * mention registered under the same {@code <id>:<section>} key.
 *
 * <p>Reading stops at the end of the documents file or at the first empty line, whichever comes
 * first.
 */
public class CHEMDNERReader extends CollectionReader_ImplBase {
    /** Name of the configuration parameter holding the path of the corpus directory. */
    public static final String PARAM_INPUTDIR = "InputDirectory";

    private static final String DOCUMENTS_FILE_NAME = "chemdner_abstracts.txt";
    private static final String ANNOTATIONS_FILE_NAME = "chemdner_annotations.txt";

    /** Minimum number of tab-separated columns in a line of the annotations file. */
    private static final int ANNOTATION_COLUMNS = 6;
    /** Minimum number of tab-separated columns in a line of the documents file. */
    private static final int DOCUMENT_COLUMNS = 3;

    private File inputDirectory = null;
    private File documentsFile = null;
    private File annotationsFile = null;

    /** Reader over the documents file; opened by {@link #initialize()}, released by {@link #close()}. */
    private BufferedReader reader = null;
    /** Line fetched by {@link #hasNext()} that has not yet been consumed by {@link #getNext(CAS)}. */
    private String inputLine = null;
    /** Gold-standard mentions keyed by {@code <docId>:<section>}. */
    private Map<String, List<Mention>> docIdToAnnotations = null;
    /** Id of the record whose abstract is still pending, or {@code null}. */
    private String abstractId = null;
    /** Abstract text to be emitted by the next {@link #getNext(CAS)} call, or {@code null}. */
    private String abstractText = null;

    /**
     * Resolves the corpus files, loads all gold-standard annotations into memory and opens the
     * documents file for streaming.
     *
     * @throws ResourceInitializationException if the input directory parameter is missing, the
     *         annotations file is unreadable or malformed, or the documents file cannot be found
     */
    @Override
    public void initialize() throws ResourceInitializationException {
        Object configuredDirectory = getConfigParameterValue(PARAM_INPUTDIR);
        if (!(configuredDirectory instanceof String)) {
            throw new ResourceInitializationException(new IllegalArgumentException(
                    String.format("configuration parameter '%s' must be set to a directory path", PARAM_INPUTDIR)));
        }
        inputDirectory = new File((String) configuredDirectory);
        documentsFile = new File(inputDirectory.getAbsolutePath(), DOCUMENTS_FILE_NAME);
        annotationsFile = new File(inputDirectory.getAbsolutePath(), ANNOTATIONS_FILE_NAME);

        docIdToAnnotations = loadAnnotations();

        // NOTE(review): FileReader decodes with the platform default charset; if the corpus is
        // UTF-8 and the platform default differs, character offsets may not line up - confirm.
        try {
            reader = new BufferedReader(new FileReader(documentsFile));
        } catch (FileNotFoundException e) {
            throw new ResourceInitializationException(String.format("documents file '%s' could not be found", documentsFile.getAbsolutePath()), null, e);
        }
    }

    /**
     * Reads the optional annotations file into a map from {@code <docId>:<section>} to the
     * mentions of that section. The file is closed before returning.
     *
     * @return the loaded mentions; empty if the annotations file does not exist
     * @throws ResourceInitializationException if the file cannot be read or a line is malformed
     */
    private Map<String, List<Mention>> loadAnnotations() throws ResourceInitializationException {
        Map<String, List<Mention>> annotations = new HashMap<String, List<Mention>>();
        if (!annotationsFile.exists()) {
            // Annotations are optional: without them only the plain documents are produced.
            return annotations;
        }

        BufferedReader annotationsReader;
        try {
            annotationsReader = new BufferedReader(new FileReader(annotationsFile));
        } catch (FileNotFoundException e) {
            throw new ResourceInitializationException(String.format("annotations file '%s' could not be found", annotationsFile.getAbsolutePath()), null, e);
        }

        try {
            String line;
            int lineNumber = 0;
            while ((line = annotationsReader.readLine()) != null) {
                lineNumber++;
                if (line.isEmpty()) {
                    continue; // tolerate blank lines instead of failing on them
                }
                addAnnotation(annotations, line, lineNumber);
            }
        } catch (IOException e) {
            throw new ResourceInitializationException(String.format("could not read annotations file '%s'", annotationsFile.getAbsolutePath()), null, e);
        } finally {
            try {
                annotationsReader.close();
            } catch (IOException ignored) {
                // The content has already been read (or a more relevant error is propagating);
                // a failure while closing carries no additional information.
            }
        }
        return annotations;
    }

    /**
     * Parses one annotations line and appends the resulting mention to {@code annotations}.
     *
     * @param annotations map to add the mention to
     * @param line non-empty line of the annotations file
     * @param lineNumber 1-based line number, used in error messages only
     * @throws ResourceInitializationException if the line has too few columns or its offsets
     *         are not integers
     */
    private void addAnnotation(Map<String, List<Mention>> annotations, String line, int lineNumber)
            throws ResourceInitializationException {
        String[] data = line.split("\t");
        if (data.length < ANNOTATION_COLUMNS) {
            throw new ResourceInitializationException(new IllegalArgumentException(String.format(
                    "malformed line %d in annotations file '%s': expected %d tab-separated columns but found %d",
                    lineNumber, annotationsFile.getAbsolutePath(), ANNOTATION_COLUMNS, data.length)));
        }

        int begin;
        int end;
        try {
            begin = Integer.parseInt(data[2]);
            end = Integer.parseInt(data[3]);
        } catch (NumberFormatException e) {
            throw new ResourceInitializationException(new IllegalArgumentException(String.format(
                    "malformed offsets in line %d of annotations file '%s'",
                    lineNumber, annotationsFile.getAbsolutePath()), e));
        }

        Mention mention = new Mention(begin, end, data[4]);
        mention.setType(ChemicalType.fromString(data[5]));
        mention.setSource(Constants.GOLDSTANDARD);

        String key = data[0] + ":" + data[1];
        List<Mention> mentions = annotations.get(key);
        if (mentions == null) {
            mentions = new ArrayList<Mention>();
            annotations.put(key, mentions);
        }
        mentions.add(mention);
    }

    /**
     * Fills {@code aCAS} with the next document. For each record of the documents file the
     * second column is emitted first (key {@code <id>:T}); the third column is kept and emitted
     * by the following call (key {@code <id>:A}).
     *
     * @param aCAS the CAS to populate
     * @throws IOException if the documents file cannot be read or the current line has fewer
     *         than three tab-separated columns
     * @throws CollectionException if the JCas cannot be obtained or no document is left
     */
    @Override
    public void getNext(CAS aCAS) throws IOException, CollectionException {
        JCas jcas;
        try {
            jcas = aCAS.getJCas();
        } catch (CASException e) {
            throw new CollectionException(e);
        }

        String text;
        String key;
        if (abstractText != null) {
            // Second half of the current record: emit the pending abstract.
            key = abstractId + ":A";
            text = abstractText;
            abstractId = null;
            abstractText = null;
        } else {
            // Fetch a line here as well so that the reader does not depend on the caller having
            // invoked hasNext() immediately before.
            if (!hasNext()) {
                throw new CollectionException(new IllegalStateException("no more documents to read"));
            }
            String line = inputLine;
            inputLine = null; // mark the line as consumed so hasNext() advances afterwards

            String[] data = line.split("\t");
            if (data.length < DOCUMENT_COLUMNS) {
                throw new IOException(String.format(
                        "malformed line in documents file '%s': expected %d tab-separated columns but found %d",
                        documentsFile.getAbsolutePath(), DOCUMENT_COLUMNS, data.length));
            }
            key = data[0] + ":T";
            text = data[1];
            abstractId = data[0];
            abstractText = data[2];
        }

        jcas.setDocumentText(text);

        PubmedDocument pubmedDoc = new PubmedDocument(jcas);
        pubmedDoc.setBegin(0);
        pubmedDoc.setEnd(text.length());
        pubmedDoc.setPmid(key);
        pubmedDoc.addToIndexes(jcas);

        List<Mention> mentions = docIdToAnnotations.get(key);
        if (mentions != null) {
            for (Mention mention : mentions) {
                NamedEntity entity = new NamedEntity(jcas);
                entity.setBegin(mention.getStart());
                entity.setEnd(mention.getEnd());
                entity.setEntityType(mention.getType().toString());
                entity.setSource(Constants.GOLDSTANDARD);
                entity.addToIndexes();
            }
        }
    }

    /**
     * Reports whether another document can be produced. The call is idempotent: a line fetched
     * from the documents file is buffered until {@link #getNext(CAS)} consumes it, so repeated
     * calls never skip input.
     *
     * @return {@code true} if an abstract is pending or a non-empty line is available;
     *         {@code false} at end of file, at the first empty line, or if no reader is open
     * @throws IOException if the documents file cannot be read
     */
    @Override
    public boolean hasNext() throws IOException, CollectionException {
        if (abstractText != null) {
            return true;
        }
        if (inputLine == null && reader != null) {
            inputLine = reader.readLine();
        }
        // An empty line is treated as the end of the collection.
        return inputLine != null && !inputLine.isEmpty();
    }

    /**
     * Progress is not tracked by this reader.
     *
     * @return an empty array, so that callers can iterate over the result without a null check
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[0];
    }

    /**
     * Closes the documents file if it is open. Calling this method more than once is harmless.
     *
     * @throws IOException if closing the underlying reader fails
     */
    @Override
    public void close() throws IOException {
        if (reader != null) {
            try {
                reader.close();
            } finally {
                reader = null;
            }
        }
    }
}