package io.github.infolis.algorithm;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.model.EntityType;
import io.github.infolis.model.ExecutionStatus;
import io.github.infolis.model.TextualReference;
import io.github.infolis.model.entity.Entity;
import io.github.infolis.util.InformationExtractor;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
*
* @author kata
*
*/
public class MetaDataExtractor extends BaseAlgorithm {
public MetaDataExtractor(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient, FileResolver inputFileResolver, FileResolver outputFileResolver) {
super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
}
private static final Logger log = LoggerFactory.getLogger(MetaDataExtractor.class);
@Override
public void execute() throws IOException {
String tr = getExecution().getTextualReferences().get(0);
TextualReference ref = getInputDataStoreClient().get(TextualReference.class, tr);
debug(log, "Extracting metadata from textual reference {}", ref);
Entity entity = extractMetadata(ref);
if ((null == entity.getName() || entity.getName().isEmpty()) &&
entity.getNumericInfo().isEmpty() &&
(null == entity.getIdentifiers() || entity.getIdentifiers().isEmpty()) &&
(null == entity.getURL() || entity.getURL().isEmpty())) {
error(log, "Could not extract metadata for reference {} ", ref);
getExecution().setStatus(ExecutionStatus.FAILED);
return;
}
//TODO multiply with actual reliability of metadata extraction
double metadataExtractionReliability = 0.7;
try {
entity.setEntityReliability(ref.getReferenceReliability() * metadataExtractionReliability);
} catch (NullPointerException npe) {
log.debug("Cannot set reliability of entity: textual reference's reliability is null");
}
getOutputDataStoreClient().post(Entity.class, entity);
getExecution().setLinkedEntities(Arrays.asList(entity.getUri()));
getExecution().setStatus(ExecutionStatus.FINISHED);
}
/**
* Extracts metadata from a TextualReference object and returns a corresponding Entity.
*
* @param ref the textual reference
* @return an entity representing the extracted information
*/
public Entity extractMetadata(TextualReference ref) {
Entity entity = new Entity();
//TODO hacky, other special characters may still cause problems
String name = ref.getReference()
// replace characters that might have been introduced during tokenization
.replace("-LRB-", "")
.replace("-RRB-", "")
.replace("*NL*", "")
.replaceAll("\\d", "")
.replaceAll("\\p{Punct}+", " ")
.replace("ü", "ue")
.replace("ä", "ae")
.replace("ö", "oe")
.replace("Ü", "Ue")
.replace("Ä", "Ae")
.replace("Ö", "Oe")
.replaceAll("\\s+", " ")
.trim();
entity.setName(name);
entity.setTags(ref.getTags());
entity.addAllTags(getExecution().getTags());
List<String> numericInfo = InformationExtractor.extractNumericInfo(ref);
//TODO make priorities configurable...
numericInfo = InformationExtractor.sortNumericInfo(numericInfo);
entity.setNumericInfo(InformationExtractor.sortNumericInfo(numericInfo));
entity.setEntityType(EntityType.citedData);
String identifier = InformationExtractor.extractDOI(ref);
if (!"".equals(identifier)) entity.addIdentifier(identifier);
entity.setURL(InformationExtractor.extractURL(ref));
//TODO entity.setCreator(InformationExtractor.extractCreator(ref));
return entity;
}
@Override
public void validate() throws IllegalAlgorithmArgumentException {
if (null == getExecution().getTextualReferences() || getExecution().getTextualReferences().isEmpty()) {
throw new IllegalAlgorithmArgumentException(getClass(), "textualReference", "Required parameter 'textual reference' is missing!");
}
}
}