package eu.dnetlib.iis.wf.collapsers.basic;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import eu.dnetlib.iis.common.citations.schemas.Citation;
import eu.dnetlib.iis.common.citations.schemas.CitationEntry;
/**
* Collapses {@link Citation} records sharing the same position by merging their fields.
* When more than one record defines destinationDocumentId, the one with the highest
* confidenceLevel is kept.
*
* It is expected that sourceDocumentId is the same for all {@link Citation} objects.
*
* @author mhorst
*/
public class GenericCitationCollapser extends AbstractSimpleCollapser<Citation> {

    /**
     * Counts collapsed citations having non-blank raw text.
     */
    private Counter totalCounter;

    /**
     * Counts {@link #collapseNonEmpty(List)} invocations which produced at least one
     * citation with non-blank raw text. NOTE(review): this equals the number of documents
     * only if the method is invoked once per source document - confirm against the caller.
     */
    private Counter docsWithAtLeastOneCitationCounter;

    /**
     * Hadoop counters enum of citation records.
     */
    public static enum CitationTextCounters {
        TOTAL,
        DOCS_WITH_AT_LEAST_ONE_CITATION
    }

    // --------------------- LOGIC -------------------------------

    /**
     * Obtains both counters from the task context and resets them to zero.
     *
     * @param context task attempt context providing the counters
     */
    @Override
    public void setup(TaskAttemptContext context) {
        totalCounter = context.getCounter(CitationTextCounters.TOTAL);
        totalCounter.setValue(0);
        docsWithAtLeastOneCitationCounter = context.getCounter(CitationTextCounters.DOCS_WITH_AT_LEAST_ONE_CITATION);
        docsWithAtLeastOneCitationCounter.setValue(0);
    }

    /**
     * Groups citations by entry position and collapses every group into a single citation.
     * Counters are updated based on the number of collapsed citations with non-blank raw text.
     *
     * @param objects citations to be collapsed, each one is expected to have an entry set
     * @return one collapsed citation per distinct position
     */
    @Override
    protected List<Citation> collapseNonEmpty(List<Citation> objects) {
        Map<Integer, List<Citation>> citationsByPositionMap = new HashMap<Integer, List<Citation>>();
        for (Citation citation : objects) {
            Integer position = citation.getEntry().getPosition();
            List<Citation> samePositionCitations = citationsByPositionMap.get(position);
            if (samePositionCitations == null) {
                samePositionCitations = new ArrayList<Citation>();
                citationsByPositionMap.put(position, samePositionCitations);
            }
            samePositionCitations.add(citation);
        }

        List<Citation> results = new ArrayList<Citation>(citationsByPositionMap.size());
        int citationsWithTextCount = 0;
        for (List<Citation> citationsByPosition : citationsByPositionMap.values()) {
            Citation collapsedCitation = collapseForPosition(citationsByPosition);
            if (hasTextDefined(collapsedCitation)) {
                citationsWithTextCount++;
            }
            results.add(collapsedCitation);
        }

        // counters are touched only when at least one citation with text was found
        if (citationsWithTextCount > 0) {
            totalCounter.increment(citationsWithTextCount);
            docsWithAtLeastOneCitationCounter.increment(1);
        }
        return results;
    }

    // --------------------- PRIVATE -------------------------------

    /**
     * Checks whether non-blank raw text was defined for given citation.
     */
    private boolean hasTextDefined(Citation citation) {
        return StringUtils.isNotBlank(citation.getEntry().getRawText());
    }

    /**
     * Collapses citations for the same sourceDocumentId and position.
     * The first citation is used as the merge target and is modified in place.
     *
     * @param objects non-empty list of citations sharing the same position
     * @return the first citation supplemented with details of all remaining ones, never null
     */
    private Citation collapseForPosition(List<Citation> objects) {
        Citation resultCandidate = objects.get(0);
        // for a single element list the loop body is never entered
        for (int i = 1; i < objects.size(); i++) {
            resultCandidate = merge(resultCandidate, objects.get(i).getEntry());
        }
        return resultCandidate;
    }

    /**
     * Merges existing citation with new citation entry details. The existing citation is modified in place.
     *
     * @param existingCitation citation to be supplemented
     * @param newCitationEntry entry providing new details, ignored when null
     * @return existing citation supplemented with new citation entry details
     */
    private Citation merge(Citation existingCitation, CitationEntry newCitationEntry) {
        if (newCitationEntry == null) {
            return existingCitation;
        }
        CitationEntry existingEntry = existingCitation.getEntry();

        if (newCitationEntry.getDestinationDocumentId() != null && newCitationEntry.getConfidenceLevel() != null) {
            // setting only when not set or when confidence level higher than already stored
            // important assumption is based on schema assurance: confidenceLevel is always set when destinationDocumentId was set
            if (existingEntry.getDestinationDocumentId() == null ||
                    existingEntry.getConfidenceLevel() == null ||
                    existingEntry.getConfidenceLevel() < newCitationEntry.getConfidenceLevel()) {
                existingEntry.setDestinationDocumentId(newCitationEntry.getDestinationDocumentId());
                existingEntry.setConfidenceLevel(newCitationEntry.getConfidenceLevel());
            }
        }

        // according to schema externalDestinationDocumentIds cannot be null
        if (!newCitationEntry.getExternalDestinationDocumentIds().isEmpty()) {
            existingEntry.getExternalDestinationDocumentIds().putAll(
                    newCitationEntry.getExternalDestinationDocumentIds());
        }

        // blank text must never override already stored text, this is the same
        // criterion hasTextDefined relies on when counting citations with text
        if (StringUtils.isNotBlank(newCitationEntry.getRawText())) {
            existingEntry.setRawText(newCitationEntry.getRawText());
        }
        return existingCitation;
    }
}