package ca.uhn.fhir.jpa.term;
import static org.apache.commons.lang3.StringUtils.isNotBlank;
/*
* #%L
* HAPI FHIR JPA Server
* %%
* Copyright (C) 2014 - 2017 University Health Network
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.csv.QuoteMode;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.springframework.beans.factory.annotation.Autowired;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import ca.uhn.fhir.jpa.entity.TermCodeSystemVersion;
import ca.uhn.fhir.jpa.entity.TermConcept;
import ca.uhn.fhir.jpa.entity.TermConceptParentChildLink;
import ca.uhn.fhir.jpa.entity.TermConceptParentChildLink.RelationshipTypeEnum;
import ca.uhn.fhir.jpa.util.Counter;
import ca.uhn.fhir.rest.method.RequestDetails;
import ca.uhn.fhir.rest.server.exceptions.InternalErrorException;
import ca.uhn.fhir.rest.server.exceptions.InvalidRequestException;
public class TerminologyLoaderSvc implements IHapiTerminologyLoaderSvc {
private static final int LOG_INCREMENT = 100000;
public static final String LOINC_FILE = "loinc.csv";
public static final String LOINC_HIERARCHY_FILE = "MULTI-AXIAL_HIERARCHY.CSV";
private static final org.slf4j.Logger ourLog = org.slf4j.LoggerFactory.getLogger(TerminologyLoaderSvc.class);
public static final String SCT_FILE_CONCEPT = "Terminology/sct2_Concept_Full_";
public static final String SCT_FILE_DESCRIPTION = "Terminology/sct2_Description_Full-en";
public static final String SCT_FILE_RELATIONSHIP = "Terminology/sct2_Relationship_Full";
@Autowired
private IHapiTerminologySvc myTermSvc;
private void dropCircularRefs(TermConcept theConcept, ArrayList<String> theChain, Map<String, TermConcept> theCode2concept, Counter theCircularCounter) {
theChain.add(theConcept.getCode());
for (Iterator<TermConceptParentChildLink> childIter = theConcept.getChildren().iterator(); childIter.hasNext();) {
TermConceptParentChildLink next = childIter.next();
TermConcept nextChild = next.getChild();
if (theChain.contains(nextChild.getCode())) {
StringBuilder b = new StringBuilder();
b.append("Removing circular reference code ");
b.append(nextChild.getCode());
b.append(" from parent ");
b.append(next.getParent().getCode());
b.append(". Chain was: ");
for (String nextInChain : theChain) {
TermConcept nextCode = theCode2concept.get(nextInChain);
b.append(nextCode.getCode());
b.append('[');
b.append(StringUtils.substring(nextCode.getDisplay(), 0, 20).replace("[", "").replace("]", "").trim());
b.append("] ");
}
ourLog.info(b.toString(), theConcept.getCode());
childIter.remove();
nextChild.getParents().remove(next);
} else {
dropCircularRefs(nextChild, theChain, theCode2concept, theCircularCounter);
}
}
theChain.remove(theChain.size() - 1);
}
private void extractFiles(List<byte[]> theZipBytes, List<String> theExpectedFilenameFragments) {
Set<String> foundFragments = new HashSet<String>();
for (byte[] nextZipBytes : theZipBytes) {
ZipInputStream zis = new ZipInputStream(new BufferedInputStream(new ByteArrayInputStream(nextZipBytes)));
try {
for (ZipEntry nextEntry; (nextEntry = zis.getNextEntry()) != null;) {
for (String next : theExpectedFilenameFragments) {
if (nextEntry.getName().contains(next)) {
foundFragments.add(next);
}
}
}
} catch (IOException e) {
throw new InternalErrorException(e);
} finally {
IOUtils.closeQuietly(zis);
}
}
for (String next : theExpectedFilenameFragments) {
if (!foundFragments.contains(next)) {
throw new InvalidRequestException("Invalid input zip file, expected zip to contain the following name fragments: " + theExpectedFilenameFragments + " but found: " + foundFragments);
}
}
}
public String firstNonBlank(String... theStrings) {
String retVal = "";
for (String nextString : theStrings) {
if (isNotBlank(nextString)) {
retVal = nextString;
break;
}
}
return retVal;
}
private TermConcept getOrCreateConcept(TermCodeSystemVersion codeSystemVersion, Map<String, TermConcept> id2concept, String id) {
TermConcept concept = id2concept.get(id);
if (concept == null) {
concept = new TermConcept();
id2concept.put(id, concept);
concept.setCodeSystem(codeSystemVersion);
}
return concept;
}
private void iterateOverZipFile(List<byte[]> theZipBytes, String fileNamePart, IRecordHandler handler, char theDelimiter, QuoteMode theQuoteMode) {
boolean found = false;
for (byte[] nextZipBytes : theZipBytes) {
ZipInputStream zis = new ZipInputStream(new BufferedInputStream(new ByteArrayInputStream(nextZipBytes)));
try {
for (ZipEntry nextEntry; (nextEntry = zis.getNextEntry()) != null;) {
ZippedFileInputStream inputStream = new ZippedFileInputStream(zis);
String nextFilename = nextEntry.getName();
if (nextFilename.contains(fileNamePart)) {
ourLog.info("Processing file {}", nextFilename);
found = true;
Reader reader = null;
CSVParser parsed = null;
try {
reader = new InputStreamReader(new BOMInputStream(zis), Charsets.UTF_8);
CSVFormat format = CSVFormat.newFormat(theDelimiter).withFirstRecordAsHeader();
if (theQuoteMode != null) {
format = format.withQuote('"').withQuoteMode(theQuoteMode);
}
parsed = new CSVParser(reader, format);
Iterator<CSVRecord> iter = parsed.iterator();
ourLog.debug("Header map: {}", parsed.getHeaderMap());
int count = 0;
int logIncrement = LOG_INCREMENT;
int nextLoggedCount = 0;
while (iter.hasNext()) {
CSVRecord nextRecord = iter.next();
handler.accept(nextRecord);
count++;
if (count >= nextLoggedCount) {
ourLog.info(" * Processed {} records in {}", count, nextFilename);
nextLoggedCount += logIncrement;
}
}
} catch (IOException e) {
throw new InternalErrorException(e);
}
}
}
} catch (IOException e) {
throw new InternalErrorException(e);
} finally {
IOUtils.closeQuietly(zis);
}
}
// This should always be true, but just in case we've introduced a bug...
Validate.isTrue(found);
}
@Override
public UploadStatistics loadLoinc(List<byte[]> theZipBytes, RequestDetails theRequestDetails) {
List<String> expectedFilenameFragments = Arrays.asList(LOINC_FILE, LOINC_HIERARCHY_FILE);
extractFiles(theZipBytes, expectedFilenameFragments);
ourLog.info("Beginning LOINC processing");
return processLoincFiles(theZipBytes, theRequestDetails);
}
@Override
public UploadStatistics loadSnomedCt(List<byte[]> theZipBytes, RequestDetails theRequestDetails) {
List<String> expectedFilenameFragments = Arrays.asList(SCT_FILE_DESCRIPTION, SCT_FILE_RELATIONSHIP, SCT_FILE_CONCEPT);
extractFiles(theZipBytes, expectedFilenameFragments);
ourLog.info("Beginning SNOMED CT processing");
return processSnomedCtFiles(theZipBytes, theRequestDetails);
}
UploadStatistics processLoincFiles(List<byte[]> theZipBytes, RequestDetails theRequestDetails) {
final TermCodeSystemVersion codeSystemVersion = new TermCodeSystemVersion();
final Map<String, TermConcept> code2concept = new HashMap<String, TermConcept>();
IRecordHandler handler = new LoincHandler(codeSystemVersion, code2concept);
iterateOverZipFile(theZipBytes, LOINC_FILE, handler, ',', QuoteMode.NON_NUMERIC);
handler = new LoincHierarchyHandler(codeSystemVersion, code2concept);
iterateOverZipFile(theZipBytes, LOINC_HIERARCHY_FILE, handler, ',', QuoteMode.NON_NUMERIC);
theZipBytes.clear();
for (Iterator<Entry<String, TermConcept>> iter = code2concept.entrySet().iterator(); iter.hasNext();) {
Entry<String, TermConcept> next = iter.next();
// if (isBlank(next.getKey())) {
// ourLog.info("Removing concept with blankc code[{}] and display [{}", next.getValue().getCode(), next.getValue().getDisplay());
// iter.remove();
// continue;
// }
TermConcept nextConcept = next.getValue();
if (nextConcept.getParents().isEmpty()) {
codeSystemVersion.getConcepts().add(nextConcept);
}
}
ourLog.info("Have {} total concepts, {} root concepts", code2concept.size(), codeSystemVersion.getConcepts().size());
String url = LOINC_URL;
storeCodeSystem(theRequestDetails, codeSystemVersion, url);
return new UploadStatistics(code2concept.size());
}
private void storeCodeSystem(RequestDetails theRequestDetails, final TermCodeSystemVersion codeSystemVersion, String url) {
myTermSvc.setProcessDeferred(false);
myTermSvc.storeNewCodeSystemVersion(url, codeSystemVersion, theRequestDetails);
myTermSvc.setProcessDeferred(true);
}
UploadStatistics processSnomedCtFiles(List<byte[]> theZipBytes, RequestDetails theRequestDetails) {
final TermCodeSystemVersion codeSystemVersion = new TermCodeSystemVersion();
final Map<String, TermConcept> id2concept = new HashMap<String, TermConcept>();
final Map<String, TermConcept> code2concept = new HashMap<String, TermConcept>();
final Set<String> validConceptIds = new HashSet<String>();
IRecordHandler handler = new SctHandlerConcept(validConceptIds);
iterateOverZipFile(theZipBytes, SCT_FILE_CONCEPT, handler, '\t', null);
ourLog.info("Have {} valid concept IDs", validConceptIds.size());
handler = new SctHandlerDescription(validConceptIds, code2concept, id2concept, codeSystemVersion);
iterateOverZipFile(theZipBytes, SCT_FILE_DESCRIPTION, handler, '\t', null);
ourLog.info("Got {} concepts, cloning map", code2concept.size());
final HashMap<String, TermConcept> rootConcepts = new HashMap<String, TermConcept>(code2concept);
handler = new SctHandlerRelationship(codeSystemVersion, rootConcepts, code2concept);
iterateOverZipFile(theZipBytes, SCT_FILE_RELATIONSHIP, handler, '\t', null);
theZipBytes.clear();
ourLog.info("Looking for root codes");
for (Iterator<Entry<String, TermConcept>> iter = rootConcepts.entrySet().iterator(); iter.hasNext(); ) {
if (iter.next().getValue().getParents().isEmpty() == false) {
iter.remove();
}
}
ourLog.info("Done loading SNOMED CT files - {} root codes, {} total codes", rootConcepts.size(), code2concept.size());
Counter circularCounter = new Counter();
for (TermConcept next : rootConcepts.values()) {
long count = circularCounter.getThenAdd();
float pct = ((float)count / rootConcepts.size()) * 100.0f;
ourLog.info(" * Scanning for circular refs - have scanned {} / {} codes ({}%)", count, rootConcepts.size(), pct);
dropCircularRefs(next, new ArrayList<String>(), code2concept, circularCounter);
}
codeSystemVersion.getConcepts().addAll(rootConcepts.values());
String url = SCT_URL;
storeCodeSystem(theRequestDetails, codeSystemVersion, url);
return new UploadStatistics(code2concept.size());
}
@VisibleForTesting
void setTermSvcForUnitTests(IHapiTerminologySvc theTermSvc) {
myTermSvc = theTermSvc;
}
private interface IRecordHandler {
void accept(CSVRecord theRecord);
}
public class LoincHandler implements IRecordHandler {
private final Map<String, TermConcept> myCode2Concept;
private final TermCodeSystemVersion myCodeSystemVersion;
public LoincHandler(TermCodeSystemVersion theCodeSystemVersion, Map<String, TermConcept> theCode2concept) {
myCodeSystemVersion = theCodeSystemVersion;
myCode2Concept = theCode2concept;
}
@Override
public void accept(CSVRecord theRecord) {
String code = theRecord.get("LOINC_NUM");
if (isNotBlank(code)) {
String longCommonName = theRecord.get("LONG_COMMON_NAME");
String shortName = theRecord.get("SHORTNAME");
String consumerName = theRecord.get("CONSUMER_NAME");
String display = firstNonBlank(longCommonName, shortName, consumerName);
TermConcept concept = new TermConcept(myCodeSystemVersion, code);
concept.setDisplay(display);
Validate.isTrue(!myCode2Concept.containsKey(code));
myCode2Concept.put(code, concept);
}
}
}
public class LoincHierarchyHandler implements IRecordHandler {
private Map<String, TermConcept> myCode2Concept;
private TermCodeSystemVersion myCodeSystemVersion;
public LoincHierarchyHandler(TermCodeSystemVersion theCodeSystemVersion, Map<String, TermConcept> theCode2concept) {
myCodeSystemVersion = theCodeSystemVersion;
myCode2Concept = theCode2concept;
}
@Override
public void accept(CSVRecord theRecord) {
String parentCode = theRecord.get("IMMEDIATE_PARENT");
String childCode = theRecord.get("CODE");
String childCodeText = theRecord.get("CODE_TEXT");
if (isNotBlank(parentCode) && isNotBlank(childCode)) {
TermConcept parent = getOrCreate(parentCode, "(unknown)");
TermConcept child = getOrCreate(childCode, childCodeText);
parent.addChild(child, RelationshipTypeEnum.ISA);
}
}
private TermConcept getOrCreate(String theCode, String theDisplay) {
TermConcept retVal = myCode2Concept.get(theCode);
if (retVal == null) {
retVal = new TermConcept();
retVal.setCodeSystem(myCodeSystemVersion);
retVal.setCode(theCode);
retVal.setDisplay(theDisplay);
myCode2Concept.put(theCode, retVal);
}
return retVal;
}
}
private final class SctHandlerConcept implements IRecordHandler {
private Set<String> myValidConceptIds;
private Map<String, String> myConceptIdToMostRecentDate = new HashMap<String, String>();
public SctHandlerConcept(Set<String> theValidConceptIds) {
myValidConceptIds = theValidConceptIds;
}
@Override
public void accept(CSVRecord theRecord) {
String id = theRecord.get("id");
String date = theRecord.get("effectiveTime");
if (!myConceptIdToMostRecentDate.containsKey(id) || myConceptIdToMostRecentDate.get(id).compareTo(date) < 0) {
boolean active = "1".equals(theRecord.get("active"));
if (active) {
myValidConceptIds.add(id);
} else {
myValidConceptIds.remove(id);
}
myConceptIdToMostRecentDate.put(id, date);
}
}
}
private final class SctHandlerDescription implements IRecordHandler {
private final Map<String, TermConcept> myCode2concept;
private final TermCodeSystemVersion myCodeSystemVersion;
private final Map<String, TermConcept> myId2concept;
private Set<String> myValidConceptIds;
private SctHandlerDescription(Set<String> theValidConceptIds, Map<String, TermConcept> theCode2concept, Map<String, TermConcept> theId2concept, TermCodeSystemVersion theCodeSystemVersion) {
myCode2concept = theCode2concept;
myId2concept = theId2concept;
myCodeSystemVersion = theCodeSystemVersion;
myValidConceptIds = theValidConceptIds;
}
@Override
public void accept(CSVRecord theRecord) {
String id = theRecord.get("id");
boolean active = "1".equals(theRecord.get("active"));
if (!active) {
return;
}
String conceptId = theRecord.get("conceptId");
if (!myValidConceptIds.contains(conceptId)) {
return;
}
String term = theRecord.get("term");
TermConcept concept = getOrCreateConcept(myCodeSystemVersion, myId2concept, id);
concept.setCode(conceptId);
concept.setDisplay(term);
myCode2concept.put(conceptId, concept);
}
}
private final class SctHandlerRelationship implements IRecordHandler {
private final Map<String, TermConcept> myCode2concept;
private final TermCodeSystemVersion myCodeSystemVersion;
private final Map<String, TermConcept> myRootConcepts;
private SctHandlerRelationship(TermCodeSystemVersion theCodeSystemVersion, HashMap<String, TermConcept> theRootConcepts, Map<String, TermConcept> theCode2concept) {
myCodeSystemVersion = theCodeSystemVersion;
myRootConcepts = theRootConcepts;
myCode2concept = theCode2concept;
}
@Override
public void accept(CSVRecord theRecord) {
Set<String> ignoredTypes = new HashSet<String>();
ignoredTypes.add("Method (attribute)");
ignoredTypes.add("Direct device (attribute)");
ignoredTypes.add("Has focus (attribute)");
ignoredTypes.add("Access instrument");
ignoredTypes.add("Procedure site (attribute)");
ignoredTypes.add("Causative agent (attribute)");
ignoredTypes.add("Course (attribute)");
ignoredTypes.add("Finding site (attribute)");
ignoredTypes.add("Has definitional manifestation (attribute)");
String sourceId = theRecord.get("sourceId");
String destinationId = theRecord.get("destinationId");
String typeId = theRecord.get("typeId");
boolean active = "1".equals(theRecord.get("active"));
TermConcept typeConcept = myCode2concept.get(typeId);
TermConcept sourceConcept = myCode2concept.get(sourceId);
TermConcept targetConcept = myCode2concept.get(destinationId);
if (sourceConcept != null && targetConcept != null && typeConcept != null) {
if (typeConcept.getDisplay().equals("Is a (attribute)")) {
RelationshipTypeEnum relationshipType = RelationshipTypeEnum.ISA;
if (!sourceId.equals(destinationId)) {
if (active) {
TermConceptParentChildLink link = new TermConceptParentChildLink();
link.setChild(sourceConcept);
link.setParent(targetConcept);
link.setRelationshipType(relationshipType);
link.setCodeSystem(myCodeSystemVersion);
targetConcept.addChild(sourceConcept, relationshipType);
} else {
// not active, so we're removing any existing links
for (TermConceptParentChildLink next : new ArrayList<TermConceptParentChildLink>(targetConcept.getChildren())) {
if (next.getRelationshipType() == relationshipType) {
if (next.getChild().getCode().equals(sourceConcept.getCode())) {
next.getParent().getChildren().remove(next);
next.getChild().getParents().remove(next);
}
}
}
}
}
} else if (ignoredTypes.contains(typeConcept.getDisplay())) {
// ignore
} else {
// ourLog.warn("Unknown relationship type: {}/{}", typeId, typeConcept.getDisplay());
}
}
}
}
private static class ZippedFileInputStream extends InputStream {
private ZipInputStream is;
public ZippedFileInputStream(ZipInputStream is) {
this.is = is;
}
@Override
public void close() throws IOException {
is.closeEntry();
}
@Override
public int read() throws IOException {
return is.read();
}
}
}