package de.dfki.nlp.loader;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import de.dfki.nlp.config.AnnotatorConfig;
import de.dfki.nlp.domain.IdList;
import de.dfki.nlp.domain.ParsedInputText;
import de.dfki.nlp.io.RetryHandler;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.Optional;

/**
 * Document fetcher that requests several PMC articles in a single call and
 * extracts id, title and abstract of every {@code /pmc-articleset/article}
 * element found in the XML response.
 */
@Slf4j
@Component
public class PMCDocumentFetcher extends AbstractDocumentFetcher {

    /** Selects every article of the response, relative to the document root. */
    private static final String ARTICLE_XPATH = "/pmc-articleset/article";
    /** Article title, relative to an article element. */
    private static final String TITLE_XPATH = "front/article-meta/title-group/article-title";
    /** Abstract elements that carry no attributes at all, relative to an article element. */
    private static final String ABSTRACT_XPATH = "front/article-meta/abstract[not(@*)]";
    /** The article id whose {@code pub-id-type} is {@code pmc}, relative to an article element. */
    private static final String PMC_ID_XPATH = "front/article-meta/article-id[@pub-id-type='pmc']";
    /** Prefix ignored (case-insensitively) when comparing requested ids with article ids. */
    private static final String PMC_PREFIX = "PMC";

    private final AnnotatorConfig annotatorConfig;

    private final RetryHandler retryHandler;

    // NOTE(review): JAXP factories are not guaranteed to be thread-safe; confirm whether
    // this bean can be invoked concurrently before relying on these shared instances.
    private final XPathFactory xpathFactory = XPathFactory.newInstance();

    // NOTE(review): the factory uses default settings, i.e. DTDs / external entities are not
    // disabled. The response comes from a remote service; consider XXE hardening after
    // verifying that the responses still parse with external DTD loading switched off.
    private final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();

    public PMCDocumentFetcher(AnnotatorConfig annotatorConfig, RetryHandler retryHandler) {
        this.annotatorConfig = annotatorConfig;
        this.retryHandler = retryHandler;
    }

    /**
     * Loads all documents of {@code idList} with one request and converts every
     * article of the response into a {@link ParsedInputText}.
     *
     * <p>XML/XPath failures are logged and not rethrown; articles converted before
     * the failure are still returned.
     *
     * @param idList the ids to fetch; they are sent as one comma separated value
     * @return the parsed articles, empty if the response was {@code null} or empty
     */
    @Override
    List<ParsedInputText> load(IdList idList) {
        List<String> givenIds = idList.getIds();

        // load multiple pmc documents at once
        String listOfIds = Joiner.on(",").join(givenIds);
        String pmc = retryHandler.retryableGet(annotatorConfig.pmc.url, String.class, listOfIds);

        List<ParsedInputText> res = Lists.newArrayList();
        if (StringUtils.isEmpty(pmc)) {
            return res;
        }

        try {
            DocumentBuilder db = dbf.newDocumentBuilder();
            Document xmlDocument = db.parse(new InputSource(new StringReader(pmc)));
            XPath xpath = xpathFactory.newXPath();

            NodeList articles =
                    (NodeList) xpath.evaluate(ARTICLE_XPATH, xmlDocument, XPathConstants.NODESET);
            for (int i = 0; i < articles.getLength(); i++) {
                res.add(toParsedInputText(xpath, articles.item(i), givenIds));
            }
        } catch (ParserConfigurationException | IOException | XPathExpressionException | SAXException e) {
            log.error("Error parsing pmc results", e);
        }

        return res;
    }

    /**
     * Extracts title, abstract and id from one article element.
     *
     * <p>Title and abstract are trimmed and become {@code null} when empty. The id of
     * the result is the matching requested id, or the id found in the article when no
     * requested id matches (an error is logged in that case).
     */
    private static ParsedInputText toParsedInputText(XPath xpath, Node article, List<String> givenIds)
            throws XPathExpressionException {
        String title = StringUtils.trimToNull(xpath.evaluate(TITLE_XPATH, article));
        String abstractT = StringUtils.trimToNull(xpath.evaluate(ABSTRACT_XPATH, article));
        String id = StringUtils.trim(xpath.evaluate(PMC_ID_XPATH, article));

        // match id with incoming
        Optional<String> matchedID = findMatchingId(givenIds, id);
        if (!matchedID.isPresent()) {
            // Log all requested ids: the position of an article in the response is
            // unrelated to the position of an id in the request.
            log.error("Did not find a matching ID {} in {}", id, givenIds);
        }

        return new ParsedInputText(matchedID.orElse(id), title, abstractT, null);
    }

    /**
     * Finds the requested id that belongs to {@code articleId}.
     *
     * <p>An exact match (ignoring surrounding whitespace and a leading {@code PMC}) wins;
     * otherwise the first requested id that contains {@code articleId} is used. A blank
     * {@code articleId} never matches, because every string contains the empty string.
     */
    private static Optional<String> findMatchingId(List<String> givenIds, String articleId) {
        if (StringUtils.isBlank(articleId)) {
            return Optional.empty();
        }

        String wanted = stripPmcPrefix(articleId);
        Optional<String> exact = givenIds.stream()
                .filter(givenId -> StringUtils.equalsIgnoreCase(stripPmcPrefix(givenId), wanted))
                .findFirst();
        if (exact.isPresent()) {
            return exact;
        }

        return givenIds.stream()
                .filter(givenId -> StringUtils.contains(givenId, articleId))
                .findFirst();
    }

    /** Trims {@code id} and removes a leading {@code PMC} (any case); {@code null} stays {@code null}. */
    private static String stripPmcPrefix(String id) {
        return StringUtils.removeStartIgnoreCase(StringUtils.trim(id), PMC_PREFIX);
    }
}