package eu.dnetlib.iis.wf.metadataextraction; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.avro.util.Utf8; import org.apache.log4j.Logger; import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.xpath.XPath; import eu.dnetlib.iis.common.InfoSpaceConstants; import eu.dnetlib.iis.common.importer.CermineAffiliation; import eu.dnetlib.iis.common.importer.CermineAffiliationBuilder; import eu.dnetlib.iis.metadataextraction.schemas.Affiliation; import eu.dnetlib.iis.metadataextraction.schemas.Author; import eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata; import eu.dnetlib.iis.metadataextraction.schemas.Range; import eu.dnetlib.iis.metadataextraction.schemas.ReferenceBasicMetadata; import eu.dnetlib.iis.metadataextraction.schemas.ReferenceMetadata; import pl.edu.icm.cermine.bibref.model.BibEntry; import pl.edu.icm.cermine.bibref.model.BibEntryFieldType; import pl.edu.icm.cermine.bibref.transformers.NLMToBibEntryConverter; import pl.edu.icm.cermine.exception.TransformationException; /** * NLM {@link Element} converter building {@link DocumentWithBasicMetadata} objects. * @author mhorst * */ public final class NlmToDocumentWithBasicMetadataConverter { public static final String EMPTY_META = "$EMPTY$"; private static final Logger log = Logger.getLogger(NlmToDocumentWithBasicMetadataConverter.class); private static CermineToMetadataAffConverter cermineToMetadataAffConverter = new CermineToMetadataAffConverter(); private static CermineAffiliationBuilder cermineAffiliationBuilder = new CermineAffiliationBuilder(); private static NLMToBibEntryConverter bibEntryConverter = new NLMToBibEntryConverter(); // ----------------------------- CONSTRUCTORS ------------------------------------ /** * Private constructor. */ private NlmToDocumentWithBasicMetadataConverter() {} // ----------------------------- LOGIC ------------------------------------------- /** * Converts given source element to {@link DocumentWithBasicMetadata}. * * @param id document identifier * @param source XML document * @return {@link DocumentWithBasicMetadata}, never returns null. * @throws TransformationException */ public static ExtractedDocumentMetadata convertFull(String id, Document source, String text) throws TransformationException { if (id == null) { throw new RuntimeException("unable to set null id"); } ExtractedDocumentMetadata.Builder builder = ExtractedDocumentMetadata.newBuilder(); builder.setId(id); builder.setText(text != null ? text : ""); if (source == null) { // allowing returning empty extracted metadata return builder.build(); } Element rootElement = source.getRootElement(); convertMeta(id, rootElement, builder); Map<String, Affiliation> affiliationMap = convertAffiliations(rootElement); List<String> affiliationIds = new ArrayList<String>(); if (!affiliationMap.isEmpty()) { List<Affiliation> affiliations = new ArrayList<Affiliation>(); for (Entry<String, Affiliation> entry : affiliationMap.entrySet()) { affiliationIds.add(entry.getKey()); affiliations.add(entry.getValue()); } builder.setAffiliations(affiliations); } List<Author> authors = convertAuthors(rootElement, affiliationIds); if (!authors.isEmpty()) { builder.setAuthors(authors); } List<ReferenceMetadata> refs = convertReferences(rootElement); if (!refs.isEmpty()) { builder.setReferences(refs); } return builder.build(); } /** * Creates empty entry with identifier set and empty record indicator. * Never returns null. * @param id * @return {@link DocumentWithBasicMetadata} */ public static ExtractedDocumentMetadata createEmpty(String id) { if (id==null) { throw new RuntimeException("unable to set null id"); } ExtractedDocumentMetadata.Builder builder = ExtractedDocumentMetadata.newBuilder(); builder.setId(id); builder.setText(""); builder.setPublicationTypeName(EMPTY_META); return builder.build(); } // ----------------------------- PRIVATE ----------------------------------------- /** * Converts XML affiliations into map of avro objects. * Never returns null. * @param source main XML element */ private static Map<String, Affiliation> convertAffiliations(Element source) { try { XPath xPath = XPath.newInstance("/article/front//contrib-group/aff"); @SuppressWarnings("unchecked") List<Element> nodeList = xPath.selectNodes(source); if (nodeList == null || nodeList.isEmpty()) { return Collections.emptyMap(); } Map<String, Affiliation> affiliations = new LinkedHashMap<String, Affiliation>(); for (Element node : nodeList) { CermineAffiliation cAff = cermineAffiliationBuilder.build(node); affiliations.put(node.getAttributeValue("id"), cermineToMetadataAffConverter.convert(cAff)); } return affiliations; } catch (JDOMException ex) { return Collections.emptyMap(); } } /** * Converts XML authors into avro objects augumented with affiliation positions. * Never returns null. * @param source main XML element * @param affiliationsIds ordered affiliations */ private static List<Author> convertAuthors(Element source, List<String> affiliationsIds) { try { XPath xPath = XPath.newInstance("/article/front//contrib-group/contrib[@contrib-type='author']"); @SuppressWarnings("unchecked") List<Element> nodeList = xPath.selectNodes(source); if (nodeList == null || nodeList.isEmpty()) { return Collections.emptyList(); } List<Author> authors = new ArrayList<Author>(); for (Element element : nodeList) { String name = element.getChildTextNormalize("string-name"); List<Integer> affPositions = new ArrayList<Integer>(); List<?> xrefs = element.getChildren("xref"); for (Object xref : xrefs) { Element xrefEl = (Element) xref; String xrefType = xrefEl.getAttributeValue("ref-type"); String xrefId = xrefEl.getAttributeValue("rid"); if ("aff".equals(xrefType) && xrefId != null && affiliationsIds.contains(xrefId)) { affPositions.add(affiliationsIds.indexOf(xrefId)); } } if (affPositions.isEmpty()) { affPositions = null; } Author author = Author.newBuilder() .setAuthorFullName(name) .setAffiliationPositions(affPositions) .build(); authors.add(author); } return authors; } catch (JDOMException ex) { return Collections.emptyList(); } } /** * Converts bibliographic references. * @param source * @return list containing all bibliographic references, never returns null * @throws TransformationException */ private static List<ReferenceMetadata> convertReferences(Element source) throws TransformationException { Element backElement = source.getChild("back"); if (backElement!=null) { Element refListElement = backElement.getChild("ref-list"); if (refListElement!=null) { @SuppressWarnings("unchecked") List<Element> refs = refListElement.getChildren("ref"); int idx = 1; List<ReferenceMetadata> refMetas = new ArrayList<ReferenceMetadata>(); for (Element ref : refs) { ReferenceMetadata refMeta = convertReference(ref, idx); if (refMeta != null) { refMetas.add(refMeta); } idx++; } return refMetas; } } // fallback return Collections.emptyList(); } /** * Converts single reference. * @param ref reference element * @param position reference position in the source list * @return converted {@link ReferenceMetadata} object or null when reference did not contain valid citation * @throws TransformationException */ private static ReferenceMetadata convertReference(Element ref, int position) throws TransformationException { Element mixedCitation = ref.getChild("mixed-citation"); if (mixedCitation != null) { BibEntry bibEntry = bibEntryConverter.convert(mixedCitation); if (bibEntry!=null) { ReferenceMetadata.Builder refMetaBuilder = ReferenceMetadata.newBuilder(); refMetaBuilder.setPosition(position); refMetaBuilder.setText(bibEntry.getText()); refMetaBuilder.setBasicMetadata(convertBibEntry(bibEntry)); return refMetaBuilder.build(); } else { log.warn("got null bib-entry from element " + mixedCitation.getValue()); } } // fallback return null; } /** * Converts {@link BibEntry} to {@link ReferenceBasicMetadata}. * @param entry * @return {@link ReferenceBasicMetadata} */ private static ReferenceBasicMetadata convertBibEntry(BibEntry entry) { if (entry!=null) { ReferenceBasicMetadata.Builder builder = ReferenceBasicMetadata.newBuilder(); List<String> resultValues = entry.getAllFieldValues(BibEntryFieldType.AUTHOR); if (resultValues != null && !resultValues.isEmpty()) { List<CharSequence> authors = new ArrayList<CharSequence>(resultValues.size()); for (CharSequence seq : resultValues) { authors.add(seq); } builder.setAuthors(authors); } String resultValue = entry.getFirstFieldValue(BibEntryFieldType.PAGES); if (resultValue!=null) { Pattern pagesPattern = Pattern.compile("^([0-9]+)--([0-9]+)$"); Matcher m = pagesPattern.matcher(resultValue); if (m.matches()) { builder.setPages(Range.newBuilder().setStart(m.group(1)).setEnd(m.group(2)).build()); } else { pagesPattern = Pattern.compile("^[0-9]+$"); m = pagesPattern.matcher(resultValue); if (m.matches()) { builder.setPages(Range.newBuilder().setStart(m.group()).setEnd(m.group()).build()); } } } resultValue = entry.getFirstFieldValue(BibEntryFieldType.JOURNAL); if (resultValue!=null) { builder.setSource(resultValue); } resultValue = entry.getFirstFieldValue(BibEntryFieldType.TITLE); if (resultValue!=null) { builder.setTitle(resultValue); } resultValue = entry.getFirstFieldValue(BibEntryFieldType.VOLUME); if (resultValue!=null) { builder.setVolume(resultValue); } resultValue = entry.getFirstFieldValue(BibEntryFieldType.YEAR); if (resultValue!=null) { builder.setYear(resultValue); } resultValue = entry.getFirstFieldValue(BibEntryFieldType.EDITION); if (resultValue!=null) { builder.setEdition(resultValue); } resultValue = entry.getFirstFieldValue(BibEntryFieldType.PUBLISHER); if (resultValue!=null) { builder.setPublisher(resultValue); } resultValue = entry.getFirstFieldValue(BibEntryFieldType.LOCATION); if (resultValue!=null) { builder.setLocation(resultValue); } resultValue = entry.getFirstFieldValue(BibEntryFieldType.SERIES); if (resultValue!=null) { builder.setSeries(resultValue); } resultValue = entry.getFirstFieldValue(BibEntryFieldType.NUMBER); if (resultValue!=null) { builder.setIssue(resultValue); } resultValue = entry.getFirstFieldValue(BibEntryFieldType.URL); if (resultValue!=null) { builder.setUrl(resultValue); } return builder.build(); } else { return null; } } /** * Converts metadata. * @param id * @param source * @param builder * @return DocumentBasicMetadata */ private static ExtractedDocumentMetadata.Builder convertMeta(String id, Element source, ExtractedDocumentMetadata.Builder docMetaBuilder) { Element frontElement = source.getChild("front"); if (frontElement == null) { return docMetaBuilder; } Element articleMetaElement = frontElement.getChild("article-meta"); if (articleMetaElement!=null) { convertArticleMeta(id, articleMetaElement, docMetaBuilder); } Element journalMeta = frontElement.getChild("journal-meta"); if (journalMeta!=null) { convertJournalMeta(id, journalMeta, docMetaBuilder); } return docMetaBuilder; } /** * Converts article metadata element. * @param id document identifier * @param articleMeta article metadata element * @param docMetaBuilder document metadata avro record builder */ @SuppressWarnings("unchecked") private static ExtractedDocumentMetadata.Builder convertArticleMeta(String id, Element articleMeta, ExtractedDocumentMetadata.Builder docMetaBuilder) { // /article/front/article-meta/title-group/article-title [multiple elements] Element titleGroup = articleMeta.getChild("title-group"); if (titleGroup != null) { convertTitles(id, titleGroup, docMetaBuilder); } // /article/front/article-meta/abstract/p Element articleAbstract = articleMeta.getChild("abstract"); if (articleAbstract != null) { Element pAbstract = articleAbstract.getChild("p"); if (pAbstract != null) { docMetaBuilder.setAbstract$(pAbstract.getTextNormalize()); } } // /article/front/article-meta/kwd-group/kwd [multiple elements] Element keywordsGroup = articleMeta.getChild("kwd-group"); if (keywordsGroup != null) { convertKeywords(keywordsGroup, docMetaBuilder); } // /article/front/article-meta/article-id List<Element> articleIds = articleMeta.getChildren("article-id"); Map<CharSequence, CharSequence> extIds = new HashMap<CharSequence, CharSequence>(); for (Element articleIdElem : articleIds) { String idType = articleIdElem.getAttributeValue("pub-id-type"); extIds.put(new Utf8(idType!=null?idType:InfoSpaceConstants.EXTERNAL_ID_TYPE_UNKNOWN), new Utf8(articleIdElem.getTextNormalize())); } if (!extIds.isEmpty()) { docMetaBuilder.setExternalIdentifiers(extIds); } // /article/front/article-meta/pub-date/year Element publicationDate = articleMeta.getChild("pub-date"); if (publicationDate != null) { Element publicationDateYear = publicationDate.getChild("year"); if (publicationDateYear != null) { try { docMetaBuilder.setYear(Integer.valueOf(publicationDateYear.getTextNormalize())); } catch (Exception e) { log.error("unable to parse year, unsupported format: " + publicationDateYear.getTextNormalize() + ", document id" + id, e); } } } // /article/front/article-meta/volume Element volume = articleMeta.getChild("volume"); if (volume != null) { docMetaBuilder.setVolume(volume.getTextNormalize()); } // /article/front/article-meta/issue Element issue = articleMeta.getChild("issue"); if (issue != null) { docMetaBuilder.setIssue(issue.getTextNormalize()); } // /article/front/article-meta/fpage // /article/front/article-meta/lpage Range.Builder pagesBuilder = Range.newBuilder(); Element fpage = articleMeta.getChild("fpage"); if (fpage != null) { pagesBuilder.setStart(fpage.getTextNormalize()); } Element lpage = articleMeta.getChild("lpage"); if (lpage != null) { pagesBuilder.setEnd(lpage.getTextNormalize()); } if (pagesBuilder.hasStart() || pagesBuilder.hasEnd()) { docMetaBuilder.setPages(pagesBuilder.build()); } return docMetaBuilder; } /** * Converts XML title group element into avro object representation. * @param id document identifier * @param titleGroup title group XML element * @param docMetaBuilder document metadata avro record builder */ @SuppressWarnings("unchecked") private static ExtractedDocumentMetadata.Builder convertTitles(String id, Element titleGroup, ExtractedDocumentMetadata.Builder docMetaBuilder) { List<Element> titles = titleGroup.getChildren("article-title"); // currenlty taking only first title into account! if (!titles.isEmpty()) { if (titles.size()>1) { log.warn("got multiple titles for document " + id + ", storing first title only"); } for (Element titleElem : titles) { // iterating until finding not null title String title = titleElem.getTextNormalize(); if (title!=null && !title.isEmpty()) { docMetaBuilder.setTitle(title); break; } } } return docMetaBuilder; } /** * Converts XML keyword group element into avro object representation. * @param keywordsGroup keyword group XML element * @param docMetaBuilder document metadata avro record builder */ @SuppressWarnings("unchecked") private static ExtractedDocumentMetadata.Builder convertKeywords(Element keywordsGroup, ExtractedDocumentMetadata.Builder docMetaBuilder) { List<Element> keywords = keywordsGroup.getChildren("kwd"); for (Element keywordElem : keywords) { String keyword = keywordElem.getTextNormalize(); if (keyword!=null && !keyword.isEmpty()) { if (docMetaBuilder.getKeywords()==null) { docMetaBuilder.setKeywords(new ArrayList<CharSequence>()); } docMetaBuilder.getKeywords().add(keyword); } } return docMetaBuilder; } /** * Converts journal metadata. * @param id document identifier * @param journalMeta journal metadata element * @param docMetaBuilder document metadata avro record builder */ @SuppressWarnings("unchecked") private static ExtractedDocumentMetadata.Builder convertJournalMeta(String id, Element journalMeta, ExtractedDocumentMetadata.Builder docMetaBuilder) { // /article/front/journal-meta/journal-title-group/journal-title Element titleGroup = journalMeta.getChild("journal-title-group"); if (titleGroup != null) { List<Element> titles = titleGroup.getChildren("journal-title"); if (!titles.isEmpty()) { docMetaBuilder.setJournal(titles.iterator().next().getTextNormalize()); if (titles.size()>1) { log.warn("got multiple journal titles, retrieving first title only. " + "Document id: " + id); } } } // /article/front/journal-meta/publisher/publisher-name Element publisher = journalMeta.getChild("publisher"); if (publisher != null) { Element publisherName = publisher.getChild("publisher-name"); if (publisherName != null) { String pubNameText = publisherName.getTextNormalize(); if (pubNameText != null && !pubNameText.isEmpty()) { docMetaBuilder.setPublisher(pubNameText); } } } return docMetaBuilder; } }