package org.wikibrain.parser.wiki; import de.tudarmstadt.ukp.wikipedia.parser.*; import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser; import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory; import org.wikibrain.core.WikiBrainException; import org.wikibrain.core.lang.Language; import org.wikibrain.core.lang.LanguageInfo; import org.wikibrain.core.lang.LanguageSet; import org.wikibrain.core.model.NameSpace; import org.wikibrain.core.model.Title; import org.wikibrain.core.model.RawPage; import java.util.ArrayList; import java.util.List; import java.util.NoSuchElementException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.regex.Matcher; import java.util.regex.Pattern; public class WikiTextParser { public static final Logger LOG = LoggerFactory.getLogger(WikiTextParser.class); private final MediaWikiParser jwpl; private final SubarticleParser subarticleParser; private final LanguageInfo lang; private final List<ParserVisitor> visitors; public WikiTextParser(LanguageInfo lang, List<ParserVisitor> visitors) { this(lang, null, visitors); } public WikiTextParser(LanguageInfo lang, LanguageSet allowedIllLangs, List<ParserVisitor> visitors) { this.lang = lang; subarticleParser = new SubarticleParser(lang); this.visitors = visitors; MediaWikiParserFactory pf = new MediaWikiParserFactory(); pf.setCalculateSrcSpans(true); pf.setCategoryIdentifers(lang.getCategoryNames()); if (allowedIllLangs != null) { pf.setLanguageIdentifers(allowedIllLangs.getLangCodes()); } jwpl = pf.createParser(); } /** * TODO: change exception to WpParseException * @param xml * @throws WikiBrainException */ public void parse(RawPage xml) throws WikiBrainException { visitBeginPage(xml); if (xml.isRedirect()) { ParsedRedirect pr = new ParsedRedirect(); pr.location = new ParsedLocation(xml, -1, -1, -1); // TODO: calculate redirect text? visitRedirect(pr); } else { try { ParsedPage pp = jwpl.parse(xml.getBody()); if (pp == null) { LOG.debug("invalid page: " + xml.getBody()); } if (xml.getNamespace() == NameSpace.CATEGORY) { parseCategory(xml, pp); } else if (xml.getNamespace() == NameSpace.ARTICLE) { parseArticle(xml, pp); } } catch (NoSuchElementException e) { visitParseError(xml, e); } catch (NullPointerException e) { visitParseError(xml, e); } } visitEndPage(xml); } private void parseArticle(RawPage xml, ParsedPage pp) { // *** LINKS, ANCHOR TEXTS, SECTIONS int secNum = 0; // paragraph numbers before first paragraph are negative // paragraph 0 is the first paragraph int paraNum = -pp.getFirstParagraphNr(); for (Section curSection: pp.getSections()){ try{ ParsedLink.SubarticleType secSubType = subarticleParser.isSeeAlsoHeader(lang, curSection.getTitle()); for (Content curContent : curSection.getContentList()){ // EASY LINKS for (Link curLink : curContent.getLinks()){ if (curLink.getTarget().isEmpty()){ LOG.debug("Found link with empty target: \t" + xml + "\t text=" + curLink.getText()); continue; } Title destTitle = link2Title(curLink); if (destTitle == null || destTitle.getNamespace() != NameSpace.ARTICLE){ continue; } try{ ParsedLink.SubarticleType linkSubType; if (secSubType == null){ // don't look for inlines in "see also" linkSubType = subarticleParser.isInlineSubarticle(curLink.getSrcSpan().getStart(), xml); }else{ linkSubType = secSubType; // captures see also } ParsedLocation location = new ParsedLocation(xml, secNum, paraNum, curLink.getSrcSpan().getStart()); visitLink(location, destTitle, curLink.getText(), linkSubType); } catch (WikiBrainException e) { LOG.warn(String.format("Could not process link\t%s\t%s", xml, curLink.toString()), e); } } //TEMPLATES for (Template t : curContent.getTemplates()){ boolean errorWithSrcLocation = t.getSrcSpan().getEnd() < 0; // this checks for what seems to be when parsing fails in JWPL String templateTextOrig; if (!errorWithSrcLocation){ templateTextOrig = xml.getBody().substring(t.getSrcSpan().getStart(), t.getSrcSpan().getEnd()); }else{ // this makes up for errors in JWPL (or bad script, but it mostly looks like erros) int estimatedLength = t.getPos().getEnd() - t.getPos().getStart(); templateTextOrig = xml.getBody().substring(t.getSrcSpan().getStart(), t.getSrcSpan().getStart() + estimatedLength + 1); } String templateText; if (templateTextOrig.length() >= 5){ templateText = templateTextOrig.substring(2, templateTextOrig.length()-2); }else{ continue; // blank template } String templateName = t.getName(); // SUBARTICLE INFO STUFF templateName = new Title(templateName, false, lang).toString(); // this appears to be necessary due to JWPL's handling of template names ParsedLink.SubarticleType tempSubType; tempSubType = subarticleParser.isTemplateSubarticle(templateName, templateText); if (tempSubType == null){ try{ templateText = templateText.replaceAll("\\{\\{", ""); // <-- these are all special cases in which JWPL fails templateText = templateText.replaceAll("\\}\\}", ""); templateText = templateText.replaceAll("<!--", ""); templateText = templateText.replaceAll("\\[\\[\\]\\]", ""); ParsedPage parsedTemplate = jwpl.parse(templateText); for (Link templateLink : parsedTemplate.getLinks()){ Title destTitle = link2Title(templateLink); if (destTitle == null) { continue; } NameSpace type = destTitle.getNamespace(); if (type == NameSpace.ARTICLE){ ParsedLocation location = new ParsedLocation(xml, secNum, paraNum, t.getSrcSpan().getStart()); visitLink(location, destTitle, templateLink.getText(), tempSubType); } else if (type == NameSpace.CATEGORY){ ParsedCategory pc = new ParsedCategory(); pc.location = new ParsedLocation(xml, secNum, paraNum, t.getSrcSpan().getStart()); pc.category = destTitle; visitCategory(pc); } } }catch(IndexOutOfBoundsException e){ LOG.error("Parsing error while doing templates -> ParsedPages:\t" + xml + "\t" + templateText); } }else{ List<String> dests = subarticleParser.getContentsOfTemplatePipe(templateText); for (String dest : dests){ dest = SubarticleParser.removeTemplateAnchor(dest); Title destTitle = new Title(dest, lang); try { ParsedLocation location = new ParsedLocation(xml, secNum, paraNum, t.getSrcSpan().getStart()); visitLink(location, destTitle, dest, tempSubType); } catch (WikiBrainException e) { LOG.error(String.format("Could not process template-based subarticle link: \t%s\t%s", xml, t.toString()), e); } } } } if (curContent instanceof Paragraph) { paraNum++; } } } catch(WikiBrainException e){ LOG.error(String.format("Could not store whole section in %s", xml), e); } secNum++; } // *** ILLS parseIlls(xml, pp); // *** CATEGORY MEMBERSHIPS for (Link cat : pp.getCategories()){ String linkText = cat.getText(); if (linkText.contains(Pattern.quote("|"))){ continue; } Title destTitle = new Title(cat.getTarget(), false, lang); // TODO: ensure destTitle is a category ParsedCategory pc = new ParsedCategory(); pc.location = new ParsedLocation(xml, -1, -1, cat.getSrcSpan().getStart()); pc.category = destTitle; visitCategory(pc); } } private static Pattern illPattern = Pattern.compile("(.+?)\\:\\s*(.+)"); private void parseIlls(RawPage xml, ParsedPage pp) { if (pp.getLanguagesElement() != null){ for (Link ill : pp.getLanguages()){ try{ Matcher m = illPattern.matcher(ill.getTarget()); if (m.find()){ String langCode = m.group(1); String target = m.group(2); Language l = Language.getByLangCode(langCode); if (l == null) { LOG.warn("unkonwn lang code:\t" + langCode); } else if (l != lang.getLanguage()) { ParsedIll pill = new ParsedIll(); pill.location = new ParsedLocation(xml, -1, -1, ill.getSrcSpan().getStart()); pill.title = new Title(target, false, LanguageInfo.getByLanguage(l)); visitIll(pill); } }else{ LOG.debug("Invalid ILL:\t" + xml + "\t" + ill.getTarget()); } } catch (Exception e) { LOG.warn(String.format("Error while parsing/storing ILL\t%s\t%s\t%s", xml, ill.toString().replaceAll("\n", ","), e.getMessage())); } } } // else{ // LOG.info("No ILLs found for\t" + xml); // } } private void parseCategory(RawPage xml, ParsedPage pp){ // handle categories for (Link cat : pp.getCategories()){ Title destTitle = new Title(cat.getTarget(), lang); // TODO: ensure title is a category ParsedCategory pc = new ParsedCategory(); pc.location = new ParsedLocation(xml, -1, -1, cat.getSrcSpan().getStart()); pc.category = destTitle; visitCategory(pc); } // handle ILLs parseIlls(xml, pp); } private void visitBeginPage(RawPage xml) { for (ParserVisitor visitor : visitors) { try { visitor.beginPage(xml); } catch (WikiBrainException e) { LOG.warn("beginPage failed:", e); } } } private void visitEndPage(RawPage xml) { for (ParserVisitor visitor : visitors) { try { visitor.endPage(xml); } catch (WikiBrainException e) { LOG.warn("beginPage failed:", e); } } } private void visitRedirect(ParsedRedirect redirect) { for (ParserVisitor visitor : visitors) { try { visitor.redirect(redirect); } catch (WikiBrainException e) { LOG.warn("beginPage failed:", e); } } } private void visitParseError(RawPage rp, Exception e) { for (ParserVisitor visitor : visitors) { visitor.parseError(rp, e); } } private void visitIll(ParsedIll ill) { for (ParserVisitor visitor : visitors) { try { visitor.ill(ill); } catch (WikiBrainException e) { LOG.warn("beginPage failed:", e); } } } private void visitCategory(ParsedCategory cat) { for (ParserVisitor visitor : visitors) { try { visitor.category(cat); } catch (WikiBrainException e) { // LOG.log(Level.WARNING, "beginPage failed:", e); } } } private void visitLink(ParsedLocation location, Title dest, String linkText, ParsedLink.SubarticleType subType) throws WikiBrainException{ // don't want to consider within-page links Title src = location.getXml().getTitle(); if (src.toString().startsWith("#") || src.equals(dest)) { return; } ParsedLink pl = new ParsedLink(); pl.location = location; pl.target = dest; pl.text = linkText; pl.subarticleType = subType; for (ParserVisitor visitor : visitors) { try { visitor.link(pl); } catch (WikiBrainException e) { LOG.warn("beginPage failed:", e); } } } private NameSpace getLinkType(Link link){ Title t = link2Title(link); return t == null ? null : t.getNamespace(); } private Title link2Title(Link link) { if (link.getType().equals(Link.type.INTERNAL) || link.getType().equals(Link.type.UNKNOWN)) { return new Title(link.getTarget(), lang); } else { return null; } } static public List<String> getLangCodes(List<LanguageInfo> langs) { List<String> langCodes = new ArrayList<String>(); for (LanguageInfo l : langs) { langCodes.add(l.getLanguage().getLangCode()); } return langCodes; } }