package org.wikibrain.sr.wikify; import com.typesafe.config.Config; import org.h2.util.StringUtils; import org.wikibrain.conf.Configuration; import org.wikibrain.conf.ConfigurationException; import org.wikibrain.conf.Configurator; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.dao.LocalLinkDao; import org.wikibrain.core.dao.RawPageDao; import org.wikibrain.core.lang.Language; import org.wikibrain.core.model.LocalLink; import org.wikibrain.core.model.RawPage; import java.util.*; /** * A wikifier that just returns existing hyperliks. * * This is a bit tricky, because the locations of the hyperlinks are based on "raw" wikitext, * but the text used by this class is "plaintext" stripped of wikimarkup. * * Most of the logic in this class realigns the wikimarkup locations * with plain text locations. * * @author Shilad Sen */ public class IdentityWikifier implements Wikifier { private final RawPageDao pageDao; private final LocalLinkDao linkDao; private final Language language; public IdentityWikifier(Language language, RawPageDao pageDao, LocalLinkDao linkDao) { this.language = language; this.pageDao = pageDao; this.linkDao = linkDao; } @Override public List<LocalLink> wikify(int wpId) throws DaoException { RawPage rp = pageDao.getById(language, wpId); if (rp == null) { return new ArrayList<LocalLink>(); } return wikify(wpId, rp.getPlainText(false)); } @Override public List<LocalLink> wikify(int wpId, String text) throws DaoException { if (text == null || text.isEmpty()) { return new ArrayList<LocalLink>(); } List<LocalLink> links = new ArrayList<LocalLink>(); for (LocalLink ll : linkDao.getLinks(language, wpId, true)) { if (ll.getLocation() >= 0 && ll.isParseable() && !StringUtils.isNullOrEmpty(ll.getAnchorText())) { links.add(ll); } } Collections.sort(links); return align(links, text); } private LocalLink cloneLinkWithLocation(LocalLink ll, int location) { return new LocalLink( ll.getLanguage(), ll.getAnchorText(), ll.getSourceId(), ll.getDestId(), ll.isOutlink(), location, ll.isParseable(), ll.getLocType() ); } private List<LocalLink> align(List<LocalLink> anchors, String text) { List<LocalLink> alignment = new ArrayList<LocalLink>(); if (anchors.isEmpty()) { return alignment; } BitSet used = new BitSet(text.length()); anchors = new LinkedList<LocalLink>(anchors); // for perfomance int i = 0; Iterator<LocalLink> iter = anchors.iterator(); while (iter.hasNext()) { LocalLink ll = iter.next(); String a = ll.getAnchorText(); int j = text.indexOf(a, i); // if not found, skip to the next anchor but don't advance pointer if (j < 0) { continue; } alignment.add(cloneLinkWithLocation(ll, j)); // Mark bits as set used.set(j, j + a.length()); i = j + a.length(); iter.remove(); } // Stop if we're done (typical case) if (anchors.isEmpty()) { return alignment; } // Look for unused matches for (LocalLink ll : anchors) { String a = ll.getAnchorText(); i = findNextUnused(text, a, i, used); if (i >= 0) { alignment.add(cloneLinkWithLocation(ll, i)); used.set(i, i + a.length()); } } Collections.sort(alignment); return alignment; } private int findNextUnused(String text, String query, int begin, BitSet used) { int i = begin; while (i < text.length()) { i = text.indexOf(query, i); if (i < 0) { return -1; } if (used.get(i, i + query.length()).isEmpty()) { return i; } i++; // skip to next index } return -1; } @Override public List<LocalLink> wikify(String text) throws DaoException { return new ArrayList<LocalLink>(); } public static class Provider extends org.wikibrain.conf.Provider<Wikifier> { public Provider(Configurator configurator, Configuration config) throws ConfigurationException { super(configurator, config); } @Override public Class<Wikifier> getType() { return Wikifier.class; } @Override public String getPath() { return "sr.wikifier"; } @Override public Wikifier get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException { if (runtimeParams == null || !runtimeParams.containsKey("language")) { throw new IllegalArgumentException("Wikifier requires 'language' runtime parameter."); } if (!config.getString("type").equals("identity")) { return null; } Language language = Language.getByLangCode(runtimeParams.get("language")); String linkName = config.getString("localLinkDao"); return new IdentityWikifier(language, getConfigurator().get(RawPageDao.class), getConfigurator().get(LocalLinkDao.class, linkName)); } } }