package org.wikibrain.sr.wikify; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.wikibrain.conf.ConfigurationException; import org.wikibrain.conf.Configurator; import org.wikibrain.core.cmd.Env; import org.wikibrain.core.cmd.EnvBuilder; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.dao.LocalLinkDao; import org.wikibrain.core.dao.LocalPageDao; import org.wikibrain.core.dao.RawPageDao; import org.wikibrain.core.lang.Language; import org.wikibrain.core.lang.StringNormalizer; import org.wikibrain.core.model.LocalLink; import org.wikibrain.core.model.LocalPage; import org.wikibrain.core.model.NameSpace; import org.wikibrain.core.nlp.StringTokenizer; import org.wikibrain.phrases.*; import org.wikibrain.sr.SRMetric; import java.io.File; import java.io.IOException; import java.util.Iterator; /** * @author Shilad Sen */ public class Tester { public static void main(String args[]) throws ConfigurationException, DaoException, IOException { Env env = EnvBuilder.envFromArgs(args); Configurator c = env.getConfigurator(); RawPageDao rpd = c.get(RawPageDao.class); LocalLinkDao linkDao = c.get(LocalLinkDao.class); LocalPageDao pageDao = c.get(LocalPageDao.class); AnchorTextPhraseAnalyzer phraseAnalyzer = (AnchorTextPhraseAnalyzer) c.get(PhraseAnalyzer.class, "anchortext"); LinkProbabilityDao linkProbabilityDao = env.getComponent(LinkProbabilityDao.class, Language.SIMPLE); if (!linkProbabilityDao.isBuilt()) { linkProbabilityDao.build(); } linkProbabilityDao.useCache(true); // System.out.println(linkProbabilityDao.getLinkProbability(Language.SIMPLE, "United States")); // System.out.println(linkProbabilityDao.getLinkProbability(Language.SIMPLE, "United_States")); // System.exit(0); // System.out.println("text is " + rpd.getById(Language.SIMPLE, 116466).getPlainText()); // LinkProbabilityDao lpd = c.get(LinkProbabilityDao.class); // lpd.build(); SRMetric sr = c.get(SRMetric.class, "word2vec", "language", "simple"); Wikifier identity = c.get(Wikifier.class, "word2vec", "language", "simple"); Wikifier websail = new WebSailWikifier(identity, rpd, linkDao, linkProbabilityDao, phraseAnalyzer.getDao(), sr); WikiTextCorpusCreator creator = new WikiTextCorpusCreator(Language.SIMPLE, websail, rpd, pageDao, linkProbabilityDao); creator.write(new File("foo")); // LocalPage obama = pageDao.getByTitle(Language.SIMPLE, NameSpace.ARTICLE, "Barack Obama"); // for (LocalLink ll :websail.wikify(obama.getLocalId())) { // System.out.println(ll.getAnchorText() + ": " + pageDao.getById(Language.SIMPLE, ll.getLocalId()) + "\n"); // } } private static String cleanString(String s) { StringTokenizer t = new StringTokenizer(); return StringUtils.join(t.getWords(Language.SIMPLE, s), " "); } }