package org.wikibrain.sr.esa;
import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet;
import org.apache.commons.cli.*;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.DaoFilter;
import org.wikibrain.core.dao.LocalLinkDao;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.model.LocalPage;
import org.wikibrain.core.model.NameSpace;
import org.wikibrain.sr.SRResultList;
import org.wikibrain.sr.utils.Leaderboard;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Pattern;
/**
 * Selects the article ids that form the "concept space" for one language.
 *
 * <p>Candidate pages are the non-redirect, non-disambiguation articles of the
 * language whose titles are neither blacklisted (dates, bare numbers) nor
 * list-like. Candidates are ranked by their number of inbound links; the very
 * top of that ranking is discarded as "stop" concepts and the remainder (up
 * to {@link #getMaxConcepts()} ids) is returned.</p>
 *
 * @author Shilad Sen
 */
public class SRConceptSpaceGenerator {
    private static final Logger LOG = LoggerFactory.getLogger(SRConceptSpaceGenerator.class);

    /**
     * Exponent used to approximate a cube root. Deliberately left at the
     * historical value (rather than switching to {@code Math.cbrt}) so the
     * computed sizes stay exactly what they have always been.
     */
    private static final double CUBE_ROOT_EXPONENT = 0.33333;

    /** English-only title patterns for pages that should never be concepts. */
    private static final Pattern[] TITLE_BLACKLIST = new Pattern[] {
            // articles starting with a year
            Pattern.compile("^[0-9]{4} .*"),
            // articles starting with a month
            Pattern.compile("^(January|February|March|April|May|June|July|August|September|October|November|December).*"),
            // articles that are just digits
            Pattern.compile("^[0-9]+$"),
    };

    private final Language lang;
    private final LocalLinkDao linkDao;
    private final LocalPageDao pageDao;
    private final int numArticles;

    /** Explicit cap on the number of concepts; negative means "use the heuristic". */
    private int maxConcepts = -1;

    /**
     * Creates a generator for a single language and counts the pages matching
     * {@link #getFilter()} so that the size heuristics can be computed.
     *
     * @param lang    language whose concept space is generated
     * @param linkDao used to count inbound links of each candidate page
     * @param pageDao used to count and iterate over candidate pages
     * @throws DaoException if counting the pages fails
     */
    public SRConceptSpaceGenerator(Language lang, LocalLinkDao linkDao, LocalPageDao pageDao) throws DaoException {
        this.lang = lang;
        this.linkDao = linkDao;
        this.pageDao = pageDao;
        this.numArticles = pageDao.getCount(getFilter());
    }

    /**
     * Builds the page filter describing candidate concepts.
     *
     * @return a new filter restricted to this generator's language, the
     *         article namespace, and pages that are neither redirects nor
     *         disambiguation pages
     */
    public DaoFilter getFilter() {
        return new DaoFilter()
                .setNameSpaces(NameSpace.ARTICLE)
                .setLanguages(lang)
                .setRedirect(false)
                .setDisambig(false);
    }

    /**
     * Computes the set of local page ids that make up the concept space.
     *
     * <p>Every candidate page is tallied with its inbound link count into a
     * leaderboard sized {@code getMaxConcepts() + getNumStopConcepts()}. The
     * first {@code getNumStopConcepts()} entries of the leaderboard's result
     * are dropped and the rest are returned.</p>
     *
     * @return ids of the selected concept pages
     * @throws DaoException if pages or link counts cannot be read
     */
    public TIntSet getConcepts() throws DaoException {
        int numStopArticles = getNumStopConcepts();
        Leaderboard mostLinked = new Leaderboard(getMaxConcepts() + numStopArticles);
        for (LocalPage lp : (Iterable<LocalPage>) pageDao.get(getFilter())) {
            if (lp == null) {
                continue;
            }
            // the first few checks should be unnecessary, but let's be safe
            if ((lp.getNameSpace() != NameSpace.ARTICLE)
                    || lp.isDisambig()
                    || lp.isRedirect()
                    || isBlacklisted(lp)
                    || isList(lp)) {
                continue;
            }
            DaoFilter query = new DaoFilter().setLanguages(lang).setDestIds(lp.getLocalId());
            int n = linkDao.getCount(query);
            mostLinked.tallyScore(lp.getLocalId(), n);
        }

        // NOTE(review): presumably getTop() is ordered by descending link
        // count, so the leading entries are the most linked-to pages.
        SRResultList sorted = mostLinked.getTop();
        TIntSet result = new TIntHashSet();
        // Skip the leading "stop" pages and keep everything after them.
        for (int i = numStopArticles; i < sorted.numDocs(); i++) {
            result.add(sorted.getId(i));
        }
        return result;
    }

    /**
     * Writes the concept ids to a text file, one id per line.
     *
     * @param path file to (over)write
     * @throws DaoException if the concepts cannot be computed
     * @throws IOException  if the file cannot be written
     */
    public void writeConcepts(File path) throws DaoException, IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(path));
        // try/finally guarantees the file handle is released even when
        // computing the concepts or writing a line fails part-way through.
        try {
            for (int wpId : getConcepts().toArray()) {
                writer.write(wpId + "\n");
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Tests whether a page's canonical title matches one of the
     * {@link #TITLE_BLACKLIST} patterns.
     *
     * TODO: make this multi-lingual.
     *
     * @param lp page to test
     * @return true if the title fully matches any blacklist pattern
     */
    private boolean isBlacklisted(LocalPage lp) {
        String title = lp.getTitle().getCanonicalTitle();
        for (Pattern p : TITLE_BLACKLIST) {
            if (p.matcher(title).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests whether a page looks like a list page, i.e. its canonical title
     * starts with "list" ignoring case.
     *
     * NOTE(review): this also matches ordinary titles that merely begin with
     * those letters (for example "Listeria"); confirm that is intended.
     *
     * TODO: make multi lingual
     *
     * @param lp page to test
     * @return true if the title starts with "list" (case-insensitive)
     */
    private boolean isList(LocalPage lp) {
        // Locale.ROOT keeps the case mapping independent of the JVM's default
        // locale (a Turkish default would lower-case 'I' to a dotless i).
        return lp.getTitle().getCanonicalTitle().toLowerCase(Locale.ROOT).startsWith("list");
    }

    /**
     * Ridiculous heuristic: number of stop concepts = 2 * cubed-root-of(num-articles)
     * For simple english (175K articles), this is about 100
     * For english (4M articles), this is about 300
     *
     * @return number of top-ranked pages to discard as stop concepts
     */
    public int getNumStopConcepts() {
        return (int) (Math.pow(numArticles, CUBE_ROOT_EXPONENT) * 2);
    }

    /**
     * Simple heuristic for number of max concepts: unless a non-negative
     * value was supplied through {@link #setMaxConcepts(int)}, the default is
     * 1000 * cubed-root-of(num-articles).
     * For simple english (175K articles) default is about 55K
     * For english (4M articles), default is about 158K
     *
     * @return maximum number of concepts to keep
     */
    public int getMaxConcepts() {
        if (maxConcepts < 0) {
            return (int) (Math.pow(numArticles, CUBE_ROOT_EXPONENT) * 1000);
        } else {
            return maxConcepts;
        }
    }

    /**
     * Overrides the heuristic maximum number of concepts.
     *
     * @param maxConcepts new maximum; a negative value restores the heuristic
     */
    public void setMaxConcepts(int maxConcepts) {
        this.maxConcepts = maxConcepts;
    }

    /**
     * Command-line entry point: writes one {@code <langCode>.txt} concept file
     * per configured language into the output directory.
     *
     * <p>Options: {@code -d/--output-dir} overrides the directory read from
     * the {@code sr.concepts.path} configuration key, and
     * {@code -x/--max-concepts} overrides the heuristic concept limit.</p>
     *
     * @param args command-line arguments
     * @throws ConfigurationException if the environment cannot be configured
     * @throws DaoException           if page or link data cannot be read
     * @throws IOException            if the output directory or a concept file
     *                                cannot be created or written
     */
    public static void main(String[] args) throws ConfigurationException, DaoException, IOException {
        Options options = new Options();
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArg()
                        .withLongOpt("output-dir")
                        .withDescription("directory to output concept mapping to")
                        .create("d"));
        options.addOption(
                new DefaultOptionBuilder()
                        .hasArg()
                        .withLongOpt("max-concepts")
                        .withDescription("maximum number of concepts")
                        .create("x"));
        EnvBuilder.addStandardOptions(options);

        CommandLineParser parser = new PosixParser();
        CommandLine cmd;
        try {
            cmd = parser.parse(options, args);
        } catch (ParseException e) {
            System.err.println("Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("SRConceptSpaceGenerator", options);
            return;
        }

        // Validate the numeric option once, before the expensive environment
        // is built, and report a bad value the same way as a bad option.
        boolean hasMaxConcepts = cmd.hasOption("x");
        int requestedMaxConcepts = -1;
        if (hasMaxConcepts) {
            String raw = cmd.getOptionValue("x");
            try {
                requestedMaxConcepts = Integer.parseInt(raw);
            } catch (NumberFormatException e) {
                System.err.println("Invalid option usage: max-concepts must be an integer, got '" + raw + "'");
                new HelpFormatter().printHelp("SRConceptSpaceGenerator", options);
                return;
            }
        }

        Env env = new EnvBuilder(cmd).build();
        Configurator c = env.getConfigurator();
        LocalLinkDao linkDao = c.get(LocalLinkDao.class);
        LocalPageDao pageDao = c.get(LocalPageDao.class);

        File parentDir = new File(env.getConfiguration().get().getString("sr.concepts.path"));
        if (cmd.hasOption("d")) {
            parentDir = new File(cmd.getOptionValue("d"));
        }
        if (!parentDir.isDirectory()) {
            // Remove whatever non-directory currently occupies the path.
            FileUtils.deleteQuietly(parentDir);
            // Fail here with a clear message instead of later, on the first write.
            if (!parentDir.mkdirs() && !parentDir.isDirectory()) {
                throw new IOException("Could not create output directory " + parentDir.getAbsolutePath());
            }
        }

        for (Language lang : env.getLanguages()) {
            SRConceptSpaceGenerator pruner = new SRConceptSpaceGenerator(lang, linkDao, pageDao);
            if (hasMaxConcepts) {
                pruner.setMaxConcepts(requestedMaxConcepts);
            }
            File path = new File(parentDir, lang.getLangCode() + ".txt");
            LOG.info("writing concept space for {} to {}", lang, path);
            pruner.writeConcepts(path);
        }
    }
}