package org.wikibrain.cookbook.sr;

import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.lang.Language;
import org.wikibrain.sr.dataset.Dataset;
import org.wikibrain.sr.dataset.DatasetDao;
import org.wikibrain.sr.evaluation.MostSimilarDataset;
import org.wikibrain.sr.utils.KnownSim;

import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
 * Cookbook example that builds a {@link MostSimilarDataset} from every dataset
 * available for the language code {@code "simple"} and prints:
 * <ol>
 *   <li>each phrase whose most-similar list has at least
 *       {@value #MIN_REPORTED_LIST_SIZE} entries, followed by those entries;</li>
 *   <li>a histogram of how many phrases have a most-similar list of each
 *       (non-zero) size.</li>
 * </ol>
 *
 * @author Shilad Sen
 */
public class MostSimilarDatasetExample {

    /** Phrases with fewer most-similar entries than this are not listed individually. */
    private static final int MIN_REPORTED_LIST_SIZE = 5;

    /**
     * Runs the example and writes its report to standard output.
     *
     * @param args command-line arguments; not read by this example
     * @throws ConfigurationException propagated from the WikiBrain environment / configurator calls
     * @throws DaoException propagated from the WikiBrain environment / dataset DAO calls
     */
    public static void main(String[] args) throws ConfigurationException, DaoException {
        Env env = new EnvBuilder().build();
        DatasetDao dao = env.getConfigurator().get(DatasetDao.class);
        List<Dataset> datasets = dao.getAllInLanguage(Language.getByLangCode("simple"));

        // Disabled snippet kept for reference: drops one dataset by name before
        // the most-similar dataset is built.
        // for (int i = 0; i < datasets.size(); i++) {
        //     if (datasets.get(i).getName().equals("WikiSimi3000.txt")) {
        //         datasets.remove(i);
        //         break;
        //     }
        // }

        MostSimilarDataset msd = new MostSimilarDataset(datasets);

        // list size -> number of phrases with a list of that size. A sorted map
        // has no upper bound on the list size (a fixed-length array would throw
        // for unexpectedly long lists) and iterates in ascending size order.
        Map<Integer, Integer> histogram = new TreeMap<Integer, Integer>();

        for (String phrase : msd.getPhrases()) {
            List<KnownSim> sims = msd.getSimilarities(phrase).getMostSimilar();
            int size = sims.size();

            Integer seen = histogram.get(size);
            histogram.put(size, seen == null ? 1 : seen + 1);

            if (size >= MIN_REPORTED_LIST_SIZE) {
                System.out.println("phrase " + phrase + ":");
                for (int i = 0; i < size; i++) {
                    System.out.println("\t" + (i + 1) + ". " + sims.get(i));
                }
            }
        }

        System.out.println("histogram of similar list sizes:");
        for (Map.Entry<Integer, Integer> bucket : histogram.entrySet()) {
            // Size 0 (phrases with an empty list) is not part of the report.
            if (bucket.getKey() < 1) {
                continue;
            }
            System.out.println("\tsize " + bucket.getKey() + ": " + bucket.getValue() + " phrases");
        }
    }
}