package org.wikibrain.sr; import gnu.trove.map.TIntDoubleMap; import gnu.trove.set.TIntSet; import org.wikibrain.core.WikiBrainException; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.lang.Language; import org.wikibrain.sr.dataset.Dataset; import org.wikibrain.sr.normalize.Normalizer; import java.io.File; import java.io.IOException; /** * A monolingual SR metric supports SR operations in a single language. * @author Matt Lesicko * @author Shilad Sen */ public interface SRMetric { /** * @return the name of the similarity metric in a human readable format */ public String getName(); /** * @return The language associated with this metric. */ public Language getLanguage(); /** * Returns the directory containing all data for the metric. * @return */ public File getDataDir(); /** * Sets the data directory associated with the model. * This will apply to all future reads and writes. * * @param dir */ public void setDataDir(File dir); /** * Determine the similarity between two local pages. * * @param pageId1 Id of the first page. * @param pageId2 Id of the second page. * @param explanations Whether explanations should be created. * @return */ public SRResult similarity(int pageId1, int pageId2, boolean explanations) throws DaoException; /** * Determine the similarity between two strings in a given language by mapping through local pages. * * @param phrase1 The first phrase. * @param phrase2 The second phrase. * @param explanations Whether explanations should be created. * @return */ public SRResult similarity(String phrase1, String phrase2, boolean explanations) throws DaoException; /** * Find the most similar local pages to a local page within the same language. * * @param pageId The id of the local page whose similarity we are examining. * @param maxResults The maximum number of results to return. * @return */ public SRResultList mostSimilar(int pageId, int maxResults) throws DaoException; /** * Find the most similar local pages to a local page. * * @param pageId The id of the local page whose similarity we are examining. * @param maxResults The maximum number of results to return. * @param validIds The local page ids to be considered. Null means all ids in the language. * @return */ public SRResultList mostSimilar(int pageId, int maxResults, TIntSet validIds) throws DaoException; /** * Find the most similar local pages to a phrase. * * @param phrase The phrase whose similarity we are examining. * @param maxResults The maximum number of results to return. * @return */ public SRResultList mostSimilar(String phrase, int maxResults) throws DaoException; /** * Find the most similar local pages to a phrase. * * @param phrase The phrase whose similarity we are examining. * @param maxResults The maximum number of results to return. * @param validIds The local page ids to be considered. Null means all ids in the language * @return */ public SRResultList mostSimilar(String phrase, int maxResults, TIntSet validIds) throws DaoException; /** * Writes the metric to the current data directory. * * @throws java.io.IOException */ public void write() throws IOException; /** * Reads the metric from the current data directory. */ public void read() throws IOException; /** * Train the similarity() function. * The KnownSims may already be associated with Wikipedia ids (check wpId1 and wpId2). * * @param dataset A gold standard dataset */ public void trainSimilarity(Dataset dataset) throws DaoException; /** * Train the mostSimilar() function * The KnownSims may already be associated with Wikipedia ids (check wpId1 and wpId2). * * @param dataset A gold standard dataset. * @param numResults The maximum number of similar articles computed per phrase. * @param validIds The Wikipedia ids that should be considered in result sets. Null means all ids. */ public void trainMostSimilar(Dataset dataset, int numResults, TIntSet validIds); /** * @return true if similarity() is already trained (or doesn't need training) */ public boolean similarityIsTrained(); /** * @return true if mostSimilar() is already trained (or doesn't need training) */ public boolean mostSimilarIsTrained(); /** * Construct a cosimilarity matrix of Wikipedia ids in a given language. * * @param wpRowIds * @param wpColIds * @return */ public double[][] cosimilarity(int wpRowIds[], int wpColIds[]) throws DaoException; /** * Construct a cosimilarity matrix of phrases. * * @param rowPhrases * @param colPhrases * @return */ public double[][] cosimilarity(String rowPhrases[], String colPhrases[]) throws DaoException; /** * Construct symmetric comsimilarity matrix of Wikipedia ids in a given language. * * @param ids * @return */ public double[][] cosimilarity(int ids[]) throws DaoException; /** * Construct symmetric cosimilarity matrix of phrases by mapping through local pages. * * @param phrases * @return */ public double[][] cosimilarity(String phrases[]) throws DaoException; /** * @return the most similar normalizer. */ public Normalizer getMostSimilarNormalizer(); /** * Sets the most similar normalizer * @param n */ public void setMostSimilarNormalizer(Normalizer n); /** * * @return the similarity normalizer. */ public Normalizer getSimilarityNormalizer(); /** * Sets the similarity normalizer. * @param n */ public void setSimilarityNormalizer(Normalizer n); }