package org.wikibrain.mapper.algorithms.conceptualign3; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import org.jgrapht.traverse.BreadthFirstIterator; import org.wikibrain.core.WikiBrainException; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.dao.InterLanguageLinkDao; import org.wikibrain.core.dao.LocalPageDao; import org.wikibrain.core.dao.MetaInfoDao; import org.wikibrain.core.lang.LanguageSet; import org.wikibrain.core.lang.LocalId; import org.wikibrain.core.model.InterLanguageLink; import org.wikibrain.core.model.LocalPage; import org.wikibrain.core.model.NameSpace; import org.wikibrain.core.model.UniversalPage; import org.wikibrain.mapper.ConceptMapper; import org.wikibrain.mapper.MapperIterator; import org.wikibrain.mapper.algorithms.PureWikidataConceptMapper; import java.io.File; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Created by bjhecht on 4/24/14. * * Only supports article namespaces for now (no disambiguation pages), but this can be changed easily by manipulating the filters * in ILLGraph and adapting ClusterResult to contain namespace info (i.e. MapperIterator below can't just return Namespace.Article). * * Disambiguation page support will be more difficult given that we have split articles and disambiguation pages into * separate namespaces. * * Due to the changes associated with Wikidata, much more research is needed in this area. MaxEdges has been set to 2 as this * seems reasonable, but again, more research is definitely needed. 
* */ public class ConceptualignConceptMapper extends ConceptMapper{ private PureWikidataConceptMapper wdMapper; private final InterLanguageLinkDao illDao; private final MetaInfoDao miDao; private Iterable<UniversalPage> uPages; private LanguageSet uPageLs; private final boolean print; private static Logger LOG = LoggerFactory.getLogger(ConceptualignConceptMapper.class); public ConceptualignConceptMapper(File wikidataFilePath, int id, LocalPageDao localPageDao, InterLanguageLinkDao illDao, MetaInfoDao miDao, boolean print) { super(id, localPageDao); this.illDao = illDao; this.miDao = miDao; this.print = print; wdMapper = new PureWikidataConceptMapper(wikidataFilePath, -1, localPageDao); } /** * For testing purposes only * @param uPages * @param id * @param localPageDao * @param illDao * @param miDao */ public ConceptualignConceptMapper(Iterable<UniversalPage> uPages, LanguageSet uPagesLs, int id, LocalPageDao localPageDao, InterLanguageLinkDao illDao, MetaInfoDao miDao, boolean print) { super(id, localPageDao); this.illDao = illDao; this.miDao = miDao; this.uPages = uPages; this.print = print; this.uPageLs = uPagesLs; } @Override public Iterator<UniversalPage> getConceptMap(LanguageSet ls) throws WikiBrainException, DaoException { // parameters int maxEdge = 2; // see Bao et al. 2012 for definition double minLang = 1.0; // see Bao et al. 
2012 for definition // load Wikidata mappings if (uPages == null) { LOG.info("Loading Wikidata concept mappings"); Iterator<UniversalPage> uPages = wdMapper.getConceptMap(ls); }else{ if (!ls.equals(uPageLs)){ throw new WikiBrainException("LanguageSet mismatch"); } } // perform Conceptualign CombinedIllDao combinedDao = new CombinedIllDao(uPages.iterator(), illDao); ILLGraph illGraph = new ILLGraph(combinedDao, localPageDao, miDao); BreadthFirstIterator<LocalId, ILLEdge> bfi = new BreadthFirstIterator<LocalId, ILLEdge>(illGraph); List<ConnectedComponentHandler> ccHandlers = new ArrayList<ConnectedComponentHandler>(); ccHandlers.add(new Conceptualign3ConnectedComponentHandler(minLang, maxEdge, true, this.localPageDao)); ConnectedComponentTraversalListener listener = new ConnectedComponentTraversalListener(illGraph, ccHandlers); bfi.addTraversalListener(listener); while (bfi.hasNext()){ LocalId localId = bfi.next(); } return new MapperIterator<UniversalPage>(listener.getClusterResults()) { @Override public UniversalPage transform(Object obj) { ClusterResult curCluster = (ClusterResult)obj; return new UniversalPage(curCluster.univId, getId(), NameSpace.ARTICLE, curCluster.vertices); } }; } }