package org.wikibrain.spatial.cookbook.tflevaluate; import au.com.bytecode.opencsv.CSVWriter; import com.google.common.collect.Sets; import com.vividsolutions.jts.geom.Geometry; import com.vividsolutions.jts.geom.Point; import gnu.trove.set.TIntSet; import org.geotools.referencing.GeodeticCalculator; import org.wikibrain.conf.ConfigurationException; import org.wikibrain.conf.Configurator; import org.wikibrain.core.WikiBrainException; import org.wikibrain.core.cmd.Env; import org.wikibrain.core.cmd.EnvBuilder; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.dao.LocalPageDao; import org.wikibrain.core.dao.UniversalPageDao; import org.wikibrain.core.lang.Language; import org.wikibrain.core.lang.LanguageSet; import org.wikibrain.core.model.Title; import org.wikibrain.core.model.UniversalPage; import org.wikibrain.spatial.constants.RefSys; import org.wikibrain.spatial.dao.SpatialContainmentDao; import org.wikibrain.spatial.dao.SpatialDataDao; import org.wikibrain.spatial.dao.SpatialNeighborDao; import org.wikibrain.sr.SRMetric; import org.wikibrain.sr.SRResult; import org.wikibrain.utils.ParallelForEach; import org.wikibrain.utils.Procedure; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Created by toby on 5/22/14. */ public class TopoEvaluator { private static int WIKIDATA_CONCEPTS = 1; private static final Logger LOG = LoggerFactory.getLogger(TopoEvaluator.class); private Random random = new Random(); private final SpatialDataDao sdDao; private final LocalPageDao lpDao; private final UniversalPageDao upDao; private final SpatialNeighborDao snDao; private final SpatialContainmentDao scDao; private final List<Language> langs; private final Map<Language, SRMetric> metrics; private final DistanceMetrics distanceMetrics; private final List<UniversalPage> concepts = new ArrayList<UniversalPage>(); private final Map<Integer, Point> locations = new HashMap<Integer, Point>(); private final Map<Integer, List<Integer>> polygonWAG = new HashMap<Integer, List<Integer>>(); private final Env env; private CSVWriter output; private String layerName = "wikidata"; private Map<Integer, Integer> pointPolygonContainingMap = new HashMap<Integer, Integer>(); private Map<Map.Entry, Integer> polygonPairDistanceMap = new HashMap<Map.Entry, Integer>(); public TopoEvaluator(Env env, LanguageSet languages) throws ConfigurationException { this.env = env; //this.langs = new ArrayList<Language>(env.getLanguages().getLanguages()); langs = new ArrayList<Language>(); for(Language lang : languages.getLanguages()) langs.add(lang); // Get data access objects Configurator c = env.getConfigurator(); this.sdDao = c.get(SpatialDataDao.class); this.lpDao = c.get(LocalPageDao.class); this.upDao = c.get(UniversalPageDao.class); this.snDao = c.get(SpatialNeighborDao.class); this.scDao = c.get(SpatialContainmentDao.class); this.distanceMetrics = new DistanceMetrics(env, c, snDao); // build SR metrics this.metrics = new HashMap<Language, SRMetric>(); for(Language lang : langs){ SRMetric m = c.get(SRMetric.class, "ensemble", "language", lang.getLangCode()); metrics.put(lang, m); } } public void retrieveAllLocations(String pointLayer, String polygonLayer) throws DaoException, WikiBrainException{ Map<Integer, Geometry> geometries = sdDao.getAllGeometriesInLayer(pointLayer, "earth"); retrieveLocations(geometries, pointLayer, polygonLayer); } public void retrieveLocations(Map<Integer, Geometry> geometries, String pointLayer, String polygonLayer) throws DaoException, WikiBrainException { // Get all known concept geometries Map<Integer, Geometry> polygons = sdDao.getAllGeometriesInLayer(polygonLayer, "earth"); LOG.info(String.format("Found %d total geometries, now loading geometries", geometries.size())); // Build up list of concepts in all languages for (Integer conceptId : geometries.keySet()){ UniversalPage concept = upDao.getById(conceptId); if (concept != null && concept.hasAllLanguages(new LanguageSet(langs))) { concepts.add(concept); Geometry g1 = geometries.get(conceptId); locations.put(conceptId, g1.getCentroid()); if (concepts.size() % 1000 == 0) { LOG.info(String.format("Loaded %d geometries with articles in %s...", concepts.size(), langs)); } } } LOG.info(String.format("Found %d geometries with articles in %s", concepts.size(), langs)); //Build polygon WAG //Build point-polygon mapping int counter = 0; int dummy = 0; for(Map.Entry<Integer, Geometry> i : polygons.entrySet()){ counter ++; if(counter % 1 == 0){ LOG.info(String.format("Processing the %d th polygon : %s out of %d", counter, upDao.getById(i.getKey()).getBestEnglishTitle(lpDao, true).getCanonicalTitle(), polygons.size())); } Map<Integer, Geometry> neighbors = snDao.getNeighbors(i.getValue(), polygonLayer, "earth", new HashSet<Integer>()); if(!polygonWAG.containsKey(i.getKey())){ polygonWAG.put(i.getKey(), new ArrayList<Integer>()); } polygonWAG.get(i.getKey()).addAll(neighbors.keySet()); Set<String> layerSet = new HashSet<String>(); layerSet.add(pointLayer); TIntSet containedItem = scDao.getContainedItemIds(i.getValue(), "earth", layerSet, SpatialContainmentDao.ContainmentOperationType.CONTAINMENT); dummy++; for(Integer k : containedItem.toArray()){ pointPolygonContainingMap.put(k, i.getKey()); } dummy++; } } /** * Evaluate a specified number of random pairs from loaded concepts * @param outputPath * @param numSamples * @throws java.io.IOException */ public void evaluateSample(File outputPath, int numSamples) throws IOException { this.output = new CSVWriter(new FileWriter(outputPath), ','); writeHeader(); if(concepts.size() == 0) LOG.warn("No concept has been retrieved"); ParallelForEach.range(0, numSamples, new Procedure<Integer>() { @Override public void call(Integer i) throws Exception { evaluateOneSample(); } }); this.output.close(); } private void evaluateOneSample() throws DaoException, WikiBrainException, IOException { UniversalPage c1 = concepts.get(random.nextInt(concepts.size())); UniversalPage c2 = concepts.get(random.nextInt(concepts.size())); List<SRResult> results = new ArrayList<SRResult>(); for (Language lang : langs) { SRMetric sr = metrics.get(lang); results.add(sr.similarity(c1.getLocalId(lang), c2.getLocalId(lang), false)); if(sr.similarity(c1.getLocalId(lang), c2.getLocalId(lang), false) == null){ LOG.warn(String.format("error calculating SR for universal page %d %s and %d %s", c1.getUnivId(), c1.getBestEnglishTitle(lpDao, true), c2.getUnivId(), c2.getBestEnglishTitle(lpDao, true))); } } writeRow(c1, c2, results); } private void writeHeader() throws IOException { String[] headerEntries = new String[8 + langs.size()]; headerEntries[0] = "ITEM_NAME_1"; headerEntries[1] = "ITEM_ID_1"; headerEntries[2] = "CONTAINED_1"; headerEntries[3] = "ITEM_NAME_2"; headerEntries[4] = "ITEM_ID_2"; headerEntries[5] = "CONTAINED_2"; headerEntries[6] = "SPATIAL_DISTANCE"; headerEntries[7] = "TOPO_DISTANCE"; int counter = 0; for (Language lang : langs) { headerEntries[8 + counter] = lang.getLangCode() + "_SR"; counter ++; } output.writeNext(headerEntries); output.flush(); } public Integer polygonDistance(Integer itemIdA, Integer itemIdB, String layer, String refSys){ Map.Entry<Integer,Integer> keyEntry = new AbstractMap.SimpleEntry<Integer, Integer>(itemIdA, itemIdB); if (polygonPairDistanceMap.containsKey(keyEntry)){ return polygonPairDistanceMap.get(keyEntry); } Map<Integer, Integer> distList = new HashMap<Integer, Integer>(); for(Integer k : polygonWAG.keySet()){ distList.put(k, -1); } Queue<Integer> Q = new LinkedList<Integer>(); Set<Integer> V = new HashSet<Integer>(); distList.put(itemIdA, 0); Q.add(itemIdA); V.add(itemIdA); while(!Q.isEmpty()){ Integer t = Q.poll(); if(t.equals(itemIdB)){ return distList.get(t); } if(!polygonWAG.containsKey(t)) continue; for(Integer k : polygonWAG.get(t)){ if(!V.contains(k)){ V.add(k); Q.add(k); distList.put(k, distList.get(t) + 1); polygonPairDistanceMap.put(new AbstractMap.SimpleEntry(itemIdA, k), distList.get(k)); } } } return -1; } private void writeRow(UniversalPage c1, UniversalPage c2, List<SRResult> results) throws WikiBrainException, IOException, DaoException { try { double km; if((!locations.containsKey(c1.getUnivId())) || (!locations.containsKey(c2.getUnivId()))) return; Point p1 = locations.get(c1.getUnivId()).getCentroid(); Point p2 = locations.get(c2.getUnivId()).getCentroid(); //TODO: change this to a topological metric GeodeticCalculator geoCalc = new GeodeticCalculator(); geoCalc.setStartingGeographicPoint(p1.getX(), p1.getY()); geoCalc.setDestinationGeographicPoint(p2.getX(), p2.getY()); km = geoCalc.getOrthodromicDistance() / 1000; if(! (pointPolygonContainingMap.containsKey(c1.getUnivId()) && pointPolygonContainingMap.containsKey(c2.getUnivId()))) return; double TopoDist = polygonDistance(pointPolygonContainingMap.get(c1.getUnivId()), pointPolygonContainingMap.get(c2.getUnivId()), layerName, "earth"); Title t1 = c1.getBestEnglishTitle(lpDao, true); Title t2 = c2.getBestEnglishTitle(lpDao, true); String[] rowEntries = new String[8 + langs.size()]; rowEntries[0] = t1.getCanonicalTitle(); rowEntries[1] = String.valueOf(c1.getUnivId()); rowEntries[2] = upDao.getById(pointPolygonContainingMap.get(c1.getUnivId())).getBestEnglishTitle(lpDao, true).getCanonicalTitle(); rowEntries[3] = t2.getCanonicalTitle(); rowEntries[4] = String.valueOf(c2.getUnivId()); rowEntries[5] = upDao.getById(pointPolygonContainingMap.get(c2.getUnivId())).getBestEnglishTitle(lpDao, true).getCanonicalTitle(); rowEntries[6] = String.format("%.2f", km); rowEntries[7] = String.valueOf(TopoDist); int counter = 0; for (SRResult result : results) { if(result != null) rowEntries[8 + counter] = String.format("%.2f", result.getScore()); else rowEntries[8 + counter] = "0"; counter ++; } output.writeNext(rowEntries); output.flush(); } catch (Exception e){ LOG.warn(String.format("error writing row for universal page %d %s and %d %s", c1.getUnivId(), c1.getBestEnglishTitle(lpDao, true), c2.getUnivId(), c2.getBestEnglishTitle(lpDao, true))); //do nothing } } public static void main(String[] args) throws Exception { Env env = EnvBuilder.envFromArgs(args); Configurator conf = env.getConfigurator(); TopoEvaluator evaluator = new TopoEvaluator(env, new LanguageSet("simple")); SpatialDataDao sdDao = conf.get(SpatialDataDao.class); //Map<Integer, Geometry> allGeometries = sdDao.getAllGeometriesInLayer("wikidata", "earth"); //Map<Integer, Geometry> geometryMap = new HashMap<Integer, Geometry>(); Set<String> subLayers = Sets.newHashSet(); subLayers.add("wikidata"); SpatialContainmentDao scDao = conf.get(SpatialContainmentDao.class); TIntSet containedItemIds = scDao.getContainedItemIds(30, "country", RefSys.EARTH, subLayers, SpatialContainmentDao.ContainmentOperationType.CONTAINMENT); LinkedList<Integer> itemIdList = new LinkedList<Integer>(); int[] itemIds = containedItemIds.toArray(); for(Integer k : itemIds){ itemIdList.add(k); } Map<Integer, Geometry> geometryMap = sdDao.getBulkGeometriesInLayer(itemIdList, "wikidata", "earth"); evaluator.retrieveLocations(geometryMap, "wikidata", "states"); //evaluator.retrieveAllLocations("wikidata", "country"); evaluator.evaluateSample(new File("TopoEval.csv"), 500000); } }