package org.wikibrain.spatial.matcher;
import com.typesafe.config.Config;
import com.vividsolutions.jts.geom.Geometry;
import org.apache.commons.io.FileUtils;
import org.geotools.data.simple.SimpleFeatureIterator;
import org.opengis.feature.simple.SimpleFeature;
import org.supercsv.io.CsvListWriter;
import org.supercsv.io.CsvMapReader;
import org.supercsv.prefs.CsvPreference;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.model.LocalPage;
import org.wikibrain.spatial.loader.SpatialDataDownloader;
import org.wikibrain.spatial.loader.SpatialDataFolder;
import org.wikibrain.spatial.WikiBrainShapeFile;
import org.wikibrain.utils.WpIOUtils;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Creates or updates a mapping csv file from a shapefile to WikiBrain.
*
* @author Shilad Sen
*/
public class ShapeFileMatcher {
private static final char STATUS_UNKNOWN = 'U';
private static final char STATUS_VERIFIED = 'V';
private static final Logger LOG = LoggerFactory.getLogger(ShapeFileMatcher.class);
private final Env env;
private final SpatialDataFolder dir;
private final SpatialDataDownloader downloader;
public ShapeFileMatcher(Env env) {
this.env = env;
this.dir = new SpatialDataFolder(new File(env.getConfiguration().get().getString("spatial.dir")));
this.downloader = new SpatialDataDownloader(env);
}
public void match(String refSys, String layerGroup, String datasetName) throws IOException, InterruptedException, DaoException, ConfigurationException {
Config config = env.getConfiguration().getConfig("spatial.datasets", refSys, layerGroup, datasetName);
WikiBrainShapeFile shapeFile = downloader.download(refSys, layerGroup, datasetName, false);
writeMatches(config, shapeFile);
}
public void writeMatches(Config config, WikiBrainShapeFile shapeFile) throws IOException, ConfigurationException, DaoException {
Map<String, MappingInfo> existing = readExisting(shapeFile);
File newFile = File.createTempFile("wbmapping", "csv");
CsvListWriter csv = new CsvListWriter(WpIOUtils.openWriter(newFile), CsvPreference.STANDARD_PREFERENCE);
// Fields from the shapefile that should be included in the final CSV
List<String> extraFields = new ArrayList<String>();
for (String fieldsKey : new String[] { "titles", "context", "other" }) {
if (config.hasPath(fieldsKey)) {
for (String field : config.getStringList(fieldsKey)) {
extraFields.add(field);
}
}
}
List<String> featureNames = shapeFile.getFeatureNames();
GeoResolver resolver = new GeoResolver(env, config);
try {
writeHeader(csv, extraFields);
SimpleFeatureIterator iter = shapeFile.getFeatureIter();
int n = 0;
while (iter.hasNext()) {
if (n++ % 1000 == 0) {
LOG.info("Mapping row " + n + " of " + shapeFile.getFile());
}
SimpleFeature row = iter.next();
Map<String, String> rowMap = makeRow(featureNames, config.getStringList("key"), row);
Geometry geometry = (Geometry) row.getDefaultGeometry();
writeRow(resolver, csv, extraFields, rowMap, geometry, existing);
}
iter.close();
} finally {
csv.close();
}
// Move original to a backup if it exists
if (shapeFile.getMappingFile().exists()) {
File backup = new File(shapeFile.getMappingFile().getAbsoluteFile() + ".bak");
FileUtils.deleteQuietly(backup);
FileUtils.moveFile(shapeFile.getMappingFile(), backup);
}
FileUtils.moveFile(newFile, shapeFile.getMappingFile());
}
/**
* TODO: keep track of duplicate or missing keys with special status codes
* @param shapeFile
* @return
* @throws IOException
*/
private Map<String, MappingInfo> readExisting(WikiBrainShapeFile shapeFile) throws IOException {
HashMap<String, MappingInfo> mapping = new HashMap<String, MappingInfo>();
if (!shapeFile.hasMappingFile()) {
return mapping;
}
CsvMapReader reader = new CsvMapReader(
WpIOUtils.openBufferedReader(shapeFile.getMappingFile()),
CsvPreference.STANDARD_PREFERENCE
);
String [] header = reader.getHeader(true);
while (true) {
Map<String, String> row = reader.read(header);
if (row == null) {
break;
}
MappingInfo info = new MappingInfo(row);
if (!info.isUnknown()) {
mapping.put(info.key, info);
}
}
return mapping;
}
private Map<String, String> makeRow(List<String> featureNames, List<String> keyFields, SimpleFeature row) {
Map<String, String> rowMap = new HashMap<String, String>();
for (int i = 0; i < row.getAttributeCount(); i++) {
rowMap.put(featureNames.get(i).toUpperCase(), row.getAttribute(i).toString());
}
rowMap.put("WB_ID", row.getID());
String key = "";
for (String field : keyFields) {
if (key.length() != 0) {
key += "|";
}
key += rowMap.get(field);
}
rowMap.put("WB_KEY", key);
return rowMap;
}
private void writeHeader(CsvListWriter writer, List<String> extraFields) throws IOException {
List<String> fields = new ArrayList<String>();
fields.add("WB_ID");
fields.add("WB_KEY");
fields.add("WB_UPDATED");
fields.add("WB_STATUS");
fields.add("WB_TITLE");
fields.add("WB_GUESS1");
fields.add("WB_GUESS2");
fields.add("WB_GUESS3");
fields.add("WB_SCORE");
fields.addAll(extraFields);
writer.write(fields);
}
private void writeRow(GeoResolver resolver, CsvListWriter writer, List<String> extraFields, Map<String, String> row, Geometry geometry, Map<String, MappingInfo> existing) throws DaoException, IOException {
String tstamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());
LinkedHashMap<LocalPage, Double> guesses = resolver.resolve(row, geometry, 3);
List<LocalPage> sorted = new ArrayList<LocalPage>(guesses.keySet());
List<String> newRow = new ArrayList<String>();
MappingInfo prev = existing.get(row.get("WB_KEY"));
newRow.add(row.get("WB_ID"));
newRow.add(row.get("WB_KEY"));
newRow.add(prev == null ? tstamp : prev.timestamp);
newRow.add(String.valueOf(prev == null ? STATUS_UNKNOWN : prev.status));
// Calculate best title
String title = "";
if (prev != null) title = prev.title;
else if (sorted.size() > 0) title = sorted.get(0).getTitle().getTitleStringWithoutNamespace();
newRow.add(title);
for (int i = 0; i < 3; i++) {
if (sorted.size() > i) {
newRow.add(sorted.get(i).getTitle().getTitleStringWithoutNamespace());
} else {
newRow.add("");
}
}
double score = 0;
if (sorted.size() >= 2) {
score = 2 * guesses.get(sorted.get(0)) - guesses.get(sorted.get(1));
} else if (sorted.size() == 1) {
score = guesses.get(sorted.get(0));
}
newRow.add(""+score);
for (String f : extraFields) {
newRow.add(row.get(f).toString());
}
writer.write(newRow);
}
public static class MappingInfo {
public final String key;
public final String timestamp;
public final char status;
public final String title;
public MappingInfo(Map<String, String> row) {
key = row.get("WB_KEY");
timestamp = row.get("WB_UPDATED");
status = row.get("WB_STATUS").toUpperCase().charAt(0);
title = row.get("WB_TITLE");
}
public boolean isUnknown() {
return status == STATUS_UNKNOWN;
}
}
public static void main(String args[]) throws Exception {
Env env = EnvBuilder.envFromArgs(args);
ShapeFileMatcher matcher = new ShapeFileMatcher(env);
matcher.match("earth", "marine", "naturalEarth");
}
}