//
// @(#)Map.java 4/2002
//
// Copyright 2002 Zachary DelProposto. All rights reserved.
// Use is subject to license terms.
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
// Or from http://www.gnu.org/
//
package dip.world;
import dip.order.*;
import java.util.*;
import java.io.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
import javax.xml.parsers.*;
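/**
* A Map is a list of Provinces and Powers, and methods for obtaining and parsing
* these Provinces and Powers.
* <p>
* Only the Power and Province arrays are serialized. The name-lookup tables
* are transient; they are derived from those arrays and are rebuilt after
* de-serialization.
*/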
public class Map implements Serializable
{
/**
* Serialization version identifier for this class.
*/
private static final long serialVersionUID = -6975129388283822932L;
// constants
// MAP_SIZE is the initial capacity of the province name map and name list,
// POWER_SIZE the initial capacity of the power name map (see createMappings()).
private static final int MAP_SIZE = 211; // should be prime
private static final int POWER_SIZE = 17; // should be prime
// internal constant arrays
// all this data is serialized.
private final Power[] powers;
private final Province[] provinces;
// None of the data below here is serialized; it can be derived from
// the above (serialized) data.
//
// Province-related
private transient HashMap nameMap = null; // map of all (short & full) names to a province; names in lower case
private transient String[] names = null; // list of all province names [short & full]; names in lower case
// Power-related
private transient HashMap powerNameMap = null; // created by createMappings(); lower-case power names & adjectives to a power
// created by createLCPowerNameList(), which createMappings() calls; sorted in reverse order
private transient String[] lcPowerNames = null; // lower case power names & adjectives
// created on first use, by replaceProvinceNames(); sorted longest-first
private transient String[] wsNames = null; // list of all province names that contain "-" or " "
/**
 * Creates a Map from the given Powers and Provinces.
 * <p>
 * The arrays are stored as given (no copy is made). Each Province must be
 * located at the array position equal to its own <code>getIndex()</code>
 * value. Once that has been verified, the name lookup tables are built.
 *
 * @param powerArray the Powers of this Map
 * @param provinceArray the Provinces of this Map, ordered by index
 * @throws IllegalArgumentException if a Province index lies outside the
 *         array bounds, or differs from the Province's array position
 */
protected Map(Power[] powerArray, Province[] provinceArray)
{
	this.powers = powerArray;
	this.provinces = provinceArray;

	// every province must sit exactly at the position given by its index
	final int count = provinceArray.length;
	int position = 0;
	while(position < count)
	{
		final Province current = provinceArray[position];
		final int index = current.getIndex();

		if( !(index >= 0 && index < count) )
		{
			throw new IllegalArgumentException("Province: "+current+": illegal Index: "+index);
		}

		if(index != position)
		{
			throw new IllegalArgumentException("Province: "+current+": out of order (index: "+index+"; position: "+position+")");
		}

		position++;
	}

	// derive the (transient) lookup tables
	createMappings();
}// Map()
/**
 * Builds the transient lookup tables: lower-case power name/adjective to
 * Power, lower-case province name (full and short) to Province, the sorted
 * power name list, and the list of all lower-case province names.
 * <p>
 * None of these tables are serialized, so this method MUST be called
 * again after de-serialization.
 */
private void createMappings()
{
	// powers: each name, plus the adjective, maps to its Power
	powerNameMap = new HashMap(POWER_SIZE);
	for(int p=0; p<powers.length; p++)
	{
		final Power currentPower = powers[p];
		final String[] powerNames = currentPower.getNames();

		int n = 0;
		while(n < powerNames.length)
		{
			powerNameMap.put(powerNames[n].toLowerCase(), currentPower);
			n++;
		}

		powerNameMap.put(currentPower.getAdjective().toLowerCase(), currentPower);
	}

	// sorted lower-case power name list
	createLCPowerNameList();

	// provinces: the full name first, then every short name
	nameMap = new HashMap(MAP_SIZE);
	final ArrayList collected = new ArrayList(MAP_SIZE);
	for(int p=0; p<provinces.length; p++)
	{
		final Province currentProvince = provinces[p];

		final String fullName = currentProvince.getFullName().toLowerCase();
		nameMap.put(fullName, currentProvince);
		collected.add(fullName);

		final String[] shortNames = currentProvince.getShortNames();
		int s = 0;
		while(s < shortNames.length)
		{
			final String shortName = shortNames[s].toLowerCase();
			nameMap.put(shortName, currentProvince);
			collected.add(shortName);
			s++;
		}
	}

	// flatten the collected names into the names array
	names = (String[]) collected.toArray(new String[collected.size()]);
}// createMappings()
/**
 * Returns the array of all Powers held by this Map.
 * <p>
 * The internal array itself is returned; no copy is made.
 *
 * @return all Powers
 */
public final Power[] getPowers()
{
	return this.powers;
}// getPowers()
/**
 * Returns the Power whose name or adjective matches the given name.
 * <p>
 * The match must be exact, but is case-insensitive.
 *
 * @param name the power name or adjective to look up; may be null
 * @return the matching Power, or null if there is no match (or name is null)
 */
public Power getPower(String name)
{
	// a null name cannot match anything; previously this threw a NullPointerException
	if(name == null)
	{
		return null;
	}

	return (Power) powerNameMap.get(name.toLowerCase());
}// getPower()
/**
 * Returns the closest Power to the given input String.
 * If no reasonable match is found, or multiple matches are found,
 * returns null.
 * <p>
 * This is different from getPowerMatching() in that this method
 * assumes <i>a priori</i> that the input is a power; it therefore
 * has looser parsing requirements. Likewise, if used on non-power tokens
 * (e.g., Provinces), it may be sufficiently close to a Power that it will
 * match; such improper (mis)matches would occur much LESS often
 * with getPowerMatching().
 * <p>
 * As few as a single character can be matched (if it's unique);
 * e.g., "E" for England.
 * <p>
 * Matching is attempted in this order: exact (case-insensitive) match,
 * unique prefix match, then Levenshtein distance. A Levenshtein match is
 * accepted only if the distance is at most half the input length.
 *
 * @param powerName the text to match; may be null or empty
 * @return the closest Power, or null if the input is null or empty, nothing
 *         is close enough, or different Powers tie for closest
 */
public Power getClosestPower(String powerName)
{
	// nothing to match (null used to fall through and throw a NullPointerException)
	if(powerName == null || powerName.length() == 0)
	{
		return null;
	}

	// 1) check for an exact match.
	//
	final Power exact = getPower(powerName);
	if(exact != null)
	{
		return exact;
	}

	final String lcInput = powerName.toLowerCase();

	// 2) check for a unique partial (prefix) match
	//
	final List partial = findPartialPowerMatch(lcInput);
	if(partial.size() == 1)
	{
		return (Power) partial.get(0);
	}

	// 3) perform a Levenshtein match against power names.
	//
	int bestDistance = Integer.MAX_VALUE;
	Power closest = null;
	for(int i=0; i<lcPowerNames.length; i++)
	{
		final String candidate = lcPowerNames[i];
		final int distance = Distance.getLD(lcInput, candidate);

		if(distance < bestDistance)
		{
			closest = getPower(candidate);
			bestDistance = distance;
		}
		else if(distance == bestDistance && closest != getPower(candidate))
		{
			// a different Power is equally close: no clear winner
			closest = null;
		}
	}

	// if absolute error rate is too high, discard.
	if(bestDistance <= (lcInput.length() / 2))
	{
		return closest;
	}

	// 4) nothing sufficiently close.
	return null;
}// getClosestPower()
/**
 * Returns the Power that best matches powerName, or null if there
 * is no sufficiently good match.
 * <p>
 * Matching is attempted in this order:
 * <ol>
 * <li>exact (case-insensitive) match against power names and adjectives</li>
 * <li>unique prefix match; only tried if the input has at least
 *     4 characters</li>
 * <li>Levenshtein distance; accepted only if the distance is at most
 *     one third of the input length (stricter than getClosestPower())</li>
 * </ol>
 * If different Powers tie for the closest Levenshtein distance, there is
 * no clear winner and null is returned.
 *
 * @param powerName the text to match; may be null or empty
 * @return the matching Power, or null if the input is null or empty,
 *         nothing is close enough, or the best match is ambiguous
 */
public Power getPowerMatching(String powerName)
{
	// nothing to match (null used to fall through and throw a NullPointerException)
	if(powerName == null || powerName.length() == 0)
	{
		return null;
	}

	// 1) check for an exact match.
	final Power exact = getPower(powerName);
	if(exact != null)
	{
		return exact;
	}

	final String lcInput = powerName.toLowerCase();

	// 2) unique prefix match, but only for inputs of 4 or more characters
	if(lcInput.length() >= 4)
	{
		final List partial = findPartialPowerMatch(lcInput);
		if(partial.size() == 1)
		{
			return (Power) partial.get(0);
		}
	}

	// 3) perform a Levenshtein match against power names.
	//
	// Ties are tracked by Power rather than by name. The previous code
	// tracked the name, set it to null on a tie, and then called
	// getPower(null), which threw a NullPointerException.
	int bestDistance = Integer.MAX_VALUE;
	Power closest = null;
	for(int i=0; i<lcPowerNames.length; i++)
	{
		final String candidate = lcPowerNames[i];
		final int distance = Distance.getLD(lcInput, candidate);

		if(distance < bestDistance)
		{
			closest = getPower(candidate);
			bestDistance = distance;
		}
		else if(distance == bestDistance && closest != getPower(candidate))
		{
			// a different Power is equally close: no clear winner
			closest = null;
		}
	}

	// if absolute error rate is too high, discard.
	// we are stricter than in getClosestPower()
	if(bestDistance <= (lcInput.length() / 3))
	{
		return closest;		// null if the best distance was a tie
	}

	// nothing is close
	return null;
}// getPowerMatching()
/**
 * Returns the array of all Provinces held by this Map.
 * <p>
 * The internal array itself is returned; no copy is made.
 *
 * @return all Provinces
 */
public final Province[] getProvinces()
{
	return this.provinces;
}// getProvinces()
/**
 * Returns the Province whose full name or short name matches the given name.
 * <p>
 * The match must be exact, but is case-insensitive.
 *
 * @param name the province name (full or short) to look up; may be null
 * @return the matching Province, or null if there is no match (or name is null)
 */
public Province getProvince(String name)
{
	// a null name cannot match anything; previously this threw a NullPointerException
	if(name == null)
	{
		return null;
	}

	return (Province) nameMap.get(name.toLowerCase());
}// getProvince()
/**
 * Returns the Province that best matches the input name, or null
 * if there is no single best match.
 * <p>
 * The input is trimmed and lower-cased before matching. An exact match
 * against a full or short name is accepted at any length. Otherwise at
 * least 3 characters are required; a unique prefix match is tried first,
 * and then the Levenshtein distance algorithm is used to determine
 * closeness. A Levenshtein match is accepted only if the distance is at
 * most half the input length. Ties result in no match at all.
 *
 * @param input the text to match; may be null or empty
 * @return the matching Province, or null if none (or more than one) matches
 */
public Province getProvinceMatching(String input)
{
	if(input == null)
	{
		return null;
	}

	// Normalize first, so that the exact match and the minimum-length rule
	// both see the text that is actually matched. Previously the length was
	// tested before trimming, so whitespace-padded input could bypass it.
	final String lcInput = input.toLowerCase().trim();
	if(lcInput.length() == 0)
	{
		return null;
	}

	// first, try exact match.
	// (fastest, if it works)
	final Province exact = getProvince(lcInput);
	if(exact != null)
	{
		return exact;
	}

	// we must be at least 3 chars
	if(lcInput.length() < 3)
	{
		return null;
	}

	// Do a partial (prefix) match against the name list.
	// If we tie, this step gives no match.
	// This is tried BEFORE we try Levenshtein
	//
	final List partial = findPartialProvinceMatch(lcInput);
	if(partial.size() == 1)
	{
		return (Province) partial.get(0);
	}

	// compute Levenshtein distances, collecting every province at the best
	// distance. A Set is used so that one province reached through several
	// of its names is counted once.
	final Set ties = new HashSet();
	int bestDist = Integer.MAX_VALUE;
	for(int i=0; i<names.length; i++)
	{
		final String name = names[i];

		// check closeness. Smaller is better.
		final int distance = Distance.getLD(lcInput, name);
		if(distance < bestDist)
		{
			ties.clear();
			ties.add( getProvince(name) );
			bestDist = distance;
		}
		else if(distance == bestDist)
		{
			ties.add( getProvince(name) );
		}
	}

	// if absolute error rate is too high, discard.
	// if we have >1 unique ties, (or none at all) no match
	if(bestDist <= (lcInput.length() / 2) && ties.size() == 1)
	{
		// there is but one
		return (Province) ties.iterator().next();
	}

	return null;
}// getProvinceMatching
/**
 * Collects the Province(s) that most closely match the given input.
 * <p>
 * The result is interpreted as follows:
 * <ul>
 * <li>empty: nothing could be matched (null or empty input, or fewer than
 *     three characters after trimming without an exact match)</li>
 * <li>one Province: that Province is the closest match</li>
 * <li>several Provinces: these are equally close (ties)</li>
 * </ul>
 * An exact (case-insensitive) match is tried first. Three-character input
 * is matched by prefix, because Levenshtein distance can give odd results
 * for strings that short. Longer input is matched using the Levenshtein
 * distance algorithm.
 *
 * @param input the text to match; may be null or empty
 * @return a Collection of matching Provinces; never null
 */
public Collection getProvincesMatchingClosest(String input)
{
	// nothing given: nothing matches
	if(input == null || input.length() == 0)
	{
		return new ArrayList(1);
	}

	// an exact match wins outright
	final Province exact = getProvince(input);
	if(exact != null)
	{
		final ArrayList single = new ArrayList(1);
		single.add(exact);
		return single;
	}

	final String key = input.toLowerCase().trim();
	final int keyLength = key.length();

	// too short to match meaningfully
	if(keyLength <= 2)
	{
		return new ArrayList(1);
	}

	// a Set, so that a province matched via several names appears once
	final Set candidates = new HashSet();

	if(keyLength == 3)
	{
		// prefix match against every known name
		for(int i=0; i<names.length; i++)
		{
			if(names[i].startsWith(key))
			{
				candidates.add( getProvince(names[i]) );
			}
		}

		return candidates;
	}

	// Levenshtein: keep every province found at the smallest distance
	int closest = Integer.MAX_VALUE;
	for(int i=0; i<names.length; i++)
	{
		final int distance = Distance.getLD(key, names[i]);
		if(distance > closest)
		{
			continue;
		}

		if(distance < closest)
		{
			// strictly better: earlier candidates are obsolete
			candidates.clear();
			closest = distance;
		}

		candidates.add( getProvince(names[i]) );
	}

	return candidates;
}// getProvincesMatchingClosest()
/**
 * Parses text into a Location.
 * <p>
 * The text is first passed through Coast.normalize() and then through
 * Coast.parse() to obtain the coast. The province is resolved by passing
 * the result of Coast.getProvinceName() to getProvinceMatching().
 *
 * @param input the text to parse
 * @return the parsed Location, or null if coast normalization/parsing
 *         throws an OrderException or no Province matches
 */
public Location parseLocation(String input)
{
	String normalized;
	Coast parsedCoast;

	try
	{
		normalized = Coast.normalize(input);
		parsedCoast = Coast.parse(normalized);
	}
	catch(OrderException e)
	{
		// coast text could not be understood
		return null;
	}

	final Province matched = getProvinceMatching( Coast.getProvinceName(normalized) );
	return (matched == null) ? null : new Location(matched, parsedCoast);
}// parseLocation()
/**
 * Searches the given text for any province names that contain
 * hyphens or spaces ('-' or ' ') and replaces each occurrence with the
 * short name of that province (<code>getShortName()</code>), in place.
 * This simplifies parsing, later, and allows the parser to better understand
 * multi-word names. ASSUMES input is all lower-case.
 * <p>
 * Longer names are replaced before shorter ones, which eliminates errors
 * from partial replacements. The list of names to search for is created
 * on the first call and reused afterwards.
 * <p>
 * This is a special-purpose method for Order parsing.
 *
 * @param sb the (lower-case) text to modify in place
 */
public void replaceProvinceNames(StringBuffer sb)
{
	// create the whitespace/hyphen name list, if it doesn't exist.
	if(wsNames == null)
	{
		final List multiWord = new ArrayList(50);
		for(int i=0; i<names.length; i++)
		{
			// entries of names[] are already lower case (see createMappings())
			final String name = names[i];
			if(name.indexOf(' ') != -1 || name.indexOf('-') != -1)
			{
				multiWord.add(name);
			}
		}

		final String[] sorted = (String[]) multiWord.toArray(new String[multiWord.size()]);

		// sort array from longest entries to shortest.
		// (the comparator no longer overrides equals(); the old override
		// always returned false, which violates the equals() contract)
		Arrays.sort(sorted, new Comparator()
		{
			// longer strings are more negative, thus rise to top
			public int compare(Object o1, Object o2)
			{
				final String s1 = (String) o1;
				final String s2 = (String) o2;
				return (s2.length() - s1.length());
			}// compare()
		});

		wsNames = sorted;
	}

	// search & replace.
	for(int i=0; i<wsNames.length; i++)
	{
		final String currentName = wsNames[i];

		int start = sb.indexOf(currentName, 0);
		while(start != -1)
		{
			final String shortName = getProvince(currentName).getShortName();
			sb.replace(start, start + currentName.length(), shortName);

			// Resume right after the inserted short name. The text has changed
			// length, so resuming at (start + currentName.length()), as the old
			// code did, skipped an occurrence that directly followed.
			start = sb.indexOf(currentName, start + shortName.length());
		}
	}
}// replaceProvinceNames()
/**
 * Removes power names and adjectives (e.g., "france") that appear after
 * the first whitespace character or colon. Text before that point is
 * never touched, which protects the leading power name (it is required).
 * <p>
 * A name is removed only if the character immediately before it is
 * whitespace; thus "prussia" does not become "p". If the text contains
 * neither whitespace nor a colon, nothing is changed.
 * <p>
 * <b>NOTE: assumes StringBuffer is all lower-case.</b>
 * <p>
 * This is a special-purpose method for Order parsing.
 *
 * @param sb the (lower-case) text to modify in place
 */
public void filterPowerNames(StringBuffer sb)
{
	// locate the first colon or whitespace character
	final int length = sb.length();
	int boundary = 0;
	while(boundary < length
		&& sb.charAt(boundary) != ':'
		&& !Character.isWhitespace(sb.charAt(boundary)))
	{
		boundary++;
	}

	if(boundary == length)
	{
		// no separator at all: nothing to filter
		return;
	}

	// NOTE(review): only the first occurrence of each name (at or after the
	// boundary) is examined; if it is not preceded by whitespace, any later
	// occurrence of the same name is left in place. Confirm this is intended.
	for(int i=0; i<lcPowerNames.length; i++)
	{
		final String powerName = lcPowerNames[i];
		final int found = sb.indexOf(powerName, boundary);
		if(found <= 0)
		{
			// not present (or at the very start, with no preceding character)
			continue;
		}

		if(Character.isWhitespace(sb.charAt(found - 1)))
		{
			sb.delete(found, found + powerName.length());
		}
	}
}// filterPowerNames()
/**
 * Extracts the leading power token of an order, as text.
 * <p>
 * The token is everything before the first colon or whitespace
 * character, trimmed. If a colon ends the token, the token is returned
 * without any checking (it is assumed to be a power). If whitespace
 * ends the token, it is returned only when it starts with one of the known
 * lower-case power names or adjectives. In every other case (including
 * text with neither colon nor whitespace) null is returned.
 * <p>
 * <b>NOTE: assumes StringBuffer is all lower-case, is trimmed, and
 * that power names DO NOT contain whitespace.</b>
 * <p>
 * This is a special-purpose method for Order parsing.
 * <p>
 * examples:
 * <pre>
 * france: xxx-yyy     returns "france"
 * fra: xxx-yyy        returns "fra" (assumed; it's before the colon)
 * fra xxx-yyy         returns null (fra not recognized)
 * xxx-yyy             returns null (no colon or whitespace)
 * </pre>
 *
 * @param sb the (lower-case, trimmed) order text
 * @return the leading token, or null if no power token is present
 */
public String getFirstPowerToken(StringBuffer sb)
{
	assert(lcPowerNames != null);

	// scan for the end of the first token, noting whether a colon ended it
	int tokenEnd = -1;
	boolean endedByColon = false;
	for(int i=0; i<sb.length() && tokenEnd < 0; i++)
	{
		final char c = sb.charAt(i);
		endedByColon = (c == ':');
		if(endedByColon || Character.isWhitespace(c))
		{
			tokenEnd = i;
		}
	}

	if(tokenEnd < 0)
	{
		// a single token; cannot tell if it is a power
		return null;
	}

	final String token = sb.substring(0, tokenEnd).trim();

	if(endedByColon)
	{
		// looser: whatever precedes the colon is taken to be a power name.
		return token;
	}

	// stricter: the token must begin with a known power name or adjective
	for(int i=0; i<lcPowerNames.length; i++)
	{
		if( token.startsWith(lcPowerNames[i]) )
		{
			return token;
		}
	}

	return null;
}// getFirstPowerToken()
/**
 * Resolves the leading power token of an order to a Power.
 * <p>
 * The token is everything before the first colon or whitespace
 * character, trimmed. If a colon ends the token, it is assumed to be a
 * power and is resolved loosely with getClosestPower(). If whitespace ends
 * the token, it is resolved with getPowerMatching(), but only when
 * it starts with one of the known lower-case power names or adjectives.
 * In every other case (including text with neither colon nor whitespace)
 * null is returned.
 * <p>
 * <b>NOTE: assumes input is all lower-case, is trimmed, and
 * that power names DO NOT contain whitespace.</b>
 * <p>
 * This is a special-purpose method for Order parsing.
 * <p>
 * examples:
 * <pre>
 * france: xxx-yyy     resolved via getClosestPower("france")
 * fra: xxx-yyy        resolved via getClosestPower("fra")
 * fra xxx-yyy         returns null (fra not recognized)
 * xxx-yyy             returns null (no colon or whitespace)
 * </pre>
 *
 * @param input the (lower-case, trimmed) order text
 * @return the Power named by the leading token, or null if there is none
 */
public Power getFirstPower(String input)
{
	assert(lcPowerNames != null);

	// scan for the end of the first token, noting whether a colon ended it
	int tokenEnd = -1;
	boolean endedByColon = false;
	for(int i=0; i<input.length() && tokenEnd < 0; i++)
	{
		final char c = input.charAt(i);
		endedByColon = (c == ':');
		if(endedByColon || Character.isWhitespace(c))
		{
			tokenEnd = i;
		}
	}

	if(tokenEnd < 0)
	{
		// a single token; cannot tell if it is a power
		return null;
	}

	final String token = input.substring(0, tokenEnd).trim();

	if(endedByColon)
	{
		// looser: whatever precedes the colon is taken to be a power name.
		return getClosestPower(token);
	}

	// stricter: the token must begin with a known power name or adjective
	for(int i=0; i<lcPowerNames.length; i++)
	{
		if( token.startsWith(lcPowerNames[i]) )
		{
			return getPowerMatching(token);
		}
	}

	return null;
}// getFirstPower()
/**
 * Looks up a Province by its index.
 *
 * @param i the position in the Province array (the constructor verifies
 *          that this equals the Province's own index)
 * @return the Province stored at position i
 * @throws ArrayIndexOutOfBoundsException if i is not a valid position
 */
public final Province reverseIndex(int i)
{
	return this.provinces[i];
}// reverseIndex()
/**
 * Creates the reverse-sorted list of lower-case power names required by
 * getFirstPowerToken(), filterPowerNames(), and other methods.
 * <p>
 * The list includes every name of every Power, plus each Power's adjective.
 */
private void createLCPowerNameList()
{
	final List collected = new ArrayList(powers.length);

	int p = 0;
	while(p < powers.length)
	{
		final Power current = powers[p];
		final String[] powerNames = current.getNames();

		int n = 0;
		while(n < powerNames.length)
		{
			collected.add(powerNames[n].toLowerCase());
			n++;
		}

		collected.add(current.getAdjective().toLowerCase());
		p++;
	}

	// Reverse alphabetical order puts a longer name ahead of any name that
	// is its prefix ("russian" before "russia"), so that replacement
	// always sees the longer form first.
	Collections.sort( collected, Collections.reverseOrder() );

	lcPowerNames = (String[]) collected.toArray(new String[collected.size()]);
}// createLCPowerNameList()
/*
Deprecated
match string against another.
if src > dest, -1
higher number == closer!
not ideal for checking exact match.
we stop checking at the first letter that doesn't compare.
assumes: SRC is lower case
DEST lower case (now...)
private int getCloseness(String src, String dest)
{
if(src.length() > dest.length())
{
return -1;
}
int numCharsMatching = 0;
for(int i=0; i<src.length(); i++)
{
//if(src.charAt(i) != Character.toLowerCase(dest.charAt(i))) // OLD
if(src.charAt(i) != dest.charAt(i))
{
break;
}
numCharsMatching++;
}
return numCharsMatching;
}// getCloseness()
*/
/**
 * Performs a partial (prefix) match of the input against all province
 * names, full and short. The input must be trimmed and all lower case.
 * Returns a List which will be:
 * <ol>
 * <li>Empty, if no match occurs</li>
 * <li>One item, if a single ("best") match occurred</li>
 * <li>Multiple items, if ties occur</li>
 * </ol>
 * Null is never returned.
 * <p>
 * For example: given provinces "Liverpool" and "Livonia", and "Loveland":<br>
 * "li", and "liv" will return a List of 2 items<br>
 * "liver" will return a List of 1 item (Liverpool)<br>
 * "xsdf" will return a List of 0 items.<br>
 * <p>
 * If a province matches through several of its names (different names,
 * same object), only ONE reference to the object will be returned
 * in the list.
 * <p>
 * The reason this is important is it is more reliable than Levenshtein
 * for matching some types of short strings.
 *
 * @param input the trimmed, lower-case prefix to look for
 * @return a List of the distinct Provinces with a name starting with input
 */
private List findPartialProvinceMatch(String input)
{
	// a Set, so that a province matched via several names appears once
	final HashSet ties = new HashSet(41);

	// Walk ALL province names. The previous loop was bounded by
	// lcPowerNames.length, so only the first few province names were checked.
	for(int i=0; i<names.length; i++)
	{
		final String provName = names[i];
		if(provName.startsWith(input))
		{
			ties.add( getProvince(provName) );	// should NEVER be null
		}
	}

	final ArrayList al = new ArrayList(ties.size());
	al.addAll(ties);
	return al;
}// findPartialProvinceMatch()
/**
 * Performs a partial (prefix) match of the input against all lower-case
 * power names and adjectives; the power counterpart of
 * findPartialProvinceMatch().
 * <p>
 * A Power that matches through several of its names appears only once in
 * the result. Null is never returned.
 *
 * @param input the lower-case prefix to look for
 * @return a List of the distinct Powers with a name starting with input
 */
private List findPartialPowerMatch(String input)
{
	// a Set, so that a power matched via several names appears once
	final HashSet matched = new HashSet(41);

	int i = 0;
	while(i < lcPowerNames.length)
	{
		final String candidate = lcPowerNames[i];
		if(candidate.startsWith(input))
		{
			matched.add( getPower(candidate) );	// should NEVER be null
		}

		i++;
	}

	final ArrayList result = new ArrayList(matched.size());
	result.addAll(matched);
	return result;
}// findPartialPowerMatch()
/**
 * Computes the Levenshtein edit distance between two strings.
 * <p>
 * Based on code by Michael Gilleland, Merriam Park Software.
 */
private static class Distance
{
	/** Returns the smallest of the three given values. */
	private static int getMin(int a, int b, int c)
	{
		final int smallerOfAB = (b < a) ? b : a;
		return (c < smallerOfAB) ? c : smallerOfAB;
	}// getMin()


	/**
	 * Computes the Levenshtein distance between s and t: the minimum number
	 * of single-character insertions, deletions, and substitutions needed
	 * to turn one string into the other.
	 *
	 * @param s the first string (must not be null)
	 * @param t the second string (must not be null)
	 * @return the edit distance; 0 if the strings are equal
	 */
	public static int getLD(String s, String t)
	{
		final int rows = s.length();
		final int cols = t.length();

		// if one string is empty, the distance is the length of the other
		if(rows == 0)
		{
			return cols;
		}

		if(cols == 0)
		{
			return rows;
		}

		// table[r][c] = distance between the first r chars of s
		// and the first c chars of t
		final int[][] table = new int[rows + 1][cols + 1];

		for(int r = 0; r <= rows; r++)
		{
			table[r][0] = r;
		}

		for(int c = 0; c <= cols; c++)
		{
			table[0][c] = c;
		}

		for(int r = 1; r <= rows; r++)
		{
			final char sChar = s.charAt(r - 1);

			for(int c = 1; c <= cols; c++)
			{
				// substitution costs nothing when the characters agree
				final int substitution = table[r-1][c-1] + ((sChar == t.charAt(c - 1)) ? 0 : 1);
				final int deletion = table[r-1][c] + 1;
				final int insertion = table[r][c-1] + 1;

				table[r][c] = getMin(deletion, insertion, substitution);
			}// for(c)
		}// for(r)

		return table[rows][cols];
	}// getLD()
}// inner class Distance
/**
 * De-serialization hook: reads the serialized (non-transient) fields,
 * then rebuilds the transient name lookup tables, which are not saved.
 *
 * @param in the stream to read this object from
 * @throws IOException if thrown while reading the default fields
 * @throws ClassNotFoundException if thrown while reading the default fields
 */
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException
{
	// restore the serialized fields (powers, provinces)
	in.defaultReadObject();

	// the lookup tables are transient; derive them again
	this.createMappings();
}// readObject()
}// class Map
///////