package er.chronic.numerizer;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Rewrites English number words inside free text as digit strings, e.g.
 * {@code "one hundred and five"} becomes {@code "105"} and
 * {@code "three and a half"} becomes {@code "3.5"}.
 *
 * <p>The conversion is a fixed sequence of regular-expression passes. While the
 * passes run, every recognised number is tagged with a leading {@code <num>}
 * marker so that later passes can tell converted numbers apart from digits that
 * were already in the input; the markers are stripped before the result is
 * returned.
 *
 * <p>Known limitation: most direct number words (e.g. {@code "one"},
 * {@code "ten"}) are matched without word boundaries, so they are also replaced
 * when they occur inside longer words.
 */
public class Numerizer {
  /** A number word (as a case-insensitive regex) and the replacement text for it. */
  protected static class DirectNum {
    private final Pattern _name;
    private final String _number;

    /**
     * @param name regex source matching the number word
     * @param number replacement string; may contain group references such as {@code $1}
     */
    public DirectNum(String name, String number) {
      _name = Pattern.compile(name, Pattern.CASE_INSENSITIVE);
      _number = number;
    }

    /** @return the compiled, case-insensitive pattern for the number word */
    public Pattern getName() {
      return _name;
    }

    /** @return the replacement string (digits, optionally followed by a group reference) */
    public String getNumber() {
      return _number;
    }
  }

  /** A number word that combines with a neighbouring number (tens, hundreds, thousands, ...). */
  protected static class Prefix {
    private final String _name;
    private final Pattern _pattern;
    private final long _number;

    /**
     * @param name the number word, used as regex source
     * @param pattern pattern matching the word together with its neighbouring number
     * @param number numeric value of the word
     */
    public Prefix(String name, Pattern pattern, long number) {
      _name = name;
      _pattern = pattern;
      _number = number;
    }

    /** @return the number word */
    public String getName() {
      return _name;
    }

    /** @return the pattern matching the word together with its neighbouring number */
    public Pattern getPattern() {
      return _pattern;
    }

    /** @return the numeric value of the word */
    public long getNumber() {
      return _number;
    }
  }

  /**
   * A multiple of ten ("twenty", "thirty", ...). Its main pattern matches the word
   * followed by a tagged number; group 1 is the trailing single digit, or
   * {@code null} when the tagged number is not a single digit.
   */
  protected static class TenPrefix extends Prefix {
    private final Pattern _barePattern;

    public TenPrefix(String name, long number) {
      super(name, Pattern.compile("(?:" + name + ") *<num>(\\d(?=\\D|$))*", Pattern.CASE_INSENSITIVE), number);
      // Compiled once here instead of on every numerize() call.
      _barePattern = Pattern.compile(name, Pattern.CASE_INSENSITIVE);
    }

    /** @return case-insensitive pattern matching the word on its own */
    public Pattern getBarePattern() {
      return _barePattern;
    }
  }

  /**
   * A power-of-ten word ("hundred", "thousand", ...). Its pattern matches an
   * optional tagged multiplier followed by the word; group 1 holds the
   * multiplier digits and is the empty string when there is none.
   */
  protected static class BigPrefix extends Prefix {
    public BigPrefix(String name, long number) {
      super(name, Pattern.compile("(?:<num>)?(\\d*) *" + name, Pattern.CASE_INSENSITIVE), number);
    }
  }

  protected static DirectNum[] DIRECT_NUMS;
  protected static TenPrefix[] TEN_PREFIXES;
  protected static BigPrefix[] BIG_PREFIXES;

  static {
    // Order matters: the replacements are applied in sequence, so the teens are
    // listed before the shorter words they contain (e.g. "fourteen" before "four").
    List<DirectNum> directNums = new LinkedList<>();
    directNums.add(new DirectNum("eleven", "11"));
    directNums.add(new DirectNum("twelve", "12"));
    directNums.add(new DirectNum("thirteen", "13"));
    directNums.add(new DirectNum("fourteen", "14"));
    directNums.add(new DirectNum("fifteen", "15"));
    directNums.add(new DirectNum("sixteen", "16"));
    directNums.add(new DirectNum("seventeen", "17"));
    directNums.add(new DirectNum("eighteen", "18"));
    directNums.add(new DirectNum("nineteen", "19"));
    directNums.add(new DirectNum("ninteen", "19")); // Common mis-spelling
    directNums.add(new DirectNum("zero", "0"));
    directNums.add(new DirectNum("one", "1"));
    directNums.add(new DirectNum("two", "2"));
    directNums.add(new DirectNum("three", "3"));
    directNums.add(new DirectNum("four(\\W|$)", "4$1")); // The trailing group is so that it matches four but not fourty
    directNums.add(new DirectNum("five", "5"));
    directNums.add(new DirectNum("six(\\W|$)", "6$1"));
    directNums.add(new DirectNum("seven(\\W|$)", "7$1"));
    directNums.add(new DirectNum("eight(\\W|$)", "8$1"));
    directNums.add(new DirectNum("nine(\\W|$)", "9$1"));
    directNums.add(new DirectNum("ten", "10"));
    directNums.add(new DirectNum("\\ba\\b(.)", "1$1")); // a standalone "a" counts as 1
    Numerizer.DIRECT_NUMS = directNums.toArray(new DirectNum[directNums.size()]);

    List<TenPrefix> tenPrefixes = new LinkedList<>();
    tenPrefixes.add(new TenPrefix("twenty", 20));
    tenPrefixes.add(new TenPrefix("thirty", 30));
    tenPrefixes.add(new TenPrefix("fourty", 40)); // Common mis-spelling
    tenPrefixes.add(new TenPrefix("forty", 40));
    tenPrefixes.add(new TenPrefix("fifty", 50));
    tenPrefixes.add(new TenPrefix("sixty", 60));
    tenPrefixes.add(new TenPrefix("seventy", 70));
    tenPrefixes.add(new TenPrefix("eighty", 80));
    tenPrefixes.add(new TenPrefix("ninety", 90));
    tenPrefixes.add(new TenPrefix("ninty", 90)); // Common mis-spelling
    Numerizer.TEN_PREFIXES = tenPrefixes.toArray(new TenPrefix[tenPrefixes.size()]);

    List<BigPrefix> bigPrefixes = new LinkedList<>();
    bigPrefixes.add(new BigPrefix("hundred", 100L));
    bigPrefixes.add(new BigPrefix("thousand", 1000L));
    bigPrefixes.add(new BigPrefix("million", 1000000L));
    bigPrefixes.add(new BigPrefix("billion", 1000000000L));
    bigPrefixes.add(new BigPrefix("trillion", 1000000000000L));
    Numerizer.BIG_PREFIXES = bigPrefixes.toArray(new BigPrefix[bigPrefixes.size()]);
  }

  // Equivalent of the Ruby gsub(/ +|([^\d])-([^\d])/, '\1 \2'): collapses runs of
  // spaces and turns a hyphen between two non-digits into a space.
  private static final Pattern DEHYPHENATOR = Pattern.compile(" +|(\\D)-(\\D)");
  private static final Pattern DEHALFER = Pattern.compile("a half", Pattern.CASE_INSENSITIVE);
  private static final Pattern DEHAALFER = Pattern.compile("(\\d+)(?: | and |-)*haAlf", Pattern.CASE_INSENSITIVE);
  private static final Pattern ANDITION_PATTERN = Pattern.compile("<num>(\\d+)( | and )<num>(\\d+)(?=\\W|$)", Pattern.CASE_INSENSITIVE);
  // A digit string with at least one character followed by one or more trailing zeros.
  private static final Pattern ROUND_NUMBER = Pattern.compile("^.+0+$");

  /**
   * Replaces the number words in {@code str} with digits.
   *
   * @param str the text to convert; must not be {@code null}
   * @return the text with recognised number words replaced by digit strings
   */
  public static String numerize(String str) {
    String numerizedStr = str;

    // preprocess
    numerizedStr = DEHYPHENATOR.matcher(numerizedStr).replaceAll("$1 $2"); // will mutilate hyphenated-words but shouldn't matter for date extraction
    numerizedStr = DEHALFER.matcher(numerizedStr).replaceAll("haAlf"); // take the 'a' out so it doesn't turn into a 1, save the half for the end

    // easy/direct replacements
    for (DirectNum dn : DIRECT_NUMS) {
      numerizedStr = dn.getName().matcher(numerizedStr).replaceAll("<num>" + dn.getNumber());
    }

    // twenty, thirty, etc. followed by a tagged number ("twenty <num>1" -> "<num>21")
    for (TenPrefix tp : TEN_PREFIXES) {
      numerizedStr = addToTenPrefix(tp, numerizedStr);
    }
    // ... and whatever tens are left standing on their own
    for (TenPrefix tp : TEN_PREFIXES) {
      numerizedStr = tp.getBarePattern().matcher(numerizedStr).replaceAll("<num>" + tp.getNumber());
    }

    // hundreds, thousands, millions, etc.; adjacent numbers are summed after each
    // magnitude so that the next, larger magnitude sees a single multiplier
    for (BigPrefix bp : BIG_PREFIXES) {
      numerizedStr = multiplyBigPrefix(bp, numerizedStr);
      numerizedStr = andition(numerizedStr);
    }

    // fractional addition, kept as a separate textual pass so whole numbers never pick up a ".0"
    numerizedStr = addHalves(numerizedStr);

    return numerizedStr.replace("<num>", "");
  }

  /**
   * Merges neighbouring tagged numbers into their sum. Two numbers are merged when
   * they are joined by {@code " and "}, or when they are separated by a single
   * space and the first one is longer than the second and ends in zeros
   * (e.g. {@code "<num>100 <num>5"} becomes {@code "<num>105"}).
   *
   * @param str text containing {@code <num>}-tagged numbers; must not be {@code null}
   * @return the text with mergeable number pairs replaced by their tagged sum
   */
  public static String andition(String str) {
    StringBuilder anditionStr = new StringBuilder(str);
    Matcher matcher = ANDITION_PATTERN.matcher(anditionStr);
    while (matcher.find()) {
      String left = matcher.group(1);
      String joiner = matcher.group(2);
      String right = matcher.group(3);
      boolean joinedByAnd = joiner.equalsIgnoreCase(" and ");
      if (joinedByAnd || (left.length() > right.length() && ROUND_NUMBER.matcher(left).matches())) {
        // long arithmetic: the big-prefix pass produces values well beyond int range
        long sum = Long.parseLong(left) + Long.parseLong(right);
        anditionStr.replace(matcher.start(), matcher.end(), "<num>" + sum);
        // the buffer changed underneath the matcher, so restart from the beginning
        matcher = ANDITION_PATTERN.matcher(anditionStr);
      }
    }
    return anditionStr.toString();
  }

  /** Replaces each "ten-word + tagged number" match with the tagged sum of the two. */
  private static String addToTenPrefix(Prefix tp, String text) {
    Matcher matcher = tp.getPattern().matcher(text);
    StringBuffer buffer = new StringBuffer();
    while (matcher.find()) {
      String unit = matcher.group(1);
      long value = tp.getNumber();
      if (unit != null) {
        value += Long.parseLong(unit.trim());
      }
      matcher.appendReplacement(buffer, "<num>" + value);
    }
    matcher.appendTail(buffer);
    return buffer.toString();
  }

  /**
   * Replaces each "multiplier + big-word" match with the tagged product. A word
   * without a multiplier stands for its own value ("hundred" -> 100).
   */
  private static String multiplyBigPrefix(Prefix bp, String text) {
    Matcher matcher = bp.getPattern().matcher(text);
    StringBuffer buffer = new StringBuffer();
    while (matcher.find()) {
      String multiplier = matcher.group(1);
      String replacement;
      if (multiplier == null || multiplier.trim().isEmpty()) {
        // No multiplier: the spaces the pattern consumed belong to the surrounding
        // text, so put them back in front of the number.
        replacement = spacesAfterMultiplier(text, matcher) + "<num>" + bp.getNumber();
      }
      else {
        replacement = "<num>" + (bp.getNumber() * Long.parseLong(multiplier.trim()));
      }
      matcher.appendReplacement(buffer, replacement);
    }
    matcher.appendTail(buffer);
    return buffer.toString();
  }

  /** Returns the run of spaces matched directly after group 1 of the current match. */
  private static String spacesAfterMultiplier(String text, Matcher matcher) {
    int from = matcher.end(1);
    if (from < 0) {
      return "";
    }
    int to = from;
    while (to < matcher.end() && text.charAt(to) == ' ') {
      to++;
    }
    return text.substring(from, to);
  }

  /** Replaces each "number [and] haAlf" match with the number plus one half. */
  private static String addHalves(String text) {
    Matcher matcher = DEHAALFER.matcher(text);
    StringBuffer buffer = new StringBuffer();
    while (matcher.find()) {
      // Group 1 is digits only, so appending ".5" is exact for any magnitude,
      // unlike float arithmetic.
      matcher.appendReplacement(buffer, stripLeadingZeros(matcher.group(1)) + ".5");
    }
    matcher.appendTail(buffer);
    return buffer.toString();
  }

  /** Drops leading zeros from a digit string, always keeping at least one digit. */
  private static String stripLeadingZeros(String digits) {
    int firstSignificant = 0;
    while (firstSignificant < digits.length() - 1 && digits.charAt(firstSignificant) == '0') {
      firstSignificant++;
    }
    return digits.substring(firstSignificant);
  }
}