/*
* Sifarish: Recommendation Engine
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.sifarish.etl;
import java.io.IOException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.sifarish.feature.DynamicAttrSimilarityStrategy;
import org.sifarish.util.Field;
/**
 * Standard formatting rules for United States data: letter case, phone
 * numbers, state names/codes and street addresses.
 *
 * Dictionary based normalization is delegated to the token normalizers
 * obtained from the {@link StructuredTextNormalizer} supplied at construction.
 *
 * @author pranab
 */
public class UnitedStatesStandardFormat extends CountryStandardFormat {
    private final StructuredTextNormalizer textNormalizer;

    /**
     * @param textNormalizer source of the per field type token normalizers
     *        used by the state and address formatting methods
     */
    public UnitedStatesStandardFormat(StructuredTextNormalizer textNormalizer) {
        super();
        this.textNormalizer = textNormalizer;
    }

    /**
     * Intentionally empty in this implementation.
     *
     * @see org.sifarish.etl.CountryStandardFormat#intializeStateCodes()
     */
    public void intializeStateCodes() {
    }

    /**
     * Applies a case format to every whitespace separated word of the item.
     * Surrounding whitespace is dropped and runs of whitespace between words
     * are collapsed to a single space.
     *
     * @param item text to format
     * @param format one of "lowerCase", "upperCase" or "capitalize"
     * @return the formatted words joined by a single space
     * @throws IllegalArgumentException if format is not one of the supported values
     * @see org.sifarish.etl.CountryStandardFormat#caseFormat(java.lang.String, java.lang.String)
     */
    public String caseFormat(String item, String format) {
        String[] tokens = item.trim().split("\\s+");
        for (int i = 0; i < tokens.length; ++i) {
            if (format.equals("lowerCase")) {
                tokens[i] = tokens[i].toLowerCase();
            } else if (format.equals("upperCase")) {
                tokens[i] = tokens[i].toUpperCase();
            } else if (format.equals("capitalize")) {
                tokens[i] = StringUtils.capitalize(tokens[i].toLowerCase());
            } else {
                throw new IllegalArgumentException("invalid case format");
            }
        }
        //words must stay separated, so join with a space and not an empty string
        return org.chombo.util.Utility.join(tokens, " ");
    }

    /**
     * Reduces a phone number to its digits and lays them out in the requested format.
     * <ul>
     * <li>"compact": digits only</li>
     * <li>"areaCodeParen": first three digits in parentheses followed by the rest</li>
     * <li>"spaceSep": digits grouped as 3, 3 and the remainder, separated by spaces</li>
     * </ul>
     *
     * @param item phone number, possibly containing punctuation and spaces
     * @param format one of "compact", "areaCodeParen" or "spaceSep"
     * @return the formatted phone number
     * @throws IllegalArgumentException if format is not one of the supported values
     * @throws StringIndexOutOfBoundsException if the item has fewer digits than
     *         the requested grouping needs
     * @see org.sifarish.etl.CountryStandardFormat#phoneNumFormat(java.lang.String, java.lang.String)
     */
    public String phoneNumFormat(String item, String format) {
        //strip every non digit character; "^\\d" would only remove a leading digit
        item = item.replaceAll("\\D", "");
        if (format.equals("compact")) {
            //digits only, nothing more to do
        } else if (format.equals("areaCodeParen")) {
            item = "(" + item.substring(0, 3) + ")" + item.substring(3);
        } else if (format.equals("spaceSep")) {
            item = item.substring(0, 3) + " " + item.substring(3, 6) + " " + item.substring(6);
        } else {
            //same contract as caseFormat: an unknown format is a caller error
            throw new IllegalArgumentException("invalid phone number format");
        }
        return item;
    }

    /**
     * Formats a state without fuzzy matching.
     *
     * @param item state name or two letter code
     * @return the formatted state
     * @throws IOException propagated from the underlying formatting call
     * @see org.sifarish.etl.CountryStandardFormat#stateFormat(java.lang.String)
     */
    public String stateFormat(String item) throws IOException {
        return stateFormat(item, false, null, 0);
    }

    /**
     * Formats a state. A two character item is treated as a state code and upper
     * cased; anything else is passed through the state token normalizer. When
     * fuzzy matching is enabled and the exact lookup did not succeed, the closest
     * match is used provided its distance does not exceed minDist.
     *
     * @param item state name or two letter code
     * @param fuzzyMatch whether to fall back to fuzzy matching
     * @param textSimStrategy similarity strategy for fuzzy matching; only used
     *        when fuzzyMatch is true
     * @param minDist largest distance at which a fuzzy match is accepted
     * @return the formatted state, or the item (upper cased if it is two
     *         characters long) when nothing matched
     * @throws IOException declared by the parent contract
     * @see org.sifarish.etl.CountryStandardFormat#stateFormat(java.lang.String, boolean, org.sifarish.feature.DynamicAttrSimilarityStrategy, double)
     */
    public String stateFormat(String item, boolean fuzzyMatch, DynamicAttrSimilarityStrategy textSimStrategy,
            double minDist) throws IOException {
        String newItem = item;
        TextFieldTokenNormalizer tokenNormalizer =
                textNormalizer.findTokenNormalizer(Field.TEXT_TYPE_STATE);
        if (newItem.length() == 2) {
            //state code; locale independent upper casing so the lookup does not
            //depend on the default locale of the JVM
            newItem = newItem.toUpperCase(java.util.Locale.ROOT);
            if (fuzzyMatch && !tokenNormalizer.containsNormalize(newItem)) {
                //try fuzzy matching
                Pair<String, Double> match = tokenNormalizer.fuzzymatchWithNormalized(newItem, textSimStrategy);
                if (match.getRight() <= minDist) {
                    newItem = match.getLeft();
                }
            }
        } else {
            newItem = tokenNormalizer.normalize(newItem);
            if (fuzzyMatch && newItem.equals(item)) {
                //normalization changed nothing, try fuzzy matching
                Pair<String, Double> match = tokenNormalizer.fuzzymatchWithUnnormalized(newItem, textSimStrategy);
                if (match.getRight() <= minDist) {
                    newItem = match.getLeft();
                    newItem = tokenNormalizer.normalize(newItem);
                }
            }
        }
        return newItem;
    }

    /**
     * Formats a street address by applying the address line one formatting
     * followed by the address line two formatting.
     *
     * @param item street address
     * @return the formatted street address
     * @throws IOException propagated from the underlying formatting calls
     * @see org.sifarish.etl.CountryStandardFormat#streetAddressFormat(java.lang.String)
     */
    public String streetAddressFormat(String item) throws IOException {
        String newItem = streetAddressOneFormat(item);
        return streetAddressTwoFormat(newItem);
    }

    /**
     * Formats address line one without fuzzy matching.
     *
     * @param item address line one
     * @return the formatted address line
     * @throws IOException propagated from the underlying formatting call
     * @see org.sifarish.etl.CountryStandardFormat#streetAddressOneFormat(java.lang.String)
     */
    public String streetAddressOneFormat(String item) throws IOException {
        return streetAddressOneFormat(item, false, null, 0);
    }

    /**
     * Formats address line one. The item is passed through the address line one
     * token normalizer. If that changes nothing and fuzzy matching is enabled,
     * the last word is taken as the street type and, when a fuzzy match is
     * found for it, replaced by that match.
     *
     * @param item address line one
     * @param fuzzyMatch whether to fall back to fuzzy matching of the street type
     * @param textSimStrategy similarity strategy for fuzzy matching; only used
     *        when fuzzyMatch is true
     * @param minDist distance threshold handed to the fuzzy matcher
     * @return the formatted address line
     * @throws IOException declared by the parent contract
     * @see org.sifarish.etl.CountryStandardFormat#streetAddressOneFormat(java.lang.String, boolean, org.sifarish.feature.DynamicAttrSimilarityStrategy, double)
     */
    public String streetAddressOneFormat(String item, boolean fuzzyMatch, DynamicAttrSimilarityStrategy textSimStrategy,
            double minDist) throws IOException {
        TextFieldTokenNormalizer tokenNormalizer =
                textNormalizer.findTokenNormalizer(Field.TEXT_TYPE_STREET_ADDRESS_ONE);
        String newItem = tokenNormalizer.normalize(item);
        if (fuzzyMatch && newItem.equals(item)) {
            String[] elements = newItem.split("\\s+");
            String streetType = elements[elements.length - 1];
            tokenNormalizer = textNormalizer.findTokenNormalizer(Field.TEXT_TYPE_STREET_TYPE);
            if (!tokenNormalizer.containsNormalize(streetType)) {
                //try fuzzy matching
                Pair<Boolean, String> fuzzyMatched = fuzyyMatchComponent(streetType, tokenNormalizer,
                        textSimStrategy, minDist);
                if (fuzzyMatched.getLeft()) {
                    //keep every word before the street type, then append the replacement
                    StringBuilder stBld = new StringBuilder();
                    for (int i = 0; i < elements.length - 1; ++i) {
                        stBld.append(elements[i]).append(" ");
                    }
                    stBld.append(fuzzyMatched.getRight());
                    newItem = stBld.toString();
                }
            }
        }
        return newItem;
    }

    /**
     * Formats address line two without fuzzy matching.
     *
     * @param item address line two
     * @return the formatted address line
     * @throws IOException propagated from the underlying formatting call
     * @see org.sifarish.etl.CountryStandardFormat#streetAddressTwoFormat(java.lang.String)
     */
    public String streetAddressTwoFormat(String item) throws IOException {
        return streetAddressTwoFormat(item, false, null, 0);
    }

    /**
     * Formats address line two. The item is passed through the address line two
     * token normalizer. If that changes nothing and fuzzy matching is enabled,
     * the first word is taken as the unit type and, when a fuzzy match is found
     * for it, replaced by that match while all following words are kept.
     *
     * @param item address line two
     * @param fuzzyMatch whether to fall back to fuzzy matching of the unit type
     * @param textSimStrategy similarity strategy for fuzzy matching; only used
     *        when fuzzyMatch is true
     * @param minDist distance threshold handed to the fuzzy matcher
     * @return the formatted address line
     * @throws IOException declared by the parent contract
     * @see org.sifarish.etl.CountryStandardFormat#streetAddressTwoFormat(java.lang.String, boolean, org.sifarish.feature.DynamicAttrSimilarityStrategy, double)
     */
    public String streetAddressTwoFormat(String item, boolean fuzzyMatch, DynamicAttrSimilarityStrategy textSimStrategy,
            double minDist) throws IOException {
        TextFieldTokenNormalizer tokenNormalizer =
                textNormalizer.findTokenNormalizer(Field.TEXT_TYPE_STREET_ADDRESS_TWO);
        String newItem = tokenNormalizer.normalize(item);
        if (fuzzyMatch && newItem.equals(item)) {
            String[] elements = newItem.split("\\s+");
            String unitType = elements[0];
            if (!tokenNormalizer.containsNormalize(unitType)) {
                //try fuzzy matching
                Pair<Boolean, String> fuzzyMatched = fuzyyMatchComponent(unitType, tokenNormalizer,
                        textSimStrategy, minDist);
                if (fuzzyMatched.getLeft()) {
                    //replacement unit type followed by all remaining words, including the last one
                    StringBuilder stBld = new StringBuilder(fuzzyMatched.getRight());
                    for (int i = 1; i < elements.length; ++i) {
                        stBld.append(" ").append(elements[i]);
                    }
                    newItem = stBld.toString();
                }
            }
        }
        return newItem;
    }

    /**
     * Whole address in one field, with lines separated by newline characters.
     * The text is passed through the address line one and address line two
     * token normalizers. If the result has exactly three lines, the second
     * line is additionally broken in front of "Apartment" and "Suite".
     *
     * @param item complete address
     * @return the formatted address, lines separated by newline characters
     */
    public String addressFormat(String item) {
        TextFieldTokenNormalizer tokenNormalizer =
                textNormalizer.findTokenNormalizer(Field.TEXT_TYPE_STREET_ADDRESS_ONE);
        String newItem = tokenNormalizer.normalize(item);
        tokenNormalizer = textNormalizer.findTokenNormalizer(Field.TEXT_TYPE_STREET_ADDRESS_TWO);
        newItem = tokenNormalizer.normalize(newItem);
        String[] lines = newItem.split("\\n");

        //break address line 2
        if (lines.length == 3) {
            lines[1] = breakAddressLine(lines[1], "Apartment");
            lines[1] = breakAddressLine(lines[1], "Suite");
            //re-join with real newline characters, the same separator the split above uses
            newItem = lines[0] + "\n" + lines[1] + "\n" + lines[2];
        }
        return newItem;
    }

    /**
     * Inserts a newline character in front of the first occurrence of the unit
     * word, so that the unit ends up on a line of its own.
     *
     * @param line address line
     * @param unit unit word to break at, e.g. "Apartment"
     * @return the line with a newline inserted before the unit word, or the
     *         unchanged line if the unit word is absent or already at the start
     */
    private String breakAddressLine(String line, String unit) {
        int unitPos = line.indexOf(unit);
        if (unitPos <= 0) {
            //not found, or nothing precedes the unit so there is nothing to break off
            return line;
        }
        return line.substring(0, unitPos) + "\n" + line.substring(unitPos);
    }
}