/**
* Copyright 2015 StreamSets Inc.
*
* Licensed under the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.streamsets.pipeline.lib.fuzzy;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.Sets;
import com.google.common.collect.Sets.SetView;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
public class FuzzyMatch {
private static final Logger LOG = LoggerFactory.getLogger(FuzzyMatch.class);
private static final Pattern removeNonAlphaNum = Pattern.compile("[^\\w+]", Pattern.UNICODE_CHARACTER_CLASS);
private static final Pattern camelAndSnakeCaseSplitter = Pattern.compile("(?=\\p{Lu})|(_)");
private FuzzyMatch() {}
// Build a cache with sane defaults. Shouldn't have to ask user to configure this.
private static LoadingCache<Pair<String, String>, Integer> ratioCache = CacheBuilder.newBuilder()
.maximumSize(1000)
.expireAfterAccess(1, TimeUnit.SECONDS)
.build(
new CacheLoader<Pair<String, String>, Integer>() {
@Override
public Integer load(Pair<String, String> pair) {
return FuzzyMatch.getRatio(pair.getLeft(), pair.getRight());
}
}
);
public static int getRatio(String s1, String s2, boolean useCache) {
if (useCache) {
return ratioCache.getUnchecked(Pair.of(s1, s2));
} else {
return getRatio(s1, s2);
}
}
/*
* t0 = [SORTED_INTERSECTION]
* t1 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1]
* t2 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
*
* outcome = max(t0,t1,t2)
*
*/
public static int getRatio(String s1, String s2) {
if (s1.length() >= s2.length()) {
// We need to swap s1 and s2
String temp = s2;
s2 = s1;
s1 = temp;
}
// Get alpha numeric characters
Set<String> set1 = tokenizeString(escapeString(s1));
Set<String> set2 = tokenizeString(escapeString(s2));
SetView<String> intersection = Sets.intersection(set1, set2);
TreeSet<String> sortedIntersection = Sets.newTreeSet(intersection);
if (LOG.isTraceEnabled()) {
StringBuilder sortedSb = new StringBuilder();
for (String s : sortedIntersection) {
sortedSb.append(s).append(" ");
}
LOG.trace("Sorted intersection --> {}", sortedSb.toString());
}
// Find out difference of sets set1 and intersection of set1,set2
SetView<String> restOfSet1 = Sets.symmetricDifference(set1, intersection);
// Sort it
TreeSet<String> sortedRestOfSet1 = Sets.newTreeSet(restOfSet1);
SetView<String> restOfSet2 = Sets.symmetricDifference(set2, intersection);
TreeSet<String> sortedRestOfSet2 = Sets.newTreeSet(restOfSet2);
if (LOG.isTraceEnabled()) {
StringBuilder sb1 = new StringBuilder();
for (String s : sortedRestOfSet1) {
sb1.append(s).append(" ");
}
LOG.trace("Sorted rest of 1 --> {}", sb1.toString());
StringBuilder sb2 = new StringBuilder();
for (String s : sortedRestOfSet1) {
sb2.append(s).append(" ");
}
LOG.trace("Sorted rest of 2 --> {}", sb2.toString());
}
StringBuilder t0Builder = new StringBuilder("");
StringBuilder t1Builder = new StringBuilder("");
StringBuilder t2Builder = new StringBuilder("");
for (String s : sortedIntersection) {
t0Builder.append(" ").append(s);
}
String t0 = t0Builder.toString().trim();
Set<String> setT1 = Sets.union(sortedIntersection, sortedRestOfSet1);
for (String s : setT1) {
t1Builder.append(" ").append(s);
}
String t1 = t1Builder.toString().trim();
Set<String> setT2 = Sets.union(intersection, sortedRestOfSet2);
for (String s : setT2) {
t2Builder.append(" ").append(s);
}
String t2 = t2Builder.toString().trim();
int amt1 = calculateLevenshteinDistance(t0, t1);
int amt2 = calculateLevenshteinDistance(t0, t2);
int amt3 = calculateLevenshteinDistance(t1, t2);
LOG.trace("t0 = {} --> {}", t0, amt1);
LOG.trace("t1 = {} --> {}", t1, amt2);
LOG.trace("t2 = {} --> {}", t2, amt3);
return Math.max(Math.max(amt1, amt2), amt3);
}
private static Set<String> tokenizeString(String str) {
Set<String> set = new HashSet<>();
// Normalize and tokenize the input strings before storing as a set.
StringTokenizer st = new StringTokenizer(str);
while (st.hasMoreTokens()) {
String t1 = st.nextToken();
String[] tokens = camelAndSnakeCaseSplitter.split(t1);
for (int i = 0; i < tokens.length; i++) {
tokens[i] = tokens[i].toLowerCase();
}
Collections.addAll(set, tokens);
}
set.remove("");
return set;
}
private static int calculateLevenshteinDistance(String s1, String s2) {
int distance = StringUtils.getLevenshteinDistance(s1, s2);
double ratio = ((double) distance) / (Math.max(s1.length(), s2.length()));
return 100 - (int)(ratio * 100);
}
private static String escapeString(String token) {
return removeNonAlphaNum.matcher(token).replaceAll(" ");
}
}