package org.wikibrain.utils;
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.regex.Pattern;
public class WpStringUtils {
/**
* Replaces consecutive non alpha-numeric characters with a space
* Converts to lowercase
* Removes whitespace
*/
public static String normalize(String s) {
return REPLACE_WEIRD.matcher(s).replaceAll(" ").toLowerCase().trim();
}
private static Pattern REPLACE_WEIRD = Pattern.compile("[^\\p{L}\\p{N}]+");
public static long longHashCode(String s) {
MessageDigest messageDigest = null;
try {
messageDigest = MessageDigest.getInstance("SHA-256");
} catch (NoSuchAlgorithmException e) {
throw new IllegalStateException(e); // should not happen
}
try {
messageDigest.update(s.getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
throw new IllegalStateException(e); // should not happen
}
byte[] bytes = messageDigest.digest();
long h = 1125899906842597L; //prime
for (byte b: bytes) {
h = 31*h + b;
}
return h;
}
/**
* This is much faster... TODO: look for and replace longHashCode calls.
* @param s
* @return
*/
public static long longHashCode2(String s) {
return MurmurHash.hash64(s);
}
}