/** * */ package org.voyanttools.trombone.util; import java.io.IOException; import java.util.List; import java.util.Locale; import java.util.regex.Pattern; import com.optimaize.langdetect.DetectedLanguage; import com.optimaize.langdetect.LanguageDetector; import com.optimaize.langdetect.LanguageDetectorBuilder; import com.optimaize.langdetect.ngram.NgramExtractors; import com.optimaize.langdetect.profiles.LanguageProfile; import com.optimaize.langdetect.profiles.LanguageProfileReader; import com.optimaize.langdetect.text.CommonTextObjectFactories; import com.optimaize.langdetect.text.TextObject; import com.optimaize.langdetect.text.TextObjectFactory; /** * @author sgs * */ public class LangDetector { private List<LanguageProfile> languageProfiles; private LanguageDetector languageDetector; private TextObjectFactory textObjectFactory; public static LangDetector langDetector = new LangDetector(); private static Pattern tagStripper = Pattern.compile("<.+?>", Pattern.DOTALL); /** * */ public LangDetector() { //load all languages: try { languageProfiles = new LanguageProfileReader().readAllBuiltIn(); } catch (IOException e) { throw new RuntimeException(e); } //build language detector: languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) .withProfiles(languageProfiles) .build(); //create a text object factory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); } public String detect(String text, FlexibleParameters parameters) { return parameters.containsKey("language") ? new Locale(parameters.getParameterValue("language")).getLanguage() : detect(text); } public String detect(String text) { if (text==null) return ""; text = text.trim(); // quick and dirty tags stripper if (text.startsWith("<")) { text = tagStripper.matcher(text).replaceAll("").trim(); } if (text.contains("\u0F0B")) { // TIBETAN MARK INTERSYLLABIC TSHEG return new Locale("bo").getLanguage(); } TextObject textObject = textObjectFactory.forText(text); List<DetectedLanguage> langs = languageDetector.getProbabilities(textObject); return langs.isEmpty() ? "" : langs.get(0).getLocale().getLanguage(); } }