/**
 *
 */
package org.voyanttools.trombone.util;

import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Static helpers for splitting text into sentences.
 *
 * @author sgs
 */
public class TextUtils {

	/**
	 * Matches a chunk of text that ends with a common English title or
	 * abbreviation followed by a period (e.g. "Dr."). A sentence break reported
	 * right after such a chunk is treated as a false positive.
	 */
	private static final Pattern ENGLISH_ABBREVIATIONS =
			Pattern.compile("\\b(Mrs?|Dr|Rev|Mr|Ms|st)\\.$", Pattern.CASE_INSENSITIVE);

	/**
	 * Splits the given text into sentences using the sentence
	 * {@link BreakIterator} of the given locale.
	 *
	 * <p>The text is first passed through a {@code Stripper}, trimmed, and has
	 * every run of whitespace collapsed into a single space. For English-language
	 * locales, a break that falls immediately after a known abbreviation is
	 * ignored and the following segment is joined to the current sentence.
	 * Sentences that contain no space (i.e. a single token) are not returned.</p>
	 *
	 * @param text the text to split
	 * @param locale the locale that selects the sentence break rules
	 * @return the sentences found, in document order; never {@code null}
	 */
	public static List<String> getSentences(String text, Locale locale) {
		// Compare by language code so that en_US, en_GB, etc. also get the
		// abbreviation handling, not just the bare Locale.ENGLISH.
		Pattern abbreviations = Locale.ENGLISH.getLanguage().equals(locale.getLanguage())
				? ENGLISH_ABBREVIATIONS
				: null;

		List<String> sentences = new ArrayList<String>();

		Stripper stripper = new Stripper(Stripper.TYPE.ALL); // only used for text output
		// NOTE(review): the previous literal replaced "&" with "&" (a no-op);
		// restored to decode the "&amp;" entity, which is presumably what was
		// intended -- confirm against what Stripper leaves in its output.
		text = stripper.strip(text).trim().replace("&amp;", "&");
		text = text.replaceAll("\\s+", " "); // all whitespace becomes a single space

		BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(locale);
		sentenceIterator.setText(text);

		// Accumulates segments that were split after an abbreviation until a
		// real sentence end is reached.
		StringBuilder pending = new StringBuilder();
		int start = sentenceIterator.first();
		for (int end = sentenceIterator.next(); end != BreakIterator.DONE; start = end, end = sentenceIterator.next()) {
			pending.append(text.substring(start, end).trim());
			String sentence = pending.toString();
			if (abbreviations != null && abbreviations.matcher(sentence).find()) {
				// False break after an abbreviation: keep accumulating.
				pending.append(" ");
				continue;
			}
			addIfMultiWord(sentences, sentence);
			pending.setLength(0); // reset buffer
		}

		// If the text ends right after an abbreviation, the last segment is
		// still sitting in the buffer; emit it instead of silently dropping it.
		if (pending.length() > 0) {
			addIfMultiWord(sentences, pending.toString().trim());
		}

		return sentences;
	}

	/**
	 * Appends the sentence to the list only when it contains at least one space.
	 */
	private static void addIfMultiWord(List<String> sentences, String sentence) {
		if (sentence.contains(" ")) {
			sentences.add(sentence);
		}
	}

	/**
	 * Splits the given text into sentences using the locale built from the
	 * given language code.
	 *
	 * @param text the text to split
	 * @param language the language code passed to {@link Locale#Locale(String)}
	 * @return the sentences found, in document order
	 */
	public static List<String> getSentences(String text, String language) {
		return getSentences(text, new Locale(language));
	}

	/**
	 * Splits the given text into sentences using {@link Locale#ENGLISH}.
	 *
	 * @param text the text to split
	 * @return the sentences found, in document order
	 */
	public static List<String> getSentences(String text) {
		return getSentences(text, Locale.ENGLISH);
	}

	/**
	 * Returns a language code for the given text.
	 *
	 * <p>The text is currently not inspected: the language code of
	 * {@link Locale#ENGLISH} is always returned.</p>
	 *
	 * @param text the text whose language is requested (currently unused)
	 * @return the language code of {@link Locale#ENGLISH}
	 */
	public static String getLanguageCode(String text) {
		return Locale.ENGLISH.getLanguage();
	}

	/**
	 * Prints whether {@link Locale#ENGLISH} equals a locale constructed from
	 * the language code "en".
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		System.out.println(Locale.ENGLISH.equals(new Locale("en")));
	}
}