/*******************************************************************************
* Mission Control Technologies, Copyright (c) 2009-2012, United States Government
* as represented by the Administrator of the National Aeronautics and Space
* Administration. All rights reserved.
*
* The MCT platform is licensed under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
* MCT includes source code licensed under additional open source licenses. See
* the MCT Open Source Licenses file included with this distribution or the About
* MCT Licenses dialog available at runtime from the MCT Help menu for additional
* information.
*******************************************************************************/
package gov.nasa.arc.mct.abbreviation.impl;
import gov.nasa.arc.mct.abbreviation.Abbreviations;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Implements a manager of an abbreviations list.
 *
 * <p>The manager maps lower-cased phrases to their alternative abbreviations
 * and can partition an arbitrary string into the phrases for which
 * abbreviations may be offered.
 */
public class AbbreviationsManager {

    /** A regular expression used to separate alternative abbreviations. (\s == any whitespace) */
    private static final Pattern ABBREVIATION_SEPARATOR = Pattern.compile("\\s*\\|\\s*");

    /** A regular expression used to separate words. */
    private static final Pattern WORD_SEPARATOR = Pattern.compile("\\s+");

    /** Orders strings from shortest to longest. */
    private static final Comparator<String> SHORTEST_FIRST = new Comparator<String>() {
        @Override
        public int compare(String o1, String o2) {
            // String lengths are never negative, so the subtraction cannot overflow.
            return o1.length() - o2.length();
        }
    };

    /** Maps each lower-cased phrase to its alternatives, sorted shortest first. */
    private final Map<String, List<String>> abbreviations = new HashMap<String, List<String>>();

    /**
     * Creates a new abbreviations manager configured with a set of abbreviation
     * properties. Abbreviation properties are of the form:
     * <pre>
     * phrase = alt1 | alt2 | ...
     * </pre>
     * Whitespace around the "=" and "|" separators is removed. The phrase is
     * converted to lower case, but the alternatives are used verbatim. The
     * alternatives for each phrase are stored in order of increasing length.
     *
     * <p>Only properties whose key and value are both strings are considered;
     * any other entries in the properties object are ignored.
     *
     * @param abbreviationProperties the abbreviation properties
     */
    public AbbreviationsManager(Properties abbreviationProperties) {
        // stringPropertyNames() avoids the unchecked Enumeration cast and skips
        // entries for which getProperty() would return null.
        for (String phrase : abbreviationProperties.stringPropertyNames()) {
            String value = abbreviationProperties.getProperty(phrase).trim();
            String[] alternatives = ABBREVIATION_SEPARATOR.split(value);

            // Copy into a resizable list: Arrays.asList is only a view on the array.
            List<String> abbreviationsForPhrase = new ArrayList<String>(Arrays.asList(alternatives));
            Collections.sort(abbreviationsForPhrase, SHORTEST_FIRST);

            abbreviations.put(phrase.toLowerCase(), abbreviationsForPhrase);
        }
    }

    /**
     * Gets the alternative abbreviations for a phrase. The original phrase is always
     * the first alternative returned. If no abbreviations are found for the phrase, returns
     * a list with one element, the original phrase. The phrase is converted to lower case
     * before looking up its alternatives.
     *
     * <p>The returned list is a fresh copy; modifying it does not affect this manager.
     *
     * @param phrase the phrase to abbreviate
     * @return a list of alternative abbreviations, with the original phrase as the first element
     */
    public List<String> getAlternatives(String phrase) {
        List<String> result = new ArrayList<String>();
        result.add(phrase);

        List<String> alternatives = abbreviations.get(phrase.toLowerCase());
        if (alternatives != null) {
            result.addAll(alternatives);
        }

        return result;
    }

    /**
     * Finds the phrases within a string that can be abbreviated, and returns
     * a structure with those phrases and the alternatives for each phrase.
     * A phrase is a sequence of one or more words in the original string, where
     * words are delimited by whitespace. At each point in the original string,
     * the longest phrase for which there are abbreviations is found.
     *
     * @param s the string to find abbreviations for
     * @return a structure describing the available abbreviations
     */
    public Abbreviations getAbbreviations(String s) {
        AbbreviationsImpl abbrev = new AbbreviationsImpl(s);

        for (String phrase : getPhrasesWithAbbreviations(s)) {
            abbrev.addPhrase(phrase, getAlternatives(phrase));
        }

        return abbrev;
    }

    /**
     * Constructs a partition of a string into phrases, along word boundaries,
     * where each phrase has one or more alternative abbreviations, and each
     * phrase is the longest match against the abbreviations at that position
     * in the original string. Whitespace between phrases, and any leading or
     * trailing whitespace, is not part of any phrase, so an empty phrase is
     * never returned.
     *
     * @param s the original string to partition into phrases
     * @return a list of phrases; empty if the string contains no words
     */
    private List<String> getPhrasesWithAbbreviations(String s) {
        List<String> phrasesWithAbbreviations = new ArrayList<String>();
        Matcher wordBoundary = WORD_SEPARATOR.matcher(s);

        // Skip a leading run of whitespace; otherwise the first "phrase" would
        // be the empty string that precedes the first word boundary.
        int phraseStart = wordBoundary.lookingAt() ? wordBoundary.end() : 0;

        while (phraseStart < s.length()) {
            int phraseLength = getLongestPhraseLength(s.substring(phraseStart));
            int phraseEnd = phraseStart + phraseLength;
            phrasesWithAbbreviations.add(s.substring(phraseStart, phraseEnd));

            // The next phrase begins after the whitespace that follows this one;
            // if there is no more whitespace, the string is exhausted.
            if (wordBoundary.find(phraseEnd)) {
                phraseStart = wordBoundary.end();
            } else {
                phraseStart = s.length();
            }
        }

        return phrasesWithAbbreviations;
    }

    /**
     * Finds the longest phrase within a string that has abbreviations. The first word
     * is always a possibility, even if no alternatives exist to that word.
     * The string is expected to start with a word rather than with whitespace;
     * if it starts with whitespace and no longer prefix matches, the result is zero.
     *
     * @param s the string for which to find the longest phrase with alternatives
     * @return the length of the longest phrase with alternative abbreviations
     */
    private int getLongestPhraseLength(String s) {
        // If the entire string matches, then it is obviously the longest matching phrase.
        if (abbreviations.containsKey(s.toLowerCase())) {
            return s.length();
        }

        Matcher wordBoundary = WORD_SEPARATOR.matcher(s);
        if (!wordBoundary.find()) {
            // No word boundaries found. Entire string is only possible phrase.
            return s.length();
        }

        // First word is always an abbreviation candidate, perhaps with no
        // alternatives but itself.
        int longestMatchLength = wordBoundary.start();

        // Try every longer prefix that ends on a word boundary; later matches
        // are longer, so the last one found wins.
        while (wordBoundary.find()) {
            String candidate = s.substring(0, wordBoundary.start());
            if (abbreviations.containsKey(candidate.toLowerCase())) {
                longestMatchLength = wordBoundary.start();
            }
        }

        return longestMatchLength;
    }
}