/*******************************************************************************
* CogTool Copyright Notice and Distribution Terms
* CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University
* This software is distributed under the terms of the FSF Lesser
* Gnu Public License (see LGPL.txt).
*
* CogTool is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* CogTool is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with CogTool; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
* CogTool makes use of several third-party components, with the
* following notices:
*
* Eclipse SWT version 3.448
* Eclipse GEF Draw2D version 3.2.1
*
* Unless otherwise indicated, all Content made available by the Eclipse
* Foundation is provided to you under the terms and conditions of the Eclipse
* Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this
* Content and is also available at http://www.eclipse.org/legal/epl-v10.html.
*
* CLISP version 2.38
*
* Copyright (c) Sam Steingold, Bruno Haible 2001-2006
* This software is distributed under the terms of the FSF Gnu Public License.
* See COPYRIGHT file in clisp installation folder for more information.
*
* ACT-R 6.0
*
* Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere &
* John R Anderson.
* This software is distributed under the terms of the FSF Lesser
* Gnu Public License (see LGPL.txt).
*
* Apache Jakarta Commons-Lang 2.1
*
* This product contains software developed by the Apache Software Foundation
* (http://www.apache.org/)
*
* jopt-simple version 1.0
*
* Copyright (c) 2004-2013 Paul R. Holser, Jr.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Mozilla XULRunner 1.9.0.5
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/.
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
* License for the specific language governing rights and limitations
* under the License.
*
* The J2SE(TM) Java Runtime Environment version 5.0
*
* Copyright 2009 Sun Microsystems, Inc., 4150
* Network Circle, Santa Clara, California 95054, U.S.A. All
* rights reserved. U.S.
* See the LICENSE file in the jre folder for more information.
******************************************************************************/
package edu.cmu.cs.hcii.cogtool.model;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.lang.builder.HashCodeBuilder;
import edu.cmu.cs.hcii.cogtool.util.FetchURLUtil;
import edu.cmu.cs.hcii.cogtool.util.NullSafe;
import edu.cmu.cs.hcii.cogtool.util.ObjectLoader;
import edu.cmu.cs.hcii.cogtool.util.ObjectSaver;
//The names of classes and interfaces around this are terrible, but we can't
//change them without breaking old .cgt files, since our persistence
//mechanism leaks the implementation detail of our class names into the
//abstraction of our file format.
//Note that the only implementer of this interface is the abstract calls
//CachedTermSimilarity. All concrete classes are subclasses of that, and there
//is no such thing as an un-cached TermSimilarity.
//Both GoogleSimilarity and CachedGoogleSimilarity are direct subsclasses
//of CachedTermSimilarity, and the one with cached in its name does not
//inherit from the one without. All very confusing.
//TODO once we augment our persistence mechanism in such a way that we
// can actually rename persistable classes, we should tidy up these names.
/**
* Implements a standard algorithm that determines a frequency count for each
* word in term (caching the results), determines a relationship value for
* pairs of words (caching the results), and, using those frequencies and
* values, determines a similarity measure.
*/
public abstract class CachedTermSimilarity implements ITermSimilarity
{
public static String toString(double similarity)
{
if (similarity == UNKNOWN) {
return "Unrelated";
}
return Double.toString(similarity);
}
public static final Double UNKNOWN_SIMILARITY = new Double(UNKNOWN);
/**
* Standard value entry for frequencyTable if zero
*/
public static final Long ZERO_FREQUENCY = new Long(0);
// If words are cached, maps word String to frequency Long
protected Map<String, Long> frequencyTable = new HashMap<String, Long>();
// Tracks words and phrases not found in a particular corpus;
// maps word/term to List of "replacement" words
// TODO: Currently, there is no way to populate this table.
protected LinkedHashMap<String, List<String>> zeroFreqTerms =
new LinkedHashMap<String, List<String>>();
/**
* A pair of strings treated as a struct (that is, comparison is
* overridden so that instances don't require identity match to act
* as keys in a Map).
*/
public static class WordPair
{
public static final int edu_cmu_cs_hcii_cogtool_model_CachedTermSimilarity$WordPair_version = 0;
protected static final String goalWordVAR = "goalWord";
protected static final String searchWordVAR = "searchWord";
private static ObjectSaver.IDataSaver<WordPair> SAVER =
new ObjectSaver.ADataSaver<WordPair>() {
@Override
public int getVersion()
{
return edu_cmu_cs_hcii_cogtool_model_CachedTermSimilarity$WordPair_version;
}
@Override
public void saveData(WordPair v, ObjectSaver saver)
throws java.io.IOException
{
saver.saveObject(v.goalWord, goalWordVAR);
saver.saveObject(v.searchWord, searchWordVAR);
}
};
public static void registerSaver()
{
ObjectSaver.registerSaver(WordPair.class.getName(),
SAVER);
}
private static ObjectLoader.IObjectLoader<WordPair> LOADER =
new ObjectLoader.AObjectLoader<WordPair>() {
@Override
public WordPair createObject()
{
return new WordPair();
}
@Override
public void set(WordPair target, String variable, Object value)
{
if (variable != null) {
if (variable.equals(goalWordVAR)) {
target.goalWord = (String) value;
}
else if (variable.equals(searchWordVAR)) {
target.searchWord = (String) value;
}
}
}
};
public static void registerLoader()
{
ObjectLoader.registerLoader(WordPair.class.getName(),
edu_cmu_cs_hcii_cogtool_model_CachedTermSimilarity$WordPair_version,
LOADER);
}
protected WordPair() { } // for loading
public static final WordPair SEARCH_KEY = new WordPair("", "");
public String goalWord;
public String searchWord;
public WordPair(String goal, String search)
{
goalWord = goal;
searchWord = search;
}
protected boolean valueEquals(WordPair other)
{
return NullSafe.equals(goalWord, other.goalWord) &&
NullSafe.equals(searchWord, other.searchWord);
}
@Override
public boolean equals(Object other)
{
return (other != null) &&
(other.getClass() == WordPair.class) &&
valueEquals((WordPair) other);
}
@Override
public int hashCode()
{
// Must have a unique ODD number for each class which uses
// hashCodeBuilder.
// this : 47, 5
return new HashCodeBuilder(47, 5).append(goalWord.hashCode())
.append(searchWord.hashCode())
.toHashCode();
}
}
// Maps WordPair to similarity Double value
protected Map<WordPair, Double> similarityTable =
new HashMap<WordPair, Double>();
/**
* An IURLProcessor that fetches a frequency count for a given word.
*/
protected interface IWordFrequencyParser extends FetchURLUtil.IURLProcessor
{
public long getWordFrequency();
}
/**
* Subclasses should implement this to return a URL processor that
* fetches a frequency count for the given word.
*/
protected abstract IWordFrequencyParser getWordFreqParser(String word,
List<String> errors);
/**
* At this point, the frequency cache does not know the given word,
* so get the URL processor for the given word and process the
* fetched content.
*/
protected Long fetchWordFrequency(String word, List<String> errors)
{
IWordFrequencyParser wordFreqParser =
getWordFreqParser(word, errors);
if (FetchURLUtil.processURL(wordFreqParser)) {
long frequency = wordFreqParser.getWordFrequency();
// No need to create a new instance for zero.
if (frequency == 0) {
return ZERO_FREQUENCY;
}
return new Long(frequency);
}
// Did not succeed at fetching a value.
return null;
}
/**
* Look up the word in the cache; if there, return the associated
* frequency. If not there, fetch it.
*/
protected long getWordFrequency(String word, List<String> errors)
{
if (word == null) {
return 0;
}
word = word.toLowerCase();
if (frequencyTable.containsKey(word)) {
Object frequency = frequencyTable.get(word);
if (frequency != null) {
return ((Long) frequency).longValue();
}
throw new IllegalStateException("Frequency table contains a null frequency for word: "
+ word);
}
Long frequency = fetchWordFrequency(word, errors);
if (frequency != null) {
frequencyTable.put(word, frequency);
return frequency.longValue();
}
return 0;
} // getWordFrequency
private static final Pattern SPLITTER = Pattern.compile("\\s+");
/**
* Break term into words, fetch each word's frequency, use replacements
* if necessary (and specified), and return an array of words/replacements
* that have non-zero frequencies.
*/
private String[] getWordFrequencies(String term,
List<String> errors,
ITermSimilarity.Continuable cont)
{
String[] words = SPLITTER.split(term);
List<String> nonzeroWords = new ArrayList<String>();
for (String word : words) {
if (getWordFrequency(word, errors) > 0) {
nonzeroWords.add(word);
}
else if (zeroFreqTerms.containsKey(word)) {
List<String> replacement = zeroFreqTerms.get(word);
if (replacement != null) {
Iterator<String> others = replacement.iterator();
while (others.hasNext()) {
getWordFrequency(others.next(), errors);
if (! cont.isContinuing()) {
return null;
}
}
// TODO: Even if freq returned is zero in loop above?
nonzeroWords.addAll(replacement);
}
else {
// No replacements for previously seen zero freq word;
// TODO: return what???? "inform crawlWebsite() to reinsert link into queue"
}
}
else {
// First time seeing this zero freq word; insert and "return what????"
zeroFreqTerms.put(word, null);
}
if (! cont.isContinuing()) {
return null;
}
}
if (nonzeroWords.isEmpty()) {
return null;
}
String[] wordFreqs = new String[nonzeroWords.size()];
return nonzeroWords.toArray(wordFreqs);
} // getWordFrequencies
/**
* An IURLProcessor that fetches a similarity strength for a pair of words.
*/
protected interface ISimilarityParser extends FetchURLUtil.IURLProcessor
{
public double getSimilarity();
}
/**
* Subclasses should implement this to return a URL processor that
* fetches a similarity strength for a pair of words.
*/
protected abstract ISimilarityParser getSimilarityParser(String goal,
String search,
List<String> errors);
/**
* At this point, the word-pair similarity cache does not know,
* so get the URL processor for the given word pair and process the
* fetched content.
*/
protected Double fetchWordSimilarity(String goalWord,
String searchWord,
List<String> errors)
{
ISimilarityParser goalSimilarityParser =
getSimilarityParser(goalWord, searchWord, errors);
if (FetchURLUtil.processURL(goalSimilarityParser)) {
return new Double(goalSimilarityParser.getSimilarity());
}
return null;
}
/**
* Look up the word pair in the cache; if there, return the associated
* similarity. If not there, fetch it.
*/
protected double getWordSimilarity(String goalWord,
String searchWord,
List<String> errors)
{
if ((goalWord == null) || (searchWord == null)) {
return UNKNOWN;
}
WordPair.SEARCH_KEY.goalWord = goalWord.toLowerCase();
WordPair.SEARCH_KEY.searchWord = searchWord.toLowerCase();
if (similarityTable.containsKey(WordPair.SEARCH_KEY)) {
Double cachedSimilarity =
similarityTable.get(WordPair.SEARCH_KEY);
if (cachedSimilarity != null) {
return cachedSimilarity.doubleValue();
}
throw new IllegalStateException("Similarity table contains a null similarity for pair: "
+ goalWord + ", " + searchWord);
}
Double similarity = fetchWordSimilarity(goalWord, searchWord, errors);
if (similarity != null) {
WordPair newEntry = new WordPair(goalWord, searchWord);
similarityTable.put(newEntry, similarity);
return similarity.doubleValue();
}
return UNKNOWN;
} // getWordSimilarity
/**
* Computes the similarity between two strings (both can contain multiple words)
* using multiple queries
*
* The two strings are tokenized into individual words and word-pairs are created
* between the two strings. Each similarity of each word-pair is computed and then
* totaled to give the similarity of both entire strings
*
* We split on whitespace and leave it to the analysis to split or rewrite words that are
* hyphenated, apostrophe'd, etc
*/
public double determineSimilarity(String goalTerm,
String searchTerm,
List<String> errors,
ITermSimilarity.Continuable cont)
{
String[] goalWords = getWordFrequencies(goalTerm, errors, cont);
if (! cont.isContinuing()) {
return UNKNOWN;
}
String[] searchWords = getWordFrequencies(searchTerm, errors, cont);
if (! cont.isContinuing()) {
return UNKNOWN;
}
if ((goalWords == null) ||
(goalWords.length == 0) ||
(searchWords == null) ||
(searchWords.length == 0))
{
return UNKNOWN;
}
double totalPMI = 0.0;
int pairCount = 0;
for (String goalWord : goalWords) {
for (String searchWord : searchWords) {
double wordSimilarity =
getWordSimilarity(goalWord, searchWord, errors);
if (! cont.isContinuing()) {
return UNKNOWN;
}
if (wordSimilarity >= 0.0) {
totalPMI += wordSimilarity;
pairCount++;
}
}
}
if (pairCount > 0) {
return totalPMI / pairCount;
}
return UNKNOWN; // TODO: not quite; better some UNRELATED value
} // determineSimilarity
}