edu.cmu.cs.hcii.cogtool.model.CachedTermSimilarity.java Source code

Java tutorial

Introduction

Here is the source code for edu.cmu.cs.hcii.cogtool.model.CachedTermSimilarity.java

Source

/*******************************************************************************
 * CogTool Copyright Notice and Distribution Terms
 * CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University
 * This software is distributed under the terms of the FSF Lesser
 * Gnu Public License (see LGPL.txt). 
 * 
 * CogTool is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 * 
 * CogTool is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with CogTool; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 * 
 * CogTool makes use of several third-party components, with the 
 * following notices:
 * 
 * Eclipse SWT version 3.448
 * Eclipse GEF Draw2D version 3.2.1
 * 
 * Unless otherwise indicated, all Content made available by the Eclipse 
 * Foundation is provided to you under the terms and conditions of the Eclipse 
 * Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this 
 * Content and is also available at http://www.eclipse.org/legal/epl-v10.html.
 * 
 * CLISP version 2.38
 * 
 * Copyright (c) Sam Steingold, Bruno Haible 2001-2006
 * This software is distributed under the terms of the FSF Gnu Public License.
 * See COPYRIGHT file in clisp installation folder for more information.
 * 
 * ACT-R 6.0
 * 
 * Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere & 
 *                         John R Anderson. 
 * This software is distributed under the terms of the FSF Lesser
 * Gnu Public License (see LGPL.txt).
 * 
 * Apache Jakarta Commons-Lang 2.1
 * 
 * This product contains software developed by the Apache Software Foundation
 * (http://www.apache.org/)
 * 
 * jopt-simple version 1.0
 * 
 * Copyright (c) 2004-2013 Paul R. Holser, Jr.
 * 
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 * Mozilla XULRunner 1.9.0.5
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/.
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 * License for the specific language governing rights and limitations
 * under the License.
 * 
 * The J2SE(TM) Java Runtime Environment version 5.0
 * 
 * Copyright 2009 Sun Microsystems, Inc., 4150
 * Network Circle, Santa Clara, California 95054, U.S.A.  All
 * rights reserved. U.S.  
 * See the LICENSE file in the jre folder for more information.
 ******************************************************************************/

package edu.cmu.cs.hcii.cogtool.model;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.commons.lang.builder.HashCodeBuilder;

import edu.cmu.cs.hcii.cogtool.util.FetchURLUtil;
import edu.cmu.cs.hcii.cogtool.util.NullSafe;
import edu.cmu.cs.hcii.cogtool.util.ObjectLoader;
import edu.cmu.cs.hcii.cogtool.util.ObjectSaver;

//The names of classes and interfaces around this are terrible, but we can't
//change them without breaking old .cgt files, since our persistence
//mechanism leaks the implementation detail of our class names into the
//abstraction of our file format.
//Note that the only implementer of this interface is the abstract class
//CachedTermSimilarity. All concrete classes are subclasses of that, and there
//is no such thing as an un-cached TermSimilarity.
//Both GoogleSimilarity and CachedGoogleSimilarity are direct subclasses
//of CachedTermSimilarity, and the one with "Cached" in its name does not
//inherit from the one without. All very confusing.
//TODO once we augment our persistence mechanism in such a way that we
//   can actually rename persistable classes, we should tidy up these names.

/**
 * Implements a standard algorithm that determines a frequency count for each
 * word in a term (caching the results), determines a relationship value for
 * pairs of words (caching the results), and, using those frequencies and
 * values, determines a similarity measure.
 */
public abstract class CachedTermSimilarity implements ITermSimilarity {
    /**
     * Renders a similarity value for display.
     *
     * @param similarity the similarity value to render
     * @return the text "Unrelated" when the value equals UNKNOWN; otherwise
     *         the result of Double.toString for the value
     */
    public static String toString(double similarity) {
        return (similarity == UNKNOWN) ? "Unrelated" : Double.toString(similarity);
    }

    // Boxed form of UNKNOWN, created once so the same instance can be shared.
    // NOTE(review): UNKNOWN is not declared in this class; presumably it is
    // inherited from ITermSimilarity -- confirm.
    public static final Double UNKNOWN_SIMILARITY = new Double(UNKNOWN);

    /**
     * Standard value entry for frequencyTable if zero; fetchWordFrequency
     * returns this shared instance instead of boxing a new zero.
     */
    public static final Long ZERO_FREQUENCY = new Long(0);

    // If words are cached, maps word String to frequency Long.
    // getWordFrequency lowercases a word before probing or filling this table.
    protected Map<String, Long> frequencyTable = new HashMap<String, Long>();

    // Tracks words and phrases not found in a particular corpus;
    // maps word/term to List of "replacement" words.
    // getWordFrequencies records a word here with a null list the first time
    // it sees the word with a zero frequency.
    // TODO: Currently, there is no way to populate this table
    //       with actual replacement lists.
    protected LinkedHashMap<String, List<String>> zeroFreqTerms = new LinkedHashMap<String, List<String>>();

    /**
     * A pair of strings treated as a struct (that is, comparison is
     * overridden so that instances don't require identity match to act
     * as keys in a Map).
     * <p>
     * Both fields are public and mutable; an instance that is currently in
     * use as a key in a hash-based Map must not be modified, since equals
     * and hashCode are computed from the two fields.
     */
    public static class WordPair {
        // Persistence version reported by the saver and used when
        // registering the loader.
        public static final int edu_cmu_cs_hcii_cogtool_model_CachedTermSimilarity$WordPair_version = 0;

        // Variable names under which the two fields are saved and loaded.
        protected static final String goalWordVAR = "goalWord";
        protected static final String searchWordVAR = "searchWord";

        // Writes both words of a pair through the given ObjectSaver.
        private static ObjectSaver.IDataSaver<WordPair> SAVER = new ObjectSaver.ADataSaver<WordPair>() {
            @Override
            public int getVersion() {
                return edu_cmu_cs_hcii_cogtool_model_CachedTermSimilarity$WordPair_version;
            }

            @Override
            public void saveData(WordPair v, ObjectSaver saver) throws java.io.IOException {
                saver.saveObject(v.goalWord, goalWordVAR);
                saver.saveObject(v.searchWord, searchWordVAR);
            }
        };

        /**
         * Registers the saver for WordPair with ObjectSaver, keyed by this
         * class's name.
         */
        public static void registerSaver() {
            ObjectSaver.registerSaver(WordPair.class.getName(), SAVER);
        }

        // Creates an empty pair and fills in its fields by variable name;
        // variables other than the two known names are ignored.
        private static ObjectLoader.IObjectLoader<WordPair> LOADER = new ObjectLoader.AObjectLoader<WordPair>() {
            @Override
            public WordPair createObject() {
                return new WordPair();
            }

            @Override
            public void set(WordPair target, String variable, Object value) {
                if (variable != null) {
                    if (variable.equals(goalWordVAR)) {
                        target.goalWord = (String) value;
                    } else if (variable.equals(searchWordVAR)) {
                        target.searchWord = (String) value;
                    }
                }
            }
        };

        /**
         * Registers the loader for WordPair with ObjectLoader, keyed by this
         * class's name and persistence version.
         */
        public static void registerLoader() {
            ObjectLoader.registerLoader(WordPair.class.getName(),
                    edu_cmu_cs_hcii_cogtool_model_CachedTermSimilarity$WordPair_version, LOADER);
        }

        /**
         * Creates a pair whose words are both null; the loader sets them
         * afterwards.
         */
        protected WordPair() {
        } // for loading

        /**
         * A single shared, mutable instance. Because it is static and its
         * fields are public, any code that overwrites them affects every
         * other user of this instance; it must never be stored in a Map.
         */
        public static final WordPair SEARCH_KEY = new WordPair("", "");

        public String goalWord;
        public String searchWord;

        /**
         * Creates a pair from the given words.
         *
         * @param goal the goal word (stored as given)
         * @param search the search word (stored as given)
         */
        public WordPair(String goal, String search) {
            goalWord = goal;
            searchWord = search;
        }

        /**
         * Field-wise, null-tolerant comparison of the two words.
         */
        protected boolean valueEquals(WordPair other) {
            return NullSafe.equals(goalWord, other.goalWord) && NullSafe.equals(searchWord, other.searchWord);
        }

        /**
         * Two pairs are equal when the other object is exactly a WordPair
         * (not a subclass) and both words match.
         */
        @Override
        public boolean equals(Object other) {
            return (other != null) && (other.getClass() == WordPair.class) && valueEquals((WordPair) other);
        }

        /**
         * Hash consistent with equals, including for pairs whose words are
         * null (as they are right after the loading constructor runs).
         */
        @Override
        public int hashCode() {
            // Must have a unique ODD number for each class which uses
            // hashCodeBuilder.
            // this   : 47, 5
            // Append the Strings themselves rather than their hashCode():
            // append(Object) contributes object.hashCode() for a non-null,
            // non-array value (so existing hash values are unchanged) and
            // tolerates null instead of throwing NullPointerException.
            return new HashCodeBuilder(47, 5).append(goalWord).append(searchWord)
                    .toHashCode();
        }
    }

    // Maps WordPair to similarity Double value.
    // Consulted and filled by getWordSimilarity, which acts as the cache
    // in front of fetchWordSimilarity.
    protected Map<WordPair, Double> similarityTable = new HashMap<WordPair, Double>();

    /**
     * A URL processor that, once it has been run, can report the frequency
     * count it obtained for a word.
     */
    protected interface IWordFrequencyParser extends FetchURLUtil.IURLProcessor {
        /**
         * @return the frequency count for the word this parser was created for
         */
        long getWordFrequency();
    }

    /**
     * Subclasses should implement this to return a URL processor that
     * fetches a frequency count for the given word.
     *
     * @param word the word whose frequency is wanted; getWordFrequency
     *             lowercases the word before it reaches this method
     * @param errors list handed through from the caller; presumably the
     *               place to record problems encountered -- confirm against
     *               implementations
     * @return the processor that fetchWordFrequency passes to
     *         FetchURLUtil.processURL and then queries for the count
     */
    protected abstract IWordFrequencyParser getWordFreqParser(String word, List<String> errors);

    /**
     * At this point, the frequency cache does not know the given word,
     * so get the URL processor for the given word and process the
     * fetched content.
     *
     * @param word the word to look up
     * @param errors list passed through to getWordFreqParser
     * @return the fetched frequency (the shared ZERO_FREQUENCY instance when
     *         the count is zero), or null if processing the URL did not
     *         succeed
     */
    protected Long fetchWordFrequency(String word, List<String> errors) {
        IWordFrequencyParser parser = getWordFreqParser(word, errors);

        if (!FetchURLUtil.processURL(parser)) {
            // Did not succeed at fetching a value.
            return null;
        }

        long frequency = parser.getWordFrequency();

        // No need to create a new instance for zero; for other values use
        // Long.valueOf rather than the deprecated boxing constructor.
        return (frequency == 0) ? ZERO_FREQUENCY : Long.valueOf(frequency);
    }

    /**
     * Returns the frequency of the given word, consulting the cache first
     * and fetching (then caching) the value only on a miss.
     *
     * @param word the word to look up; it is lowercased before being used
     *             as a cache key or fetched
     * @param errors list passed through to fetchWordFrequency
     * @return the word's frequency, or 0 if the word is null or the fetch
     *         did not produce a value
     * @throws IllegalStateException if the cache holds a null frequency for
     *         the word
     */
    protected long getWordFrequency(String word, List<String> errors) {
        if (word == null) {
            return 0;
        }

        String key = word.toLowerCase();
        Long cached = frequencyTable.get(key);

        if (cached != null) {
            return cached.longValue();
        }

        // A null result is either a miss or a corrupt (null-valued) entry;
        // only the latter is an error.
        if (frequencyTable.containsKey(key)) {
            throw new IllegalStateException("Frequency table contains a null frequency for word: " + key);
        }

        Long fetched = fetchWordFrequency(key, errors);

        if (fetched == null) {
            // A failed fetch is not cached, so it will be retried next time.
            return 0;
        }

        frequencyTable.put(key, fetched);

        return fetched.longValue();
    } // getWordFrequency

    // Terms are split into words on runs of whitespace.
    private static final Pattern SPLITTER = Pattern.compile("\\s+");

    /**
     * Break term into words, fetch each word's frequency, use replacements
     * if necessary (and specified), and return an array of words/replacements
     * that have non-zero frequencies.
     *
     * @param term the term to split; may be null
     * @param errors list passed through to getWordFrequency
     * @param cont polled after each lookup; once it stops continuing the
     *             method gives up
     * @return the usable words, or null if term is null, if no usable word
     *         was found, or if cont stopped continuing
     */
    private String[] getWordFrequencies(String term, List<String> errors, ITermSimilarity.Continuable cont) {
        if (term == null) {
            // Nothing to split; treated the same as "no usable words".
            return null;
        }

        List<String> nonzeroWords = new ArrayList<String>();

        for (String word : SPLITTER.split(term)) {
            if (word.length() == 0) {
                // Splitting an empty term, or one with leading whitespace,
                // yields an empty token; it is not a word, so do not look it
                // up or record it as a zero-frequency term.
                continue;
            }

            if (getWordFrequency(word, errors) > 0) {
                nonzeroWords.add(word);
            } else if (zeroFreqTerms.containsKey(word)) {
                List<String> replacement = zeroFreqTerms.get(word);

                if (replacement != null) {
                    // Look up each replacement so its frequency gets cached.
                    for (String other : replacement) {
                        getWordFrequency(other, errors);
                        if (!cont.isContinuing()) {
                            return null;
                        }
                    }

                    // TODO: Even if freq returned is zero in loop above?
                    nonzeroWords.addAll(replacement);
                } else {
                    // No replacements for previously seen zero freq word;
                    // TODO: return what???? "inform crawlWebsite() to reinsert link into queue"
                }
            } else {
                // First time seeing this zero freq word; insert and "return what????"
                zeroFreqTerms.put(word, null);
            }

            if (!cont.isContinuing()) {
                return null;
            }
        }

        if (nonzeroWords.isEmpty()) {
            return null;
        }

        return nonzeroWords.toArray(new String[nonzeroWords.size()]);
    } // getWordFrequencies

    /**
     * A URL processor that, once it has been run, can report the similarity
     * strength it obtained for a pair of words.
     */
    protected interface ISimilarityParser extends FetchURLUtil.IURLProcessor {
        /**
         * @return the similarity strength for the pair this parser was
         *         created for
         */
        double getSimilarity();
    }

    /**
     * Subclasses should implement this to return a URL processor that
     * fetches a similarity strength for a pair of words.
     *
     * @param goal the goal word of the pair
     * @param search the search word of the pair
     * @param errors list handed through from the caller; presumably the
     *               place to record problems encountered -- confirm against
     *               implementations
     * @return the processor that fetchWordSimilarity passes to
     *         FetchURLUtil.processURL and then queries for the similarity
     */
    protected abstract ISimilarityParser getSimilarityParser(String goal, String search, List<String> errors);

    /**
     * At this point, the word-pair similarity cache does not know,
     * so get the URL processor for the given word pair and process the
     * fetched content.
     *
     * @param goalWord the goal word of the pair
     * @param searchWord the search word of the pair
     * @param errors list passed through to getSimilarityParser
     * @return the fetched similarity, or null if processing the URL did not
     *         succeed
     */
    protected Double fetchWordSimilarity(String goalWord, String searchWord, List<String> errors) {
        ISimilarityParser parser = getSimilarityParser(goalWord, searchWord, errors);

        if (!FetchURLUtil.processURL(parser)) {
            // Did not succeed at fetching a value.
            return null;
        }

        // Double.valueOf replaces the deprecated boxing constructor.
        return Double.valueOf(parser.getSimilarity());
    }

    /**
     * Look up the word pair in the cache; if there, return the associated
     * similarity.  If not there, fetch it and cache the result.
     * <p>
     * The cache key is the lowercased pair, used both for the lookup and for
     * storing a freshly fetched value, so pairs differing only in letter case
     * share one entry. The fetch itself is still given the words exactly as
     * the caller supplied them.
     *
     * @param goalWord the goal word of the pair
     * @param searchWord the search word of the pair
     * @param errors list passed through to fetchWordSimilarity
     * @return the similarity, or UNKNOWN if either word is null or the fetch
     *         did not produce a value
     * @throws IllegalStateException if the cache holds a null similarity for
     *         the pair
     */
    protected double getWordSimilarity(String goalWord, String searchWord, List<String> errors) {
        if ((goalWord == null) || (searchWord == null)) {
            return UNKNOWN;
        }

        // Use a private key instance rather than overwriting the shared
        // static WordPair.SEARCH_KEY, so concurrent or re-entrant lookups
        // cannot clobber one another's key.
        WordPair key = new WordPair(goalWord.toLowerCase(), searchWord.toLowerCase());

        Double cachedSimilarity = similarityTable.get(key);

        if (cachedSimilarity != null) {
            return cachedSimilarity.doubleValue();
        }

        // A null result is either a miss or a corrupt (null-valued) entry;
        // only the latter is an error.
        if (similarityTable.containsKey(key)) {
            throw new IllegalStateException(
                    "Similarity table contains a null similarity for pair: " + goalWord + ", " + searchWord);
        }

        Double similarity = fetchWordSimilarity(goalWord, searchWord, errors);

        if (similarity != null) {
            // Store under the same normalized key the lookup uses; storing
            // the caller's original spelling would make mixed-case pairs
            // miss the cache on every subsequent call.
            similarityTable.put(key, similarity);

            return similarity.doubleValue();
        }

        return UNKNOWN;
    } // getWordSimilarity

    /**
     * Computes the similarity between two strings (each may contain several
     * words) using multiple queries.
     *
     * Each string is split on whitespace into words; any further splitting
     * or rewriting of hyphenated, apostrophe'd, etc. words is left to the
     * analysis. Every goal word is paired with every search word, the
     * similarity of each pair is looked up, and the result is the mean of
     * the pair similarities that are not negative.
     *
     * @param goalTerm the goal string
     * @param searchTerm the search string
     * @param errors list passed through to the frequency and similarity lookups
     * @param cont polled between lookups; once it stops continuing the
     *             computation is abandoned
     * @return the mean pair similarity, or UNKNOWN if the computation was
     *         abandoned, either string has no usable words, or no pair has
     *         a non-negative similarity
     */
    public double determineSimilarity(String goalTerm, String searchTerm, List<String> errors,
            ITermSimilarity.Continuable cont) {
        String[] goals = getWordFrequencies(goalTerm, errors, cont);
        if (!cont.isContinuing()) {
            return UNKNOWN;
        }

        String[] searches = getWordFrequencies(searchTerm, errors, cont);
        if (!cont.isContinuing()) {
            return UNKNOWN;
        }

        boolean noGoals = (goals == null) || (goals.length == 0);
        boolean noSearches = (searches == null) || (searches.length == 0);
        if (noGoals || noSearches) {
            return UNKNOWN;
        }

        double sum = 0.0;
        int counted = 0;

        for (int g = 0; g < goals.length; g++) {
            for (int s = 0; s < searches.length; s++) {
                double pairScore = getWordSimilarity(goals[g], searches[s], errors);
                if (!cont.isContinuing()) {
                    return UNKNOWN;
                }

                // Only non-negative scores take part in the average.
                if (pairScore >= 0.0) {
                    sum += pairScore;
                    counted++;
                }
            }
        }

        // TODO: UNKNOWN is not quite right when nothing was counted;
        //       better some UNRELATED value
        return (counted > 0) ? (sum / counted) : UNKNOWN;
    } // determineSimilarity
}