edu.cmu.cs.hcii.cogtool.model.CachedGoogleSimilarity.java Source code

Java tutorial

Introduction

Here is the source code for edu.cmu.cs.hcii.cogtool.model.CachedGoogleSimilarity.java

Source

/*******************************************************************************
 * CogTool Copyright Notice and Distribution Terms
 * CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University
 * This software is distributed under the terms of the FSF Lesser
 * Gnu Public License (see LGPL.txt). 
 * 
 * CogTool is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 * 
 * CogTool is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with CogTool; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 * 
 * CogTool makes use of several third-party components, with the 
 * following notices:
 * 
 * Eclipse SWT version 3.448
 * Eclipse GEF Draw2D version 3.2.1
 * 
 * Unless otherwise indicated, all Content made available by the Eclipse 
 * Foundation is provided to you under the terms and conditions of the Eclipse 
 * Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this 
 * Content and is also available at http://www.eclipse.org/legal/epl-v10.html.
 * 
 * CLISP version 2.38
 * 
 * Copyright (c) Sam Steingold, Bruno Haible 2001-2006
 * This software is distributed under the terms of the FSF Gnu Public License.
 * See COPYRIGHT file in clisp installation folder for more information.
 * 
 * ACT-R 6.0
 * 
 * Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere & 
 *                         John R Anderson. 
 * This software is distributed under the terms of the FSF Lesser
 * Gnu Public License (see LGPL.txt).
 * 
 * Apache Jakarta Commons-Lang 2.1
 * 
 * This product contains software developed by the Apache Software Foundation
 * (http://www.apache.org/)
 * 
 * jopt-simple version 1.0
 * 
 * Copyright (c) 2004-2013 Paul R. Holser, Jr.
 * 
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 * Mozilla XULRunner 1.9.0.5
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/.
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 * License for the specific language governing rights and limitations
 * under the License.
 * 
 * The J2SE(TM) Java Runtime Environment version 5.0
 * 
 * Copyright 2009 Sun Microsystems, Inc., 4150
 * Network Circle, Santa Clara, California 95054, U.S.A.  All
 * rights reserved. U.S.  
 * See the LICENSE file in the jre folder for more information.
 ******************************************************************************/

package edu.cmu.cs.hcii.cogtool.model;

import java.util.List;

import org.apache.commons.lang.builder.HashCodeBuilder;

import edu.cmu.cs.hcii.cogtool.CogTool;
import edu.cmu.cs.hcii.cogtool.CogToolPref;
import edu.cmu.cs.hcii.cogtool.util.ObjectLoader;
import edu.cmu.cs.hcii.cogtool.util.ObjectSaver;

//The names of classes and interfaces around this are terrible, but we can't
//change them without breaking old .cgt files, since our persistence
//mechanism leaks the implementation detail of our class names into the
//abstraction of our file format.
//Note that there
//is no such thing as an un-cached TermSimilarity.
//Both GoogleSimilarity and CachedGoogleSimilarity are direct subclasses
//of CachedTermSimilarity, and the one with "Cached" in its name does not
//inherit from the one without. All very confusing.
//TODO once we augment our persistence mechanism in such a way that we
//   can actually rename persistable classes, we should tidy up these names.

/**
 * Word-level similarity computed purely from word frequencies, optionally
 * restricted to a single site.
 * <p>
 * Corresponds to computeGSimilarity in Leonghwee's code.
 * But in the UI it's called PMI-G (word).
 * It also differs from Leonghwee's code in that we now normalize the results.
 */
public class CachedGoogleSimilarity extends CachedTermSimilarity implements ISitedTermSimilarity {
    /** Persistence format version for this class. */
    public static final int edu_cmu_cs_hcii_cogtool_model_CachedGoogleSimilarity_version = 0;

    /** Key under which {@link #site} is stored by the saver and read by the loader. */
    protected static final String contextSiteVAR = "contextSite";

    /** Saves the only persistent state of this class: the context site. */
    private static final ObjectSaver.IDataSaver<CachedGoogleSimilarity> SAVER = new ObjectSaver.ADataSaver<CachedGoogleSimilarity>() {
        @Override
        public int getVersion() {
            return edu_cmu_cs_hcii_cogtool_model_CachedGoogleSimilarity_version;
        }

        @Override
        public void saveData(CachedGoogleSimilarity v, ObjectSaver saver) throws java.io.IOException {
            saver.saveObject(v.site, contextSiteVAR);
        }
    };

    /** Registers {@link #SAVER} under this class's name. */
    public static void registerSaver() {
        ObjectSaver.registerSaver(CachedGoogleSimilarity.class.getName(), SAVER);
    }

    /** Creates instances on load and restores the context site. */
    private static final ObjectLoader.IObjectLoader<CachedGoogleSimilarity> LOADER = new ObjectLoader.AObjectLoader<CachedGoogleSimilarity>() {
        @Override
        public CachedGoogleSimilarity createObject() {
            return new CachedGoogleSimilarity();
        }

        @Override
        public void set(CachedGoogleSimilarity target, String variable, Object value) {
            // Constant-first comparison is null-safe; a null or unrecognized
            // variable name is silently ignored.
            if (contextSiteVAR.equals(variable)) {
                target.site = (String) value;
            }
        }
    };

    /** Registers {@link #LOADER} for this class's name and format version. */
    public static void registerLoader() {
        ObjectLoader.registerLoader(CachedGoogleSimilarity.class.getName(),
                edu_cmu_cs_hcii_cogtool_model_CachedGoogleSimilarity_version, LOADER);
    }

    /** Site the frequency lookups are limited to, or null for no restriction. */
    protected String site = null;

    /** Preference value meaning "derive the index size from {@link #COMMON_WORD}". */
    public static final double PMI_G_SIZE_AUTOMATIC = -1.0;

    /**
     * Scratch instance handed to the registry by {@link #create(String)};
     * kept for reuse whenever the registry already holds an equal algorithm.
     */
    protected static CachedGoogleSimilarity checkAlg = null;

    /**
     * Returns the registered algorithm instance for the given site.
     *
     * @param limitToSite site to restrict lookups to; null or "" means no
     *                    restriction
     * @return the instance held by {@code AlgorithmRegistry.ONLY} for this
     *         configuration
     */
    public static CachedGoogleSimilarity create(String limitToSite) {
        if (checkAlg == null) {
            checkAlg = new CachedGoogleSimilarity();
        }

        checkAlg.resetSite(limitToSite);

        CachedGoogleSimilarity registered = (CachedGoogleSimilarity) AlgorithmRegistry.ONLY.register(checkAlg);

        // If null, then the registry used checkAlg and registered it, so the
        // scratch instance must not be reused (and mutated) by later calls.
        if (registered == null) {
            registered = checkAlg;
            checkAlg = null;
        }

        // registered now contains the algorithm to use
        return registered;
    }

    /** Instances are obtained through {@link #create(String)} or the loader. */
    protected CachedGoogleSimilarity() {
    }

    /**
     * This algorithm also allows one to provide a document set scope
     * by limiting the fetches to a specified site.
     *
     * @param limitToSite the site to use; the empty string is treated as null
     */
    protected void resetSite(String limitToSite) {
        site = "".equals(limitToSite) ? null : limitToSite;
    }

    /**
     * Resets the shared {@code GoogleSimilarity.termFreqParser} for the given
     * word, error list and this algorithm's site, and returns it.
     */
    @Override
    protected IWordFrequencyParser getWordFreqParser(String word, List<String> errors) {
        GoogleSimilarity.termFreqParser.reset(word, errors, site);

        return GoogleSimilarity.termFreqParser;
    } // getWordFreqParser

    /** Word whose frequency stands in for the index size in automatic mode. */
    protected static final String COMMON_WORD = "the";

    /**
     * No need for a similarity URL processor; word similarity is computed
     * purely using word frequencies.
     * <p>
     * The value is log10(p(goal search) / (p(goal) * p(search))) divided by
     * log10(indexSize), where each p is a frequency divided by the index
     * size. The index size comes from {@code CogToolPref.PMI_G_SIZE}, or from
     * the frequency of {@link #COMMON_WORD} when that preference equals
     * {@link #PMI_G_SIZE_AUTOMATIC}.
     *
     * @param goalWord   first word of the pair
     * @param searchWord second word of the pair
     * @param errors     list passed through to the frequency lookups
     * @return the normalized similarity, or {@code UNKNOWN_SIMILARITY} when
     *         the index size is unusable, either single-word probability
     *         product is zero, or the result is not a number
     */
    @Override
    protected Double fetchWordSimilarity(String goalWord, String searchWord, List<String> errors) {
        double indexSize = CogToolPref.PMI_G_SIZE.getDouble();
        if (indexSize == CachedGoogleSimilarity.PMI_G_SIZE_AUTOMATIC) {
            indexSize = getWordFrequency(COMMON_WORD, errors);
        }

        // An index size that is zero, negative, NaN or infinite makes the
        // probabilities meaningless, and a size of exactly 1 makes the
        // normalizing divisor log10(1) == 0. Bail out before issuing the
        // remaining lookups, whose results could not be used anyway.
        double logIndexSize = Math.log10(indexSize);
        if (!(indexSize > 0.0) || Double.isInfinite(indexSize) || logIndexSize == 0.0) {
            return unknownSimilarity(goalWord, searchWord);
        }

        double numerator = getWordFrequency(goalWord + " " + searchWord, errors) / indexSize;
        double denominator = (getWordFrequency(goalWord, errors) / indexSize)
                * (getWordFrequency(searchWord, errors) / indexSize);

        // TODO we may want to normalize the results differently
        if (denominator != 0.0) {
            double unnormalized = Math.log10(numerator / denominator);
            double normalized = unnormalized / logIndexSize;
            // NOTE(review): a pair frequency of zero still yields an infinite
            // value here, as before; confirm callers expect that.
            if (!Double.isNaN(normalized)) {
                CogTool.logger.finer(String.format("Normalizing PMI-G (word) value (%s, %s) from %g to %g", goalWord,
                        searchWord, unnormalized, normalized));
                return Double.valueOf(normalized);
            }
        }

        return unknownSimilarity(goalWord, searchWord);
    }

    /** Logs that no similarity could be computed and returns the unknown marker. */
    private Double unknownSimilarity(String goalWord, String searchWord) {
        CogTool.logger.finer(String.format("Unknown PMI-G (word) similarity for %s, %s", goalWord, searchWord));
        return UNKNOWN_SIMILARITY;
    }

    /**
     * Always throws: this algorithm never uses a similarity parser.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    protected ISimilarityParser getSimilarityParser(String goal, String search, List<String> errors) {
        throw new UnsupportedOperationException(
                "No similarity parser is required for CachedGoogle for: " + goal + ", " + search);
    }

    /** @return the site lookups are limited to, or null if unrestricted */
    public String getContextSite() {
        return site;
    }

    /**
     * Two instances are equal exactly when their context sites are equal
     * (both null, or equal strings).
     */
    @Override
    public boolean equals(Object other) {
        if (!(other instanceof CachedGoogleSimilarity)) {
            return false;
        }

        String otherSite = ((CachedGoogleSimilarity) other).site;

        return (site == null) ? (otherSite == null) : site.equals(otherSite);
    }

    /** Hash consistent with {@link #equals(Object)}: depends only on the site. */
    @Override
    public int hashCode() {
        // Must have a unique ODD number for each class which uses
        // hashCodeBuilder.
        // this   : 139, 207
        HashCodeBuilder hcb = new HashCodeBuilder(139, 207);

        if (site != null) {
            hcb.append(site.hashCode());
        }

        return hcb.toHashCode();
    }

    /** @return a new, unregistered instance with the same context site */
    public ITermSimilarity duplicate() {
        CachedGoogleSimilarity cgs = new CachedGoogleSimilarity();
        cgs.site = site;
        return cgs;
    }
}