dkpro.similarity.algorithms.structure.TokenPairDistanceMeasure.java Source code

Java tutorial

Introduction

Here is the source code for dkpro.similarity.algorithms.structure.TokenPairDistanceMeasure.java

Source

/*******************************************************************************
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package dkpro.similarity.algorithms.structure;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.math.stat.correlation.PearsonsCorrelation;

import dkpro.similarity.algorithms.api.SimilarityException;
import dkpro.similarity.algorithms.api.TextSimilarityMeasureBase;

/**
 * This measure corresponds to the Composite Feature "Distance" as described in:
 * 
 * Hatzivassiloglou, V., Klavans, J., & Eskin, E. (1999)
 * Detecting text similarity over short passages: Exploring linguistic feature
 * combinations via machine learning. Proceedings of the Joint SIGDAT Conference
 * on Empirical Methods in Natural Language Processing and Very Large Corpora. 
 * 
 * Two feature vectors of distances are computed, one for each text. Then they
 * are compared using Pearson correlation. The correlation score is returned
 * as similarity score.
 */
public class TokenPairDistanceMeasure extends TextSimilarityMeasureBase {
    /**
     * Computes the similarity of two token sequences from the relative
     * positions of the tokens they share.
     * 
     * Tokens are lowercased before comparison. For every unordered pair of
     * distinct tokens that occur in both texts, the signed difference between
     * the first-occurrence positions of the two tokens is taken in each text
     * and passed through {@link #transform(int)}. The two resulting vectors
     * (one entry per pair) are compared using Pearson correlation.
     * 
     * @param stringList1 tokens of the first text, in text order
     * @param stringList2 tokens of the second text, in text order
     * @return the Pearson correlation of the two distance vectors, or
     *         {@code 0.0} if the texts share fewer than two token pairs
     * @throws SimilarityException declared by the overridden method; not
     *         thrown directly by this implementation
     */
    @Override
    public double getSimilarity(Collection<String> stringList1, Collection<String> stringList2)
            throws SimilarityException {
        List<String> tokens1 = toLowerCase(stringList1);
        List<String> tokens2 = toLowerCase(stringList2);

        // Distinct tokens that occur in both texts
        List<String> commonStrings = new ArrayList<String>(new HashSet<String>(tokens1));
        commonStrings.retainAll(new HashSet<String>(tokens2));

        // Build up pairs (ignoring order, i.e. a-b or b-a). Visiting only
        // i < j yields each unordered pair exactly once, in the orientation
        // (earlier common token, later common token).
        Set<Pair> pairs = new HashSet<Pair>();
        for (int i = 0; i < commonStrings.size(); i++) {
            for (int j = i + 1; j < commonStrings.size(); j++) {
                pairs.add(new Pair(commonStrings.get(i), commonStrings.get(j)));
            }
        }

        // A correlation needs at least two observations
        if (pairs.size() < 2) {
            return 0.0;
        }

        // Look up first-occurrence positions once instead of re-scanning the
        // token lists for every pair
        Map<String, Integer> positions1 = firstPositions(tokens1);
        Map<String, Integer> positions2 = firstPositions(tokens2);

        double[] v1 = new double[pairs.size()];
        double[] v2 = new double[pairs.size()];

        int i = 0;
        for (Pair p : pairs) {
            String a = p.getString1();
            String b = p.getString2();

            // Both tokens are common to both texts, so every lookup succeeds
            v1[i] = transform(positions1.get(a).intValue() - positions1.get(b).intValue());
            v2[i] = transform(positions2.get(a).intValue() - positions2.get(b).intValue());
            i++;
        }

        PearsonsCorrelation pearson = new PearsonsCorrelation();
        return pearson.correlation(v1, v2);
    }

    /**
     * Hook applied to every signed position difference before it is stored in
     * a feature vector. This implementation returns the value unchanged;
     * subclasses may override it.
     * 
     * @param diff position of the first token of a pair minus the position of
     *        the second token
     * @return the value to store in the feature vector
     */
    public int transform(int diff) {
        // Pass through
        return diff;
    }

    /**
     * Returns a lowercased copy of the given tokens, preserving their order.
     * Uses {@link Locale#ROOT} so the result does not depend on the default
     * locale of the JVM.
     */
    private static List<String> toLowerCase(Collection<String> tokens) {
        List<String> result = new ArrayList<String>(tokens.size());
        for (String token : tokens) {
            result.add(token.toLowerCase(Locale.ROOT));
        }
        return result;
    }

    /**
     * Maps each distinct token to the index of its first occurrence in the
     * given list (the value {@code List.indexOf} would return for it).
     */
    private static Map<String, Integer> firstPositions(List<String> tokens) {
        Map<String, Integer> positions = new HashMap<String, Integer>();
        for (int i = 0; i < tokens.size(); i++) {
            String token = tokens.get(i);
            if (!positions.containsKey(token)) {
                positions.put(token, Integer.valueOf(i));
            }
        }
        return positions;
    }

    /**
     * An unordered pair of strings: (a, b) equals (b, a) and both have the
     * same hash code. The orientation given at construction is still kept and
     * exposed through the getters.
     */
    private static final class Pair {
        private final String s1;
        private final String s2;

        public Pair(String s1, String s2) {
            this.s1 = s1;
            this.s2 = s2;
        }

        @Override
        public int hashCode() {
            // Addition is commutative, so the hash ignores orientation
            return s1.hashCode() + s2.hashCode();
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof Pair)) {
                return false;
            }
            Pair other = (Pair) obj;
            boolean sameOrientation = s1.equals(other.s1) && s2.equals(other.s2);
            boolean mirrored = s2.equals(other.s1) && s1.equals(other.s2);
            return sameOrientation || mirrored;
        }

        public String getString1() {
            return s1;
        }

        public String getString2() {
            return s2;
        }
    }
}