org.dkpro.similarity.algorithms.structure.PosNGramJaccardMeasure.java Source code

Java tutorial

Introduction

Here is the source code for org.dkpro.similarity.algorithms.structure.PosNGramJaccardMeasure.java

Source

/*******************************************************************************
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universitt Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package org.dkpro.similarity.algorithms.structure;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.lang.NotImplementedException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.dkpro.similarity.algorithms.api.JCasTextSimilarityMeasureBase;
import org.dkpro.similarity.algorithms.api.SimilarityException;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;

/**
 * Computes the overlap of part-of-speech n-grams using the
 * Jaccard measure, similarly as for the word n-grams
 * in the {@code WordNGramJaccardMeasure}. 
 */
public class PosNGramJaccardMeasure extends JCasTextSimilarityMeasureBase {
    int n;

    public PosNGramJaccardMeasure(int n) {
        this.n = n;
    }

    @Override
    public double getSimilarity(Collection<String> stringList1, Collection<String> stringList2)
            throws SimilarityException {
        throw new SimilarityException(new NotImplementedException());
    }

    @Override
    public double getSimilarity(JCas jcas1, JCas jcas2) throws SimilarityException {
        // Get POS
        Collection<POS> pos1 = JCasUtil.select(jcas1, POS.class);
        Collection<POS> pos2 = JCasUtil.select(jcas2, POS.class);

        return getPosNGramSimilarity(pos1, pos2);
    }

    @Override
    public double getSimilarity(JCas jcas1, JCas jcas2, Annotation coveringAnnotation1,
            Annotation coveringAnnotation2) throws SimilarityException {
        // Get POS
        Collection<POS> pos1 = JCasUtil.selectCovered(jcas1, POS.class, coveringAnnotation1);
        Collection<POS> pos2 = JCasUtil.selectCovered(jcas2, POS.class, coveringAnnotation2);

        return getPosNGramSimilarity(pos1, pos2);
    }

    private double getPosNGramSimilarity(Collection<POS> pos1, Collection<POS> pos2) {
        // Get n-grams
        Set<List<String>> ngrams1 = getPosNGrams(new ArrayList<POS>(pos1));
        Set<List<String>> ngrams2 = getPosNGrams(new ArrayList<POS>(pos2));

        double sim = getNormalizedSimilarity(ngrams1, ngrams2);

        return sim;
    }

    private Set<List<String>> getPosNGrams(List<POS> pos) {
        Set<List<String>> ngrams = new HashSet<List<String>>();

        for (int i = 0; i < pos.size() - (n - 1); i++) {
            // Generate n-gram at index i
            List<String> ngram = new ArrayList<String>();
            for (int k = 0; k < n; k++) {
                String token = pos.get(i + k).getPosValue();
                ngram.add(token);
            }

            // Add
            ngrams.add(ngram);
        }

        return ngrams;
    }

    protected double getNormalizedSimilarity(Set<List<String>> suspiciousNGrams, Set<List<String>> originalNGrams) {
        // Compare using the Jaccard similarity coefficient (Manning & Schtze, 1999)
        Set<List<String>> commonNGrams = new HashSet<List<String>>();
        commonNGrams.addAll(suspiciousNGrams);
        commonNGrams.retainAll(originalNGrams);

        Set<List<String>> unionNGrams = new HashSet<List<String>>();
        unionNGrams.addAll(suspiciousNGrams);
        unionNGrams.addAll(originalNGrams);

        double norm = unionNGrams.size();
        double sim = 0.0;

        if (norm > 0.0) {
            sim = commonNGrams.size() / norm;
        }

        return sim;
    }

    @Override
    public String getName() {
        return this.getClass().getSimpleName() + "_" + n + "grams";
    }
}