org.carrot2.output.metrics.NormalizedMutualInformationMetric.java Source code

Java tutorial

Introduction

Here is the source code for org.carrot2.output.metrics.NormalizedMutualInformationMetric.java

Source

/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.output.metrics;

import java.util.*;

import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.util.attribute.*;

import com.google.common.base.Function;
import com.google.common.collect.*;

/**
 * Computes Normalized Mutual Information (NMI) metric for the cluster set.
 * <p>
 * Metrics will be calculated only if all input documents have non-blank
 * {@link Document#PARTITIONS}.
 * </p>
 * <p>
 * The value is the mutual information between clusters and partitions divided by the
 * arithmetic mean of the cluster entropy and the partition entropy. Natural logarithms
 * are used throughout.
 * </p>
 */
@Bindable
public class NormalizedMutualInformationMetric extends IdealPartitioningBasedMetric {
    /**
     * Normalized Mutual Information of the whole cluster set.
     */
    @Processing
    @Output
    @Attribute
    public Double normalizedMutualInformation;

    // Input documents, bound to AttributeNames.DOCUMENTS.
    @Processing
    @Input
    @Attribute(key = AttributeNames.DOCUMENTS)
    public List<Document> documents;

    // Input clusters, bound to AttributeNames.CLUSTERS.
    @Processing
    @Input
    @Attribute(key = AttributeNames.CLUSTERS)
    public List<Cluster> clusters;

    /**
     * Calculate Normalized Mutual Information metric.
     */
    @Processing
    @Input
    @Attribute
    public boolean enabled = true;

    /**
     * Calculates {@link #normalizedMutualInformation} from {@link #documents} and
     * {@link #clusters}.
     * <p>
     * The output is left untouched when <code>getPartitionsCount(documents)</code> is
     * zero or when there are no clusters. It is set to <code>0.0</code> when there is at
     * most one distinct partition, or when the mean of the two entropies is zero (the
     * ratio would otherwise be NaN or infinite).
     * </p>
     */
    public void calculate() {
        final int partitionCount = getPartitionsCount(documents);
        if (partitionCount == 0) {
            return;
        }

        if (clusters == null || clusters.isEmpty()) {
            return;
        }

        final Set<Object> partitions = getPartitions(documents);
        final SetMultimap<Object, Document> documentsByPartition = getDocumentsByPartition(documents);
        final Map<Object, Integer> documentCountByPartition = getDocumentCountByPartition(documents);
        final int documentCount = documents.size();

        if (partitions.size() <= 1) {
            normalizedMutualInformation = 0.0;
            return;
        }

        final Collection<Integer> partitionSizes = Maps
                .transformValues(documentsByPartition.asMap(), new Function<Collection<Document>, Integer>() {
                    public Integer apply(Collection<Document> partitionDocuments) {
                        return partitionDocuments.size();
                    }
                }).values();
        final double partitionEntropy = entropy(documentCount,
                partitionSizes.toArray(new Integer[partitionSizes.size()]));

        final List<Integer> clusterSizes = Lists.transform(clusters, new Function<Cluster, Integer>() {
            public Integer apply(Cluster cluster) {
                return cluster.size();
            }
        });
        final double clusterEntropy = entropy(documentCount,
                clusterSizes.toArray(new Integer[clusterSizes.size()]));

        // NOTE(review): clusterEntropy above is computed over all clusters, including
        // the "other topics" and empty clusters that are skipped below - confirm that
        // this asymmetry is intended.
        double mutualInformation = 0;
        for (Cluster cluster : clusters) {
            // Everything about the cluster is invariant with respect to the partition,
            // so fetch and test it once per cluster rather than once per pair.
            final List<Document> clusterDocuments = cluster.getAllDocuments();
            if (cluster.isOtherTopics() || clusterDocuments.isEmpty()) {
                continue;
            }

            final int clusterSize = cluster.size();
            final Set<Document> clusterDocumentSet = Sets.newHashSet(clusterDocuments);

            for (Object partition : partitions) {
                // Number of documents of this partition that are also in the cluster.
                int commonDocumentsCount = 0;
                for (Document document : documentsByPartition.get(partition)) {
                    if (clusterDocumentSet.contains(document)) {
                        commonDocumentsCount++;
                    }
                }

                if (commonDocumentsCount != 0) {
                    final int partitionSize = documentCountByPartition.get(partition);

                    // Multiply as doubles: the int products overflow for large inputs.
                    final double ratio = ((double) documentCount * commonDocumentsCount)
                            / ((double) clusterSize * partitionSize);
                    mutualInformation += (commonDocumentsCount / (double) documentCount) * Math.log(ratio);
                }
            }
        }

        final double meanEntropy = (clusterEntropy + partitionEntropy) / 2;
        if (meanEntropy == 0) {
            // Degenerate case; report 0.0 as for the single-partition case above
            // instead of NaN or infinity.
            normalizedMutualInformation = 0.0;
            return;
        }

        normalizedMutualInformation = mutualInformation / meanEntropy;
    }

    /**
     * Computes <code>-sum(p * ln(p))</code> where <code>p = element / count</code> for
     * each of the provided elements. Zero elements are skipped.
     *
     * @param count the total by which every element is divided
     * @param elements group sizes
     * @return the entropy value (natural logarithm)
     */
    private double entropy(int count, Integer... elements) {
        double entropy = 0;
        for (int d : elements) {
            if (d != 0) {
                final double p = d / (double) count;
                entropy += p * Math.log(p);
            }
        }
        return -entropy;
    }

    /**
     * @return the current value of {@link #enabled}
     */
    public boolean isEnabled() {
        return enabled;
    }
}