com.mythesis.profileanalysis.Metrics.java Source code

Introduction

Here is the source code for com.mythesis.profileanalysis.Metrics.java
Source

/* 
 * Copyright 2015 Konstantinos Papangelou.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mythesis.profileanalysis;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import static org.apache.commons.io.FileUtils.readLines;
import static org.apache.commons.math3.special.Gamma.logGamma;

/**
 * This class contains useful metrics in order to evaluate LDA's results
 * @author Konstantinos Papangelou
 */
public class Metrics {

    private static final Logger LOGGER = Logger.getLogger(Metrics.class.getName());

    /**
     * Computes the log-likelihood {@code log p(z) + log p(w|z)} of an LDA model
     * from its saved topic assignments.
     *
     * <p>Two files are looked up in {@code LDAdirectory}; both must have a name
     * that starts with {@code file}:
     * <ul>
     *   <li>one ending in {@code others}, made of {@code key=value} lines from
     *       which {@code alpha}, {@code beta}, {@code ntopics}, {@code ndocs}
     *       and {@code nwords} are read;</li>
     *   <li>one ending in {@code tassign}, holding one document per line as
     *       space-separated {@code wordId:topicId} tokens.</li>
     * </ul>
     *
     * <p>With {@code T} topics, {@code W} vocabulary words and {@code D}
     * documents the value returned is
     * <pre>
     *   sum_d [ sum_j (logG(alpha + n_d_j) - logG(alpha)) - logG(T*alpha + n_d) ] + D*logG(T*alpha)
     * + sum_j [ sum_w (logG(beta + n_w_j) - logG(beta)) - logG(W*beta + n_j) ] + T*logG(W*beta)
     * </pre>
     * where zero counts are skipped because their term cancels to zero.
     *
     * <p>If the directory cannot be listed or a file cannot be read, the problem
     * is logged and the computation goes on with whatever was read; parameters
     * that were never read stay at zero, so the result is then not meaningful.
     *
     * @param LDAdirectory the directory that contains the model to be evaluated
     * @param file the name prefix of the model files inside {@code LDAdirectory}
     * @return the log-likelihood of the model
     * @throws NumberFormatException if a parameter value or a token id is not a number
     * @throws ArrayIndexOutOfBoundsException if a token has no {@code :topicId}
     *         part, or if an id or the number of documents exceeds the sizes
     *         declared in the {@code others} file
     */
    public double getLogLikelihood(String LDAdirectory, String file) {
        List<String> others = new ArrayList<>();
        List<String> tassign = new ArrayList<>();

        File[] contents = new File(LDAdirectory).listFiles();
        if (contents == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            LOGGER.log(Level.SEVERE, "Cannot list the contents of directory: " + LDAdirectory);
            contents = new File[0];
        }
        for (File f : contents) {
            // getName() is separator-agnostic, unlike searching for '\\' in the path.
            String name = f.getName();
            if (!name.startsWith(file)) {
                continue;
            }
            if (name.endsWith("others")) {
                others = readModelFile(f, others);
            }
            if (name.endsWith("tassign")) {
                tassign = readModelFile(f, tassign);
            }
        }

        double alpha = 0.0;
        double beta = 0.0;
        int numTopics = 0;      // T
        int vocabularySize = 0; // W
        int numDocs = 0;        // D
        for (String line : others) {
            if (line.startsWith("nwords")) {
                vocabularySize = Integer.parseInt(settingValue(line));
            }
            if (line.startsWith("ntopics")) {
                numTopics = Integer.parseInt(settingValue(line));
            }
            if (line.startsWith("beta")) {
                beta = Double.parseDouble(settingValue(line));
            }
            if (line.startsWith("alpha")) {
                alpha = Double.parseDouble(settingValue(line));
            }
            if (line.startsWith("ndocs")) {
                numDocs = Integer.parseInt(settingValue(line));
            }
        }

        // topicTotals[j]        (n_j)   : number of tokens assigned to topic j
        // wordTopicCounts[w][j] (n_w_j) : number of times word w is assigned to topic j
        // docTopicCounts[d][j]  (n_d_j) : number of tokens of document d assigned to topic j
        // docLengths[d]         (n_d)   : number of tokens in document d
        int[] topicTotals = new int[numTopics];
        int[][] wordTopicCounts = new int[vocabularySize][numTopics];
        int[][] docTopicCounts = new int[numDocs][numTopics];
        int[] docLengths = new int[numDocs];

        int docIndex = 0;
        for (String doc : tassign) {
            for (String token : doc.split(" ")) {
                if (token.isEmpty()) {
                    // produced by blank lines or repeated spaces; not a real token
                    continue;
                }
                String[] parts = token.split(":");
                int word = Integer.parseInt(parts[0]);
                // Parse the topic id exactly; suffix matching would confuse e.g. 2 and 12.
                int topic = Integer.parseInt(parts[1]);

                wordTopicCounts[word][topic]++;
                topicTotals[topic]++;
                docTopicCounts[docIndex][topic]++;
                docLengths[docIndex]++;
            }
            docIndex++;
        }

        double logLikelihood = 0.0;

        // first part - log(p(z))
        double logGammaAlpha = logGamma(alpha);
        for (int doc = 0; doc < numDocs; doc++) {
            for (int topic = 0; topic < numTopics; topic++) {
                if (docTopicCounts[doc][topic] > 0) {
                    logLikelihood += logGamma(alpha + docTopicCounts[doc][topic]) - logGammaAlpha;
                }
            }
            logLikelihood -= logGamma(numTopics * alpha + docLengths[doc]);
        }
        logLikelihood += numDocs * logGamma(alpha * numTopics);

        // second part - log(p(w|z))
        double logGammaBeta = logGamma(beta);
        for (int word = 0; word < vocabularySize; word++) {
            for (int topic = 0; topic < numTopics; topic++) {
                if (wordTopicCounts[word][topic] == 0) {
                    continue;
                }
                // logGamma(beta) is subtracted, symmetric to the alpha term above
                logLikelihood += logGamma(beta + wordTopicCounts[word][topic]) - logGammaBeta;
            }
        }
        for (int topic = 0; topic < numTopics; topic++) {
            logLikelihood -= logGamma((beta * vocabularySize) + topicTotals[topic]);
        }
        logLikelihood += numTopics * logGamma(beta * vocabularySize);

        return logLikelihood;
    }

    /**
     * Reads all lines of a model file, logging instead of propagating I/O failures.
     *
     * @param source the file to read
     * @param fallback the value to return when the file cannot be read
     * @return the lines of {@code source}, or {@code fallback} on failure
     */
    private static List<String> readModelFile(File source, List<String> fallback) {
        try {
            return readLines(source);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Could not read model file: " + source, ex);
            return fallback;
        }
    }

    /**
     * Extracts the value part of a {@code key=value} line, without surrounding whitespace.
     *
     * @param line a line of the parameters file
     * @return the text after the first {@code '='}, trimmed
     */
    private static String settingValue(String line) {
        return line.substring(line.indexOf('=') + 1).trim();
    }
}