// Java tutorial
/*
 * Copyright 2015 Konstantinos Papangelou.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mythesis.profileanalysis;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import static org.apache.commons.io.FileUtils.readLines;
import static org.apache.commons.math3.special.Gamma.logGamma;

/**
 * This class contains useful metrics for evaluating LDA's results.
 *
 * @author Konstantinos Papangelou
 */
public class Metrics {

    private static final Logger LOGGER = Logger.getLogger(Metrics.class.getName());

    /**
     * Computes the log-likelihood of a trained LDA model from its output files.
     *
     * <p>Two files are looked up in {@code LDAdirectory}; both must have a name that
     * starts with {@code file}:
     * <ul>
     *   <li>the one ending in {@code others} holds {@code key=value} lines, of which
     *       {@code nwords}, {@code ntopics}, {@code ndocs}, {@code alpha} and
     *       {@code beta} are read;</li>
     *   <li>the one ending in {@code tassign} holds one line per document, each line
     *       being space-separated {@code wordId:topicId} tokens.</li>
     * </ul>
     *
     * <p>The result is {@code log p(z) + log p(w|z)} with symmetric Dirichlet priors
     * {@code alpha} and {@code beta}. If a file is missing or unreadable the problem is
     * logged and the corresponding values stay at their zero/empty defaults, in which
     * case the returned number is not meaningful.
     *
     * @param LDAdirectory the directory that contains the model to be evaluated
     * @param file the name of the file that contains the documents
     * @return log-likelihood
     * @throws NumberFormatException if a setting or a token is not a valid number
     * @throws ArrayIndexOutOfBoundsException if the assignments reference a document,
     *         word or topic outside the sizes declared in the {@code others} file
     */
    public double getLogLikelihood(String LDAdirectory, String file) {
        List<String> others = new ArrayList<>();
        List<String> tassign = new ArrayList<>();

        File[] contents = new File(LDAdirectory).listFiles();
        if (contents == null) {
            // listFiles() yields null when the path is not a directory or cannot be read.
            LOGGER.log(Level.SEVERE, "Cannot list LDA directory: {0}", LDAdirectory);
            contents = new File[0];
        }
        for (File candidate : contents) {
            // getName() is separator-agnostic, unlike searching for '\\' in the path.
            String name = candidate.getName();
            if (!name.startsWith(file)) {
                continue;
            }
            if (name.endsWith("others")) {
                others = readLinesOrKeep(candidate, others);
            }
            if (name.endsWith("tassign")) {
                tassign = readLinesOrKeep(candidate, tassign);
            }
        }

        double alpha = 0.0;
        double beta = 0.0;
        int numTopics = 0;
        int vocabSize = 0;
        int numDocs = 0;
        for (String setting : others) {
            String value = setting.substring(setting.indexOf('=') + 1).trim();
            if (setting.startsWith("nwords")) {
                vocabSize = Integer.parseInt(value);
            } else if (setting.startsWith("ntopics")) {
                numTopics = Integer.parseInt(value);
            } else if (setting.startsWith("beta")) {
                beta = Double.parseDouble(value);
            } else if (setting.startsWith("alpha")) {
                alpha = Double.parseDouble(value);
            } else if (setting.startsWith("ndocs")) {
                numDocs = Integer.parseInt(value);
            }
        }

        // n_w_j is the number of times term w has been assigned to topic j
        // n_j   is the total number of terms assigned to topic j
        // n_d_j is the number of times a word from document d has been assigned to topic j
        // n_d   is the total number of word tokens in document d
        int[] n_j = new int[numTopics];
        int[][] n_w_j = new int[vocabSize][numTopics];
        int[][] n_d_j = new int[numDocs][numTopics];
        int[] n_d = new int[numDocs];

        int docIndex = 0;
        for (String doc : tassign) {
            for (String token : doc.split(" ")) {
                if (token.isEmpty()) {
                    // Blank lines and repeated spaces produce empty tokens; they carry no assignment.
                    continue;
                }
                // Parse the exact topic id; suffix matching would confuse e.g. topic 5 with 15.
                int separator = token.indexOf(':');
                int word = Integer.parseInt(token.substring(0, separator));
                int topic = Integer.parseInt(token.substring(separator + 1));
                n_w_j[word][topic]++;
                n_j[topic]++;
                n_d_j[docIndex][topic]++;
                n_d[docIndex]++;
            }
            docIndex++;
        }

        double logLikelihood = 0.0;

        // first part - log(p(z))
        // Zero counts are skipped because logGamma(alpha + 0) - logGamma(alpha) == 0.
        double logGammaAlpha = logGamma(alpha);
        for (int doc = 0; doc < numDocs; doc++) {
            for (int topic = 0; topic < numTopics; topic++) {
                if (n_d_j[doc][topic] > 0) {
                    logLikelihood += logGamma(alpha + n_d_j[doc][topic]) - logGammaAlpha;
                }
            }
            logLikelihood -= logGamma(numTopics * alpha + n_d[doc]);
        }
        logLikelihood += numDocs * logGamma(alpha * numTopics);

        // second part - log(p(w|z))
        // Same cancellation as above, so the prior term logGamma(beta) is subtracted per non-zero count.
        double logGammaBeta = logGamma(beta);
        for (int word = 0; word < vocabSize; word++) {
            for (int topic = 0; topic < numTopics; topic++) {
                if (n_w_j[word][topic] > 0) {
                    logLikelihood += logGamma(beta + n_w_j[word][topic]) - logGammaBeta;
                }
            }
        }
        for (int topic = 0; topic < numTopics; topic++) {
            logLikelihood -= logGamma((beta * vocabSize) + n_j[topic]);
        }
        logLikelihood += numTopics * logGamma(beta * vocabSize);

        return logLikelihood;
    }

    /**
     * Reads all lines of {@code source}; on an I/O failure logs it and returns {@code fallback}.
     */
    private static List<String> readLinesOrKeep(File source, List<String> fallback) {
        try {
            return readLines(source);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            return fallback;
        }
    }
}