com.mythesis.profileanalysis.WordVectorFinder.java Source code

Introduction

Here is the source code for com.mythesis.profileanalysis.WordVectorFinder.java
Source

/* 
 * Copyright 2015 Konstantinos Papangelou
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mythesis.profileanalysis;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;

/**
 * Builds and post-processes the word vectors of a domain.
 * <p>
 * A word vector is stored as a plain text file named
 * {@code wordVector<name>.txt} with one word per line.
 *
 * @author Konstantinos Papangelou
 */
public class WordVectorFinder {

    private static final Logger LOGGER = Logger.getLogger(WordVectorFinder.class.getName());

    /**
     * Calculates the word vector of a specific domain and stores it as
     * {@code wordVector<domain>.txt} inside {@code wordVectorDirectory}.
     * <p>
     * The vector is the union of the top words of the selected top topics; every
     * word appears once, in the order in which it was first encountered. An I/O
     * failure while writing is logged and not propagated.
     *
     * @param domain the general domain
     * @param wordVectorDirectory the directory where the word vector is saved
     *        (with or without a trailing file separator)
     * @param LDAdirectory LDA directory
     * @param nTopTopics number of top topics; values larger than {@code nTopics} are
     *        reduced to {@code nTopics}
     * @param choice how to select the top topics (1 for average, 2 for median, 3 for
     *        number of documents that have probability higher than 1/nTopics)
     * @param top_words number of top words
     * @param nTopics total number of topics
     */
    public void getWordVector(String domain, String wordVectorDirectory, String LDAdirectory, int nTopTopics,
            int choice, int top_words, int nTopics) {

        // there cannot be more top topics than topics in total
        int effectiveTopTopics = Math.min(nTopTopics, nTopics);

        // get a number of top topics and a number of top words from every topic
        LDAtopicsWords topicsReader = new LDAtopicsWords();
        HashMap<Integer, HashMap<String, Double>> topicWordProbMap = topicsReader.readFile(LDAdirectory, top_words,
                nTopics, effectiveTopTopics, choice);

        // a LinkedHashSet de-duplicates in constant time per word and keeps first-seen order
        Set<String> uniqueWords = new LinkedHashSet<>();
        for (Map<String, Double> wordProbabilities : topicWordProbMap.values()) {
            uniqueWords.addAll(wordProbabilities.keySet());
        }

        // store the word vector
        File wordVectorFile = resolveInDirectory(wordVectorDirectory, "wordVector" + domain + ".txt");
        try {
            FileUtils.writeLines(wordVectorFile, uniqueWords);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Could not write word vector file " + wordVectorFile, ex);
        }
    }

    /**
     * Cleans the word vector of a certain sub-domain by removing every word that also
     * occurs in the word vector of the general domain.
     * <p>
     * Reads {@code wordVector<master>.txt} and {@code wordVector<profile>.txt} from
     * {@code directory} and writes the result to {@code wordVectorCleaned<profile>.txt}
     * in the same directory. If either input cannot be read, the failure is logged and
     * no output file is written, so that a read error is never mistaken for an empty
     * cleaned vector. A failure while writing is logged and not propagated.
     *
     * @param directory the directory that holds the word vectors and receives the result
     * @param profile the profile (sub-domain) whose word vector is cleaned
     * @param master the general domain
     */
    public void cleanWordVector(String directory, String profile, String master) {

        File masterFile = resolveInDirectory(directory, "wordVector" + master + ".txt");
        File profileFile = resolveInDirectory(directory, "wordVector" + profile + ".txt");

        List<String> masterVector;
        List<String> wordVector;
        try {
            masterVector = FileUtils.readLines(masterFile); // word vector of the general domain
            wordVector = FileUtils.readLines(profileFile); // word vector of the sub-domain
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Could not read word vectors " + masterFile + " and " + profileFile, ex);
            return;
        }

        // remove any word that occurs in the general domain's word vector;
        // the HashSet makes each membership test constant time
        wordVector.removeAll(new HashSet<>(masterVector));

        // save the new word vector
        File cleanedFile = resolveInDirectory(directory, "wordVectorCleaned" + profile + ".txt");
        try {
            FileUtils.writeLines(cleanedFile, wordVector);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Could not write cleaned word vector file " + cleanedFile, ex);
        }
    }

    /**
     * Resolves a file name against a directory using the platform's file separator.
     * A {@code null} or empty directory yields a path relative to the working directory.
     */
    private static File resolveInDirectory(String directory, String fileName) {
        if (directory == null || directory.isEmpty()) {
            return new File(fileName);
        }
        return new File(directory, fileName);
    }
}