// Java tutorial
/* * Copyright 2015 Konstantinos Papangelou * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.mythesis.profileanalysis; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.io.FileUtils; /** * This class calculates the wordvectors of a specific domain * @author Konstantinos Papangelou */ public class WordVectorFinder { /** * a method that calculates the word vector for a specific domain * @param domain the general domain * @param wordVectorDirectory the directory where i will save the word vector * @param LDAdirectory LDA directory * @param nTopTopics number of top topics * @param choice how to select the top topics (1 for average, 2 for median, 3 for number of documents that have probability higher than 1/nTopics) * @param top_words number of top words * @param nTopics total number of topics */ public void getWordVector(String domain, String wordVectorDirectory, String LDAdirectory, int nTopTopics, int choice, int top_words, int nTopics) { String path = LDAdirectory; LDAtopicsWords rk = new LDAtopicsWords(); if (nTopTopics > nTopics) nTopTopics = nTopics; //get a number of top topics and a number of top words from every topic HashMap<Integer, HashMap<String, Double>> topicwordprobmap = rk.readFile(path, top_words, nTopics, nTopTopics, choice); List<String> 
wordVector = new ArrayList<>(); for (Integer topicindex : topicwordprobmap.keySet()) { //iterate through every topic Set keySet = topicwordprobmap.get(topicindex).keySet(); Iterator iterator = keySet.iterator(); while (iterator.hasNext()) { //iterate through every word String word = iterator.next().toString(); if (!wordVector.contains(word)) { wordVector.add(word); //put the word in word vector } } } //store the word vector File wordVectorFile = new File(wordVectorDirectory + "wordVector" + domain + ".txt"); try { FileUtils.writeLines(wordVectorFile, wordVector); } catch (IOException ex) { Logger.getLogger(WordVectorFinder.class.getName()).log(Level.SEVERE, null, ex); } } /** * a method that cleans the word vector of a certain sub-domain * @param directory the directory where im gonna save the results * @param profile the profile i want to get the wordvector * @param master the general domain */ public void cleanWordVector(String directory, String profile, String master) { List<String> masterVector = new ArrayList<>(); List<String> wordVector = new ArrayList<>(); File wordsFile; try { wordsFile = new File(directory + "\\wordVector" + master + ".txt"); masterVector = FileUtils.readLines(wordsFile); // get the word vector of the general domain wordsFile = new File(directory + "\\wordVector" + profile + ".txt"); wordVector = FileUtils.readLines(wordsFile); // get the word vector of the sub-domain } catch (IOException ex) { Logger.getLogger(WordVectorFinder.class.getName()).log(Level.SEVERE, null, ex); } for (String s : masterVector) { if (wordVector.contains(s)) wordVector.remove(s); // remove any word that occurs in general domain's word vector } // save the new word vector File wordVectorPath = new File(directory + "\\wordVectorCleaned" + profile + ".txt"); try { FileUtils.writeLines(wordVectorPath, wordVector); } catch (IOException ex) { Logger.getLogger(WordVectorFinder.class.getName()).log(Level.SEVERE, null, ex); } } }