Java tutorial
/*
 * Copyright 2015 Konstantinos Papangelou.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mythesis.profileanalysis;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import static org.apache.commons.io.FileUtils.directoryContains;
import static org.apache.commons.io.FileUtils.readFileToString;
import org.apache.commons.lang3.StringUtils;

/**
 * This class contains auxiliary functions.
 * @author Konstantinos Papangelou
 */
public class Utils {

    /**
     * Method that returns all the files of a certain extension from a directory.
     * @param directory_path a String with the directory
     * @param filetype a String with the filetype (without the dot symbol)
     * @return a Collection that contains all the files found
     */
    public Collection<File> getinputfiles(String directory_path, String filetype) {
        // the file extensions to look for, e.g. {"txt", "jpeg", "pdf"}
        String[] extensions = {filetype};
        File directory = new File(directory_path);
        // FileUtils.listFiles(File directory, String[] extensions, boolean recursive):
        // the last argument is false, so subdirectories are not searched recursively
        return FileUtils.listFiles(directory, extensions, false);
    }
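    /**
     * Illustrative usage sketch added for this tutorial (not part of the original class).
     * It lists every file that getinputfiles returns for a given extension; the directory
     * path and the "txt" extension below are hypothetical placeholders.
     */
    public static void exampleGetinputfilesUsage() {
        Utils utils = new Utils();
        // collect the .txt files that sit directly under the (hypothetical) directory
        Collection<File> txtFiles = utils.getinputfiles("C:\\data\\webpages", "txt");
        for (File f : txtFiles) {
            System.out.println(f.getPath());
        }
    }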
    /**
     * Method that gets the results from sWebRank and puts them all together in the same file.
     * @param inputDirectory the output directory of sWebRank
     * @param outputDirectory the directory where the new file will be saved
     * @param file the name of the file where the results will be saved, e.g. xxx.txt
     */
    public void getLDAcontent(String inputDirectory, String outputDirectory, String file) {
        System.out.println("Getting webpages from " + inputDirectory);
        // find the output path of each sWebRank level
        File root = new File(inputDirectory);
        File[] contents = root.listFiles();
        List<String> sWebRanklevels = new ArrayList<>();
        for (File f : contents) {
            if (f.getAbsolutePath().contains("level"))
                sWebRanklevels.add(f.getAbsolutePath());
        }
        // find all query paths (for now this only works for the bing search engine)
        List<String> totalQueries = new ArrayList<>();
        for (String s : sWebRanklevels) {
            File level = new File(s);
            File[] queries = level.listFiles();
            for (File f : queries) {
                if (!f.getAbsolutePath().contains("txt"))
                    totalQueries.add(f.getAbsolutePath() + "\\" + "bing" + "\\");
            }
        }
        // read the parsed content of every webpage found under the query paths
        List<String> totalContent = new ArrayList<>();
        String webpage = "";
        for (String s : totalQueries) {
            File level = new File(s);
            File[] docPaths = level.listFiles();
            for (File f : docPaths) {
                String str = f.getAbsolutePath();
                if (StringUtils.isNumeric(str.substring(str.lastIndexOf("\\") + 1))) {
                    File webPagePath = new File(str + "\\html_parse_content.txt");
                    try {
                        if (!directoryContains(f, webPagePath)) {
                            // fall back to the youtube content file if no parsed html exists
                            webPagePath = new File(f.getAbsolutePath() + "\\youtube_content.txt");
                            if (!directoryContains(f, webPagePath))
                                continue;
                            webpage = readFileToString(webPagePath).replace("\n", "").replace("\r", "");
                            YoutubeContent ytc = new YoutubeContent();
                            webpage = ytc.parse(webpage);
                        } else {
                            webpage = readFileToString(webPagePath);
                        }
                    } catch (IOException ex) {
                        Logger.getLogger(Utils.class.getName()).log(Level.SEVERE, null, ex);
                    }
                    if (webpage.isEmpty())
                        continue;
                    totalContent.add(webpage);
                }
            }
        }
        // remove duplicate documents
        Set<String> hs = new HashSet<>();
        hs.addAll(totalContent);
        totalContent.clear();
        totalContent.addAll(hs);
        // find, for each word, the documents that it appears in
        Map<String, HashSet<Integer>> wordsToSetOfDocsMap = new TreeMap<>();
        int size = totalContent.size();
        for (int d = 0; d < size; d++) {
            String doc = totalContent.get(d);
            String[] words = doc.trim().split(" ");
            for (String word : words) {
                if (!wordsToSetOfDocsMap.containsKey(word)) {
                    wordsToSetOfDocsMap.put(word, new HashSet<Integer>());
                }
                wordsToSetOfDocsMap.get(word).add(d);
            }
        }
        // remove frequent words
        totalContent = removeFrequentWords(wordsToSetOfDocsMap, totalContent);
        // remove infrequent words
        totalContent = removeInfrequentWords(wordsToSetOfDocsMap, totalContent);
        // normalise the whitespace left behind by the removals
        for (int i = 0; i < totalContent.size(); i++) {
            String content = totalContent.get(i).replaceAll("\\s+", " ").trim();
            totalContent.set(i, content);
        }
        // the first line of the output file holds the number of documents
        int totalDocs = totalContent.size();
        totalContent.add(0, String.valueOf(totalDocs));
        File ldaContent = new File(outputDirectory + file);
        try {
            FileUtils.writeLines(ldaContent, totalContent);
        } catch (IOException ex) {
            Logger.getLogger(Utils.class.getName()).log(Level.SEVERE, null, ex);
        }
        System.out.println("I stored the results in " + outputDirectory + file);
    }
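    /**
     * Illustrative usage sketch added for this tutorial (not part of the original class).
     * It shows how getLDAcontent could be called after an sWebRank run; both directories
     * and the output file name below are hypothetical placeholders. Note that the output
     * directory is concatenated directly with the file name, so it should end with a
     * path separator.
     */
    public static void exampleGetLDAcontentUsage() {
        Utils utils = new Utils();
        // merge all webpages found under the sWebRank output tree into one LDA input file
        utils.getLDAcontent("C:\\sWebRank\\output\\", "C:\\lda\\", "LDAcontent.txt");
    }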
    /**
     * A method that removes the most frequent words from a collection of documents.
     * @param wordsToSetOfDocs words and the documents they appear in
     * @param docs list of docs
     * @return list of docs without the most frequent words
     */
    private List<String> removeFrequentWords(Map<String, HashSet<Integer>> wordsToSetOfDocs, List<String> docs) {
        // remove words that appear in more than 70% of the documents
        int upperThreshold = (int) (0.7 * docs.size());
        for (String word : wordsToSetOfDocs.keySet()) {
            if (wordsToSetOfDocs.get(word).size() > upperThreshold) {
                for (int i = 0; i < docs.size(); i++) {
                    String regex = "\\b" + word + "\\b";
                    String content = docs.get(i).replaceAll(regex, " ");
                    docs.set(i, content);
                }
            }
        }
        return docs;
    }

    /**
     * A method that removes infrequent words from a collection of documents.
     * @param wordsToSetOfDocs words and the documents they appear in
     * @param docs list of docs
     * @return list of docs without the infrequent words
     */
    private List<String> removeInfrequentWords(Map<String, HashSet<Integer>> wordsToSetOfDocs, List<String> docs) {
        // remove words that appear in only one document
        // (a relative threshold such as (int) (0.005 * docs.size()) would instead remove
        // words that appear in less than 0.5% of the documents)
        int downThreshold = 1;
        for (String word : wordsToSetOfDocs.keySet()) {
            if (wordsToSetOfDocs.get(word).size() == downThreshold) {
                for (int i = 0; i < docs.size(); i++) {
                    String regex = "\\b" + word + "\\b";
                    String content = docs.get(i).replaceAll(regex, " ");
                    docs.set(i, content);
                }
            }
        }
        return docs;
    }
}
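The two private helpers implement a simple document-frequency filter: a word is blanked out of every document when it appears in more than 70% of the documents, or when it appears in only a single document. The standalone sketch below was written for this tutorial (it is not part of the project) and applies the same idea to a small in-memory list of documents, so the effect of the two thresholds can be inspected without running sWebRank; the sample documents and the class name WordPruningSketch are made up for illustration.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

public class WordPruningSketch {

    public static void main(String[] args) {
        // four tiny "documents"; in the real pipeline these would be the parsed webpages
        List<String> docs = new ArrayList<>(Arrays.asList(
                "java lda topic model",
                "java web crawler",
                "java profile analysis",
                "java topic inference"));

        // document frequency: for each word, the set of documents it appears in
        Map<String, HashSet<Integer>> wordsToSetOfDocs = new HashMap<>();
        for (int d = 0; d < docs.size(); d++) {
            for (String word : docs.get(d).trim().split("\\s+")) {
                wordsToSetOfDocs.computeIfAbsent(word, k -> new HashSet<>()).add(d);
            }
        }

        int upperThreshold = (int) (0.7 * docs.size()); // more than 70% of the documents
        int downThreshold = 1;                          // exactly one document

        // blank out every word that is either too frequent or too rare
        for (Map.Entry<String, HashSet<Integer>> entry : wordsToSetOfDocs.entrySet()) {
            int appearances = entry.getValue().size();
            if (appearances > upperThreshold || appearances == downThreshold) {
                String regex = "\\b" + entry.getKey() + "\\b";
                for (int i = 0; i < docs.size(); i++) {
                    docs.set(i, docs.get(i).replaceAll(regex, " ").replaceAll("\\s+", " ").trim());
                }
            }
        }

        // "java" (present in every document) and the words unique to one document are gone;
        // only "topic" survives, because it appears in two of the four documents
        docs.forEach(System.out::println);
    }
}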