Java tutorial
/*
 * Copyright 2015 Konstantinos Papangelou.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mythesis.profileanalysis;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import static org.apache.commons.io.FileUtils.directoryContains;
import static org.apache.commons.io.FileUtils.readFileToString;
import org.apache.commons.lang3.StringUtils;

/**
 * This class contains auxiliary functions.
 * @author Konstantinos Papangelou
 */
public class Utils {

    /**
     * Method that returns all the files of a certain extension from a directory.
     * @param directory_path a String with the directory
     * @param filetype a String with the filetype (without the dot symbol)
     * @return a Collection that contains all the files found
     */
    public Collection<File> getinputfiles(String directory_path, String filetype) {
        // the file extensions to look for, e.g. {"txt", "jpeg", "pdf"}
        String[] extensions = {filetype};
        File directory = new File(directory_path);
        // FileUtils.listFiles(File directory, String[] extensions, boolean recursive):
        // the last argument is false, so subdirectories are not searched recursively
        return FileUtils.listFiles(directory, extensions, false);
    }
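    /**
     * Illustrative usage sketch added for this tutorial (not part of the original class).
     * It lists every file that getinputfiles returns for a given extension; the directory
     * path and the "txt" extension below are hypothetical placeholders.
     */
    public static void exampleGetinputfilesUsage() {
        Utils utils = new Utils();
        // collect the .txt files that sit directly under the (hypothetical) directory
        Collection<File> txtFiles = utils.getinputfiles("C:\\data\\webpages", "txt");
        for (File f : txtFiles) {
            System.out.println(f.getPath());
        }
    }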
    /**
     * Method that gets the results from sWebRank and puts them all together in the same file.
     * @param inputDirectory the output directory of sWebRank
     * @param outputDirectory the directory where the new file will be saved
     * @param file the name of the file where the results will be saved, e.g. xxx.txt
     */
    public void getLDAcontent(String inputDirectory, String outputDirectory, String file) {
        System.out.println("Getting webpages from " + inputDirectory);
        // find the output path of each sWebRank level
        File root = new File(inputDirectory);
        File[] contents = root.listFiles();
        List<String> sWebRanklevels = new ArrayList<>();
        for (File f : contents) {
            if (f.getAbsolutePath().contains("level"))
                sWebRanklevels.add(f.getAbsolutePath());
        }
        // find all query paths (for now this only works for the bing search engine)
        List<String> totalQueries = new ArrayList<>();
        for (String s : sWebRanklevels) {
            File level = new File(s);
            File[] queries = level.listFiles();
            for (File f : queries) {
                if (!f.getAbsolutePath().contains("txt"))
                    totalQueries.add(f.getAbsolutePath() + "\\" + "bing" + "\\");
            }
        }
        // read the parsed content of every webpage found under the query paths
        List<String> totalContent = new ArrayList<>();
        String webpage = "";
        for (String s : totalQueries) {
            File level = new File(s);
            File[] docPaths = level.listFiles();
            for (File f : docPaths) {
                String str = f.getAbsolutePath();
                if (StringUtils.isNumeric(str.substring(str.lastIndexOf("\\") + 1))) {
                    File webPagePath = new File(str + "\\html_parse_content.txt");
                    try {
                        if (!directoryContains(f, webPagePath)) {
                            // fall back to the youtube content file if no parsed html exists
                            webPagePath = new File(f.getAbsolutePath() + "\\youtube_content.txt");
                            if (!directoryContains(f, webPagePath))
                                continue;
                            webpage = readFileToString(webPagePath).replace("\n", "").replace("\r", "");
                            YoutubeContent ytc = new YoutubeContent();
                            webpage = ytc.parse(webpage);
                        } else {
                            webpage = readFileToString(webPagePath);
                        }
                    } catch (IOException ex) {
                        Logger.getLogger(Utils.class.getName()).log(Level.SEVERE, null, ex);
                    }
                    if (webpage.isEmpty())
                        continue;
                    totalContent.add(webpage);
                }
            }
        }
        // remove duplicate documents
        Set<String> hs = new HashSet<>();
        hs.addAll(totalContent);
        totalContent.clear();
        totalContent.addAll(hs);
        // find, for each word, the documents that it appears in
        Map<String, HashSet<Integer>> wordsToSetOfDocsMap = new TreeMap<>();
        int size = totalContent.size();
        for (int d = 0; d < size; d++) {
            String doc = totalContent.get(d);
            String[] words = doc.trim().split(" ");
            for (String word : words) {
                if (!wordsToSetOfDocsMap.containsKey(word)) {
                    wordsToSetOfDocsMap.put(word, new HashSet<Integer>());
                }
                wordsToSetOfDocsMap.get(word).add(d);
            }
        }
        // remove frequent words
        totalContent = removeFrequentWords(wordsToSetOfDocsMap, totalContent);
        // remove infrequent words
        totalContent = removeInfrequentWords(wordsToSetOfDocsMap, totalContent);
        // normalise the whitespace left behind by the removals
        for (int i = 0; i < totalContent.size(); i++) {
            String content = totalContent.get(i).replaceAll("\\s+", " ").trim();
            totalContent.set(i, content);
        }
        // the first line of the output file holds the number of documents
        int totalDocs = totalContent.size();
        totalContent.add(0, String.valueOf(totalDocs));
        File ldaContent = new File(outputDirectory + file);
        try {
            FileUtils.writeLines(ldaContent, totalContent);
        } catch (IOException ex) {
            Logger.getLogger(Utils.class.getName()).log(Level.SEVERE, null, ex);
        }
        System.out.println("I stored the results in " + outputDirectory + file);
    }
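    /**
     * Illustrative usage sketch added for this tutorial (not part of the original class).
     * It shows how getLDAcontent could be called after an sWebRank run; both directories
     * and the output file name below are hypothetical placeholders. Note that the output
     * directory is concatenated directly with the file name, so it should end with a
     * path separator.
     */
    public static void exampleGetLDAcontentUsage() {
        Utils utils = new Utils();
        // merge all webpages found under the sWebRank output tree into one LDA input file
        utils.getLDAcontent("C:\\sWebRank\\output\\", "C:\\lda\\", "LDAcontent.txt");
    }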
    /**
     * A method that removes the most frequent words from a collection of documents.
     * @param wordsToSetOfDocs words and the documents they appear in
     * @param docs list of docs
     * @return list of docs without the most frequent words
     */
    private List<String> removeFrequentWords(Map<String, HashSet<Integer>> wordsToSetOfDocs, List<String> docs) {
        // remove words that appear in more than 70% of the documents
        int upperThreshold = (int) (0.7 * docs.size());
        for (String word : wordsToSetOfDocs.keySet()) {
            if (wordsToSetOfDocs.get(word).size() > upperThreshold) {
                for (int i = 0; i < docs.size(); i++) {
                    String regex = "\\b" + word + "\\b";
                    String content = docs.get(i).replaceAll(regex, " ");
                    docs.set(i, content);
                }
            }
        }
        return docs;
    }

    /**
     * A method that removes infrequent words from a collection of documents.
     * @param wordsToSetOfDocs words and the documents they appear in
     * @param docs list of docs
     * @return list of docs without the infrequent words
     */
    private List<String> removeInfrequentWords(Map<String, HashSet<Integer>> wordsToSetOfDocs, List<String> docs) {
        // remove words that appear in only one document
        // (a relative threshold such as (int) (0.005 * docs.size()) would instead remove
        // words that appear in less than 0.5% of the documents)
        int downThreshold = 1;
        for (String word : wordsToSetOfDocs.keySet()) {
            if (wordsToSetOfDocs.get(word).size() == downThreshold) {
                for (int i = 0; i < docs.size(); i++) {
                    String regex = "\\b" + word + "\\b";
                    String content = docs.get(i).replaceAll(regex, " ");
                    docs.set(i, content);
                }
            }
        }
        return docs;
    }
}
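The two private helpers implement a simple document-frequency filter: a word is blanked out of every document when it appears in more than 70% of the documents, or when it appears in only a single document. The standalone sketch below was written for this tutorial (it is not part of the project) and applies the same idea to a small in-memory list of documents, so the effect of the two thresholds can be inspected without running sWebRank; the sample documents and the class name WordPruningSketch are made up for illustration.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

public class WordPruningSketch {

    public static void main(String[] args) {
        // four tiny "documents"; in the real pipeline these would be the parsed webpages
        List<String> docs = new ArrayList<>(Arrays.asList(
                "java lda topic model",
                "java web crawler",
                "java profile analysis",
                "java topic inference"));

        // document frequency: for each word, the set of documents it appears in
        Map<String, HashSet<Integer>> wordsToSetOfDocs = new HashMap<>();
        for (int d = 0; d < docs.size(); d++) {
            for (String word : docs.get(d).trim().split("\\s+")) {
                wordsToSetOfDocs.computeIfAbsent(word, k -> new HashSet<>()).add(d);
            }
        }

        int upperThreshold = (int) (0.7 * docs.size()); // more than 70% of the documents
        int downThreshold = 1;                          // exactly one document

        // blank out every word that is either too frequent or too rare
        for (Map.Entry<String, HashSet<Integer>> entry : wordsToSetOfDocs.entrySet()) {
            int appearances = entry.getValue().size();
            if (appearances > upperThreshold || appearances == downThreshold) {
                String regex = "\\b" + entry.getKey() + "\\b";
                for (int i = 0; i < docs.size(); i++) {
                    docs.set(i, docs.get(i).replaceAll(regex, " ").replaceAll("\\s+", " ").trim());
                }
            }
        }

        // "java" (present in every document) and the words unique to one document are gone;
        // only "topic" survives, because it appears in two of the four documents
        docs.forEach(System.out::println);
    }
}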