Java tutorial
/* * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.thesmartweb.swebrank; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; /** * Class that contains methods that manipulate different data * for SWebRank * @author Themis Mavridis */ public class DataManipulation { /** * Method that clears a List from duplicates and null elements * @param wordList It contains the List to be cleared * @return a List cleared from duplicates and null elements */ public List<String> clearListString(List<String> wordList) { //remove all null elements of the wordlist wordList.removeAll(Collections.singleton(null)); //remove the duplicate elements since HashSet does not allow duplicates HashSet<String> hashSet_wordList = new HashSet<String>(wordList); //create an iterator to the hashset to add the elements back to the wordlist Iterator wordList_iterator = hashSet_wordList.iterator(); //clear the wordlist wordList.clear(); while (wordList_iterator.hasNext()) { wordList.add(wordList_iterator.next().toString()); } return wordList; } /** * Method that writes a List to a file * @param wordList List to be saved * @param file_wordlist The file in a string format that the List is going to be saved * @return True/False */ public boolean AppendWordList(List<String> wordList, String file_wordlist) { //----------------append the wordlist to a file File wordlist_file = new File(file_wordlist); try { FileUtils.writeLines(wordlist_file, wordList); return true; } catch (IOException ex) { Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex); return false; } } /** * Method that writes a String to a file * @param input String to be saved * @param file_string The file in a string format that the String input is going to be saved * @return True/False */ public boolean AppendString(String input, String file_string) { File string_file = new File(file_string); try { FileUtils.writeStringToFile(string_file, input); return true; } catch (IOException ex) { Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex); return false; } } /** * Method that adds a List to another List * @param wordListtoAdd List to be added * @param wordListTotal List in which the elements of wordListtoAdd are going to be added * @return wordListTotal which contains the elements already existent in it along with the elements of wordListtoAdd */ public List<String> AddAList(List<String> wordListtoAdd, List<String> wordListTotal) { Iterator wordList_new_final_iterator = wordListtoAdd.iterator(); while (wordList_new_final_iterator.hasNext()) { wordListTotal.add(wordList_new_final_iterator.next().toString()); } return wordListTotal; } /** * Method that recognizes if a string contains a an extension that is not supported * @param input String of the file * @return True/False */ public boolean StructuredFileCheck(String input) { List<String> structuredFileTypes = new ArrayList<>(); structuredFileTypes.add(".pdf"); structuredFileTypes.add(".ppt"); structuredFileTypes.add(".doc"); Iterator filesiterator = structuredFileTypes.iterator(); boolean flag_found = false; while (filesiterator.hasNext() && !flag_found) { if (filesiterator.next().toString().contains(input)) { flag_found = true; return flag_found; } } return flag_found; } /** * Removes the characters from a string * @param str String to be cleaned from the characters * @return the String cleaned from characters */ public String removeChars(String str){ if (str != null) { try { //str = str.replaceAll("(\r\n|\r|\n|\n\r)", " "); //Clear Paragraph escape sequences str = str.replaceAll("\\.", " "); //Clear dots str = str.replaceAll("\\-", " "); // str = str.replaceAll("\\_", " "); // str = str.replaceAll(":", " "); str = str.replaceAll("\\+", " "); str = str.replaceAll("\\/", " "); str = str.replaceAll("\\|", " "); str = str.replaceAll("\\[", " "); str = str.replaceAll("\\?", " "); str = str.replaceAll("\\#", " "); str = str.replaceAll("\\!", " "); str = str.replaceAll("'", " "); //Clear apostrophes str = str.replaceAll(",", " "); //Clear commas str = str.replaceAll("@", " "); //Clear @'s (optional) str = str.replaceAll("$", " "); //Clear $'s (optional) str = str.replaceAll("\\\\", "**&**"); //Clear special character backslash 4 \'s due to regexp format str = str.replaceAll("&", "&"); //change & to & str = str.replaceAll("<", "<"); //change < to < str = str.replaceAll(">", ">"); //change > to > // str = str.replaceAll("<[^<>]*>"," "); //drop anything in <> str = str.replaceAll("&#\\d+;", " "); //change &#[digits]; to space str = str.replaceAll(""", " "); //change " to space // str = str.replaceAll("http://[^ ]+ "," "); //drop urls str = str.replaceAll("-", " "); //drop non-alphanumeric characters str = str.replaceAll("[^0-9a-zA-Z ]", " "); //drop non-alphanumeric characters str = str.replaceAll("·", " "); str = str.replaceAll("\\>", " "); str = str.replaceAll("\\<", " "); str = str.replaceAll("<[^>]*>", ""); str = str.replaceAll("\\d"," "); //str=str.replaceAll("\\<.*?\\>", ""); str = str.replace('', ' '); str = str.replace('', ' '); str = str.replace('', ' '); str = str.replace(')', ' '); str = str.replace('(', ' '); str = str.replace('[', ' '); str = str.replace(']', ' '); str = str.replace('`', ' '); str = str.replace('~', ' '); str = str.replace('!', ' '); str = str.replace('#', ' '); str = str.replace('%', ' '); str = str.replace('^', ' '); str = str.replace('*', ' '); str = str.replace('&', ' '); str = str.replace('_', ' '); str = str.replace('=', ' '); str = str.replace('+', ' '); str = str.replace('|', ' '); str = str.replace('\\', ' '); str = str.replace('{', ' '); str = str.replace('}', ' '); str = str.replace(',', ' '); str = str.replace('.', ' '); str = str.replace('/', ' '); str = str.replace('?', ' '); str = str.replace('"', ' '); str = str.replace(':', ' '); str = str.replace('>', ' '); str = str.replace(';', ' '); str = str.replace('<', ' '); str = str.replace('$', ' '); str = str.replace('-', ' '); str = str.replace('@', ' '); str = str.replace('', ' '); //remove space InputStreamReader in = new InputStreamReader(IOUtils.toInputStream(str)); BufferedReader br = new BufferedReader(in); Pattern p; Matcher m; String afterReplace = ""; String strLine; String inputText = ""; while ((strLine = br.readLine()) != null) { inputText = strLine; p = Pattern.compile("\\s+"); m = p.matcher(inputText); afterReplace = afterReplace + m.replaceAll(" "); } br.close(); str = afterReplace; return str; } catch (IOException ex) { Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex); str=null; return str; } } else { return str; } } /** * Method that sorts a HashMap according to their values * @param map the HashMap to be sorted * @return a List that contains the keys in sorted (descending) fashion */ public List<String> sortHashmap(final HashMap<String, Double> map) { Set<String> set = map.keySet(); List<String> keys = new ArrayList<String>(set); Collections.sort(keys, new Comparator<String>() { @Override public int compare(String s1, String s2) { return Double.compare(map.get(s2), map.get(s1)); } }); return keys; } public HashMap sortHashMapByValuesD(HashMap passedMap) { List mapKeys = new ArrayList(passedMap.keySet()); List mapValues = new ArrayList(passedMap.values()); Collections.sort(mapValues); Collections.sort(mapKeys); HashMap sortedMap = new HashMap(); Iterator valueIt = mapValues.iterator(); while (valueIt.hasNext()) { Object val = valueIt.next(); Iterator keyIt = mapKeys.iterator(); while (keyIt.hasNext()) { Object key = keyIt.next(); String comp1 = passedMap.get(key).toString(); String comp2 = val.toString(); if (comp1.equals(comp2)) { passedMap.remove(key); mapKeys.remove(key); sortedMap.put((String) key, (Double) val); break; } } } return sortedMap; } /** * Method that returns all the files of a certain extension from a directory * @param directory_path A String with the directory * @param filetype A string with the filetype (without dot symbol) * @return a Collection that contains all the files found */ public Collection<File> getinputfiles(String directory_path, String filetype) { String[] extensions = { filetype };//set the file extensions you would like to parse, e.g. you could have {txt,jpeg,pdf} File directory = new File(directory_path); //----FileUtils listfiles(File directory, IOFileFilter fileFilter, IOFileFilter dirFilter) //---- file filter is set to the extensions //---- the dirFilter is set to true and it performs recursive search to all the subdirectories String collection = FileUtils.listFiles(directory, extensions, true).toString(); Collection<File> Files = FileUtils.listFiles(directory, extensions, true); String[] paths = new String[Files.size()];//----the String array will contain all the paths of the files int j = 0; for (File file : Files) { paths[j] = file.getPath(); j++; } return Files; } }