Java tutorial: TF-IDF analysis
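This tutorial walks through TFIDF, a small class from the swebrank project that ranks the words of a set of documents by their TF-IDF score, keeps the top-ranked ones, and writes them to a words.txt file in a given directory. The class uses a sublinear term frequency, TF = sqrt(occurrences / totalTerms), and a smoothed inverse document frequency, IDF = 1 + ln(N / (1 + docFreq)), where N is the number of documents and docFreq is the number of documents containing the term.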
/*
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;

/**
 * Class for TF-IDF analysis.
 * @author Themistoklis Mavridis
 */
public class TFIDF {

    /** A list with the top words recognized by TF-IDF. */
    protected List<String> topWordsList;

    /**
     * Method to calculate the TF score of a term.
     * @param doc the document to analyze
     * @param termToCheck the term to calculate TF for
     * @return the TF score
     */
    public double tfCalculator(String doc, String termToCheck) {
        double count = 0; // overall occurrences of termToCheck in the document
        // strip punctuation (non-word characters except whitespace), then split into terms
        String[] tokenizedTerms = doc.replaceAll("[\\W&&[^\\s]]", "").split("\\W+");
        for (String s : tokenizedTerms) {
            if (s.equalsIgnoreCase(termToCheck)) {
                count++;
            }
        }
        // sublinear scaling: the square root dampens the effect of very frequent terms
        return Math.sqrt(count / tokenizedTerms.length);
    }

    /**
     * Method to calculate the IDF score of a term.
     * @param allwordsList the tokenized words of every document
     * @param termToCheck the term to check for
     * @param numberOfDocs the number of documents we analyze
     * @return the IDF score
     */
    public double idfCalculator(List<List<String>> allwordsList, String termToCheck, int numberOfDocs) {
        double count = 0; // number of documents that contain the term
        for (List<String> wordList : allwordsList) {
            for (String s : wordList) {
                if (s.equalsIgnoreCase(termToCheck)) {
                    count++;
                    break; // count each document at most once
                }
            }
        }
        // smoothed IDF: the +1 in the denominator avoids division by zero
        return 1 + Math.log(numberOfDocs / (1 + count));
    }

    /**
     * Method to compute the TF-IDF score of every word and keep the top ones.
     * @param allDocs all the documents to analyze
     * @param topWords the number of top words to keep
     * @param directory the directory to save the output in
     * @return a list with the top words
     */
    public List<String> compute(String[] allDocs, int topWords, String directory) {
        try {
            // tokenize every document; allwordsList.get(i) holds the words of document i
            List<List<String>> allwordsList = new ArrayList<>();
            for (int i = 0; i < allDocs.length; i++) {
                List<String> docWords = new ArrayList<>();
                if (allDocs[i] != null && !allDocs[i].isEmpty()) {
                    String stringtosplit = allDocs[i].replaceAll("[\\W&&[^\\s]]", "");
                    if (!stringtosplit.isEmpty()) {
                        for (String term : stringtosplit.split("\\W+")) {
                            if (term != null && !term.isEmpty()) {
                                docWords.add(term);
                            }
                        }
                    }
                }
                allwordsList.add(i, docWords);
            }

            // score every token, keeping the smallest TF-IDF value observed per distinct word
            HashMap<String, Double> wordTFIDFscores = new HashMap<>();
            for (int i = 0; i < allDocs.length; i++) {
                for (String term : allwordsList.get(i)) {
                    double tfidfvalue = tfCalculator(allDocs[i], term)
                            * idfCalculator(allwordsList, term, allDocs.length);
                    Double previous = wordTFIDFscores.get(term);
                    if (previous == null || previous > tfidfvalue) {
                        wordTFIDFscores.put(term, tfidfvalue);
                    }
                }
            }

            // sort the words by score, keep the top ones and write them to disk
            DataManipulation shmap = new DataManipulation();
            List<String> sortedWords = shmap.sortHashmap(wordTFIDFscores);
            topWordsList = sortedWords.subList(0, Math.min(topWords, sortedWords.size()));
            File file_words = new File(directory + "words.txt");
            FileUtils.writeLines(file_words, topWordsList);
            return topWordsList;
        } catch (IOException ex) {
            Logger.getLogger(TFIDF.class.getName()).log(Level.SEVERE, null, ex);
            return topWordsList;
        }
    }
}
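To see the class in action, here is a minimal, hypothetical usage sketch. It assumes TFIDF and its DataManipulation helper from com.thesmartweb.swebrank are on the classpath along with Apache Commons IO, and that sortHashmap returns the words ordered from highest to lowest score; the document strings, the TFIDFDemo class name, and the /tmp/ output directory are made up for illustration.

import com.thesmartweb.swebrank.TFIDF;
import java.util.List;

public class TFIDFDemo {
    public static void main(String[] args) {
        // three toy documents for illustration
        String[] docs = {
            "the quick brown fox jumps over the lazy dog",
            "the quick brown fox is quick",
            "lazy dogs sleep all day"
        };
        TFIDF tfidf = new TFIDF();
        // keep the 5 top-ranked words and write them to /tmp/words.txt
        List<String> topWords = tfidf.compute(docs, 5, "/tmp/");
        System.out.println(topWords);
    }
}

Because compute builds the output path by simple concatenation with "words.txt", the directory argument should end with a path separator.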