Java tutorial: TF-IDF analysis
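This tutorial walks through TFIDF, a small class from the swebrank project that ranks the words of a set of documents by their TF-IDF score, keeps the top-ranked ones, and writes them to a words.txt file in a given directory. The class uses a sublinear term frequency, TF = sqrt(occurrences / totalTerms), and a smoothed inverse document frequency, IDF = 1 + ln(N / (1 + docFreq)), where N is the number of documents and docFreq is the number of documents containing the term.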
/*
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;

/**
 * Class for TF-IDF analysis.
 * @author Themistoklis Mavridis
 */
public class TFIDF {

    /** A list with the top words recognized by TF-IDF. */
    protected List<String> topWordsList;

    /**
     * Method to calculate the TF score of a term.
     * @param doc the document to analyze
     * @param termToCheck the term to calculate TF for
     * @return the TF score
     */
    public double tfCalculator(String doc, String termToCheck) {
        double count = 0; // overall occurrences of termToCheck in the document
        // strip punctuation (non-word characters except whitespace), then split into terms
        String[] tokenizedTerms = doc.replaceAll("[\\W&&[^\\s]]", "").split("\\W+");
        for (String s : tokenizedTerms) {
            if (s.equalsIgnoreCase(termToCheck)) {
                count++;
            }
        }
        // sublinear scaling: the square root dampens the effect of very frequent terms
        return Math.sqrt(count / tokenizedTerms.length);
    }

    /**
     * Method to calculate the IDF score of a term.
     * @param allwordsList the tokenized words of every document
     * @param termToCheck the term to check for
     * @param numberOfDocs the number of documents we analyze
     * @return the IDF score
     */
    public double idfCalculator(List<List<String>> allwordsList, String termToCheck, int numberOfDocs) {
        double count = 0; // number of documents that contain the term
        for (List<String> wordList : allwordsList) {
            for (String s : wordList) {
                if (s.equalsIgnoreCase(termToCheck)) {
                    count++;
                    break; // count each document at most once
                }
            }
        }
        // smoothed IDF: the +1 in the denominator avoids division by zero
        return 1 + Math.log(numberOfDocs / (1 + count));
    }

    /**
     * Method to compute the TF-IDF score of every word and keep the top ones.
     * @param allDocs all the documents to analyze
     * @param topWords the number of top words to keep
     * @param directory the directory to save the output in
     * @return a list with the top words
     */
    public List<String> compute(String[] allDocs, int topWords, String directory) {
        try {
            // tokenize every document; allwordsList.get(i) holds the words of document i
            List<List<String>> allwordsList = new ArrayList<>();
            for (int i = 0; i < allDocs.length; i++) {
                List<String> docWords = new ArrayList<>();
                if (allDocs[i] != null && !allDocs[i].isEmpty()) {
                    String stringtosplit = allDocs[i].replaceAll("[\\W&&[^\\s]]", "");
                    if (!stringtosplit.isEmpty()) {
                        for (String term : stringtosplit.split("\\W+")) {
                            if (term != null && !term.isEmpty()) {
                                docWords.add(term);
                            }
                        }
                    }
                }
                allwordsList.add(i, docWords);
            }

            // score every token, keeping the smallest TF-IDF value observed per distinct word
            HashMap<String, Double> wordTFIDFscores = new HashMap<>();
            for (int i = 0; i < allDocs.length; i++) {
                for (String term : allwordsList.get(i)) {
                    double tfidfvalue = tfCalculator(allDocs[i], term)
                            * idfCalculator(allwordsList, term, allDocs.length);
                    Double previous = wordTFIDFscores.get(term);
                    if (previous == null || previous > tfidfvalue) {
                        wordTFIDFscores.put(term, tfidfvalue);
                    }
                }
            }

            // sort the words by score, keep the top ones and write them to disk
            DataManipulation shmap = new DataManipulation();
            List<String> sortedWords = shmap.sortHashmap(wordTFIDFscores);
            topWordsList = sortedWords.subList(0, Math.min(topWords, sortedWords.size()));
            File file_words = new File(directory + "words.txt");
            FileUtils.writeLines(file_words, topWordsList);
            return topWordsList;
        } catch (IOException ex) {
            Logger.getLogger(TFIDF.class.getName()).log(Level.SEVERE, null, ex);
            return topWordsList;
        }
    }
}
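To see the class in action, here is a minimal, hypothetical usage sketch. It assumes TFIDF and its DataManipulation helper from com.thesmartweb.swebrank are on the classpath along with Apache Commons IO, and that sortHashmap returns the words ordered from highest to lowest score; the document strings, the TFIDFDemo class name, and the /tmp/ output directory are made up for illustration.

import com.thesmartweb.swebrank.TFIDF;
import java.util.List;

public class TFIDFDemo {
    public static void main(String[] args) {
        // three toy documents for illustration
        String[] docs = {
            "the quick brown fox jumps over the lazy dog",
            "the quick brown fox is quick",
            "lazy dogs sleep all day"
        };
        TFIDF tfidf = new TFIDF();
        // keep the 5 top-ranked words and write them to /tmp/words.txt
        List<String> topWords = tfidf.compute(docs, 5, "/tmp/");
        System.out.println(topWords);
    }
}

Because compute builds the output path by simple concatenation with "words.txt", the directory argument should end with a path separator.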