eu.edisonproject.utility.commons.IDFSort.java Source code

Introduction

Here is the source code for eu.edisonproject.utility.commons.IDFSort.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package eu.edisonproject.utility.commons;

import eu.edisonproject.utility.file.ConfigHelper;
import eu.edisonproject.utility.file.ReaderFile;
import eu.edisonproject.utility.text.processing.StopWord;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FilenameUtils;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Scores terms by their inverse document frequency (IDF) over the {@code txt}
 * files found directly inside a directory.
 *
 * <p>Documents and terms are both run through the same {@link StopWord} cleaner
 * before being compared, and a term is considered present in a document when the
 * cleaned term occurs as a substring of the cleaned document text.
 *
 * @author S. Koulouzis
 */
public class IDFSort implements SortTerms {

    private static final Logger LOGGER = Logger.getLogger(IDFSort.class.getName());

    private final StopWord cleanStopWord;

    /**
     * Creates a sorter whose stop-word cleaner is built from the given stop-word list.
     *
     * @param stopWordsPath location passed to {@link ConfigHelper#loadStopWords} to
     *                      obtain the stop words
     */
    public IDFSort(String stopWordsPath) {
        // The second CharArraySet argument is Lucene's ignoreCase flag.
        CharArraySet stopWordArraySet = new CharArraySet(ConfigHelper.loadStopWords(stopWordsPath), true);
        cleanStopWord = new StopWord(stopWordArraySet);
    }

    /**
     * Computes {@code ln(N / df)} for every key of {@code termDictionaray}, where
     * {@code N} is the number of {@code txt} documents in {@code dirPath} and
     * {@code df} is the number of those documents containing the term. Terms that
     * occur in no document get a score of {@code 0}.
     *
     * <p>Only the keys of {@code termDictionaray} are used; its values are ignored
     * and the map itself is not modified. Underscores in a term are treated as
     * spaces before the term is cleaned.
     *
     * @param termDictionaray terms to score (keys only)
     * @param dirPath         directory holding the documents
     * @return a new map from each original term to its IDF score
     * @throws IOException          if {@code dirPath} cannot be listed, or if reading a
     *                              document fails
     * @throws InterruptedException declared by the {@link SortTerms} contract; presumably
     *                              propagated from the reader or cleaner if they raise it
     */
    @Override
    public Map<String, Double> sort(Map<String, Double> termDictionaray, String dirPath)
            throws IOException, InterruptedException {
        Map<String, Double> newTermDictionaray = new HashMap<>();
        if (termDictionaray.isEmpty()) {
            // Nothing to score, so do not touch the file system at all.
            return newTermDictionaray;
        }

        // Read and clean every document once instead of once per term.
        List<String> documents = loadCleanedDocuments(dirPath);

        for (String term : termDictionaray.keySet()) {
            cleanStopWord.setDescription(term.replace('_', ' '));
            String cTerm = cleanStopWord.execute();

            int numOfDocsWithTerm = 0;
            for (String contents : documents) {
                if (contents.contains(cTerm)) {
                    numOfDocsWithTerm++;
                }
            }

            double idf = 0;
            if (numOfDocsWithTerm > 0) {
                // N is the number of documents actually searched, not the number of
                // directory entries, so unrelated files cannot inflate the score.
                idf = Math.log((double) documents.size() / (double) numOfDocsWithTerm);
            }
            newTermDictionaray.put(term, idf);
        }
        return newTermDictionaray;
    }

    /**
     * Reads every {@code txt} file in {@code dirPath} and returns its stop-word-cleaned text.
     *
     * @param dirPath directory to scan (not recursive)
     * @return one cleaned text per matching file
     * @throws IOException          if the directory cannot be listed or a file cannot be read
     * @throws InterruptedException if raised by the reader or cleaner
     */
    private List<String> loadCleanedDocuments(String dirPath) throws IOException, InterruptedException {
        File[] entries = new File(dirPath).listFiles();
        if (entries == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IOException("Cannot list documents in directory: " + dirPath);
        }

        List<File> textFiles = new ArrayList<>();
        for (File f : entries) {
            if (FilenameUtils.getExtension(f.getName()).endsWith("txt")) {
                textFiles.add(f);
            }
        }

        List<String> documents = new ArrayList<>(textFiles.size());
        int count = 0;
        for (File f : textFiles) {
            count++;
            LOGGER.log(Level.FINE, "{0}: {1} of {2}",
                    new Object[] { f.getName(), count, textFiles.size() });
            ReaderFile rf = new ReaderFile(f.getAbsolutePath());
            cleanStopWord.setDescription(rf.readFileWithN());
            documents.add(cleanStopWord.execute());
        }
        return documents;
    }
}