pro.miner.classess.EntropyOfSynonyms.java Source code

Java tutorial

Introduction

Here is the source code for pro.miner.classess.EntropyOfSynonyms.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package pro.miner.classess;

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.DefaultListModel;
import javax.swing.SwingWorker;
import org.apache.commons.lang3.StringUtils;
import static pro.miner.classess.GlossarySearcher.file;
import static pro.miner.classess.MergeListFreqCounter.countFreq;
import static pro.miner.classess.MergeListFreqCounter.countWordFreq;
import static pro.miner.classess.MergeListFreqCounter.list1_list2_freq_model;
import static pro.miner.classess.MergeListFreqCounter.mergeListFreq;
import static pro.miner.classess.MergeListFreqCounter.mergeListProb;
import forms.Form_domain_expert;

/**
 * Background task that, for every entry of {@code Form_domain_expert.relatedList},
 * estimates the probability {@code p} of that entry in the corpus text
 * ({@code Global.file_text}) and publishes the term {@code p * ln(p)} to
 * {@code Form_domain_expert.entropyList}.
 *
 * <p>Results are exposed through the public static fields below, so only one
 * computation should run at a time.
 *
 * <p>NOTE(review): {@link #doInBackground()} is executed on a SwingWorker worker
 * thread, yet {@link #countEntropy()} reads and mutates Swing components and
 * models directly. Swing is not thread-safe; moving the UI updates to the event
 * dispatch thread requires knowing how callers consume the results, so the
 * threading is left as it was and only flagged here.
 *
 * @author Farooq
 */
public class EntropyOfSynonyms extends SwingWorker<String, Void> {

    /** Corpus probability of each processed word, stored as {@code Double.toString(p)}. */
    public static HashMap<String, String> EntropyOfSimilarWord = new HashMap<String, String>();

    /**
     * Model shown in {@code Form_domain_expert.entropyList}; one {@code Double} per
     * successfully processed word. Kept as a raw type so that it stays assignable
     * to the list regardless of the list's declared element type.
     */
    public static DefaultListModel entropy_list_model = new DefaultListModel();

    /** Sum of all per-word terms produced by the most recent {@link #countEntropy()} run. */
    public static double total_entropy = 0;

    /**
     * Cached number of whitespace-separated words in the corpus; {@code 0} means
     * "not computed yet".
     * NOTE(review): the cache is never invalidated here, so it goes stale if
     * {@code Global.file_text} changes unless a caller resets it to 0 - confirm.
     */
    public static int word_count_in_corpus = 0;

    /** One or more whitespace characters. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** General stop words, matched as whole words only. */
    private static final Pattern STOP_WORDS = Pattern.compile(
            "\\b(?:also|most|be|more|each|have|may|has|other|the|that)\\b");

    /**
     * Double hyphens, punctuation, line breaks, U+0092 and digits that are blanked
     * out of a line. A single hyphen is deliberately not listed so that hyphenated
     * words survive.
     */
    private static final Pattern NOISE = Pattern.compile(
            "--|[@\\[\\]=+?&%$<>*_{}#:\\n\\u0092'();\\\\/0-9]");

    /**
     * Runs {@link #countEntropy()} on the worker thread.
     *
     * @return always the empty string
     * @throws Exception if {@link #countEntropy()} fails outside its per-word handling
     */
    @Override
    protected String doInBackground() throws Exception {
        countEntropy();
        return "";
    }

    /**
     * Recomputes the per-word terms for all entries of
     * {@code Form_domain_expert.relatedList}.
     *
     * <p>Clears {@link #entropy_list_model}, resets {@link #total_entropy}, then for
     * each word appends {@code p * ln(p)} (or {@code 0} when {@code p} is not
     * positive) to the model and adds it to the total. A word whose computation
     * throws is reported on standard output and skipped. The progress bar
     * {@code Form_domain_expert.pbofwordnet} is indeterminate for the duration of
     * the run.
     *
     * <p>NOTE(review): Shannon entropy is {@code -sum(p * ln p)}; the terms produced
     * here are not negated, so they are zero or negative. The sign is kept because
     * consumers of the list and of {@link #total_entropy} are not visible here.
     */
    public static void countEntropy() {
        Form_domain_expert.pbofwordnet.setIndeterminate(true);
        try {
            entropy_list_model.clear();
            // NOTE(review): if entropyList is a JList this is Container.removeAll(),
            // which removes child components rather than list items - confirm it is needed.
            Form_domain_expert.entropyList.removeAll();

            // Start every run from zero; otherwise repeated runs add up.
            total_entropy = 0;

            for (int i = 0; i < Form_domain_expert.relatedList.getModel().getSize(); i++) {
                try {
                    String word = Form_domain_expert.relatedList.getModel().getElementAt(i).toString();

                    // Use the freshly computed probability instead of reading it back
                    // from the map, which could hold a stale value from an earlier run.
                    double prob = fillEntropy(word);
                    double entropy = prob > 0 ? prob * Math.log(prob) : 0;

                    System.out.println("=========================Entropy [" + word + "]: " + entropy);

                    total_entropy = total_entropy + entropy;
                    entropy_list_model.addElement(entropy);
                } catch (Exception e) {
                    // Best effort: report the failure and continue with the next word.
                    System.out.println("Exception occured while computing entroy, exception: " + e);
                }
            }
        } finally {
            // Always leave the progress bar in its idle state.
            Form_domain_expert.pbofwordnet.setIndeterminate(false);
        }
        Form_domain_expert.entropyList.setModel(entropy_list_model);
    }

    /**
     * Computes the probability of {@code input_word} in the corpus as
     * (number of matches) / (number of whitespace-separated corpus words) and
     * records it in {@link #EntropyOfSimilarWord}.
     *
     * <p>The word is matched literally (regex metacharacters are quoted); the parts
     * of a multi-word phrase may be separated by any run of whitespace. Matching is
     * by substring, so "cat" also counts inside "category".
     *
     * @param input_word word or phrase to look up
     * @return the probability, or {@code 0} for a blank word or an empty corpus
     * @throws IllegalStateException if {@code Global.file_text} is {@code null}
     */
    private static double fillEntropy(String input_word) {
        String corpus = Global.file_text;
        if (corpus == null) {
            throw new IllegalStateException("corpus text (Global.file_text) is not loaded");
        }

        double prob = 0;
        String phrase = input_word.trim();
        if (!phrase.isEmpty()) {
            // Quote every token so that words such as "c++" or "a.b" are searched literally.
            StringBuilder regex = new StringBuilder();
            for (String token : WHITESPACE.split(phrase)) {
                if (regex.length() > 0) {
                    regex.append("\\s+");
                }
                regex.append(Pattern.quote(token));
            }
            double count_of_word = countWordFreq(regex.toString(), corpus);

            if (word_count_in_corpus == 0) {
                word_count_in_corpus = CountWords(corpus);
            }

            // Avoid NaN/Infinity when the corpus contains no words.
            if (word_count_in_corpus > 0) {
                prob = count_of_word / word_count_in_corpus;
            }
        }

        EntropyOfSimilarWord.put(input_word, Double.toString(prob));
        return prob;
    }

    /**
     * Counts the non-overlapping matches of a regular expression in a text.
     *
     * @param word regular expression to search for (not quoted by this method)
     * @param line text to search in
     * @return number of matches found
     * @throws java.util.regex.PatternSyntaxException if {@code word} is not a valid regex
     */
    public static int countWordFreq(String word, String line) {
        Matcher matcher = Pattern.compile(word).matcher(line);
        int counter = 0;
        while (matcher.find()) {
            counter++;
        }
        return counter;
    }

    /**
     * Counts the whitespace-separated words of a text.
     *
     * @param in text to count words in
     * @return number of words, {@code 0} for blank input
     */
    private static int CountWords(String in) {
        String trim = in.trim();
        if (trim.isEmpty()) {
            return 0;
        }
        return WHITESPACE.split(trim).length;
    }

    /**
     * Lower-cases a line, blanks out general stop words, punctuation, line breaks
     * and digits, and collapses the remaining whitespace. Not called from within
     * this class at present.
     *
     * <p>Stop words are removed only as whole words, so "number" no longer loses
     * its "be". The earlier implementation also contained several
     * {@code replace("", " ")} calls with an empty search text (presumably
     * characters lost to an encoding problem), which inserted a space between all
     * characters; those calls are not reproduced.
     *
     * @param inputLine line to clean
     * @return the cleaned, single-spaced, lower-case line
     */
    private static String removeStopWordsFromLine(String inputLine) {
        String cleaned = inputLine.toLowerCase();
        cleaned = STOP_WORDS.matcher(cleaned).replaceAll(" ");
        // "--" is an alternative of NOISE, so double hyphens go while single ones stay.
        cleaned = NOISE.matcher(cleaned).replaceAll(" ");
        return WHITESPACE.matcher(cleaned.trim()).replaceAll(" ");
    }

    /**
     * Splits a line on single spaces and drops every token contained in
     * {@code RemoveStopWord.stopWordsofwordnet}. Not called from within this class
     * at present.
     *
     * @param inputLine line to filter; {@code null} yields an empty list
     * @return the remaining tokens in their original order
     */
    private static List<String> removeWordNetStopWords(String inputLine) {
        List<String> tempList = new ArrayList<String>();
        if (inputLine == null) {
            return tempList;
        }

        // Build the lookup list and split the line once instead of once per token.
        List<?> stopWords = Arrays.asList(RemoveStopWord.stopWordsofwordnet);
        for (String word : inputLine.split(" ")) {
            if (!stopWords.contains(word)) {
                tempList.add(word);
            }
        }
        return tempList;
    }

}