gr.iit.demokritos.cru.cps.ai.KeyphraseClustering.java Source code

Java tutorial

Introduction

Here is the source code for gr.iit.demokritos.cru.cps.ai.KeyphraseClustering.java

Source

/*
 * Copyright (C) 2015 Computational Systems & Human Mind Research Unit
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package gr.iit.demokritos.cru.cps.ai;

import gr.iit.demokritos.cru.cps.utilities.wordnet.WNAccess;
import gr.iit.demokritos.cru.cps.utilities.wordnet.WNDE;
import gr.iit.demokritos.cru.cps.utilities.wordnet.WNEL;
import gr.iit.demokritos.cru.cps.utilities.wordnet.WordNetDeDistance;
import gr.iit.demokritos.cru.cps.utilities.wordnet.WordNetENDistance;
import gr.iit.demokritos.cru.cps.utilities.wordnet.WordNetElDistance;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import static java.lang.Math.ceil;
import static java.lang.Math.floor;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
import static org.apache.commons.lang3.StringUtils.getLevenshteinDistance;
import weka.clusterers.EM;
import weka.clusterers.HierarchicalClusterer;
import weka.clusterers.SimpleKMeans;
import weka.clusterers.XMeans;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

/**
 * Groups keyphrases into clusters with Weka's {@link HierarchicalClusterer},
 * using one of the WordNet-based distance functions selected by language code.
 *
 * <p>Each input line is reduced to the text before its first {@code ';'}; that
 * text is the keyphrase that gets clustered. The language codes recognised are
 * {@code "en"}, {@code "de"} and {@code "el"}, compared case-insensitively.
 *
 * @author Antonis Koukourikos
 */
public class KeyphraseClustering {

    private static final String LANG_EN = "en";
    private static final String LANG_DE = "de";
    private static final String LANG_EL = "el";

    /** Weight of the WordNet (semantic) component in {@link #getDistance}. */
    private static final double SEMANTIC_WEIGHT = 0.75;
    /** Weight of the normalised Levenshtein component in {@link #getDistance}. */
    private static final double LEXICAL_WEIGHT = 0.25;

    private final ArrayList<String> keys;
    private final String language;
    private final int clusters;
    private final WordNetENDistance wd;
    private final WordNetDeDistance wdde;
    private final WordNetElDistance wdel;
    private final WNAccess wn;
    private final WNDE wnde;
    private final WNEL wnel;
    private final Instances data;

    /**
     * Builds the Weka data set from the given keyphrase lines and decides how
     * many clusters to produce.
     *
     * @param k             input lines; only the part before the first
     *                      {@code ';'} of each line is used as the keyphrase
     * @param numberOfClust number of clusters to request; {@code 0} means
     *                      "derive it from the summed pairwise distances"
     * @param language      language code ({@code "en"}, {@code "de"} or
     *                      {@code "el"}, case-insensitive)
     * @param wn            WordNet accessor used for English distances
     * @param wnde          WordNet accessor used for German distances
     * @param wnel          WordNet accessor used for Greek distances
     * @throws ClassNotFoundException declared by the original signature;
     *                      presumably raised by one of the WordNet distance
     *                      constructors (not visible in this file)
     */
    public KeyphraseClustering(ArrayList<String> k, int numberOfClust, String language, WNAccess wn, WNDE wnde,
            WNEL wnel) throws ClassNotFoundException {
        this.language = language;

        this.wn = wn;
        // These two were previously never stored, so getDistance() failed with
        // a NullPointerException for "de" and "el".
        this.wnde = wnde;
        this.wnel = wnel;
        this.wd = new WordNetENDistance();
        this.wdde = new WordNetDeDistance(wnde);
        this.wdel = new WordNetElDistance(wnel);

        this.keys = new ArrayList<String>(k.size());
        for (String line : k) {
            // Take the prefix before the first ';'. Unlike split(";")[0] this
            // also copes with lines made only of separators (e.g. ";").
            int separator = line.indexOf(';');
            this.keys.add(separator < 0 ? line : line.substring(0, separator));
        }

        // A single string attribute named "words" holds the keyphrase itself.
        Attribute words = new Attribute("words", (FastVector) null);
        FastVector fvWekaAttributes = new FastVector();
        fvWekaAttributes.addElement(words);
        this.data = new Instances("words", fvWekaAttributes, 0);

        double sum = 0.0;
        for (String s : this.keys) {
            // Accumulate the distance between every ordered pair of distinct
            // keys (each unordered pair is therefore counted twice).
            for (String p : this.keys) {
                if (!p.equalsIgnoreCase(s)) {
                    sum += getDistance(s, p);
                }
            }
            // One instance per key.
            Instance inst = new Instance(1);
            inst.setValue(this.data.attribute(0), s);
            this.data.add(inst);
        }

        if (numberOfClust == 0) {
            // Heuristic: rounded-up total distance divided by the number of keys.
            int numerator = (int) ceil(sum);
            int clust = (int) ceil(numerator / (double) (this.keys.size()));
            // An empty key list or all-zero distances would give 0 here;
            // a cluster count below one is not meaningful.
            this.clusters = Math.max(1, clust);
        } else {
            this.clusters = numberOfClust;
        }
    }

    /**
     * Runs hierarchical clustering over the keyphrases.
     *
     * <p>If the language code is none of the recognised ones, no distance
     * function is set and the clusterer's own default is used.
     *
     * @return one string per cluster, in cluster-index order; each string is
     *         the concatenation of the cluster's keyphrases, each followed by
     *         {@code ';'} (a cluster without members yields an empty string)
     * @throws Exception if Weka fails to build the clusterer or to assign an
     *                   instance to a cluster
     */
    public ArrayList<String> getClusters() throws Exception {
        System.out.println("Clustering......");
        HierarchicalClusterer cl = new HierarchicalClusterer();

        cl.setNumClusters(this.clusters);
        // Case-insensitive, to agree with the matching done in getDistance().
        if (LANG_EN.equalsIgnoreCase(this.language)) {
            cl.setDistanceFunction(wd);
        } else if (LANG_DE.equalsIgnoreCase(this.language)) {
            cl.setDistanceFunction(wdde);
        } else if (LANG_EL.equalsIgnoreCase(this.language)) {
            cl.setDistanceFunction(wdel);
        }
        cl.buildClusterer(data);

        int clusterCount = cl.numberOfClusters();
        List<StringBuilder> members = new ArrayList<StringBuilder>(clusterCount);
        for (int i = 0; i < clusterCount; i++) {
            members.add(new StringBuilder());
        }

        for (int j = 0; j < data.numInstances(); j++) {
            Instance inst = data.instance(j);
            String clusterLine = inst.stringValue(0);
            int clust = cl.clusterInstance(inst);
            members.get(clust).append(clusterLine).append(';');
        }

        ArrayList<String> clustersList = new ArrayList<String>(clusterCount);
        for (StringBuilder member : members) {
            clustersList.add(member.toString());
        }
        return clustersList;
    }

    /**
     * Computes the combined distance between two keyphrases:
     * {@code 0.75 * wordnetDistance + 0.25 * levenshtein / meanLength}.
     *
     * <p>The WordNet component comes from the accessor matching the configured
     * language; for an unrecognised language it is {@code 0}. The Levenshtein
     * component is normalised by the mean length of the two strings and is
     * {@code 0} when both strings are empty.
     *
     * @param s1 first keyphrase, must not be {@code null}
     * @param s2 second keyphrase, must not be {@code null}
     * @return the weighted sum of the semantic and lexical distances
     */
    public double getDistance(String s1, String s2) {
        double sem = 0.0;
        if (LANG_EN.equalsIgnoreCase(this.language)) {
            sem = SEMANTIC_WEIGHT * this.wn.getDistance(s1, s2);
        } else if (LANG_DE.equalsIgnoreCase(this.language)) {
            sem = SEMANTIC_WEIGHT * this.wnde.getDistance(s1, s2);
        } else if (LANG_EL.equalsIgnoreCase(this.language)) {
            sem = SEMANTIC_WEIGHT * this.wnel.getDistance(s1, s2);
        }

        // Divide by 2.0, not 2: integer division truncated the mean length and
        // produced a zero denominator (NaN/Infinity) for very short strings.
        double meanLength = (s1.length() + s2.length()) / 2.0;
        double lev = 0.0;
        if (meanLength > 0.0) {
            lev = LEXICAL_WEIGHT * getLevenshteinDistance(s1, s2) / meanLength;
        }
        return sem + lev;
    }
}