it.uniroma2.sag.kelp.wordspace.Wordspace.java Source code

Introduction

Here is the source code for it.uniroma2.sag.kelp.wordspace.Wordspace.java
Source

/*
 * Copyright 2014 Simone Filice and Giuseppe Castellucci and Danilo Croce and Roberto Basili
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package it.uniroma2.sag.kelp.wordspace;

import gnu.trove.map.hash.TLongObjectHashMap;
import it.uniroma2.sag.kelp.data.representation.Vector;
import it.uniroma2.sag.kelp.data.representation.vector.DenseVector;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.ejml.data.DenseMatrix64F;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonTypeName;

/**
 * This is an implementation of a wordspace used for associating words to vectors.
 * In particular this wordspace represents each word as a <code>DenseVector</code>.
 *
 * <p>
 * NOTE: in order to speed up the computation and to reduce the memory occupation,
 * vectors are not indexed by the words themselves but by a 64-bit key taken from
 * the first 8 bytes of the MD5 digest of the word (UTF-8 encoded).
 * In some remote cases this can lead to a collision between two different words.
 * When a key is already present while a word is being added, a message is logged
 * and the newly added word and vector replace the previous ones.
 *
 * <p>
 * NOTE: every lookup and insertion goes through a single shared
 * <code>MessageDigest</code> instance and nothing in this class synchronizes
 * access to it, so an instance should not be used from several threads at once
 * without external synchronization.
 *
 * @author Danilo Croce, Simone Filice
 *
 */
@JsonTypeName("wordspace")
public class Wordspace implements WordspaceI {

    private final static Logger logger = LoggerFactory.getLogger(Wordspace.class);

    /**
     * Charset used both for reading the matrix file and for hashing the words
     */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /**
     * Separator of the vector components inside the last field of a matrix line
     */
    private static final Pattern COMPONENT_SEPARATOR = Pattern.compile(",");

    private String matrixPath;

    /**
     * The vectors of the Word Space, indexed by the MD5-derived key of their word
     */
    @JsonIgnore
    private TLongObjectHashMap<Vector> vectors;

    /**
     * The words represented in the Word Space, indexed by their MD5-derived key
     */
    @JsonIgnore
    private TLongObjectHashMap<char[]> words;

    @JsonIgnore
    private MessageDigest wordEncoder;

    /**
     * Creates an empty wordspace.
     *
     * @throws IllegalStateException if the MD5 digest algorithm is not available
     */
    public Wordspace() {
        words = new TLongObjectHashMap<char[]>();
        vectors = new TLongObjectHashMap<Vector>();
        try {
            wordEncoder = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e) {
            // Without the digest no word could ever be encoded: fail immediately
            // instead of leaving a half-initialized instance around.
            throw new IllegalStateException("MD5 message digest is not available", e);
        }
    }

    /**
     * Creates a wordspace and loads into it the word vectors stored in the given file.
     *
     * @param matrixPath the path of the file containing the word-vector pairs
     * @throws IOException if the file cannot be read
     * @see #setMatrixPath(String)
     */
    public Wordspace(String matrixPath) throws IOException {
        this();
        this.setMatrixPath(matrixPath);
    }

    /**
     * Associates <code>vector</code> to <code>word</code>. If the key of the word is
     * already in use (same word added twice, or MD5 collision between different
     * words) a message is logged and the previous entry is overwritten.
     *
     * @param word the word to be added
     * @param vector the vector associated to the word
     */
    @Override
    public void addWordVector(String word, Vector vector) {
        long key = md5Encode(word);

        if (vectors.containsKey(key)) {
            logger.error("Warning: collision while reading matrix. The word " + word + " collides with "
                    + String.valueOf(words.get(key)));
        }

        vectors.put(key, vector);
        words.put(key, word.toCharArray());
    }

    /**
     * Computes the 64-bit key of a word: the first 8 bytes of the MD5 digest of
     * its UTF-8 encoding, read as a big-endian <code>long</code>.
     *
     * @param str the word to encode, must not be <code>null</code>
     * @return the key under which the word is stored
     */
    private long md5Encode(String str) {
        byte[] digest = wordEncoder.digest(str.getBytes(UTF_8));
        return ByteBuffer.wrap(digest).getLong();
    }

    /**
     * Loads the word-vector pairs stored in the file whose path is <code>filename</code>.
     * The file can be a plain text file or a .gz archive.
     * <p>
     * The expected format is:
     * <pre>
     * number_of_vectors space_dimensionality
     * word_i [TAB] 1.0 [TAB] 0 [TAB] vector values comma separated
     * </pre>
     * Example:
     * <pre>
     * 3 5
     * dog::n [TAB] 1.0 [TAB] 0 [TAB] 2.1,4.1,1.4,2.3,0.9
     * cat::n [TAB] 1.0 [TAB] 0 [TAB] 3.2,4.3,1.2,2.2,0.8
     * mouse::n [TAB] 1.0 [TAB] 0 [TAB] 2.4,4.4,2.4,1.3,0.92
     * </pre>
     * Lines that do not contain any TAB (such as the header) are skipped. Every
     * vector is scaled to unit Euclidean norm before being stored; a vector whose
     * norm is zero is stored unchanged.
     *
     * @param filename the path of the file containing the word-vector pairs
     * @throws IOException if the file cannot be opened or read
     */
    private void populate(String filename) throws IOException {
        BufferedReader reader = openReader(filename);
        try {
            // The dimensionality is fixed by the first vector line: every following
            // line is read up to that many components.
            float[] values = null;
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.contains("\t")) {
                    continue;
                }
                ArrayList<String> fields = mySplit(line);
                String label = fields.get(0);
                String[] components = COMPONENT_SEPARATOR.split(fields.get(3), 0);
                if (values == null) {
                    values = new float[components.length];
                }

                float squaredNorm = 0;
                for (int i = 0; i < values.length; i++) {
                    values[i] = Float.parseFloat(components[i]);
                    squaredNorm += values[i] * values[i];
                }
                float norm = (float) Math.sqrt(squaredNorm);
                // Dividing an all-zero vector by its norm would turn every
                // component into NaN: leave such a vector as it is.
                if (norm != 0f) {
                    for (int i = 0; i < values.length; i++) {
                        values[i] /= norm;
                    }
                }

                // The buffer is reused across lines, so its content is copied
                // into a fresh matrix for each word.
                DenseMatrix64F featureVector = new DenseMatrix64F(1, values.length);
                for (int i = 0; i < values.length; i++) {
                    featureVector.set(0, i, (double) values[i]);
                }

                addWordVector(label, new DenseVector(featureVector));
            }
        } finally {
            // Closing the reader also closes the wrapped (possibly gzip) stream,
            // even when reading or parsing fails half-way.
            reader.close();
        }
    }

    /**
     * Opens a UTF-8 reader on the given file, transparently decompressing it when
     * its name ends with <code>.gz</code>.
     *
     * @param filename the path of the file to open
     * @return a buffered reader over the (decompressed) content of the file
     * @throws IOException if the file cannot be opened or is not a valid gzip stream
     */
    private static BufferedReader openReader(String filename) throws IOException {
        FileInputStream fileStream = new FileInputStream(filename);
        try {
            if (filename.endsWith(".gz")) {
                return new BufferedReader(new InputStreamReader(new GZIPInputStream(fileStream), UTF_8));
            }
            return new BufferedReader(new InputStreamReader(fileStream, UTF_8));
        } catch (IOException e) {
            // The gzip header could not be read: do not leak the file handle.
            fileStream.close();
            throw e;
        }
    }

    /**
     * Splits a line on TAB characters, keeping empty fields (including a trailing one).
     *
     * @param s the line to split
     * @return the TAB-separated fields of <code>s</code>, in order
     */
    private ArrayList<String> mySplit(String s) {
        ArrayList<String> fields = new ArrayList<String>();
        int start = 0;
        for (int i = 0; i < s.length(); i++) {
            if (s.charAt(i) == '\t') {
                fields.add(s.substring(start, i));
                start = i + 1;
            }
        }
        fields.add(s.substring(start, s.length()));
        return fields;
    }

    /**
     * Returns the vector associated to the given word.
     *
     * @param word the word to look up
     * @return the vector of the word, or <code>null</code> if the word is
     *         <code>null</code> or is not in the wordspace
     */
    @Override
    public Vector getVector(String word) {
        if (word == null) {
            return null;
        }
        return vectors.get(md5Encode(word));
    }

    /**
     * Returns the words stored in the wordspace.
     *
     * @return a newly allocated array containing every stored word as a
     *         <code>char[]</code>
     */
    @Override
    public char[][] getDictionaryDanilo() {
        // The no-argument values() builds an Object[], which cannot be cast to
        // char[][]: ask the map to fill an array of the right type instead.
        return this.words.values(new char[this.words.size()][]);
    }

    /**
     * @return the matrixPath
     */
    public String getMatrixPath() {
        return matrixPath;
    }

    /**
     * Sets the path of the file where the word vectors are stored and
     * loads them. Vectors already in the wordspace are kept.
     * The file can be a plain text file or a .gz archive.
     * <p>
     * The expected format is:
     * <pre>
     * number_of_vectors space_dimensionality
     * word_i [TAB] 1.0 [TAB] 0 [TAB] vector values comma separated
     * </pre>
     * Example:
     * <pre>
     * 3 5
     * dog::n [TAB] 1.0 [TAB] 0 [TAB] 2.1,4.1,1.4,2.3,0.9
     * cat::n [TAB] 1.0 [TAB] 0 [TAB] 3.2,4.3,1.2,2.2,0.8
     * mouse::n [TAB] 1.0 [TAB] 0 [TAB] 2.4,4.4,2.4,1.3,0.92
     * </pre>
     *
     * @param matrixPath the matrixPath to set
     * @throws IOException if the file cannot be opened or read
     */
    public void setMatrixPath(String matrixPath) throws IOException {
        this.matrixPath = matrixPath;
        this.populate(matrixPath);
    }

}