Java tutorial: term disambiguation with TF-IDF, cosine similarity, and WordNet (the SemanticUtils class)
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package nl.uva.sne.commons;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.didion.jwnl.JWNL;
import net.didion.jwnl.JWNLException;
import net.didion.jwnl.data.IndexWord;
import net.didion.jwnl.data.IndexWordSet;
import net.didion.jwnl.data.POS;
import net.didion.jwnl.dictionary.Dictionary;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.json.simple.parser.ParseException;

/**
 *
 * @author S. Koulouzis
 */
public class SemanticUtils {

    static Set<String> stopwords = new HashSet<>();
    public static String stopwordsFile = System.getProperty("user.home")
            + File.separator + "workspace" + File.separator + "TEXT2"
            + File.separator + "etc" + File.separator + "stopwords.csv";
    static Map<String, Map<String, Double>> termDocCache;
    static Set<String> nonLematizedWords = new HashSet<>();
    private static Dictionary wordNetdictionary;
    private static File nonLematizedWordsFile = new File(System.getProperty("user.home")
            + File.separator + "workspace" + File.separator + "TEXT2"
            + File.separator + "termXtraction" + File.separator + "etc"
            + File.separator + "nonLematizedWords");
    private static Map<String, String> stemCache = new HashMap<>();

    static {
        try {
            JWNL.initialize(new FileInputStream(System.getProperty("user.home")
                    + File.separator + "workspace" + File.separator + "TEXT2"
                    + File.separator + "etc" + File.separator + "file_properties.xml"));
        } catch (JWNLException | FileNotFoundException ex) {
            Logger.getLogger(SemanticUtils.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    public static double tfIdf(List<String> doc, List<List<String>> docs, String term) {
        return tf(doc, term) * idf(docs, term);
    }

    private static double tf(List<String> doc, String term) {
        double result = 0;
        for (String word : doc) {
            if (term.equalsIgnoreCase(word)) {
                result++;
            }
        }
        return result / (double) doc.size();
    }

    private static double idf(List<List<String>> docs, String term) {
        double n = 0;
        for (List<String> doc : docs) {
            for (String word : doc) {
                if (term.equalsIgnoreCase(word)) {
                    n++;
                    break;
                }
            }
        }
        if (n <= 0) {
            n = 1;
        }
        return Math.log(docs.size() / n);
    }
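    /*
     * Illustrative example (not part of the original class): with the token lists
     *   docA = ["cloud", "storage", "cloud"] and docB = ["network", "latency"],
     * tfIdf(docA, [docA, docB], "cloud") computes
     *   tf  = 2 / 3              (term count over document length)
     *   idf = ln(2 / 1) ≈ 0.693  (natural log of #documents over #documents containing the term)
     * so the resulting weight is roughly 0.667 * 0.693 ≈ 0.46.
     */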
    // Code from org.apache.commons.text.similarity.
    /**
     * Calculates the cosine similarity for two given vectors.
     *
     * @param leftVector  left vector
     * @param rightVector right vector
     * @return cosine similarity between the two vectors
     */
    public static Double cosineSimilarity(Map<String, Double> leftVector, Map<String, Double> rightVector) {
        if (leftVector == null || rightVector == null) {
            throw new IllegalArgumentException("Vectors must not be null");
        }
        Set<String> intersection = getIntersection(leftVector, rightVector);
//        System.err.println(leftVector);
//        System.err.println(rightVector);
        double dotProduct = dot(leftVector, rightVector, intersection);
        double d1 = 0.0d;
        for (Double value : leftVector.values()) {
            d1 += Math.pow(value, 2);
        }
        double d2 = 0.0d;
        for (Double value : rightVector.values()) {
            d2 += Math.pow(value, 2);
        }
        double cosineSimilarity;
        if (d1 <= 0.0 || d2 <= 0.0) {
            cosineSimilarity = 0.0;
        } else {
            double a = Math.sqrt(d1) * Math.sqrt(d2);
            cosineSimilarity = (dotProduct / a);
        }
        return cosineSimilarity;
    }

    /**
     * Returns a set with strings common to the two given maps.
     *
     * @param leftVector  left vector map
     * @param rightVector right vector map
     * @return common strings
     */
    private static Set<String> getIntersection(Map<String, Double> leftVector, Map<String, Double> rightVector) {
//        ValueComparator bvc = new ValueComparator(leftVector);
//        TreeMap<String, Double> Lsorted_map = new TreeMap(bvc);
//        Lsorted_map.putAll(leftVector);
//
//        bvc = new ValueComparator(rightVector);
//        TreeMap<String, Double> Rsorted_map = new TreeMap(bvc);
//        Rsorted_map.putAll(rightVector);
//
//        SortedSet<String> Lkeys = new TreeSet<>(leftVector.keySet());
//        SortedSet<String> Rkeys = new TreeSet<>(rightVector.keySet());
        Set<String> intersection = new HashSet<>(leftVector.keySet());
        intersection.retainAll(rightVector.keySet());
        return intersection;
    }

    /**
     * Computes the dot product of two vectors. It ignores the remaining elements:
     * if one vector is longer than the other, only the common part is used to
     * compute the dot product.
     *
     * @param leftVector   left vector
     * @param rightVector  right vector
     * @param intersection common elements
     * @return the dot product
     */
    private static double dot(Map<String, Double> leftVector, Map<String, Double> rightVector, Set<String> intersection) {
        Double dotProduct = 0.0;
        for (String key : intersection) {
            dotProduct += leftVector.get(key) * rightVector.get(key);
        }
        return dotProduct;
    }
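    /*
     * Illustrative example (not part of the original class): for the sparse vectors
     *   left  = {cloud=1.0, data=1.0}   and   right = {cloud=1.0, network=1.0},
     * the intersection is {cloud}, the dot product is 1.0, and both norms are sqrt(2),
     * so cosineSimilarity(left, right) = 1.0 / 2.0 = 0.5.
     */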
    public static Set<String> getDocument(Term term) throws IOException, JWNLException, MalformedURLException, ParseException {
        Set<String> doc = new HashSet<>();
        List<String> g = term.getGlosses();
        if (g != null) {
            for (String s : g) {
                if (s != null) {
                    doc.addAll(tokenize(s, true));
                }
            }
        }
        List<String> al = term.getAlternativeLables();
        if (al != null) {
            for (String s : al) {
                if (s != null) {
                    doc.addAll(tokenize(s, true));
                }
            }
        }
        List<String> cat = term.getCategories();
        if (cat != null) {
            for (String s : cat) {
                if (s != null) {
                    doc.addAll(tokenize(s, true));
                }
            }
        }
        return doc;
    }

    public static List<String> getNGrams(String text, int maxNGrams) throws IOException {
        List<String> words = new ArrayList<>();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42, CharArraySet.EMPTY_SET);
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        StopFilter stopFilter = new StopFilter(Version.LUCENE_42, tokenStream, getStopWords());
        stopFilter.setEnablePositionIncrements(false);
//        SnowballFilter snowballFilter = new SnowballFilter(stopFilter, "English");
        try (ShingleFilter sf = new ShingleFilter(stopFilter, 2, maxNGrams)) {
            sf.setOutputUnigrams(false);
            CharTermAttribute charTermAttribute = sf.addAttribute(CharTermAttribute.class);
            sf.reset();
            while (sf.incrementToken()) {
                String word = charTermAttribute.toString();
                words.add(word.replaceAll(" ", "_"));
            }
            sf.end();
        }
        return words;
    }

    public static TokenStream tokenStemStream(String fieldName, Reader reader) throws IOException {
        // Whitespace tokenization, stop-word removal, then Porter stemming.
        TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_42, reader);
        stream = new StopFilter(Version.LUCENE_42, stream, getStopWords());
        stream = new PorterStemFilter(stream);
        return stream;
    }

    public static TokenStream tokenStream(String fieldName, Reader reader) throws IOException {
        // Non-stemming path: analyzer-based token stream with the custom stop-word set.
        Analyzer analyzer = new ArmenianAnalyzer(Version.LUCENE_42, getStopWords());
        TokenStream stream = analyzer.tokenStream("field", reader);
        return stream;
    }

    public static List<String> tokenize(String text, boolean stem) throws IOException, JWNLException {
        // Normalise the text: map typographic apostrophes (\u2019) to ASCII, drop digits,
        // strip punctuation except ' and -, and remove common English contractions.
        text = text.replaceAll("\u2019", "'");
        text = text.replaceAll("_", " ");
        text = text.replaceAll("[0-9]", "");
        text = text.replaceAll("[\\p{Punct}&&[^'-]]+", " ");
        text = text.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
        text = text.toLowerCase();
        TokenStream tokenStream;
        if (stem) {
            tokenStream = tokenStemStream("field", new StringReader(text));
        } else {
            tokenStream = tokenStream("field", new StringReader(text));
        }
        ArrayList<String> words = new ArrayList<>();
        try {
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                words.add(term.toString());
            }
            tokenStream.end();
        } finally {
            tokenStream.close();
        }
//        Logger.getLogger(SemanticUtils.class.getName()).log(Level.INFO, "Returning {0}:", words.size() + " tokens");
        return words;
    }

    private static CharArraySet getStopWords() throws IOException {
        if (stopwords.isEmpty()) {
            readStopWords();
        }
        return new CharArraySet(Version.LUCENE_42, stopwords, true);
    }

    private static void readStopWords() throws IOException {
        try (BufferedReader br = new BufferedReader(new FileReader(stopwordsFile))) {
            for (String line; (line = br.readLine()) != null;) {
                stopwords.add(line);
            }
        }
    }
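    /*
     * Illustrative example (not part of the original class, and assuming the
     * stop-word CSV exists at the hard-coded path and contains none of these words):
     * tokenize("Storage systems keep data", true) lower-cases the text, strips digits
     * and most punctuation, removes stop words and applies the Porter stemmer,
     * yielding roughly ["storag", "system", "keep", "data"];
     * getNGrams("storage systems keep data", 2) instead emits bigrams such as
     * "storage_systems" and "systems_keep".
     */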
    public static Map<String, Double> getTermsInDoc(Map<String, Double> termDictionaray, File f) throws IOException, JWNLException {
        // Counts, per document file, how often each dictionary term occurs; results are cached by file path.
        if (termDocCache == null) {
            termDocCache = new HashMap<>();
        }
        Map<String, Double> termsInDoc = termDocCache.get(f.getAbsolutePath());
        if (termsInDoc != null) {
            return termsInDoc;
        }
        termsInDoc = new HashMap<>();
        List<String> tokens = new ArrayList<>();
        try (BufferedReader br = new BufferedReader(new FileReader(f))) {
            for (String text; (text = br.readLine()) != null;) {
                tokens.addAll(tokenize(text, true));
            }
        }
        for (String term : termDictionaray.keySet()) {
            for (String t : tokens) {
                if (term.equals(t)) {
                    Double tf = termsInDoc.get(t);
                    if (tf != null) {
                        tf++;
                    } else {
                        tf = 1.0;
                    }
                    termsInDoc.put(t, tf);
                }
            }
        }
        termDocCache.put(f.getAbsolutePath(), termsInDoc);
        return termsInDoc;
    }

    public static String lemmatize(String word) throws JWNLException, FileNotFoundException, MalformedURLException, IOException, ParseException, Exception {
        if (nonLemetize(word) || word.contains("_")) {
            return word;
        }
        wordNetdictionary = getWordNetDictionary();
        IndexWordSet set = wordNetdictionary.lookupAllIndexWords(word);
        for (IndexWord iw : set.getIndexWordArray()) {
            return iw.getLemma();
        }
//        word = lmmtizeFromOnlineWordNet(word, language);
//        word = lemmatizeFromBabelNet(word, language);
        return word;
    }

    public static boolean nonLemetize(String word) throws FileNotFoundException, IOException {
        if (nonLematizedWords == null || nonLematizedWords.isEmpty()) {
            loadNonLematizeWords();
        }
        return nonLematizedWords.contains(word);
    }

    private static Dictionary getWordNetDictionary() {
        if (wordNetdictionary == null) {
            wordNetdictionary = Dictionary.getInstance();
        }
        return wordNetdictionary;
    }

    private static void loadNonLematizeWords() throws FileNotFoundException, IOException {
        if (nonLematizedWordsFile.exists() && nonLematizedWordsFile.length() > 1) {
            try (BufferedReader br = new BufferedReader(new FileReader(nonLematizedWordsFile))) {
                String line;
                while ((line = br.readLine()) != null) {
                    nonLematizedWords.add(line);
                }
            }
        }
    }

    public static POS[] getPOS(String s) throws JWNLException {
        // Look up all IndexWords (an IndexWord can only be one POS)
        wordNetdictionary = getWordNetDictionary();
        IndexWordSet set = wordNetdictionary.lookupAllIndexWords(s);
        // Turn it into an array of IndexWords
        IndexWord[] words = set.getIndexWordArray();
        // Make the array of POS
        POS[] pos = new POS[words.length];
        for (int i = 0; i < words.length; i++) {
            pos[i] = words[i].getPOS();
        }
        return pos;
    }
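    /*
     * Illustrative example (not part of the original class, and assuming the JWNL
     * file_properties.xml points at a local WordNet installation with morphological
     * processing enabled): lemmatize("computers") would typically return "computer"
     * via the first matching WordNet index word, and getPOS("computer") would return
     * the parts of speech WordNet knows for that word, e.g. [NOUN].
     */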
    public static Set<Term> tf_idf_Disambiguation(Set<Term> possibleTerms, Set<String> nGrams, String lemma, double confidence, boolean matchTitle) throws IOException, JWNLException, ParseException {
        List<List<String>> allDocs = new ArrayList<>();
        Map<String, List<String>> docs = new HashMap<>();
        if (possibleTerms == null || possibleTerms.size() < 10) {
            return null;
        }
        // Build one pseudo-document per candidate term (glosses, alternative labels, categories).
        for (Term tv : possibleTerms) {
            Set<String> doc = SemanticUtils.getDocument(tv);
            allDocs.add(new ArrayList<>(doc));
            docs.put(tv.getUID(), new ArrayList<>(doc));
        }
        // Build a "context" document from the query n-grams, excluding the lemma itself.
        Set<String> contextDoc = new HashSet<>();
        for (String s : nGrams) {
            if (s.contains("_")) {
                String[] parts = s.split("_");
                for (String token : parts) {
                    if (token.length() >= 1 && !token.contains(lemma)) {
                        contextDoc.add(token);
                    }
                }
            } else if (s.length() >= 1 && !s.contains(lemma)) {
                contextDoc.add(s);
            }
        }
        docs.put("context", new ArrayList<>(contextDoc));

        // TF-IDF feature vector for every document.
        Map<String, Map<String, Double>> featureVectors = new HashMap<>();
        for (String k : docs.keySet()) {
            List<String> doc = docs.get(k);
            Map<String, Double> featureVector = new TreeMap<>();
            for (String term : doc) {
                if (!featureVector.containsKey(term)) {
                    double tfidf = tfIdf(doc, allDocs, term);
                    featureVector.put(term, tfidf);
                }
            }
            featureVectors.put(k, featureVector);
        }

        // Score each candidate: cosine similarity to the context, penalised by the
        // edit distance between the stemmed candidate title and the stemmed lemma.
        Map<String, Double> contextVector = featureVectors.remove("context");
        Map<String, Double> scoreMap = new HashMap<>();
        for (String key : featureVectors.keySet()) {
            Double similarity = cosineSimilarity(contextVector, featureVectors.get(key));
            for (Term t : possibleTerms) {
                if (t.getUID().equals(key)) {
                    String stemTitle = stem(t.getLemma().toLowerCase());
                    String stemLema = stem(lemma);
//                    List<String> subTokens = new ArrayList<>();
//                    if (!t.getLemma().toLowerCase().startsWith("(") && t.getLemma().toLowerCase().contains("(") && t.getLemma().toLowerCase().contains(")")) {
//                        int index1 = t.getLemma().toLowerCase().indexOf("(") + 1;
//                        int index2 = t.getLemma().toLowerCase().indexOf(")");
//                        String sub = t.getLemma().toLowerCase().substring(index1, index2);
//                        subTokens.addAll(tokenize(sub, true));
//                    }
//
//                    List<String> nTokens = new ArrayList<>();
//                    for (String s : nGrams) {
//                        if (s.contains("_")) {
//                            String[] parts = s.split("_");
//                            for (String token : parts) {
//                                nTokens.addAll(tokenize(token, true));
//                            }
//                        } else {
//                            nTokens.addAll(tokenize(s, true));
//                        }
//                    }
//                    if (t.getCategories() != null) {
//                        for (String s : t.getCategories()) {
//                            if (s != null && s.contains("_")) {
//                                String[] parts = s.split("_");
//                                for (String token : parts) {
//                                    subTokens.addAll(tokenize(token, true));
//                                }
//                            } else if (s != null) {
//                                subTokens.addAll(tokenize(s, true));
//                            }
//                        }
//                    }
////                    System.err.println(t.getGlosses());
//                    Set<String> intersection = new HashSet<>(nTokens);
//                    intersection.retainAll(subTokens);
//                    if (intersection.isEmpty()) {
//                        similarity -= 0.1;
//                    }
                    int dist = edu.stanford.nlp.util.StringUtils.editDistance(stemTitle, stemLema);
                    similarity = similarity - (dist * 0.05);
                    t.setConfidence(similarity);
                }
            }
            scoreMap.put(key, similarity);
        }
        if (scoreMap.isEmpty()) {
            return null;
        }
        // Pick the best-ranked candidate; give up if it does not reach the confidence threshold.
        ValueComparator bvc = new ValueComparator(scoreMap);
        TreeMap<String, Double> sorted_map = new TreeMap<>(bvc);
        sorted_map.putAll(scoreMap);
//        System.err.println(sorted_map);
        Iterator<String> it = sorted_map.keySet().iterator();
        String winner = it.next();
        Double s1 = scoreMap.get(winner);
        if (s1 < confidence) {
            return null;
        }
        Set<Term> terms = new HashSet<>();
        for (Term t : possibleTerms) {
            if (t.getUID().equals(winner)) {
                terms.add(t);
            }
        }
        if (!terms.isEmpty()) {
            return terms;
        } else {
            Logger.getLogger(SemanticUtils.class.getName()).log(Level.INFO, "No winner");
            return null;
        }
    }
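    /*
     * Note (added for illustration): the score assigned to each candidate above is
     *   score = cosineSimilarity(contextVector, candidateVector)
     *           - 0.05 * editDistance(stem(candidateLemma), stem(queryLemma))
     * and the best-ranked candidate (the first key of the ValueComparator-sorted map)
     * is returned only if its score reaches the given confidence threshold; otherwise
     * the method returns null.
     */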
    public static Term disambiguate(String term, Set<Term> possibleTerms, String allTermsDictionary, double confidence, boolean matchTitle) throws IOException, JWNLException, ParseException {
        Set<String> ngarms = FileUtils.getNGramsFromTermDictionary(term, allTermsDictionary);
        possibleTerms = SemanticUtils.tf_idf_Disambiguation(possibleTerms, ngarms, term, confidence, matchTitle);
        Term dis = null;
        if (possibleTerms != null && possibleTerms.size() == 1) {
            dis = possibleTerms.iterator().next();
        }
//        if (dis != null) {
//            Logger.getLogger(SemanticUtils.class.getName()).log(Level.INFO, "Term: {0}. Confidence: {1}", new Object[]{dis, dis.getConfidence()});
//        } else {
//            Logger.getLogger(SemanticUtils.class.getName()).log(Level.INFO, "Couldn''t figure out what ''{0}'' means", term);
//        }
        return dis;
    }

    public static String stem(String string) throws IOException, JWNLException {
        if (string.length() < 1) {
            return string;
        }
        String res = stemCache.get(string);
        if (res != null) {
            return res;
        }
        List<String> stems = SemanticUtils.tokenize(string, true);
        StringBuilder stem = new StringBuilder();
        for (String s : stems) {
            stem.append(s).append(" ");
        }
        if (stem.length() > 1) {
            stem.deleteCharAt(stem.length() - 1);
        }
        stemCache.put(string, stem.toString());
        return stem.toString();
    }
}
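Putting the pieces together, the sketch below (a hypothetical demo class, not part of the original source) shows how the tokenizing, TF-IDF and cosine-similarity utilities might be called. It assumes the stop-word CSV and the JWNL file_properties.xml exist at the hard-coded paths used by SemanticUtils, and that the Lucene 4.2 and JWNL jars are on the classpath.

package nl.uva.sne.commons;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical usage sketch for SemanticUtils; names and inputs are illustrative only.
public class SemanticUtilsDemo {

    public static void main(String[] args) throws Exception {
        // Tokenize and stem two short "documents".
        List<String> docA = SemanticUtils.tokenize("Cloud storage keeps data in the cloud.", true);
        List<String> docB = SemanticUtils.tokenize("Network latency limits throughput.", true);

        // TF-IDF weight of the (stemmed) token "cloud" in docA, relative to both documents.
        double weight = SemanticUtils.tfIdf(docA, Arrays.asList(docA, docB), "cloud");
        System.out.println("tf-idf(cloud, docA) = " + weight);

        // Cosine similarity between two tiny hand-built feature vectors.
        Map<String, Double> left = new HashMap<>();
        left.put("cloud", 1.0);
        left.put("data", 1.0);
        Map<String, Double> right = new HashMap<>();
        right.put("cloud", 1.0);
        right.put("network", 1.0);
        System.out.println("cosine = " + SemanticUtils.cosineSimilarity(left, right)); // 0.5 for these vectors
    }
}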