eu.edisonproject.training.wsd.DisambiguatorImpl.java Source code

Introduction

Here is the source code for eu.edisonproject.training.wsd.DisambiguatorImpl.java. The class disambiguates candidate terms: for each ambiguous term it loads the possible senses from an HBase table, builds a TF-IDF feature vector for every sense from its glosses, alternative labels, and categories, builds a context vector from the term's n-grams, and returns the sense whose cosine similarity to the context is highest, provided the score clears a configurable minimum-similarity threshold.
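
Before the listing, a minimal usage sketch. It is hypothetical: the DisambiguatorExample class name, the property values, and the etc/candidates.csv path are invented for illustration, a running HBase instance reachable through the project's DBTools is assumed, and the default stop-word and itemset files are assumed to exist.

package eu.edisonproject.training.wsd;

import eu.edisonproject.utility.commons.Term;
import java.util.List;
import java.util.Properties;

public class DisambiguatorExample {

    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.setProperty("num.of.terms", "5");         // process at most 5 candidate lines
        props.setProperty("offset.terms", "1");         // start at the first line
        props.setProperty("minimum.similarity", "0.3"); // reject winners scoring below 0.3

        DisambiguatorImpl disambiguator = new DisambiguatorImpl();
        disambiguator.configure(props);

        // Hypothetical candidates file: one candidate per line, term in the first column.
        List<Term> resolved = disambiguator.disambiguateTerms("etc/candidates.csv");
        for (Term t : resolved) {
            System.out.println(t.getUid());
        }
    }
}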

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package eu.edisonproject.training.wsd;

import eu.edisonproject.training.tfidf.mapreduce.ITFIDFDriver;
import eu.edisonproject.training.tfidf.mapreduce.TFIDFDriverImpl;
import eu.edisonproject.utility.commons.Term;
import eu.edisonproject.utility.commons.TermAvroSerializer;
import eu.edisonproject.utility.commons.TermFactory;
import eu.edisonproject.utility.commons.ValueComparator;
import eu.edisonproject.utility.file.CSVFileReader;
import eu.edisonproject.utility.file.ConfigHelper;
import eu.edisonproject.utility.file.DBTools;
import eu.edisonproject.utility.text.processing.StanfordLemmatizer;
import eu.edisonproject.utility.text.processing.Stemming;
import eu.edisonproject.utility.text.processing.StopWord;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.json.simple.parser.ParseException;

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.SubstringComparator;
import org.apache.hadoop.hbase.filter.ValueFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Word-sense disambiguator: candidate senses are fetched from HBase, turned
 * into TF-IDF feature vectors, and ranked by cosine similarity against a
 * context vector built from the ambiguous term's n-grams.
 *
 * @author S. Koulouzis
 */
public class DisambiguatorImpl implements Disambiguator, Callable<Term> {

    private Integer limit;
    private Double minimumSimilarity;
    private Integer lineOffset;
    private String termToProcess;
    private static String stopWordsPath;
    private String itemsFilePath;
    public static final TableName TERMS_TBL_NAME = TableName.valueOf("terms");
    protected StopWord tokenizer;
    protected StanfordLemmatizer lematizer;
    protected Stemming stemer;
    private static final Logger LOGGER = Logger.getLogger(DisambiguatorImpl.class.getName());
    private String candidateTermsFile;

    /**
     * Reads candidate terms from a CSV file (one candidate per line, term in
     * the first column) and disambiguates each of them, honoring the
     * configured line offset and term limit.
     *
     * @param candidateTermsFile path to the CSV file with candidate terms
     * @return the disambiguated terms
     * @throws IOException
     * @throws FileNotFoundException
     * @throws ParseException
     */
    @Override
    public List<Term> disambiguateTerms(String candidateTermsFile)
            throws IOException, FileNotFoundException, ParseException {
        LOGGER.log(Level.FINE, "filterredDictionary: {0}", candidateTermsFile);
        this.candidateTermsFile = candidateTermsFile;
        List<Term> terms = new ArrayList<>();

        File dictionary = new File(candidateTermsFile);
        int count = 0;
        int lineCount = 1;
        try (BufferedReader br = new BufferedReader(new FileReader(dictionary))) {
            for (String line; (line = br.readLine()) != null;) {
                if (lineCount >= getLineOffset()) {

                    String[] parts = line.split(",");
                    String term = parts[0];
                    // parts[1] holds the candidate's score; it is not needed here.
                    if (term.length() >= 1) {
                        count++;
                        if (count > getLimit()) {
                            break;
                        }
                        LOGGER.log(Level.INFO, "Processing: {0} at line: {1} of {2}",
                                new Object[] { line, lineCount, getLimit() });
                        Term tt = getTerm(term);
                        if (tt != null) {
                            terms.add(tt);
                        }
                    }
                }
                lineCount++;
            }
        } catch (Exception ex) {
            LOGGER.log(Level.WARNING, "Failed while processing line: " + lineCount + " from: " + candidateTermsFile,
                    ex);
        }
        // Return whatever was collected; a bad line should not discard the
        // terms that were already disambiguated. (Returning from a finally
        // block, as before, would also have suppressed pending exceptions.)
        return terms;
    }

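    /**
     * Loads the runtime configuration. For most keys a matching JVM system
     * property overrides the value in the given Properties object. Recognized
     * keys: num.of.terms (how many candidate lines to process), offset.terms
     * (first line to process), minimum.similarity (threshold for accepting a
     * winner), stop.words.file, itemset.file, and log.level.
     */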
    @Override
    public void configure(Properties properties) {
        String numOfTerms = System.getProperty("num.of.terms");

        if (numOfTerms == null) {
            limit = Integer.valueOf(properties.getProperty("num.of.terms", "5"));
        } else {
            limit = Integer.valueOf(numOfTerms);
        }
        LOGGER.log(Level.FINE, "num.of.terms: {0}", limit);

        String offset = System.getProperty("offset.terms");

        if (offset == null) {
            lineOffset = Integer.valueOf(properties.getProperty("offset.terms", "1"));
        } else {
            lineOffset = Integer.valueOf(offset);
        }
        LOGGER.log(Level.FINE, "offset.terms: {0}", lineOffset);
        String minimumSimilarityStr = System.getProperty("minimum.similarity");
        if (minimumSimilarityStr == null) {
            minimumSimilarityStr = properties.getProperty("minimum.similarity", "0.3");
        }
        minimumSimilarity = Double.valueOf(minimumSimilarityStr);
        LOGGER.log(Level.FINE, "minimum.similarity: {0}", lineOffset);
        stopWordsPath = System.getProperty("stop.words.file");

        if (stopWordsPath == null) {
            stopWordsPath = properties.getProperty("stop.words.file",
                    ".." + File.separator + "etc" + File.separator + "stopwords.csv");
        }
        LOGGER.log(Level.FINE, "stop.words.file: {0}", stopWordsPath);
        itemsFilePath = System.getProperty("itemset.file");
        if (itemsFilePath == null) {
            itemsFilePath = properties.getProperty("itemset.file",
                    ".." + File.separator + "etc" + File.separator + "allTerms.csv");
        }
        LOGGER.log(Level.FINE, "itemset.file: {0}", itemsFilePath);
        CharArraySet stopwordsCharArray = new CharArraySet(ConfigHelper.loadStopWords(stopWordsPath), true);
        tokenizer = new StopWord(stopwordsCharArray);
        lematizer = new StanfordLemmatizer();
        stemer = new Stemming();

        Level level = Level.parse(properties.getProperty("log.level", "INFO"));

        Handler[] handlers = Logger.getLogger("").getHandlers();
        for (int index = 0; index < handlers.length; index++) {
            handlers[index].setLevel(level);
        }
        LOGGER.setLevel(level);
    }

    @Override
    public Term getTerm(String term) throws IOException, ParseException {
        Set<String> termsStr = getPossibleTermsFromDB(term, null);
        if (termsStr != null) {
            Set<Term> possibaleTerms = new HashSet<>();
            for (String jsonTerm : termsStr) {
                possibaleTerms.add(TermFactory.create(jsonTerm));
            }
            String delimeter = ",";
            String wordSeperator = " ";
            Set<String> ngarms = CSVFileReader.getNGramsForTerm(term, getItemsFilePath(), delimeter, wordSeperator);
            return disambiguate(term, possibaleTerms, ngarms, getMinimumSimilarity());
        } else {
            return null;
        }
    }

    /**
     * @return the limit
     */
    public Integer getLimit() {
        return limit;
    }

    /**
     * @return the minimumSimilarity
     */
    public Double getMinimumSimilarity() {
        return minimumSimilarity;
    }

    /**
     * @return the lineOffset
     */
    public Integer getLineOffset() {
        return lineOffset;
    }

    /**
     * @return the termToProcess
     */
    public String getTermToProcess() {
        return termToProcess;
    }

    /**
     * @param termToProcess the termToProcess to set
     */
    public void setTermToProcess(String termToProcess) {
        this.termToProcess = termToProcess;
    }

    @Override
    public Term call() throws Exception {
        return getTerm(getTermToProcess());
    }

    /**
     * @return the stopWordsPath
     */
    public String getStopWordsPath() {
        return stopWordsPath;
    }

    protected Term disambiguate(String term, Set<Term> possibleTerms, Set<String> ngrams, double minimumSimilarity)
            throws IOException, ParseException {
        Set<Term> dis = tf_idf_Disambiguation(possibleTerms, ngrams, term, minimumSimilarity, true);
        if (dis != null) {
            return dis.iterator().next();
        }
        return null;
    }

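    /**
     * Core scoring step. Every candidate sense is expanded into a
     * bag-of-words document (see getDocument), and the term's n-grams are
     * cleaned and lemmatized into a "context" document. A TF-IDF feature
     * vector is computed per document, and each candidate is scored by the
     * cosine similarity of its vector to the context vector, minus a penalty
     * proportional to the edit distance between the stemmed candidate lemma
     * and the stemmed query lemma (a smaller penalty factor is used when one
     * stem contains the other). The top-ranked candidate is returned only if
     * its score reaches the given confidence. Fewer than seven n-grams are
     * treated as too little context to disambiguate.
     */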
    private Set<Term> tf_idf_Disambiguation(Set<Term> possibleTerms, Set<String> nGrams, String lemma,
            double confidence, boolean matchTitle) throws IOException, ParseException {
        LOGGER.log(Level.FINE, "Loaded {0} for {1}", new Object[] { nGrams.size(), lemma });
        if (nGrams.size() < 7) {
            LOGGER.log(Level.WARNING, "Found only {0} n-grams for {1}. Not enough for disambiguation.",
                    new Object[] { nGrams.size(), lemma });
            return null;
        }

        List<List<String>> allDocs = new ArrayList<>();
        Map<CharSequence, List<String>> docs = new HashMap<>();

        for (Term tv : possibleTerms) {
            Set<String> doc = getDocument(tv);
            allDocs.add(new ArrayList<>(doc));
            docs.put(tv.getUid(), new ArrayList<>(doc));
        }

        Set<String> contextDoc = new HashSet<>();
        StringBuilder ngString = new StringBuilder();
        for (String s : nGrams) {
            if (s.contains("_")) {
                String[] parts = s.split("_");
                for (String token : parts) {
                    if (token.length() >= 1 && !token.contains(lemma)) {
                        ngString.append(token).append(" ");
                    }
                }
            } else if (s.length() >= 1 && !s.contains(lemma)) {
                ngString.append(s).append(" ");
            }
        }
        tokenizer.setDescription(ngString.toString());
        String cleanText = tokenizer.execute();
        lematizer.setDescription(cleanText);
        String lematizedText = lematizer.execute();
        List<String> ngList = Arrays.asList(lematizedText.split(" "));
        contextDoc.addAll(ngList);

        docs.put("context", new ArrayList<>(contextDoc));

        Map<CharSequence, Map<String, Double>> featureVectors = new HashMap<>();
        for (CharSequence k : docs.keySet()) {
            List<String> doc = docs.get(k);
            Map<String, Double> featureVector = new TreeMap<>();
            for (String term : doc) {
                if (!featureVector.containsKey(term)) {
                    double tfidf = tfIdf(doc, allDocs, term);
                    featureVector.put(term, tfidf);
                }
            }
            featureVectors.put(k, featureVector);
        }

        Map<String, Double> contextVector = featureVectors.remove("context");
        Map<CharSequence, Double> scoreMap = new HashMap<>();
        for (CharSequence key : featureVectors.keySet()) {
            Double similarity = cosineSimilarity(contextVector, featureVectors.get(key));

            for (Term t : possibleTerms) {
                if (t.getUid().equals(key) && matchTitle) {
                    stemer.setDescription(t.getLemma().toString());
                    String stemTitle = stemer.execute();
                    stemer.setDescription(lemma);
                    String stemLema = stemer.execute();
                    double factor = 0.15;
                    if (stemTitle.length() > stemLema.length()) {
                        if (stemTitle.contains(stemLema)) {
                            factor = 0.075;
                        }
                    } else if (stemLema.length() > stemTitle.length()) {
                        if (stemLema.contains(stemTitle)) {
                            factor = 0.075;
                        }
                    }
                    int dist = edu.stanford.nlp.util.StringUtils.editDistance(stemTitle, stemLema);
                    similarity = similarity - (dist * factor);
                    t.setConfidence(similarity);
                }
            }
            scoreMap.put(key, similarity);
        }

        if (scoreMap.isEmpty()) {
            return null;
        }

        ValueComparator bvc = new ValueComparator(scoreMap);
        TreeMap<CharSequence, Double> sortedMap = new TreeMap<>(bvc);
        sortedMap.putAll(scoreMap);

        Iterator<CharSequence> it = sortedMap.keySet().iterator();
        CharSequence winner = it.next();

        Double s1 = scoreMap.get(winner);
        if (s1 < confidence) {
            return null;
        }

        Set<Term> terms = new HashSet<>();
        for (Term t : possibleTerms) {
            if (t.getUid().equals(winner)) {
                terms.add(t);
            }
        }
        if (!terms.isEmpty()) {
            return terms;
        } else {
            LOGGER.log(Level.INFO, "No winner");
            return null;
        }
    }

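    /**
     * Persists candidate senses for an ambiguous term into the HBase "terms"
     * table: one row per sense, keyed by the sense UID, with the serialized
     * JSON and the ambiguous term stored in separate column families.
     */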
    protected void addPossibleTermsToDB(String ambiguousTerm, Set<Term> terms) throws IOException {
        List<String> families = new ArrayList<>();
        families.add("jsonString");
        families.add("ambiguousTerm");
        DBTools.createOrUpdateTable(TERMS_TBL_NAME, families, false);
        try (Admin admin = DBTools.getConn().getAdmin()) {
            try (Table tbl = DBTools.getConn().getTable(TERMS_TBL_NAME)) {
                for (Term t : terms) {
                    Put put = new Put(Bytes.toBytes(t.getUid().toString()));
                    String jsonStr = TermFactory.term2Json(t).toJSONString();
                    put.addColumn(Bytes.toBytes("jsonString"), Bytes.toBytes("jsonString"), Bytes.toBytes(jsonStr));
                    put.addColumn(Bytes.toBytes("ambiguousTerm"), Bytes.toBytes("ambiguousTerm"),
                            Bytes.toBytes(ambiguousTerm));
                    tbl.put(put);
                }
            }
            admin.flush(TERMS_TBL_NAME);
        }
    }

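    /**
     * Scans the HBase "terms" table for rows whose ambiguousTerm column
     * matches the given term (and, when a url is given, whose JSON contains
     * that url) and returns the stored JSON strings. Rows with no jsonString
     * value are treated as stale and deleted. Returns null if the table does
     * not exist yet.
     */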
    protected Set<String> getPossibleTermsFromDB(String term, CharSequence url) throws IOException {
        try (Admin admin = DBTools.getConn().getAdmin()) {
            if (admin.tableExists(TERMS_TBL_NAME)) {
                try (Table tbl = DBTools.getConn().getTable(TERMS_TBL_NAME)) {
                    // HBase shell equivalent:
                    // scan 'terms', { COLUMNS => 'ambiguousTerm:ambiguousTerm', FILTER => "ValueFilter( =, 'binary:python' )" }
                    Scan scan = new Scan();

                    List<Filter> filterList = new ArrayList<>();
                    ValueFilter valueFilter = new ValueFilter(CompareOp.EQUAL, new SubstringComparator(term));
                    filterList.add(valueFilter);

                    FilterList filter = new FilterList(FilterList.Operator.MUST_PASS_ALL, filterList);

                    scan.setFilter(filter);

                    ResultScanner resultScanner = tbl.getScanner(scan);
                    Iterator<Result> results = resultScanner.iterator();
                    Set<String> jsonTerms = new HashSet<>();
                    while (results.hasNext()) {
                        Result r = results.next();
                        String ambiguousTerm = Bytes.toString(
                                r.getValue(Bytes.toBytes("ambiguousTerm"), Bytes.toBytes("ambiguousTerm")));
                        String jsonStr = Bytes
                                .toString(r.getValue(Bytes.toBytes("jsonString"), Bytes.toBytes("jsonString")));
                        if (jsonStr != null) {
                            if (url != null) {
                                if (ambiguousTerm != null && ambiguousTerm.equals(term) && jsonStr.contains(url)) {
                                    jsonTerms.add(jsonStr);
                                }
                            } else if (ambiguousTerm != null && ambiguousTerm.equals(term)) {
                                jsonTerms.add(jsonStr);
                            }
                        } else {
                            deleteEntryFromTerms(r.getRow());
                        }

                    }
                    return jsonTerms;
                }
            }
        }
        return null;
    }

    private void deleteEntryFromTerms(byte[] id) throws IOException {
        try (Table tbl = DBTools.getConn().getTable(TERMS_TBL_NAME)) {
            Delete d = new Delete(id);
            tbl.delete(d);
        }
    }

    /**
     * @return the itemsFilePath
     */
    public String getItemsFilePath() {
        return itemsFilePath;
    }

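    /**
     * Builds the bag-of-words document for a candidate sense: glosses,
     * alternative labels, and categories are stripped of stop words,
     * lemmatized, and tokenized into a single set of words.
     */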
    private Set<String> getDocument(Term term) throws IOException, MalformedURLException, ParseException {
        Set<String> doc = new HashSet<>();

        List<CharSequence> g = term.getGlosses();
        if (g != null) {
            for (CharSequence s : g) {
                if (s != null && s.length() > 0) {
                    s = s.toString().replaceAll("_", " ");
                    tokenizer.setDescription(s.toString());
                    String cleanText = tokenizer.execute();
                    lematizer.setDescription(cleanText);
                    String lematizedText = lematizer.execute();

                    doc.addAll(Arrays.asList(lematizedText.split(" ")));
                }
            }
        }
        List<CharSequence> al = term.getAltLables();
        if (al != null) {
            for (CharSequence s : al) {
                if (s != null && s.length() > 0) {
                    s = s.toString().replaceAll("_", " ");
                    tokenizer.setDescription(s.toString());
                    String cleanText = tokenizer.execute();
                    lematizer.setDescription(cleanText);
                    String lematizedText = lematizer.execute();

                    doc.addAll(Arrays.asList(lematizedText.split(" ")));
                }
            }
        }
        List<CharSequence> cat = term.getCategories();
        if (cat != null) {
            for (CharSequence s : cat) {
                if (s != null && s.length() > 0) {
                    s = s.toString().replaceAll("_", " ");
                    tokenizer.setDescription(s.toString());
                    String cleanText = tokenizer.execute();
                    lematizer.setDescription(cleanText);
                    String lematizedText = lematizer.execute();
                    doc.addAll(Arrays.asList(lematizedText.split(" ")));
                }
            }
        }
        return doc;
    }

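    /**
     * Standard TF-IDF weight: tf(term, doc) = count(term, doc) / |doc| and
     * idf(term) = ln(N / n), where N is the number of documents and n the
     * number of documents containing the term (clamped to at least 1 to
     * avoid division by zero).
     */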
    private double tfIdf(List<String> doc, List<List<String>> docs, String term) {
        return tf(doc, term) * idf(docs, term);
    }

    private static double idf(List<List<String>> docs, String term) {
        double n = 0;
        for (List<String> doc : docs) {
            for (String word : doc) {
                if (term.equalsIgnoreCase(word)) {
                    n++;
                    break;
                }
            }
        }
        if (n <= 0) {
            n = 1;
        }
        return Math.log(docs.size() / n);
    }

    private static double tf(List<String> doc, String term) {
        double result = 0;
        for (String word : doc) {
            if (term.equalsIgnoreCase(word)) {
                result++;
            }
        }
        return result / (double) doc.size();
    }

    // Adapted from org.apache.commons.text.similarity.CosineSimilarity.
    /**
     * Calculates the cosine similarity for two given vectors.
     *
     * @param leftVector left vector
     * @param rightVector right vector
     * @return cosine similarity between the two vectors
     */
    public static Double cosineSimilarity(Map<String, Double> leftVector, Map<String, Double> rightVector) {
        if (leftVector == null || rightVector == null) {
            throw new IllegalArgumentException("Vectors must not be null");
        }

        Set<String> intersection = getIntersection(leftVector, rightVector);

        double dotProduct = dot(leftVector, rightVector, intersection);
        double d1 = 0.0d;
        for (Double value : leftVector.values()) {
            d1 += Math.pow(value, 2);
        }
        double d2 = 0.0d;
        for (Double value : rightVector.values()) {
            d2 += Math.pow(value, 2);
        }
        double cosineSimilarity;
        if (d1 <= 0.0 || d2 <= 0.0) {
            cosineSimilarity = 0.0;
        } else {
            double a = Math.sqrt(d1) * Math.sqrt(d2);
            cosineSimilarity = (dotProduct / a);
        }
        return cosineSimilarity;

    }

    /**
     * Returns a set with strings common to the two given maps.
     *
     * @param leftVector left vector map
     * @param rightVector right vector map
     * @return common strings
     */
    private static Set<String> getIntersection(Map<String, Double> leftVector, Map<String, Double> rightVector) {

        Set<String> intersection = new HashSet<>(leftVector.keySet());
        intersection.retainAll(rightVector.keySet());
        return intersection;
    }

    /**
     * Computes the dot product of two vectors over their common keys. Keys
     * present in only one of the vectors contribute nothing to the result.
     *
     * @param leftVector left vector
     * @param rightVector right vector
     * @param intersection keys common to both vectors
     * @return the dot product
     */
    private static double dot(Map<String, Double> leftVector, Map<String, Double> rightVector,
            Set<String> intersection) {
        Double dotProduct = 0.0;
        for (String key : intersection) {
            dotProduct += leftVector.get(key) * rightVector.get(key);
        }
        return dotProduct;
    }

    private Term getTermFromDB(CharSequence winner) {
        throw new UnsupportedOperationException("Not supported yet.");
    }

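    /**
     * Alternative MapReduce-based scoring path: the candidate senses and a
     * synthetic "context" term built from the n-grams are serialized to Avro,
     * TF-IDF vectors are computed by the TFIDFDriverImpl job, and candidates
     * are ranked by cosine similarity as in tf_idf_Disambiguation. Note that
     * the final getTermFromDB lookup is not implemented yet, so this path
     * cannot currently return a winner.
     */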
    private Term mapreduceDisambiguate(String term, Set<Term> possibleTerms, Set<String> ngrams,
            double minimumSimilarity) throws IOException {
        String filePath = ".." + File.separator + "etc" + File.separator + "Avro Document" + File.separator + term
                + File.separator + term + ".avro";
        TermAvroSerializer ts = new TermAvroSerializer(filePath, Term.getClassSchema());
        List<CharSequence> empty = new ArrayList<>();
        empty.add("");
        for (Term t : possibleTerms) {
            List<CharSequence> nuid = t.getNuids();
            if (nuid == null || nuid.isEmpty() || nuid.contains(null)) {
                t.setNuids(empty);
            }

            List<CharSequence> buids = t.getBuids();
            if (buids == null || buids.isEmpty() || buids.contains(null)) {
                t.setBuids(empty);
            }
            List<CharSequence> alt = t.getAltLables();
            if (alt == null || alt.isEmpty() || alt.contains(null)) {
                t.setAltLables(empty);
            }
            List<CharSequence> gl = t.getGlosses();
            if (gl == null || gl.isEmpty() || gl.contains(null)) {
                t.setGlosses(empty);
            } else {
                StringBuilder glosses = new StringBuilder();
                for (CharSequence n : gl) {
                    glosses.append(n).append(" ");
                }
                gl = new ArrayList<>();
                stemer.setDescription(glosses.toString());
                gl.add(stemer.execute());
                t.setGlosses(gl);

            }
            List<CharSequence> cat = t.getCategories();
            if (cat == null || cat.contains(null)) {
                t.setCategories(empty);
            }
            ts.serialize(t);
        }
        Term context = new Term();
        context.setUid("context");
        StringBuilder glosses = new StringBuilder();
        context.setLemma(term);
        context.setOriginalTerm(term);
        context.setUrl("empty");
        for (String n : ngrams) {
            glosses.append(n).append(" ");
        }
        List<CharSequence> contextGlosses = new ArrayList<>();
        stemer.setDescription(glosses.toString());

        contextGlosses.add(stemer.execute());
        context.setGlosses(contextGlosses);
        List<CharSequence> nuid = context.getNuids();
        if (nuid == null || nuid.isEmpty() || nuid.contains(null)) {
            context.setNuids(empty);
        }

        List<CharSequence> buids = context.getBuids();
        if (buids == null || buids.isEmpty() || buids.contains(null)) {
            context.setBuids(empty);
        }
        List<CharSequence> alt = context.getAltLables();
        if (alt == null || alt.isEmpty() || alt.contains(null)) {
            context.setAltLables(empty);
        }
        List<CharSequence> gl = context.getGlosses();
        if (gl == null || gl.isEmpty() || gl.contains(null)) {
            context.setGlosses(empty);
        }
        List<CharSequence> cat = context.getCategories();
        if (cat == null || cat.contains(null)) {
            context.setCategories(empty);
        }
        ts.serialize(context);
        ts.close();

        ITFIDFDriver tfidfDriver = new TFIDFDriverImpl(term);
        tfidfDriver.executeTFIDF(new File(filePath).getParent());

        Map<CharSequence, Map<String, Double>> featureVectors = CSVFileReader
                .tfidfResult2Map(TFIDFDriverImpl.OUTPUT_PATH4 + File.separator + "part-r-00000");
        Map<String, Double> contextVector = featureVectors.remove("context");

        Map<CharSequence, Double> scoreMap = new HashMap<>();
        for (CharSequence key : featureVectors.keySet()) {
            Double similarity = cosineSimilarity(contextVector, featureVectors.get(key));
            scoreMap.put(key, similarity);
        }
        if (scoreMap.isEmpty()) {
            return null;
        }

        ValueComparator bvc = new ValueComparator(scoreMap);
        TreeMap<CharSequence, Double> sortedMap = new TreeMap<>(bvc);
        sortedMap.putAll(scoreMap);

        Iterator<CharSequence> it = sortedMap.keySet().iterator();
        CharSequence winner = it.next();

        Double s1 = scoreMap.get(winner);
        if (s1 < minimumSimilarity) {
            return null;
        }

        return getTermFromDB(winner);

    }

    @Override
    public Set<Term> getCandidates(String lemma)
            throws MalformedURLException, IOException, ParseException, InterruptedException, ExecutionException {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    /**
     * @return the candidateTermsFile
     */
    public String getCandidateTermsFile() {
        return candidateTermsFile;
    }

}