com.thesmartweb.swebrank.ElasticGetWordList.java Source code

Introduction

Here is the source code for com.thesmartweb.swebrank.ElasticGetWordList.java
Source

/* 
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.node.Node;
import com.google.common.collect.SortedSetMultimap;
import com.google.common.collect.TreeMultimap;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map.Entry;
import java.util.stream.Stream;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;

/**
 * Retrieves topic words from the Elasticsearch index in which the analysed content is saved.
 *
 * <p>Every lookup opens a transport client against the cluster {@value #CLUSTER_NAME} on
 * {@value #ES_HOST}:{@value #ES_TRANSPORT_PORT}, runs one ids-query per requested document id
 * and reads the {@code TopicsWordMap} array of each hit's {@code _source}. Each element of that
 * array is expected to carry a {@code wordsmap} object whose keys are words and whose values are
 * numeric scores.
 *
 * @author Themistoklis Mavridis
 */
public class ElasticGetWordList {

    private static final Logger LOGGER = Logger.getLogger(ElasticGetWordList.class.getName());

    /** Name of the Elasticsearch cluster the transport client joins. */
    private static final String CLUSTER_NAME = "lshrankldacluster";

    /** Host of the Elasticsearch node the transport client connects to. */
    private static final String ES_HOST = "localhost";

    /** Transport port of the Elasticsearch node. */
    private static final int ES_TRANSPORT_PORT = 9300;

    /** Key of the configuration entry that lists the Elasticsearch index names. */
    private static final String INDEXES_CONFIG_KEY = "elasticSearchIndexes";

    /** Position, inside the {@value #INDEXES_CONFIG_KEY} entry, of the index that is searched. */
    private static final int WORDS_INDEX_POSITION = 2;

    /**
     * Gets all the words of all the topics of the documents whose ids are passed as input.
     *
     * <p>Words are returned in encounter order and are not de-duplicated. Hits or topics that
     * lack the expected {@code _source}/{@code TopicsWordMap}/{@code wordsmap} members are skipped.
     *
     * @param ids the ids of the documents whose words are going to be captured; {@code null} or
     *            empty yields an empty list without contacting the cluster
     * @param config_path configuration directory to get the names of the elastic search indexes
     * @return all the words in a list; an empty list if an {@link IOException} occurs (it is logged)
     */
    public List<String> get(List<String> ids, String config_path) {
        List<String> wordList = new ArrayList<>();
        if (ids == null || ids.isEmpty()) {
            return wordList;
        }
        try {
            String indexName = resolveIndexName(config_path);
            Client client = openClient();
            try {
                for (String id : ids) {
                    for (JsonObject wordsmap : fetchWordsMaps(client, indexName, id)) {
                        for (Map.Entry<String, JsonElement> entry : wordsmap.entrySet()) {
                            wordList.add(entry.getKey());
                        }
                    }
                }
            } finally {
                // Release the connection even when a search or the parsing fails.
                client.close();
            }
            return wordList;
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            // Partial results are discarded on failure.
            return new ArrayList<>();
        }
    }

    /**
     * Gets the top N words among the best-scoring words of every topic of the documents whose ids
     * are passed as input.
     *
     * <p>For each topic only the word with the highest strictly positive score is kept; a topic
     * without such a word contributes nothing. When the same word wins in several topics its
     * highest score is used. The {@code top} highest-scoring words are returned, ordered by
     * ascending score.
     *
     * @param ids the ids of the documents whose words are going to be captured; {@code null} or
     *            empty yields an empty list without contacting the cluster
     * @param top the maximum number of words to be returned; values {@code <= 0} yield an empty list
     * @param config_path configuration directory to get the names of the elastic search indexes
     * @return at most {@code top} words in a list; an empty list if an {@link IOException} occurs
     *         (it is logged)
     */
    public List<String> getMaxWords(List<String> ids, int top, String config_path) {
        List<String> maxWordList = new ArrayList<>();
        if (ids == null || ids.isEmpty() || top <= 0) {
            return maxWordList;
        }
        try {
            String indexName = resolveIndexName(config_path);
            Map<String, Double> bestScoreByWord = new HashMap<>();
            Client client = openClient();
            try {
                for (String id : ids) {
                    for (JsonObject wordsmap : fetchWordsMaps(client, indexName, id)) {
                        String bestWord = null;
                        double bestScore = 0.0;
                        for (Map.Entry<String, JsonElement> entry : wordsmap.entrySet()) {
                            double score = entry.getValue().getAsDouble();
                            if (score > bestScore) {
                                bestWord = entry.getKey();
                                bestScore = score;
                            }
                        }
                        // A topic with no positive score has no winner; do not record a placeholder.
                        if (bestWord != null) {
                            bestScoreByWord.merge(bestWord, bestScore, Double::max);
                        }
                    }
                }
            } finally {
                // Release the connection even when a search or the parsing fails.
                client.close();
            }
            // The map is sorted ascending, so the top words are the last `top` entries.
            List<String> wordsByAscendingScore = new ArrayList<>(sortByValue(bestScoreByWord).keySet());
            int beginIndex = Math.max(0, wordsByAscendingScore.size() - top);
            maxWordList.addAll(wordsByAscendingScore.subList(beginIndex, wordsByAscendingScore.size()));
            return maxWordList;
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            return new ArrayList<>();
        }
    }

    /**
     * Sorts a map by its values.
     *
     * @param <K> the type of the keys
     * @param <V> the type of the values, which must be comparable to each other
     * @param map the map to be sorted; it is not modified
     * @return a new map whose iteration order follows the values in ascending fashion
     */
    public static <K, V extends Comparable<? super V>> Map<K, V> sortByValue(Map<K, V> map) {
        List<Entry<K, V>> entries = new ArrayList<>(map.entrySet());
        entries.sort(Entry.comparingByValue());
        Map<K, V> result = new LinkedHashMap<>();
        for (Entry<K, V> entry : entries) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }

    /**
     * Reads the name of the index to search from the configuration directory.
     *
     * @param config_path configuration directory passed to {@code ReadInput.GetKeyFile}
     * @return the index name found at {@link #WORDS_INDEX_POSITION}
     * @throws IOException declared defensively so that an I/O failure while reading the
     *         configuration, if {@code ReadInput} reports one, reaches the caller's handler
     */
    private static String resolveIndexName(String config_path) throws IOException {
        ReadInput ri = new ReadInput();
        List<String> elasticIndexes = ri.GetKeyFile(config_path, INDEXES_CONFIG_KEY);
        return elasticIndexes.get(WORDS_INDEX_POSITION);
    }

    /** Opens a transport client to the configured cluster; the caller must close it. */
    private static Client openClient() {
        Settings settings = ImmutableSettings.settingsBuilder().put("cluster.name", CLUSTER_NAME)
                .build();
        return new TransportClient(settings)
                .addTransportAddress(new InetSocketTransportAddress(ES_HOST, ES_TRANSPORT_PORT));
    }

    /**
     * Searches the given index for one document id and collects the {@code wordsmap} object of
     * every topic of every hit.
     *
     * @param client an open Elasticsearch client
     * @param indexName the index to search
     * @param id the document id to look up
     * @return the {@code wordsmap} objects in encounter order; empty when nothing matches or the
     *         expected members are missing
     * @throws IOException if the search response cannot be serialised to JSON
     */
    private static List<JsonObject> fetchWordsMaps(Client client, String indexName, String id)
            throws IOException {
        SearchResponse responseSearch = client.prepareSearch(indexName)
                .setSearchType(SearchType.QUERY_AND_FETCH)
                .setQuery(QueryBuilders.idsQuery().ids(id))
                .execute().actionGet();

        // Serialise the response to JSON text so that it can be navigated with Gson.
        XContentBuilder builder = XContentFactory.jsonBuilder();
        builder.startObject();
        responseSearch.toXContent(builder, ToXContent.EMPTY_PARAMS);
        builder.endObject();
        JsonObject root = (JsonObject) new JsonParser().parse(builder.string());

        List<JsonObject> wordsMaps = new ArrayList<>();
        JsonObject outerHits = root.getAsJsonObject("hits");
        JsonArray hits = outerHits == null ? null : outerHits.getAsJsonArray("hits");
        if (hits == null) {
            return wordsMaps;
        }
        for (JsonElement hit : hits) {
            JsonObject source = hit.getAsJsonObject().getAsJsonObject("_source");
            JsonArray topics = source == null ? null : source.getAsJsonArray("TopicsWordMap");
            if (topics == null) {
                continue;
            }
            for (JsonElement topic : topics) {
                JsonObject wordsmap = topic.getAsJsonObject().getAsJsonObject("wordsmap");
                if (wordsmap != null) {
                    wordsMaps.add(wordsmap);
                }
            }
        }
        return wordsMaps;
    }
}