Source code for com.thesmartweb.swebrank.Total_analysis (Java).

/* 
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

/**
 *
 * @author Themis Mavridis
 */
import java.util.*;

import java.util.List;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.node.Node;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import org.json.simple.JSONObject;

/**
 * Class of the main algorithm functionalities
 * @author Themistoklis Mavridis
 */
public class Total_analysis {

    /**
     * One wordlist per query, in the order the queries were processed.
     */
    protected List<ArrayList<String>> array_wordLists = new ArrayList<>();

    /**
     * A flat list with all the words from all the per-query wordlists.
     */
    protected List<String> wordList_total = new ArrayList<>();

    /**
     * The convergence score of this iteration's total wordlist against the previous one.
     */
    protected double convergence;

    /**
     * Method to call search analysis for every query and to save the wordlists.
     * For each query it runs the full pipeline (engine submission, result
     * retrieval, parsing, LDA), accumulates the produced wordlists, computes
     * the convergence score against the previous iteration and persists the
     * round's content and score in Elasticsearch.
     * @param wordList_previous the previous wordlist to check convergence
     * @param iteration_counter the iteration number
     * @param example_dir the directory to save the files
     * @param domain the domain we analyze
     * @param enginechoice the search engines chosen
     * @param queries the queries we search for
     * @param results_number the amount of results for each query
     * @param top_visible the amount of results if we use Visibility Score  (http://www.advancedwebranking.com/user-guide/html/en/ch08s06.html)
     * @param mozMetrics the metrics of Moz chosen
     * @param moz_threshold_option flag if we are going to use Moz threshold or not
     * @param moz_threshold the threshold to moz metrics
     * @param top_count_moz the amount of links to keep if we use Moz for evaluation
     * @param ContentSemantics get the choice of Content Semantic Analysis algorithm that we are going to use
     * @param SensebotConcepts the amount of concepts to be recognized if Sensebot is used
     * @param SWebRankSettings the settings for LDA and SwebRank in general (check the ReadInput Class)
     * @param config_path the configuration path to get all the api keys
     */
    @SuppressWarnings("unchecked") // org.json.simple.JSONObject extends raw HashMap
    public void perform(List<String> wordList_previous, int iteration_counter, String example_dir, String domain,
            List<Boolean> enginechoice, List<String> queries, int results_number, int top_visible,
            List<Boolean> mozMetrics, boolean moz_threshold_option, double moz_threshold, int top_count_moz,
            List<Boolean> ContentSemantics, int SensebotConcepts, List<Double> SWebRankSettings,
            String config_path) {
        // For every query we run the full search-analysis pipeline: submit the
        // query to the selected engines, fetch the results according to the
        // chosen options, parse the websites, run LDA and get the top content.
        for (String query : queries) {
            // Hint the VM to reclaim memory between heavy per-query runs.
            // NOTE(review): the original issued this hint three times in a row;
            // a single call is equivalent — gc() is best-effort either way.
            System.gc();
            // Search_analysis does all the per-query work and returns the wordlist.
            Search_analysis sa = new Search_analysis();
            // Per-query working directory under the example directory.
            String example_directory = example_dir + query + "-query//";
            // LDA alpha set to 50/topics, the heuristic recommended in the LDA
            // literature; SWebRankSettings.get(1) presumably holds the topic
            // count — TODO confirm against ReadInput.
            double alpha = 50 / SWebRankSettings.get(1);
            List<String> wordList = sa.perform(iteration_counter, example_directory, domain, enginechoice, query,
                    results_number, top_visible, SWebRankSettings, alpha, mozMetrics, top_count_moz,
                    moz_threshold_option, moz_threshold, ContentSemantics, SensebotConcepts, config_path);
            // Keep the per-query wordlist and merge its words into the total list.
            array_wordLists.add(new ArrayList<>(wordList));
            wordList_total.addAll(wordList);
        }
        // Convergence between this iteration's total wordlist and the previous one.
        CheckConvergence cc = new CheckConvergence();
        convergence = cc.ConvergenceCalc(wordList_total, wordList_previous);
        // Persist the round's content and convergence score in Elasticsearch.
        ReadInput ri = new ReadInput();
        List<String> elasticIndexes = ri.GetKeyFile(config_path, "elasticSearchIndexes");
        Settings settings = ImmutableSettings.settingsBuilder().put("cluster.name", "lshrankldacluster").build();
        Client client = new TransportClient(settings)
                .addTransportAddress(new InetSocketTransportAddress("localhost", 9300));
        try {
            JSONObject objEngineLevel = new JSONObject();
            objEngineLevel.put("RoundContent", wordList_total);
            objEngineLevel.put("Round", iteration_counter);
            objEngineLevel.put("Convergence", convergence);
            // Document id encodes the domain and the iteration number.
            String id = domain + "/" + iteration_counter;
            IndexRequest indexReq = new IndexRequest(elasticIndexes.get(1), "content", id);
            indexReq.source(objEngineLevel);
            // Block until indexing completes; the response itself is not used.
            client.index(indexReq).actionGet();
        } finally {
            // Always release the transport client, even if indexing throws.
            client.close();
        }
    }

    /**
     * Getter of convergence score.
     * @return convergence score
     */
    public double getConvergence() {
        return convergence;
    }

    /**
     * Getter of the total wordlist.
     * @return the list with all the words produced from all queries
     */
    public List<String> getwordList_total() {
        return wordList_total;
    }

    /**
     * Getter of all the wordlists for each query in separate.
     * @return all the wordlists for each query
     */
    public List<ArrayList<String>> getarray_wordLists() {
        return array_wordLists;
    }

}