com.thesmartweb.swebrank.Main.java Source code

Introduction

Here is the source code for com.thesmartweb.swebrank.Main.java
Source

/* 
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;
/**
 * Main method of SWebRank.
 * It receives the input in a txt file in a structured format.
 * It passes all the input variables to the total analysis class.
 * It receives the wordlist of every iteration for each query and creates a wordlist for every domain.
 * It gets all the combinations and permutations of all the words in the wordlist.
 * It creates the new queries according to their Normalized Web Distance (using the Bing Search API).
 * It compares the wordlist of every iteration with the previous one using Normalized Mutual Information.
 * @author Themis Mavridis
 */

import java.io.*;
import java.util.*;
import java.util.List;
import org.apache.commons.io.FilenameUtils;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.node.Node;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import org.json.simple.JSONObject;

/**
 * Main class of SWebRank that gets the settings and the results of every iteration.
 * It calls the process that creates the new queries and checks whether the algorithm has converged.
 * @author themis
 */
public class Main {

    /** Directory that is scanned for the txt input files. */
    private static final String INPUT_DIRECTORY = "//mnt//var//DBs//inputsL10//nba//";
    /** Parent directory of every output file and of every per-iteration output directory. */
    private static final String OUTPUT_PARENT_DIRECTORY = "//mnt//var//DBs//outputsConfL10//nba//";
    /** Directory that is handed to the collaborators that read configuration/key files. */
    private static final String CONFIG_DIRECTORY = "//mnt//var//DBs//config//";

    //------------positions inside the enginechoice list (True/False per engine)------------
    private static final int ENGINE_BING = 0;
    private static final int ENGINE_GOOGLE = 1;
    private static final int ENGINE_YAHOO = 2;
    private static final int ENGINE_MERGED = 3;

    /** Position inside the mozMetrics list of the flag that switches the Moz API on/off. */
    private static final int MOZ_ENABLED = 0;

    //------------positions inside the SWebRankSettings list that are read by this class------------
    /** Convergence limit: the loop stops as soon as the convergence score reaches it. */
    private static final int SETTING_CONVERGENCE_LIMIT = 5;
    /** Performance limit: the maximum number of iterations of the algorithm. */
    private static final int SETTING_ITERATION_LIMIT = 8;
    /** Maximum amount of words fetched from Elastic Search for every query. */
    private static final int SETTING_MAX_WORDS = 9;
    /** Amount of new queries that are created for every query of the previous round. */
    private static final int SETTING_NEW_QUERIES = 10;

    /** The max amount of results that you can get from the search engine APIs. */
    private static final int MAX_ENGINE_RESULTS = 50;

    /**
     * Runs SWebRank for every txt input file found in the input directory.
     * For every file it reads the settings, iterates (analysis, creation of new queries) until
     * the convergence limit or the iteration limit is reached, writes the total content and the
     * convergence percentages to files and saves them on the ElasticSearch cluster.
     * An input file that cannot be read is reported and skipped.
     *
     * @param args the command line arguments (not used)
     */
    public static void main(String[] args) {
        Path inputPath = Paths.get(INPUT_DIRECTORY);//input directory
        String outputParentDirectory = OUTPUT_PARENT_DIRECTORY;//output directory
        String configPath = CONFIG_DIRECTORY;//config directory
        //---apache commons logging goes to log4j; use org.apache.commons.logging.impl.NoOpLog to disable it----
        System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.Log4JLogger");
        int sensebotConcepts = 0;//define the amount of concepts that sensebot is going to recognize
        //-------we get all the paths of the txt (input) files from the input directory-------
        DataManipulation fileFinder = new DataManipulation();//class responsible for the extraction of paths
        Collection<File> inputFiles = fileFinder.getinputfiles(inputPath.toString(), "txt");
        //------------read the txt files------------
        for (File input : inputFiles) {
            ReadInput ri = new ReadInput();//class that reads the input
            if (!ri.perform(input)) {
                //without a successful read there are no settings for this file (the lists would be
                //null or left over from the previous file), so the file is skipped
                System.err.println("can not read the input file: " + input.getPath() + ", skipping it");
                continue;
            }
            //--------------Domain that is searched----------
            String domain = ri.domain;
            //------------------search engine related options----------------------
            List<String> queries = ri.queries;
            int resultsNumber = ri.results_number;//the number of results that are returned from each search engine
            List<Boolean> enginechoice = ri.enginechoice;//see the ENGINE_* positions
            //-----------Moz options---------------------
            //index 0 is true if we use the Moz API; according to the original notes the following elements
            //select Domain Authority, External MozRank, MozRank, MozTrust, Subdomain MozRank and Page Authority
            //in that order and only one is used (the first to be set to true)
            List<Boolean> mozMetrics = ri.mozMetrics;
            boolean mozThresholdOption = ri.moz_threshold_option;//set to true we use the threshold
            double mozThreshold = ri.moz_threshold.doubleValue();//if we want to have a threshold in moz
            //---------------Semantic Analysis method----------------
            List<Boolean> contentSemantics = ri.ContentSemantics;
            List<Double> swebRankSettings = ri.SWebRankSettings;

            int topCountMoz = 0;//if we want to get the moz top-something results
            int topVisible = 0;//option to set the amount of results you can get in the merged search engine
            //------if we choose to use a Moz metric or Visibility score for our ranking, we need to set
            //------the results_number for the search engines to its max and keep the requested number
            if (mozMetrics.get(MOZ_ENABLED) || enginechoice.get(ENGINE_MERGED)) {
                if (mozMetrics.get(MOZ_ENABLED)) {
                    topCountMoz = resultsNumber;
                }
                if (enginechoice.get(ENGINE_MERGED)) {
                    topVisible = resultsNumber;
                }
                resultsNumber = MAX_ENGINE_RESULTS;
            }
            //-----if we want to use Moz we should check first if it works
            if (mozMetrics.get(MOZ_ENABLED)) {
                Moz moz = new Moz();
                //---if it works, moz remains true, otherwise it is set to false
                //(set, not add: add would insert a new element and shift all the metric flags)
                mozMetrics.set(MOZ_ENABLED, moz.check(configPath));
                //if Moz does not work and the merged engine is not used either, we reset the number of
                //results from the max back to the original number that was kept in topCountMoz
                if (!mozMetrics.get(MOZ_ENABLED) && !enginechoice.get(ENGINE_MERGED)) {
                    resultsNumber = topCountMoz;
                }
            }
            //----------we set the wordLists that we are going to use---------------------
            List<String> finalList = new ArrayList<String>();//finalList is going to contain all the content in the end
            int iterationCounter = 0;//counts the iterations of the algorithm, checked against the iteration limit
            List<String> wordListNew = new ArrayList<>();
            double convergence = 0;//the convergence percentage
            String convPercentages = "";//string that contains all the convergence percentages
            DataManipulation wordsmanipulation = new DataManipulation();//manipulates various word data
            String txtDirectory = FilenameUtils.getBaseName(input.getName());
            do {
                boolean firstIteration = iterationCounter == 0;
                //the directory where the files of this iteration are going to be saved
                String outputChildDirectory = outputParentDirectory + txtDirectory + "_level_" + iterationCounter
                        + "//";
                //the wordlist of the previous iteration (empty on the 1st run)
                List<String> wordListPrevious = wordListNew;
                //on the 1st run we already have the queries, so we skip the creation of new ones
                if (!firstIteration) {
                    //we add the previous wordList to the finalList
                    finalList = wordsmanipulation.AddAList(wordListPrevious, finalList);
                    List<String> newQueries = createNewQueries(queries, domain, enginechoice,
                            iterationCounter - 1, swebRankSettings, configPath);
                    //---------------------the following cleans a list from null and duplicates
                    newQueries = wordsmanipulation.clearListString(newQueries);
                    //----------------append the new queries to a file------------------
                    wordsmanipulation.AppendWordList(newQueries,
                            outputChildDirectory + "queries_" + iterationCounter + ".txt");
                    if (newQueries.size() < 1) {
                        break;//if we don't create new queries we end the loop
                    }
                    //the new queries are the ones that are analyzed in this iteration
                    queries = newQueries;
                }
                //total analysis is going to do all the work and return back what we need
                Total_analysis ta = new Total_analysis();
                ta.perform(wordListPrevious, iterationCounter, outputChildDirectory, domain, enginechoice,
                        queries, resultsNumber, topVisible, mozMetrics, mozThresholdOption,
                        mozThreshold, topCountMoz, contentSemantics, sensebotConcepts,
                        swebRankSettings, configPath);
                //get the wordlist that includes all the new words, cleaned from null and duplicates
                wordListNew = wordsmanipulation.clearListString(ta.getwordList_total());
                //----------------append the wordlist to a file--------------------
                wordsmanipulation.AppendWordList(wordListNew, outputChildDirectory + "wordList.txt");
                if (!firstIteration) {
                    //the convergence percentage of this iteration
                    convergence = ta.getConvergence();
                    //all the convergence percentages, one per round, separated by the \n character
                    convPercentages = convPercentages + "\n" + convergence;
                    wordsmanipulation.AppendString(convPercentages,
                            outputChildDirectory + "convergence_percentage.txt");
                    //we add the new wordList to the finalList
                    finalList = wordsmanipulation.AddAList(wordListNew, finalList);
                }
                iterationCounter++;
            } while (convergence < swebRankSettings.get(SETTING_CONVERGENCE_LIMIT).doubleValue()
                    && iterationCounter < swebRankSettings.get(SETTING_ITERATION_LIMIT).intValue());
            //after a single analysis round the wordlist has not been added to the finalList yet
            if (iterationCounter == 1) {
                finalList = wordsmanipulation.AddAList(wordListNew, finalList);
            }
            //--------------------content List----------------
            if (!finalList.isEmpty()) {
                //---------------------the following cleans the final list from null and duplicates
                finalList = wordsmanipulation.clearListString(finalList);
                //write the keywords to a file
                boolean written = wordsmanipulation.AppendWordList(finalList,
                        outputParentDirectory + "total_content.txt");
                if (!written) {
                    System.out.print("can not create the content file for: " + outputParentDirectory
                            + "total_content.txt");
                }
            }
            //we save the total content with its convergence on the ElasticSearch cluster in a separated index
            indexTotalContent(ri, configPath, domain, finalList, convPercentages);
            //----------------------convergence percentages writing to file---------------
            if (convPercentages.length() != 0) {
                boolean written = wordsmanipulation.AppendString(convPercentages,
                        outputParentDirectory + "convergence_percentages.txt");
                if (!written) {
                    System.out.print("can not create the convergence file for: " + outputParentDirectory
                            + "convergence_percentages.txt");
                }
            }
        }
    }

    /**
     * Builds the ids (domain/query/engine/iteration) of the documents of every chosen search engine.
     *
     * @param domain the domain that is searched
     * @param query the query that produced the results
     * @param enginechoice the True/False choice for Bing, Google and Yahoo!
     * @param iteration the iteration that produced the results
     * @return the ids, one per chosen engine, in the order bing, google, yahoo
     */
    private static List<String> buildWordListIds(String domain, String query, List<Boolean> enginechoice,
            int iteration) {
        List<String> ids = new ArrayList<>();
        if (enginechoice.get(ENGINE_BING)) {
            ids.add(domain + "/" + query + "/bing" + "/" + iteration);
        }
        if (enginechoice.get(ENGINE_GOOGLE)) {
            ids.add(domain + "/" + query + "/google" + "/" + iteration);
        }
        if (enginechoice.get(ENGINE_YAHOO)) {
            ids.add(domain + "/" + query + "/yahoo" + "/" + iteration);
        }
        return ids;
    }

    /**
     * Creates the new queries for every query of the previous round by combining the words that
     * were produced from this query.
     *
     * @param queries the queries of the previous round
     * @param domain the domain that is searched
     * @param enginechoice the True/False choice for every search engine
     * @param iterationPrevious the number of the previous iteration
     * @param swebRankSettings the SWebRank settings as they were read from the input
     * @param configPath the directory of the configuration files
     * @return all the new queries (not cleaned from null and duplicates yet)
     */
    private static List<String> createNewQueries(List<String> queries, String domain,
            List<Boolean> enginechoice, int iterationPrevious, List<Double> swebRankSettings,
            String configPath) {
        List<String> newQueriesTotal = new ArrayList<>();
        Combinations_Engine cn = new Combinations_Engine();//the class that combines the terms produced
        int maxWordsAmount = swebRankSettings.get(SETTING_MAX_WORDS).intValue();
        int newQueriesAmount = swebRankSettings.get(SETTING_NEW_QUERIES).intValue();
        int queryIndex = 0;//position of the current query, counted instead of searched with indexOf
        for (String query : queries) {
            List<String> ids = buildWordListIds(domain, query, enginechoice, iterationPrevious);
            ElasticGetWordList esGet = new ElasticGetWordList();//gets the wordlist from the Elastic Search
            List<String> maxWords = esGet.getMaxWords(ids, maxWordsAmount, configPath);
            //settings #7 and #6 are passed through to the combinations engine as they are
            //NOTE(review): their meaning is not visible in this class - confirm in Combinations_Engine
            List<String> newQueries = cn.perform(maxWords, swebRankSettings.get(7), queries,
                    swebRankSettings.get(6), queryIndex, newQueriesAmount, configPath);
            //we add the list of new queries to the total list that contains all the new queries
            newQueriesTotal.addAll(newQueries);
            System.out.println("query pointer=" + queryIndex + "");
            queryIndex++;
        }
        return newQueriesTotal;
    }

    /**
     * Saves the total content and the convergence percentages of a domain on the ElasticSearch
     * cluster. The client is closed even if the indexing fails.
     *
     * @param ri the input reader, used to get the elastic search indexes from the key file
     * @param configPath the directory of the configuration files
     * @param domain the domain, used as the id of the saved document
     * @param finalList the total content
     * @param convPercentages the convergence percentages
     */
    @SuppressWarnings("unchecked")//json-simple's JSONObject extends the raw HashMap
    private static void indexTotalContent(ReadInput ri, String configPath, String domain,
            List<String> finalList, String convPercentages) {
        //get the elastic search indexes in a list
        List<String> elasticIndexes = ri.GetKeyFile(configPath, "elasticSearchIndexes");
        Settings settings = ImmutableSettings.settingsBuilder().put("cluster.name", "lshrankldacluster")
                .build();
        Client client = new TransportClient(settings)
                .addTransportAddress(new InetSocketTransportAddress("localhost", 9300));
        try {
            JSONObject objEngineLevel = new JSONObject();
            objEngineLevel.put("TotalContent", finalList);//we save the total content
            objEngineLevel.put("Convergences", convPercentages);//we save the convergence percentages
            IndexRequest indexReq = new IndexRequest(elasticIndexes.get(0), "content", domain);//we save also the domain
            indexReq.source(objEngineLevel);
            client.index(indexReq).actionGet();
        } finally {
            client.close();
        }
    }
}