com.thesmartweb.swebrank.Sensebot.java Source code

Introduction

Here is the source code for com.thesmartweb.swebrank.Sensebot.java
Source

/* 
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

import java.io.File;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.xml.sax.SAXException;

/**
 * Client for the Sensebot concept-extraction web service: queries the API for each given link and collects the returned concept terms.
 * @author Administrator
 */

public class Sensebot {

    /** Shared logger; all failures in this class are reported under the Sensebot name. */
    private static final Logger LOGGER = Logger.getLogger(Sensebot.class.getName());

    /** Endpoint of the Sensebot concept-extraction service (query string is appended per request). */
    private static final String ENDPOINT = "http://api.sensebot.net/svc/extconcone.asmx/ExtractConcepts";

    /** Charset name used when percent-encoding query parameter values. */
    private static final String QUERY_CHARSET = "UTF-8";

    /**
     * SAX features switched off before parsing a response, so that a response
     * cannot make the parser fetch external entities or an external DTD.
     */
    private static final String[] EXTERNAL_ENTITY_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };

    /**
     * Method that connects to the Sensebot url and gets the document using SAXReader.
     * The string value of the second content node of the root element is passed
     * through {@code DataManipulation.removeChars} and lower-cased.
     *
     * @param link_ur the link to read from
     * @return the cleaned, lower-cased response text; an empty string if the root
     *         element has fewer than two content nodes or the document cannot be read
     */
    public String connect(URL link_ur) {
        try {
            SAXReader reader = new SAXReader();
            disableExternalEntities(reader);
            Document document = reader.read(link_ur);
            Element root = document.getRootElement();
            List<Node> content = root.content();
            // Only the node at index 1 is used; with fewer than two nodes there is nothing to read.
            // NOTE(review): presumably index 0 is leading whitespace text - confirm against a live response.
            if (content.size() < 2) {
                return "";
            }
            String stringValue = content.get(1).getStringValue();
            DataManipulation tp = new DataManipulation();
            // Locale.ROOT keeps lower-casing independent of the JVM's default locale.
            return tp.removeChars(stringValue).toLowerCase(Locale.ROOT);
        } catch (DocumentException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            return "";
        }
    }

    /**
     * Method to get the top sensebot concepts recognized for given links.
     * Each non-null link is sent to the service in its own request; the terms of
     * every response are appended to one list, which is finally written line by
     * line to the file {@code directory + "words.txt"} (so {@code directory} is
     * expected to end with a path separator).
     *
     * @param links the links to search for ({@code null} entries are skipped)
     * @param directory the directory to save the results to
     * @param SensebotConcepts the amount of concepts to search for
     * @param config_path the path to find sensebot's username
     * @return a list with all the top sensebot concepts recognized for the given links;
     *         if an I/O error occurs, the terms collected up to that point
     */
    public List<String> compute(String[] links, String directory, int SensebotConcepts, String config_path) {
        List<String> wordList = new ArrayList<>();
        try {
            // Query values are percent-encoded so that characters such as '&', '#' or
            // spaces inside a link (or the username) cannot break the request's query string.
            String username = URLEncoder.encode(GetUserName(config_path), QUERY_CHARSET);
            for (String link : links) {
                if (link == null) {
                    continue;
                }
                URL diff_url = new URL(ENDPOINT + "?userName=" + username
                        + "&numConcepts=" + SensebotConcepts
                        + "&artClass=&artLength=0&Lang=English&allURLs="
                        + URLEncoder.encode(link, QUERY_CHARSET));
                addTerms(connect(diff_url), wordList);
            }
            File file_words = new File(directory + "words.txt");
            FileUtils.writeLines(file_words, wordList);
        } catch (IOException ex) {
            // Also covers MalformedURLException and UnsupportedEncodingException (both IOExceptions).
            LOGGER.log(Level.SEVERE, null, ex);
        }
        return wordList;
    }

    /**
     * Method to get the userName of sensebot.
     * Looks among the txt files found under {@code config_path} for files whose
     * name contains "sensebotUsername" and returns the first line read from it.
     *
     * @param config_path the path to find sensebot's username
     * @return Sensebot's username, or an empty string if no matching file yielded any line
     */
    public String GetUserName(String config_path) {
        Path input_path = Paths.get(config_path);
        DataManipulation getfiles = new DataManipulation(); // class responsible for the extraction of paths
        Collection<File> inputs_files = getfiles.getinputfiles(input_path.toString(), "txt");
        List<String> tokenList = new ArrayList<>();
        ReadInput ri = new ReadInput();
        for (File input : inputs_files) {
            // If several files match, the lines of the last one visited are kept.
            if (input.getName().contains("sensebotUsername")) {
                tokenList = ri.readLinesConfig(input);
            }
        }
        if (tokenList == null || tokenList.isEmpty()) {
            return "";
        }
        return tokenList.get(0);
    }

    /**
     * Turns off external-entity and external-DTD loading on the given reader.
     * Best effort: a parser that rejects a feature is logged and parsing proceeds.
     */
    private static void disableExternalEntities(SAXReader reader) {
        for (String feature : EXTERNAL_ENTITY_FEATURES) {
            try {
                reader.setFeature(feature, false);
            } catch (SAXException ex) {
                LOGGER.log(Level.WARNING, "XML parser rejected feature " + feature, ex);
            }
        }
    }

    /**
     * Splits a service response into individual terms and appends the non-empty ones to {@code terms}.
     */
    private static void addTerms(String response, List<String> terms) {
        if (response == null || response.isEmpty()) {
            return;
        }
        // Drop every character that is neither a word character nor whitespace,
        // then split on the remaining non-word runs to get individual terms.
        String cleaned = response.replaceAll("[\\W&&[^\\s]]", "");
        for (String term : cleaned.split("\\W+")) {
            if (!term.isEmpty()) {
                terms.add(term);
            }
        }
    }
}