eu.sisob.uma.extractors.adhoc.websearchers.WebSearchersExtractor.java Source code

Java tutorial

Introduction

Here is the source code for eu.sisob.uma.extractors.adhoc.websearchers.WebSearchersExtractor.java

Source

/*
Copyright (c) 2014 "(IA)2 Research Group. Universidad de Málaga"
                    http://iaia.lcc.uma.es | http://www.uma.es
This file is part of SISOB Data Extractor.
SISOB Data Extractor is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SISOB Data Extractor is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SISOB Data Extractor. If not, see <http://www.gnu.org/licenses/>.
*/

package eu.sisob.uma.extractors.adhoc.websearchers;

import eu.sisob.uma.footils.Web.Downloader;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.HashMap;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Extractor that looks a researcher up on the DuckDuckGo HTML results page and
 * reports the first result link that is not rejected by the built-in filters.
 *
 * <p>The query sent to the search engine is assembled according to the
 * {@link SearchPatterns} value given at construction time.
 *
 * @author Daniel L&oacute;pez Gonz&aacute;lez (dlopezgonzalez@gmail.com) for the SISOB PROJECT (http://sisob.lcc.uma.es/)
 */
public class WebSearchersExtractor extends WebSearchersExtractorCommons {

    /**
     * Selects which pieces of the input row are combined with the search
     * expression to form the query.
     */
    public enum SearchPatterns {
        /** The search expression alone. */
        P1,
        /** The search expression AND the subject. */
        P2,
        /** The search expression followed by a {@code site:} restriction on the cleaned web address. */
        P3,
        /** The search expression AND the institution name (when that column is present). */
        P4,
        /** The search expression AND the institution name (when present) AND the subject. */
        P5
    }

    /** Prefix of the DuckDuckGo HTML search URL; the query is appended verbatim. */
    private static final String SEARCH_URL = "https://duckduckgo.com/html/?q=";

    /** Text found on the search engine's error page; its presence triggers a wait-and-retry. */
    private static final String ERROR_PAGE_MARKER = "If this error persists, please let us know";

    /** Pause between two attempts while the error page keeps being served. */
    private static final long ERROR_RETRY_WAIT_MILLIS = 30000L;

    /** CSS query that selects the result anchors on the results page. */
    private static final String RESULT_LINKS_SELECTOR = "div[class*=links_main] > a";

    /** Downloaded pages above this size (in whole MiB) are not scanned for the last name. */
    private static final long MAX_SCANNED_MEGABYTES = 100L;

    /** A result link is rejected when its absolute URL contains any of these fragments. */
    private static final String[] REJECTED_URL_FRAGMENTS = {
        "duckduckgo.com/y.js", "wikipedia.", "facebook.com", "microsoft.com", "google.com", "linkedin",
        "www.biography.com", "biomedexperts.com", "www.experts.scival.com", "ratemyprofessors.com",
        "flickr.com", "www.amazon"
    };

    /** A result link is rejected when its absolute URL ends with any of these suffixes. */
    private static final String[] REJECTED_URL_SUFFIXES = {
        ".txt", ".csv", ".xml", ".doc", ".docx", ".xls", ".xlsx"
    };

    /** Pattern used by {@link #get_result} to build the search query. */
    SearchPatterns search_patterns;

    /**
     * Creates an extractor that builds its queries with the given pattern.
     *
     * @param search_patterns the query pattern to use
     */
    public WebSearchersExtractor(SearchPatterns search_patterns) {

        this.search_patterns = search_patterns;
    }

    /**
     * Runs one web search for the researcher described by {@code nextLine} and
     * returns a single output row for the first acceptable result link.
     *
     * <p>The row consists of double-quoted fields separated by {@code ;} and is
     * terminated by {@code \r\n}: staff identifier, last name, initials, first
     * name (only if {@code idFirstName != -1}), name (only if
     * {@code idName != -1}), the link URL, the literal {@code HTML}, the literal
     * {@code CV} and a score. The score is {@code A} when the downloaded page
     * contains the last name and {@code B} otherwise (including when the page
     * cannot be downloaded or is larger than 100 MiB).
     *
     * @param nextLine          the fields of the input row
     * @param idStaffIdentifier index of the staff identifier in {@code nextLine}
     * @param idName            index of the name in {@code nextLine}, or -1 to omit it
     * @param idFirstName       index of the first name in {@code nextLine}, or -1 to omit it
     * @param idLastName        index of the last name in {@code nextLine}
     * @param idInitials        index of the initials in {@code nextLine}
     * @param idSubject         index of the subject in {@code nextLine}
     * @param idInstitutionName index of the institution name in {@code nextLine}, or -1 if absent
     * @param idWebAddress      index of the institution web address in {@code nextLine}
     * @param expression        the base search expression
     * @param params            not used by this implementation
     * @return the output row, or an empty string when no acceptable link was found
     */
    @Override
    protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName,
            int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress,
            String expression, Object[] params) {

        String domain = clean_site(nextLine[idWebAddress]);
        String subject = nextLine[idSubject];
        String andInstitutionName = (idInstitutionName != -1 ? " AND " + nextLine[idInstitutionName] : "");

        String url = SEARCH_URL + buildQuery(expression, subject, domain, andInstitutionName);
        Logger.getRootLogger().info("Go with " + url);

        Document doc = fetchResultsPage(url);
        if (doc == null) {
            return "";
        }

        Elements links = doc.select(RESULT_LINKS_SELECTOR);
        for (Element link : links) {
            String title = link.text();
            String href = link.absUrl("href");
            if (isRejected(title, href)) {
                continue;
            }

            // The first link that survives the filters is the answer: later links
            // can never replace it, so there is no point in scanning any further.
            String score = scoreByLastName(href, nextLine[idLastName]);

            StringBuilder row = new StringBuilder();
            appendField(row, nextLine[idStaffIdentifier]);
            appendField(row, nextLine[idLastName]);
            appendField(row, nextLine[idInitials]);
            if (idFirstName != -1) {
                appendField(row, nextLine[idFirstName]);
            }
            if (idName != -1) {
                appendField(row, nextLine[idName]);
            }
            appendField(row, href);
            appendField(row, "HTML");
            appendField(row, "CV");
            // The last field carries no trailing separator.
            row.append('"').append(score).append('"');
            row.append("\r\n");

            Logger.getRootLogger().info("Select " + href + " - " + title);
            return row.toString();
        }

        return "";
    }

    /**
     * Builds the query text for the configured search pattern.
     *
     * @param expression         the base search expression
     * @param subject            the subject taken from the input row
     * @param domain             the cleaned web address taken from the input row
     * @param andInstitutionName {@code " AND <institution>"}, or an empty string when unavailable
     * @return the query text to append to the search URL
     */
    private String buildQuery(String expression, String subject, String domain, String andInstitutionName) {
        switch (search_patterns) {
        case P1:
            return expression;
        case P3:
            // NOTE(review): there is a space between "site:" and the domain; confirm the
            // search engine still treats this as a site restriction.
            return expression + " site: " + domain;
        case P4:
            return expression + andInstitutionName;
        case P5:
            return expression + andInstitutionName + " AND " + subject;
        case P2:
        default:
            return expression + " AND " + subject;
        }
    }

    /**
     * Downloads the results page, waiting and retrying for as long as the search
     * engine answers with its error page.
     *
     * @param url the search URL
     * @return the results page, or {@code null} if it could not be obtained or the
     *         thread was interrupted while waiting to retry
     */
    private Document fetchResultsPage(String url) {
        while (true) {
            Document doc = getDocumentFromPage(url, 10, 1000, 5000);
            if (doc == null || !doc.text().contains(ERROR_PAGE_MARKER)) {
                return doc;
            }
            try {
                Thread.sleep(ERROR_RETRY_WAIT_MILLIS);
            } catch (InterruptedException ex) {
                // Someone asked this thread to stop: keep the interrupt flag set for the
                // caller and give up instead of sleeping and retrying forever.
                Thread.currentThread().interrupt();
                return null;
            }
        }
    }

    /**
     * Tells whether a result link must be ignored.
     *
     * @param title the link text
     * @param href  the absolute URL of the link
     * @return {@code true} when the text starts with {@code [} or the URL matches one
     *         of the rejected fragments or suffixes
     */
    private static boolean isRejected(String title, String href) {
        if (title.startsWith("[")) {
            return true;
        }
        for (String fragment : REJECTED_URL_FRAGMENTS) {
            if (href.contains(fragment)) {
                return true;
            }
        }
        for (String suffix : REJECTED_URL_SUFFIXES) {
            if (href.endsWith(suffix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Downloads the linked page and scores it by looking for the last name in it.
     *
     * @param href     the absolute URL of the page to download
     * @param lastName the last name to look for
     * @return {@code "A"} when the page content contains {@code lastName};
     *         {@code "B"} when it does not, when the page is larger than
     *         {@link #MAX_SCANNED_MEGABYTES} MiB, or when the download fails
     */
    private static String scoreByLastName(String href, String lastName) {
        File temp = null;
        try {
            temp = File.createTempFile("temp-file-name", ".tmp");
            URL fetchedUrl = Downloader.fetchURL(href);
            FileUtils.copyURLToFile(fetchedUrl, temp);

            long sizeInMb = temp.length() / (1024 * 1024);
            if (sizeInMb > MAX_SCANNED_MEGABYTES) {
                return "B";
            }
            String content = FileUtils.readFileToString(temp);
            return content.contains(lastName) ? "A" : "B";
        } catch (IOException ex) {
            // Best effort: a page that cannot be fetched is still reported, with the lower score.
            return "B";
        } finally {
            // The download is only needed for scoring; do not leave it behind in the temp directory.
            if (temp != null && !temp.delete()) {
                temp.deleteOnExit();
            }
        }
    }

    /**
     * Appends {@code "value";} to the row being built.
     *
     * @param row   the row under construction
     * @param value the field value (a {@code null} is written as the text {@code null})
     */
    private static void appendField(StringBuilder row, String value) {
        row.append('"').append(value).append("\";");
    }

}