eu.sisob.api.sisobcrawlingworks.CrawlerResearchesPagesV3Test.java Source code

Java tutorial

Introduction

Here is the source code for eu.sisob.api.sisobcrawlingworks.CrawlerResearchesPagesV3Test.java

Source

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

/*
Copyright (c) 2014 "(IA)2 Research Group. Universidad de Málaga"
                    http://iaia.lcc.uma.es | http://www.uma.es
This file is part of SISOB Data Extractor.
SISOB Data Extractor is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SISOB Data Extractor is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SISOB Data Extractor. If not, see <http://www.gnu.org/licenses/>.
*/

package eu.sisob.api.sisobcrawlingworks;

import eu.sisob.uma.crawler.CrawlerTrace;
import eu.sisob.uma.crawler.ResearchersCrawlerService;
import eu.sisob.uma.crawler.ResearchersCrawlers.CrawlerResearchesPagesV3;
import eu.sisob.uma.crawler.ResearchersCrawlers.CrawlerResearchesPagesV3Controller;
import eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.ResearcherNameInfo;
import eu.sisob.uma.footils.File.FileFootils;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
import static org.junit.Assert.*;
import org.junit.*;

/**
 * Manual integration test that drives {@link CrawlerResearchesPagesV3} through a
 * {@link CrawlerResearchesPagesV3Controller} against a real department web site.
 *
 * <p>The crawl needs network access, so it is opt-in: it only runs when the JVM system
 * property named by {@link #LIVE_CRAWL_PROPERTY} is set to {@code true}. Otherwise the
 * test returns immediately and is reported as passed.
 *
 * @author Daniel L&oacute;pez Gonz&aacute;lez (dlopezgonzalez@gmail.com) for the SISOB PROJECT (http://sisob.lcc.uma.es/)
 */
public class CrawlerResearchesPagesV3Test {

    /** System property that must be {@code true} for the live crawl to run. */
    private static final String LIVE_CRAWL_PROPERTY = "sisob.crawler.liveTest";

    /**
     * Per-test initialisation hook; intentionally empty.
     *
     * <p>log4j is not configured here. Call {@code BasicConfigurator.configure()} in this
     * method when console logging is wanted while debugging the crawl.
     */
    @Before
    public void setup() {
        // Nothing to initialise.
    }

    /**
     * Crawls the "cbc" research unit pages of the "AMC" institution and post-processes
     * the results.
     *
     * <p>Skipped (returns immediately) unless the {@link #LIVE_CRAWL_PROPERTY} system
     * property is {@code true}. Any exception raised while crawling is logged and then
     * rethrown as an {@link AssertionError} so the test is reported as failed instead of
     * silently passing. The crawler service instance is released on every path.
     */
    @Test
    public void TestingRESTServices() {
        // Live crawl against an external site: only run when explicitly requested.
        if (!Boolean.getBoolean(LIVE_CRAWL_PROPERTY)) {
            return;
        }

        String test_dirname = "test-data";
        String crawler_data_path = "crawler-data";

        ResearchersCrawlerService.setServiceSettings(crawler_data_path,
                ClassLoader.getSystemClassLoader().getResource("eu/sisob/uma/crawler/keywords"), true, true);
        ResearchersCrawlerService.createInstance();

        try {
            String sInstitutionName = "AMC";
            String sUnitOfAssessment_Description = "cbc";

            // Host part of the seed; handed to the controller via setContainPattern
            // (presumably used to keep the crawl on the institution's site - confirm
            // against the controller).
            String contain_pattern = containPatternFor("http://www.amc.edu/");

            List<String> department_web_addresses = new ArrayList<String>();
            department_web_addresses.add("http://www.amc.edu/research/cbc/");
            String joinedSeeds = StringUtils.join(department_web_addresses, ",");

            // Handed to the controller; left empty by this test.
            List<ResearcherNameInfo> researchers = new ArrayList<ResearcherNameInfo>();

            /*
             * Crawling to search the researchers
             */
            String university_subject_crawler_data_folder = test_dirname + File.separator
                    + sInstitutionName.replace(" ", ".") + "_" + sUnitOfAssessment_Description.replace(" ", ".")
                    + "-crawler-data";
            CrawlerResearchesPagesV3Controller controllerReseachers = new CrawlerResearchesPagesV3Controller(
                    university_subject_crawler_data_folder, new File(crawler_data_path), researchers);
            for (String s : department_web_addresses) {
                controllerReseachers.addSeed(s);
            }

            controllerReseachers.setPolitenessDelay(200); // presumably milliseconds - confirm
            controllerReseachers.setMaximumCrawlDepth(3);
            controllerReseachers.setMaximumPagesToFetch(-1); // presumably "no limit" - confirm
            controllerReseachers.setContainPattern(contain_pattern);
            controllerReseachers.clearInterestingUrlsDetected();

            Logger.getLogger("root").info("Begin crawling: " + sUnitOfAssessment_Description + " - "
                    + sInstitutionName + " - [" + joinedSeeds + "]");
            long startMillis = System.currentTimeMillis();

            controllerReseachers.start(CrawlerResearchesPagesV3.class, 1);

            controllerReseachers.postProcessResults();

            long elapsedMillis = System.currentTimeMillis() - startMillis;
            Logger.getLogger("root")
                    .info("End crawling: " + sUnitOfAssessment_Description + " - " + sInstitutionName + " - Time: "
                            + elapsedMillis + " ms - [" + joinedSeeds + "]");
        } catch (Exception ex) {
            Logger.getLogger("root").error(ex.getMessage(), ex);
            // Surface the failure to JUnit; logging alone would let the test pass.
            AssertionError failure = new AssertionError("Crawling failed: " + ex);
            failure.initCause(ex);
            throw failure;
        } finally {
            ResearchersCrawlerService.releaseInstance();
        }
    }

    /**
     * Derives the contain pattern from a seed URL: strips the literal
     * {@code "http://www."} and cuts the remainder at the first {@code '/'}.
     *
     * <p>When there is no {@code '/'} the whole remainder is returned, so the last
     * character of the host is no longer dropped.
     *
     * @param seed seed URL, e.g. {@code "http://www.amc.edu/"}
     * @return the text before the first slash, e.g. {@code "amc.edu"}
     */
    private static String containPatternFor(String seed) {
        String host = seed.replace("http://www.", "");
        int slash = host.indexOf('/');
        return slash == -1 ? host : host.substring(0, slash);
    }

}