cu.uci.gws.sdlcrawler.PdfCrawlController.java Source code

Introduction

Here is the source code for cu.uci.gws.sdlcrawler.PdfCrawlController.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package cu.uci.gws.sdlcrawler;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import org.apache.http.message.BasicHeader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author Yasser Ganjisaffar [lastname at gmail dot com]
 * @author Yusniel Hidalgo Delgado [yhidalgo86 at gmail dot com]
 */
public class PdfCrawlController {

    private static Logger logger = LoggerFactory.getLogger(PdfCrawlController.class);

    public static void main(String[] args) throws Exception {
        Properties cm = PdfCrawlerConfigManager.getInstance().loadConfigFile();
        long startTime = System.currentTimeMillis();
        DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
        Date date = new Date();
        System.out.println(dateFormat.format(date));
        int numberOfCrawlers = Integer.parseInt(cm.getProperty("sdlcrawler.NumberOfCrawlers"));
        String pdfFolder = cm.getProperty("sdlcrawler.CrawlPdfFolder");

        CrawlConfig config = new CrawlConfig();

        config.setCrawlStorageFolder(cm.getProperty("sdlcrawler.CrawlStorageFolder"));
        config.setProxyHost(cm.getProperty("sdlcrawler.ProxyHost"));
        if (!"".equals(cm.getProperty("sdlcrawler.ProxyPort"))) {
            config.setProxyPort(Integer.parseInt(cm.getProperty("sdlcrawler.ProxyPort")));
        }
        config.setProxyUsername(cm.getProperty("sdlcrawler.ProxyUser"));
        config.setProxyPassword(cm.getProperty("sdlcrawler.ProxyPass"));
        config.setMaxDownloadSize(Integer.parseInt(cm.getProperty("sdlcrawler.MaxDownloadSize")));
        config.setIncludeBinaryContentInCrawling(
                Boolean.parseBoolean(cm.getProperty("sdlcrawler.IncludeBinaryContent")));
        config.setFollowRedirects(Boolean.parseBoolean(cm.getProperty("sdlcrawler.Redirects")));
        config.setUserAgentString(cm.getProperty("sdlcrawler.UserAgent"));
        config.setMaxDepthOfCrawling(Integer.parseInt(cm.getProperty("sdlcrawler.MaxDepthCrawl")));
        config.setMaxConnectionsPerHost(Integer.parseInt(cm.getProperty("sdlcrawler.MaxConnectionsPerHost")));
        config.setSocketTimeout(Integer.parseInt(cm.getProperty("sdlcrawler.SocketTimeout")));
        config.setMaxOutgoingLinksToFollow(Integer.parseInt(cm.getProperty("sdlcrawler.MaxOutgoingLinks")));
        config.setResumableCrawling(Boolean.parseBoolean(cm.getProperty("sdlcrawler.ResumableCrawling")));
        config.setIncludeHttpsPages(Boolean.parseBoolean(cm.getProperty("sdlcrawler.IncludeHttpsPages")));
        config.setMaxTotalConnections(Integer.parseInt(cm.getProperty("sdlcrawler.MaxTotalConnections")));
        config.setMaxPagesToFetch(Integer.parseInt(cm.getProperty("sdlcrawler.MaxPagesToFetch")));
        config.setPolitenessDelay(Integer.parseInt(cm.getProperty("sdlcrawler.PolitenessDelay")));
        config.setConnectionTimeout(Integer.parseInt(cm.getProperty("sdlcrawler.ConnectionTimeout")));

        System.out.println(config.toString());
        Collection<BasicHeader> defaultHeaders = new HashSet<>();
        defaultHeaders
                .add(new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));
        defaultHeaders.add(new BasicHeader("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"));
        defaultHeaders.add(new BasicHeader("Accept-Language", "en-US,en,es-ES,es;q=0.8"));
        defaultHeaders.add(new BasicHeader("Connection", "keep-alive"));
        config.setDefaultHeaders(defaultHeaders);

        List<String> list = Files.readAllLines(Paths.get("config/" + cm.getProperty("sdlcrawler.SeedFile")),
                StandardCharsets.UTF_8);
        String[] crawlDomains = list.toArray(new String[list.size()]);

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
        for (String domain : crawlDomains) {
            controller.addSeed(domain);
        }

        PdfCrawler.configure(crawlDomains, pdfFolder);
        controller.start(PdfCrawler.class, numberOfCrawlers);
        DateFormat dateFormat1 = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
        Date date1 = new Date();
        System.out.println(dateFormat1.format(date1));
        long endTime = System.currentTimeMillis();
        long totalTime = endTime - startTime;
        System.out.println("Total time:" + totalTime);
    }
}