com.sangupta.httptools.DownloadUrlCommand.java Source code

Java tutorial

Introduction

Here is the source code for com.sangupta.httptools.DownloadUrlCommand.java

Source

/**
 *
 * http-toolbox: Command line HTTP tools
 * Copyright (c) 2014, Sandeep Gupta
 * 
 * http://sangupta.com/projects/http-toolbox
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */

package com.sangupta.httptools;

import io.airlift.command.Command;
import io.airlift.command.Option;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.sangupta.jerry.http.WebInvoker;
import com.sangupta.jerry.http.WebResponse;
import com.sangupta.jerry.util.AssertUtils;
import com.sangupta.jerry.util.GsonUtils;

/**
 * Command that reads URLs from a text file (one per line), downloads each of
 * them using a fixed-size thread pool and writes every successful response to
 * the output folder as a JSON file.
 *
 * <p>
 * When more than 1000 URLs are queued, the responses are spread over a
 * two-level folder structure (see {@link #store(int, String, WebResponse)})
 * instead of being written into a single directory.
 * </p>
 *
 * @author sangupta
 *
 */
@Command(name = "download", description = "Download URLs to the disk")
public class DownloadUrlCommand extends HttpToolBoxCommand {

    private static final Logger LOGGER = LoggerFactory.getLogger(DownloadUrlCommand.class);

    /**
     * Path of the file that contains the URLs to download, one per line
     */
    @Option(name = { "-u", "--urlFile" }, description = "File containing one URL per line", required = true)
    public String urlFile;

    /**
     * Optional text that is placed in front of every URL read from the file
     */
    @Option(name = { "-p", "--prefix" }, description = "Prefix to be prepended to each URL")
    public String prefix;

    /**
     * Optional text that is placed after every URL read from the file
     */
    @Option(name = { "-s", "--suffix" }, description = "Suffix to be appended to each URL")
    public String suffix;

    /**
     * Size of the download thread pool; {@link #run()} accepts values from 1 to
     * 50 only
     */
    @Option(name = { "-n", "--numThreads" }, description = "Number of threads to spawn for crawling, default is 10")
    public int numThreads = 10;

    /**
     * Path of the folder that receives the response files
     */
    @Option(name = { "-o",
            "--output" }, description = "Output folder where individual files are written", required = true)
    public String outputFolder;

    /**
     * The runnable tasks that we create before we fire threads for downloading
     */
    private final List<Runnable> downloadTasks = new ArrayList<Runnable>();

    /**
     * The suffix for each filename that we write to disk
     */
    private final String storeSuffix = "." + UUID.randomUUID().toString() + ".response";

    /**
     * Keeps track of current progress
     */
    private final AtomicInteger count = new AtomicInteger();

    /**
     * The base directory where we need to write data
     */
    private File outputDir;

    /**
     * Indicates if we need to split folders as number of files per folder will be huge
     */
    private boolean splitFolders;

    /**
     * Total number of tasks that we have created
     */
    private int numTasks;

    /**
     * Validate the command line options, read all URLs from the URL file,
     * download them using a fixed thread pool and finally print how many URLs
     * were processed and how long it took.
     *
     * Validation problems are reported on standard output and abort the
     * command without throwing.
     */
    @Override
    public void run() {
        // 'new File(..)' never yields null, hence only existence is checked
        File file = new File(this.urlFile);
        if (!file.exists()) {
            System.out.println("URL file cannot be found.");
            return;
        }

        if (!file.isFile()) {
            System.out.println("URL file does not represent a valid file.");
            return;
        }

        if (this.numThreads <= 0 || this.numThreads > 50) {
            System.out.println("Number of assigned threads should be between 1 and 50");
            return;
        }

        this.outputDir = new File(this.outputFolder);
        if (this.outputDir.exists() && !this.outputDir.isDirectory()) {
            System.out.println("Output folder does not represent a valid directory");
            return;
        }

        // mkdirs() returns false on failure as well as when the folder
        // already exists, so re-check before giving up
        if (!this.outputDir.exists() && !this.outputDir.mkdirs() && !this.outputDir.isDirectory()) {
            System.out.println("Unable to create output folder: " + this.outputDir.getAbsolutePath());
            return;
        }

        // try and parse and read all URLs
        if (!readURLTasks(file)) {
            return;
        }

        // initialize the shared state BEFORE any task is submitted - worker
        // threads read these values as soon as they start running
        this.numTasks = this.downloadTasks.size();
        this.splitFolders = this.numTasks > 1000;

        // all set - create number of threads
        // and start fetching
        ExecutorService service = Executors.newFixedThreadPool(this.numThreads);

        final long start = System.currentTimeMillis();
        for (Runnable runnable : this.downloadTasks) {
            service.submit(runnable);
        }

        // the pool now owns the tasks, release our references
        this.downloadTasks.clear();

        // shutdown
        shutdownAndAwaitTermination(service);
        final long end = System.currentTimeMillis();

        // everything done - report the number of tasks, not the size of the
        // (already cleared) task list
        System.out.println(this.numTasks + " urls downloaded in " + (end - start) + " millis.");
    }

    /**
     * Read the given file line by line and create one download task per
     * non-empty line. The underlying reader is always closed.
     *
     * @param file
     *            the file containing one URL per line
     *
     * @return <code>true</code> if the file was read completely,
     *         <code>false</code> if reading failed (the failure has then
     *         already been reported on standard output)
     */
    private boolean readURLTasks(File file) {
        // the 1-based number of the line that is about to be read
        int line = 1;
        LineIterator iterator = null;
        try {
            iterator = FileUtils.lineIterator(file);
            while (iterator.hasNext()) {
                String readURL = iterator.next();
                createURLTask(readURL);
                line++;
            }
        } catch (IOException e) {
            System.out.println("Unable to read URLs from the file at line: " + line);
            return false;
        } catch (IllegalStateException e) {
            // LineIterator reports read failures during iteration this way
            System.out.println("Unable to read URLs from the file at line: " + line);
            return false;
        } finally {
            // null-safe, releases the file handle held by the iterator
            LineIterator.closeQuietly(iterator);
        }

        return true;
    }

    /**
     * Create a {@link Runnable} task for downloading and storage for the given
     * URL. Empty lines are skipped. The configured prefix and suffix, if any,
     * are applied to the URL before the task is created.
     * 
     * @param url
     *            the url to be downloaded
     */
    private void createURLTask(String url) {
        if (AssertUtils.isEmpty(url)) {
            return;
        }

        if (AssertUtils.isNotEmpty(this.prefix)) {
            url = this.prefix + url;
        }

        if (AssertUtils.isNotEmpty(this.suffix)) {
            url = url + this.suffix;
        }

        final String downloadURL = url;
        this.downloadTasks.add(new Runnable() {

            @Override
            public void run() {
                try {
                    downloadAndStoreURL(downloadURL);
                } catch (RuntimeException e) {
                    // the Future returned by submit() is discarded, so an
                    // exception escaping here would vanish without a trace
                    LOGGER.error("Unexpected failure while downloading URL: {}", downloadURL, e);
                }
            }

        });
    }

    /**
     * Download the URL from web and then ask for storage. Nothing is stored
     * when no response was obtained or the response is not a success.
     * 
     * @param url
     *            the URL to be downloaded
     */
    private void downloadAndStoreURL(String url) {
        // the running counter doubles as the index used in the file name
        int current = this.count.incrementAndGet();
        System.out.println("Download " + current + "/" + this.numTasks + " url: " + url + "...");
        WebResponse response = WebInvoker.getResponse(url);
        if (response == null) {
            LOGGER.debug("Unable to fetch response for URL from server: {}", url);
            return;
        }

        if (!response.isSuccess()) {
            LOGGER.debug("Non-success response for URL from server: {}", url);
            return;
        }

        store(current, url, response);
    }

    /**
     * Store the downloaded web response to disk as JSON. The file is named
     * <code>url-&lt;current&gt;</code> followed by the per-run store suffix.
     * When folder splitting is active the file goes into the sub-folder
     * <code>&lt;current % 16&gt;/&lt;(current / 16) % 16&gt;</code> of the
     * output directory, otherwise directly into the output directory.
     * 
     * @param current
     *            the current index count
     * 
     * @param url
     *            the URL that was downloaded
     * 
     * @param response
     *            the response from the server
     */
    private void store(int current, String url, WebResponse response) {
        String json = GsonUtils.getGson().toJson(response);
        String fileName = "url-" + current + this.storeSuffix;
        try {
            File target;
            if (this.splitFolders) {
                int first = current % 16;
                int second = (current / 16) % 16;
                File folder = new File(
                        this.outputDir.getAbsolutePath() + File.separator + first + File.separator + second);
                folder.mkdirs();
                target = new File(folder, fileName);
            } else {
                target = new File(this.outputDir, fileName);
            }

            FileUtils.write(target, json);
        } catch (IOException e) {
            // pass the exception as last argument so that the cause is logged
            LOGGER.error("Unable to write web response from URL {} to disk: {}", url, json, e);
        }
    }

    /**
     * Terminate the thread pool: stop accepting new tasks, wait up to one day
     * for the queued tasks to finish and cancel whatever is still running
     * after that.
     * 
     * @param pool
     *            the thread pool to terminate
     */
    private void shutdownAndAwaitTermination(ExecutorService pool) {
        pool.shutdown(); // Disable new tasks from being submitted
        try {
            // Wait a while for existing tasks to terminate
            if (!pool.awaitTermination(1, TimeUnit.DAYS)) {
                pool.shutdownNow(); // Cancel currently executing tasks

                // Wait a while for tasks to respond to being cancelled
                if (!pool.awaitTermination(60, TimeUnit.SECONDS)) {
                    System.err.println("Pool did not terminate");
                }
            }
        } catch (InterruptedException ie) {
            // (Re-)Cancel if current thread also interrupted
            pool.shutdownNow();
            // Preserve interrupt status
            Thread.currentThread().interrupt();
        }
    }

}