Java tutorial
/*
 * Copyright 2016 Andrew W. Buchanan (buchanan@difference.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.difference.historybook.importer.crawler;

import java.io.IOException;
import java.io.PrintStream;
import java.util.function.UnaryOperator;
import java.util.stream.Stream;

import org.apache.commons.codec.Charsets;
import org.apache.http.client.fluent.Content;
import org.apache.http.client.fluent.Request;

import com.difference.historybook.importer.HistoryRecord;
import com.difference.historybook.importer.HistoryRecordJSONSerialization;

/**
 * Utility to read a file of JSON HistoryRecords and crawl the pages to fetch the content
 */
public class Crawler {
    private static final int THROTTLE_MILLIS = 3000;

    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("usage: Crawler <inputfile>");
            System.exit(1);
        }
        String fileName = args[0];
        process(fileName);
    }

    /**
     * Reads a HistoryRecord JSON file and fetches each record with the default throttle config
     *
     * @param filename HistoryRecord JSON file to read from
     * @throws IOException
     */
    public static void process(String filename) throws IOException {
        process(filename, THROTTLE_MILLIS);
    }

    /**
     * Reads a HistoryRecord JSON file and fetches each record with the specified throttle config
     *
     * @param filename HistoryRecord JSON file to read from
     * @param throttleMillis time for each thread to pause between fetches (how gentle to be with the servers)
     * @throws IOException
     */
    public static void process(String filename, int throttleMillis) throws IOException {
        process(filename, throttleMillis, Crawler::fetchBody, System.err, System.out);
    }

    /**
     * Reads a HistoryRecord JSON file and runs a processor against each record
     *
     * @param filename HistoryRecord JSON file to read from
     * @param throttleMillis time for each thread to pause between fetches (how gentle to be with the servers)
     * @param processor process to run against each HistoryRecord
     * @param progress where to print out the urls as we fetch
     * @param output where to print the new HistoryRecord JSON with the body content
     * @throws IOException
     */
    public static void process(String filename, int throttleMillis, UnaryOperator<HistoryRecord> processor,
            PrintStream progress, PrintStream output) throws IOException {
        getUrlStream(filename)
                .parallel()
                .map(new Throttle<HistoryRecord>(throttleMillis))
                .map((r) -> printUrl(r, progress))
                .map(processor)
                .map(HistoryRecordJSONSerialization::toJSONString)
                .forEach(output::println);
    }

    /**
     * Get Stream of HistoryRecord filtered to valid fetchable urls
     *
     * @param filename HistoryRecord JSON file to read from
     * @return Stream of fetchable HistoryRecords
     * @throws IOException
     */
    protected static Stream<HistoryRecord> getUrlStream(String filename) throws IOException {
        return HistoryRecordJSONSerialization.parseFile(filename)
                .filter(Crawler::isValidProtocol)
                .map(Crawler::removeFragment);
    }

    // filter HistoryRecords to web pages
    private static boolean isValidProtocol(HistoryRecord record) {
        String lCaseUrl = record.getUrl().toLowerCase();
        return lCaseUrl.startsWith("http:") || lCaseUrl.startsWith("https:");
    }
    // Modify HistoryRecords to remove fragment portions of URL
    private static HistoryRecord removeFragment(HistoryRecord record) {
        String url = record.getUrl();
        int fragmentStart = url.indexOf('#');
        if (fragmentStart >= 0) {
            record.setUrl(url.substring(0, fragmentStart));
        }
        return record;
    }

    // print the url from a HistoryRecord to the specified PrintStream
    private static HistoryRecord printUrl(HistoryRecord record, PrintStream output) {
        output.println(record.getUrl());
        return record;
    }

    /**
     * Fetch the body for a given HistoryRecord and add it to the record
     *
     * @param record HistoryRecord containing a URL to fetch
     * @return the HistoryRecord, with its body set if the fetch succeeded
     */
    protected static HistoryRecord fetchBody(HistoryRecord record) {
        try {
            Content content = Request.Get(encodeSpecialChars(record.getUrl())).execute().returnContent();
            if (content != null) {
                record.setBody(content.asString(Charsets.UTF_8));
            }
        } catch (IOException e) {
            System.err.println("Failed to fetch " + record.getUrl() + ": " + e.getLocalizedMessage());
        }
        return record;
    }

    // encode special characters that seem to show up in otherwise URL-encoded URLs...
    // TODO: Do we still need the # encode? That might have been from before filtering them out...
    private static String encodeSpecialChars(String url) {
        String result = url.replace("{", "%7B").replace("}", "%7D").replace("#", "%23").replace("|", "%7C")
                .replace(" ", "%20");
        return result;
    }
}
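The Throttle class used in the stream pipeline above belongs to the same importer project and is not shown in this file. As a rough idea of what it does (an assumption about its behavior, not the project's actual implementation), it can be thought of as a Function<T, T> that sleeps for the configured interval before passing each element through unchanged, which is why it can be handed straight to Stream.map(). A minimal sketch under that assumption:

import java.util.function.Function;

// Hypothetical sketch only: pauses the calling thread before returning its input.
// The real Throttle class in the historybook project may be implemented differently.
public class ThrottleSketch<T> implements Function<T, T> {
    private final long pauseMillis;

    public ThrottleSketch(long pauseMillis) {
        this.pauseMillis = pauseMillis;
    }

    @Override
    public T apply(T input) {
        try {
            Thread.sleep(pauseMillis); // simple fixed delay between fetches
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve the interrupt flag
        }
        return input; // pass the element through unchanged
    }
}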
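The crawler is normally started from the command line via main() (java -cp ... com.difference.historybook.importer.crawler.Crawler <inputfile>), with the enriched JSON written to standard output. The overloaded process methods can also be called directly from other code. A minimal, hypothetical driver, assuming an input file named history.json and a 5-second throttle (these values are examples, not part of the original code):

import java.io.IOException;

import com.difference.historybook.importer.crawler.Crawler;

// Hypothetical example class: crawls the records in history.json, pausing
// 5 seconds between fetches instead of the 3-second default, and prints the
// resulting HistoryRecord JSON (with bodies) to stdout.
public class CrawlerExample {
    public static void main(String[] args) throws IOException {
        Crawler.process("history.json", 5000);
    }
}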