org.tallison.cc.CCGetter.java Source code

Introduction

Here is the source code for org.tallison.cc.CCGetter.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tallison.cc;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.zip.GZIPInputStream;

import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import org.apache.commons.codec.binary.Base32;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.httpclient.Header;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultRedirectStrategy;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.RedirectLocations;
import org.apache.http.protocol.HttpCoreContext;
import org.apache.log4j.Logger;
import org.apache.tika.io.IOUtils;
import org.archive.format.warc.WARCConstants;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.warc.WARCRecord;
import org.archive.util.LaxHttpParser;
import org.tallison.cc.index.CCIndexRecord;

/**
 * Class to read in an index file or a subset of an index file
 * and to "get" those files from cc to a local directory.
 * <p>
 * For every index record the matching WARC record is requested from
 * {@link #AWS_BASE} with a Range header, the bytes that follow the HTTP
 * headers inside the record are stored under
 * {@code rootDir/<first two digest chars>/<digest>}, and one tab-separated
 * row describing the outcome is written to the status file.
 * <p>
 * This relies heavily on centic9's CommonCrawlDocumentDownload.
 * Thank you, Dominik!!!
 */
public class CCGetter {

    /**
     * Outcome of a single fetch attempt; written as the last column of the status file.
     */
    enum FETCH_STATUS {
        BAD_URL, //0
        FETCHED_IO_EXCEPTION, //1
        FETCHED_NOT_200, //2 (any status other than 200 or 206)
        FETCHED_IO_EXCEPTION_READING_ENTITY, //3
        FETCHED_IO_EXCEPTION_SHA1, //4
        ALREADY_IN_REPOSITORY, //5
        FETCHED_EXCEPTION_COPYING_TO_REPOSITORY, //6
        ADDED_TO_REPOSITORY; //7
    }

    private final static String AWS_BASE = "https://commoncrawl.s3.amazonaws.com/";
    private final static String USAGE =
            "Usage: CCGetter <indexFile> <rootDir> <statusFile> [<proxyHost> <proxyPort>]";
    static Logger logger = Logger.getLogger(CCGetter.class);

    private final Base32 base32 = new Base32();
    //true once the column-name row has been written to the status file
    private boolean writtenHeader = false;

    private final String proxyHost;
    private final int proxyPort;

    /**
     * @param proxyHost host of the http proxy to use, or {@code null} for a direct connection
     * @param proxyPort port of the http proxy; the proxy is only used if this is &gt; -1
     */
    public CCGetter(String proxyHost, int proxyPort) {
        this.proxyHost = proxyHost;
        this.proxyPort = proxyPort;
    }

    /**
     * Reads the index file line by line, fetches every record found on each line
     * and writes one status row per record.
     *
     * @param indexFile  index file; transparently gunzipped if its file name ends with ".gz"
     * @param rootDir    root of the local repository that fetched files are copied into
     * @param statusFile tab-separated report that is (over)written by this run
     * @throws IOException if the index can't be read or the status file can't be written
     */
    private void execute(Path indexFile, Path rootDir, Path statusFile) throws IOException {
        int count = 0;
        BufferedWriter writer = Files.newBufferedWriter(statusFile, StandardCharsets.UTF_8);
        try (BufferedReader reader = openIndex(indexFile)) {
            String line = reader.readLine();
            while (line != null) {
                processRow(line, rootDir, writer);
                if (++count % 100 == 0) {
                    logger.info(indexFile.getFileName().toString() + ": " + count);
                }
                line = reader.readLine();
            }
        } finally {
            //best effort: a failure to close the report must not mask an exception from the loop
            try {
                //close() flushes first
                writer.close();
            } catch (IOException e) {
                logger.warn("Couldn't close status file: " + statusFile.toAbsolutePath(), e);
            }
        }
    }

    /**
     * Opens the index file as UTF-8 text, gunzipping it when the file name ends with ".gz".
     */
    private static BufferedReader openIndex(Path indexFile) throws IOException {
        //Path.endsWith(String) compares whole path elements, not string suffixes,
        //so the extension has to be tested on the file name's string form
        Path fileName = indexFile.getFileName();
        boolean gzipped = fileName != null && fileName.toString().endsWith(".gz");

        InputStream raw = Files.newInputStream(indexFile);
        try {
            InputStream is = gzipped ? new GZIPInputStream(raw) : raw;
            return new BufferedReader(
                    new InputStreamReader(new BufferedInputStream(is), StandardCharsets.UTF_8));
        } catch (IOException | RuntimeException e) {
            //the GZIPInputStream constructor reads the gzip header and may fail;
            //don't leak the underlying file handle in that case
            closeQuietly(raw, "index file " + indexFile.toAbsolutePath());
            throw e;
        }
    }

    /**
     * Fetches every index record parsed from a single line of the index file.
     */
    private void processRow(String row, Path rootDir, BufferedWriter writer) throws IOException {
        for (CCIndexRecord r : CCIndexRecord.parseRecords(row)) {
            fetch(r, rootDir, writer);
        }
    }

    /**
     * Fetches the WARC record described by {@code r} unless the target file is already
     * in the repository, and writes exactly one status row for the record.
     *
     * @param r       index record to fetch
     * @param rootDir root of the local repository
     * @param writer  status file writer
     * @throws IOException if the status row can't be written
     */
    private void fetch(CCIndexRecord r, Path rootDir, BufferedWriter writer) throws IOException {
        Path targFile = rootDir.resolve(r.getDigest().substring(0, 2) + "/" + r.getDigest());

        if (Files.isRegularFile(targFile)) {
            writeStatus(r, FETCH_STATUS.ALREADY_IN_REPOSITORY, writer);
            logger.info("already retrieved:" + targFile.toAbsolutePath());
            return;
        }

        String url = AWS_BASE + r.getFilename();
        URI uri = null;
        try {
            uri = new URI(url);
        } catch (URISyntaxException e) {
            logger.warn("Bad url: " + url);
            writeStatus(r, FETCH_STATUS.BAD_URL, writer);
            return;
        }
        HttpHost target = new HttpHost(uri.getHost());
        String urlPath = uri.getRawPath();
        if (uri.getRawQuery() != null) {
            urlPath += "?" + uri.getRawQuery();
        }
        HttpGet httpGet = null;
        try {
            httpGet = new HttpGet(urlPath);
        } catch (Exception e) {
            logger.warn("bad path " + uri.toString(), e);
            writeStatus(r, FETCH_STATUS.BAD_URL, writer);
            return;
        }
        if (proxyHost != null && proxyPort > -1) {
            HttpHost proxy = new HttpHost(proxyHost, proxyPort, "http");
            RequestConfig requestConfig = RequestConfig.custom().setProxy(proxy).build();
            httpGet.setConfig(requestConfig);
        }
        httpGet.addHeader("Range", r.getOffsetHeader());

        //the client is created only after all request validation has passed, and both
        //client and response are always closed so that no connection is leaked per record
        CloseableHttpClient httpClient = HttpClients.createDefault();
        try {
            CloseableHttpResponse httpResponse;
            try {
                httpResponse = httpClient.execute(target, httpGet);
            } catch (IOException e) {
                logger.warn("IOException for " + uri.toString(), e);
                writeStatus(r, FETCH_STATUS.FETCHED_IO_EXCEPTION, writer);
                return;
            }
            try {
                storeResponse(r, uri, httpResponse, targFile, writer);
            } finally {
                closeQuietly(httpResponse, "response for " + uri.toString());
            }
        } finally {
            closeQuietly(httpClient, "http client");
        }
    }

    /**
     * Spools the record body of a successful response to a temp file, digests it,
     * copies it to {@code targFile} and writes the status row for whichever step
     * succeeded last or failed first. The temp file is always removed.
     */
    private void storeResponse(CCIndexRecord r, URI uri, CloseableHttpResponse httpResponse, Path targFile,
            BufferedWriter writer) throws IOException {
        int statusCode = httpResponse.getStatusLine().getStatusCode();
        if (statusCode != 200 && statusCode != 206) {
            logger.warn("Bad status for " + uri.toString() + " : " + statusCode);
            writeStatus(r, FETCH_STATUS.FETCHED_NOT_200, writer);
            return;
        }
        Path tmp = null;
        Header[] headers = null;
        boolean isTruncated = false;
        try {
            //this among other parts is plagiarized from centic9's CommonCrawlDocumentDownload
            //probably saved me hours.  Thank you, Dominik!
            tmp = Files.createTempFile("cc-getter", "");
            try (InputStream is = new GZIPInputStream(httpResponse.getEntity().getContent())) {
                WARCRecord warcRecord = new WARCRecord(new FastBufferedInputStream(is), "", 0);
                ArchiveRecordHeader archiveRecordHeader = warcRecord.getHeader();
                if (archiveRecordHeader.getHeaderFields().containsKey(WARCConstants.HEADER_KEY_TRUNCATED)) {
                    isTruncated = true;
                }
                //consumes the http headers; what is left in the record is copied to tmp
                headers = LaxHttpParser.parseHeaders(warcRecord, "UTF-8");

                Files.copy(warcRecord, tmp, StandardCopyOption.REPLACE_EXISTING);
            }
        } catch (IOException e) {
            logger.warn("IOException reading entity for " + uri.toString(), e);
            writeStatus(r, null, headers, 0L, isTruncated, FETCH_STATUS.FETCHED_IO_EXCEPTION_READING_ENTITY,
                    writer);
            //tmp is still null if createTempFile itself failed; deleteTmp copes with that
            deleteTmp(tmp);
            return;
        }

        String digest = null;
        long tmpLength = 0L;
        try (InputStream is = Files.newInputStream(tmp)) {
            digest = base32.encodeAsString(DigestUtils.sha1(is));
            tmpLength = Files.size(tmp);
        } catch (IOException e) {
            writeStatus(r, null, headers, tmpLength, isTruncated, FETCH_STATUS.FETCHED_IO_EXCEPTION_SHA1, writer);
            logger.warn("IOException during digesting: " + tmp.toAbsolutePath(), e);
            deleteTmp(tmp);
            return;
        }

        if (Files.exists(targFile)) {
            writeStatus(r, digest, headers, tmpLength, isTruncated, FETCH_STATUS.ALREADY_IN_REPOSITORY, writer);
            deleteTmp(tmp);
            return;
        }
        try {
            Files.createDirectories(targFile.getParent());
            Files.copy(tmp, targFile);
        } catch (IOException e) {
            logger.warn("IOException copying to repository: " + targFile.toAbsolutePath(), e);
            writeStatus(r, digest, headers, tmpLength, isTruncated,
                    FETCH_STATUS.FETCHED_EXCEPTION_COPYING_TO_REPOSITORY, writer);
            deleteTmp(tmp);
            //without this return a second, contradictory ADDED_TO_REPOSITORY row would be written
            return;
        }
        writeStatus(r, digest, headers, tmpLength, isTruncated, FETCH_STATUS.ADDED_TO_REPOSITORY, writer);
        deleteTmp(tmp);
    }

    /**
     * Writes a status row for a record for which nothing was downloaded:
     * no computed digest, no headers, an actual length of -1 and not truncated.
     */
    private void writeStatus(CCIndexRecord r, FETCH_STATUS fetchStatus, BufferedWriter writer) throws IOException {
        writeStatus(r, null, null, -1L, false, fetchStatus, writer);
    }

    /**
     * Writes one tab-separated status row, preceded by the column-name row on first use.
     *
     * @param r            index record the row is about
     * @param actualDigest base32 SHA-1 of the downloaded bytes, or {@code null} if not computed
     * @param headers      http headers parsed from the WARC record, or {@code null}
     * @param actualLength number of bytes written to the temp file
     * @param isTruncated  whether the WARC header carries the truncated key
     * @param fetchStatus  outcome of the fetch
     * @param writer       status file writer
     * @throws IOException if the row can't be written
     */
    private void writeStatus(CCIndexRecord r, String actualDigest, Header[] headers, long actualLength,
            boolean isTruncated, FETCH_STATUS fetchStatus, BufferedWriter writer) throws IOException {
        List<String> row = new LinkedList<>();

        if (!writtenHeader) {
            row.addAll(Arrays.asList(new String[] { "URL", "CC_MIME", "CC_DIGEST", "COMPUTED_DIGEST",
                    "HEADER_ENCODING", "HEADER_TYPE", "HEADER_LANGUAGE", "HEADER_LENGTH", "ACTUAL_LENGTH",
                    "WARC_IS_TRUNCATED", "FETCH_STATUS" }));
            writer.write(StringUtils.join(row, "\t"));
            writer.write("\n");
            row.clear();
            writtenHeader = true;
        }

        row.add(clean(r.getUrl()));
        row.add(clean(r.getMime()));
        row.add(clean(r.getDigest()));
        if (actualDigest != null) {
            row.add(actualDigest);
        } else {
            row.add("");
        }
        row.add(getHeader("content-encoding", headers));
        row.add(getHeader("content-type", headers));
        row.add(getHeader("content-language", headers));
        row.add(getHeader("content-length", headers));
        row.add(Long.toString(actualLength));
        if (isTruncated) {
            row.add("TRUE");
        } else {
            row.add("");
        }
        row.add(clean(fetchStatus.toString()));

        writer.write(StringUtils.join(row, "\t"));
        writer.write("\n");
    }

    /**
     * Returns the cleaned value of the first header whose name matches
     * {@code headerNameLC} ignoring case, or the empty string if there is
     * no such header or {@code headers} is {@code null}.
     */
    private String getHeader(String headerNameLC, Header[] headers) {
        if (headers == null) {
            return "";
        }
        for (Header header : headers) {
            if (header.getName().equalsIgnoreCase(headerNameLC)) {
                return clean(header.getValue());
            }
        }
        return "";
    }

    /**
     * Replaces every whitespace character with a single space so that the value
     * can't break the tab/newline-delimited status file; {@code null} becomes "".
     */
    private String clean(String s) {
        //make sure that the string doesn't contain \t or new line
        if (s == null) {
            return "";
        }

        return s.replaceAll("\\s", " ");
    }

    /**
     * Deletes the temp file if there is one; failures are logged, never thrown.
     */
    private void deleteTmp(Path tmp) {
        if (tmp == null) {
            //the temp file was never created
            return;
        }
        try {
            Files.deleteIfExists(tmp);
        } catch (IOException e1) {
            logger.error("Couldn't delete tmp file: " + tmp.toAbsolutePath(), e1);
        }
    }

    /**
     * Closes the resource, logging instead of propagating any failure so that a
     * close problem can't abort the run or mask an earlier exception.
     */
    private static void closeQuietly(AutoCloseable closeable, String description) {
        if (closeable == null) {
            return;
        }
        try {
            closeable.close();
        } catch (Exception e) {
            logger.warn("Couldn't close " + description, e);
        }
    }

    /**
     * Command line entry point.
     *
     * @param args {@code <indexFile> <rootDir> <statusFile> [<proxyHost> <proxyPort>]}
     * @throws IOException              if the index can't be read or the status file can't be written
     * @throws IllegalArgumentException if a required argument is missing, or a proxy host
     *                                  is given without a proxy port
     */
    public static void main(String[] args) throws IOException {
        //the proxy host and port only make sense as a pair
        if (args.length < 3 || args.length == 4) {
            throw new IllegalArgumentException(USAGE);
        }
        Path indexFile = Paths.get(args[0]);
        Path rootDir = Paths.get(args[1]);
        Path statusFile = Paths.get(args[2]);
        String proxy = null;
        int proxyPort = -1;
        if (args.length > 4) {
            proxy = args[3];
            proxyPort = Integer.parseInt(args[4]);
        }
        CCGetter ccGetter = new CCGetter(proxy, proxyPort);
        ccGetter.execute(indexFile, rootDir, statusFile);
    }
}