// NOTE(review): stray non-Java text ("Java tutorial") commented out — it broke compilation.
/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules.writer; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WRITE_TAG; import java.io.IOException; import java.io.InputStream; import java.net.InetAddress; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.lang.StringUtils; import org.archive.io.ArchiveFileConstants; import org.archive.io.ReplayInputStream; import org.archive.io.WriterPoolMember; import org.archive.io.arc.ARCWriter; import org.archive.io.arc.ARCWriterPool; import org.archive.modules.CrawlURI; import org.archive.modules.ProcessResult; import org.archive.spring.ConfigPath; import org.archive.util.ArchiveUtils; /** * Processor module for writing the results of successful fetches (and * perhaps someday, certain kinds of network failures) to the Internet Archive * ARC file format. * * Assumption is that there is only one of these ARCWriterProcessors per * Heritrix instance. 
* * @author Parker Thompson */ public class ARCWriterProcessor extends WriterPoolProcessor { final static private String METADATA_TEMPLATE = readMetadataTemplate(); @SuppressWarnings("unused") private static final long serialVersionUID = 3L; private static final Logger logger = Logger.getLogger(ARCWriterProcessor.class.getName()); public long getDefaultMaxFileSize() { return 100000000L; // 100 SI mega-bytes (10^8 bytes) } public List<ConfigPath> getDefaultStorePaths() { List<ConfigPath> paths = new ArrayList<ConfigPath>(); paths.add(new ConfigPath("arcs default store path", "arcs")); return paths; } private transient List<String> cachedMetadata; public ARCWriterProcessor() { } @Override protected void setupPool(AtomicInteger serialNo) { setPool(new ARCWriterPool(serialNo, this, getPoolMaxActive(), getMaxWaitForIdleMs())); } /** * Writes a CrawlURI and its associated data to store file. * * Currently this method understands the following uri types: dns, http, * and https. * * @param curi CrawlURI to process. */ protected ProcessResult innerProcessResult(CrawlURI puri) { CrawlURI curi = (CrawlURI) puri; long recordLength = getRecordedSize(curi); ReplayInputStream ris = null; try { if (shouldWrite(curi)) { ris = curi.getRecorder().getRecordedInput().getReplayInputStream(); return write(curi, recordLength, ris, getHostAddress(curi)); } else { logger.info("does not write " + curi.toString()); copyForwardWriteTagIfDupe(curi); } } catch (IOException e) { curi.getNonFatalFailures().add(e); logger.log(Level.SEVERE, "Failed write of Record: " + curi.toString(), e); } finally { IOUtils.closeQuietly(ris); } return ProcessResult.PROCEED; } protected ProcessResult write(CrawlURI curi, long recordLength, InputStream in, String ip) throws IOException { WriterPoolMember writer = getPool().borrowFile(); long position = writer.getPosition(); // See if we need to open a new file because we've exceeded maxBytes. 
// Call to checkFileSize will open new file if we're at maximum for // current file. writer.checkSize(); if (writer.getPosition() != position) { // We just closed the file because it was larger than maxBytes. // Add to the totalBytesWritten the size of the first record // in the file, if any. setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - position)); position = writer.getPosition(); } ARCWriter w = (ARCWriter) writer; try { if (in instanceof ReplayInputStream) { w.write(curi.toString(), curi.getContentType(), ip, curi.getFetchBeginTime(), recordLength, (ReplayInputStream) in); } else { w.write(curi.toString(), curi.getContentType(), ip, curi.getFetchBeginTime(), recordLength, in); } } catch (IOException e) { // Invalidate this file (It gets a '.invalid' suffix). getPool().invalidateFile(writer); // Set the writer to null otherwise the pool accounting // of how many active writers gets skewed if we subsequently // do a returnWriter call on this object in the finally block. 
writer = null; throw e; } finally { if (writer != null) { setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - position)); getPool().returnFile(writer); String filename = writer.getFile().getName(); if (filename.endsWith(ArchiveFileConstants.OCCUPIED_SUFFIX)) { filename = filename.substring(0, filename.length() - ArchiveFileConstants.OCCUPIED_SUFFIX.length()); } curi.addExtraInfo("arcFilename", filename); Map<String, Object>[] history = curi.getFetchHistory(); if (history != null && history[0] != null) { history[0].put(A_WRITE_TAG, filename); } } } return checkBytesWritten(); } public List<String> getMetadata() { if (METADATA_TEMPLATE == null) { return null; } if (cachedMetadata != null) { return cachedMetadata; } String meta = METADATA_TEMPLATE; meta = replace(meta, "${VERSION}", ArchiveUtils.VERSION); meta = replace(meta, "${HOST}", getHostName()); meta = replace(meta, "${IP}", getHostAddress()); if (meta != null) { meta = replace(meta, "${JOB_NAME}", getMetadataProvider().getJobName()); meta = replace(meta, "${DESCRIPTION}", getMetadataProvider().getDescription()); meta = replace(meta, "${OPERATOR}", getMetadataProvider().getOperator()); // TODO: fix this to match job-start-date (from UI or operator setting) // in the meantime, don't include a slightly-off date // meta = replace(meta, "${DATE}", GMT()); meta = replace(meta, "${USER_AGENT}", getMetadataProvider().getUserAgent()); meta = replace(meta, "${FROM}", getMetadataProvider().getOperatorFrom()); meta = replace(meta, "${ROBOTS}", getMetadataProvider().getRobotsPolicyName()); } this.cachedMetadata = Collections.singletonList(meta); return this.cachedMetadata; // ${VERSION} // ${HOST} // ${IP} // ${JOB_NAME} // ${DESCRIPTION} // ${OPERATOR} // ${DATE} // ${USER_AGENT} // ${FROM} // ${ROBOTS} } private static String replace(String meta, String find, String replace) { replace = StringUtils.defaultString(replace); replace = StringEscapeUtils.escapeXml(replace); return meta.replace(find, 
replace); } private static String getHostName() { try { return InetAddress.getLocalHost().getCanonicalHostName(); } catch (UnknownHostException e) { logger.log(Level.SEVERE, "Could not get local host name.", e); return "localhost"; } } private static String getHostAddress() { try { return InetAddress.getLocalHost().getHostAddress(); } catch (UnknownHostException e) { logger.log(Level.SEVERE, "Could not get local host address.", e); return "localhost"; } } private static String readMetadataTemplate() { InputStream input = ARCWriterProcessor.class.getResourceAsStream("arc_metadata_template.xml"); if (input == null) { logger.severe("No metadata template."); return null; } try { return IOUtils.toString(input); } catch (IOException e) { throw new IllegalStateException(e); } finally { IOUtils.closeQuietly(input); } } }