org.archive.io.Arc2Warc.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.io.Arc2Warc.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.io;

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.archive.io.arc.ARCConstants;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.format.warc.WARCConstants;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.io.warc.WARCWriter;
import org.archive.io.warc.WARCWriterPoolSettings;
import org.archive.io.warc.WARCWriterPoolSettingsData;
import org.archive.uid.RecordIDGenerator;
import org.archive.uid.UUIDGenerator;
import org.archive.util.FileUtils;
import org.archive.util.anvl.ANVLRecord;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.ISODateTimeFormat;

/**
 * Convert ARCs to (sortof) WARCs.
 * @author stack
 * @version $Date$ $Revision$
 */
public class Arc2Warc {
    protected RecordIDGenerator generator = new UUIDGenerator();

    private static void usage(HelpFormatter formatter, Options options, int exitCode) {
        formatter.printHelp("java org.archive.io.arc.Arc2Warc " + "[--force] ARC_INPUT WARC_OUTPUT", options);
        System.exit(exitCode);
    }

    private static String getRevision() {
        return Warc2Arc.parseRevision("$Revision$");
    }

    public void transform(final File arc, final File warc, final boolean force) throws IOException {
        FileUtils.assertReadable(arc);
        if (warc.exists() && !force) {
            throw new IOException("Target WARC already exists. " + "Will not overwrite.");
        }

        ARCReader reader = ARCReaderFactory.get(arc, false, 0);
        transform(reader, warc);
    }

    protected void transform(final ARCReader reader, final File warc) throws IOException {
        WARCWriter writer = null;
        // No point digesting. Digest is available after reading of ARC which
        // is too late for inclusion in WARC.
        reader.setDigest(false);
        try {
            BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(warc));
            // Get the body of the first ARC record as a String so can dump it
            // into first record of WARC.
            final Iterator<ArchiveRecord> i = reader.iterator();
            ARCRecord firstRecord = (ARCRecord) i.next();
            ByteArrayOutputStream baos = new ByteArrayOutputStream((int) firstRecord.getHeader().getLength());
            firstRecord.dump(baos);
            // Add ARC first record content as an ANVLRecord.
            ANVLRecord ar = new ANVLRecord();
            ar.addLabelValue("Filedesc", baos.toString());
            List<String> metadata = new ArrayList<String>(1);
            metadata.add(ar.toString());
            // Now create the writer.  If reader was compressed, lets write
            // a compressed WARC.
            writer = new WARCWriter(new AtomicInteger(), bos, warc,
                    new WARCWriterPoolSettingsData("", "", -1, reader.isCompressed(), null, metadata, generator));
            // Write a warcinfo record with description about how this WARC
            // was made.
            writer.writeWarcinfoRecord(warc.getName(), "Made from " + reader.getReaderIdentifier() + " by "
                    + this.getClass().getName() + "/" + getRevision());
            for (; i.hasNext();) {
                write(writer, (ARCRecord) i.next());
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
            if (writer != null) {
                // I don't want the close being logged -- least, not w/o log of
                // an opening (and that'd be a little silly for simple script
                // like this). Currently, it logs at level INFO so that close
                // of files gets written to log files.  Up the log level just
                // for the close.
                Logger l = Logger.getLogger(writer.getClass().getName());
                Level oldLevel = l.getLevel();
                l.setLevel(Level.WARNING);
                try {
                    writer.close();
                } finally {
                    l.setLevel(oldLevel);
                }
            }
        }
    }

    protected void write(final WARCWriter writer, final ARCRecord r) throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setUrl(r.getHeader().getUrl());
        recordInfo.setContentStream(r);
        recordInfo.setContentLength(r.getHeader().getLength());
        recordInfo.setEnforceLength(true);

        // convert ARC date to WARC-Date format
        String arcDateString = r.getHeader().getDate();
        String warcDateString = DateTimeFormat.forPattern("yyyyMMddHHmmss").withZone(DateTimeZone.UTC)
                .parseDateTime(arcDateString).toString(ISODateTimeFormat.dateTimeNoMillis());
        recordInfo.setCreate14DigitDate(warcDateString);

        ANVLRecord ar = new ANVLRecord();
        String ip = (String) r.getHeader().getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY));
        if (ip != null && ip.length() > 0) {
            ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);
            r.getMetaData();
        }
        recordInfo.setExtraHeaders(ar);

        // enable reconstruction of ARC from transformed WARC
        // TODO: deferred for further analysis (see HER-1750) 
        // ar.addLabelValue("ARC-Header-Line", r.getHeaderString());

        // If contentBody > 0, assume http headers.  Make the mimetype
        // be application/http.  Otherwise, give it ARC mimetype.
        if (r.getHeader().getContentBegin() > 0) {
            recordInfo.setType(WARCRecordType.response);
            recordInfo.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);
            recordInfo.setRecordId(generator.getRecordID());
        } else {
            recordInfo.setType(WARCRecordType.resource);
            recordInfo.setMimetype(r.getHeader().getMimetype());
            recordInfo.setRecordId(((WARCWriterPoolSettings) writer.settings).getRecordIDGenerator().getRecordID());
        }

        writer.writeRecord(recordInfo);
    }

    /**
     * Command-line interface to Arc2Warc.
     *
     * @param args Command-line arguments.
     * @throws ParseException Failed parse of the command line.
     * @throws IOException
     * @throws java.text.ParseException
     */
    @SuppressWarnings("unchecked")
    public static void main(String[] args) throws ParseException, IOException, java.text.ParseException {
        Options options = new Options();
        options.addOption(new Option("h", "help", false, "Prints this message and exits."));
        options.addOption(new Option("f", "force", false, "Force overwrite of target file."));
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List<String> cmdlineArgs = cmdline.getArgList();
        Option[] cmdlineOptions = cmdline.getOptions();
        HelpFormatter formatter = new HelpFormatter();

        // If no args, print help.
        if (cmdlineArgs.size() <= 0) {
            usage(formatter, options, 0);
        }

        // Now look at options passed.
        boolean force = false;
        for (int i = 0; i < cmdlineOptions.length; i++) {
            switch (cmdlineOptions[i].getId()) {
            case 'h':
                usage(formatter, options, 0);
                break;

            case 'f':
                force = true;
                break;

            default:
                throw new RuntimeException("Unexpected option: " + +cmdlineOptions[i].getId());
            }
        }

        // If no args, print help.
        if (cmdlineArgs.size() != 2) {
            usage(formatter, options, 0);
        }
        (new Arc2Warc()).transform(new File(cmdlineArgs.get(0)), new File(cmdlineArgs.get(1)), force);
    }
}