eu.scape_project.arc2warc.Arc2WarcMigration.java Source code

Java tutorial

Introduction

Here is the source code for eu.scape_project.arc2warc.Arc2WarcMigration.java

Source

/*
 * Copyright 2012 The SCAPE Project Consortium.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.scape_project.arc2warc;

import eu.scape_project.arc2warc.cli.Arc2WarcMigrationConfig;
import eu.scape_project.arc2warc.cli.Arc2WarcMigrationOptions;
import eu.scape_project.hawarp.utils.RegexUtils;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;

/**
 * Command line entry point for ARC to WARC conversion.
 *
 * <p>The configured input is either a single file, which is migrated
 * directly, or a directory, which is traversed recursively and whose files
 * are migrated when their absolute path matches the configured input path
 * regex filter.
 *
 * @author Sven Schlarb <https://github.com/shsdev>
 */
public class Arc2WarcMigration {

    private static final Log LOG = LogFactory.getLog(Arc2WarcMigration.class);

    /**
     * Matches a trailing ".arc" or ".arc.gz" extension. Both dots are escaped
     * so that only literal dots are accepted.
     */
    private static final String ARC_EXTENSION_REGEX = "\\.arc(\\.gz)?$";

    /** File name fragment that triggers the additional second migration pass. */
    private static final String METADATA_MARKER = "-metadata-";

    /** Replacement for {@link #METADATA_MARKER} in the second pass output name. */
    private static final String DUPLICATIONS_MARKER = "-duplications-";

    /** Configuration shared by all migrations; assigned in {@link #main(String[])}. */
    private static Arc2WarcMigrationConfig config;

    public Arc2WarcMigration() {
    }

    /**
     * Returns the configuration created by {@link #main(String[])}.
     *
     * @return the current configuration, or {@code null} if {@code main} has
     * not been run yet
     */
    public static Arc2WarcMigrationConfig getConfig() {
        return config;
    }

    /**
     * Main entry point.
     *
     * <p>Hadoop generic options are consumed first; the remaining arguments
     * are parsed as the tool's own options. Without arguments, or with the
     * help option, the help exit is invoked instead of a migration.
     *
     * @param args command line arguments
     * @throws java.io.IOException declared by the generic options parsing
     * @throws org.apache.commons.cli.ParseException if the tool options
     * cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        Configuration conf = new Configuration();
        // Command line interface
        config = new Arc2WarcMigrationConfig();
        CommandLineParser cmdParser = new PosixParser();
        GenericOptionsParser gop = new GenericOptionsParser(conf, args);
        Arc2WarcMigrationOptions a2wopt = new Arc2WarcMigrationOptions();
        CommandLine cmd = cmdParser.parse(a2wopt.options, gop.getRemainingArgs());
        if ((args.length == 0) || (cmd.hasOption(a2wopt.HELP_OPT))) {
            a2wopt.exit("Help", 0);
            // NOTE(review): exit(...) presumably terminates the JVM; the
            // explicit return guarantees that no migration is attempted with
            // an uninitialised configuration if it ever does not.
            return;
        }
        a2wopt.initOptions(cmd, config);

        Arc2WarcMigration a2wm = new Arc2WarcMigration();
        long startMillis = System.currentTimeMillis();
        File input = new File(config.getInputStr());

        if (input.isDirectory()) {
            config.setDirectoryInput(true);
            a2wm.traverseDir(input);
        } else {
            // A single input file is migrated without applying the path filter.
            migrate(input);
        }
        long elapsedTimeMillis = System.currentTimeMillis() - startMillis;
        LOG.info("Processing time (sec): " + elapsedTimeMillis / 1000F);
        System.exit(0);
    }

    /**
     * Traverses the given directory structure item recursively and migrates
     * every file whose absolute path matches the configured input path regex
     * filter.
     *
     * <p>A directory whose content cannot be listed is logged and skipped.
     *
     * @param dirStructItem root directory, or a file reached during traversal
     */
    private void traverseDir(File dirStructItem) {
        if (dirStructItem.isDirectory()) {
            // listFiles() returns null if the directory cannot be read or an
            // I/O error occurs; iterating that would throw a NullPointerException.
            File[] children = dirStructItem.listFiles();
            if (children == null) {
                LOG.warn("Unable to list directory, skipping: " + dirStructItem.getAbsolutePath());
                return;
            }
            for (File child : children) {
                traverseDir(child);
            }
        } else {
            String filePath = dirStructItem.getAbsolutePath();
            if (RegexUtils.pathMatchesRegexFilter(filePath, config.getInputPathRegexFilter())) {
                migrate(dirStructItem);
            }
        }
    }

    /**
     * Migrates a single ARC file into the configured output directory.
     *
     * <p>The output file name is the input file name with a trailing ".arc" or
     * ".arc.gz" replaced by ".warc.gz" or ".warc", depending on whether a
     * compressed WARC is requested. A name without such an extension is kept
     * unchanged. If the input file name contains "-metadata-", a second
     * migration of the same input is run into a file whose name has
     * "-metadata-" replaced by "-duplications-".
     *
     * @param dirStructItem the ARC file to migrate
     */
    private static void migrate(File dirStructItem) {
        // First do the normal migration.
        String warcExtension = config.createCompressedWarc() ? ".warc.gz" : ".warc";
        String warcName = dirStructItem.getName().replaceAll(ARC_EXTENSION_REGEX, warcExtension);
        File output = new File(config.getOutputStr(), warcName);
        ArcMigrator arcMigrator = new ArcMigrator(config, dirStructItem, output, false);
        arcMigrator.migrateArcFile();

        if (dirStructItem.getName().contains(METADATA_MARKER)) {
            // Then make a special deduplication file from the same input; the
            // last constructor argument is true for this pass (presumably the
            // deduplication switch -- confirm against ArcMigrator).
            String duplicationsName = output.getName().replace(METADATA_MARKER, DUPLICATIONS_MARKER);
            output = new File(config.getOutputStr(), duplicationsName);
            arcMigrator = new ArcMigrator(config, dirStructItem, output, true);
            arcMigrator.migrateArcFile();
        }
    }
}