edu.jhu.hlt.concrete.ingesters.webposts.WebPostIngesterRunner.java Source code

Java tutorial

Introduction

Here is the source code for edu.jhu.hlt.concrete.ingesters.webposts.WebPostIngesterRunner.java

Source

/*
 * Copyright 2012-2016 Johns Hopkins University HLTCOE. All rights reserved.
 * See LICENSE in the project root directory.
 */
package edu.jhu.hlt.concrete.ingesters.webposts;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.ParametersDelegate;

import edu.jhu.hlt.acute.archivers.tar.TarArchiver;
import edu.jhu.hlt.concrete.Communication;
import edu.jhu.hlt.concrete.ingesters.base.IngestException;
import edu.jhu.hlt.concrete.ingesters.base.IngesterParameterDelegate;
import edu.jhu.hlt.concrete.serialization.archiver.ArchivableCommunication;
import edu.jhu.hlt.utilt.ex.LoggedUncaughtExceptionHandler;
import edu.jhu.hlt.utilt.io.ExistingNonDirectoryFile;
import edu.jhu.hlt.utilt.io.NotFileException;

/**
 * Command-line runner for bulk conversion of web post documents found in
 * various LDC corpora.
 *
 * <p>
 * Each input path is converted to a {@link Communication} via
 * {@link WebPostIngester} and appended to a single gzipped tar archive named
 * {@code webposts.tar.gz} inside the configured output directory.
 *
 * @see #main(String...)
 */
public class WebPostIngesterRunner {

    private static final Logger LOGGER = LoggerFactory.getLogger(WebPostIngesterRunner.class);

    /** Name of the archive created inside the output directory. */
    private static final String OUTPUT_FILE_NAME = "webposts.tar.gz";

    /** Holds the parsed command-line options (output path, overwrite flag, input paths, help). */
    @ParametersDelegate
    private IngesterParameterDelegate delegate = new IngesterParameterDelegate();

    /**
     * Parses the command line and ingests every supplied path into a single
     * {@code webposts.tar.gz} archive under the configured output path.
     *
     * <p>
     * If help was requested, usage is printed and nothing else is done. If
     * the archive already exists, it is deleted when overwrite is enabled;
     * otherwise the run is skipped. Files that fail with an
     * {@link IngestException} are logged and skipped; an {@link IOException}
     * or {@link NotFileException} is logged and ends the run.
     *
     * @param args
     *          command-line arguments, parsed by JCommander into the
     *          {@link IngesterParameterDelegate}
     */
    public static void main(String... args) {
        Thread.setDefaultUncaughtExceptionHandler(new LoggedUncaughtExceptionHandler());
        WebPostIngesterRunner run = new WebPostIngesterRunner();
        JCommander jc = new JCommander(run, args);
        jc.setProgramName(WebPostIngesterRunner.class.getSimpleName());
        if (run.delegate.help) {
            jc.usage();
            // A help request must not fall through into the ingest below,
            // which could delete or overwrite an existing archive.
            return;
        }

        try {
            Path outpath = Paths.get(run.delegate.outputPath);
            IngesterParameterDelegate.prepare(outpath);
            WebPostIngester ing = new WebPostIngester();
            Path outWithExt = outpath.resolve(OUTPUT_FILE_NAME);

            if (Files.exists(outWithExt)) {
                if (!run.delegate.overwrite) {
                    LOGGER.info("File: {} exists and overwrite disabled. Not running.", outWithExt);
                    return;
                }
                Files.delete(outWithExt);
            }

            // Resources are closed in reverse order: tar archiver, gzip stream, file stream.
            try (OutputStream os = Files.newOutputStream(outWithExt);
                    GzipCompressorOutputStream gout = new GzipCompressorOutputStream(os);
                    TarArchiver arch = new TarArchiver(gout)) {
                for (String pstr : run.delegate.paths) {
                    LOGGER.debug("Running on file: {}", pstr);
                    Path p = Paths.get(pstr);
                    // Constructed only for validation; presumably throws
                    // NotFileException when p is not an existing regular
                    // file, which ends the run via the outer catch.
                    new ExistingNonDirectoryFile(p);
                    try {
                        Communication next = ing.fromCharacterBasedFile(p);
                        arch.addEntry(new ArchivableCommunication(next));
                    } catch (IngestException e) {
                        // Best effort: one bad document should not abort the batch.
                        LOGGER.error("Error processing file: {}", pstr, e);
                    }
                }
            }
        } catch (NotFileException | IOException e) {
            LOGGER.error("Caught exception processing.", e);
        }
    }
}