org.wikidata.wdtk.client.DumpProcessingOutputAction.java Source code

Java tutorial

Introduction

Here is the source code for org.wikidata.wdtk.client.DumpProcessingOutputAction.java

Source

package org.wikidata.wdtk.client;

import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.CountDownLatch;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipParameters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikidata.wdtk.datamodel.interfaces.Sites;
import org.wikidata.wdtk.util.DirectoryManager;
import org.wikidata.wdtk.util.DirectoryManagerFactory;

/*
 * #%L
 * Wikidata Toolkit Examples
 * %%
 * Copyright (C) 2014 Wikidata Toolkit Developers
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

/**
 * Abstract implementation of {@link DumpProcessingAction} that represents
 * actions which generate outputs (in the form of files). It supports options
 * {@link #OPTION_DESTINATION}, {@link #OPTION_COMPRESSION}, and
 * {@link #OPTION_USE_STDOUT}. Moreover, it provides some helper functions for
 * opening files for writing.
 * 
 * @author Michael Guenther
 * @author Markus Kroetzsch
 */
public abstract class DumpProcessingOutputAction implements DumpProcessingAction {

    static final Logger logger = LoggerFactory.getLogger(DumpProcessingOutputAction.class);

    /**
     * Name of the option used to define that output be compressed in a
     * particular way. By default, output is not compressed. When using
     * compression, the output file will be extended with a suitable file
     * extension.
     */
    public static final String OPTION_COMPRESSION = "compression";
    /**
     * Name of the option to set the destination file name (may include path)
     * for outputs generated by this action. If not given, actions should pick a
     * default output file.
     */
    public static final String OPTION_DESTINATION = "output";
    /**
     * Name of the option to redirect output to stdout. This is achieved by
     * setting the value to the string "true" or by giving the option without
     * a value (null). Any other value disables the redirection.
     */
    public static final String OPTION_USE_STDOUT = "stdout";

    /** Compression type (and file extension) for bzip2 output. */
    public static final String COMPRESS_BZ2 = "bz2";
    /** Compression type (and file extension) for gzip output. */
    public static final String COMPRESS_GZIP = "gz";
    /** Compression type that denotes uncompressed output. */
    public static final String COMPRESS_NONE = "";

    /**
     * Size in bytes of the buffer placed in front of the output file.
     */
    private static final int FILE_BUFFER_SIZE = 1024 * 1024 * 5;

    /**
     * Size in bytes of the pipe (and of the copy buffer) used to hand data to
     * the asynchronous writer thread.
     */
    private static final int PIPE_BUFFER_SIZE = 1024 * 1024 * 10;

    /**
     * Output streams that were created by this class. If close is called, it
     * will close all of them properly.
     * 
     */
    protected Set<Closeable> outputStreams = new HashSet<>();

    /**
     * The name of the action.
     */
    protected String name;

    /**
     * The {@link Sites} object if provided.
     * 
     * @see #needsSites()
     */
    protected Sites sites;

    /**
     * For operations that generate output, this is the name of the output file,
     * or null to use the default file name for the given options.
     */
    protected String outputDestination = null;

    /**
     * Specifies if output should be redirected to stdout rather than being
     * written to a file.
     */
    protected boolean useStdOut = false;

    /**
     * String to indicate the output compression to be used, if any.
     */
    protected String compressionType = COMPRESS_NONE;

    /**
     * Date stamp of the dump to be processed.
     */
    protected String dateStamp = "UNKNOWN";
    /**
     * String name of the site that the processed dump file comes from.
     */
    protected String project = "UNKNOWN";

    @Override
    public void setSites(Sites sites) {
        this.sites = sites;
    }

    /**
     * Handles the options {@link #OPTION_DESTINATION},
     * {@link #OPTION_COMPRESSION} and {@link #OPTION_USE_STDOUT}.
     * 
     * @param option
     *            the name of the option
     * @param value
     *            the value of the option; a null compression value selects
     *            {@link #COMPRESS_NONE}, a null stdout value enables stdout
     * @return true if the option was recognized and stored, false otherwise
     */
    @Override
    public boolean setOption(String option, String value) {
        switch (option) {
        case OPTION_DESTINATION:
            this.outputDestination = value;
            return true;
        case OPTION_COMPRESSION:
            // Locale.ROOT keeps the result independent of the default
            // locale (e.g. "GZIP" must not become "gz\u0131p" in Turkish).
            this.compressionType = (value == null) ? COMPRESS_NONE : value.toLowerCase(Locale.ROOT);
            return true;
        case OPTION_USE_STDOUT:
            this.useStdOut = (value == null) || "true".equals(value);
            return true;
        default:
            return false;
        }
    }

    @Override
    public boolean useStdOut() {
        return this.useStdOut;
    }

    @Override
    public void setDumpInformation(String project, String dateStamp) {
        this.project = project;
        this.dateStamp = dateStamp;
    }

    @Override
    public void setActionName(String name) {
        this.name = name;
    }

    @Override
    public String getActionName() {
        if (this.name != null) {
            return this.name;
        } else {
            return getDefaultActionName();
        }
    }

    /**
     * Returns the output file name for this action: the configured output
     * destination with dump information inserted, or "unnamed.out" if no
     * destination was set.
     * 
     * @return the output file name
     */
    public String getOutputFilename() {
        if (this.outputDestination == null) {
            return "unnamed.out";
        } else {
            return insertDumpInformation(this.outputDestination);
        }
    }

    /**
     * Replaces the placeholders "{DATE}" and "{PROJECT}" in the given pattern
     * by the date stamp and project name of the current dump.
     * 
     * @param pattern
     *            the string that may contain the placeholders
     * @return the pattern with all placeholders replaced
     */
    public String insertDumpInformation(String pattern) {
        return pattern.replace("{DATE}", this.dateStamp).replace("{PROJECT}", this.project);
    }

    /**
     * Closes all resources registered in {@link #outputStreams} and forgets
     * them afterwards, so that calling this method again has no effect on
     * resources that were already released.
     */
    @Override
    public void close() {
        for (Closeable closeable : this.outputStreams) {
            DumpProcessingOutputAction.close(closeable);
        }
        this.outputStreams.clear();
    }

    /**
     * Creates a compressing {@link OutputStream}. The result is owned by the
     * caller and should be closed later. Nevertheless, the {@link #close()}
     * method of this class must also be called, since it may free additional
     * resources created. Streams for uncompressed output are returned directly
     * and are not registered with this class.
     * 
     * @param useStdOut
     *            if true, {@link System#out} is returned and the other
     *            parameters are ignored
     * @param filePath
     *            the string name of the output file, possibly including path
     *            information
     * @param compressionType
     *            a string that refers to a type of output compression or the
     *            empty string (no compression); a suitable file extension will
     *            be added to the output file
     * 
     * @return compressing {@link OutputStream}
     * @throws IOException
     *             if there were problems opening the required streams
     * @throws IllegalArgumentException
     *             if the compression type is not supported; no file is opened
     *             in this case
     */
    protected OutputStream getOutputStream(boolean useStdOut, String filePath, String compressionType)
            throws IOException {
        if (useStdOut) {
            return System.out;
        }

        // Validate before touching the file system so that an unsupported
        // format does not leave a stray output file behind.
        checkCompressionType(compressionType);

        String fullPath = compressionType.isEmpty() ? filePath : filePath + "." + compressionType;
        Path outputPath = Paths.get(fullPath);
        Path outputDirectory = outputPath.getParent();
        if (outputDirectory == null) {
            outputDirectory = Paths.get(".");
        }

        DirectoryManager dm = DirectoryManagerFactory.createDirectoryManager(outputDirectory);
        OutputStream out = dm.getOutputStreamForFile(outputPath.getFileName().toString());
        OutputStream bufferedFileOutputStream = new BufferedOutputStream(out, FILE_BUFFER_SIZE);

        try {
            switch (compressionType) {
            case COMPRESS_BZ2:
                return getAsynchronousOutputStream(new BZip2CompressorOutputStream(bufferedFileOutputStream));
            case COMPRESS_GZIP:
                GzipParameters gzipParameters = new GzipParameters();
                gzipParameters.setCompressionLevel(7);
                return getAsynchronousOutputStream(
                        new GzipCompressorOutputStream(bufferedFileOutputStream, gzipParameters));
            case COMPRESS_NONE:
            default:
                // Only COMPRESS_NONE can reach this point after validation.
                return bufferedFileOutputStream;
            }
        } catch (IOException | RuntimeException e) {
            // Setting up the compressor or the pipe failed: release the file
            // handle instead of leaking it.
            close(bufferedFileOutputStream);
            throw e;
        }
    }

    /**
     * Ensures that the given compression type is one of the supported values.
     * 
     * @param compressionType
     *            the compression type to check
     * @throws IllegalArgumentException
     *             if the compression type is not supported
     */
    private static void checkCompressionType(String compressionType) {
        switch (compressionType) {
        case COMPRESS_BZ2:
        case COMPRESS_GZIP:
        case COMPRESS_NONE:
            return;
        default:
            throw new IllegalArgumentException("Unsupported compression format: " + compressionType);
        }
    }

    /**
     * Simple interface for a Runnable that can be stopped gracefully by calling
     * a method {@link FinishableRunnable#finish()}.
     * 
     * @author Markus Kroetzsch
     * 
     */
    protected interface FinishableRunnable extends Runnable {

        /**
         * Finishes the current operation gracefully. The method will wait until
         * the thread has really finished.
         */
        void finish();
    }

    /**
     * Creates a separate thread for writing into the given output stream and
     * returns a pipe output stream that can be used to pass data to this
     * thread. A handle is registered in {@link #outputStreams} so that
     * {@link #close()} closes the pipe, lets the thread write all remaining
     * data, and waits for the thread to terminate.
     * <p>
     * This code is inspired by
     * http://stackoverflow.com/questions/12532073/gzipoutputstream
     * -that-does-its-compression-in-a-separate-thread
     * 
     * @param outputStream
     *            the stream to write to in the thread; it is closed by the
     *            thread when it terminates
     * @return a new stream that data should be written to
     * @throws IOException
     *             if the pipes could not be created for some reason
     */
    protected OutputStream getAsynchronousOutputStream(final OutputStream outputStream) throws IOException {
        final PipedOutputStream pos = new PipedOutputStream();
        final PipedInputStream pis = new PipedInputStream(pos, PIPE_BUFFER_SIZE);

        final FinishableRunnable run = new FinishableRunnable() {

            private volatile boolean finish = false;
            // Released by run() once all resources have been closed.
            private final CountDownLatch done = new CountDownLatch(1);

            @Override
            public void finish() {
                this.finish = true;
                // Block (instead of spinning) until the writer thread is
                // done. Interrupts are deferred so that the contract "waits
                // until the thread has really finished" still holds.
                boolean interrupted = false;
                try {
                    while (true) {
                        try {
                            this.done.await();
                            return;
                        } catch (InterruptedException e) {
                            interrupted = true;
                        }
                    }
                } finally {
                    if (interrupted) {
                        Thread.currentThread().interrupt();
                    }
                }
            }

            @Override
            public void run() {
                try {
                    byte[] bytes = new byte[PIPE_BUFFER_SIZE];
                    // Note that we finish really gently here, writing all data
                    // that is still in the input first (in theory, new data
                    // could arrive asynchronously, so that the thread never
                    // finishes, but this is not the intended mode of
                    // operation).
                    for (int len; (!this.finish || pis.available() > 0) && (len = pis.read(bytes)) > 0;) {
                        outputStream.write(bytes, 0, len);
                    }
                } catch (IOException e) {
                    logger.error("Failed to write asynchronous output: " + e.getMessage(), e);
                } finally {
                    DumpProcessingOutputAction.close(pis);
                    DumpProcessingOutputAction.close(outputStream);
                    this.done.countDown();
                }
            }
        };

        new Thread(run, "async-output-stream").start();

        this.outputStreams.add(new Closeable() {
            @Override
            public void close() throws IOException {
                try {
                    // Signal end-of-stream to the writer thread. Without this
                    // the thread may stay blocked in read() forever if the
                    // caller did not close the pipe. Closing an already
                    // closed pipe is harmless.
                    pos.close();
                } finally {
                    run.finish();
                }
            }
        });

        return pos;
    }

    /**
     * Closes a Closeable and logs (rather than propagates) any exception that
     * might occur in the process.
     * 
     * @param closeable
     *            the resource to close; may be null, in which case nothing
     *            happens
     */
    private static void close(Closeable closeable) {
        if (closeable != null) {
            try {
                closeable.close();
            } catch (IOException e) {
                logger.error("Failed to close output stream: " + e.getMessage(), e);
            }
        }
    }

}