io.ecarf.core.compress.NTripleGzipProcessor.java Source code

Java tutorial

Introduction

Here is the source code for io.ecarf.core.compress.NTripleGzipProcessor.java

Source

/**
 * The contents of this file may be used under the terms of the Apache License, Version 2.0
 * in which case, the provisions of the Apache License Version 2.0 are applicable instead of those above.
 *
 * Copyright 2014, Ecarf.io
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.ecarf.core.compress;

import io.ecarf.core.utils.Constants;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.semanticweb.yars.nx.Node;
import org.semanticweb.yars.nx.parser.NxParser;

/**
 * Parses an N-Triples style input file (plain, gzip or bzip2 compressed, as
 * decided by the file name) and hands every triple to a
 * {@link NTripleGzipCallback}. Optionally writes the callback's results to a
 * gzip compressed output file.
 * 
 * <p>Text is decoded and encoded as UTF-8 regardless of the platform default
 * charset.
 * 
 * @author Omer Dawelbeit (omerio)
 *
 */
public class NTripleGzipProcessor {

    private static final Log log = LogFactory.getLog(NTripleGzipProcessor.class);

    /** Path of the file to read; its name decides which decompressor is used. */
    private String inputFile;

    /** Path of the gzip file written by {@link #process(NTripleGzipCallback)}. */
    private String outputFile;

    /**
     * Creates a processor for the given input file and derives the output file
     * name from it: the input name without its extension, followed by
     * {@code Constants.OUT_FILE_SUFFIX} and the original extension (the original
     * author's note describes the result as {@code inputfile_out.ext}).
     * 
     * @param inputFile path of the file to read
     */
    public NTripleGzipProcessor(String inputFile) {
        super();
        this.inputFile = inputFile;
        // split the name into "base" and "ext" so the suffix can be inserted between them
        String ext = FilenameUtils.getExtension(inputFile);
        String base = StringUtils.removeEnd(inputFile, "." + ext);
        this.outputFile = base + Constants.OUT_FILE_SUFFIX + ext;
    }

    /**
     * Reads the input file, decompressing it if needed, and calls the callback
     * for each triple. No output is produced and the value returned by the
     * callback is ignored.
     * 
     * @param callback receives each parsed triple
     * @throws IOException if the input file cannot be opened or read
     */
    public void read(NTripleGzipCallback callback) throws IOException {

        // The raw file stream is its own resource so that it is closed even when
        // wrapping it in a decompressor fails (e.g. corrupt gzip header).
        try (InputStream rawIn = new FileInputStream(this.inputFile);
                BufferedReader reader = this.openReader(rawIn)) {

            this.processTriples(reader, callback, null);
        }
    }

    /**
     * Reads the input file, decompressing it if needed, calls the callback for
     * each triple and writes every non-null line returned by the callback to the
     * gzip compressed output file. The writer is also handed to the callback via
     * {@code setOutput} before any triple is processed.
     * 
     * @param callback receives the output writer and each parsed triple
     * @return the path of the output file
     * @throws IOException if the input cannot be read or the output cannot be
     *         fully written
     */
    public String process(NTripleGzipCallback callback) throws IOException {

        // Input resources come first so a missing input does not create an output file.
        // Raw file streams are separate resources so they are closed even if a
        // wrapping (de)compressor constructor throws.
        try (InputStream rawIn = new FileInputStream(this.inputFile);
                BufferedReader reader = this.openReader(rawIn);
                FileOutputStream rawOut = new FileOutputStream(this.outputFile);
                PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(
                        new BufferedOutputStream(
                                new GZIPOutputStream(rawOut, Constants.GZIP_BUF_SIZE),
                                Constants.GZIP_BUF_SIZE),
                        StandardCharsets.UTF_8)))) {

            callback.setOutput(writer);

            this.processTriples(reader, callback, writer);

            // PrintWriter never throws on write failures; close it now (closing again
            // at the end of the try block is a no-op) so that flush and gzip trailer
            // errors are recorded, then surface any recorded failure.
            writer.close();
            if (writer.checkError()) {
                throw new IOException("Failed to write output file: " + this.outputFile);
            }

            return this.outputFile;
        }
    }

    /**
     * Wraps the raw input in the matching decompressor and a UTF-8 buffered reader.
     * 
     * @param rawIn the raw stream of the input file
     * @return a buffered reader over the decompressed text
     * @throws IOException if the decompressor cannot be initialised
     */
    private BufferedReader openReader(InputStream rawIn) throws IOException {
        return new BufferedReader(
                new InputStreamReader(this.getDeflatedStream(rawIn), StandardCharsets.UTF_8),
                Constants.GZIP_BUF_SIZE);
    }

    /**
     * Parses the reader and passes every three-node statement to the callback.
     * Statements with any other number of nodes (e.g. quads) are logged and skipped.
     * 
     * @param reader the decompressed text to parse
     * @param callback receives each triple
     * @param writer where non-null callback results are written, or {@code null}
     *        to discard them
     * @throws IOException if parsing or the callback fails with an I/O error
     */
    private void processTriples(BufferedReader reader, NTripleGzipCallback callback, PrintWriter writer)
            throws IOException {

        NxParser nxp = new NxParser(reader);

        while (nxp.hasNext()) {

            Node[] ns = nxp.next();

            // We are only interested in triples, no quads
            if (ns.length != 3) {
                // Arrays.toString so the log shows the nodes, not the array identity
                log.warn("Ignoring line: " + Arrays.toString(ns));
                continue;
            }

            String outLine = callback.process(ns);
            if ((writer != null) && (outLine != null)) {
                writer.println(outLine);
            }
        }
    }

    /**
     * Returns a stream that yields the uncompressed content of the provided input.
     * The compression format is chosen from the name of the input file: gzip,
     * bzip2, or none (the input is returned unchanged).
     * 
     * @param input the raw stream of the input file
     * @return the input, wrapped in a decompressing stream when the file name
     *         indicates gzip or bzip2
     * @throws IOException if the decompressing stream cannot be created
     */
    private InputStream getDeflatedStream(InputStream input) throws IOException {

        // gzip
        if (GzipUtils.isCompressedFilename(this.inputFile)) {
            return new GZIPInputStream(input, Constants.GZIP_BUF_SIZE);
        }

        // bz2
        if (BZip2Utils.isCompressedFilename(this.inputFile)) {
            return new BZip2CompressorInputStream(new BufferedInputStream(input));
        }

        // not compressed
        return input;
    }

    /**
     * Changes the file to read. The output file name computed by the constructor
     * is not updated.
     * NOTE(review): confirm that keeping the old output file name is intended.
     * 
     * @param inputFile the inputFile to set
     */
    public void setInputFile(String inputFile) {
        this.inputFile = inputFile;
    }

}