io.github.thammegowda.Local2SeqFile.java Source code

Java tutorial

Introduction

Here is the source code for io.github.thammegowda.Local2SeqFile.java

Source

package io.github.thammegowda;
/*
 * Copyright 2017 Thamme Gowda
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 */

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.Text;
import org.apache.tika.io.IOUtils;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;

import java.io.*;
import java.util.Iterator;

/**
 * This tool creates a large sequence file by combining many small files on the local file system.
 * This tool offers a command-line interface.
 */
public class Local2SeqFile {

    @Option(name = "--in", aliases = "-in", required = true, usage = "Path to input on local file system. "
            + "This could be path to a parent directory or a file having list of paths.")
    private String input;

    @Option(name = "--out", aliases = "-out", required = true, usage = "Path to output sequence file")
    private String output;

    @Option(name = "--min-size", aliases = "-min", usage = "Files having fewer number of bytes than this number will be skipped.")
    private long minFileSize = 1;

    @Option(name = "--max-size", aliases = "-max", usage = "Files having more bytes than this number will be skipped. "
            + "Note: the value type of sequence file is hadoop.io.BytesWritable, "
            + "meaning that the content will be held in memory. "
            + "Thus, setting it to a large value could cause memory overflow")
    private long maxFileSize = 64 * 1024 * 1024;

    /** File system the input files are read from. */
    private final FileSystem localFs;
    /** Default (configured) file system, used to check whether the output already exists. */
    private final FileSystem distribFs;

    /**
     * Creates a copier bound to the local file system (for input) and to the
     * default file system of a fresh {@link Configuration} (for output).
     *
     * @throws IOException if either file system cannot be obtained
     */
    public Local2SeqFile() throws IOException {
        Configuration config = new Configuration();
        // FileSystem.get(conf) is a static method: invoking it through LocalFileSystem or
        // DistributedFileSystem still returns the *default* file system. The local file
        // system therefore has to be requested explicitly.
        localFs = FileSystem.getLocal(config);
        distribFs = FileSystem.get(config);
    }

    /**
     * Iterates over the files named in a text stream, one path per line.
     * Lines are trimmed and blank lines are ignored. The underlying reader is
     * closed automatically once the end of the stream is reached, or earlier
     * via {@link #close()}.
     */
    private static class FileListIterator implements RemoteIterator<FileStatus>, Iterator<FileStatus>, Closeable {

        private final FileSystem fs;
        private BufferedReader reader;
        /** Look-ahead element; {@code null} once the list is exhausted. */
        private FileStatus next;

        /**
         * @param content stream holding the list of paths; owned (and eventually closed) by this iterator
         * @param fs      file system used to resolve each listed path to a {@link FileStatus}
         * @throws RuntimeException wrapping the {@link IOException} if the first entry cannot be read or resolved
         */
        public FileListIterator(InputStream content, FileSystem fs) {
            // fs must be assigned before the first look-ahead, because getNext() uses it.
            this.fs = fs;
            this.reader = new BufferedReader(new InputStreamReader(content));
            this.next = getNext();
        }

        /**
         * Reads ahead to the next non-blank line and resolves it to a file status.
         *
         * @return status of the next listed path, or {@code null} at end of stream
         * @throws IllegalStateException if the reader has already been closed
         * @throws RuntimeException      wrapping any {@link IOException} raised while reading or resolving
         */
        private FileStatus getNext() {
            if (reader == null) {
                throw new IllegalStateException("Reader already closed");
            }
            try {
                String line;
                // Loop (rather than recurse) over blank lines so that a long run of
                // them cannot overflow the stack.
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    if (!line.isEmpty()) {
                        return fs.getFileStatus(new Path(line));
                    }
                }
                close(); // end of stream
                return null;
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        /** Closes the underlying reader; calling it more than once is harmless. */
        @Override
        public void close() throws IOException {
            if (reader != null) {
                reader.close();
            }
            reader = null;
        }

        @Override
        public boolean hasNext() {
            return next != null;
        }

        /**
         * @return the next file status in the list
         * @throws java.util.NoSuchElementException if the list is exhausted
         */
        @Override
        public FileStatus next() {
            if (next == null) {
                throw new java.util.NoSuchElementException("No more entries in the file list");
            }
            FileStatus current = next;
            next = getNext();
            return current;
        }
    }

    /**
     * Resolves the {@code --in} argument to an iterator of candidate files.
     * A regular file is treated as a list of paths (one per line); a directory
     * is listed recursively.
     *
     * @return iterator over the input files; if it is {@link Closeable} the consumer should close it
     * @throws IllegalArgumentException if the input path does not exist
     * @throws IOException              if the input cannot be opened or listed
     */
    private RemoteIterator<? extends FileStatus> readInput() throws IOException {
        Path inPath = new Path(input);
        if (!localFs.exists(inPath)) {
            throw new IllegalArgumentException("File " + input + " doesnt exists");
        }
        if (localFs.isFile(inPath)) {
            // The stream must outlive this method: the iterator reads from it lazily and
            // closes it itself at end of stream. Only close here if construction fails.
            FSDataInputStream content = localFs.open(inPath);
            try {
                return new FileListIterator(content, localFs);
            } catch (RuntimeException e) {
                try {
                    content.close();
                } catch (IOException closeFailure) {
                    e.addSuppressed(closeFailure);
                }
                throw e;
            }
        } else if (localFs.isDirectory(inPath)) {
            return localFs.listFiles(inPath, true);
        } else {
            throw new RuntimeException("Unknown input file type");
        }
    }

    /**
     * @param file candidate entry
     * @return {@code true} if the entry is a regular file whose length lies within
     *         the configured {@code [minFileSize, maxFileSize]} range
     */
    private boolean filter(FileStatus file) {
        return file.isFile() && file.getLen() >= minFileSize && file.getLen() <= maxFileSize;
    }

    /**
     * Copies every accepted input file into the output sequence file, using the
     * file path as key and the raw file content as value, then prints the number
     * of copied and skipped files.
     *
     * @param input iterator over candidate files; closed on return if it is {@link Closeable}
     * @throws IllegalArgumentException if the output path already exists
     * @throws IOException              if reading an input file or writing the output fails
     */
    private void writeOutput(RemoteIterator<? extends FileStatus> input) throws IOException {
        Path outPath = new Path(output);
        try {
            if (distribFs.exists(outPath)) {
                throw new IllegalArgumentException("Output file already exists, Not overwriting it:" + output);
            }

            Text key = new Text();
            BytesWritable value = new BytesWritable();
            long skipped = 0;
            long copied = 0;
            // try-with-resources guarantees the writer is closed even when a copy fails midway.
            try (Writer writer = SequenceFile.createWriter(distribFs.getConf(), Writer.file(outPath),
                    Writer.keyClass(Text.class), Writer.valueClass(BytesWritable.class),
                    Writer.compression(SequenceFile.CompressionType.RECORD))) {
                while (input.hasNext()) {
                    FileStatus next = input.next();
                    if (!filter(next)) {
                        skipped++;
                        continue;
                    }
                    key.set(next.getPath().toString());
                    byte[] bytes;
                    // Close each input stream right after reading it; otherwise one file
                    // descriptor is leaked per copied file.
                    try (FSDataInputStream stream = localFs.open(next.getPath())) {
                        //CAUTION : this could cause memory overflow
                        bytes = IOUtils.toByteArray(stream);
                    }
                    value.set(bytes, 0, bytes.length);
                    writer.append(key, value);
                    copied++;
                }
            }
            System.out.println("Files copied ::" + copied);
            System.out.println("Files skipped ::" + skipped);
        } finally {
            // Release the list-file reader even if the copy was aborted by an exception.
            if (input instanceof Closeable) {
                ((Closeable) input).close();
            }
        }
    }

    /**
     * Command-line entry point. Prints the parse error and the usage text when
     * the arguments are invalid; otherwise runs the copy.
     *
     * @param args command-line arguments (see the {@code @Option} fields)
     * @throws CmdLineException declared for compatibility; parse errors are handled internally
     * @throws IOException      if reading the input or writing the output fails
     */
    public static void main(String[] args) throws CmdLineException, IOException {
        Local2SeqFile copier = new Local2SeqFile();
        CmdLineParser parser = new CmdLineParser(copier);
        try {
            parser.parseArgument(args);
        } catch (CmdLineException e) {
            System.out.println(e.getMessage());
            parser.printUsage(System.out);
            return;
        }
        copier.writeOutput(copier.readInput());
    }
}