org.apache.parquet.tools.command.MergeCommand.java Source code

Introduction

Here is the source code for org.apache.parquet.tools.command.MergeCommand.java, the class behind the merge command of Apache parquet-tools. The command concatenates several Parquet files into one output file without rewriting any data: it merges the input footers into a single schema plus key/value metadata, then appends each input's row groups verbatim. Because row groups are concatenated rather than combined, merging many small files still produces small row groups, which usually hurts query performance.
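At its core the command is just a few parquet-hadoop calls: merge the footers, then append each input file's row groups to the output. The sketch below is a minimal standalone equivalent of the execute method, not part of the original source; the class name MergeSketch and the file paths are placeholders, and it assumes the same parquet-hadoop dependency is on the classpath.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

import java.util.Arrays;
import java.util.List;

public class MergeSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder paths; in the real command these come from the CLI arguments.
        List<Path> inputs = Arrays.asList(
                new Path("part-00000.parquet"), new Path("part-00001.parquet"));
        Path output = new Path("merged.parquet");

        // Merge the footers of all inputs into one schema + key/value metadata.
        FileMetaData merged =
                ParquetFileWriter.mergeMetadataFiles(inputs, conf).getFileMetaData();

        // Copy each input's row groups into the output file as-is;
        // no rows are decoded or rewritten.
        ParquetFileWriter writer = new ParquetFileWriter(
                conf, merged.getSchema(), output, ParquetFileWriter.Mode.CREATE);
        writer.start();
        for (Path input : inputs) {
            writer.appendFile(HadoopInputFile.fromPath(input, conf));
        }
        writer.end(merged.getKeyValueMetaData());
    }
}

Note that appendFile copies row groups without decoding them, which is why the merge is fast but leaves small row groups intact.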

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.tools.command;

import org.apache.commons.cli.CommandLine;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.hadoop.util.HiddenFileFilter;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.tools.Main;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;

public class MergeCommand extends ArgsOnlyCommand {
    public static final String[] USAGE = new String[] { "<input> [<input> ...] <output>",
            "where <input> is a source parquet file or a directory of parquet files to be merged",
            "   <output> is the destination parquet file" };

    /**
     * Maximum number of input files that can be merged at once.
     */
    private static final int MAX_FILE_NUM = 100;
    private static final long TOO_SMALL_FILE_THRESHOLD = 64 * 1024 * 1024;

    private Configuration conf;

    public MergeCommand() {
        super(2, MAX_FILE_NUM + 1);

        conf = new Configuration();
    }

    @Override
    public String[] getUsageDescription() {
        return USAGE;
    }

    @Override
    public String getCommandDescription() {
        return "Merges multiple Parquet files into one. "
                + "The command doesn't merge row groups, just places one after the other. "
                + "When used to merge many small files, the resulting file will still contain small row groups, "
                + "which usually leads to bad query performance.";
    }

    @Override
    public void execute(CommandLine options) throws Exception {
        // Prepare arguments
        List<String> args = options.getArgList();
        List<Path> inputFiles = getInputFiles(args.subList(0, args.size() - 1));
        Path outputFile = new Path(args.get(args.size() - 1));

        // Merge schema and extraMeta
        FileMetaData mergedMeta = mergedMetadata(inputFiles);
        PrintWriter out = new PrintWriter(Main.out, true);

        // Merge data
        ParquetFileWriter writer = new ParquetFileWriter(conf, mergedMeta.getSchema(), outputFile,
                ParquetFileWriter.Mode.CREATE);
        writer.start();
        boolean tooSmallFilesMerged = false;
        for (Path input : inputFiles) {
            if (input.getFileSystem(conf).getFileStatus(input).getLen() < TOO_SMALL_FILE_THRESHOLD) {
                out.format("Warning: file %s is too small, length: %d\n", input,
                        input.getFileSystem(conf).getFileStatus(input).getLen());
                tooSmallFilesMerged = true;
            }

            writer.appendFile(HadoopInputFile.fromPath(input, conf));
        }

        if (tooSmallFilesMerged) {
            out.println("Warning: you merged too small files. "
                    + "Although the size of the merged file is bigger, it STILL contains small row groups, thus you don't have the advantage of big row groups, "
                    + "which usually leads to bad query performance!");
        }
        writer.end(mergedMeta.getKeyValueMetaData());
    }

    private FileMetaData mergedMetadata(List<Path> inputFiles) throws IOException {
        return ParquetFileWriter.mergeMetadataFiles(inputFiles, conf).getFileMetaData();
    }

    /**
     * Resolve all input files.
     * @param input input files, or a single directory containing them.
     * @return ordered input files.
     */
    private List<Path> getInputFiles(List<String> input) throws IOException {
        List<Path> inputFiles = null;

        if (input.size() == 1) {
            Path p = new Path(input.get(0));
            FileSystem fs = p.getFileSystem(conf);
            FileStatus status = fs.getFileStatus(p);

            if (status.isDir()) {
                inputFiles = getInputFilesFromDirectory(status);
            }
        } else {
            inputFiles = parseInputFiles(input);
        }

        checkParquetFiles(inputFiles);

        return inputFiles;
    }

    /**
     * Basic sanity checks on the input files.
     * ParquetFileReader will throw an exception later when it reads an illegal parquet file.
     *
     * @param inputFiles files to be merged.
     * @throws IOException
     */
    private void checkParquetFiles(List<Path> inputFiles) throws IOException {
        if (inputFiles == null || inputFiles.size() <= 1) {
            throw new IllegalArgumentException("Not enough files to merge");
        }

        for (Path inputFile : inputFiles) {
            FileSystem fs = inputFile.getFileSystem(conf);
            FileStatus status = fs.getFileStatus(inputFile);

            if (status.isDir()) {
                throw new IllegalArgumentException("Illegal parquet file: " + inputFile.toUri());
            }
        }
    }

    /**
     * Get all parquet files under partition directory.
     * @param partitionDir partition directory.
     * @return parquet files to be merged.
     */
    private List<Path> getInputFilesFromDirectory(FileStatus partitionDir) throws IOException {
        FileSystem fs = partitionDir.getPath().getFileSystem(conf);
        FileStatus[] inputFiles = fs.listStatus(partitionDir.getPath(), HiddenFileFilter.INSTANCE);

        List<Path> input = new ArrayList<Path>();
        for (FileStatus f : inputFiles) {
            input.add(f.getPath());
        }
        return input;
    }

    private List<Path> parseInputFiles(List<String> input) {
        List<Path> inputFiles = new ArrayList<Path>();

        for (String name : input) {
            inputFiles.add(new Path(name));
        }

        return inputFiles;
    }
}
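
As getCommandDescription warns, the merged file keeps every input's row groups unchanged. One way to confirm this is to count the row groups in the output footer. The snippet below is an illustrative check, not part of the original source; the class name RowGroupCount and the merged.parquet path are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class RowGroupCount {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path merged = new Path("merged.parquet"); // placeholder path

        // Open the merged file and inspect its footer: each input file's
        // row groups appear unchanged, so merging N single-row-group files
        // yields N row groups, not one.
        try (ParquetFileReader reader =
                ParquetFileReader.open(HadoopInputFile.fromPath(merged, conf))) {
            int i = 0;
            for (BlockMetaData block : reader.getFooter().getBlocks()) {
                System.out.printf("row group %d: %d rows, %d bytes%n",
                        i++, block.getRowCount(), block.getTotalByteSize());
            }
        }
    }
}

If each input holds one small row group, the merged file holds the same small row groups, which is exactly the condition the command's warning message describes.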