org.apache.accumulo.core.file.rfile.SplitLarge.java Source code

Introduction

Here is the source code for org.apache.accumulo.core.file.rfile.SplitLarge.java, a command-line utility from Apache Accumulo that splits an RFile into two RFiles: one holding the small key/value pairs and one holding the large ones.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.core.file.rfile;

import java.util.ArrayList;

import org.apache.accumulo.core.conf.DefaultConfiguration;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.ByteSequence;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile;
import org.apache.accumulo.core.file.rfile.RFile.Reader;
import org.apache.accumulo.core.file.rfile.RFile.Writer;
import org.apache.accumulo.core.util.CachedConfiguration;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Split an RFile into two RFiles: one holding key/value pairs smaller than a
 * configurable size threshold ("_small.rf") and one holding the rest ("_large.rf").
 */
public class SplitLarge {

    public static void main(String[] args) throws Exception {
        Configuration conf = CachedConfiguration.getInstance();
        FileSystem fs = FileSystem.get(conf);
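        // Default threshold: pairs whose combined key/value size is 10 MB or more go to the large file.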
        long maxSize = 10 * 1024 * 1024;

        Options opts = new Options();
        Option maxSizeOption = new Option("m", "", true,
                "the maximum size of the key/value pair to shunt to the small file");
        opts.addOption(maxSizeOption);

        CommandLine commandLine = new BasicParser().parse(opts, args);
        if (commandLine.hasOption(maxSizeOption.getOpt())) {
            maxSize = Long.parseLong(commandLine.getOptionValue(maxSizeOption.getOpt()));
        }

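        // Each remaining command-line argument names an RFile to split into <name>_small.rf and <name>_large.rf.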
        for (String arg : commandLine.getArgs()) {
            Path path = new Path(arg);
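            // Open the input RFile (no block caches are supplied).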
            CachableBlockFile.Reader rdr = new CachableBlockFile.Reader(fs, path, conf, null, null);
            Reader iter = new RFile.Reader(rdr);

            if (!arg.endsWith(".rf")) {
                throw new IllegalArgumentException("File must end with .rf");
            }
            String smallName = arg.substring(0, arg.length() - 3) + "_small.rf";
            String largeName = arg.substring(0, arg.length() - 3) + "_large.rf";

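            // Create the two output RFiles with gzip compression and the default table block size.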
            int blockSize = (int) DefaultConfiguration.getDefaultConfiguration()
                    .getMemoryInBytes(Property.TABLE_FILE_BLOCK_SIZE);
            Writer small = new RFile.Writer(new CachableBlockFile.Writer(fs, new Path(smallName), "gz", conf),
                    blockSize);
            small.startDefaultLocalityGroup();
            Writer large = new RFile.Writer(new CachableBlockFile.Writer(fs, new Path(largeName), "gz", conf),
                    blockSize);
            large.startDefaultLocalityGroup();

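            // Scan the entire file and route each pair by the combined size of its key and value.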
            iter.seek(new Range(), new ArrayList<ByteSequence>(), false);
            while (iter.hasTop()) {
                Key key = iter.getTopKey();
                Value value = iter.getTopValue();
                if (key.getSize() + value.getSize() < maxSize) {
                    small.append(key, value);
                } else {
                    large.append(key, value);
                }
                iter.next();
            }

            iter.close();
            large.close();
            small.close();
        }
    }

}
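
Usage

A minimal sketch of driving the splitter from another Java program by calling its main method directly, assuming the Accumulo core and Hadoop jars are on the classpath. The input path and the 1 MB threshold below are hypothetical; pairs smaller than the threshold are written to F0000000_small.rf and the rest to F0000000_large.rf, following the naming scheme in the source above.

import org.apache.accumulo.core.file.rfile.SplitLarge;

public class SplitLargeExample {

    public static void main(String[] args) throws Exception {
        // Hypothetical input file; the path must end in ".rf" or SplitLarge
        // throws an IllegalArgumentException. Pairs of 1 MB or more go to the
        // "_large.rf" output, everything smaller goes to "_small.rf".
        SplitLarge.main(new String[] {
                "-m", String.valueOf(1024 * 1024),
                "/accumulo/tables/1/default_tablet/F0000000.rf"});
    }
}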