gaffer.accumulostore.operation.hdfs.handler.job.SampleDataForSplitPointsJobFactory.java Source code

Introduction

Here is the source code for gaffer.accumulostore.operation.hdfs.handler.job.SampleDataForSplitPointsJobFactory.java. This factory builds the Hadoop MapReduce job that samples input records and writes Accumulo Key/Value pairs to a compressed SequenceFile, from which split points for an Accumulo table can be derived.

Source

/*
 * Copyright 2016 Crown Copyright
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gaffer.accumulostore.operation.hdfs.handler.job;

import gaffer.accumulostore.operation.hdfs.impl.SampleDataForSplitPoints;
import gaffer.commonutil.CommonConstants;
import gaffer.store.Store;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;

public class SampleDataForSplitPointsJobFactory {

    public static final String PROPORTION_TO_SAMPLE = "proportion_to_sample";
    public static final String SCHEMA = "schema";
    public static final String MAPPER_GENERATOR = "mapperGenerator";
    public static final String VALIDATE = "validate";

    /**
     * Creates a job with the store-specific job initialisation and then applies the operation-specific
     * {@link gaffer.operation.simple.hdfs.handler.jobfactory.JobInitialiser}.
     *
     * @param operation the sample data for split points operation
     * @param store     the store executing the operation
     * @return the created job
     * @throws IOException for IO issues
     */
    public Job createJob(final SampleDataForSplitPoints operation, final Store store) throws IOException {
        final JobConf jobConf = createJobConf(operation, store);
        setupJobConf(jobConf, operation, store);

        final Job job = Job.getInstance(jobConf);
        setupJob(job, operation, store);

        // Apply operation-specific job configuration
        if (null != operation.getJobInitialiser()) {
            operation.getJobInitialiser().initialiseJob(job, operation, store);
        }

        return job;
    }

    protected JobConf createJobConf(final SampleDataForSplitPoints operation, final Store store)
            throws IOException {
        return new JobConf(new Configuration());
    }

    protected void setupJobConf(final JobConf jobConf, final SampleDataForSplitPoints operation, final Store store)
            throws IOException {
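        // Store the serialised schema, the mapper generator class, the validation
        // flag and the sample proportion in the job configuration, presumably so
        // the mapper can read them back at run time.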
        jobConf.set(SCHEMA, new String(store.getSchema().toJson(false), CommonConstants.UTF_8));
        jobConf.set(MAPPER_GENERATOR, operation.getMapperGeneratorClassName());
        jobConf.set(VALIDATE, String.valueOf(operation.isValidate()));
        jobConf.set(PROPORTION_TO_SAMPLE, String.valueOf(operation.getProportionToSample()));
        Integer numTasks = operation.getNumMapTasks();
        if (null != numTasks) {
            jobConf.setNumMapTasks(numTasks);
        }
        numTasks = operation.getNumReduceTasks();
        if (null != numTasks) {
            jobConf.setNumReduceTasks(numTasks);
        }
    }

    protected void setupJob(final Job job, final SampleDataForSplitPoints operation, final Store store)
            throws IOException {
        job.setJarByClass(getClass());
        job.setJobName(getJobName(operation.getInputPath(), operation.getOutputPath()));
        setupMapper(job, operation, store);
        setupReducer(job, operation, store);
        setupOutput(job, operation, store);
    }

    protected String getJobName(final Path inputPath, final Path outputPath) {
        return "Split Table: input=" + inputPath + ", output=" + outputPath;
    }

    private void setupMapper(final Job job, final SampleDataForSplitPoints operation, final Store store)
            throws IOException {
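        // The mapper emits Accumulo Key/Value pairs; judging by the class name
        // and the PROPORTION_TO_SAMPLE setting, it emits only a sample of the input.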
        job.setMapperClass(SampleDataForSplitPointsMapper.class);
        job.setMapOutputKeyClass(Key.class);
        job.setMapOutputValueClass(Value.class);
    }

    private void setupReducer(final Job job, final SampleDataForSplitPoints operation, final Store store)
            throws IOException {
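        // AccumuloKeyValueReducer presumably merges the values of duplicate keys,
        // so the output holds one pair per distinct Accumulo key.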
        job.setReducerClass(AccumuloKeyValueReducer.class);
        job.setOutputKeyClass(Key.class);
        job.setOutputValueClass(Value.class);
    }

    private void setupOutput(final Job job, final SampleDataForSplitPoints operation, final Store store)
            throws IOException {
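        // Write the results as a block-compressed (gzip) SequenceFile of
        // Key/Value pairs at the operation's output path.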
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, operation.getOutputPath());
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    }
}
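
For context, here is a minimal sketch of how this factory might be driven. The setter names on SampleDataForSplitPoints (setInputPath, setOutputPath, setProportionToSample, setValidate, setMapperGeneratorClassName) are assumptions mirroring the getters the factory calls above, the generator class name is hypothetical, and the Store would be whichever AccumuloStore is executing the operation; treat it as an illustration, not the project's actual driver code.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

import gaffer.accumulostore.operation.hdfs.handler.job.SampleDataForSplitPointsJobFactory;
import gaffer.accumulostore.operation.hdfs.impl.SampleDataForSplitPoints;
import gaffer.store.Store;

public class SampleDataForSplitPointsDriver {

    // Builds (but does not submit) the sampling job. The setters used on the
    // operation are assumed; they mirror the getters the factory calls above.
    public static Job buildJob(final Store store) throws IOException {
        final SampleDataForSplitPoints operation = new SampleDataForSplitPoints();
        operation.setInputPath(new Path("/data/input"));    // assumed setter
        operation.setOutputPath(new Path("/tmp/sample"));   // assumed setter
        operation.setProportionToSample(0.01f);             // assumed setter
        operation.setValidate(true);                        // assumed setter
        // Assumed setter; "my.example.ElementGenerator" is a hypothetical class.
        operation.setMapperGeneratorClassName("my.example.ElementGenerator");

        // The factory wires in the mapper, reducer and compressed SequenceFile
        // output, then returns a Job that is ready to submit.
        return new SampleDataForSplitPointsJobFactory().createJob(operation, store);
    }
}

Submitting the built job (for example with job.waitForCompletion(true)) and reading the resulting SequenceFile back to pick split points are outside the scope of this factory.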