Example usage for org.apache.hadoop.mapreduce Job setMapOutputValueClass

Introduction

This page collects example usages of the org.apache.hadoop.mapreduce.Job method setMapOutputValueClass, taken from open-source projects.

Prototype

public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException 

Document

Set the value class for the map output data.
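
Before the project examples, here is a minimal, hedged sketch of a driver that calls setMapOutputValueClass. The class names (WordCountDriver, WordCountMapper, WordCountReducer) and the "in"/"out" paths are hypothetical placeholders; the snippet assumes the usual org.apache.hadoop.conf, org.apache.hadoop.fs, org.apache.hadoop.io, org.apache.hadoop.mapreduce and lib.input/lib.output imports.

// Hypothetical driver: the map output types (Text/IntWritable) are declared
// explicitly because they must match what the Mapper actually emits.
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCountDriver.class);

job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class); // value class for the map output data

job.setReducerClass(WordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path("in"));
FileOutputFormat.setOutputPath(job, new Path("out"));
System.exit(job.waitForCompletion(true) ? 0 : 1);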

Usage

From source file:com.github.libsml.commons.util.HadoopUtils.java

License:Apache License

/**
 * Create a map-only Hadoop Job out of the passed in parameters.  Does not set the
 * Job name.
 *
 * @see #getCustomJobName(String, JobContext, Class, Class)
 */
public static Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends OutputFormat> outputFormat, Configuration conf)
        throws IOException {

    //    Job job = new Job(new Configuration(conf));
    Job job = Job.getInstance(conf);
    Configuration jobConf = job.getConfiguration();

    if (mapper.equals(Mapper.class)) {
        throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
    }
    job.setJarByClass(mapper);

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    job.setMapOutputKeyClass(mapperKey);
    job.setMapOutputValueClass(mapperValue);
    job.setOutputKeyClass(mapperKey);
    job.setOutputValueClass(mapperValue);
    jobConf.setBoolean("mapred.compress.map.output", true);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
}
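
A hypothetical invocation of this map-only helper might look like the following sketch; MyMapper and the HDFS paths are illustrative placeholders, while TextInputFormat, TextOutputFormat, Text and IntWritable are standard Hadoop classes.

// Hypothetical caller: MyMapper emits Text keys and IntWritable values, so those
// classes become both the map output types and (in this map-only helper) the job output types.
Configuration conf = new Configuration();
Job job = HadoopUtils.prepareJob(new Path("/data/in"), new Path("/data/out"),
        TextInputFormat.class, MyMapper.class, Text.class, IntWritable.class,
        TextOutputFormat.class, conf);
job.waitForCompletion(true);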

From source file:com.github.libsml.commons.util.HadoopUtils.java

License:Apache License

/**
 * Create an Avro-aware Hadoop Job out of the passed in parameters.
 *
 * @param inputPaths
 * @param outputPath
 * @param inputFormat
 * @param inputKey
 * @param inputValue
 * @param mapper
 * @param mapperKey
 * @param mapperValue
 * @param combiner
 * @param reducer
 * @param outputKey
 * @param outputValue
 * @param outputFormat
 * @param conf
 * @param overwrite
 * @param isCompress
 * @return
 * @throws IOException
 */
public static Job prepareAvroJob(String inputPaths, String outputPath, Class<? extends InputFormat> inputFormat,
        Object inputKey, Object inputValue, Class<? extends Mapper> mapper, Object mapperKey,
        Object mapperValue, Class<? extends Reducer> combiner, Class<? extends Reducer> reducer,
        Object outputKey, Object outputValue, Class<? extends OutputFormat> outputFormat, Configuration conf,
        boolean overwrite, boolean isCompress) throws IOException {

    Job job = Job.getInstance(conf);
    Configuration jobConf = job.getConfiguration();
    if (inputKey instanceof Schema) {

        if (inputValue instanceof Schema) {
            inputFormat = inputFormat == null ? AvroKeyValueInputFormat.class : inputFormat;
        }
        inputFormat = inputFormat == null ? AvroKeyInputFormat.class : inputFormat;

    }
    if (inputFormat != null) {
        job.setInputFormatClass(inputFormat);
    }

    if (inputKey instanceof Schema) {
        AvroJob.setInputKeySchema(job, (Schema) inputKey);
    }

    if (inputValue instanceof Schema) {
        AvroJob.setInputValueSchema(job, (Schema) inputValue);
    }

    if (outputKey instanceof Schema) {

        if (outputValue instanceof Schema) {
            outputFormat = outputFormat == null ? AvroKeyValueOutputFormat.class : outputFormat;
        }
        outputFormat = outputFormat == null ? AvroKeyOutputFormat.class : outputFormat;

    }
    if (outputFormat != null) {
        job.setOutputFormatClass(outputFormat);
    }

    if (outputKey instanceof Schema) {
        AvroJob.setOutputKeySchema(job, (Schema) outputKey);
    } else if (outputKey instanceof Class) {
        job.setOutputKeyClass((Class) outputKey);
    }

    if (outputValue instanceof Schema) {
        AvroJob.setOutputValueSchema(job, (Schema) outputValue);
    } else if (outputValue instanceof Class) {
        job.setOutputValueClass((Class) outputValue);
    }

    if (reducer == null) {
        job.setNumReduceTasks(0);

        if (mapperKey instanceof Schema) {
            AvroJob.setMapOutputKeySchema(job, (Schema) mapperKey);
        } else if (mapperKey instanceof Class) {
            job.setOutputKeyClass((Class) mapperKey);
        }

        if (mapperValue instanceof Schema) {
            AvroJob.setOutputValueSchema(job, (Schema) mapperValue);
        } else if (mapperValue instanceof Class) {
            job.setOutputValueClass((Class) mapperValue);
        }
        job.setJarByClass(mapper);

    } else if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);

    } else {
        job.setJarByClass(reducer);

    }

    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    if (isCompress) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, DeflateCodec.class);
    }

    job.setMapperClass(mapper);
    if (mapperKey instanceof Schema) {
        AvroJob.setMapOutputKeySchema(job, (Schema) mapperKey);
    } else if (mapperKey instanceof Class) {
        job.setMapOutputKeyClass((Class) mapperKey);
    }

    if (mapperValue instanceof Schema) {
        AvroJob.setMapOutputValueSchema(job, (Schema) mapperValue);
    } else if (mapperValue instanceof Class) {
        job.setMapOutputValueClass((Class) mapperValue);
    }

    if (reducer != null) {
        job.setReducerClass(reducer);
    }
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }

    if (overwrite) {
        HadoopUtils.delete(jobConf, new Path(outputPath));
    }

    return job;

}

From source file:com.github.libsml.commons.util.HadoopUtils.java

License:Apache License

public static Job prepareAvroJob(String inputPaths, Path outputPath, Schema inputKeySchema,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends Reducer> combiner,
        Class<? extends Reducer> reducer, Schema outputKeySchema, Class<? extends Writable> outputValue,
        Configuration conf, boolean overwrite) throws IOException {
    Job job = Job.getInstance(conf);
    Configuration jobConf = job.getConfiguration();

    if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);
    } else {
        job.setJarByClass(reducer);
    }

    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, outputPath);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DeflateCodec.class);

    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, inputKeySchema);
    job.setMapperClass(mapper);
    if (mapperKey != null) {
        job.setMapOutputKeyClass(mapperKey);
    }
    if (mapperValue != null) {
        job.setMapOutputValueClass(mapperValue);
    }
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    job.setReducerClass(reducer);
    AvroJob.setOutputKeySchema(job, outputKeySchema);
    job.setOutputValueClass(outputValue);

    if (overwrite) {
        HadoopUtils.delete(jobConf, outputPath);
    }

    return job;

}

From source file:com.github.libsml.commons.util.HadoopUtils.java

License:Apache License

public static Job prepareAvroJob(String inputPaths, Path outputPath, Schema inputKeySchema,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends Reducer> combiner,
        Class<? extends Reducer> reducer, Class<? extends Writable> outputKey,
        Class<? extends Writable> outputValue, Configuration conf, boolean overwrite) throws IOException {
    Job job = Job.getInstance(conf);
    Configuration jobConf = job.getConfiguration();

    if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);
    } else {
        job.setJarByClass(reducer);
    }

    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, outputPath);

    //        FileOutputFormat.setCompressOutput(job, true);
    //        FileOutputFormat.setOutputCompressorClass(job, DeflateCodec.class);

    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, inputKeySchema);
    job.setMapperClass(mapper);
    if (mapperKey != null) {
        job.setMapOutputKeyClass(mapperKey);
    }
    if (mapperValue != null) {
        job.setMapOutputValueClass(mapperValue);
    }
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }

    job.setReducerClass(reducer);
    job.setOutputKeyClass(outputKey);
    job.setOutputValueClass(outputValue);

    if (overwrite) {
        HadoopUtils.delete(jobConf, outputPath);
    }

    return job;

}

From source file:com.github.libsml.commons.util.HadoopUtils.java

License:Apache License

public static Job prepareJob(String inputPath, String outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
        Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
        Class<? extends OutputFormat> outputFormat, Configuration conf) throws IOException {

    //    Job job = new Job(new Configuration(conf));
    Job job = Job.getInstance(conf);
    Configuration jobConf = job.getConfiguration();

    if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);
    } else {
        job.setJarByClass(reducer);
    }

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath);

    job.setMapperClass(mapper);
    if (mapperKey != null) {
        job.setMapOutputKeyClass(mapperKey);
    }
    if (mapperValue != null) {
        job.setMapOutputValueClass(mapperValue);
    }

    jobConf.setBoolean("mapred.compress.map.output", true);

    job.setReducerClass(reducer);
    job.setOutputKeyClass(reducerKey);
    job.setOutputValueClass(reducerValue);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath);

    return job;
}
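
For this map/reduce variant, a hypothetical call could look like the sketch below (MyMapper and MyReducer are illustrative names); the Text/IntWritable pair passed as mapperKey/mapperValue is what ends up in setMapOutputKeyClass/setMapOutputValueClass, while the reducer's output types are configured separately.

// Hypothetical caller of the map/reduce prepareJob overload above.
Job job = HadoopUtils.prepareJob("/data/in", "/data/out",
        SequenceFileInputFormat.class, MyMapper.class, Text.class, IntWritable.class,
        MyReducer.class, Text.class, LongWritable.class,
        TextOutputFormat.class, new Configuration());
job.waitForCompletion(true);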

From source file:com.github.sandgorgon.parmr.Main.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: parmr <input file> <output path>");
        return -1;
    }

    Configuration conf = super.getConf();
    conf.set("mapreduce.job.queuename", "prod");

    Job job = Job.getInstance(conf);
    job.setJobName(jobName);
    job.setJarByClass(Main.class);

    // Parquet Schema
    // Read from the input file itself the schema that we will be assuming
    Path infile = new Path(args[0]);
    List<Footer> footers = ParquetFileReader.readFooters(conf, infile.getFileSystem(conf).getFileStatus(infile),
            true);
    MessageType schema = footers.get(0).getParquetMetadata().getFileMetaData().getSchema();

    // Avro Schema
    // Convert the Parquet schema to an Avro schema
    AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter();
    Schema avroSchema = avroSchemaConverter.convert(schema);

    // Set the Mapper
    job.setMapperClass(UserMapper.class);

    // This works for predicate pushdown on record assembly read.
    AvroParquetInputFormat.setUnboundRecordFilter(job, UserRecordFilter.class);

    AvroParquetInputFormat.addInputPath(job, new Path(args[0]));
    AvroParquetInputFormat.setAvroReadSchema(job, avroSchema);
    job.setInputFormatClass(AvroParquetInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // If you needed to return an avro object from the mapper, refer to this...
    //job.setMapOutputValueClass(AvroValue.class);
    //AvroJob.setMapOutputValueSchema(job, avroSchema);

    // Reducer
    job.setReducerClass(UserReducer.class);

    // Output
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // If we need to return an avro class again, refer to this...
    //job.setOutputFormatClass(AvroParquetOutputFormat.class);
    //AvroParquetOutputFormat.setOutputPath(job, new Path(args[1]));
    //AvroParquetOutputFormat.setSchema(job, avroSchema);
    //job.setOutputKeyClass(Void.class);
    //job.setOutputValueClass(GenericRecord.class);

    // Rough way of testing the projection side of things.
    AvroParquetInputFormat.setRequestedProjection(job,
            Schema.parse("{\"namespace\": \"com.github.sandgorgon.parmr.avro\",\n" + " \"type\": \"record\",\n"
                    + " \"name\": \"User\",\n" + " \"fields\": [\n"
                    + "     {\"name\": \"name\", \"type\": \"string\"},\n"
                    + "     {\"name\": \"favorite_number\",  \"type\": [\"int\", \"null\"]}\n" +
                    //                "     {\"name\": \"favorite_color\", \"type\": [\"string\", \"null\"]}\n" +
                    " ]\n" + "}\n" + ""));

    // Do the deed!
    int completion = job.waitForCompletion(true) ? 0 : 1;

    return completion;
}

From source file:com.github.ygf.pagerank.InLinks.java

License:Apache License

private void summarizeResults(Configuration conf, Path outputDir) throws Exception {

    int topResults = Integer.parseInt(conf.get("inlinks.top_results"));

    Job job = Job.getInstance(conf, "InLinks:TopN");

    job.setJarByClass(InLinks.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(InLinksTopNMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setReducerClass(InLinksTopNReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(outputDir, "inlinks"));
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "inlinks-top" + topResults));

    job.setNumReduceTasks(1);
    job.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.PageRank.java

License:Apache License

private void createTransitionMatrix(Configuration conf, Path linksFile, Path outputDir) throws Exception {

    // This job reads the links-simple-sorted.txt input file and generates
    // the corresponding transition matrix. The matrix is divided into
    // square blocks and each block is represented by the nonzero entries.
    // See Section 5.2 (and 5.2.3 in particular) of Mining of Massive Datasets
    // (http://infolab.stanford.edu/~ullman/mmds.html) for details.
    // The output is written to the "M" subdir in the output dir.

    Job job = Job.getInstance(conf, "PageRank:Matrix");

    job.setJarByClass(PageRank.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(PageRankMatrixMapper.class);
    job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
    job.getConfiguration().setClass("mapreduce.map.output.compress.codec", DefaultCodec.class,
            CompressionCodec.class);
    job.setMapOutputKeyClass(ShortArrayWritable.class);
    job.setMapOutputValueClass(ShortArrayWritable.class);
    job.setReducerClass(PageRankMatrixReducer.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(ShortArrayWritable.class);
    job.setOutputValueClass(MatrixBlockWritable.class);
    FileInputFormat.addInputPath(job, linksFile);
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "M"));

    job.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.PageRank.java

License:Apache License

private void pageRankIteration(int iter, Configuration conf, Path outputDir) throws Exception {

    // This job performs an iteration of the power iteration method to
    // compute PageRank. The map task processes each block M_{i,j}, loads 
    // the corresponding stripe j of the vector v_{k-1} and produces the
    // partial result of the stripe i of the vector v_k. The reduce task
    // sums all the partial results of v_k and adds the teleportation factor
    // (the combiner only sums all the partial results). See Section 5.2
    // (and 5.2.3 in particular) of Mining of Massive Datasets
    // (http://infolab.stanford.edu/~ullman/mmds.html) for details. The
    // output is written in a "vk" subdir of the output dir, where k is the
    // iteration number. MapFileOutputFormat is used to keep an array of the
    // stripes of v.

    Job job = Job.getInstance(conf, "PageRank:Iteration");

    job.setJarByClass(PageRank.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(PageRankIterationMapper.class);
    job.setMapOutputKeyClass(ShortWritable.class);
    job.setMapOutputValueClass(FloatArrayWritable.class);
    job.setCombinerClass(PageRankIterationCombiner.class);
    job.setReducerClass(PageRankIterationReducer.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(ShortWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);
    FileInputFormat.addInputPath(job, new Path(outputDir, "M"));
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter));

    job.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.PageRank.java

License:Apache License

private void summarizeResults(int iter, Configuration conf, Path outputDir) throws Exception {

    // This job creates a plain text file with the top N PageRanks and the
    // titles of the pages. Each map task emits the top N PageRanks it
    // receives, and the reduce task merges the partial results into the
    // global top N PageRanks. A single reducer is used in the job in order
    // to have access to all the individual top N PageRanks from the
    // mappers. The reducer looks up the titles in the index built by
    // TitleIndex. This job was designed considering that N is small.

    int topResults = Integer.parseInt(conf.get("pagerank.top_results"));

    Job job = Job.getInstance(conf, "PageRank:TopN");

    job.setJarByClass(PageRank.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(PageRankTopNMapper.class);
    job.setMapOutputKeyClass(FloatWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setReducerClass(PageRankTopNReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(outputDir, "v" + iter));
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter + "-top" + topResults));

    job.setNumReduceTasks(1);
    job.waitForCompletion(true);
}