Example usage for org.apache.hadoop.mapreduce Job setMapOutputValueClass

List of usage examples for org.apache.hadoop.mapreduce Job setMapOutputValueClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce Job setMapOutputValueClass.

Prototype

public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException 

Source Link

Document

Set the value class for the map output data.

Usage

From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java

License:Apache License

/**
 * Create a partial vector using a chunk of features from the input documents. The input documents has to be
 * in the {@link SequenceFile} format/*www . j a  v a 2  s  .c  om*/
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory were the partial vectors have to be created
 * @param dimension
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers 
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath);
    job.setJarByClass(DictionaryVectorizer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java

License:Apache License

/**
 * Create a partial vector using a chunk of features from the input documents. The input documents has to be
 * in the {@link SequenceFile} format/* w  w w  .j  ava2  s  .co  m*/
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory were the partial vectors have to be created
 * @param dimension
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers 
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath);
    job.setJarByClass(FixDictionaryVectorizer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.ery.hadoop.mrddx.hbase.HbaseInputFormat.java

License:Apache License

/**
 * Validates the HBase input settings carried by the job and wires the job up for HBase input.
 *
 * A missing table name or a malformed family:column entry is logged and raised as an exception;
 * the other missing query settings are only logged as warnings.
 *
 * @param conf the job being configured
 * @throws Exception if the input table name is blank or a family/column entry is not of the form a:b
 */
@Override
public void handle(Job conf) throws Exception {
    // View the job configuration through the HBase-input settings wrapper.
    HbaseConfiguration hbaseConf = new HbaseConfiguration(conf.getConfiguration(),
            HbaseConfiguration.FLAG_HBASE_INPUT);

    // The input table name is mandatory.
    String table = hbaseConf.getInputTableName();
    if (table == null || table.trim().isEmpty()) {
        String message = "[MR ERROR]HBase??<" + HbaseConfiguration.INPUT_TABLE + ">?.";
        MRLog.error(LOG, message);
        throw new Exception(message);
    }

    // Check the input field names (delegated to the shared validator).
    String[] fieldNames = hbaseConf.getInputFieldNames();
    vParamSrcTargetFieldNames(hbaseConf, fieldNames);

    if (hbaseConf.getInputIsCombiner()) {
        conf.setCombinerClass(DBGroupReducer.class);
    }

    // Check the query time range (delegated to the shared validator).
    String[] timeRange = hbaseConf.getInputHBaseQueryTimerange();
    vParamQueryTimeRange(timeRange);

    // Start row: only warn when absent.
    String startRow = hbaseConf.getInputHBaseQueryStartRow();
    if (startRow == null || startRow.trim().isEmpty()) {
        MRLog.warn(LOG, "[MR WARN]?startrow<" + HbaseConfiguration.INPUT_QUERY_STARTROW + ">.");
    }

    // Stop row: only warn when absent.
    String stopRow = hbaseConf.getInputHBaseQueryStopRow();
    if (stopRow == null || stopRow.trim().isEmpty()) {
        MRLog.warn(LOG, "[MR WARN]?stoprow<" + HbaseConfiguration.INPUT_QUERY_STOPROW + ">.");
    }

    // Timestamp: any value of -1 or below is reported with a warning.
    long queryTimestamp = hbaseConf.getInputHBaseQueryTimestamp();
    if (queryTimestamp <= -1) {
        MRLog.warn(LOG, "[MR WARN]?<" + HbaseConfiguration.INPUT_QUERY_TIMESTAMP + ">.");
    }

    // Filters: only warn when absent.
    String queryFilters = hbaseConf.getInputHBaseQueryFilters();
    if (queryFilters == null || queryFilters.isEmpty()) {
        MRLog.warn(LOG, "[MR WARN]??<" + HbaseConfiguration.INPUT_QUERY_FILTER + ">.");
    }

    // Family/column pairs: warn when absent, fail when an entry does not split into exactly two parts.
    String[] columnSpecs = hbaseConf.getInputHBaseQueryFamilyColumns();
    if (columnSpecs == null || columnSpecs.length == 0) {
        MRLog.warn(LOG, "[MR WARN]?<" + HbaseConfiguration.INPUT_QUERY_FAMILYCOLUMNS + ">.");
    }
    if (columnSpecs != null) {
        for (int i = 0; i < columnSpecs.length; i++) {
            String[] parts = columnSpecs[i].split(":");
            if (parts.length != 2) {
                String message = "[MR ERROR]?<" + HbaseConfiguration.INPUT_QUERY_FAMILYCOLUMNS + ">.";
                MRLog.error(LOG, message);
                throw new Exception(message);
            }
        }
    }

    // Families: only warn when absent.
    String[] families = hbaseConf.getInputHBaseQueryFamilys();
    if (families == null || families.length == 0) {
        MRLog.warn(LOG, "[MR WARN]??<" + HbaseConfiguration.INPUT_QUERY_FAMILYS + ">.");
    }

    conf.setInputFormatClass(HbaseInputFormat.class);
    hbaseConf.setInputClass(DBRecord.class);

    // The map task count comes from the region-info count for the start/stop row range;
    // the reduce task count matches it unless the input is flagged as map-only.
    int mapTasks = HbaseInputFormat.getTableHRegionInfoCount(conf.getConfiguration(), startRow, stopRow);
    int reduceTasks = hbaseConf.getInputMapEnd() ? 0 : mapTasks;

    // Publish the task counts and the mapper wiring.
    hbaseConf.setNumMapTasks(mapTasks);
    hbaseConf.setNumReduceTasks(reduceTasks);
    hbaseConf.setInputClass(DBRecord.class);
    conf.setMapperClass(DBMapper.class);
    conf.setMapOutputKeyClass(DBRecord.class);
    conf.setMapOutputValueClass(DBRecord.class);
    if (hbaseConf.getInputIsCombiner()) {
        conf.setCombinerClass(DBGroupReducer.class);
    }
}

From source file:com.example.bigtable.sample.CellCounter.java

License:Apache License

/**
 * Builds the cell-counting job without submitting it.
 *
 * @param conf The current configuration; the report separator is written into it.
 * @param args The command line parameters: table name, output directory and an
 *             optional report separator (":" when omitted).
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    final String table = args[0];
    final Path reportDir = new Path(args[1]);

    // An optional third argument overrides the default separator.
    String separator = ":";
    if (args.length > 2) {
        separator = args[2];
    }
    conf.set("ReportSeparator", separator);

    final String jobName = conf.get(JOB_NAME_CONF_KEY, NAME + "_" + table);
    final Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(CellCounter.class);

    final Scan scan = getConfiguredScanForJob(conf, args);
    TableMapReduceUtil.initTableMapperJob(table, scan, CellCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // The map output types are set explicitly after the table mapper initialisation above.
    job.setNumReduceTasks(1);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(job, reportDir);

    job.setReducerClass(IntSumReducer.class);
    return job;
}

From source file:com.example.bigtable.sample.WordCountHBase.java

License:Apache License

/**
 * Entry point: runs a word count over the given input paths and writes the result to an HBase table.
 *
 * Arguments (after generic options are stripped): one or more input paths followed by the table name.
 * The process exits with 2 on a usage error, 0 when the job succeeds and 1 when it fails.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    String[] remaining = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (remaining.length < 2) {
        System.err.println("Usage: wordcount-hbase <in> [<in>...] <table-name>");
        System.exit(2);
    }

    Job job = Job.getInstance(conf, "word count");

    // Every argument except the last one is an input path; the last one names the table.
    final int tableArgIndex = remaining.length - 1;
    int argIndex = 0;
    while (argIndex < tableArgIndex) {
        FileInputFormat.addInputPath(job, new Path(remaining[argIndex]));
        argIndex++;
    }

    TableName tableName = TableName.valueOf(remaining[tableArgIndex]);
    try {
        CreateTable.createTable(tableName, conf, Collections.singletonList(Bytes.toString(COLUMN_FAMILY)));
    } catch (Exception e) {
        // A failed table creation is logged and the job is still attempted.
        LOG.error("Could not create the table.", e);
    }

    job.setJarByClass(WordCountHBase.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setMapOutputValueClass(IntWritable.class);

    TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), MyTableReducer.class, job);

    boolean completed = job.waitForCompletion(true);
    System.exit(completed ? 0 : 1);
}

From source file:com.examples.ch03.ParseWeblogs_Ex_1.java

/**
 * Runs the map-only "Weblog Transformer" job over the fixed input file and output directory.
 *
 * @param args command line arguments (not used by this method)
 * @return 0 when the job completes successfully, 1 otherwise
 */
public int run(String[] args) throws Exception {
    final Path source = new Path("apache_clf.txt");
    final Path destination = new Path("output");

    final Job weblogJob = Job.getInstance(getConf());
    weblogJob.setJobName("Weblog Transformer");
    weblogJob.setJarByClass(getClass());
    // Zero reduce tasks: no reducer is configured for this job.
    weblogJob.setNumReduceTasks(0);

    // Mapper and its output types.
    weblogJob.setMapperClass(CLFMapper_Ex_1.class);
    weblogJob.setMapOutputKeyClass(Text.class);
    weblogJob.setMapOutputValueClass(Text.class);

    // Job output types.
    weblogJob.setOutputKeyClass(Text.class);
    weblogJob.setOutputValueClass(Text.class);

    // Plain-text formats on both sides.
    weblogJob.setInputFormatClass(TextInputFormat.class);
    weblogJob.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(weblogJob, source);
    FileOutputFormat.setOutputPath(weblogJob, destination);

    return weblogJob.waitForCompletion(true) ? 0 : 1;
}

From source file:com.facebook.hiveio.mapreduce.output.WritingTool.java

License:Apache License

/**
 * Prepares the configuration for Hive, then submits the map-only "hive-io-writing" job and waits for it.
 *
 * @param args command line arguments, applied to the configuration by handleCommandLine
 * @return 0 when the job completes successfully, 1 otherwise
 */
@Override
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    handleCommandLine(args, conf);
    HadoopUtils.setMapAttempts(conf, 1);
    adjustConfigurationForHive(conf);
    HiveTools.setupJob(conf);

    final Job writingJob = new Job(conf, "hive-io-writing");
    // Only set the jar from this class when the job does not already have one.
    if (null == writingJob.getJar()) {
        writingJob.setJarByClass(getClass());
    }

    writingJob.setMapperClass(SampleMapper.class);
    writingJob.setInputFormatClass(SampleInputFormat.class);
    writingJob.setMapOutputKeyClass(NullWritable.class);
    writingJob.setMapOutputValueClass(HiveWritableRecord.class);
    writingJob.setOutputFormatClass(SampleOutputFormat.class);

    // Zero reduce tasks: map-only job.
    writingJob.setNumReduceTasks(0);

    writingJob.submit();
    final boolean completed = writingJob.waitForCompletion(true);
    if (completed) {
        return 0;
    }
    return 1;
}

From source file:com.fanlehai.hadoop.serialize.avro.MapReduceAvroWordCount.java

License:Apache License

/**
 * Configures and runs the "AvroWordCount" job.
 *
 * @param args exactly two arguments: the input path and the output path. The output path is
 *             deleted recursively before the job runs.
 * @return 0 when the job completes successfully, 1 when it fails, 2 when the argument count is wrong
 */
public int run(String[] args) throws Exception {

    if (args.length != 2) {
        printUsage();
        // Stop here: continuing would index args[1] with too few arguments.
        return 2;
    }

    // Remove any previous output.
    FileSystem.get(new Configuration()).delete(new Path(args[1]), true);
    Job job = Job.getInstance(super.getConf(), "AvroWordCount");

    job.setJarByClass(MapReduceAvroWordCount.class);
    job.setJobName("AvroWordCount");

    // We call setOutputKeySchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputKeySchema(job, Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)));
    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setSortComparatorClass(Text.Comparator.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Exit-code convention: 0 means success. The original returned 1 on success and 0 on failure.
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.fanlehai.hadoop.serialize.avro.MapReduceColorCount.java

License:Apache License

/**
 * Configures and runs the "Color Count" job, which reads Avro keys and writes Avro key/value output.
 *
 * @param args exactly two arguments: the input path and the output path. The output path is
 *             deleted recursively before the job runs.
 * @return 0 when the job completes successfully, 1 when it fails, 2 when the argument count is wrong
 */
public int run(String[] args) throws Exception {

    if (args.length != 2) {
        printUsage();
        // Stop here: continuing would index args[1] with too few arguments.
        return 2;
    }

    // Remove any previous output.
    FileSystem.get(new Configuration()).delete(new Path(args[1]), true);
    Job job = Job.getInstance(super.getConf(), "MapReduceAvroWordCount");

    job.setJarByClass(MapReduceColorCount.class);
    // Replaces the name passed to getInstance above.
    job.setJobName("Color Count");

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Map side: Avro-keyed input using the User schema, emitting Text/IntWritable pairs.
    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapperClass(ColorCountMapper.class);
    AvroJob.setInputKeySchema(job, User.getClassSchema());
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Reduce side: Avro key/value output with a STRING key schema and an INT value schema.
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    job.setReducerClass(ColorCountReducer.class);
    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

    // Exit-code convention: 0 means success. The original returned 1 on success and 0 on failure.
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.flipkart.fdp.migration.distcp.core.MirrorDistCPDriver.java

License:Apache License

/**
 * Builds the mirror job without submitting it.
 *
 * Progress messages are printed to standard output. The job's output path is the state
 * manager's report path.
 *
 * @param configuration configuration the job is created from; "mapreduce.reduce.tasks" (default 1)
 *                      determines the number of reduce tasks
 * @return the configured job
 */
private Job createJob(Configuration configuration) throws Exception {

    System.out.println("Initializing BlueShift v 2.0...");
    System.out.println("Configuration: " + dcmConfig.toString());

    final String jobTitle = "BlueShift v 2.0 - " + dcmConfig.getBatchName();
    final Job mirrorJob = Job.getInstance(configuration, jobTitle);

    mirrorJob.setJarByClass(MirrorDistCPDriver.class);

    // Text keys and values for both the map output and the job output.
    mirrorJob.setMapOutputKeyClass(Text.class);
    mirrorJob.setMapOutputValueClass(Text.class);

    mirrorJob.setOutputKeyClass(Text.class);
    mirrorJob.setOutputValueClass(Text.class);

    mirrorJob.setMapperClass(MirrorMapper.class);
    mirrorJob.setReducerClass(MirrorReducer.class);

    mirrorJob.setInputFormatClass(MirrorFileInputFormat.class);
    mirrorJob.setOutputFormatClass(TextOutputFormat.class);

    FileOutputFormat.setOutputPath(mirrorJob, stateManager.getReportPath());

    final int reducerCount = configuration.getInt("mapreduce.reduce.tasks", 1);
    mirrorJob.setNumReduceTasks(reducerCount);

    System.out.println("Job Initialization Complete, The status of the Mirror job will be written to: "
            + stateManager.getReportPath());
    return mirrorJob;
}