Example usage for org.apache.hadoop.mapreduce Job setMapOutputValueClass

Introduction

This page collects real-world usage examples for org.apache.hadoop.mapreduce.Job#setMapOutputValueClass.

Prototype

public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException 

Document

Set the value class for the map output data.
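The map output value class must be declared whenever it differs from the final output value class set via setOutputValueClass, and the setter must be called before the job is submitted (afterwards it throws IllegalStateException). Below is a minimal, self-contained driver sketch; the class and job names are illustrative and do not come from any of the sources listed further down.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetMapOutputValueClassExample {

    // Toy mapper: emits the byte length of each input line under a single key.
    public static class LineLengthMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(new Text("length"), new IntWritable(value.getLength()));
        }
    }

    // Sums the lengths per key.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setMapOutputValueClass example");
        job.setJarByClass(SetMapOutputValueClassExample.class);
        job.setMapperClass(LineLengthMapper.class);
        job.setReducerClass(SumReducer.class);

        // Declare the map output types explicitly; calling these setters after
        // the job has been submitted throws IllegalStateException.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}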

Usage

From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterJob.java

License:Apache License

/**
 * Configures and submits the MapReduce job to Hadoop.
 */
public int run(String[] args) throws Exception {

    String inputPath = null;
    String outputPath = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments. We're not using GenericOptionsParser
    // to prevent having to include commons.cli as a dependency.
    for (int index = 0; index < args.length; index++) {
        try {

            if (ARGNAME_INPATH.equals(args[index])) {
                inputPath = args[++index];
            } else if (ARGNAME_OUTPATH.equals(args[index])) {
                outputPath = args[++index];
            } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) {
                s3AccessKey = args[++index];
            } else if (ARGNAME_S3SECRETKEY.equals(args[index])) {
                s3SecretKey = args[++index];
            } else if (ARGNAME_MAXFILES.equals(args[index])) {
                // FIXME - No use of static methods
                WarcFileFilter.setMax(Long.parseLong(args[++index]));
            } else if (ARGNAME_OVERWRITE.equals(args[index])) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[index]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }

    if (inputPath == null || outputPath == null) {
        usage();
        throw new IllegalArgumentException();
    }

    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();
    }

    // Create the Hadoop job. Note: the S3 credentials must be set on the
    // Configuration *before* Job.getInstance(conf), because the Job takes a
    // copy of the configuration at creation time.
    Configuration conf = new Configuration();
    if (inputPath.contains("s3n") && s3AccessKey != null && s3SecretKey != null) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }
    Job job = Job.getInstance(conf);
    job.setJarByClass(GoogleAdsCounterJob.class);
    // Scan the provided input path for WARC files.
    LOG.info("setting input path to '" + inputPath + "'");

    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.addInputPath(job, new Path(inputPath));

    // FIXME - I see the problem that you want to give a dynamic number to a
    // static class. My question is, Is this really required, if we just
    // point to a file in s3 that should solve our problem
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    // Delete the output path directory if it already exists and user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    /*
     * // Defines an additional text based output 'GoogleAdClient' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdClient",
     *         TextOutputFormat.class, Text.class, LongWritable.class);
     *
     * // Defines an additional text based output 'GoogleAdType' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdType",
     *         TextOutputFormat.class, Text.class, LongWritable.class);
     */
    // Set which InputFormat class to use.
    job.setInputFormatClass(WARCInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormatClass(TextOutputFormat.class);

    /*
     * Using MultipleOutputs creates a zero-sized default output (e.g.
     * part-r-00000). To prevent this, use LazyOutputFormat instead of
     * job.setOutputFormatClass(TextOutputFormat.class) in the Hadoop job
     * configuration.
     */
    // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    //   job.setPartitionerClass(GoogleAdsCounterPartitioner.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    //job.setNumReduceTasks(4);
    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(GoogleAdsCounterMapper.class);
    // job.setMapperClass(CrawlMapper_AdStatsDetails.class);
    job.setReducerClass(GoogleAdsCounterReducer.class);

    // set combiner
    //job.setCombinerClass(GoogleAdsCounterReducer.class);

    // set job name
    job.setJobName("CommonCrawl Data Processing : Counting Google Ads");

    long startTime = System.currentTimeMillis();
    // waitForCompletion() must only be called once; a second call on an
    // already-submitted job throws IllegalStateException.
    boolean success = job.waitForCompletion(true);
    if (success) {

        LOG.info("Job completion status : " + success);
        long endTime = System.currentTimeMillis();

        long difference = endTime - startTime;
        LOG.info("Elapsed milliseconds: " + difference);
        Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES);
        LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue());

        Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES);
        LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue());

        return 0;
    } else {
        return 1;
    }
}
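As the commented-out block in this example notes, enabling the MultipleOutputs named outputs while keeping a regular TextOutputFormat leaves empty part-r-NNNNN files behind. The fix the comment describes is a one-line swap; a sketch, not part of the original source:

import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Replaces job.setOutputFormatClass(TextOutputFormat.class); the record
// writer is then only created when the first record is actually written,
// so no empty default output files appear.
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);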

From source file:com.talis.hadoop.rdf.collation.QuadsCollater.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    FileOutputFormat.setCompressOutput(job, true);

    job.setInputFormatClass(NQuadsInputFormat.class);
    job.setMapperClass(CollationMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(QuadWritable.class);

    job.setReducerClass(CollationReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : 1;
}
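A side note on the compression settings above: the mapred.* keys are old-generation property names. They still work through Hadoop's deprecation mapping, but on Hadoop 2.x and later the equivalent keys would, to the best of my knowledge, be:

configuration.setBoolean("mapreduce.map.output.compress", true);
configuration.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
configuration.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");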

From source file:com.talis.labs.pagerank.mapreduce.CheckConvergence.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: CheckConvergence <input path> <output path>");
        return -1;
    }

    FileSystem.get(getConf()).delete(new Path(args[1]), true);

    Job job = new Job(getConf(), "CheckConvergence");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(CheckConvergenceMapper.class);
    job.setCombinerClass(CheckConvergenceReducer.class);
    job.setReducerClass(CheckConvergenceReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.talis.labs.pagerank.mapreduce.CheckingData.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: CheckingData <input path> <output path>");
        return -1;
    }

    FileSystem.get(getConf()).delete(new Path(args[1]), true);

    Job job = new Job(getConf(), "CheckingData");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(CheckingDataMapper.class);
    job.setReducerClass(CheckingDataReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.talis.labs.pagerank.mapreduce.CountPages.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: CountPages <input path> <output path>");
        return -1;
    }

    FileSystem.get(getConf()).delete(new Path(args[1]), true);

    Job job = new Job(getConf(), "CountPages");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(CountPagesMapper.class);
    job.setCombinerClass(CountPagesReducer.class);
    job.setReducerClass(CountPagesReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.talis.labs.pagerank.mapreduce.DanglingPages.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: DanglingPages <input path> <output path>");
        return -1;
    }

    FileSystem.get(getConf()).delete(new Path(args[1]), true);

    Job job = new Job(getConf(), "DanglingPages");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(DanglingPagesMapper.class);
    job.setCombinerClass(DanglingPagesReducer.class);
    job.setReducerClass(DanglingPagesReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.talis.labs.pagerank.mapreduce.InitializePageRanks.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: InitializePageRanks <input path> <output path> <number of pages>");
        return -1;
    }

    Configuration conf = getConf();
    conf.set("pagerank.count", args[2]);

    FileSystem.get(conf).delete(new Path(args[1]), true);

    Job job = new Job(conf, "InitializePageRanks");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(InitializePageRanksMapper.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.talis.labs.pagerank.mapreduce.SortPageRanks.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: SortPageRanks <input path> <output path>");
        return -1;
    }

    FileSystem.get(getConf()).delete(new Path(args[1]), true);

    Job job = new Job(getConf(), "SortPageRanks");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(SortPageRanksMapper.class);
    job.setReducerClass(Reducer.class); // i.e. identity reducer
    job.setSortComparatorClass(DoubleWritableDecreasingComparator.class);

    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(1); // TODO: inefficient, use InputSampler with v0.20.x

    return job.waitForCompletion(true) ? 0 : 1;
}
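The DoubleWritableDecreasingComparator set as the sort comparator above is not shown in this listing. A typical implementation of such a comparator (an assumption; the original class may differ) simply inverts the natural ordering of DoubleWritable:

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.WritableComparable;

public class DoubleWritableDecreasingComparator extends DoubleWritable.Comparator {

    // Invert the object-based comparison.
    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        return -super.compare(a, b);
    }

    // Invert the raw byte-based comparison used during the shuffle sort.
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return -super.compare(b1, s1, l1, b2, s2, l2);
    }
}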

From source file:com.talis.labs.pagerank.mapreduce.UpdatePageRanks.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        System.err.println(
                "Usage: UpdatePageRanks <input path> <output path> <number of pages> <dangling pages contribution>");
        return -1;
    }

    Configuration conf = getConf();
    conf.set("pagerank.count", args[2]);
    conf.set("pagerank.dangling", args[3]);

    FileSystem.get(conf).delete(new Path(args[1]), true);

    Job job = new Job(conf, "UpdatePageRanks");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(UpdatePageRanksMapper.class);
    job.setReducerClass(UpdatePageRanksReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
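Both pagerank.count and pagerank.dangling travel to the map and reduce tasks through the Configuration. On the task side they are typically read back in setup(); below is a hypothetical sketch of how UpdatePageRanksMapper might do this (the original class is not shown in this listing):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class UpdatePageRanksMapper extends Mapper<LongWritable, Text, Text, Text> {

    private long pageCount;
    private double danglingContribution;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Values set with conf.set(...) in the driver are visible here.
        pageCount = Long.parseLong(context.getConfiguration().get("pagerank.count"));
        danglingContribution = Double.parseDouble(context.getConfiguration().get("pagerank.dangling"));
        // map() would use these to redistribute rank mass; omitted in this sketch.
    }
}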

From source file:com.tdunning.plume.local.lazy.MapRedExecutor.java

License:Apache License

/**
 * Returns a Job instance built from a {@link MSCR} entity. It stores the class of
 * the {@link PlumeWorkflow} argument and the MSCR id in the Hadoop configuration.
 *
 * @param mscr The MSCR to convert
 * @param workFlow The workflow whose class will be instantiated by Hadoop mappers/reducers
 * @param workFlowOutputPath The temporary output path of the workflow
 * @param outputPath The output path of the MapRed job
 * @return A Hadoop-executable MapReduce Job
 *
 * @throws IOException
 */
static Job getMapRed(final MSCR mscr, PlumeWorkflow workFlow, String workFlowOutputPath, String outputPath)
        throws IOException {

    Configuration conf = new Configuration();
    conf.set(WORKFLOW_NAME, workFlow.getClass().getName());
    conf.setInt(MSCR_ID, mscr.getId());
    conf.set(TEMP_OUTPUT_PATH, workFlowOutputPath);

    Job job = new Job(conf, "MSCR"); // TODO deprecation

    job.setMapOutputKeyClass(PlumeObject.class);
    job.setMapOutputValueClass(PlumeObject.class);

    job.setJarByClass(MapRedExecutor.class);

    /**
     * Define multiple inputs
     */
    for (PCollection<?> input : mscr.getInputs()) {
        if (!(input instanceof LazyCollection)) {
            throw new IllegalArgumentException("Can't create MapRed from MSCR whose inputs are not LazyTable");
        }
        LazyCollection<Text> l = (LazyCollection<Text>) input;
        if (!(l.isMaterialized() && l.getFile() != null)) {
            // Collections have plume ID only if they are intermediate results - TODO better naming for this
            if (l.getPlumeId().length() < 1) {
                throw new IllegalArgumentException(
                        "Can't create MapRed from MSCR inputs that are not materialized to a file");
            }
        }
        PCollectionType<?> rType = l.getType();
        Class<? extends InputFormat> format = SequenceFileInputFormat.class;
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            if (tType.valueType() instanceof StringType && tType.keyType() instanceof StringType) {
                format = KeyValueTextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        } else {
            if (rType.elementType() instanceof StringType) {
                format = TextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        }
    }
    /**
     * Define multiple outputs
     */
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    for (Map.Entry<PCollection<?>, Integer> entry : mscr.getNumberedChannels().entrySet()) {
        PCollectionType<?> rType = ((LazyCollection<?>) mscr.getOutputChannels().get(entry.getKey()).output)
                .getType();
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (tType.keyType() instanceof StringType && tType.valueType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat,
                    getHadoopType(tType.keyType()), getHadoopType(tType.valueType()));
        } else {
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (rType.elementType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat, NullWritable.class,
                    getHadoopType(rType.elementType()));
        }
    }
    /**
     * Define Reducer & Combiner
     */
    job.setCombinerClass(MSCRCombiner.class);
    job.setReducerClass(MSCRReducer.class);

    job.setNumReduceTasks(1);
    return job;
}