Example usage for org.apache.hadoop.mapreduce Job setMapOutputValueClass

Introduction

This page collects real-world usage examples for org.apache.hadoop.mapreduce.Job#setMapOutputValueClass.

Prototype

public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException 

Document

Set the value class for the map output data.
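The map output value class must be declared whenever it differs from the final output value class set via setOutputValueClass, and the setter must be called before the job is submitted (afterwards it throws IllegalStateException). Below is a minimal, self-contained driver sketch; the class and job names are illustrative and do not come from any of the sources listed further down.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetMapOutputValueClassExample {

    // Toy mapper: emits the byte length of each input line under a single key.
    public static class LineLengthMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(new Text("length"), new IntWritable(value.getLength()));
        }
    }

    // Sums the lengths per key.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setMapOutputValueClass example");
        job.setJarByClass(SetMapOutputValueClassExample.class);
        job.setMapperClass(LineLengthMapper.class);
        job.setReducerClass(SumReducer.class);

        // Declare the map output types explicitly; calling these setters after
        // the job has been submitted throws IllegalStateException.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}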

Usage

From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterJob.java

License:Apache License

/**
 * Configures and submits the MapReduce job to Hadoop.
 */
public int run(String[] args) throws Exception {

    String inputPath = null;
    String outputPath = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments. We're not using GenericOptionsParser
    // to prevent having to include commons.cli as a dependency.
    for (int index = 0; index < args.length; index++) {
        try {

            if (ARGNAME_INPATH.equals(args[index])) {
                inputPath = args[++index];
            } else if (ARGNAME_OUTPATH.equals(args[index])) {
                outputPath = args[++index];
            } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) {
                s3AccessKey = args[++index];
            } else if (ARGNAME_S3SECRETKEY.equals(args[index])) {
                s3SecretKey = args[++index];
            } else if (ARGNAME_MAXFILES.equals(args[index])) {
                // FIXME - No use of static methods
                WarcFileFilter.setMax(Long.parseLong(args[++index]));
            } else if (ARGNAME_OVERWRITE.equals(args[index])) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[index]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }

    if (inputPath == null || outputPath == null) {
        usage();
        throw new IllegalArgumentException();
    }

    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();
    }

    // Create the Hadoop job. Note: the S3 credentials must be set on the
    // Configuration *before* Job.getInstance(conf), because the Job takes a
    // copy of the configuration at creation time.
    Configuration conf = new Configuration();
    if (inputPath.contains("s3n") && s3AccessKey != null && s3SecretKey != null) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }
    Job job = Job.getInstance(conf);
    job.setJarByClass(GoogleAdsCounterJob.class);
    // Scan the provided input path for WARC files.
    LOG.info("setting input path to '" + inputPath + "'");

    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.addInputPath(job, new Path(inputPath));

    // FIXME - I see the problem that you want to give a dynamic number to a
    // static class. My question is, Is this really required, if we just
    // point to a file in s3 that should solve our problem
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    // Delete the output path directory if it already exists and user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    /*
     * // Defines an additional text based output 'GoogleAdClient' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdClient",
     *         TextOutputFormat.class, Text.class, LongWritable.class);
     *
     * // Defines an additional text based output 'GoogleAdType' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdType",
     *         TextOutputFormat.class, Text.class, LongWritable.class);
     */
    // Set which InputFormat class to use.
    job.setInputFormatClass(WARCInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormatClass(TextOutputFormat.class);

    /*
     * Using MultipleOutputs creates a zero-sized default output (e.g.
     * part-r-00000). To prevent this, use LazyOutputFormat instead of
     * job.setOutputFormatClass(TextOutputFormat.class) in the Hadoop job
     * configuration.
     */
    // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    //   job.setPartitionerClass(GoogleAdsCounterPartitioner.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    //job.setNumReduceTasks(4);
    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(GoogleAdsCounterMapper.class);
    // job.setMapperClass(CrawlMapper_AdStatsDetails.class);
    job.setReducerClass(GoogleAdsCounterReducer.class);

    // set combiner
    //job.setCombinerClass(GoogleAdsCounterReducer.class);

    // set job name
    job.setJobName("CommonCrawl Data Processing : Counting Google Ads");

    long startTime = System.currentTimeMillis();
    // waitForCompletion() must only be called once; a second call on an
    // already-submitted job throws IllegalStateException.
    boolean success = job.waitForCompletion(true);
    if (success) {

        LOG.info("Job completion status : " + success);
        long endTime = System.currentTimeMillis();

        long difference = endTime - startTime;
        LOG.info("Elapsed milliseconds: " + difference);
        Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES);
        LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue());

        Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES);
        LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue());

        return 0;
    } else {
        return 1;
    }
}
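As the commented-out block in this example notes, enabling the MultipleOutputs named outputs while keeping a regular TextOutputFormat leaves empty part-r-NNNNN files behind. The fix the comment describes is a one-line swap; a sketch, not part of the original source:

import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Replaces job.setOutputFormatClass(TextOutputFormat.class); the record
// writer is then only created when the first record is actually written,
// so no empty default output files appear.
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);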

From source file:com.talis.hadoop.rdf.collation.QuadsCollater.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    FileOutputFormat.setCompressOutput(job, true);

    job.setInputFormatClass(NQuadsInputFormat.class);
    job.setMapperClass(CollationMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(QuadWritable.class);

    job.setReducerClass(CollationReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : 1;
}
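A side note on the compression settings above: the mapred.* keys are old-generation property names. They still work through Hadoop's deprecation mapping, but on Hadoop 2.x and later the equivalent keys would, to the best of my knowledge, be:

configuration.setBoolean("mapreduce.map.output.compress", true);
configuration.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
configuration.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");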

From source file:com.talis.labs.pagerank.mapreduce.CheckConvergence.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: CheckConvergence <input path> <output path>");
        return -1;
    }

    FileSystem.get(getConf()).delete(new Path(args[1]), true);

    Job job = new Job(getConf(), "CheckConvergence");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(CheckConvergenceMapper.class);
    job.setCombinerClass(CheckConvergenceReducer.class);
    job.setReducerClass(CheckConvergenceReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.talis.labs.pagerank.mapreduce.CheckingData.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: CheckingData <input path> <output path>");
        return -1;
    }

    FileSystem.get(getConf()).delete(new Path(args[1]), true);

    Job job = new Job(getConf(), "CheckingData");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(CheckingDataMapper.class);
    job.setReducerClass(CheckingDataReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.talis.labs.pagerank.mapreduce.CountPages.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: CountPages <input path> <output path>");
        return -1;
    }

    FileSystem.get(getConf()).delete(new Path(args[1]), true);

    Job job = new Job(getConf(), "CountPages");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(CountPagesMapper.class);
    job.setCombinerClass(CountPagesReducer.class);
    job.setReducerClass(CountPagesReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.talis.labs.pagerank.mapreduce.DanglingPages.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: DanglingPages <input path> <output path>");
        return -1;
    }

    FileSystem.get(getConf()).delete(new Path(args[1]), true);

    Job job = new Job(getConf(), "DanglingPages");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(DanglingPagesMapper.class);
    job.setCombinerClass(DanglingPagesReducer.class);
    job.setReducerClass(DanglingPagesReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.talis.labs.pagerank.mapreduce.InitializePageRanks.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: InitializePageRanks <input path> <output path> <number of pages>");
        return -1;
    }

    Configuration conf = getConf();
    conf.set("pagerank.count", args[2]);

    FileSystem.get(conf).delete(new Path(args[1]), true);

    Job job = new Job(conf, "InitializePageRanks");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(InitializePageRanksMapper.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.talis.labs.pagerank.mapreduce.SortPageRanks.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: SortPageRanks <input path> <output path>");
        return -1;
    }

    FileSystem.get(getConf()).delete(new Path(args[1]), true);

    Job job = new Job(getConf(), "SortPageRanks");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(SortPageRanksMapper.class);
    job.setReducerClass(Reducer.class); // i.e. identity reducer
    job.setSortComparatorClass(DoubleWritableDecreasingComparator.class);

    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(1); // TODO: inefficient, use InputSampler with v0.20.x

    return job.waitForCompletion(true) ? 0 : 1;
}
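The DoubleWritableDecreasingComparator set as the sort comparator above is not shown in this listing. A typical implementation of such a comparator (an assumption; the original class may differ) simply inverts the natural ordering of DoubleWritable:

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.WritableComparable;

public class DoubleWritableDecreasingComparator extends DoubleWritable.Comparator {

    // Invert the object-based comparison.
    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        return -super.compare(a, b);
    }

    // Invert the raw byte-based comparison used during the shuffle sort.
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return -super.compare(b1, s1, l1, b2, s2, l2);
    }
}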

From source file:com.talis.labs.pagerank.mapreduce.UpdatePageRanks.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        System.err.println(
                "Usage: UpdatePageRanks <input path> <output path> <number of pages> <dangling pages contribution>");
        return -1;
    }

    Configuration conf = getConf();
    conf.set("pagerank.count", args[2]);
    conf.set("pagerank.dangling", args[3]);

    FileSystem.get(conf).delete(new Path(args[1]), true);

    Job job = new Job(conf, "UpdatePageRanks");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(UpdatePageRanksMapper.class);
    job.setReducerClass(UpdatePageRanksReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
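Both pagerank.count and pagerank.dangling travel to the map and reduce tasks through the Configuration. On the task side they are typically read back in setup(); below is a hypothetical sketch of how UpdatePageRanksMapper might do this (the original class is not shown in this listing):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class UpdatePageRanksMapper extends Mapper<LongWritable, Text, Text, Text> {

    private long pageCount;
    private double danglingContribution;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Values set with conf.set(...) in the driver are visible here.
        pageCount = Long.parseLong(context.getConfiguration().get("pagerank.count"));
        danglingContribution = Double.parseDouble(context.getConfiguration().get("pagerank.dangling"));
        // map() would use these to redistribute rank mass; omitted in this sketch.
    }
}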

From source file:com.tdunning.plume.local.lazy.MapRedExecutor.java

License:Apache License

/**
 * Returns a Job instance built from a {@link MSCR} entity. It stores the class of
 * the {@link PlumeWorkflow} argument and the MSCR id in the Hadoop configuration.
 *
 * @param mscr The MSCR to convert
 * @param workFlow The workflow whose class will be instantiated by Hadoop mappers/reducers
 * @param workFlowOutputPath The temporary output path of the workflow
 * @param outputPath The output path of the MapRed job
 * @return A Hadoop-executable MapReduce Job
 *
 * @throws IOException
 */
static Job getMapRed(final MSCR mscr, PlumeWorkflow workFlow, String workFlowOutputPath, String outputPath)
        throws IOException {

    Configuration conf = new Configuration();
    conf.set(WORKFLOW_NAME, workFlow.getClass().getName());
    conf.setInt(MSCR_ID, mscr.getId());
    conf.set(TEMP_OUTPUT_PATH, workFlowOutputPath);

    Job job = new Job(conf, "MSCR"); // TODO deprecation

    job.setMapOutputKeyClass(PlumeObject.class);
    job.setMapOutputValueClass(PlumeObject.class);

    job.setJarByClass(MapRedExecutor.class);

    /**
     * Define multiple inputs
     */
    for (PCollection<?> input : mscr.getInputs()) {
        if (!(input instanceof LazyCollection)) {
            throw new IllegalArgumentException("Can't create MapRed from MSCR whose inputs are not LazyTable");
        }
        LazyCollection<Text> l = (LazyCollection<Text>) input;
        if (!(l.isMaterialized() && l.getFile() != null)) {
            // Collections have plume ID only if they are intermediate results - TODO better naming for this
            if (l.getPlumeId().length() < 1) {
                throw new IllegalArgumentException(
                        "Can't create MapRed from MSCR inputs that are not materialized to a file");
            }
        }
        PCollectionType<?> rType = l.getType();
        Class<? extends InputFormat> format = SequenceFileInputFormat.class;
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            if (tType.valueType() instanceof StringType && tType.keyType() instanceof StringType) {
                format = KeyValueTextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        } else {
            if (rType.elementType() instanceof StringType) {
                format = TextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        }
    }
    /**
     * Define multiple outputs
     */
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    for (Map.Entry<PCollection<?>, Integer> entry : mscr.getNumberedChannels().entrySet()) {
        PCollectionType<?> rType = ((LazyCollection<?>) mscr.getOutputChannels().get(entry.getKey()).output)
                .getType();
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (tType.keyType() instanceof StringType && tType.valueType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat,
                    getHadoopType(tType.keyType()), getHadoopType(tType.valueType()));
        } else {
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (rType.elementType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat, NullWritable.class,
                    getHadoopType(rType.elementType()));
        }
    }
    /**
     * Define Reducer & Combiner
     */
    job.setCombinerClass(MSCRCombiner.class);
    job.setReducerClass(MSCRReducer.class);

    job.setNumReduceTasks(1);
    return job;
}