Example usage for org.apache.hadoop.mapreduce Job setMapOutputValueClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setMapOutputValueClass.

Prototype

public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException

Source Link

Document

Set the value class for the map output data.

Usage

From source file:com.hortonworks.pso.data.generator.mapreduce.DataGenTool.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Job job = Job.getInstance(getConf()); // new Job(conf, this.getClass().getCanonicalName());

    //        Configuration conf = getConf();

    int mappers = 2;
    String output = null;/*from  w  ww  .ja  va  2  s  .  c om*/
    String config = null;
    long count = 100;

    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-mappers".equals(args[i])) {
                mappers = Integer.parseInt(args[++i]);
                otherArgs.add("-Dmapreduce.job.maps=" + Integer.toString(mappers));
            } else if ("-output".equals(args[i])) {
                output = args[++i];
            } else if ("-json.cfg".equals(args[i])) {
                config = args[++i];
            } else if ("-count".equals(args[i])) {
                count = Long.parseLong(args[++i]);
            } else {
                otherArgs.add(args[i]);
            }

        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    job.getConfiguration().set("json.cfg", config);

    String[] altArgs = new String[otherArgs.size()];
    otherArgs.toArray(altArgs);

    GenericOptionsParser gop = new GenericOptionsParser(job.getConfiguration(), altArgs);

    DataGenInputFormat.setNumberOfRows(job, count);

    job.setJarByClass(DataGenTool.class);

    Path output_path = new Path(output);

    if (output_path.getFileSystem(getConf()).exists(output_path)) {
        throw new IOException("Output directory " + output_path + " already exists.");
    }

    FileOutputFormat.setOutputPath(job, output_path);

    job.setMapperClass(DataGenMapper.class);
    // Map Only Job
    job.setNumReduceTasks(0);
    //        job.setReducerClass(RerateReducer.class);

    job.setInputFormatClass(DataGenInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    //        job.setOutputKeyClass(Text.class);
    //        job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;

}

From source file:com.howbuy.hadoop.mr.online.SecondarySort.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysrot <in> <out>");
        System.exit(2);/*from  www.  ja va  2s . co m*/
    }
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setNumReduceTasks(3);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.iflytek.spider.crawl.GeneratorSmart.java

License:Apache License

private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir, int numLists)
        throws IOException, InterruptedException, ClassNotFoundException {
    // invert again, partition by host/domain/IP, sort by url hash
    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: Partitioning selected urls for politeness:" + inputDir);
    }/*from  w  w w. ja  v  a  2 s.  com*/
    Path segment = new Path(segmentsDir, generateSegmentName());
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);

    LOG.info("Generator: segment: " + segment + " with " + numLists + " Fetchers");

    Job job = AvroJob.getAvroJob(getConf());
    job.setJobName("generate: partition " + segment);
    job.getConfiguration().setInt("partition.url.seed", new Random().nextInt());

    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorInverseMapper.class);
    job.setPartitionerClass(AveragePartition.class);
    job.setMapOutputKeyClass(String.class);
    job.setMapOutputValueClass(SelectorEntry.class);
    job.setReducerClass(PartitionReducer.class);
    job.setNumReduceTasks(numLists);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.waitForCompletion(true);
    return segment;
}

From source file:com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java

License:Open Source License

@SuppressWarnings({ "unchecked", "rawtypes" })
public String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation)
        throws IOException, SAXException, ParserConfigurationException {
    StringWriter xml = new StringWriter();
    String outputCollection = job.outputCollectionTemp;// (non-append mode) 
    if ((null != job.appendResults) && job.appendResults)
        outputCollection = job.outputCollection; // (append mode, write directly in....)
    else if (null != job.incrementalMode)
        job.incrementalMode = false; // (not allowed to be in incremental mode and not update mode)

    createConfigXML(xml, job.jobtitle, job.inputCollection,
            InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS),
            job.isCustomTable, job.getOutputDatabase(), job._id.toString(), outputCollection, job.mapper,
            job.reducer, job.combiner,/*ww  w.jav a  2 s  .com*/
            InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY),
            job.communityIds, job.outputKey, job.outputValue, job.arguments, job.incrementalMode,
            job.submitterID, job.selfMerge, job.outputCollection, job.appendResults);

    ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader();

    URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
            savedClassLoader);
    Thread.currentThread().setContextClassLoader(child);

    // Check version: for now, any infinit.e.data_model with an VersionTest class is acceptable
    boolean dataModelLoaded = true;
    try {
        URLClassLoader versionTest = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
                null);
        try {
            Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, versionTest);
        } catch (ClassNotFoundException e2) {
            //(this is fine, will use the cached version)
            dataModelLoaded = false;
        }
        if (dataModelLoaded)
            Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoVersionTest", true, versionTest);
    } catch (ClassNotFoundException e1) {
        throw new RuntimeException(
                "This JAR is compiled with too old a version of the data-model, please recompile with Jan 2014 (rc2) onwards");
    }

    // Now load the XML into a configuration object: 
    Configuration config = new Configuration();
    // Add the client configuration overrides:
    if (!bLocalMode) {
        String hadoopConfigPath = props_custom.getHadoopConfigPath() + "/hadoop/";
        config.addResource(new Path(hadoopConfigPath + "core-site.xml"));
        config.addResource(new Path(hadoopConfigPath + "mapred-site.xml"));
        config.addResource(new Path(hadoopConfigPath + "hadoop-site.xml"));
    } //TESTED

    try {
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes()));
        NodeList nList = doc.getElementsByTagName("property");

        for (int temp = 0; temp < nList.getLength(); temp++) {
            Node nNode = nList.item(temp);
            if (nNode.getNodeType() == Node.ELEMENT_NODE) {
                Element eElement = (Element) nNode;
                String name = getTagValue("name", eElement);
                String value = getTagValue("value", eElement);
                if ((null != name) && (null != value)) {
                    config.set(name, value);
                }
            }
        }
    } catch (Exception e) {
        throw new IOException(e.getMessage());
    }

    // Some other config defaults:
    // (not sure if these are actually applied, or derived from the defaults - for some reason they don't appear in CDH's client config)
    config.set("mapred.map.tasks.speculative.execution", "false");
    config.set("mapred.reduce.tasks.speculative.execution", "false");
    // (default security is ignored here, have it set via HADOOP_TASKTRACKER_CONF in cloudera)

    // Now run the JAR file
    try {
        BasicDBObject advancedConfigurationDbo = null;
        try {
            advancedConfigurationDbo = (null != job.query)
                    ? ((BasicDBObject) com.mongodb.util.JSON.parse(job.query))
                    : (new BasicDBObject());
        } catch (Exception e) {
            advancedConfigurationDbo = new BasicDBObject();
        }
        boolean esMode = advancedConfigurationDbo.containsField("qt") && !job.isCustomTable;
        if (esMode && !job.inputCollection.equals("doc_metadata.metadata")) {
            throw new RuntimeException(
                    "Infinit.e Queries are only supported on doc_metadata - use MongoDB queries instead.");
        }

        config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)
        if (bLocalMode) { // local job tracker and FS mode
            config.set("mapred.job.tracker", "local");
            config.set("fs.default.name", "local");
        } else {
            if (bTestMode) { // run job tracker locally but FS mode remotely
                config.set("mapred.job.tracker", "local");
            } else { // normal job tracker
                String trackerUrl = HadoopUtils.getXMLProperty(
                        props_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker");
                config.set("mapred.job.tracker", trackerUrl);
            }
            String fsUrl = HadoopUtils.getXMLProperty(
                    props_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name");
            config.set("fs.default.name", fsUrl);
        }
        if (!dataModelLoaded && !(bTestMode || bLocalMode)) { // If running distributed and no data model loaded then add ourselves
            Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
                    "infinit.e.data_model.jar", config);
            DistributedCache.addFileToClassPath(jarToCache, config);
            jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
                    "infinit.e.processing.custom.library.jar", config);
            DistributedCache.addFileToClassPath(jarToCache, config);
        } //TESTED

        // Debug scripts (only if they exist), and only in non local/test mode
        if (!bLocalMode && !bTestMode) {

            try {
                Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
                        "custom_map_error_handler.sh", config);
                config.set("mapred.map.task.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
                config.set("mapreduce.map.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
                DistributedCache.createSymlink(config);
                DistributedCache.addCacheFile(scriptToCache.toUri(), config);
            } catch (Exception e) {
            } // just carry on

            try {
                Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
                        "custom_reduce_error_handler.sh", config);
                config.set("mapred.reduce.task.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
                config.set("mapreduce.reduce.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
                DistributedCache.createSymlink(config);
                DistributedCache.addCacheFile(scriptToCache.toUri(), config);
            } catch (Exception e) {
            } // just carry on

        } //TODO (???): TOTEST

        // (need to do these 2 things here before the job is created, at which point the config class has been copied across)
        //1)
        Class<?> mapperClazz = Class.forName(job.mapper, true, child);
        if (ICustomInfiniteInternalEngine.class.isAssignableFrom(mapperClazz)) { // Special case: internal custom engine, so gets an additional integration hook
            ICustomInfiniteInternalEngine preActivities = (ICustomInfiniteInternalEngine) mapperClazz
                    .newInstance();
            preActivities.preTaskActivities(job._id, job.communityIds, config, !(bTestMode || bLocalMode));
        } //TESTED
          //2)
        if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {
            // Need to download the GridFSZip file
            try {
                Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/unbundled/",
                        "GridFSZipFile.jar", config);
                DistributedCache.addFileToClassPath(jarToCache, config);
            } catch (Throwable t) {
            } // (this is fine, will already be on the classpath .. otherwise lots of other stuff will be failing all over the place!)            
        }

        if (job.inputCollection.equals("records")) {

            InfiniteElasticsearchHadoopUtils.handleElasticsearchInput(job, config, advancedConfigurationDbo);

            //(won't run under 0.19 so running with "records" should cause all sorts of exceptions)

        } //TESTED (by hand)         

        if (bTestMode || bLocalMode) { // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place
            config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
        }

        // Manually specified caches
        List<URL> localJarCaches = InfiniteHadoopUtils.handleCacheList(advancedConfigurationDbo.get("$caches"),
                job, config, props_custom);

        Job hj = new Job(config); // (NOTE: from here, changes to config are ignored)
        try {

            if (null != localJarCaches) {
                if (bLocalMode || bTestMode) {
                    Method method = URLClassLoader.class.getDeclaredMethod("addURL", new Class[] { URL.class });
                    method.setAccessible(true);
                    method.invoke(child, localJarCaches.toArray());

                } //TOTEST (tested logically)
            }
            Class<?> classToLoad = Class.forName(job.mapper, true, child);
            hj.setJarByClass(classToLoad);

            if (job.inputCollection.equalsIgnoreCase("filesystem")) {
                String inputPath = null;
                try {
                    inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
                    if (!inputPath.endsWith("/")) {
                        inputPath = inputPath + "/";
                    }
                } catch (Exception e) {
                }
                if (null == inputPath) {
                    throw new RuntimeException("Must specify 'file.url' if reading from filesystem.");
                }
                inputPath = InfiniteHadoopUtils.authenticateInputDirectory(job, inputPath);

                InfiniteFileInputFormat.addInputPath(hj, new Path(inputPath + "*/*")); // (that extra bit makes it recursive)
                InfiniteFileInputFormat.setMaxInputSplitSize(hj, 33554432); // (32MB)
                InfiniteFileInputFormat.setInfiniteInputPathFilter(hj, config);
                hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat", true, child));
            } else if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {

                String[] oidStrs = null;
                try {
                    String inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
                    Pattern oidExtractor = Pattern.compile("inf://share/([^/]+)");
                    Matcher m = oidExtractor.matcher(inputPath);
                    if (m.find()) {
                        oidStrs = m.group(1).split("\\s*,\\s*");

                    } else {
                        throw new RuntimeException(
                                "file.url must be in format inf://share/<oid-list>/<string>: " + inputPath);
                    }
                    InfiniteHadoopUtils.authenticateShareList(job, oidStrs);
                } catch (Exception e) {
                    throw new RuntimeException(
                            "Authentication error: " + e.getMessage() + ": " + advancedConfigurationDbo, e);
                }

                hj.getConfiguration().setStrings("mapred.input.dir", oidStrs);
                hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat", true, child));
            } else if (job.inputCollection.equals("records")) {
                hj.setInputFormatClass((Class<? extends InputFormat>) Class
                        .forName("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat", true, child));
            } else {
                if (esMode) {
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                            "com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat",
                            true, child));
                } else {
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                            "com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child));
                }
            }
            if ((null != job.exportToHdfs) && job.exportToHdfs) {

                //TODO (INF-2469): Also, if the output key is BSON then also run as text (but output as JSON?)

                Path outPath = InfiniteHadoopUtils.ensureOutputDirectory(job, props_custom);

                if ((null != job.outputKey) && (null != job.outputValue)
                        && job.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text")
                        && job.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) {
                    // (slight hack before I sort out the horrendous job class - if key/val both text and exporting to HDFS then output as Text)
                    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
                            .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", true, child));
                    TextOutputFormat.setOutputPath(hj, outPath);
                } //TESTED
                else {
                    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
                            "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child));
                    SequenceFileOutputFormat.setOutputPath(hj, outPath);
                } //TESTED
            } else { // normal case, stays in MongoDB
                hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat", true, child));
            }
            hj.setMapperClass((Class<? extends Mapper>) mapperClazz);
            String mapperOutputKeyOverride = advancedConfigurationDbo.getString("$mapper_key_class", null);
            if (null != mapperOutputKeyOverride) {
                hj.setMapOutputKeyClass(Class.forName(mapperOutputKeyOverride));
            } //TESTED 

            String mapperOutputValueOverride = advancedConfigurationDbo.getString("$mapper_value_class", null);
            if (null != mapperOutputValueOverride) {
                hj.setMapOutputValueClass(Class.forName(mapperOutputValueOverride));
            } //TESTED 

            if ((null != job.reducer) && !job.reducer.startsWith("#") && !job.reducer.equalsIgnoreCase("null")
                    && !job.reducer.equalsIgnoreCase("none")) {
                hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child));
                // Variable reducers:
                if (null != job.query) {
                    try {
                        hj.setNumReduceTasks(advancedConfigurationDbo.getInt("$reducers", 1));
                    } catch (Exception e) {
                        try {
                            // (just check it's not a string that is a valid int)
                            hj.setNumReduceTasks(
                                    Integer.parseInt(advancedConfigurationDbo.getString("$reducers", "1")));
                        } catch (Exception e2) {
                        }
                    }
                } //TESTED
            } else {
                hj.setNumReduceTasks(0);
            }
            if ((null != job.combiner) && !job.combiner.startsWith("#")
                    && !job.combiner.equalsIgnoreCase("null") && !job.combiner.equalsIgnoreCase("none")) {
                hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child));
            }
            hj.setOutputKeyClass(Class.forName(job.outputKey, true, child));
            hj.setOutputValueClass(Class.forName(job.outputValue, true, child));

            hj.setJobName(job.jobtitle);
            currJobName = job.jobtitle;
        } catch (Error e) { // (messing about with class loaders = lots of chances for errors!)
            throw new RuntimeException(e.getMessage(), e);
        }
        if (bTestMode || bLocalMode) {
            hj.submit();
            currThreadId = null;
            Logger.getRootLogger().addAppender(this);
            currLocalJobId = hj.getJobID().toString();
            currLocalJobErrs.setLength(0);
            while (!hj.isComplete()) {
                Thread.sleep(1000);
            }
            Logger.getRootLogger().removeAppender(this);
            if (hj.isSuccessful()) {
                if (this.currLocalJobErrs.length() > 0) {
                    return "local_done: " + this.currLocalJobErrs.toString();
                } else {
                    return "local_done";
                }
            } else {
                return "Error: " + this.currLocalJobErrs.toString();
            }
        } else {
            hj.submit();
            String jobId = hj.getJobID().toString();
            return jobId;
        }
    } catch (Exception e) {
        e.printStackTrace();
        Thread.currentThread().setContextClassLoader(savedClassLoader);
        return "Error: " + InfiniteHadoopUtils.createExceptionMessage(e);
    } finally {
        Thread.currentThread().setContextClassLoader(savedClassLoader);
    }
}

From source file:com.impetus.code.examples.hadoop.cassandra.wordcount.WordCount.java

License:Apache License

public int run(String[] args) throws Exception {
    String outputReducerType = "cassandra";
    if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR)) {
        String[] s = args[0].split("=");
        if (s != null && s.length == 2)
            outputReducerType = s[1];/*  w w w .j  a  v  a2  s.  c o  m*/
    }
    logger.info("output reducer type: " + outputReducerType);

    for (int i = 0; i < WordCountSetup.TEST_COUNT; i++) {
        String columnName = "text" + i;
        getConf().set(CONF_COLUMN_NAME, columnName);

        Job job = new Job(getConf(), "wordcount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);

        if (outputReducerType.equalsIgnoreCase("filesystem")) {
            job.setCombinerClass(ReducerToFilesystem.class);
            job.setReducerClass(ReducerToFilesystem.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i));
        } else {
            job.setReducerClass(ReducerToCassandra.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(ByteBuffer.class);
            job.setOutputValueClass(List.class);

            job.setOutputFormatClass(ColumnFamilyOutputFormat.class);

            ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
        }

        job.setInputFormatClass(ColumnFamilyInputFormat.class);

        ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
        ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost");
        ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, INPUT_COLUMN_FAMILY);
        SlicePredicate predicate = new SlicePredicate()
                .setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
        ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

        job.waitForCompletion(true);
    }
    return 0;
}

From source file:com.inmobi.conduit.distcp.tools.DistCp.java

License:Apache License

/**
 * Create Job object for submitting it, with all the configuration
 *
 * @return Reference to job object./*from w  w w. ja v  a  2 s. c om*/
 * @throws IOException - Exception if any
 */
protected Job createJob() throws IOException {
    String jobName = "distcp";
    String userChosenName = getConf().get("mapred.job.name");
    if (userChosenName != null)
        jobName += ": " + userChosenName;
    Job job = new Job(getConf(), jobName);
    job.setInputFormatClass(DistCpUtils.getStrategy(getConf(), inputOptions));
    job.setJarByClass(CopyMapper.class);
    configureOutputFormat(job);

    job.setMapperClass(CopyMapper.class);
    job.setReducerClass(Reducer.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(CopyOutputFormat.class);
    job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false");
    job.getConfiguration().set(DistCpConstants.CONF_LABEL_NUM_MAPS, String.valueOf(inputOptions.getMaxMaps()));

    if (inputOptions.getSslConfigurationFile() != null) {
        setupSSLConfig(job.getConfiguration());
    }

    inputOptions.appendToConf(job.getConfiguration());
    return job;
}

From source file:com.inmobi.conduit.local.LocalStreamService.java

License:Apache License

protected Job createJob(Path inputPath, long totalSize) throws IOException {
    String jobName = getName();//from   w  w w. ja  va2s  . com
    Configuration conf = currentCluster.getHadoopConf();
    conf.set(ConduitConstants.AUDIT_ENABLED_KEY, System.getProperty(ConduitConstants.AUDIT_ENABLED_KEY));
    Job job = new Job(conf);
    job.setJobName(jobName);
    // DistributedCache.addFileToClassPath(inputFormatJarDestPath,
    // job.getConfiguration());
    job.getConfiguration().set("tmpjars",
            inputFormatJarDestPath.toString() + "," + auditUtilJarDestPath.toString());
    LOG.debug("Adding file [" + inputFormatJarDestPath + "] to distributed cache");
    job.setInputFormatClass(UniformSizeInputFormat.class);
    Class<? extends Mapper<Text, FileStatus, NullWritable, Text>> mapperClass = getMapperClass();
    job.setJarByClass(mapperClass);

    job.setMapperClass(mapperClass);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    // setting identity reducer
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, tmpCounterOutputPath);
    job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false");
    job.getConfiguration().set(LOCALSTREAM_TMP_PATH, tmpPath.toString());
    job.getConfiguration().set(SRC_FS_DEFAULT_NAME_KEY, srcCluster.getHadoopConf().get(FS_DEFAULT_NAME_KEY));

    // set configurations needed for UniformSizeInputFormat
    int numMaps = getNumMapsForJob(totalSize);
    job.getConfiguration().setInt(DistCpConstants.CONF_LABEL_NUM_MAPS, numMaps);
    job.getConfiguration().setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, totalSize);
    job.getConfiguration().set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, inputPath.toString());
    LOG.info("Expected number of maps [" + numMaps + "] Total data size [" + totalSize + "]");

    return job;
}

From source file:com.intel.hadoop.hbase.dot.mapreduce.DotImportTsv.java

License:Apache License

/**
 * Sets up the actual job.//from   w  w  w . j a va  2 s . c  o  m
 *
 * @param conf  The current configuration.
 * @param args  The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args)
        throws IOException, ClassNotFoundException {

    // Support non-XML supported characters
    // by re-encoding the passed separator as a Base64 string.
    String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
    if (actualSeparator != null) {
        conf.set(SEPARATOR_CONF_KEY, Base64.encodeBytes(actualSeparator.getBytes()));
    }

    // See if a non-default Mapper was set
    String mapperClassName = conf.get(MAPPER_CONF_KEY);
    Class mapperClass = mapperClassName != null ? Class.forName(mapperClassName) : DEFAULT_MAPPER;

    String tableName = args[0];
    Path inputDir = new Path(args[1]);
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(mapperClass);
    FileInputFormat.setInputPaths(job, inputDir);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(mapperClass);

    String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
    if (hfileOutPath != null) {
        if (!doesTableExist(tableName)) {
            createTable(conf, tableName);
        }
        HTable table = new HTable(conf, tableName);
        job.setReducerClass(PutSortReducer.class);
        Path outputDir = new Path(hfileOutPath);
        FileOutputFormat.setOutputPath(job, outputDir);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);
        HFileOutputFormat.configureIncrementalLoad(job, table);
    } else {
        // No reducers.  Just write straight to table.  Call initTableReducerJob
        // to set up the TableOutputFormat.
        TableMapReduceUtil.initTableReducerJob(tableName, null, job);
        job.setNumReduceTasks(0);
    }

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
            com.google.common.base.Function.class /* Guava used by TsvParser */);
    return job;
}

From source file:com.intel.hibench.DFSIOWriter.java

License:Apache License

@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    startTime = System.currentTimeMillis();
    benchData.put(new Put(ONE, ONE, startTime));

    Job job = context.getHadoopJob();
    job.setInputFormatClass(RandomInputFormat.class);
    job.setMapperClass(Generator.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(0);//from  ww w .  j a  v  a  2 s .  c o  m

    String sizeStr = context.getRuntimeArguments().get("size");
    if (sizeStr != null) {
        LOG.info("size we get in config is : " + sizeStr);
        long totalBytes = Long.valueOf(sizeStr) * 1024 * 1024;
        job.getConfiguration().setLong(BENCH_SIZE, totalBytes);
        benchData.put(new Put(ONE, THREE, totalBytes));
    }

}

From source file:com.jbw.recommendsystem.add.AddMRD.java

@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    Path in = new Path(conf.get("input"));
    Path out = new Path(conf.get("output"));

    Job job = Job.getInstance(conf);
    job.setJobName("5");
    job.setJarByClass(AddMRD.class);

    job.setMapperClass(AddMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(AddReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    TextInputFormat.addInputPath(job, in);
    TextOutputFormat.setOutputPath(job, out);

    return job.waitForCompletion(true) ? 0 : 1;
}