Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

List of usage examples for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job setInputFormatClass.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
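
setInputFormatClass must be called while the job is still being defined; once the job has been submitted it throws IllegalStateException. Below is a minimal, self-contained sketch (not taken from any of the sources listed later) using the built-in TextInputFormat; the input path is a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "set-input-format-example");
        job.setJarByClass(SetInputFormatExample.class);

        // Tell the job how to read and split its input.
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path("/tmp/input")); // placeholder path

        // ... set mapper/reducer, output classes and output path, then submit.
    }
}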

Usage

From source file:com.mongodb.hadoop.input.MongoMultipleInputs.java

License:Apache License

/**
 * Add a {@link Path} with a custom {@link InputFormat} and
 * {@link Mapper} to the list of inputs for the map-reduce job.
 * @param job The {@link Job}
 * @param uri MongoDB collection URI to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 * @param mapperClass {@link Mapper} class to use for this path
 * @param query {@link DBObject} query for path and mapper
 * @param fields {@link DBObject} fields for path and mapper
 * @param sort {@link DBObject} sort for path and mapper
 * @param limit limit for path and mapper
 * @param skip skip for path and mapper
 */
@SuppressWarnings("unchecked")
public static void addInputPath(Job job, String uri, Class<? extends InputFormat> inputFormatClass,
        Class<? extends Mapper> mapperClass, String query, String fields, String sort, int limit, int skip) {
    Configuration conf = job.getConfiguration();
    MongoConfigUtil.addMongoRequest(conf, uri, inputFormatClass, mapperClass, query, fields, sort, limit, skip);

    job.setMapperClass(DelegatingMapper.class);
    job.setInputFormatClass(DelegatingInputFormat.class);
}
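
The helper records each collection in the job's Configuration and routes input through the delegating classes, so setInputFormatClass is only ever called here with DelegatingInputFormat. A hedged sketch of a possible call site follows; the URI, query string, and MyMapper class are placeholders, not values from the original project.

// Sketch of a possible call site; URI, query and MyMapper are placeholders.
Job job = Job.getInstance(new Configuration(), "multi-collection-example");
MongoMultipleInputs.addInputPath(job,
        "mongodb://localhost:27017/mydb.mycollection", // collection URI (placeholder)
        MongoInputFormat.class,                        // InputFormat for this collection
        MyMapper.class,                                // hypothetical Mapper subclass
        "{\"status\": \"active\"}",                    // query (placeholder)
        null,                                          // fields (null = all fields)
        null,                                          // sort (null = no sort)
        0,                                             // limit (0 = no limit)
        0);                                            // skip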

From source file:com.mongodb.hadoop.util.MongoTool.java

License:Apache License

private int runMapReduceJob(final Configuration conf) throws IOException {
    final Job job = Job.getInstance(conf, getJobName());
    /**
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here.
     * They override any XML-level values.
     * Note that the space after -D is important: with no space,
     * the property is picked up by Java itself and will not work.
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);

    LOG.debug("Mapper Class: " + mapper);
    LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
    job.setMapperClass(mapper);
    Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MongoConfigUtil.getReducer(conf));

    job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines whether the job will run verbosely (e.g. print debug output).
     * Only applies to foreground jobs.
     */
    final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
    /**
     * Run the job in the foreground (i.e. wait for completion) or in the background?
     */
    final boolean background = MongoConfigUtil.isJobBackground(conf);
    try {
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            job.submit();
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results.  {Verbose? "
                    + verbose + "}");
            return job.waitForCompletion(true) ? 0 : 1;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}
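
Everything this runner needs is pulled from the Configuration through MongoConfigUtil, so a caller populates those keys (or passes them as -D properties) before invoking the tool. A small sketch under that assumption; only the MongoConfigUtil.INPUT_URI key referenced above is used, and the URI value and MyMongoTool subclass are placeholders.

Configuration conf = new Configuration();
// INPUT_URI is the key logged in the method above; the URI itself is a placeholder.
conf.set(MongoConfigUtil.INPUT_URI, "mongodb://localhost:27017/mydb.mycollection");
// Mapper, reducer and input/output format classes are read from other
// MongoConfigUtil keys, or supplied on the command line via -D properties.
int exitCode = ToolRunner.run(conf, new MyMongoTool(), args); // MyMongoTool: hypothetical subclass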

From source file:com.moz.fiji.mapreduce.input.FijiTableMapReduceJobInput.java

License:Apache License

/** {@inheritDoc} */
@Override
public void configure(Job job) throws IOException {
    // Configure the input format class.
    super.configure(job);
    FijiTableInputFormat inputFormat = FijiTableInputFormat.Factory.get(mInputTableURI).getInputFormat();
    job.setInputFormatClass(inputFormat.getClass());
    FijiTableInputFormat.configureJob(job, mInputTableURI, mDataRequest,
            null != mRowOptions ? mRowOptions.getStartRow() : null,
            null != mRowOptions ? mRowOptions.getLimitRow() : null,
            null != mRowOptions ? mRowOptions.getRowFilter() : null);
}

From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java

License:Apache License

public Job setupJob(String jobName, Path outputFile, Class<? extends Mapper> mapperClass,
        Class<? extends Reducer> reducerClass, EntityId startKey, EntityId limitKey, FijiRowFilter filter)
        throws Exception {
    final Job job = new Job(createConfiguration());
    final Configuration conf = job.getConfiguration();

    // Get settings for test.
    final FijiDataRequest request = FijiDataRequest.builder()
            .addColumns(ColumnsDef.create().add("info", "name").add("info", "email")).build();

    job.setJarByClass(IntegrationTestFijiTableInputFormat.class);

    // Setup the InputFormat.
    FijiTableInputFormat.configureJob(job, getFooTable().getURI(), request, startKey, limitKey, filter);
    job.setInputFormatClass(HBaseFijiTableInputFormat.class);

    // Duplicate functionality from MapReduceJobBuilder, since we are not using it here:
    final List<Path> jarFiles = Lists.newArrayList();
    final FileSystem fs = FileSystem.getLocal(conf);
    for (String cpEntry : System.getProperty("java.class.path").split(":")) {
        if (cpEntry.endsWith(".jar")) {
            jarFiles.add(fs.makeQualified(new Path(cpEntry)));
        }
    }
    DistributedCacheJars.addJarsToDistributedCache(job, jarFiles);

    // Create a test job.
    job.setJobName(jobName);

    // Setup the OutputFormat.
    TextOutputFormat.setOutputPath(job, outputFile.getParent());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Set the mapper class.
    if (null != mapperClass) {
        job.setMapperClass(mapperClass);
    }
    // Set the reducer class.
    if (null != reducerClass) {
        job.setReducerClass(reducerClass);
    }

    return job;
}

From source file:com.moz.fiji.mapreduce.MapReduceJobInput.java

License:Apache License

/**
 * Configure a job to use this type of input for the MapReduce.
 *
 * @param job The job to configure with input.
 * @throws IOException If there is an error during configuration.
 */
public void configure(Job job) throws IOException {
    job.setInputFormatClass(getInputFormatClass());
}
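
The base class delegates to getInputFormatClass(), so a concrete input type only has to supply its InputFormat plus any extra configuration. A hypothetical sketch of such a subclass follows, not taken from the Fiji sources, assuming getInputFormatClass() is the overridable hook that the call above suggests.

/**
 * Hypothetical subclass, not from the Fiji sources; assumes getInputFormatClass()
 * is the protected abstract hook that configure() delegates to.
 */
public class TextFileMapReduceJobInput extends MapReduceJobInput {
    private final Path mPath;

    public TextFileMapReduceJobInput(Path path) {
        mPath = path;
    }

    /** {@inheritDoc} */
    @Override
    protected Class<? extends InputFormat> getInputFormatClass() {
        return TextInputFormat.class;
    }

    /** {@inheritDoc} */
    @Override
    public void configure(Job job) throws IOException {
        super.configure(job); // sets the input format class
        FileInputFormat.addInputPath(job, mPath);
    }
}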

From source file:com.moz.fiji.schema.mapreduce.FijiTableInputFormat.java

License:Apache License

/**
 * Configures a Hadoop M/R job to read from a given table.
 *
 * @param job Job to configure.
 * @param tableURI URI of the table to read from.
 * @param dataRequest Data request.
 * @param startRow Minimum row key to process.
 * @param endRow Maximum row Key to process.
 * @throws IOException on I/O error.
 */
public static void configureJob(Job job, FijiURI tableURI, FijiDataRequest dataRequest, String startRow,
        String endRow) throws IOException {

    final Configuration conf = job.getConfiguration();
    // As a precaution, be sure the table exists and can be opened.
    final Fiji fiji = Fiji.Factory.open(tableURI, conf);
    final FijiTable table = fiji.openTable(tableURI.getTable());
    ResourceUtils.releaseOrLog(table);
    ResourceUtils.releaseOrLog(fiji);

    // TODO: Check for jars config:
    // GenericTableMapReduceUtil.initTableInput(hbaseTableName, scan, job);

    // TODO: Obey specified start/end rows.

    // Write all the required values to the job's configuration object.
    job.setInputFormatClass(FijiTableInputFormat.class);
    final String serializedRequest = Base64.encodeBase64String(SerializationUtils.serialize(dataRequest));
    conf.set(FijiConfKeys.INPUT_DATA_REQUEST, serializedRequest);
    conf.set(FijiConfKeys.INPUT_TABLE_URI, tableURI.toString());
}

From source file:com.mozilla.hadoop.Backup.java

License:Apache License

/**
 * @param args the command-line arguments
 * @return the configured {@link Job}
 * @throws IOException
 * @throws ParseException
 */
public Job initJob(String[] args) throws IOException, ParseException {

    Path inputPath = null;
    Path loadPath = null;
    String outputPath = null;
    boolean useSpecifiedPaths = false;
    for (int idx = 0; idx < args.length; idx++) {
        if ("-f".equals(args[idx])) {
            useSpecifiedPaths = true;
            loadPath = new Path(args[++idx]);
        } else if (idx == args.length - 1) {
            outputPath = args[idx];
        } else {
            inputPath = new Path(args[idx]);
        }
    }

    Path mrOutputPath = new Path(NAME + "-results");

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.set("backup.input.path", inputPath.toString());
    conf.set("backup.output.path", outputPath);

    FileSystem inputFs = null;
    FileSystem outputFs = null;
    Path[] inputSources = null;
    try {
        inputFs = FileSystem.get(inputPath.toUri(), new Configuration());
        outputFs = FileSystem.get(getConf());
        if (useSpecifiedPaths) {
            inputSources = createInputSources(loadPaths(outputFs, loadPath), outputFs);
        } else {
            inputSources = createInputSources(getPaths(inputFs, inputPath, 0, 2), outputFs);
        }
    } finally {
        checkAndClose(inputFs);
        checkAndClose(outputFs);
    }

    Job job = new Job(getConf());
    job.setJobName(NAME);
    job.setJarByClass(Backup.class);

    job.setMapperClass(BackupMapper.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);

    for (Path source : inputSources) {
        System.out.println("Adding input path: " + source.toString());
        FileInputFormat.addInputPath(job, source);
    }

    FileOutputFormat.setOutputPath(job, mrOutputPath);

    return job;
}

From source file:com.mozilla.hadoop.hbase.mapreduce.MultiScanTableMapReduceUtil.java

License:Apache License

/**
 * Use this before submitting a TableMap job. It will appropriately set up
 * the job.
 * 
 * @param table
 *            The table name to read from.
 * @param scans
 *            The scan instances with the columns, time range etc.
 * @param mapper
 *            The mapper class to use.
 * @param outputKeyClass
 *            The class of the output key.
 * @param outputValueClass
 *            The class of the output value.
 * @param job
 *            The current job to adjust.
 * @throws IOException
 *             When setting up the details fails.
 */
@SuppressWarnings("rawtypes")
public static void initMultiScanTableMapperJob(final String table, final Scan[] scans,
        final Class<? extends TableMapper> mapper, final Class<? extends WritableComparable> outputKeyClass,
        final Class<? extends Writable> outputValueClass, final Job job) throws IOException {

    job.setInputFormatClass(MultiScanTableInputFormat.class);
    if (outputValueClass != null) {
        job.setMapOutputValueClass(outputValueClass);
    }
    if (outputKeyClass != null) {
        job.setMapOutputKeyClass(outputKeyClass);
    }
    job.setMapperClass(mapper);
    job.getConfiguration().set(MultiScanTableInputFormat.INPUT_TABLE, table);
    job.getConfiguration().set(MultiScanTableInputFormat.SCANS, convertScanArrayToString(scans));
}
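
A sketch of a possible call site follows; the table name, row ranges, and MyTableMapper class are placeholders rather than values from the original project.

Job job = Job.getInstance(conf, "multi-scan-example");
Scan[] scans = new Scan[] {
        new Scan(Bytes.toBytes("row-000"), Bytes.toBytes("row-499")), // placeholder ranges
        new Scan(Bytes.toBytes("row-500"), Bytes.toBytes("row-999"))
};
MultiScanTableMapReduceUtil.initMultiScanTableMapperJob(
        "my_table",            // placeholder table name
        scans,
        MyTableMapper.class,   // hypothetical TableMapper subclass
        Text.class,            // map output key class
        Text.class,            // map output value class
        job);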

From source file:com.msd.gin.halyard.tools.HalyardBulkLoad.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkload [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + SKIP_INVALID_PROPERTY + "=true] [-D" + SPLIT_BITS_PROPERTY + "=8] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_path(s)> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
            RDFFormat.class, RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + args[1] + " -> " + args[2]);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    Map<String, Integer> contextSplitsMap = new HashMap<>();
    for (Map.Entry<String, String> me : getConf().getValByRegex(CONTEXT_SPLIT_REGEXP).entrySet()) {
        int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
        StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
        while (stk.hasMoreTokens()) {
            contextSplitsMap.put(stk.nextToken(), splits);
        }
    }
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true,
            getConf().getInt(SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
                hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}

From source file:com.msd.gin.halyard.tools.HalyardBulkUpdate.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkupdate [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_file_with_SPARQL_queries> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    getConf().setStrings(TABLE_NAME_PROPERTY, args[2]);
    Job job = Job.getInstance(getConf(), "HalyardBulkUpdate -> " + args[1] + " -> " + args[2]);
    NLineInputFormat.setNumLinesPerSplit(job, 1);
    job.setJarByClass(HalyardBulkUpdate.class);
    job.setMapperClass(SPARQLMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(NLineInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], false, 0, null)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
                hTable.getRegionLocator());
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Update Completed..");
            return 0;
        }
    }
    return -1;
}