Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

List of usage examples for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job setInputFormatClass.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
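
setInputFormatClass must be called while the job is still being defined; once the job has been submitted it throws IllegalStateException. Below is a minimal, self-contained sketch (not taken from any of the sources listed later) using the built-in TextInputFormat; the input path is a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "set-input-format-example");
        job.setJarByClass(SetInputFormatExample.class);

        // Tell the job how to read and split its input.
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path("/tmp/input")); // placeholder path

        // ... set mapper/reducer, output classes and output path, then submit.
    }
}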

Usage

From source file:com.mongodb.hadoop.input.MongoMultipleInputs.java

License:Apache License

/**
 * Add a {@link Path} with a custom {@link InputFormat} and
 * {@link Mapper} to the list of inputs for the map-reduce job.
 * @param job The {@link Job}
 * @param uri MongoDB collection URI to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 * @param mapperClass {@link Mapper} class to use for this path
 * @param query {@link DBObject} query for path and mapper
 * @param fields {@link DBObject} fields for path and mapper
 * @param sort {@link DBObject} sort for path and mapper
 * @param limit limit for path and mapper
 * @param skip skip for path and mapper
 */
@SuppressWarnings("unchecked")
public static void addInputPath(Job job, String uri, Class<? extends InputFormat> inputFormatClass,
        Class<? extends Mapper> mapperClass, String query, String fields, String sort, int limit, int skip) {
    Configuration conf = job.getConfiguration();
    MongoConfigUtil.addMongoRequest(conf, uri, inputFormatClass, mapperClass, query, fields, sort, limit, skip);

    job.setMapperClass(DelegatingMapper.class);
    job.setInputFormatClass(DelegatingInputFormat.class);
}
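
The helper records each collection in the job's Configuration and routes input through the delegating classes, so setInputFormatClass is only ever called here with DelegatingInputFormat. A hedged sketch of a possible call site follows; the URI, query string, and MyMapper class are placeholders, not values from the original project.

// Sketch of a possible call site; URI, query and MyMapper are placeholders.
Job job = Job.getInstance(new Configuration(), "multi-collection-example");
MongoMultipleInputs.addInputPath(job,
        "mongodb://localhost:27017/mydb.mycollection", // collection URI (placeholder)
        MongoInputFormat.class,                        // InputFormat for this collection
        MyMapper.class,                                // hypothetical Mapper subclass
        "{\"status\": \"active\"}",                    // query (placeholder)
        null,                                          // fields (null = all fields)
        null,                                          // sort (null = no sort)
        0,                                             // limit (0 = no limit)
        0);                                            // skip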

From source file:com.mongodb.hadoop.util.MongoTool.java

License:Apache License

private int runMapReduceJob(final Configuration conf) throws IOException {
    final Job job = Job.getInstance(conf, getJobName());
    /**
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here.
     * They override any XML-level values.
     * Note that the space after -D is important: with no space,
     * the property is picked up by Java itself and will not work.
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);

    LOG.debug("Mapper Class: " + mapper);
    LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
    job.setMapperClass(mapper);
    Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MongoConfigUtil.getReducer(conf));

    job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines whether the job will run verbosely (e.g. print debug output).
     * Only applies to foreground jobs.
     */
    final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
    /**
     * Run the job in the foreground (i.e. wait for completion) or in the background?
     */
    final boolean background = MongoConfigUtil.isJobBackground(conf);
    try {
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            job.submit();
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results.  {Verbose? "
                    + verbose + "}");
            return job.waitForCompletion(true) ? 0 : 1;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}
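
Everything this runner needs is pulled from the Configuration through MongoConfigUtil, so a caller populates those keys (or passes them as -D properties) before invoking the tool. A small sketch under that assumption; only the MongoConfigUtil.INPUT_URI key referenced above is used, and the URI value and MyMongoTool subclass are placeholders.

Configuration conf = new Configuration();
// INPUT_URI is the key logged in the method above; the URI itself is a placeholder.
conf.set(MongoConfigUtil.INPUT_URI, "mongodb://localhost:27017/mydb.mycollection");
// Mapper, reducer and input/output format classes are read from other
// MongoConfigUtil keys, or supplied on the command line via -D properties.
int exitCode = ToolRunner.run(conf, new MyMongoTool(), args); // MyMongoTool: hypothetical subclass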

From source file:com.moz.fiji.mapreduce.input.FijiTableMapReduceJobInput.java

License:Apache License

/** {@inheritDoc} */
@Override
public void configure(Job job) throws IOException {
    // Configure the input format class.
    super.configure(job);
    FijiTableInputFormat inputFormat = FijiTableInputFormat.Factory.get(mInputTableURI).getInputFormat();
    job.setInputFormatClass(inputFormat.getClass());
    FijiTableInputFormat.configureJob(job, mInputTableURI, mDataRequest,
            null != mRowOptions ? mRowOptions.getStartRow() : null,
            null != mRowOptions ? mRowOptions.getLimitRow() : null,
            null != mRowOptions ? mRowOptions.getRowFilter() : null);
}

From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java

License:Apache License

public Job setupJob(String jobName, Path outputFile, Class<? extends Mapper> mapperClass,
        Class<? extends Reducer> reducerClass, EntityId startKey, EntityId limitKey, FijiRowFilter filter)
        throws Exception {
    final Job job = new Job(createConfiguration());
    final Configuration conf = job.getConfiguration();

    // Get settings for test.
    final FijiDataRequest request = FijiDataRequest.builder()
            .addColumns(ColumnsDef.create().add("info", "name").add("info", "email")).build();

    job.setJarByClass(IntegrationTestFijiTableInputFormat.class);

    // Setup the InputFormat.
    FijiTableInputFormat.configureJob(job, getFooTable().getURI(), request, startKey, limitKey, filter);
    job.setInputFormatClass(HBaseFijiTableInputFormat.class);

    // Duplicate functionality from MapReduceJobBuilder, since we are not using it here:
    final List<Path> jarFiles = Lists.newArrayList();
    final FileSystem fs = FileSystem.getLocal(conf);
    for (String cpEntry : System.getProperty("java.class.path").split(":")) {
        if (cpEntry.endsWith(".jar")) {
            jarFiles.add(fs.makeQualified(new Path(cpEntry)));
        }
    }
    DistributedCacheJars.addJarsToDistributedCache(job, jarFiles);

    // Create a test job.
    job.setJobName(jobName);

    // Setup the OutputFormat.
    TextOutputFormat.setOutputPath(job, outputFile.getParent());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Set the mapper class.
    if (null != mapperClass) {
        job.setMapperClass(mapperClass);
    }
    // Set the reducer class.
    if (null != reducerClass) {
        job.setReducerClass(reducerClass);
    }

    return job;
}

From source file:com.moz.fiji.mapreduce.MapReduceJobInput.java

License:Apache License

/**
 * Configure a job to use this type of input for the MapReduce.
 *
 * @param job The job to configure with input.
 * @throws IOException If there is an error during configuration.
 */
public void configure(Job job) throws IOException {
    job.setInputFormatClass(getInputFormatClass());
}
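
The base class delegates to getInputFormatClass(), so a concrete input type only has to supply its InputFormat plus any extra configuration. A hypothetical sketch of such a subclass follows, not taken from the Fiji sources, assuming getInputFormatClass() is the overridable hook that the call above suggests.

/**
 * Hypothetical subclass, not from the Fiji sources; assumes getInputFormatClass()
 * is the protected abstract hook that configure() delegates to.
 */
public class TextFileMapReduceJobInput extends MapReduceJobInput {
    private final Path mPath;

    public TextFileMapReduceJobInput(Path path) {
        mPath = path;
    }

    /** {@inheritDoc} */
    @Override
    protected Class<? extends InputFormat> getInputFormatClass() {
        return TextInputFormat.class;
    }

    /** {@inheritDoc} */
    @Override
    public void configure(Job job) throws IOException {
        super.configure(job); // sets the input format class
        FileInputFormat.addInputPath(job, mPath);
    }
}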

From source file:com.moz.fiji.schema.mapreduce.FijiTableInputFormat.java

License:Apache License

/**
 * Configures a Hadoop M/R job to read from a given table.
 *
 * @param job Job to configure.
 * @param tableURI URI of the table to read from.
 * @param dataRequest Data request.
 * @param startRow Minimum row key to process.
 * @param endRow Maximum row Key to process.
 * @throws IOException on I/O error.
 */
public static void configureJob(Job job, FijiURI tableURI, FijiDataRequest dataRequest, String startRow,
        String endRow) throws IOException {

    final Configuration conf = job.getConfiguration();
    // As a precaution, be sure the table exists and can be opened.
    final Fiji fiji = Fiji.Factory.open(tableURI, conf);
    final FijiTable table = fiji.openTable(tableURI.getTable());
    ResourceUtils.releaseOrLog(table);
    ResourceUtils.releaseOrLog(fiji);

    // TODO: Check for jars config:
    // GenericTableMapReduceUtil.initTableInput(hbaseTableName, scan, job);

    // TODO: Obey specified start/end rows.

    // Write all the required values to the job's configuration object.
    job.setInputFormatClass(FijiTableInputFormat.class);
    final String serializedRequest = Base64.encodeBase64String(SerializationUtils.serialize(dataRequest));
    conf.set(FijiConfKeys.INPUT_DATA_REQUEST, serializedRequest);
    conf.set(FijiConfKeys.INPUT_TABLE_URI, tableURI.toString());
}

From source file:com.mozilla.hadoop.Backup.java

License:Apache License

/**
 * @param args the command-line arguments
 * @return the configured {@link Job}
 * @throws IOException
 * @throws ParseException
 */
public Job initJob(String[] args) throws IOException, ParseException {

    Path inputPath = null;
    Path loadPath = null;
    String outputPath = null;
    boolean useSpecifiedPaths = false;
    for (int idx = 0; idx < args.length; idx++) {
        if ("-f".equals(args[idx])) {
            useSpecifiedPaths = true;
            loadPath = new Path(args[++idx]);
        } else if (idx == args.length - 1) {
            outputPath = args[idx];
        } else {
            inputPath = new Path(args[idx]);
        }
    }

    Path mrOutputPath = new Path(NAME + "-results");

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.set("backup.input.path", inputPath.toString());
    conf.set("backup.output.path", outputPath);

    FileSystem inputFs = null;
    FileSystem outputFs = null;
    Path[] inputSources = null;
    try {
        inputFs = FileSystem.get(inputPath.toUri(), new Configuration());
        outputFs = FileSystem.get(getConf());
        if (useSpecifiedPaths) {
            inputSources = createInputSources(loadPaths(outputFs, loadPath), outputFs);
        } else {
            inputSources = createInputSources(getPaths(inputFs, inputPath, 0, 2), outputFs);
        }
    } finally {
        checkAndClose(inputFs);
        checkAndClose(outputFs);
    }

    Job job = new Job(getConf());
    job.setJobName(NAME);
    job.setJarByClass(Backup.class);

    job.setMapperClass(BackupMapper.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);

    for (Path source : inputSources) {
        System.out.println("Adding input path: " + source.toString());
        FileInputFormat.addInputPath(job, source);
    }

    FileOutputFormat.setOutputPath(job, mrOutputPath);

    return job;
}

From source file:com.mozilla.hadoop.hbase.mapreduce.MultiScanTableMapReduceUtil.java

License:Apache License

/**
 * Use this before submitting a TableMap job. It will appropriately set up
 * the job.
 * 
 * @param table
 *            The table name to read from.
 * @param scans
 *            The scan instances with the columns, time range etc.
 * @param mapper
 *            The mapper class to use.
 * @param outputKeyClass
 *            The class of the output key.
 * @param outputValueClass
 *            The class of the output value.
 * @param job
 *            The current job to adjust.
 * @throws IOException
 *             When setting up the details fails.
 */
@SuppressWarnings("rawtypes")
public static void initMultiScanTableMapperJob(final String table, final Scan[] scans,
        final Class<? extends TableMapper> mapper, final Class<? extends WritableComparable> outputKeyClass,
        final Class<? extends Writable> outputValueClass, final Job job) throws IOException {

    job.setInputFormatClass(MultiScanTableInputFormat.class);
    if (outputValueClass != null) {
        job.setMapOutputValueClass(outputValueClass);
    }
    if (outputKeyClass != null) {
        job.setMapOutputKeyClass(outputKeyClass);
    }
    job.setMapperClass(mapper);
    job.getConfiguration().set(MultiScanTableInputFormat.INPUT_TABLE, table);
    job.getConfiguration().set(MultiScanTableInputFormat.SCANS, convertScanArrayToString(scans));
}
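
A sketch of a possible call site follows; the table name, row ranges, and MyTableMapper class are placeholders rather than values from the original project.

Job job = Job.getInstance(conf, "multi-scan-example");
Scan[] scans = new Scan[] {
        new Scan(Bytes.toBytes("row-000"), Bytes.toBytes("row-499")), // placeholder ranges
        new Scan(Bytes.toBytes("row-500"), Bytes.toBytes("row-999"))
};
MultiScanTableMapReduceUtil.initMultiScanTableMapperJob(
        "my_table",            // placeholder table name
        scans,
        MyTableMapper.class,   // hypothetical TableMapper subclass
        Text.class,            // map output key class
        Text.class,            // map output value class
        job);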

From source file:com.msd.gin.halyard.tools.HalyardBulkLoad.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkload [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + SKIP_INVALID_PROPERTY + "=true] [-D" + SPLIT_BITS_PROPERTY + "=8] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_path(s)> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
            RDFFormat.class, RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + args[1] + " -> " + args[2]);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    Map<String, Integer> contextSplitsMap = new HashMap<>();
    for (Map.Entry<String, String> me : getConf().getValByRegex(CONTEXT_SPLIT_REGEXP).entrySet()) {
        int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
        StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
        while (stk.hasMoreTokens()) {
            contextSplitsMap.put(stk.nextToken(), splits);
        }
    }
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true,
            getConf().getInt(SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
                hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}

From source file:com.msd.gin.halyard.tools.HalyardBulkUpdate.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkupdate [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_file_with_SPARQL_queries> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    getConf().setStrings(TABLE_NAME_PROPERTY, args[2]);
    Job job = Job.getInstance(getConf(), "HalyardBulkUpdate -> " + args[1] + " -> " + args[2]);
    NLineInputFormat.setNumLinesPerSplit(job, 1);
    job.setJarByClass(HalyardBulkUpdate.class);
    job.setMapperClass(SPARQLMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(NLineInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], false, 0, null)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
                hTable.getRegionLocator());
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Update Completed..");
            return 0;
        }
    }
    return -1;
}