List of usage examples for org.apache.hadoop.mapreduce.Job#setInputFormatClass
public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException
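setInputFormatClass tells the job which InputFormat implementation to use for splitting and reading its input; it throws IllegalStateException if the job has already been submitted (i.e. it is no longer in the DEFINE state). Before the project-specific examples below, here is a minimal, self-contained sketch of the typical call pattern. The class name SetInputFormatExample and the argument-based paths are illustrative placeholders, not taken from any of the listed projects.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "set-input-format-example");
        job.setJarByClass(SetInputFormatExample.class);

        // Must be called while the job is still being defined; calling it after
        // submission throws IllegalStateException.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Map-only pass-through: TextInputFormat supplies (byte offset, line) records
        // and the default identity Mapper writes them straight to the output.
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}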
From source file:com.mongodb.hadoop.input.MongoMultipleInputs.java
License:Apache License
/**
 * Add a {@link Path} with a custom {@link InputFormat} and {@link Mapper} to the list of
 * inputs for the map-reduce job.
 *
 * @param job              The {@link Job}
 * @param path             {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 * @param mapperClass      {@link Mapper} class to use for this path
 * @param query            {@link DBObject} query for path and mapper
 * @param fields           {@link DBObject} fields for path and mapper
 * @param sort             {@link DBObject} sort for path and mapper
 * @param limit            limit for path and mapper
 * @param skip             skip for path and mapper
 */
@SuppressWarnings("unchecked")
public static void addInputPath(Job job, String uri, Class<? extends InputFormat> inputFormatClass,
        Class<? extends Mapper> mapperClass, String query, String fields, String sort, int limit, int skip) {
    Configuration conf = job.getConfiguration();
    MongoConfigUtil.addMongoRequest(conf, uri, inputFormatClass, mapperClass, query, fields, sort, limit, skip);

    job.setMapperClass(DelegatingMapper.class);
    job.setInputFormatClass(DelegatingInputFormat.class);
}
From source file:com.mongodb.hadoop.util.MongoTool.java
License:Apache License
private int runMapReduceJob(final Configuration conf) throws IOException {
    final Job job = Job.getInstance(conf, getJobName());
    /**
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here
     * They override any XML level values
     * Note that -D<space> is important - no space will
     * not work as it gets picked up by Java itself
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);

    LOG.debug("Mapper Class: " + mapper);
    LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
    job.setMapperClass(mapper);
    Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MongoConfigUtil.getReducer(conf));

    job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely e.g. print debug output
     * Only works with foreground jobs
     */
    final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground aka wait for completion or background?
     */
    final boolean background = MongoConfigUtil.isJobBackground(conf);

    try {
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            job.submit();
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results. {Verbose? "
                    + verbose + "}");
            return job.waitForCompletion(true) ? 0 : 1;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}
From source file:com.moz.fiji.mapreduce.input.FijiTableMapReduceJobInput.java
License:Apache License
/** {@inheritDoc} */
@Override
public void configure(Job job) throws IOException {
    // Configure the input format class.
    super.configure(job);
    FijiTableInputFormat inputFormat = FijiTableInputFormat.Factory.get(mInputTableURI).getInputFormat();
    job.setInputFormatClass(inputFormat.getClass());
    FijiTableInputFormat.configureJob(job, mInputTableURI, mDataRequest,
            null != mRowOptions ? mRowOptions.getStartRow() : null,
            null != mRowOptions ? mRowOptions.getLimitRow() : null,
            null != mRowOptions ? mRowOptions.getRowFilter() : null);
}
From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java
License:Apache License
public Job setupJob(String jobName, Path outputFile, Class<? extends Mapper> mapperClass,
        Class<? extends Reducer> reducerClass, EntityId startKey, EntityId limitKey, FijiRowFilter filter)
        throws Exception {
    final Job job = new Job(createConfiguration());
    final Configuration conf = job.getConfiguration();

    // Get settings for test.
    final FijiDataRequest request = FijiDataRequest.builder()
            .addColumns(ColumnsDef.create().add("info", "name").add("info", "email")).build();

    job.setJarByClass(IntegrationTestFijiTableInputFormat.class);

    // Setup the InputFormat.
    FijiTableInputFormat.configureJob(job, getFooTable().getURI(), request, startKey, limitKey, filter);
    job.setInputFormatClass(HBaseFijiTableInputFormat.class);

    // Duplicate functionality from MapReduceJobBuilder, since we are not using it here:
    final List<Path> jarFiles = Lists.newArrayList();
    final FileSystem fs = FileSystem.getLocal(conf);
    for (String cpEntry : System.getProperty("java.class.path").split(":")) {
        if (cpEntry.endsWith(".jar")) {
            jarFiles.add(fs.makeQualified(new Path(cpEntry)));
        }
    }
    DistributedCacheJars.addJarsToDistributedCache(job, jarFiles);

    // Create a test job.
    job.setJobName(jobName);

    // Setup the OutputFormat.
    TextOutputFormat.setOutputPath(job, outputFile.getParent());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Set the mapper class.
    if (null != mapperClass) {
        job.setMapperClass(mapperClass);
    }

    // Set the reducer class.
    if (null != reducerClass) {
        job.setReducerClass(reducerClass);
    }

    return job;
}
From source file:com.moz.fiji.mapreduce.MapReduceJobInput.java
License:Apache License
/**
 * Configure a job to use this type of input for the MapReduce.
 *
 * @param job The job to configure with input.
 * @throws IOException If there is an error during configuration.
 */
public void configure(Job job) throws IOException {
    job.setInputFormatClass(getInputFormatClass());
}
From source file:com.moz.fiji.schema.mapreduce.FijiTableInputFormat.java
License:Apache License
/**
 * Configures a Hadoop M/R job to read from a given table.
 *
 * @param job Job to configure.
 * @param tableURI URI of the table to read from.
 * @param dataRequest Data request.
 * @param startRow Minimum row key to process.
 * @param endRow Maximum row key to process.
 * @throws IOException on I/O error.
 */
public static void configureJob(Job job, FijiURI tableURI, FijiDataRequest dataRequest, String startRow,
        String endRow) throws IOException {
    final Configuration conf = job.getConfiguration();

    // As a precaution, be sure the table exists and can be opened.
    final Fiji fiji = Fiji.Factory.open(tableURI, conf);
    final FijiTable table = fiji.openTable(tableURI.getTable());
    ResourceUtils.releaseOrLog(table);
    ResourceUtils.releaseOrLog(fiji);

    // TODO: Check for jars config:
    // GenericTableMapReduceUtil.initTableInput(hbaseTableName, scan, job);

    // TODO: Obey specified start/end rows.

    // Write all the required values to the job's configuration object.
    job.setInputFormatClass(FijiTableInputFormat.class);
    final String serializedRequest = Base64.encodeBase64String(SerializationUtils.serialize(dataRequest));
    conf.set(FijiConfKeys.INPUT_DATA_REQUEST, serializedRequest);
    conf.set(FijiConfKeys.INPUT_TABLE_URI, tableURI.toString());
}
From source file:com.mozilla.hadoop.Backup.java
License:Apache License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 */
public Job initJob(String[] args) throws IOException, ParseException {
    Path inputPath = null;
    Path loadPath = null;
    String outputPath = null;
    boolean useSpecifiedPaths = false;
    for (int idx = 0; idx < args.length; idx++) {
        if ("-f".equals(args[idx])) {
            useSpecifiedPaths = true;
            loadPath = new Path(args[++idx]);
        } else if (idx == args.length - 1) {
            outputPath = args[idx];
        } else {
            inputPath = new Path(args[idx]);
        }
    }

    Path mrOutputPath = new Path(NAME + "-results");

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.set("backup.input.path", inputPath.toString());
    conf.set("backup.output.path", outputPath);

    FileSystem inputFs = null;
    FileSystem outputFs = null;
    Path[] inputSources = null;
    try {
        inputFs = FileSystem.get(inputPath.toUri(), new Configuration());
        outputFs = FileSystem.get(getConf());
        if (useSpecifiedPaths) {
            inputSources = createInputSources(loadPaths(outputFs, loadPath), outputFs);
        } else {
            inputSources = createInputSources(getPaths(inputFs, inputPath, 0, 2), outputFs);
        }
    } finally {
        checkAndClose(inputFs);
        checkAndClose(outputFs);
    }

    Job job = new Job(getConf());
    job.setJobName(NAME);
    job.setJarByClass(Backup.class);

    job.setMapperClass(BackupMapper.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    for (Path source : inputSources) {
        System.out.println("Adding input path: " + source.toString());
        FileInputFormat.addInputPath(job, source);
    }

    FileOutputFormat.setOutputPath(job, mrOutputPath);

    return job;
}
From source file:com.mozilla.hadoop.hbase.mapreduce.MultiScanTableMapReduceUtil.java
License:Apache License
/**
 * Use this before submitting a TableMap job. It will appropriately set up the job.
 *
 * @param table
 *            The table name to read from.
 * @param scans
 *            The scan instances with the columns, time range etc.
 * @param mapper
 *            The mapper class to use.
 * @param outputKeyClass
 *            The class of the output key.
 * @param outputValueClass
 *            The class of the output value.
 * @param job
 *            The current job to adjust.
 * @throws IOException
 *             When setting up the details fails.
 */
@SuppressWarnings("rawtypes")
public static void initMultiScanTableMapperJob(final String table, final Scan[] scans,
        final Class<? extends TableMapper> mapper, final Class<? extends WritableComparable> outputKeyClass,
        final Class<? extends Writable> outputValueClass, final Job job) throws IOException {
    job.setInputFormatClass(MultiScanTableInputFormat.class);
    if (outputValueClass != null) {
        job.setMapOutputValueClass(outputValueClass);
    }
    if (outputKeyClass != null) {
        job.setMapOutputKeyClass(outputKeyClass);
    }
    job.setMapperClass(mapper);
    job.getConfiguration().set(MultiScanTableInputFormat.INPUT_TABLE, table);
    job.getConfiguration().set(MultiScanTableInputFormat.SCANS, convertScanArrayToString(scans));
}
From source file:com.msd.gin.halyard.tools.HalyardBulkLoad.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkload [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + SKIP_INVALID_PROPERTY + "=true] [-D" + SPLIT_BITS_PROPERTY + "=8] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_path(s)> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
            RDFFormat.class, RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + args[1] + " -> " + args[2]);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    Map<String, Integer> contextSplitsMap = new HashMap<>();
    for (Map.Entry<String, String> me : getConf().getValByRegex(CONTEXT_SPLIT_REGEXP).entrySet()) {
        int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
        StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
        while (stk.hasMoreTokens()) {
            contextSplitsMap.put(stk.nextToken(), splits);
        }
    }
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true,
            getConf().getInt(SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}
From source file:com.msd.gin.halyard.tools.HalyardBulkUpdate.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkupdate [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_file_with_SPARQL_queries> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    getConf().setStrings(TABLE_NAME_PROPERTY, args[2]);
    Job job = Job.getInstance(getConf(), "HalyardBulkUpdate -> " + args[1] + " -> " + args[2]);
    NLineInputFormat.setNumLinesPerSplit(job, 1);
    job.setJarByClass(HalyardBulkUpdate.class);
    job.setMapperClass(SPARQLMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(NLineInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], false, 0, null)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Update Completed..");
            return 0;
        }
    }
    return -1;
}