Usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
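Before the per-project examples, here is a minimal, self-contained sketch of the call itself (the paths and the SetInputPathsExample driver class are hypothetical, not taken from any of the sources below). It illustrates the difference between setInputPaths, which replaces the job's configured input, and addInputPath, which appends to it.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SetInputPathsExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsExample.class);
        conf.setJobName("set-input-paths-example");
        conf.setInputFormat(TextInputFormat.class);

        // The varargs overload replaces any previously configured input paths:
        // after this call the job reads exactly these two directories.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

        // addInputPath, by contrast, appends to whatever is already configured.
        FileInputFormat.addInputPath(conf, new Path("/data/in3"));

        FileOutputFormat.setOutputPath(conf, new Path("/data/out"));
        JobClient.runJob(conf); // default identity mapper/reducer
    }
}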
From source file:org.apache.avro.mapred.TestSequenceFileReader.java
License:Apache License
@Test
public void testNonAvroReducer() throws Exception {
    JobConf job = new JobConf();
    Path output = new Path(System.getProperty("test.dir", ".") + "/seq-out");
    output.getFileSystem(job).delete(output);

    // configure input for Avro from sequence file
    AvroJob.setInputSequenceFile(job);
    AvroJob.setInputSchema(job, SCHEMA);
    FileInputFormat.setInputPaths(job, FILE.toURI().toString());

    // mapper is default, identity
    // use a hadoop reducer that consumes Avro input
    AvroJob.setMapOutputSchema(job, SCHEMA);
    job.setReducerClass(NonAvroReducer.class);

    // configure output for non-Avro SequenceFile
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, output);

    // output key/value classes are default, LongWritable/Text

    JobClient.runJob(job);

    checkFile(new SequenceFileReader<Long, CharSequence>(new File(output.toString() + "/part-00000")));
}
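Note that this test passes a String rather than a Path: FileInputFormat also provides setInputPaths(JobConf, String), which accepts a comma-separated list of paths. For comparison (FILE is the test's input file; the second call is an assumed equivalent for a single path, not part of the original test):

    // String overload: comma-separated path list
    FileInputFormat.setInputPaths(job, FILE.toURI().toString());
    // Path varargs overload from the signature above, equivalent for one path
    FileInputFormat.setInputPaths(job, new Path(FILE.toURI()));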
From source file:org.apache.avro.mapred.TestWeather.java
License:Apache License
/** Uses default mapper with no reduces for a map-only identity job. */
@Test
@SuppressWarnings("deprecation")
public void testMapOnly() throws Exception {
    JobConf job = new JobConf();
    String inDir = System.getProperty("share.dir", "../../../share") + "/test/data";
    Path input = new Path(inDir + "/weather.avro");
    Path output = new Path(System.getProperty("test.dir", "target/test") + "/weather-ident");
    output.getFileSystem(job).delete(output);

    job.setJobName("identity map weather");

    AvroJob.setInputSchema(job, Weather.SCHEMA$);
    AvroJob.setOutputSchema(job, Weather.SCHEMA$);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);
    FileOutputFormat.setCompressOutput(job, true);

    job.setNumReduceTasks(0); // map-only

    JobClient.runJob(job);

    // check output is correct
    DatumReader<Weather> reader = new SpecificDatumReader<Weather>();
    DataFileReader<Weather> check = new DataFileReader<Weather>(new File(inDir + "/weather.avro"), reader);
    DataFileReader<Weather> sorted = new DataFileReader<Weather>(
            new File(output.toString() + "/part-00000.avro"), reader);
    for (Weather w : sorted)
        assertEquals(check.next(), w);
    check.close();
    sorted.close();
}
From source file:org.apache.avro.mapred.TestWeather.java
License:Apache License
@Test @SuppressWarnings("deprecation") public void testSort() throws Exception { JobConf job = new JobConf(); String inDir = System.getProperty("share.dir", "../../../share") + "/test/data"; Path input = new Path(inDir + "/weather.avro"); Path output = new Path(System.getProperty("test.dir", "target/test") + "/weather-sort"); output.getFileSystem(job).delete(output); job.setJobName("sort weather"); AvroJob.setInputSchema(job, Weather.SCHEMA$); AvroJob.setMapOutputSchema(job, Pair.getPairSchema(Weather.SCHEMA$, Schema.create(Type.NULL))); AvroJob.setOutputSchema(job, Weather.SCHEMA$); AvroJob.setMapperClass(job, SortMapper.class); AvroJob.setReducerClass(job, SortReducer.class); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); FileOutputFormat.setCompressOutput(job, true); AvroJob.setOutputCodec(job, SNAPPY_CODEC); JobClient.runJob(job);//from w w w . j ava 2 s. c om // check output is correct DatumReader<Weather> reader = new SpecificDatumReader<Weather>(); DataFileReader<Weather> check = new DataFileReader<Weather>(new File(inDir + "/weather-sorted.avro"), reader); DataFileReader<Weather> sorted = new DataFileReader<Weather>( new File(output.toString() + "/part-00000.avro"), reader); for (Weather w : sorted) assertEquals(check.next(), w); check.close(); sorted.close(); // check that AvroMapper and AvroReducer get close() and configure() called assertEquals(1, mapCloseCalls.get()); assertEquals(1, reducerCloseCalls.get()); assertEquals(1, mapConfigureCalls.get()); assertEquals(1, reducerConfigureCalls.get()); }
From source file:org.apache.avro.mapred.TestWordCount.java
License:Apache License
@SuppressWarnings("deprecation") public void testJob() throws Exception { JobConf job = new JobConf(); String dir = System.getProperty("test.dir", ".") + "/mapred"; Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesFile();//from w w w . j a va2s. c o m job.setJobName("wordcount"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, new Path(dir + "/in")); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, true); WordCountUtil.setMeta(job); JobClient.runJob(job); WordCountUtil.validateCountsFile(); }
From source file:org.apache.avro.mapred.TestWordCountGeneric.java
License:Apache License
@Test @SuppressWarnings("deprecation") public void testJob() throws Exception { String dir = System.getProperty("test.dir", ".") + "/mapred"; Path outputPath = new Path(dir + "/out"); JobConf job = new JobConf(); try {//from ww w. ja v a 2 s . c om WordCountUtil.writeLinesFile(); job.setJobName("wordcount"); AvroJob.setInputGeneric(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputGeneric(job, WordCount.SCHEMA$); job.setMapperClass(MapImpl.class); job.setCombinerClass(ReduceImpl.class); job.setReducerClass(ReduceImpl.class); FileInputFormat.setInputPaths(job, new Path(dir + "/in")); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, true); JobClient.runJob(job); WordCountUtil.validateCountsFile(); } finally { outputPath.getFileSystem(job).delete(outputPath); } }
From source file:org.apache.avro.mapred.TestWordCountSpecific.java
License:Apache License
@Test @SuppressWarnings("deprecation") public void testJob() throws Exception { JobConf job = new JobConf(); String dir = System.getProperty("test.dir", ".") + "/mapred"; Path outputPath = new Path(dir + "/out"); try {// w ww . ja v a 2 s.c o m WordCountUtil.writeLinesFile(); job.setJobName("wordcount"); AvroJob.setInputSpecific(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSpecific(job, WordCount.SCHEMA$); job.setMapperClass(MapImpl.class); job.setCombinerClass(ReduceImpl.class); job.setReducerClass(ReduceImpl.class); FileInputFormat.setInputPaths(job, new Path(dir + "/in")); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, true); JobClient.runJob(job); WordCountUtil.validateCountsFile(); } finally { outputPath.getFileSystem(job).delete(outputPath); } }
From source file:org.apache.cassandra.bulkloader.CassandraBulkLoader.java
License:Apache License
public static void runJob(String[] args) {
    JobConf conf = new JobConf(CassandraBulkLoader.class);

    if (args.length >= 4) {
        conf.setNumReduceTasks(new Integer(args[3]));
    }

    try {
        // We store the cassandra storage-conf.xml on the HDFS cluster
        DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf);
    } catch (URISyntaxException e) {
        throw new RuntimeException(e);
    }

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setJobName("CassandraBulkLoader_v2");
    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[1]));
    FileOutputFormat.setOutputPath(conf, new Path(args[2]));

    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:org.apache.druid.indexer.hadoop.DatasourceInputFormat.java
License:Apache License
@VisibleForTesting static Stream<String> getLocations(final List<WindowedDataSegment> segments, final org.apache.hadoop.mapred.InputFormat fio, final JobConf conf) { return segments.stream().sequential().flatMap((final WindowedDataSegment segment) -> { FileInputFormat.setInputPaths(conf, new Path(JobHelper.getURIFromSegment(segment.getSegment()))); try {//from www . ja v a2s . c om return Arrays.stream(fio.getSplits(conf, 1)) .flatMap((final org.apache.hadoop.mapred.InputSplit split) -> { try { return Arrays.stream(split.getLocations()); } catch (final Exception e) { logger.error(e, "Exception getting locations"); return Stream.empty(); } }); } catch (final Exception e) { logger.error(e, "Exception getting splits"); return Stream.empty(); } }); }
From source file:org.apache.hawq.pxf.plugins.hive.HiveDataFragmenter.java
License:Apache License
private void fetchMetaData(HiveTablePartition tablePartition) throws Exception {
    InputFormat<?, ?> fformat = makeInputFormat(tablePartition.storageDesc.getInputFormat(), jobConf);
    FileInputFormat.setInputPaths(jobConf, new Path(tablePartition.storageDesc.getLocation()));

    InputSplit[] splits = null;
    try {
        splits = fformat.getSplits(jobConf, 1);
    } catch (org.apache.hadoop.mapred.InvalidInputException e) {
        LOG.debug("getSplits failed on " + e.getMessage());
        return;
    }

    for (InputSplit split : splits) {
        FileSplit fsp = (FileSplit) split;
        String[] hosts = fsp.getLocations();
        String filepath = fsp.getPath().toUri().getPath();
        byte[] locationInfo = HdfsUtilities.prepareFragmentMetadata(fsp);
        Fragment fragment = new Fragment(filepath, hosts, locationInfo, makeUserData(tablePartition));
        fragments.add(fragment);
    }
}
From source file:org.apache.hcatalog.hcatmix.load.HadoopLoadGenerator.java
License:Apache License
/**
 * Prepare input directory/jobConf and launch the hadoop job, for load testing
 *
 * @param confFileName The properties file for the task, should be available in the classpath
 * @param conf
 * @return
 * @throws IOException
 * @throws MetaException
 * @throws TException
 */
public SortedMap<Long, ReduceResult> runLoadTest(String confFileName, Configuration conf)
        throws Exception, MetaException, TException {
    JobConf jobConf;
    if (conf != null) {
        jobConf = new JobConf(conf);
    } else {
        jobConf = new JobConf(new Configuration());
    }

    InputStream confFileIS;
    try {
        confFileIS = HCatMixUtils.getInputStream(confFileName);
    } catch (Exception e) {
        LOG.error("Couldn't load configuration file " + confFileName);
        throw e;
    }

    Properties props = new Properties();
    try {
        props.load(confFileIS);
    } catch (IOException e) {
        LOG.error("Couldn't load properties file: " + confFileName, e);
        throw e;
    }

    LOG.info("Loading configuration file: " + confFileName);
    addToJobConf(jobConf, props, Conf.MAP_RUN_TIME_MINUTES);
    addToJobConf(jobConf, props, Conf.STAT_COLLECTION_INTERVAL_MINUTE);
    addToJobConf(jobConf, props, Conf.THREAD_INCREMENT_COUNT);
    addToJobConf(jobConf, props, Conf.THREAD_INCREMENT_INTERVAL_MINUTES);
    addToJobConf(jobConf, props, Conf.THREAD_COMPLETION_BUFFER_MINUTES);

    int numMappers = Integer
            .parseInt(props.getProperty(Conf.NUM_MAPPERS.propName, "" + Conf.NUM_MAPPERS.defaultValue));
    Path inputDir = new Path(props.getProperty(Conf.INPUT_DIR.propName, Conf.INPUT_DIR.defaultValueStr));
    Path outputDir = new Path(props.getProperty(Conf.OUTPUT_DIR.propName, Conf.OUTPUT_DIR.defaultValueStr));

    jobConf.setJobName(JOB_NAME);
    jobConf.setNumMapTasks(numMappers);
    jobConf.setMapperClass(HCatMapper.class);
    jobConf.setJarByClass(HCatMapper.class);
    jobConf.setReducerClass(HCatReducer.class);
    jobConf.setMapOutputKeyClass(LongWritable.class);
    jobConf.setMapOutputValueClass(IntervalResult.class);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(ReduceResult.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    jobConf.set(Conf.TASK_CLASS_NAMES.getJobConfKey(),
            props.getProperty(Conf.TASK_CLASS_NAMES.propName, Conf.TASK_CLASS_NAMES.defaultValueStr));

    fs = FileSystem.get(jobConf);
    Path jarRoot = new Path("/tmp/hcatmix_jar_" + new Random().nextInt());
    HadoopUtils.uploadClasspathAndAddToJobConf(jobConf, jarRoot);
    fs.deleteOnExit(jarRoot);

    FileInputFormat.setInputPaths(jobConf, createInputFiles(inputDir, numMappers));
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }
    FileOutputFormat.setOutputPath(jobConf, outputDir);

    // Set up delegation token required for hiveMetaStoreClient in map task
    HiveConf hiveConf = new HiveConf(HadoopLoadGenerator.class);
    HiveMetaStoreClient hiveClient = new HiveMetaStoreClient(hiveConf);
    String tokenStr = hiveClient.getDelegationToken(UserGroupInformation.getCurrentUser().getUserName(),
            "mapred");
    Token<? extends AbstractDelegationTokenIdentifier> token = new Token<DelegationTokenIdentifier>();
    token.decodeFromUrlString(tokenStr);
    token.setService(new Text(METASTORE_TOKEN_SIGNATURE));
    jobConf.getCredentials().addToken(new Text(METASTORE_TOKEN_KEY), token);

    // Submit the job, once the job is complete see output
    LOG.info("Submitted hadoop job");
    RunningJob j = JobClient.runJob(jobConf);
    LOG.info("JobID is: " + j.getJobName());

    if (!j.isSuccessful()) {
        throw new IOException("Job failed");
    }
    return readResult(outputDir, jobConf);
}