Usage examples for org.apache.hadoop.mapred.FileInputFormat.getInputPaths
public static Path[] getInputPaths(JobConf conf)
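getInputPaths returns the array of input Paths currently registered on a JobConf, i.e. whatever was previously configured with FileInputFormat.setInputPaths or addInputPath. Before the project examples below, here is a minimal, self-contained sketch of that round trip; it is not taken from any of the sources listed here, and the class name and /data/... paths are hypothetical.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsSketch {
    public static void main(String[] args) {
        // Register two input paths, then add a third (paths are hypothetical).
        JobConf conf = new JobConf();
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));
        FileInputFormat.addInputPath(conf, new Path("/data/in3"));

        // getInputPaths returns every path registered above.
        for (Path p : FileInputFormat.getInputPaths(conf)) {
            System.out.println(p);
        }
    }
}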
From source file:cascading.tap.hadoop.ZipInputFormat.java
License:Open Source License
protected Path[] listPathsInternal(JobConf jobConf) throws IOException {
    Path[] dirs = FileInputFormat.getInputPaths(jobConf);

    if (dirs.length == 0)
        throw new IOException("no input paths specified in job");

    for (Path dir : dirs) {
        FileSystem fs = dir.getFileSystem(jobConf);

        if (!fs.isFile(dir))
            throw new IOException("does not support directories: " + dir);
    }

    return dirs;
}
From source file:cascading.tap.Hfs.java
License:Open Source License
@Override
public void sourceInit(JobConf conf) throws IOException {
    Path qualifiedPath = getQualifiedPath(conf);

    for (Path exitingPath : FileInputFormat.getInputPaths(conf)) {
        if (exitingPath.equals(qualifiedPath))
            throw new TapException("may not add duplicate paths, found: " + exitingPath);
    }

    FileInputFormat.addInputPath(conf, qualifiedPath);

    super.sourceInit(conf);

    makeLocal(conf, qualifiedPath, "forcing job to local mode, via source: ");

    TupleSerialization.setSerializations(conf); // allows Hfs to be used independent of Flow
}
From source file:cn.edu.xmu.dm.mapreduce.Sort.java
License:Apache License
/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there is communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "Sorter");
    job.setJarByClass(Sort.class);

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took "
            + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file:com.alexholmes.hadooputils.sort.Sort.java
License:Apache License
/**
 * The driver for the sort MapReduce job.
 *
 * @param jobConf           sort configuration
 * @param numMapTasks       number of map tasks
 * @param numReduceTasks    number of reduce tasks
 * @param sampler           sampler, if required
 * @param codecClass        the compression codec for compressing final outputs
 * @param mapCodecClass     the compression codec for compressing intermediary map outputs
 * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
 *                          for the job output files
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws IOException        if something went wrong
 * @throws URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
        final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
        final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
        final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

    jobConf.setJarByClass(Sort.class);
    jobConf.setJobName("sorter");

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    if (numMapTasks != null) {
        jobConf.setNumMapTasks(numMapTasks);
    }
    if (numReduceTasks != null) {
        jobConf.setNumReduceTasks(numReduceTasks);
    } else {
        int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sort.reduces_per_host");
        if (sortReduces != null) {
            numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }

        // Set user-supplied (possibly default) job configs
        jobConf.setNumReduceTasks(numReduces);
    }

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(SortReduce.class);

    jobConf.setInputFormat(SortInputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (mapCodecClass != null) {
        jobConf.setMapOutputCompressorClass(mapCodecClass);
    }

    if (codecClass != null) {
        jobConf.setBoolean("mapred.output.compress", true);
        jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
    }

    FileInputFormat.setInputPaths(jobConf, inputDirAsString);
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

        FileSystem fileSystem = FileSystem.get(jobConf);

        if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
            inputDir = inputDir.getParent();
        }
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + jobConf.getNumReduceTasks() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    if (jobResult.isSuccessful()) {
        if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
            new LzoIndexer(jobConf).index(new Path(outputDirAsString));
        }
        return true;
    }
    return false;
}
From source file:com.benchmark.mapred.Sort.java
License:Apache License
/**
 * The main driver for sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there is communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took "
            + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file:com.cloudera.recordservice.hive.RecordServiceHiveInputFormat.java
License:Apache License
/**
 * Copied HiveInputFormat
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    init(job);

    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    JobConf newjob = new JobConf(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    List<Path> currentDirs = new ArrayList<Path>();
    Class<? extends InputFormat> currentInputFormatClass = null;
    TableDesc currentTable = null;
    TableScanOperator currentTableScan = null;

    // for each dir, get the InputFormat, and do getSplits.
    for (Path dir : dirs) {
        PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
        Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
        TableDesc table = part.getTableDesc();
        TableScanOperator tableScan = null;

        List<String> aliases = mrwork_.getPathToAliases().get(dir.toUri().toString());

        // Make filter pushdown information available to getSplits.
        if ((aliases != null) && (aliases.size() == 1)) {
            Operator op = mrwork_.getAliasToWork().get(aliases.get(0));
            if ((op != null) && (op instanceof TableScanOperator)) {
                tableScan = (TableScanOperator) op;
                // push down projections.
                ColumnProjectionUtils.appendReadColumns(newjob, tableScan.getNeededColumnIDs(),
                        tableScan.getNeededColumns());
                // push down filters
                pushFilters(newjob, tableScan);
            }
        }

        if (!currentDirs.isEmpty() && inputFormatClass.equals(currentInputFormatClass)
                && table.equals(currentTable) && tableScan == currentTableScan) {
            currentDirs.add(dir);
            continue;
        }

        if (!currentDirs.isEmpty()) {
            LOG.info("Generating splits");
            addSplitsForGroup(currentDirs, currentTableScan, newjob,
                    getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass,
                    currentDirs.size() * (numSplits / dirs.length), currentTable, result);
        }

        currentDirs.clear();
        currentDirs.add(dir);
        currentTableScan = tableScan;
        currentTable = table;
        currentInputFormatClass = inputFormatClass;
    }

    if (dirs.length != 0) {
        LOG.info("Generating splits");
        addSplitsForGroup(currentDirs, currentTableScan, newjob,
                getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass,
                currentDirs.size() * (numSplits / dirs.length), currentTable, result);
    }

    LOG.info("number of splits " + result.size());
    return result.toArray(new HiveInputSplitShim[result.size()]);
}
From source file:com.cloudera.science.avro.streaming.AvroAsJSONInputFormat.java
License:Open Source License
private void loadSchemas(JobConf job) throws IOException {
    this.schemas = Lists.newArrayList();
    SchemaLoader loader = new SchemaLoader(job);
    String schemaLiteral = job.get(SCHEMA_LITERAL);
    if (schemaLiteral != null) {
        schemas.add(loader.loadLiteral(schemaLiteral));
        return;
    } else {
        String[] schemaUrls = job.getStrings(SCHEMA_URL);
        String[] typeNames = job.getStrings(SCHEMA_TYPE_NAME);
        if (schemaUrls != null) {
            for (String schemaUrl : schemaUrls) {
                schemas.add(loader.loadFromUrl(schemaUrl));
            }
        } else if (typeNames != null) {
            for (String typeName : typeNames) {
                schemas.add(loader.loadFromTypeName(typeName));
            }
        } else {
            throw new IllegalArgumentException("No schema information provided");
        }

        if (schemas.size() > 1) {
            // Need to track input paths
            Path[] inputs = FileInputFormat.getInputPaths(job);
            if (inputs.length != schemas.size()) {
                throw new IllegalArgumentException(String.format(
                        "Number of input paths (%d) does not match number of schemas specified (%d)",
                        inputs.length, schemas.size()));
            }
            this.inputPaths = new String[inputs.length];
            for (int i = 0; i < inputs.length; i++) {
                inputPaths[i] = inputs[i].toString();
            }
        }
    }
}
From source file:com.dappervision.hbase.mapred.TypedBytesTableInputFormat.java
License:Apache License
/**
 * Builds a TableRecordReader. If no TableRecordReader was provided, uses
 * the default.
 *
 * @see org.apache.hadoop.mapred.InputFormat#getRecordReader(InputSplit,
 *      JobConf, Reporter)
 */
public void configure(JobConf job) {
    Path[] tableNames = FileInputFormat.getInputPaths(job);
    String colArg = job.get(COLUMN_LIST);
    String[] colNames = colArg.split(" ");
    byte[][] m_cols = new byte[colNames.length][];
    for (int i = 0; i < m_cols.length; i++) {
        m_cols[i] = Base64.decodeBase64(Bytes.toBytes(colNames[i]));
    }
    setInputColumns(m_cols);

    if (job.get(ROW_FILTER_REGEX) != null) {
        LOG.info("Row Regex Filter[" + job.get(ROW_FILTER_REGEX) + "]");
        setRowFilter(new RowFilter(CompareFilter.CompareOp.EQUAL,
                new RegexStringComparator(job.get(ROW_FILTER_REGEX))));
    }

    if (job.get(START_ROW) != null) {
        LOG.info("Start Row[" + job.get(START_ROW) + "]");
        try {
            setStartRow(Base64.decodeBase64(job.get(START_ROW).getBytes("US-ASCII")));
        } catch (UnsupportedEncodingException e) {
            LOG.error("Start Row[" + job.get(START_ROW) + "] - Error");
        }
    }

    if (job.get(STOP_ROW) != null) {
        LOG.info("Stop Row[" + job.get(STOP_ROW) + "]");
        try {
            setStopRow(Base64.decodeBase64(job.get(STOP_ROW).getBytes("US-ASCII")));
        } catch (UnsupportedEncodingException e) {
            LOG.error("Stop Row[" + job.get(STOP_ROW) + "] - Error");
        }
    }

    try {
        setHTable(new HTable(HBaseConfiguration.create(job), tableNames[0].getName()));
    } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
    }

    if (job.get(VALUE_FORMAT) != null && job.get(VALUE_FORMAT).equalsIgnoreCase("singlevalue")) {
        LOG.info("Value Format[" + job.get(VALUE_FORMAT) + "]");
        super.setTableRecordReader(new TypedBytesTableRecordReaderSingleValue());
    } else {
        LOG.info("Value Format[familiescolumns]");
        super.setTableRecordReader(new TypedBytesTableRecordReader());
    }
}
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableSnapshotInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    setColumns(job);

    // hive depends on FileSplits, so wrap in HBaseSplit
    Path[] tablePaths = FileInputFormat.getInputPaths(job);

    InputSplit[] results = delegate.getSplits(job, numSplits);
    for (int i = 0; i < results.length; i++) {
        results[i] = new HBaseSplit(results[i], tablePaths[0]);
    }
    return results;
}
From source file:com.linkedin.mapred.AvroUtils.java
License:Open Source License
/**
 * Obtain the avro input schema from data
 * @param conf
 * @return
 * @throws IOException
 */
public static Schema getAvroInputSchema(JobConf conf) throws IOException {
    Path[] paths = FileInputFormat.getInputPaths(conf);
    if (paths == null) {
        throw new IllegalStateException("input paths do not exist in jobConf!");
    }
    Schema inputSchema = AvroUtils.getSchemaFromFile(conf, paths[0]);
    if (inputSchema == null) {
        throw new IllegalStateException("Input does not have schema info and/or input is missing.");
    }
    return inputSchema;
}