List of usage examples for org.apache.hadoop.mapred.FileInputFormat#getInputPaths
public static Path[] getInputPaths(JobConf conf)
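Before the source-file examples, a minimal sketch (assumed, not taken from any of the sources below) of the round trip between setInputPaths and getInputPaths; the paths and job name are hypothetical placeholders:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        conf.setJobName("get-input-paths-sketch"); // hypothetical job name

        // Register two (hypothetical) input directories on the JobConf ...
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

        // ... and read them back; getInputPaths returns the registered paths,
        // or an empty array if none were set.
        for (Path path : FileInputFormat.getInputPaths(conf)) {
            System.out.println(path);
        }
    }
}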
From source file: HiveKeyIgnoringBAMOutputFormat.java
License: Open Source License
private void setSAMHeaderFrom(JobConf job) throws IOException {
    if (wrappedOutputFormat.getSAMHeader() != null)
        return;

    // XXX: We're not told where to take the SAM header from so we just merge
    // them all. There should probably be a better way of doing this.
    final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();

    // The "best" sort order among the headers: unsorted if they're sorted
    // differently, otherwise their common sort order.
    SAMFileHeader.SortOrder sortOrder = null;

    // XXX: it seems that FileInputFormat.getInputPaths(job) will point to
    // the directories of the input tables in the query. I'm not sure if this
    // is always the case.
    for (final Path table : FileInputFormat.getInputPaths(job)) {
        final FileSystem fs = table.getFileSystem(job);
        for (final FileStatus stat : fs.listStatus(table)) {
            if (!stat.isFile())
                throw new IOException("Unexpected directory '" + stat.getPath() + "', expected only files");

            final SAMFileReader r = new SAMFileReader(fs.open(stat.getPath()));
            final SAMFileHeader h = r.getFileHeader();
            r.close();
            headers.add(h);

            if (sortOrder == null) {
                sortOrder = h.getSortOrder();
                continue;
            }
            if (sortOrder == SAMFileHeader.SortOrder.unsorted)
                continue;
            if (sortOrder != h.getSortOrder())
                sortOrder = SAMFileHeader.SortOrder.unsorted;
        }
    }

    wrappedOutputFormat.setSAMHeader(new SamFileHeaderMerger(sortOrder, headers, true).getMergedHeader());
}
From source file: boa.datagen.SeqSort.java
License: Apache License
/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 * job tracker.
 */
@Override
public int run(String[] args) throws Exception {
    System.out.println(inPath);
    JobConf jobConf = new JobConf(getConf(), SeqSort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(BytesWritable.class);

    SequenceFileOutputFormat.setCompressOutput(jobConf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(jobConf, SnappyCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(jobConf, CompressionType.BLOCK);

    FileInputFormat.setInputPaths(jobConf, inPath);
    FileOutputFormat.setOutputPath(jobConf, new Path(outPath));

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file: cascading.flow.hadoop.MapReduceFlow.java
License: Open Source License
protected Map<String, Tap> createSources(JobConf jobConf) {
    Path[] paths = FileInputFormat.getInputPaths(jobConf);

    if (paths.length == 0) {
        try {
            paths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(new Job(jobConf));
        } catch (IOException exception) {
            throw new CascadingException(exception);
        }
    }

    Map<String, Tap> taps = new HashMap<String, Tap>();

    for (Path path : paths)
        taps.put(path.toString(), new Hfs(new NullScheme(), path.toString()));

    return taps;
}
From source file: cascading.flow.MapReduceFlow.java
License: Open Source License
private Map<String, Tap> createSources(JobConf jobConf) {
    Path[] paths = FileInputFormat.getInputPaths(jobConf);

    Map<String, Tap> taps = new HashMap<String, Tap>();

    for (Path path : paths)
        taps.put(path.toString(), new Hfs(new NullScheme(), path.toString()));

    return taps;
}
From source file: cascading.hbase.helper.TableInputFormat.java
License: Apache License
public void validateInput(JobConf job) throws IOException {
    // expecting exactly one path
    Path[] tableNames = FileInputFormat.getInputPaths(job);
    if (tableNames == null || tableNames.length > 1) {
        throw new IOException("expecting one table name");
    }

    // connected to table?
    if (getHTable() == null) {
        throw new IOException("could not connect to table '" + tableNames[0].getName() + "'");
    }

    // expecting at least one column
    String colArg = job.get(COLUMN_LIST);
    if (colArg == null || colArg.length() == 0) {
        throw new IOException("expecting at least one column");
    }
}
From source file: cascading.scheme.hadoop.TextLine.java
License: Open Source License
@Override
public void sourceConfInit(FlowProcess<? extends Configuration> flowProcess,
        Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf) {
    if (hasZippedFiles(FileInputFormat.getInputPaths(asJobConfInstance(conf))))
        throw new IllegalStateException(
                "cannot read zip files: " + Arrays.toString(FileInputFormat.getInputPaths(asJobConfInstance(conf))));

    conf.setBoolean("mapred.mapper.new-api", false);
    conf.setClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class);
}
From source file: cascading.scheme.TextLine.java
License: Open Source License
@Override
public void sourceInit(Tap tap, JobConf conf) {
    if (hasZippedFiles(FileInputFormat.getInputPaths(conf)))
        conf.setInputFormat(ZipInputFormat.class);
    else
        conf.setInputFormat(TextInputFormat.class);
}
From source file: cascading.tap.hadoop.Hfs.java
License: Open Source License
protected static void verifyNoDuplicates(Configuration conf) {
    Path[] inputPaths = FileInputFormat.getInputPaths(HadoopUtil.asJobConfInstance(conf));
    Set<Path> paths = new HashSet<Path>((int) (inputPaths.length / .75f));

    for (Path inputPath : inputPaths) {
        if (!paths.add(inputPath))
            throw new TapException("may not add duplicate paths, found: " + inputPath);
    }
}
From source file: cascading.tap.hadoop.io.MultiInputFormat.java
License: Open Source License
/**
 * Used to set the current JobConf with all sub-job configurations.
 *
 * @param toJob
 * @param fromJobs
 */
public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
        if (fromJob.get("mapred.input.format.class") == null)
            throw new CascadingException(
                    "mapred.input.format.class is required, should be set in source Scheme#sourceConfInit");

        configs.add(HadoopUtil.getConfig(toJob, fromJob));
        Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

        if (!isLocal)
            isLocal = HadoopUtil.isLocal(fromJob);
    }

    if (!allPaths.isEmpty()) // it's possible there aren't any
        FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
        toJob.set("cascading.multiinputformats", HadoopUtil.serializeBase64(configs, toJob, true));
    } catch (IOException exception) {
        throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
        HadoopUtil.setLocal(toJob);
}
From source file: cascading.tap.hadoop.MultiInputFormat.java
License: Open Source License
/**
 * Used to set the current JobConf with all sub-job configurations.
 *
 * @param toJob
 * @param fromJobs
 */
public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
        configs.add(getConfig(toJob, fromJob));
        Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

        if (!isLocal)
            isLocal = fromJob.get("mapred.job.tracker").equalsIgnoreCase("local");
    }

    FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
        toJob.set("cascading.multiinputformats", Util.serializeBase64(configs));
    } catch (IOException exception) {
        throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
        toJob.set("mapred.job.tracker", "local");
}