List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match filePattern and are not checksum files.
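Before the project examples below, here is a minimal, self-contained sketch of calling globStatus directly; the configuration, glob pattern, and printed output are illustrative assumptions, not drawn from any of the listed source files.

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class GlobStatusSketch {
        public static void main(String[] args) throws IOException {
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(conf);

            // Hypothetical pattern: every part file under dated output directories.
            Path pattern = new Path("/data/output/2024-*/part-*");

            // globStatus may return null (non-glob path that does not exist)
            // or an empty array (glob that matches nothing), so guard the result.
            FileStatus[] matches = fs.globStatus(pattern);
            if (matches != null) {
                for (FileStatus status : matches) {
                    System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
                }
            }
        }
    }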
From source file:org.archive.hadoop.jobs.WATGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WAT Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating WATs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(WATGeneratorMapper.class);
    job.setJarByClass(WATGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to WATGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }

    return 0;
}
From source file:org.archive.jbs.Parse.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    FileSystem fs = FileSystem.get(getConf());

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("jbs.Parse " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // Use the Parse-specific output format.
    job.setOutputFormat(PerMapOutputFormat.class);

    // Use our ParseMapper, with output keys and values of type
    // Text.
    job.setMapperClass(ParseMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Configure the input and output paths, from the command-line.
    Path outputDir = new Path(args[0]);
    FileOutputFormat.setOutputPath(job, outputDir);

    boolean atLeastOneInput = false;
    for (int i = 1; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            Path outputPath = new Path(outputDir, inputPath.getName());
            if (fs.exists(outputPath)) {
                LOG.debug("Output path already exists: " + outputPath);
            } else {
                atLeastOneInput = true;
                LOG.info("Add input path: " + inputPath);
                FileInputFormat.addInputPath(job, inputPath);
            }
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to parse.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }

    return 0;
}
From source file:org.archive.jbs.tools.Dump.java
License:Apache License
public int run(String[] args) throws Exception {
    String usage = "Usage: Dump [-k|-v] <mapfile|sequencefile>...";

    if (args.length < 1) {
        System.err.println(usage);
        return 1;
    }

    int i = 0;
    int mode = 0;
    // mode 1 corresponds to -k (dump keys), mode 2 to -v (dump values).
    if (args[0].equals("-k")) {
        mode = 1;
        i++;
    } else if (args[0].equals("-v")) {
        mode = 2;
        i++;
    }

    // Expand each remaining argument as a glob and dump every matching file.
    for (; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            dump(inputfs, inputPath, mode);
        }
    }

    return 0;
}
From source file:org.bgi.flexlab.gaea.util.SortUilts.java
License:Open Source License
public static void merge(MultipleVCFHeader mVcfHeader, VCFSortOptions options, Configuration conf) {
    try {
        System.out.println("vcf-MultiSampleSort :: Merging output...");

        // First, place the VCF or BCF header.
        final Path outpath = new Path(options.getOutputPath());
        final Path wrkPath = new Path(options.getWorkPath());
        final FileSystem srcFS = wrkPath.getFileSystem(conf);
        final FileSystem dstFS = outpath.getFileSystem(conf);

        Map<String, OutputStream> outs = new HashMap<String, OutputStream>();
        Map<Integer, String> multiOutputs = options.getMultiOutputs();
        for (String result : multiOutputs.values()) {
            Path sPath = new Path(options.getOutputPath() + "/" + result + ".vcf");
            OutputStream os = dstFS.create(sPath);
            outs.put(result, os);
        }

        final VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
        VariantContextWriter writer;
        Map<Integer, SingleVCFHeader> id2VcfHeader = mVcfHeader.getID2SingleVcfHeader();
        for (int id : multiOutputs.keySet()) {
            VCFHeader newHeader = id2VcfHeader.get(id).getHeader();
            writer = builder.setOutputStream(new FilterOutputStream(outs.get(multiOutputs.get(id))) {
                @Override
                public void close() throws IOException {
                    this.out.flush();
                }
            }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

            writer.writeHeader(newHeader);
            writer.close();

            final FileStatus[] parts = srcFS.globStatus(
                    new Path(options.getWorkPath(), multiOutputs.get(id) + "-*-[0-9][0-9][0-9][0-9][0-9]*"));

            int i = 0;
            for (final FileStatus part : parts) {
                System.out.printf("sort:: Merging part %d ( size %d)...\n", i++, part.getLen());
                System.out.flush();

                final FSDataInputStream ins = srcFS.open(part.getPath());
                IOUtils.copyBytes(ins, outs.get(multiOutputs.get(id)), conf, false);
                ins.close();
            }
            for (final FileStatus part : parts)
                srcFS.delete(part.getPath(), false);

            outs.get(multiOutputs.get(id)).close();
        }
    } catch (IOException e) {
        System.err.printf("vcf-MultiSampleSort :: Output merging failed: %s\n", e);
    }
}
From source file:org.commoncrawl.mapred.ec2.parser.EC2ParserTask.java
License:Open Source License
private static List<Path> scanForCompletedSegments(FileSystem fs) throws IOException {
    ImmutableList.Builder<Path> pathListBuilder = new ImmutableList.Builder<Path>();

    for (FileStatus fileStatus : fs.globStatus(new Path(VALID_SEGMENTS_PATH + "[0-9]*"))) {
        pathListBuilder.addAll(scanSegmentManifestFile(fs, fileStatus.getPath()));
    }

    return pathListBuilder.build();
}
From source file:org.commoncrawl.mapred.ec2.parser.EC2ParserTask.java
License:Open Source License
/**
 * build a list of parse candidates sorted by timestamp
 *
 * @param fs
 * @param logFilePath
 * @return a Set of Candidates
 * @throws IOException
 */
private static TreeSet<Path> buildCandidateList(FileSystem fs, Path logFilePath) throws IOException {
    TreeSet<Path> candidateList = new TreeSet<Path>(new Comparator<Path>() {

        @Override
        public int compare(Path p1, Path p2) {
            String n1 = p1.getName();
            String n2 = p2.getName();
            Matcher m1 = CRAWL_LOG_REG_EXP.matcher(n1);
            Matcher m2 = CRAWL_LOG_REG_EXP.matcher(n2);

            m1.matches();
            m2.matches();

            Long v1 = Long.parseLong(m1.group(1));
            Long v2 = Long.parseLong(m2.group(1));

            return v1.compareTo(v2);
        }
    });

    LOG.info("Scanning for Log Files at:" + logFilePath);
    FileStatus candidateItems[] = fs.globStatus(new Path(logFilePath, "CrawlLog*"));
    for (FileStatus candidate : candidateItems) {
        candidateList.add(candidate.getPath());
    }

    return candidateList;
}
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergingReducer.java
License:Open Source License
/**
 * scan the merge db path and find the latest crawl database timestamp
 *
 * @param fs
 * @param conf
 * @return
 * @throws IOException
 */
static long findLatestMergeDBTimestamp(FileSystem fs, Configuration conf) throws IOException {
    long timestampOut = -1L;

    FileStatus files[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_DB_PATH, "[0-9]*"));

    for (FileStatus candidate : files) {
        Path successPath = new Path(candidate.getPath(), "_SUCCESS");
        if (fs.exists(successPath)) {
            long timestamp = Long.parseLong(candidate.getPath().getName());
            timestampOut = Math.max(timestamp, timestampOut);
        }
    }
    return timestampOut;
}
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergingReducer.java
License:Open Source License
/**
 * iterate the intermediate link graph data and extract unmerged set ...
 *
 * @param fs
 * @param conf
 * @param latestMergeDBTimestamp
 * @return
 * @throws IOException
 */
static List<Path> filterMergeCandidtes(FileSystem fs, Configuration conf, long latestMergeDBTimestamp)
        throws IOException {
    ArrayList<Path> list = new ArrayList<Path>();
    FileStatus candidates[] = fs
            .globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH, "[0-9]*"));

    for (FileStatus candidate : candidates) {
        LOG.info("Found Merge Candidate:" + candidate.getPath());
        long candidateTimestamp = Long.parseLong(candidate.getPath().getName());
        if (candidateTimestamp > latestMergeDBTimestamp) {
            Path successPath = new Path(candidate.getPath(), "_SUCCESS");
            if (fs.exists(successPath)) {
                list.add(candidate.getPath());
            } else {
                LOG.info("Rejected Merge Candidate:" + candidate.getPath());
            }
        }
    }
    return list;
}
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java
License:Open Source License
public static void processSegmentEC2(FileSystem s3fs, Configuration conf, long segmentId) throws IOException {
    Path outputPath = new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Output path is:" + outputPath);

    if (s3fs.exists(outputPath)) {
        LOG.warn("Output Path Already Exists for Segment:" + segmentId + ".Deleting!");
        s3fs.delete(outputPath, true);
    }

    // ok collect merge files
    ArrayList<Path> pathList = new ArrayList<Path>();
    for (FileStatus metadataFile : s3fs.globStatus(new Path(SEGMENTS_PATH, segmentId + "/metadata-*"))) {
        pathList.add(metadataFile.getPath().makeQualified(s3fs));
    }
    LOG.info("Input Paths for Segment:" + segmentId + " are:" + pathList);

    JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf)
            .inputs(pathList)
            .inputFormat(SequenceFileInputFormat.class)
            .keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkGraphDataEmitter.class)
            .maxMapAttempts(7)
            .maxReduceAttempts(7)
            .maxMapTaskFailures(1000)
            .reuseJVM(1000)
            .reducer(LinkGraphDataEmitter.class, false)
            .partition(CrawlDBKeyPartitioner.class)
            .sort(LinkKeyComparator.class)
            .numReducers(1000)
            .speculativeExecution(true)
            .output(outputPath)
            .outputFormat(SequenceFileOutputFormat.class)
            .compressMapOutput(true)
            .compressor(CompressionType.BLOCK, SnappyCodec.class)
            .build();

    JobClient.runJob(jobConf);
}
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java
License:Open Source License
private static SortedSet<Long> scanForValidSegments(FileSystem fs) throws IOException {
    SortedSet<Long> completeSegmentIds = Sets.newTreeSet();

    for (FileStatus fileStatus : fs.globStatus(new Path(S3N_BUCKET_PREFIX + VALID_SEGMENTS_PATH + "[0-9]*"))) {
        completeSegmentIds.add(Long.parseLong(fileStatus.getPath().getName()));
    }
    return completeSegmentIds;
}