List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match filePattern and are not checksum files.
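Before the project examples below, here is a minimal, self-contained sketch of calling globStatus directly; the configuration, glob pattern, and printed output are illustrative assumptions, not drawn from any of the listed source files.

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class GlobStatusSketch {
        public static void main(String[] args) throws IOException {
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(conf);

            // Hypothetical pattern: every part file under dated output directories.
            Path pattern = new Path("/data/output/2024-*/part-*");

            // globStatus may return null (non-glob path that does not exist)
            // or an empty array (glob that matches nothing), so guard the result.
            FileStatus[] matches = fs.globStatus(pattern);
            if (matches != null) {
                for (FileStatus status : matches) {
                    System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
                }
            }
        }
    }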
From source file:org.archive.hadoop.jobs.WATGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WAT Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating WATs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(WATGeneratorMapper.class);
    job.setJarByClass(WATGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to WATGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }

    return 0;
}
From source file:org.archive.jbs.Parse.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    FileSystem fs = FileSystem.get(getConf());

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("jbs.Parse " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // Use the Parse-specific output format.
    job.setOutputFormat(PerMapOutputFormat.class);

    // Use our ParseMapper, with output keys and values of type
    // Text.
    job.setMapperClass(ParseMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Configure the input and output paths, from the command-line.
    Path outputDir = new Path(args[0]);
    FileOutputFormat.setOutputPath(job, outputDir);

    boolean atLeastOneInput = false;
    for (int i = 1; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            Path outputPath = new Path(outputDir, inputPath.getName());
            if (fs.exists(outputPath)) {
                LOG.debug("Output path already exists: " + outputPath);
            } else {
                atLeastOneInput = true;
                LOG.info("Add input path: " + inputPath);
                FileInputFormat.addInputPath(job, inputPath);
            }
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to parse.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }

    return 0;
}
From source file:org.archive.jbs.tools.Dump.java
License:Apache License
public int run(String[] args) throws Exception {
    String usage = "Usage: Dump [-k|-v] <mapfile|sequencefile>...";

    if (args.length < 1) {
        System.err.println(usage);
        return 1;
    }

    int i = 0;
    int mode = 0;
    // mode 1 corresponds to -k (dump keys), mode 2 to -v (dump values).
    if (args[0].equals("-k")) {
        mode = 1;
        i++;
    } else if (args[0].equals("-v")) {
        mode = 2;
        i++;
    }

    // Expand each remaining argument as a glob and dump every matching file.
    for (; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            dump(inputfs, inputPath, mode);
        }
    }

    return 0;
}
From source file:org.bgi.flexlab.gaea.util.SortUilts.java
License:Open Source License
public static void merge(MultipleVCFHeader mVcfHeader, VCFSortOptions options, Configuration conf) {
    try {
        System.out.println("vcf-MultiSampleSort :: Merging output...");

        // First, place the VCF or BCF header.
        final Path outpath = new Path(options.getOutputPath());
        final Path wrkPath = new Path(options.getWorkPath());
        final FileSystem srcFS = wrkPath.getFileSystem(conf);
        final FileSystem dstFS = outpath.getFileSystem(conf);

        Map<String, OutputStream> outs = new HashMap<String, OutputStream>();
        Map<Integer, String> multiOutputs = options.getMultiOutputs();
        for (String result : multiOutputs.values()) {
            Path sPath = new Path(options.getOutputPath() + "/" + result + ".vcf");
            OutputStream os = dstFS.create(sPath);
            outs.put(result, os);
        }

        final VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
        VariantContextWriter writer;
        Map<Integer, SingleVCFHeader> id2VcfHeader = mVcfHeader.getID2SingleVcfHeader();
        for (int id : multiOutputs.keySet()) {
            VCFHeader newHeader = id2VcfHeader.get(id).getHeader();
            writer = builder.setOutputStream(new FilterOutputStream(outs.get(multiOutputs.get(id))) {
                @Override
                public void close() throws IOException {
                    this.out.flush();
                }
            }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

            writer.writeHeader(newHeader);
            writer.close();

            final FileStatus[] parts = srcFS.globStatus(
                    new Path(options.getWorkPath(), multiOutputs.get(id) + "-*-[0-9][0-9][0-9][0-9][0-9]*"));

            int i = 0;
            for (final FileStatus part : parts) {
                System.out.printf("sort:: Merging part %d ( size %d)...\n", i++, part.getLen());
                System.out.flush();

                final FSDataInputStream ins = srcFS.open(part.getPath());
                IOUtils.copyBytes(ins, outs.get(multiOutputs.get(id)), conf, false);
                ins.close();
            }
            for (final FileStatus part : parts)
                srcFS.delete(part.getPath(), false);

            outs.get(multiOutputs.get(id)).close();
        }
    } catch (IOException e) {
        System.err.printf("vcf-MultiSampleSort :: Output merging failed: %s\n", e);
    }
}
From source file:org.commoncrawl.mapred.ec2.parser.EC2ParserTask.java
License:Open Source License
private static List<Path> scanForCompletedSegments(FileSystem fs) throws IOException {
    ImmutableList.Builder<Path> pathListBuilder = new ImmutableList.Builder<Path>();

    for (FileStatus fileStatus : fs.globStatus(new Path(VALID_SEGMENTS_PATH + "[0-9]*"))) {
        pathListBuilder.addAll(scanSegmentManifestFile(fs, fileStatus.getPath()));
    }

    return pathListBuilder.build();
}
From source file:org.commoncrawl.mapred.ec2.parser.EC2ParserTask.java
License:Open Source License
/**
 * build a list of parse candidates sorted by timestamp
 *
 * @param fs
 * @param logFilePath
 * @return a Set of Candidates
 * @throws IOException
 */
private static TreeSet<Path> buildCandidateList(FileSystem fs, Path logFilePath) throws IOException {
    TreeSet<Path> candidateList = new TreeSet<Path>(new Comparator<Path>() {

        @Override
        public int compare(Path p1, Path p2) {
            String n1 = p1.getName();
            String n2 = p2.getName();
            Matcher m1 = CRAWL_LOG_REG_EXP.matcher(n1);
            Matcher m2 = CRAWL_LOG_REG_EXP.matcher(n2);

            m1.matches();
            m2.matches();

            Long v1 = Long.parseLong(m1.group(1));
            Long v2 = Long.parseLong(m2.group(1));

            return v1.compareTo(v2);
        }
    });

    LOG.info("Scanning for Log Files at:" + logFilePath);
    FileStatus candidateItems[] = fs.globStatus(new Path(logFilePath, "CrawlLog*"));
    for (FileStatus candidate : candidateItems) {
        candidateList.add(candidate.getPath());
    }

    return candidateList;
}
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergingReducer.java
License:Open Source License
/**
 * scan the merge db path and find the latest crawl database timestamp
 *
 * @param fs
 * @param conf
 * @return
 * @throws IOException
 */
static long findLatestMergeDBTimestamp(FileSystem fs, Configuration conf) throws IOException {
    long timestampOut = -1L;

    FileStatus files[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_DB_PATH, "[0-9]*"));

    for (FileStatus candidate : files) {
        Path successPath = new Path(candidate.getPath(), "_SUCCESS");
        if (fs.exists(successPath)) {
            long timestamp = Long.parseLong(candidate.getPath().getName());
            timestampOut = Math.max(timestamp, timestampOut);
        }
    }
    return timestampOut;
}
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergingReducer.java
License:Open Source License
/**
 * iterate the intermediate link graph data and extract unmerged set ...
 *
 * @param fs
 * @param conf
 * @param latestMergeDBTimestamp
 * @return
 * @throws IOException
 */
static List<Path> filterMergeCandidtes(FileSystem fs, Configuration conf, long latestMergeDBTimestamp)
        throws IOException {
    ArrayList<Path> list = new ArrayList<Path>();
    FileStatus candidates[] = fs
            .globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH, "[0-9]*"));

    for (FileStatus candidate : candidates) {
        LOG.info("Found Merge Candidate:" + candidate.getPath());
        long candidateTimestamp = Long.parseLong(candidate.getPath().getName());
        if (candidateTimestamp > latestMergeDBTimestamp) {
            Path successPath = new Path(candidate.getPath(), "_SUCCESS");
            if (fs.exists(successPath)) {
                list.add(candidate.getPath());
            } else {
                LOG.info("Rejected Merge Candidate:" + candidate.getPath());
            }
        }
    }
    return list;
}
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java
License:Open Source License
public static void processSegmentEC2(FileSystem s3fs, Configuration conf, long segmentId) throws IOException {
    Path outputPath = new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Output path is:" + outputPath);

    if (s3fs.exists(outputPath)) {
        LOG.warn("Output Path Already Exists for Segment:" + segmentId + ".Deleting!");
        s3fs.delete(outputPath, true);
    }

    // ok collect merge files
    ArrayList<Path> pathList = new ArrayList<Path>();
    for (FileStatus metadataFile : s3fs.globStatus(new Path(SEGMENTS_PATH, segmentId + "/metadata-*"))) {
        pathList.add(metadataFile.getPath().makeQualified(s3fs));
    }
    LOG.info("Input Paths for Segment:" + segmentId + " are:" + pathList);

    JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf)
            .inputs(pathList)
            .inputFormat(SequenceFileInputFormat.class)
            .keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkGraphDataEmitter.class)
            .maxMapAttempts(7)
            .maxReduceAttempts(7)
            .maxMapTaskFailures(1000)
            .reuseJVM(1000)
            .reducer(LinkGraphDataEmitter.class, false)
            .partition(CrawlDBKeyPartitioner.class)
            .sort(LinkKeyComparator.class)
            .numReducers(1000)
            .speculativeExecution(true)
            .output(outputPath)
            .outputFormat(SequenceFileOutputFormat.class)
            .compressMapOutput(true)
            .compressor(CompressionType.BLOCK, SnappyCodec.class)
            .build();

    JobClient.runJob(jobConf);
}
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java
License:Open Source License
private static SortedSet<Long> scanForValidSegments(FileSystem fs) throws IOException {
    SortedSet<Long> completeSegmentIds = Sets.newTreeSet();

    for (FileStatus fileStatus : fs.globStatus(new Path(S3N_BUCKET_PREFIX + VALID_SEGMENTS_PATH + "[0-9]*"))) {
        completeSegmentIds.add(Long.parseLong(fileStatus.getPath().getName()));
    }
    return completeSegmentIds;
}