Example usage for org.apache.hadoop.fs FileSystem globStatus


Introduction

This page shows usage examples for org.apache.hadoop.fs.FileSystem.globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match pathPattern and are not checksum files.
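For orientation, here is a minimal, self-contained sketch of calling globStatus directly. The pattern "/data/logs/part-*" and the default Configuration are placeholders chosen for illustration, not taken from the examples below. Note that globStatus returns null when the pattern contains no glob and the path does not exist, and an empty array when a glob matches nothing.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Expand a glob pattern into the matching, non-checksum files.
        // The pattern below is a placeholder path.
        FileStatus[] matches = fs.globStatus(new Path("/data/logs/part-*"));

        // globStatus may return null (no glob, path missing) or an empty array.
        if (matches == null || matches.length == 0) {
            System.out.println("No files matched the pattern.");
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}

The examples below typically obtain the FileSystem from the URI of each command-line argument (FileSystem.get(new java.net.URI(args[i]), getConf())), so the pattern is expanded on the file system it actually refers to.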

Usage

From source file: org.archive.hadoop.jobs.WATGenerator.java

License: Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WAT Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating WATs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(WATGeneratorMapper.class);
    job.setJarByClass(WATGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }
    if (!atLeastOneInput) {
        LOG.info("No input files to WATGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}

From source file: org.archive.jbs.Parse.java

License: Apache License

/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    FileSystem fs = FileSystem.get(getConf());

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("jbs.Parse " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // Use the Parse-specific output format.
    job.setOutputFormat(PerMapOutputFormat.class);

    // Use our ParseMapper, with output keys and values of type
    // Text.
    job.setMapperClass(ParseMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Configure the input and output paths, from the command-line.
    Path outputDir = new Path(args[0]);
    FileOutputFormat.setOutputPath(job, outputDir);

    boolean atLeastOneInput = false;
    for (int i = 1; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());

        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            Path outputPath = new Path(outputDir, inputPath.getName());
            if (fs.exists(outputPath)) {
                LOG.debug("Output path already exists: " + outputPath);
            } else {
                atLeastOneInput = true;
                LOG.info("Add input path: " + inputPath);
                FileInputFormat.addInputPath(job, inputPath);
            }
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to parse.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);

    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }

    return 0;
}

From source file: org.archive.jbs.tools.Dump.java

License: Apache License

public int run(String[] args) throws Exception {
    String usage = "Usage: Dump [-k|-v] <mapfile|sequencefile>...";

    if (args.length < 1) {
        System.err.println(usage);
        return 1;
    }

    int i = 0;
    int mode = 0;
    if (args[0].equals("-k")) {
        mode = 1;
        i++;
    } else if (args[0].equals("-v")) {
        mode = 2;
        i++;
    }

    for (; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());

        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();

            dump(inputfs, inputPath, mode);
        }
    }

    return 0;
}

From source file: org.bgi.flexlab.gaea.util.SortUilts.java

License: Open Source License

public static void merge(MultipleVCFHeader mVcfHeader, VCFSortOptions options, Configuration conf) {
    try {
        System.out.println("vcf-MultiSampleSort :: Merging output...");

        // First, place the VCF or BCF header.
        final Path outpath = new Path(options.getOutputPath());
        final Path wrkPath = new Path(options.getWorkPath());
        final FileSystem srcFS = wrkPath.getFileSystem(conf);
        final FileSystem dstFS = outpath.getFileSystem(conf);

        Map<String, OutputStream> outs = new HashMap<String, OutputStream>();
        Map<Integer, String> multiOutputs = options.getMultiOutputs();
        for (String result : multiOutputs.values()) {
            Path sPath = new Path(options.getOutputPath() + "/" + result + ".vcf");
            OutputStream os = dstFS.create(sPath);
            outs.put(result, os);
        }

        final VariantContextWriterBuilder builder = new VariantContextWriterBuilder();
        VariantContextWriter writer;
        Map<Integer, SingleVCFHeader> id2VcfHeader = mVcfHeader.getID2SingleVcfHeader();
        for (int id : multiOutputs.keySet()) {
            VCFHeader newHeader = id2VcfHeader.get(id).getHeader();
            writer = builder.setOutputStream(new FilterOutputStream(outs.get(multiOutputs.get(id))) {
                @Override
                public void close() throws IOException {
                    this.out.flush();
                }
            }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build();

            writer.writeHeader(newHeader);
            writer.close();

            final FileStatus[] parts = srcFS.globStatus(
                    new Path(options.getWorkPath(), multiOutputs.get(id) + "-*-[0-9][0-9][0-9][0-9][0-9]*"));

            int i = 0;

            for (final FileStatus part : parts) {
                System.out.printf("sort:: Merging part %d ( size %d)...\n", i++, part.getLen());
                System.out.flush();

                final FSDataInputStream ins = srcFS.open(part.getPath());
                IOUtils.copyBytes(ins, outs.get(multiOutputs.get(id)), conf, false);
                ins.close();
            }
            for (final FileStatus part : parts)
                srcFS.delete(part.getPath(), false);

            outs.get(multiOutputs.get(id)).close();

        }
    } catch (IOException e) {
        System.err.printf("vcf-MultiSampleSort :: Output merging failed: %s\n", e);
    }
}

From source file: org.commoncrawl.mapred.ec2.parser.EC2ParserTask.java

License: Open Source License

private static List<Path> scanForCompletedSegments(FileSystem fs) throws IOException {
    ImmutableList.Builder<Path> pathListBuilder = new ImmutableList.Builder<Path>();

    for (FileStatus fileStatus : fs.globStatus(new Path(VALID_SEGMENTS_PATH + "[0-9]*"))) {
        pathListBuilder.addAll(scanSegmentManifestFile(fs, fileStatus.getPath()));
    }
    return pathListBuilder.build();
}

From source file: org.commoncrawl.mapred.ec2.parser.EC2ParserTask.java

License: Open Source License

/** build a list of parse candidates sorted by timestamp 
 *
 * @param fs
 * @param logFilePath
 * @return a Set of Candidates
 * @throws IOException
 */
private static TreeSet<Path> buildCandidateList(FileSystem fs, Path logFilePath) throws IOException {

    TreeSet<Path> candidateList = new TreeSet<Path>(new Comparator<Path>() {

        @Override
        public int compare(Path p1, Path p2) {
            String n1 = p1.getName();
            String n2 = p2.getName();
            Matcher m1 = CRAWL_LOG_REG_EXP.matcher(n1);
            Matcher m2 = CRAWL_LOG_REG_EXP.matcher(n2);
            m1.matches();
            m2.matches();
            Long v1 = Long.parseLong(m1.group(1));
            Long v2 = Long.parseLong(m2.group(1));

            return v1.compareTo(v2);
        }

    });

    LOG.info("Scanning for Log Files at:" + logFilePath);
    FileStatus candidateItems[] = fs.globStatus(new Path(logFilePath, "CrawlLog*"));
    for (FileStatus candidate : candidateItems) {
        candidateList.add(candidate.getPath());
    }

    return candidateList;
}

From source file: org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergingReducer.java

License: Open Source License

/** 
 * scan the merge db path and find the latest crawl database timestamp 
 *
 * @param fs
 * @param conf
 * @return
 * @throws IOException
 */
static long findLatestMergeDBTimestamp(FileSystem fs, Configuration conf) throws IOException {
    long timestampOut = -1L;

    FileStatus files[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_DB_PATH, "[0-9]*"));

    for (FileStatus candidate : files) {
        Path successPath = new Path(candidate.getPath(), "_SUCCESS");
        if (fs.exists(successPath)) {
            long timestamp = Long.parseLong(candidate.getPath().getName());
            timestampOut = Math.max(timestamp, timestampOut);
        }
    }
    return timestampOut;
}

From source file: org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBMergingReducer.java

License: Open Source License

/** 
 * iterate the intermediate link graph data and extract unmerged set ... 
 *
 * @param fs
 * @param conf
 * @param latestMergeDBTimestamp
 * @return
 * @throws IOException
 */
static List<Path> filterMergeCandidtes(FileSystem fs, Configuration conf, long latestMergeDBTimestamp)
        throws IOException {
    ArrayList<Path> list = new ArrayList<Path>();
    FileStatus candidates[] = fs
            .globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH, "[0-9]*"));

    for (FileStatus candidate : candidates) {
        LOG.info("Found Merge Candidate:" + candidate.getPath());
        long candidateTimestamp = Long.parseLong(candidate.getPath().getName());
        if (candidateTimestamp > latestMergeDBTimestamp) {
            Path successPath = new Path(candidate.getPath(), "_SUCCESS");
            if (fs.exists(successPath)) {
                list.add(candidate.getPath());
            } else {
                LOG.info("Rejected Merge Candidate:" + candidate.getPath());
            }
        }
    }
    return list;
}

From source file: org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java

License: Open Source License

public static void processSegmentEC2(FileSystem s3fs, Configuration conf, long segmentId) throws IOException {
    Path outputPath = new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH + Long.toString(segmentId));
    LOG.info("Starting Intermedaite Merge of Segment:" + segmentId + " Output path is:" + outputPath);

    if (s3fs.exists(outputPath)) {
        LOG.warn("Output Path Already Exists for Segment:" + segmentId + ".Deleting!");
        s3fs.delete(outputPath, true);
    }

    // ok collect merge files
    ArrayList<Path> pathList = new ArrayList<Path>();
    for (FileStatus metadataFile : s3fs.globStatus(new Path(SEGMENTS_PATH, segmentId + "/metadata-*"))) {
        pathList.add(metadataFile.getPath().makeQualified(s3fs));
    }
    LOG.info("Input Paths for Segment:" + segmentId + " are:" + pathList);

    JobConf jobConf = new JobBuilder("Intermediate merge for:" + segmentId, conf).inputs(pathList)
            .inputFormat(SequenceFileInputFormat.class).keyValue(TextBytes.class, TextBytes.class)
            .mapper(LinkGraphDataEmitter.class).maxMapAttempts(7).maxReduceAttempts(7).maxMapTaskFailures(1000)
            .reuseJVM(1000).reducer(LinkGraphDataEmitter.class, false).partition(CrawlDBKeyPartitioner.class)
            .sort(LinkKeyComparator.class).numReducers(1000).speculativeExecution(true).output(outputPath)
            .outputFormat(SequenceFileOutputFormat.class).compressMapOutput(true)
            .compressor(CompressionType.BLOCK, SnappyCodec.class).build();

    JobClient.runJob(jobConf);
}

From source file: org.commoncrawl.mapred.ec2.postprocess.crawldb.LinkGraphDataEmitterJob.java

License: Open Source License

private static SortedSet<Long> scanForValidSegments(FileSystem fs) throws IOException {
    SortedSet<Long> completeSegmentIds = Sets.newTreeSet();

    for (FileStatus fileStatus : fs.globStatus(new Path(S3N_BUCKET_PREFIX + VALID_SEGMENTS_PATH + "[0-9]*"))) {
        completeSegmentIds.add(Long.parseLong(fileStatus.getPath().getName()));
    }
    return completeSegmentIds;
}