Example usage for org.apache.hadoop.fs FileSystem listStatus

List of usage examples for org.apache.hadoop.fs FileSystem listStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem listStatus.

Prototype

public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException 

Source Link

Document

Filter files/directories in the given list of paths using default path filter.

Usage

From source file:com.iflytek.spider.crawl.GeneratorSmart.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 * <p>
 * The crawl db lock is taken before any job is started and is always released,
 * together with the temporary output directories, when this method returns or
 * throws.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of partitions to generate; {@code -1} means "use the job's
 *          configured number of reduce tasks". Forced to 1 when the job
 *          tracker is {@code local}.
 * @param curTime
 *          Current time in milliseconds
 * @param force
 *          Forwarded to {@code LockUtil.createLockFile} when taking the crawl
 *          db lock (presumably allows an existing lock to be taken over;
 *          confirm against LockUtil)
 *
 * @return Paths of the generated segments, or null if the selection job
 *         failed, partitioning a segment failed, or no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs, or when the crawl db update job fails
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    // Only assigned when the optional crawl db update job runs.
    Path tempDir2 = null;
    try {
        LOG.info("Generator: Selecting best-scoring urls due for fetch.");
        LOG.info("Generator: starting");

        Job job = AvroJob.getAvroJob(getConf());
        if (numLists == -1) { // for politeness make
            numLists = job.getNumReduceTasks(); // a partition per fetch task
        }
        if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) {
            // override
            LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        LOG.info("Generator: with " + numLists + " partition.");
        job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormatClass(AvroPairInputFormat.class);

        job.setMapperClass(SelectorMapper.class);
        job.setReducerClass(SelectorReducer.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormatClass(GeneratorOutputFormat.class);
        job.setOutputKeyClass(Float.class);
        job.setOutputValueClass(SelectorEntry.class);

        // waitForCompletion reports an unsuccessful job through its return
        // value, so both the exception and the "false" outcome count as failure.
        boolean selectionSucceeded;
        try {
            selectionSucceeded = job.waitForCompletion(true);
        } catch (IOException e) {
            LOG.warn("Generator: selection job failed, exiting ...", e);
            return null;
        }
        if (!selectionSucceeded) {
            LOG.warn("Generator: selection job failed, exiting ...");
            return null;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);

                fs.createNewFile(new Path(newSeg, "generatored"));
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            // Keep the original "give up and return null" contract, but do not
            // lose the cause.
            LOG.warn("Generator: exception while partitioning segments, exiting ...", e);
            return null;
        }

        if (generatedSegments.isEmpty()) {
            LOG.warn("Generator: 0 records selected for fetching, exiting ...");
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = AvroJob.getAvroJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormatClass(AvroPairInputFormat.class);
            job.setMapperClass(CrawlDbUpdateMapper.class);
            job.setOutputFormatClass(AvroMapOutputFormat.class);
            job.setOutputKeyClass(String.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);

            // Never install the output of a job that did not succeed.
            if (!job.waitForCompletion(true)) {
                throw new IOException("Generator: updatedb job failed for " + dbDir);
            }
            CrawlDb.install(job, dbDir);
        }

        LOG.info("Generator: done.");
        return generatedSegments.toArray(new Path[generatedSegments.size()]);
    } finally {
        // Single cleanup point: runs on success, on every early return and on
        // every exception, so the crawl db is never left locked.
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        if (tempDir2 != null) {
            fs.delete(tempDir2, true);
        }
    }
}

From source file:com.iflytek.spider.parse.ParseSegment.java

License:Apache License

/**
 * Re-parses every entry found directly under the directory given as the first
 * argument. For each entry, any existing {@code crawl_parse} and
 * {@code parse_data} sub-directories are deleted before {@code parse} is
 * invoked on it.
 *
 * @param args command line arguments; {@code args[0]} is the directory to list
 * @return always 0 when the loop completes
 * @throws Exception whatever the file system calls or {@code parse} throw
 */
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        System.err.println("Usage: ParseSegment segments");
        System.exit(-1);
    }

    final FileSystem fs = FileSystem.get(getConf());
    // Outputs of an earlier parse that must be cleared, in deletion order.
    final String[] previousOutputs = { "crawl_parse", "parse_data" };

    for (FileStatus entry : fs.listStatus(new Path(args[0]))) {
        final Path segment = entry.getPath();
        for (String outputName : previousOutputs) {
            final Path previous = new Path(segment, outputName);
            if (fs.exists(previous)) {
                fs.delete(previous, true);
            }
        }
        parse(segment);
    }
    return 0;
}

From source file:com.inmobi.conduit.AbstractService.java

License:Apache License

/**
 * Returns the child of {@code Dir} whose name is greatest in
 * {@link String#compareTo(String)} order.
 *
 * @param fs  file system used to list the directory
 * @param Dir directory to list
 * @return path of the child with the greatest name, or {@code null} when the
 *         directory does not exist or the listing is null or empty
 * @throws Exception any listing failure other than {@link FileNotFoundException}
 */
private Path getLatestDir(FileSystem fs, Path Dir) throws Exception {
    final FileStatus[] entries;
    try {
        entries = fs.listStatus(Dir);
    } catch (FileNotFoundException missing) {
        // A missing directory is treated the same as an empty one.
        return null;
    }
    if (entries == null || entries.length == 0) {
        return null;
    }

    FileStatus newest = entries[0];
    for (int i = 1; i < entries.length; i++) {
        final String candidateName = entries[i].getPath().getName();
        if (candidateName.compareTo(newest.getPath().getName()) > 0) {
            newest = entries[i];
        }
    }
    return newest.getPath();
}

From source file:com.inmobi.conduit.distcp.tools.mapred.lib.TestDynamicInputFormat.java

License:Apache License

/**
 * Creates (overwriting) a file at {@code path} on the test cluster's file
 * system and records the path reported by {@code listStatus} for it in
 * {@code expectedFilePaths}.
 *
 * @param path location of the file to create
 * @throws Exception if creating or listing the file fails
 */
private static void createFile(String path) throws Exception {
    FileSystem fileSystem = null;
    DataOutputStream outputStream = null;
    try {
        fileSystem = cluster.getFileSystem();
        final Path target = new Path(path);
        outputStream = fileSystem.create(target, true, 0);

        // Store the path as the file system reports it, not the raw argument.
        final FileStatus[] listed = fileSystem.listStatus(target);
        expectedFilePaths.add(listed[0].getPath().toString());
    } finally {
        IOUtils.cleanup(null, fileSystem, outputStream);
    }
}

From source file:com.inmobi.conduit.distcp.tools.mapred.TestCopyCommitter.java

License:Apache License

/**
 * Commits a job with sync-folder and delete-missing enabled against a flat
 * source and a flat target whose file names only partially overlap, then
 * checks that the target is in sync with the source and holds 4 entries.
 * The commit is run twice to check that a repeated commit gives the same result.
 */
@Test
public void testDeleteMissingFlatInterleavedFiles() {
    final TaskAttemptContext attemptContext = getTaskAttemptContext(config);
    final JobContext jobContext = Mockito.mock(JobContext.class);
    Mockito.when(jobContext.getConfiguration()).thenReturn(config);
    Mockito.when(jobContext.getJobID()).thenReturn(new JobID());
    final Configuration conf = jobContext.getConfiguration();

    // Names present on each side; "/4", "/5", "/7" and "/9" are shared.
    final String[] sourceFiles = { "/1", "/3", "/4", "/5", "/7", "/8", "/9" };
    final String[] targetFiles = { "/2", "/4", "/5", "/7", "/9", "/A" };

    FileSystem fs = null;
    try {
        final OutputCommitter committer = new CopyCommitter(null, attemptContext);
        fs = FileSystem.get(conf);
        final String sourceBase = "/tmp1/" + String.valueOf(rand.nextLong());
        final String targetBase = "/tmp1/" + String.valueOf(rand.nextLong());

        for (String name : sourceFiles) {
            TestDistCpUtils.createFile(fs, sourceBase + name);
        }
        for (String name : targetFiles) {
            TestDistCpUtils.createFile(fs, targetBase + name);
        }

        final DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)),
                new Path("/out"));
        options.setSyncFolder(true);
        options.setDeleteMissing(true);
        options.appendToConf(conf);

        final CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
        final Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
        listing.buildListing(listingFile, options);

        conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
        conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);

        // First pass is the real commit; second pass tests for idempotent commit.
        for (int pass = 0; pass < 2; pass++) {
            committer.commitJob(jobContext);
            if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
                Assert.fail("Source and target folders are not in sync");
            }
            Assert.assertEquals(fs.listStatus(new Path(targetBase)).length, 4);
        }
    } catch (IOException e) {
        LOG.error("Exception encountered while testing for delete missing", e);
        Assert.fail("Delete missing failure");
    } finally {
        TestDistCpUtils.delete(fs, "/tmp1");
    }
}

From source file:com.inmobi.conduit.distcp.tools.mapred.TestCopyCommitter.java

License:Apache License

/**
 * Walks the tree rooted at {@code targetBase} and asserts that every
 * directory found below it carries {@code sourcePerm}. The root itself and
 * plain files are not checked.
 *
 * @param fs         file system to walk
 * @param targetBase root of the tree to inspect
 * @param sourcePerm permission each sub-directory is expected to have
 * @return always {@code true}; a mismatch surfaces as an assertion failure
 * @throws IOException if a file system call fails
 */
private boolean checkDirectoryPermissions(FileSystem fs, String targetBase, FsPermission sourcePerm)
        throws IOException {
    final Stack<Path> pending = new Stack<Path>();
    pending.push(new Path(targetBase));

    while (!pending.isEmpty()) {
        final Path current = pending.pop();
        if (fs.exists(current)) {
            final FileStatus[] children = fs.listStatus(current);
            if (children != null) {
                for (int i = 0; i < children.length; i++) {
                    final FileStatus child = children[i];
                    if (!child.isDir()) {
                        continue;
                    }
                    // Queue the directory for its own traversal, then verify it.
                    pending.push(child.getPath());
                    Assert.assertEquals(child.getPermission(), sourcePerm);
                }
            }
        }
    }
    return true;
}

From source file:com.inmobi.conduit.distcp.tools.mapred.TestUniformSizeInputFormat.java

License:Apache License

/**
 * Builds a copy listing for {@code nMaps} maps, asks
 * {@code UniformSizeInputFormat} for its splits and verifies them:
 * {@code checkSplits} is run on the listing, consecutive splits must differ
 * in size by less than 10% of the per-map size (the last split is exempt),
 * and the summed size of all splits must equal {@code totalFileSize}.
 *
 * @param nMaps number of maps to request
 * @throws Exception if building the listing or reading a split fails
 */
public void testGetSplits(int nMaps) throws Exception {
    DistCpOptions options = getOptions(nMaps);
    Configuration configuration = new Configuration();
    configuration.set("mapred.map.tasks", String.valueOf(options.getMaxMaps()));
    Path listFile = new Path(cluster.getFileSystem().getUri().toString() + "/tmp/testGetSplits_1/fileList.seq");
    CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing(listFile, options);

    JobContext jobContext = Mockito.mock(JobContext.class);
    Mockito.when(jobContext.getConfiguration()).thenReturn(configuration);
    Mockito.when(jobContext.getJobID()).thenReturn(new JobID());
    UniformSizeInputFormat uniformSizeInputFormat = new UniformSizeInputFormat();
    List<InputSplit> splits = uniformSizeInputFormat.getSplits(jobContext);

    //Removing the legacy check - Refer HADOOP-9230
    int sizePerMap = totalFileSize / nMaps;

    checkSplits(listFile, splits);

    int doubleCheckedTotalSize = 0;
    int previousSplitSize = -1;
    for (int i = 0; i < splits.size(); ++i) {
        InputSplit split = splits.get(i);
        int currentSplitSize = 0;
        TaskAttemptID taskId = new TaskAttemptID("", 0, true, 0, 0);
        final TaskAttemptContext taskAttemptContext = Mockito.mock(TaskAttemptContext.class);
        Mockito.when(taskAttemptContext.getConfiguration()).thenReturn(configuration);
        Mockito.when(taskAttemptContext.getTaskAttemptID()).thenReturn(taskId);
        RecordReader<Text, FileStatus> recordReader = uniformSizeInputFormat.createRecordReader(split,
                taskAttemptContext);
        try {
            recordReader.initialize(split, taskAttemptContext);
            while (recordReader.nextKeyValue()) {
                Path sourcePath = recordReader.getCurrentValue().getPath();
                FileSystem fs = sourcePath.getFileSystem(configuration);
                FileStatus fileStatus[] = fs.listStatus(sourcePath);
                Assert.assertEquals(fileStatus.length, 1);
                // getLen() is a long; the compound assignment narrows it to int.
                currentSplitSize += fileStatus[0].getLen();
            }
        } finally {
            // Release the reader even when an assertion above fails, so one
            // failing split does not leak the underlying listing-file handle.
            recordReader.close();
        }
        Assert.assertTrue(previousSplitSize == -1
                || Math.abs(currentSplitSize - previousSplitSize) < 0.1 * sizePerMap || i == splits.size() - 1);

        doubleCheckedTotalSize += currentSplitSize;
    }

    Assert.assertEquals(totalFileSize, doubleCheckedTotalSize);
}

From source file:com.inmobi.conduit.distcp.tools.SimpleCopyListing.java

License:Apache License

/**
 * {@inheritDoc}
 * <p>
 * For every configured source path, each entry listed directly under it is
 * written to the listing file and non-empty directories are traversed. When
 * the listing of a source path is null or empty, the source path's own status
 * is written instead.
 */
@Override
public void doBuildListing(Path pathToListingFile, DistCpOptions options) throws IOException {

    SequenceFile.Writer fileListWriter = null;

    try {
        fileListWriter = getWriter(pathToListingFile);

        for (Path sourcePath : options.getSourcePaths()) {
            // The file system is resolved from the path as configured, before qualification.
            final FileSystem sourceFS = sourcePath.getFileSystem(getConf());
            final Path qualified = makeQualified(sourcePath);

            final FileStatus rootStatus = sourceFS.getFileStatus(qualified);
            final Path sourcePathRoot = computeSourceRootPath(rootStatus, options);
            final boolean localFile = rootStatus.getClass() != FileStatus.class;

            final FileStatus[] children = sourceFS.listStatus(qualified);
            if (children == null || children.length == 0) {
                writeToFileListing(fileListWriter, rootStatus, sourcePathRoot, localFile, options);
                continue;
            }

            for (FileStatus child : children) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Recording source-path: " + child.getPath() + " for copy.");
                }
                writeToFileListing(fileListWriter, child, sourcePathRoot, localFile, options);

                if (!isDirectoryAndNotEmpty(sourceFS, child)) {
                    continue;
                }
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Traversing non-empty source dir: " + child.getPath());
                }
                traverseNonEmptyDirectory(fileListWriter, child, sourcePathRoot, localFile, options);
            }
        }
    } finally {
        if (fileListWriter != null) {
            try {
                fileListWriter.close();
            } catch (IOException exception) {
                LOG.error("Could not close output-steam to the file-list: ", exception);
                throw exception;
            }
        }
    }
}

From source file:com.inmobi.conduit.distcp.tools.SimpleCopyListing.java

License:Apache License

/**
 * Lists the entries found under {@code parent}'s path.
 *
 * @param fileSystem file system to query
 * @param parent     status whose path is listed
 * @return whatever {@code fileSystem.listStatus} returns for that path
 * @throws IOException if the listing fails
 */
private static FileStatus[] getChildren(FileSystem fileSystem, FileStatus parent) throws IOException {
    final Path parentPath = parent.getPath();
    return fileSystem.listStatus(parentPath);
}

From source file:com.inmobi.conduit.distcp.tools.util.TestDistCpUtils.java

License:Apache License

/**
 * Walks the tree rooted at {@code targetBase} and asserts that every entry
 * found there also exists at the same relative location under
 * {@code sourceBase}. Entries that exist only in the source are not detected.
 *
 * @param fs         file system holding both trees
 * @param targetBase root of the tree that is walked
 * @param sourceBase root of the tree each entry is looked up in
 * @return always {@code true}; a missing entry surfaces as an assertion failure
 * @throws IOException if a file system call fails
 */
public static boolean checkIfFoldersAreInSync(FileSystem fs, String targetBase, String sourceBase)
        throws IOException {
    final Path targetRoot = new Path(targetBase);

    final Stack<Path> pending = new Stack<Path>();
    pending.push(targetRoot);

    while (!pending.isEmpty()) {
        final Path current = pending.pop();
        if (fs.exists(current)) {
            final FileStatus[] children = fs.listStatus(current);
            if (children != null) {
                for (int i = 0; i < children.length; i++) {
                    final FileStatus child = children[i];
                    if (child.isDir()) {
                        pending.push(child.getPath());
                    }
                    // Map the target entry onto the matching location in the source tree.
                    final String relative = DistCpUtils.getRelativePath(targetRoot, child.getPath());
                    final Path counterpart = new Path(sourceBase + "/" + relative);
                    Assert.assertTrue(fs.exists(counterpart));
                }
            }
        }
    }
    return true;
}