Example usage for org.apache.hadoop.fs FileSystem listStatus

List of usage examples for org.apache.hadoop.fs FileSystem listStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem listStatus.

Prototype

public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException 

Document

Filter files/directories in the given list of paths using user-supplied path filter.
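Before the collected examples, here is a minimal, self-contained sketch of the prototype above: calling listStatus on an array of paths with a user-supplied PathFilter. The directory names and the ".avro" suffix are illustrative assumptions, not taken from any of the projects quoted below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical input directories; replace with real paths.
        Path[] inputs = new Path[] { new Path("/data/in1"), new Path("/data/in2") };

        // Keep only files whose names end with ".avro" (illustrative filter).
        FileStatus[] statuses = fs.listStatus(inputs, new PathFilter() {
            public boolean accept(Path path) {
                return path.getName().endsWith(".avro");
            }
        });

        for (FileStatus status : statuses) {
            System.out.println(status.getPath() + "\t" + status.getLen());
        }
    }
}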

Usage

From source file: com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java

License: Open Source License

private static void deletePartFiles(FileSystem fs, Path path) throws FileNotFoundException, IOException {
    PathFilter filter = new PathFilter() {
        public boolean accept(Path file) {
            return file.getName().startsWith("part-");
        }
    };
    FileStatus[] list = fs.listStatus(path, filter);
    for (FileStatus stat : list) {
        fs.delete(stat.getPath(), false);
    }
}

From source file: com.ibm.bi.dml.runtime.transform.DataTransform.java

License: Open Source License

/**
 * Helper function to fetch and sort the list of part files under the given
 * input directory.
 *
 * @param input the input file or directory path
 * @param fs the file system to read from
 * @return sorted list of part-file paths under the input directory
 * @throws FileNotFoundException
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private static ArrayList<Path> collectInputFiles(String input, FileSystem fs)
        throws FileNotFoundException, IOException {
    Path path = new Path(input);
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    return files;
}

From source file: com.ibm.jaql.io.hadoop.FileOutputConfigurator.java

License: Apache License

public void setSequential(JobConf conf) throws Exception {
    registerSerializers(conf);

    // For an expression, the location is the final file name
    Path outPath = new Path(location);
    FileSystem fs = outPath.getFileSystem(conf);
    outPath = outPath.makeQualified(fs);
    if (fs.exists(outPath)) {
        // TODO: Jaql currently has overwrite semantics; add flag to control this
        if (fs.isFile(outPath)) {
            fs.delete(outPath, false);
        } else {
            // Look for a map-reduce output directory
            FileStatus[] nonMR = fs.listStatus(outPath, new PathFilter() {
                boolean onlyOne = true;

                public boolean accept(Path path) {
                    String name = path.getName();
                    if (name.matches("([.][.]?)|([.]part-[0-9]+.crc)|(part-[0-9]+)")) {
                        return false;
                    }
                    if (onlyOne) {
                        onlyOne = false;
                        return true;
                    }
                    return false;
                }
            });
            if (nonMR.length > 0) {
                throw new IOException(
                        "directory exists and is not a map-reduce output directory: " + nonMR[0].getPath());
            }
            fs.delete(outPath, true);
        }
    }

    // In sequential mode, we will write directly to the output file
    // and bypass the _temporary directory and rename of the standard 
    // FileOutputCommitter by using our own DirectFileOutputCommitter.
    FileOutputFormat.setOutputPath(conf, outPath.getParent());
    conf.setClass("mapred.output.committer.class", DirectFileOutputCommiter.class, OutputCommitter.class);
}

From source file: com.iflytek.spider.crawl.CrawlDb.java

License: Apache License

public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println(
                "Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-noAdditions]");
        System.err.println("\tcrawldb\tCrawlDb to update");
        System.err.println("\t-dir segments\tparent directory containing all segments to update from");
        System.err.println("\tseg1 seg2 ...\tlist of segment names to update from");
        System.err.println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
        System.err.println(
                "\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
        return -1;
    }
    boolean force = false;
    final FileSystem fs = FileSystem.get(getConf());
    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
    HashSet<Path> dirs = new HashSet<Path>();
    for (int i = 1; i < args.length; i++) {
        if (args[i].equals("-force")) {
            force = true;
        } else if (args[i].equals("-noAdditions")) {
            additionsAllowed = false;
        } else if (args[i].equals("-dir")) {
            FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
            dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
        } else {
            dirs.add(new Path(args[i]));
        }
    }
    try {
        update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), additionsAllowed, force);
        return 0;
    } catch (Exception e) {
        LOG.fatal("CrawlDb update: " + StringUtils.stringifyException(e));
        return -1;
    }
}

From source file: com.inmobi.conduit.AbstractService.java

License: Apache License

private List<Path> listPartFiles(Path path, FileSystem fs) {
    List<Path> matches = new LinkedList<Path>();
    try {
        FileStatus[] statuses = fs.listStatus(path, new PathFilter() {
            public boolean accept(Path path) {
                return path.toString().contains("part");
            }
        });
        for (FileStatus status : statuses) {
            matches.add(status.getPath());
        }
    } catch (IOException e) {
        LOG.error(e.getMessage(), e);
    }
    return matches;
}

From source file: com.inmobi.conduit.local.LocalStreamServiceTest.java

License: Apache License

private void testClusterName(String configName, String currentClusterName) throws Exception {
    ConduitConfigParser parser = new ConduitConfigParser(configName);
    ConduitConfig config = parser.getConfig();
    Set<String> streamsToProcess = new HashSet<String>();
    streamsToProcess.addAll(config.getSourceStreams().keySet());
    Set<String> clustersToProcess = new HashSet<String>();
    Set<TestLocalStreamService> services = new HashSet<TestLocalStreamService>();
    Cluster currentCluster = null;
    for (SourceStream sStream : config.getSourceStreams().values()) {
        for (String cluster : sStream.getSourceClusters()) {
            clustersToProcess.add(cluster);
        }
    }
    if (currentClusterName != null) {
        currentCluster = config.getClusters().get(currentClusterName);
    }
    for (String clusterName : clustersToProcess) {
        Cluster cluster = config.getClusters().get(clusterName);
        cluster.getHadoopConf().set("mapred.job.tracker", super.CreateJobConf().get("mapred.job.tracker"));
        TestLocalStreamService service = new TestLocalStreamService(config, cluster, currentCluster,
                new NullCheckPointProvider(), streamsToProcess);
        services.add(service);
    }

    for (TestLocalStreamService service : services) {
        FileSystem fs = service.getFileSystem();
        service.preExecute();
        if (currentClusterName != null)
            Assert.assertEquals(service.getCurrentCluster().getName(), currentClusterName);
        // creating a job with empty input path
        Path tmpJobInputPath = new Path("/tmp/job/input/path");
        Map<FileStatus, String> fileListing = new TreeMap<FileStatus, String>();
        Set<FileStatus> trashSet = new HashSet<FileStatus>();
        // checkpointKey, CheckPointPath
        Table<String, String, String> checkpointPaths = HashBasedTable.create();
        service.createMRInput(tmpJobInputPath, fileListing, trashSet, checkpointPaths);
        Job testJobConf = service.createJob(tmpJobInputPath, 1000);
        testJobConf.waitForCompletion(true);

        int numberOfCountersPerFile = 0;
        long sumOfCounterValues = 0;
        Path outputCounterPath = new Path(new Path(service.getCluster().getTmpPath(), service.getName()),
                "counters");
        FileStatus[] statuses = fs.listStatus(outputCounterPath, new PathFilter() {
            public boolean accept(Path path) {
                return path.toString().contains("part");
            }
        });
        for (FileStatus fileSt : statuses) {
            Scanner scanner = new Scanner(fs.open(fileSt.getPath()));
            while (scanner.hasNext()) {
                String counterNameValue = null;
                try {
                    counterNameValue = scanner.next();
                    String tmp[] = counterNameValue.split(ConduitConstants.AUDIT_COUNTER_NAME_DELIMITER);
                    Assert.assertEquals(4, tmp.length);
                    Long numOfMsgs = Long.parseLong(tmp[3]);
                    numberOfCountersPerFile++;
                    sumOfCounterValues += numOfMsgs;
                } catch (Exception e) {
                    LOG.error("Counters file has malformed line with counter name =" + counterNameValue
                            + "..skipping the line", e);
                }
            }
        }
        // Should have 2 counters for each file
        Assert.assertEquals(NUMBER_OF_FILES * 2, numberOfCountersPerFile);
        // sum of all counter values should be equal to total number of messages
        Assert.assertEquals(NUMBER_OF_FILES * 3, sumOfCounterValues);

        Assert.assertEquals(testJobConf.getConfiguration().get(FS_DEFAULT_NAME_KEY),
                service.getCurrentCluster().getHadoopConf().get(FS_DEFAULT_NAME_KEY));
        Assert.assertEquals(testJobConf.getConfiguration().get(SRC_FS_DEFAULT_NAME_KEY),
                service.getCluster().getHadoopConf().get(FS_DEFAULT_NAME_KEY));
        if (currentCluster == null)
            Assert.assertEquals(testJobConf.getConfiguration().get(FS_DEFAULT_NAME_KEY),
                    testJobConf.getConfiguration().get(SRC_FS_DEFAULT_NAME_KEY));
        service.getFileSystem().delete(new Path(service.getCluster().getRootDir()), true);
    }

}

From source file: com.jkoolcloud.tnt4j.streams.inputs.HdfsFileLineStream.java

License: Apache License

/**
 * Searches for files matching name pattern. Name pattern also may contain path of directory, where file search
 * should be performed, e.g., C:/Tomcat/logs/localhost_access_log.*.txt. If no path is defined (just file name
 * pattern) then files are searched in {@code System.getProperty("user.dir")}. The returned array is ordered by file
 * modification timestamp in descending order.
 *
 * @param path
 *            path of file
 * @param fs
 *            file system
 *
 * @return array of found files paths.
 * @throws IOException
 *             if files can't be listed by file system.
 *
 * @see FileSystem#listStatus(Path, PathFilter)
 * @see FilenameUtils#wildcardMatch(String, String, IOCase)
 */
public static Path[] searchFiles(Path path, FileSystem fs) throws IOException {
    FileStatus[] dir = fs.listStatus(path.getParent(), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return FilenameUtils.wildcardMatch(name, "*", IOCase.INSENSITIVE); // NON-NLS
        }
    });

    Path[] activityFiles = new Path[dir == null ? 0 : dir.length];
    if (dir != null) {
        Arrays.sort(dir, new Comparator<FileStatus>() {
            @Override
            public int compare(FileStatus o1, FileStatus o2) {
                return Long.valueOf(o1.getModificationTime()).compareTo(o2.getModificationTime()) * (-1);
            }
        });

        for (int i = 0; i < dir.length; i++) {
            activityFiles[i] = dir[i].getPath();
        }
    }

    return activityFiles;
}

From source file: com.jkoolcloud.tnt4j.streams.inputs.HdfsFileLineStreamTest.java

License: Apache License

@Test()
public void test() throws Exception {
    FileSystem fs = mock(FileSystem.class);
    HdfsFileLineStream stream = new HdfsFileLineStream();

    TestFileList files = new TestFileList(false);

    final String fileName = ("file:////" + files.get(0).getParentFile() + File.separator + files.getPrefix() // NON-NLS
            + "*.TST").replace("\\", "/"); // NON-NLS

    Map<String, String> props = new HashMap<>(2);
    props.put(StreamProperties.PROP_FILENAME, fileName);
    props.put(StreamProperties.PROP_RESTORE_STATE, "false"); // NON-NLS

    when(fs.open(any(Path.class))).thenReturn(new FSDataInputStream(new TestInputStreamStub()));
    final FileStatus fileStatusMock = mock(FileStatus.class);
    final FileStatus[] array = new FileStatus[10];
    Arrays.fill(array, fileStatusMock);
    when(fs.listStatus(any(Path.class), any(PathFilter.class))).thenReturn(array);
    when(fileStatusMock.getModificationTime()).thenReturn(1L, 2L, 3L);
    when(fileStatusMock.getPath()).thenReturn(mock(Path.class));
    when(fs.getContentSummary(any(Path.class))).thenReturn(mock(ContentSummary.class));

    Method m = FileSystem.class.getDeclaredMethod("addFileSystemForTesting", URI.class, Configuration.class, // NON-NLS
            FileSystem.class);
    m.setAccessible(true);
    m.invoke(FileSystem.class, URI.create(fileName), new Configuration(), fs);

    StreamThread st = mock(StreamThread.class);
    st.setName("HdfsFileLineStreamTestThreadName"); // NON-NLS
    stream.setOwnerThread(st);

    stream.setProperties(props.entrySet());
    stream.startStream();

    verify(fileStatusMock, atLeastOnce()).getModificationTime();
    verify(fileStatusMock, atLeastOnce()).getPath();
    verify(fs, atLeastOnce()).listStatus(any(Path.class), any(PathFilter.class));

    stream.cleanup();
}

From source file: com.kxen.han.projection.giraph.BspCase.java

License: Apache License

/**
 * Read all part- files in the output and count their lines.
 * This works only for textual output!
 *
 * @param conf Configuration
 * @param outputPath Output path
 * @return Number of output lines
 * @throws IOException
 */
public int getNumResults(Configuration conf, Path outputPath) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    int numResults = 0;
    for (FileStatus status : fs.listStatus(outputPath, PARTS_FILTER)) {
        FSDataInputStream in = null;
        BufferedReader reader = null;
        try {
            in = fs.open(status.getPath());
            reader = new BufferedReader(new InputStreamReader(in, Charsets.UTF_8));
            while (reader.readLine() != null) {
                numResults++;
            }
        } finally {
            Closeables.closeQuietly(in);
            Closeables.closeQuietly(reader);
        }
    }
    return numResults;
}

From source file: com.linkedin.cubert.io.rubix.RubixFile.java

License: Open Source License

public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException,
        ParseException, InstantiationException, IllegalAccessException {
    final int VERBOSE_NUM_ROWS = 4;

    Options options = new Options();

    options.addOption("h", "help", false, "shows this message");
    options.addOption("v", "verbose", false, "print summary and first few rows of each block");
    options.addOption("m", "metadata", false, "show the metadata");
    options.addOption("d", "dump", false,
            "dump the contents of the rubix file. Use -f for specifying format, and -o for specifying output location");
    options.addOption("f", "format", true, "the data format for dumping data (AVRO or TEXT). Default: TEXT");
    options.addOption("e", "extract", true,
            "Extract one rubix block matching the block id. Use -o for specifying output location");
    options.addOption("o", true, "Store the output at the specified location");

    CommandLineParser parser = new BasicParser();

    // parse the command line arguments
    CommandLine line = parser.parse(options, args);

    // show the help message
    if (line.hasOption("h")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(
                "RubixFile <rubix file or dir> [options]\nIf no options are provided, print a summary of the blocks.",
                options);
        return;
    }

    // validate provided options
    if (line.hasOption("d") && line.hasOption("e")) {
        System.err.println("Cannot dump (-d) and extract (-e) at the same time!");
        return;
    }

    // obtain the list of rubix files
    String[] files = line.getArgs();
    if (files == null || files.length == 0) {
        System.err.println("Rubix file not specified");
        return;
    }

    Configuration conf = new JobConf();
    FileSystem fs = FileSystem.get(conf);

    Path path = new Path(files[0]);
    FileStatus[] allFiles;

    FileStatus status = fs.getFileStatus(path);
    if (status.isDir()) {
        allFiles = fs.listStatus(path, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().contains(RubixConstants.RUBIX_EXTENSION);
            }

        });
    } else {
        allFiles = new FileStatus[] { status };
    }

    // walk over all files and extract the trailer section
    List<RubixFile<Tuple, Object>> rfiles = new ArrayList<RubixFile<Tuple, Object>>();

    for (FileStatus s : allFiles) {
        Path p = s.getPath();

        RubixFile<Tuple, Object> rfile = new RubixFile<Tuple, Object>(conf, p);

        // if printing meta data information.. exit after first file (since all files
        // have the same meta data)
        if (line.hasOption("m")) {
            rfile.getKeyData();

            System.out.println(new ObjectMapper().writer().writeValueAsString(rfile.metadataJson));
            break;
        }

        rfiles.add(rfile);
    }

    // dump the data
    if (line.hasOption("d")) {
        String format = line.getOptionValue("f");
        if (format == null)
            format = "TEXT";

        format = format.trim().toUpperCase();

        if (format.equals("AVRO")) {
            // dumpAvro(rfiles, line.getOptionValue("o"));
            throw new UnsupportedOperationException(
                    "Dumping to avro is not currently supporting. Please write a Cubert (map-only) script to store data in avro format");
        } else if (format.equals("TEXT")) {
            if (line.hasOption("o")) {
                System.err.println("Dumping TEXT format data *into a file* is not currently supported");
                return;
            }
            dumpText(rfiles, line.getOptionValue("o"), Integer.MAX_VALUE);
        } else {
            System.err.println("Invalid format [" + format + "] for dumping. Please use AVRO or TEXT");
            return;
        }
    } else if (line.hasOption("e")) // extract one rubix block
    {
        long blockId = Long.parseLong(line.getOptionValue("e"));
        extract(rfiles, blockId, line.getOptionValue("o"));
    } else
    // print summary
    {
        dumpText(rfiles, null, line.hasOption("v") ? VERBOSE_NUM_ROWS : 0);
    }
}