List of usage examples for org.apache.hadoop.fs.FileSystem#isDirectory
@Deprecated public boolean isDirectory(Path f) throws IOException
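This overload is deprecated; the FileSystem javadoc points callers at getFileStatus(Path) instead, since a FileStatus fetched once can answer isDirectory(), isFile(), length and so on without further namenode round trips. Note the behavioural difference: isDirectory(Path) simply returns false when the path does not exist, whereas getFileStatus(Path) throws FileNotFoundException. A minimal, self-contained sketch of both forms (the path is a hypothetical placeholder):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example");        // hypothetical path
        FileSystem fs = path.getFileSystem(conf);

        // Deprecated convenience call: returns false if the path is missing.
        boolean deprecatedForm = fs.isDirectory(path);

        // Preferred form: guard with exists(), then fetch the FileStatus once
        // and query it directly (getFileStatus throws if the path is absent).
        boolean preferredForm = fs.exists(path) && fs.getFileStatus(path).isDirectory();

        System.out.println(deprecatedForm + " / " + preferredForm);
    }
}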
From source file: uk.bl.wa.hadoop.mapreduce.hash.HdfsFileHasher.java
License: Open Source License
@Override
public int run(String[] args) throws Exception {
    // Options:
    String[] otherArgs = new GenericOptionsParser(args).getRemainingArgs();
    // Process remaining args like this:
    Options options = new Options();
    options.addOption("i", true, "a local file containing a list of HDFS paths to process");
    options.addOption("o", true, "output directory");
    options.addOption("m", false, "use MD5 rather than SHA-512");
    options.addOption("r", true, "number of reducers (defaults to 1)");
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = parser.parse(options, otherArgs);
    if (!cmd.hasOption("i") || !cmd.hasOption("o")) {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.setWidth(80);
        helpFormatter.printHelp(CLI_USAGE, CLI_HEADER, options, "");
        System.exit(1);
    }
    String input_file = cmd.getOptionValue("i");
    String output_path = cmd.getOptionValue("o");
    String algorithm = null;
    int numReducers = 1;
    if (cmd.hasOption("m")) {
        algorithm = "MD5";
    }
    if (cmd.hasOption("r")) {
        numReducers = Integer.parseInt(cmd.getOptionValue("r"));
    }

    // When implementing tool, choose algorithm:
    Configuration conf = this.getConf();
    if (algorithm != null)
        conf.set(MessageDigestMapper.CONFIG_DIGEST_ALGORITHM, algorithm);

    // Create job
    Job job = new Job(conf, "HDFS File Checksummer");
    job.setJarByClass(HdfsFileHasher.class);

    // Set up the MapReduce job:
    job.setMapperClass(MessageDigestMapper.class);
    job.setReducerClass(Reducer.class);
    // Number of output files = number of reducers (defaults to 1):
    job.setNumReduceTasks(numReducers);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Input
    log.info("Reading input files...");
    String line = null;
    long line_count = 0;
    BufferedReader br = new BufferedReader(new FileReader(input_file));
    while ((line = br.readLine()) != null) {
        if (StringUtils.isEmpty(line))
            continue;
        line_count++;
        Path path = new Path(line);
        FileSystem fs = path.getFileSystem(conf);
        if (fs.isFile(path)) {
            FileInputFormat.addInputPath(job, path);
        } else if (fs.isDirectory(path)) {
            FileStatus[] listing = fs.listStatus(path);
            int list_count = 0;
            for (FileStatus fstat : listing) {
                list_count++;
                log.info("Checking " + list_count + "/" + listing.length + " " + fstat.getPath());
                if (!fstat.isDir()) {
                    FileInputFormat.addInputPath(job, fstat.getPath());
                }
            }
        }
    }
    br.close();
    log.info("Read " + FileInputFormat.getInputPaths(job).length + " input files from " + line_count
            + " paths.");
    job.setInputFormatClass(UnsplittableInputFileFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, new Path(output_path));
    job.setOutputFormatClass(TextOutputFormat.class);

    // Execute job and return status
    return job.waitForCompletion(true) ? 0 : 1;
}
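One detail worth noting in the directory branch above: fstat.isDir() is itself deprecated, and because the FileStatus objects returned by listStatus() already carry the file-type flag, the replacement FileStatus#isDirectory() answers the question without the extra namenode call that FileSystem#isDirectory(Path) would cost per entry. A minimal sketch of that inner loop rewritten on that assumption (job, fs, path and log are the variables from the example):

for (FileStatus fstat : fs.listStatus(path)) {
    log.info("Checking " + fstat.getPath());
    // The FileStatus already knows whether it is a directory; no further RPC needed.
    if (!fstat.isDirectory()) {
        FileInputFormat.addInputPath(job, fstat.getPath());
    }
}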
From source file: voldemort.store.readonly.mr.serialization.JsonSequenceFileInputFormat.java
License: Apache License
private List<FileStatus> getAllSubFileStatus(JobConf inputConf, Path filterMemberPath) throws IOException {
    List<FileStatus> list = new ArrayList<FileStatus>();

    FileSystem fs = filterMemberPath.getFileSystem(inputConf);
    FileStatus[] subFiles = fs.listStatus(filterMemberPath);

    if (null != subFiles) {
        if (fs.isDirectory(filterMemberPath)) {
            for (FileStatus subFile : subFiles) {
                if (!HadoopUtils.shouldPathBeIgnored(subFile.getPath())) {
                    list.addAll(getAllSubFileStatus(inputConf, subFile.getPath()));
                }
            }
        } else {
            if (subFiles.length > 0 && !HadoopUtils.shouldPathBeIgnored(subFiles[0].getPath())) {
                list.add(subFiles[0]);
            }
        }
    }

    return list;
}
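Since this helper ultimately collects only leaf files (directories are just recursed into), newer FileSystem releases allow the same walk to be written without calling isDirectory at all: listFiles(path, true) returns a RemoteIterator of LocatedFileStatus covering every file beneath the path. A minimal sketch under that assumption, reusing the project's HadoopUtils.shouldPathBeIgnored filter:

List<FileStatus> list = new ArrayList<FileStatus>();
// Recursive listing: directories are expanded by the filesystem, only files come back.
RemoteIterator<LocatedFileStatus> files = fs.listFiles(filterMemberPath, true);
while (files.hasNext()) {
    LocatedFileStatus status = files.next();
    if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
        list.add(status);
    }
}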
From source file: voldemort.store.readonly.mr.utils.AvroUtils.java
License: Apache License
/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, then pull schemas off of all subfiles, and check that they are
 * all the same schema. If so, return that schema, otherwise throw an
 * exception.
 *
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @param checkSameSchema boolean flag to check all files in directory for
 *        same schema
 * @return The schema of this file or all its subfiles
 * @throws IOException
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
private static Schema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema) {
    try {
        if (fs.isFile(path)) {
            BufferedInputStream inStream = null;
            try {
                inStream = new BufferedInputStream(fs.open(path));
            } catch (IOException e1) {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            }
            GenericDatumReader datum = new GenericDatumReader();
            DataFileStream reader = null;
            try {
                reader = new DataFileStream(inStream, datum);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            return reader.getSchema();
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
            } else {
                // this is wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
            }
            if (statuses == null || statuses.length == 0)
                throw new IllegalArgumentException("No files found in path pattern " + path.toUri().getPath());
            List<Schema> schemas = new ArrayList<Schema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    if (!checkSameSchema) {
                        // return first valid schema w/o checking all files
                        return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                    }
                    schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
                }
            }
            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                Schema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++)
                    if (!schema.equals(schemas.get(i)))
                        throw new IllegalArgumentException("The directory " + path.toString()
                                + " contains heterogeneous schemas: found both '" + schema.toString()
                                + "' and '" + schemas.get(i).toString() + "'.");
                return schema;
            } else {
                throw new IllegalArgumentException("No valid metadata file found for Path:" + path.toString());
            }
        }
    } catch (Exception e) {
        // logger.error("failed to get metadata from path:" + path);
        throw new RuntimeException(e);
    }
}
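Both this helper and the HadoopUtils variant below resolve the incoming path the same way: a plain file is read directly, a directory is expanded with listStatus(), and anything else is treated as a wildcard pattern for globStatus(). That resolution step on its own looks roughly like this (the path is a hypothetical placeholder):

Path path = new Path("/data/avro/part-*.avro");   // hypothetical: file, directory or glob
FileStatus[] statuses;
if (fs.isFile(path)) {
    // a single concrete file
    statuses = new FileStatus[] { fs.getFileStatus(path) };
} else if (fs.isDirectory(path)) {
    // a real directory: enumerate its direct children
    statuses = fs.listStatus(path);
} else {
    // neither file nor directory: interpret the string as a glob pattern
    statuses = fs.globStatus(path);
}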
From source file: voldemort.store.readonly.mr.utils.HadoopUtils.java
License: Apache License
/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, then pull schemas off of all subfiles, and check that they are
 * all the same schema. If so, return that schema, otherwise throw an
 * exception.
 *
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @param checkSameSchema boolean flag to check all files in directory for
 *        same schema
 * @return The schema of this file or all its subfiles
 * @throws IOException
 */
public static JsonSchema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema) throws IOException {
    try {
        if (fs.isFile(path)) {
            // this is a normal file, get a schema from it
            Map<String, String> m = HadoopUtils.getMetadataFromSequenceFile(fs, path);
            if (!m.containsKey("value.schema") || !m.containsKey("key.schema"))
                throw new IllegalArgumentException("No schema found on file " + path.toString());
            return new JsonSchema(JsonTypeDefinition.fromJson(m.get("key.schema")),
                    JsonTypeDefinition.fromJson(m.get("value.schema")));
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
            } else {
                // this is wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
            }
            if (statuses == null || statuses.length == 0)
                throw new IllegalArgumentException("No files found in path pattern " + path.toUri().getPath());
            List<JsonSchema> schemas = new ArrayList<JsonSchema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    if (!checkSameSchema) {
                        // return first valid schema w/o checking all files
                        return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                    }
                    schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
                }
            }
            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                JsonSchema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++)
                    if (!schema.equals(schemas.get(i)))
                        throw new IllegalArgumentException("The directory " + path.toString()
                                + " contains heterogeneous schemas: found both '" + schema.toString()
                                + "' and '" + schemas.get(i).toString() + "'.");
                return schema;
            } else {
                throw new IllegalArgumentException("No valid metadata file found for Path:" + path.toString());
            }
        }
    } catch (Exception e) {
        logger.error("failed to get metadata from path:" + path);
        throw new RuntimeException(e);
    }
}
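A hypothetical call site for the helper above, assuming the SequenceFiles under the store directory carry the key.schema and value.schema metadata entries the method looks for (the path and flag are illustrative):

Path storePath = new Path("/voldemort/input/my-store");      // hypothetical path
FileSystem fs = storePath.getFileSystem(new Configuration());
// true => verify that every file under the path declares the same schema
JsonSchema schema = HadoopUtils.getSchemaFromPath(fs, storePath, true);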
From source file: yarnkit.utils.YarnUtils.java
License: Apache License
public static LocalResource mapLocalResourceToHDFS(@Nonnull FileSystem fs, @Nonnull Path srcLocalPath,
        @Nonnull Path dstHdfsDir, @Nonnull String resourceName,
        @Nonnull Map<String, LocalResource> resourceMap) throws IOException {
    // Check existence first: isDirectory() also returns false for a missing path,
    // so testing it first would mask the "does not exist" case with a misleading message.
    if (!fs.exists(dstHdfsDir)) {
        throw new IllegalStateException("HDFS directory does not exist: " + dstHdfsDir);
    }
    if (!fs.isDirectory(dstHdfsDir)) {
        throw new IllegalStateException("Expected a directory, not a file: " + dstHdfsDir);
    }

    Path dstPath = new Path(dstHdfsDir, resourceName);
    LocalResource resource = copyToHDFS(fs, srcLocalPath, dstPath);
    resourceMap.put(resourceName, resource);
    return resource;
}
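A hypothetical call site, staging a local jar into an existing HDFS staging directory and handing the accumulated map to a YARN container launch context (the paths, conf and amContainer variables are assumptions for illustration):

Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
FileSystem fs = FileSystem.get(conf);
YarnUtils.mapLocalResourceToHDFS(fs, new Path("build/libs/app.jar"),
        new Path("/apps/yarnkit/staging"), "app.jar", localResources);
// Hand the map to the container spec so the NodeManager localizes the jar.
amContainer.setLocalResources(localResources);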