List of usage examples for org.apache.hadoop.fs.FileSystem#isDirectory
@Deprecated public boolean isDirectory(Path f) throws IOException
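This overload is deprecated; the FileSystem javadoc points callers at getFileStatus(Path) instead, since a FileStatus fetched once can answer isDirectory(), isFile(), length and so on without further namenode round trips. Note the behavioural difference: isDirectory(Path) simply returns false when the path does not exist, whereas getFileStatus(Path) throws FileNotFoundException. A minimal, self-contained sketch of both forms (the path is a hypothetical placeholder):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example");        // hypothetical path
        FileSystem fs = path.getFileSystem(conf);

        // Deprecated convenience call: returns false if the path is missing.
        boolean deprecatedForm = fs.isDirectory(path);

        // Preferred form: guard with exists(), then fetch the FileStatus once
        // and query it directly (getFileStatus throws if the path is absent).
        boolean preferredForm = fs.exists(path) && fs.getFileStatus(path).isDirectory();

        System.out.println(deprecatedForm + " / " + preferredForm);
    }
}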
From source file: uk.bl.wa.hadoop.mapreduce.hash.HdfsFileHasher.java
License: Open Source License
@Override
public int run(String[] args) throws Exception {
    // Options:
    String[] otherArgs = new GenericOptionsParser(args).getRemainingArgs();
    // Process remaining args like this:
    Options options = new Options();
    options.addOption("i", true, "a local file containing a list of HDFS paths to process");
    options.addOption("o", true, "output directory");
    options.addOption("m", false, "use MD5 rather than SHA-512");
    options.addOption("r", true, "number of reducers (defaults to 1)");
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = parser.parse(options, otherArgs);
    if (!cmd.hasOption("i") || !cmd.hasOption("o")) {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.setWidth(80);
        helpFormatter.printHelp(CLI_USAGE, CLI_HEADER, options, "");
        System.exit(1);
    }
    String input_file = cmd.getOptionValue("i");
    String output_path = cmd.getOptionValue("o");
    String algorithm = null;
    int numReducers = 1;
    if (cmd.hasOption("m")) {
        algorithm = "MD5";
    }
    if (cmd.hasOption("r")) {
        numReducers = Integer.parseInt(cmd.getOptionValue("r"));
    }

    // When implementing tool, choose algorithm:
    Configuration conf = this.getConf();
    if (algorithm != null)
        conf.set(MessageDigestMapper.CONFIG_DIGEST_ALGORITHM, algorithm);

    // Create job
    Job job = new Job(conf, "HDFS File Checksummer");
    job.setJarByClass(HdfsFileHasher.class);

    // Set up the MapReduce job:
    job.setMapperClass(MessageDigestMapper.class);
    job.setReducerClass(Reducer.class);
    // Number of output files = number of reducers (defaults to 1):
    job.setNumReduceTasks(numReducers);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Input
    log.info("Reading input files...");
    String line = null;
    long line_count = 0;
    BufferedReader br = new BufferedReader(new FileReader(input_file));
    while ((line = br.readLine()) != null) {
        if (StringUtils.isEmpty(line))
            continue;
        line_count++;
        Path path = new Path(line);
        FileSystem fs = path.getFileSystem(conf);
        if (fs.isFile(path)) {
            FileInputFormat.addInputPath(job, path);
        } else if (fs.isDirectory(path)) {
            FileStatus[] listing = fs.listStatus(path);
            int list_count = 0;
            for (FileStatus fstat : listing) {
                list_count++;
                log.info("Checking " + list_count + "/" + listing.length + " " + fstat.getPath());
                if (!fstat.isDir()) {
                    FileInputFormat.addInputPath(job, fstat.getPath());
                }
            }
        }
    }
    br.close();
    log.info("Read " + FileInputFormat.getInputPaths(job).length + " input files from " + line_count
            + " paths.");
    job.setInputFormatClass(UnsplittableInputFileFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, new Path(output_path));
    job.setOutputFormatClass(TextOutputFormat.class);

    // Execute job and return status
    return job.waitForCompletion(true) ? 0 : 1;
}
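One detail worth noting in the directory branch above: fstat.isDir() is itself deprecated, and because the FileStatus objects returned by listStatus() already carry the file-type flag, the replacement FileStatus#isDirectory() answers the question without the extra namenode call that FileSystem#isDirectory(Path) would cost per entry. A minimal sketch of that inner loop rewritten on that assumption (job, fs, path and log are the variables from the example):

for (FileStatus fstat : fs.listStatus(path)) {
    log.info("Checking " + fstat.getPath());
    // The FileStatus already knows whether it is a directory; no further RPC needed.
    if (!fstat.isDirectory()) {
        FileInputFormat.addInputPath(job, fstat.getPath());
    }
}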
From source file: voldemort.store.readonly.mr.serialization.JsonSequenceFileInputFormat.java
License: Apache License
private List<FileStatus> getAllSubFileStatus(JobConf inputConf, Path filterMemberPath) throws IOException {
    List<FileStatus> list = new ArrayList<FileStatus>();

    FileSystem fs = filterMemberPath.getFileSystem(inputConf);
    FileStatus[] subFiles = fs.listStatus(filterMemberPath);

    if (null != subFiles) {
        if (fs.isDirectory(filterMemberPath)) {
            for (FileStatus subFile : subFiles) {
                if (!HadoopUtils.shouldPathBeIgnored(subFile.getPath())) {
                    list.addAll(getAllSubFileStatus(inputConf, subFile.getPath()));
                }
            }
        } else {
            if (subFiles.length > 0 && !HadoopUtils.shouldPathBeIgnored(subFiles[0].getPath())) {
                list.add(subFiles[0]);
            }
        }
    }

    return list;
}
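Since this helper ultimately collects only leaf files (directories are just recursed into), newer FileSystem releases allow the same walk to be written without calling isDirectory at all: listFiles(path, true) returns a RemoteIterator of LocatedFileStatus covering every file beneath the path. A minimal sketch under that assumption, reusing the project's HadoopUtils.shouldPathBeIgnored filter:

List<FileStatus> list = new ArrayList<FileStatus>();
// Recursive listing: directories are expanded by the filesystem, only files come back.
RemoteIterator<LocatedFileStatus> files = fs.listFiles(filterMemberPath, true);
while (files.hasNext()) {
    LocatedFileStatus status = files.next();
    if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
        list.add(status);
    }
}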
From source file: voldemort.store.readonly.mr.utils.AvroUtils.java
License: Apache License
/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, then pull schemas off of all subfiles, and check that they are
 * all the same schema. If so, return that schema, otherwise throw an
 * exception.
 *
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @param checkSameSchema boolean flag to check all files in directory for
 *        same schema
 * @return The schema of this file or all its subfiles
 * @throws IOException
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
private static Schema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema) {
    try {
        if (fs.isFile(path)) {
            BufferedInputStream inStream = null;
            try {
                inStream = new BufferedInputStream(fs.open(path));
            } catch (IOException e1) {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            }
            GenericDatumReader datum = new GenericDatumReader();
            DataFileStream reader = null;
            try {
                reader = new DataFileStream(inStream, datum);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            return reader.getSchema();
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
            } else {
                // this is wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
            }
            if (statuses == null || statuses.length == 0)
                throw new IllegalArgumentException("No files found in path pattern " + path.toUri().getPath());
            List<Schema> schemas = new ArrayList<Schema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    if (!checkSameSchema) {
                        // return first valid schema w/o checking all files
                        return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                    }
                    schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
                }
            }
            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                Schema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++)
                    if (!schema.equals(schemas.get(i)))
                        throw new IllegalArgumentException("The directory " + path.toString()
                                + " contains heterogeneous schemas: found both '" + schema.toString()
                                + "' and '" + schemas.get(i).toString() + "'.");
                return schema;
            } else {
                throw new IllegalArgumentException("No valid metadata file found for Path:" + path.toString());
            }
        }
    } catch (Exception e) {
        // logger.error("failed to get metadata from path:" + path);
        throw new RuntimeException(e);
    }
}
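Both this helper and the HadoopUtils variant below resolve the incoming path the same way: a plain file is read directly, a directory is expanded with listStatus(), and anything else is treated as a wildcard pattern for globStatus(). That resolution step on its own looks roughly like this (the path is a hypothetical placeholder):

Path path = new Path("/data/avro/part-*.avro");   // hypothetical: file, directory or glob
FileStatus[] statuses;
if (fs.isFile(path)) {
    // a single concrete file
    statuses = new FileStatus[] { fs.getFileStatus(path) };
} else if (fs.isDirectory(path)) {
    // a real directory: enumerate its direct children
    statuses = fs.listStatus(path);
} else {
    // neither file nor directory: interpret the string as a glob pattern
    statuses = fs.globStatus(path);
}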
From source file: voldemort.store.readonly.mr.utils.HadoopUtils.java
License: Apache License
/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, then pull schemas off of all subfiles, and check that they are
 * all the same schema. If so, return that schema, otherwise throw an
 * exception.
 *
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @param checkSameSchema boolean flag to check all files in directory for
 *        same schema
 * @return The schema of this file or all its subfiles
 * @throws IOException
 */
public static JsonSchema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema) throws IOException {
    try {
        if (fs.isFile(path)) {
            // this is a normal file, get a schema from it
            Map<String, String> m = HadoopUtils.getMetadataFromSequenceFile(fs, path);
            if (!m.containsKey("value.schema") || !m.containsKey("key.schema"))
                throw new IllegalArgumentException("No schema found on file " + path.toString());
            return new JsonSchema(JsonTypeDefinition.fromJson(m.get("key.schema")),
                    JsonTypeDefinition.fromJson(m.get("value.schema")));
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
            } else {
                // this is wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
            }
            if (statuses == null || statuses.length == 0)
                throw new IllegalArgumentException("No files found in path pattern " + path.toUri().getPath());
            List<JsonSchema> schemas = new ArrayList<JsonSchema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    if (!checkSameSchema) {
                        // return first valid schema w/o checking all files
                        return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                    }
                    schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
                }
            }
            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                JsonSchema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++)
                    if (!schema.equals(schemas.get(i)))
                        throw new IllegalArgumentException("The directory " + path.toString()
                                + " contains heterogeneous schemas: found both '" + schema.toString()
                                + "' and '" + schemas.get(i).toString() + "'.");
                return schema;
            } else {
                throw new IllegalArgumentException("No valid metadata file found for Path:" + path.toString());
            }
        }
    } catch (Exception e) {
        logger.error("failed to get metadata from path:" + path);
        throw new RuntimeException(e);
    }
}
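A hypothetical call site for the helper above, assuming the SequenceFiles under the store directory carry the key.schema and value.schema metadata entries the method looks for (the path and flag are illustrative):

Path storePath = new Path("/voldemort/input/my-store");      // hypothetical path
FileSystem fs = storePath.getFileSystem(new Configuration());
// true => verify that every file under the path declares the same schema
JsonSchema schema = HadoopUtils.getSchemaFromPath(fs, storePath, true);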
From source file: yarnkit.utils.YarnUtils.java
License: Apache License
public static LocalResource mapLocalResourceToHDFS(@Nonnull FileSystem fs, @Nonnull Path srcLocalPath,
        @Nonnull Path dstHdfsDir, @Nonnull String resourceName,
        @Nonnull Map<String, LocalResource> resourceMap) throws IOException {
    // Check existence first: isDirectory() also returns false for a missing path,
    // so testing it first would mask the "does not exist" case with a misleading message.
    if (!fs.exists(dstHdfsDir)) {
        throw new IllegalStateException("HDFS directory does not exist: " + dstHdfsDir);
    }
    if (!fs.isDirectory(dstHdfsDir)) {
        throw new IllegalStateException("Expected a directory, not a file: " + dstHdfsDir);
    }

    Path dstPath = new Path(dstHdfsDir, resourceName);
    LocalResource resource = copyToHDFS(fs, srcLocalPath, dstPath);
    resourceMap.put(resourceName, resource);
    return resource;
}
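A hypothetical call site, staging a local jar into an existing HDFS staging directory and handing the accumulated map to a YARN container launch context (the paths, conf and amContainer variables are assumptions for illustration):

Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
FileSystem fs = FileSystem.get(conf);
YarnUtils.mapLocalResourceToHDFS(fs, new Path("build/libs/app.jar"),
        new Path("/apps/yarnkit/staging"), "app.jar", localResources);
// Hand the map to the container spec so the NodeManager localizes the jar.
amContainer.setLocalResources(localResources);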