Usage examples for org.apache.hadoop.fs.FileSystem.getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
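Before the project-specific examples below, here is a minimal, self-contained sketch of the call pattern they all share: resolve a FileSystem from a Path, call getFileStatus, and read metadata off the returned FileStatus. The path argument and the printed fields are illustrative only (not taken from any of the source files below); isDirectory() assumes a Hadoop 2.x or later API, and getFileStatus throws FileNotFoundException when the path does not exist.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Illustrative path; pass a real HDFS or local path as the first argument.
        Path path = new Path(args.length > 0 ? args[0] : "/tmp");
        FileSystem fs = path.getFileSystem(conf);

        // Throws FileNotFoundException if the path does not exist.
        FileStatus status = fs.getFileStatus(path);
        System.out.println("path        = " + status.getPath());
        System.out.println("isDirectory = " + status.isDirectory());
        System.out.println("length      = " + status.getLen());
        System.out.println("permission  = " + status.getPermission());
        System.out.println("owner:group = " + status.getOwner() + ":" + status.getGroup());
    }
}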
From source file:com.mycompany.app.TestStagingDirectoryPermissions.java
License:Apache License
@Test
public void perms() throws IOException, InterruptedException {
    MiniDFSCluster minidfs = null;
    FileSystem fs = null;
    MiniMRClientCluster minimr = null;
    try {
        Configuration conf = new Configuration(true);
        conf.set("fs.permission.umask-mode", "0077");
        minidfs = new MiniDFSCluster.Builder(conf).build();
        minidfs.waitActive();

        fs = minidfs.getFileSystem();
        conf.set(FileSystem.FS_DEFAULT_NAME_KEY, fs.getUri().toString());
        Path p = path("/in");
        fs.mkdirs(p);

        FSDataOutputStream os = fs.create(new Path(p, "input.txt"));
        os.write("hello!".getBytes("UTF-8"));
        os.close();

        String user = UserGroupInformation.getCurrentUser().getUserName();
        Path home = new Path("/User/" + user);
        fs.mkdirs(home);
        minimr = MiniMRClientClusterFactory.create(this.getClass(), 1, conf);
        JobConf job = new JobConf(minimr.getConfig());

        job.setJobName("PermsTest");
        JobClient client = new JobClient(job);
        FileInputFormat.addInputPath(job, p);
        FileOutputFormat.setOutputPath(job, path("/out"));
        job.setInputFormat(TextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(MySleepMapper.class);

        job.setNumReduceTasks(1);
        RunningJob submittedJob = client.submitJob(job);

        // Sleep for a bit to let localization finish
        System.out.println("Sleeping...");
        Thread.sleep(3 * 1000L);
        System.out.println("Done sleeping...");
        assertFalse(UserGroupInformation.isSecurityEnabled());

        Path stagingRoot = path("/tmp/hadoop-yarn/staging/" + user + "/.staging/");
        assertTrue(fs.exists(stagingRoot));
        assertEquals(1, fs.listStatus(stagingRoot).length);
        Path staging = fs.listStatus(stagingRoot)[0].getPath();
        Path jobXml = path(staging + "/job.xml");
        assertTrue(fs.exists(jobXml));

        FileStatus fileStatus = fs.getFileStatus(jobXml);
        System.out.println("job.xml permission = " + fileStatus.getPermission());
        assertTrue(fileStatus.getPermission().getOtherAction().implies(FsAction.READ));
        assertTrue(fileStatus.getPermission().getGroupAction().implies(FsAction.READ));

        submittedJob.waitForCompletion();
    } finally {
        if (minimr != null) {
            minimr.stop();
        }
        if (fs != null) {
            fs.close();
        }
        if (minidfs != null) {
            minidfs.shutdown(true);
        }
    }
}
From source file:com.netease.news.text.SequenceFilesFromDirectory.java
License:Apache License
private int runMapReduce(Path input, Path output)
        throws IOException, ClassNotFoundException, InterruptedException {
    int chunkSizeInMB = 64;
    if (hasOption(CHUNK_SIZE_OPTION[0])) {
        chunkSizeInMB = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
    }

    String keyPrefix = null;
    if (hasOption(KEY_PREFIX_OPTION[0])) {
        keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
    }

    // Prepare Job for submission.
    Job job = prepareJob(input, output, MultipleTextFileInputFormat.class,
            SequenceFilesFromDirectoryMapper.class, Text.class, Text.class,
            SequenceFileOutputFormat.class, "SequenceFilesFromDirectory");

    Configuration jobConfig = job.getConfiguration();
    jobConfig.set(KEY_PREFIX_OPTION[0], keyPrefix);
    FileSystem fs = FileSystem.get(jobConfig);
    FileStatus fsFileStatus = fs.getFileStatus(input);
    String inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus);
    jobConfig.set(BASE_INPUT_PATH, input.toString());
    long chunkSizeInBytes = chunkSizeInMB * 1024 * 1024;

    // set the max split locations, otherwise we get nasty debug stuff
    jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS));

    FileInputFormat.setInputPaths(job, inputDirList);
    // need to set this to a multiple of the block size, or no split happens
    FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes);
    FileOutputFormat.setCompressOutput(job, true);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }
    return 0;
}
From source file:com.netease.news.utils.SequenceFileDumper.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("substring", "b", "The number of chars to print out per value", false);
    addOption(buildOption("count", "c", "Report the count only", false, false, null));
    addOption("numItems", "n", "Output at most <n> key value pairs", false);
    addOption(buildOption("facets", "fa",
            "Output the counts per key. Note, if there are a lot of unique keys, "
                    + "this can take up a fair amount of memory",
            false, false, null));
    addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    Path input = getInputPath();
    FileSystem fs = input.getFileSystem(conf);
    if (fs.getFileStatus(input).isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
    } else {
        pathArr = new Path[1];
        pathArr[0] = input;
    }

    Writer writer;
    boolean shouldClose;
    if (hasOption("output")) {
        shouldClose = true;
        writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        for (Path path : pathArr) {
            if (!hasOption("quiet")) {
                writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
            }

            int sub = Integer.MAX_VALUE;
            if (hasOption("substring")) {
                sub = Integer.parseInt(getOption("substring"));
            }
            boolean countOnly = hasOption("count");
            SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true, conf);
            if (!hasOption("quiet")) {
                writer.append("Key class: ").append(iterator.getKeyClass().toString());
                writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
            }
            OpenObjectIntHashMap<String> facets = null;
            if (hasOption("facets")) {
                facets = new OpenObjectIntHashMap<String>();
            }
            long count = 0;
            if (countOnly) {
                while (iterator.hasNext()) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); // either insert or add 1
                    }
                    count++;
                }
                writer.append("Count: ").append(String.valueOf(count)).append('\n');
            } else {
                long numItems = Long.MAX_VALUE;
                if (hasOption("numItems")) {
                    numItems = Long.parseLong(getOption("numItems"));
                    if (!hasOption("quiet")) {
                        writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
                    }
                }
                while (iterator.hasNext() && count < numItems) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    writer.append("Key: ").append(key);
                    String str = record.getSecond().toString();
                    writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
                    writer.write('\n');
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); // either insert or add 1
                    }
                    count++;
                }
                if (!hasOption("quiet")) {
                    writer.append("Count: ").append(String.valueOf(count)).append('\n');
                }
            }
            if (facets != null) {
                List<String> keyList = Lists.newArrayListWithCapacity(facets.size());
                IntArrayList valueList = new IntArrayList(facets.size());
                facets.pairsSortedByKey(keyList, valueList);
                writer.append("-----Facets---\n");
                writer.append("Key\t\tCount\n");
                int i = 0;
                for (String key : keyList) {
                    writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
                }
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }
    return 0;
}
From source file:com.netease.news.utils.SplitInput.java
License:Apache License
public void splitDirectory(Configuration conf, Path inputDir)
        throws IOException, ClassNotFoundException, InterruptedException {
    FileSystem fs = inputDir.getFileSystem(conf);
    if (fs.getFileStatus(inputDir) == null) {
        throw new IOException(inputDir + " does not exist");
    }
    if (!fs.getFileStatus(inputDir).isDir()) {
        throw new IOException(inputDir + " is not a directory");
    }

    if (useMapRed) {
        SplitInputJob.run(conf, inputDir, mapRedOutputDirectory, keepPct, testRandomSelectionPct);
    } else {
        // input dir contains one file per category.
        FileStatus[] fileStats = fs.listStatus(inputDir, PathFilters.logsCRCFilter());
        for (FileStatus inputFile : fileStats) {
            if (!inputFile.isDir()) {
                splitFile(inputFile.getPath());
            }
        }
    }
}
From source file:com.netease.news.utils.SplitInput.java
License:Apache License
/**
 * Perform a split on the specified input file. Results will be written to files of the same name in the specified
 * training and test output directories. The {@link #validate()} method is called prior to executing the split.
 */
public void splitFile(Path inputFile) throws IOException {
    Configuration conf = getConf();
    FileSystem fs = inputFile.getFileSystem(conf);
    if (fs.getFileStatus(inputFile) == null) {
        throw new IOException(inputFile + " does not exist");
    }
    if (fs.getFileStatus(inputFile).isDir()) {
        throw new IOException(inputFile + " is a directory");
    }

    validate();

    Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
    Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

    int lineCount = countLines(fs, inputFile, charset);

    log.info("{} has {} lines", inputFile.getName(), lineCount);

    int testSplitStart = 0;
    int testSplitSize = this.testSplitSize; // don't modify state
    BitSet randomSel = null;

    if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
        testSplitSize = this.testRandomSelectionSize;

        if (testRandomSelectionPct > 0) {
            testSplitSize = Math.round(lineCount * testRandomSelectionPct / 100.0f);
        }
        log.info("{} test split size is {} based on random selection percentage {}", inputFile.getName(),
                testSplitSize, testRandomSelectionPct);
        long[] ridx = new long[testSplitSize];
        RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
        randomSel = new BitSet(lineCount);
        for (long idx : ridx) {
            randomSel.set((int) idx + 1);
        }
    } else {
        if (testSplitPct > 0) { // calculate split size based on percentage
            testSplitSize = Math.round(lineCount * testSplitPct / 100.0f);
            log.info("{} test split size is {} based on percentage {}", inputFile.getName(), testSplitSize,
                    testSplitPct);
        } else {
            log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
        }

        if (splitLocation > 0) { // calculate start of split based on percentage
            testSplitStart = Math.round(lineCount * splitLocation / 100.0f);
            if (lineCount - testSplitStart < testSplitSize) {
                // adjust split start downwards based on split size.
                testSplitStart = lineCount - testSplitSize;
            }
            log.info("{} test split start is {} based on split location {}", inputFile.getName(), testSplitStart,
                    splitLocation);
        }

        if (testSplitStart < 0) {
            throw new IllegalArgumentException("test split size for " + inputFile + " is too large, it would produce an "
                    + "empty training set from the initial set of " + lineCount + " examples");
        } else if (lineCount - testSplitSize < testSplitSize) {
            log.warn("Test set size for {} may be too large, {} is larger than the number of "
                    + "lines remaining in the training set: {}", inputFile, testSplitSize,
                    lineCount - testSplitSize);
        }
    }

    int trainCount = 0;
    int testCount = 0;
    if (!useSequence) {
        BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
        Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
        Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

        try {
            String line;
            int pos = 0;
            while ((line = reader.readLine()) != null) {
                pos++;

                Writer writer;
                if (testRandomSelectionPct > 0) { // Randomly choose
                    writer = randomSel.get(pos) ? testWriter : trainingWriter;
                } else { // Choose based on location
                    writer = pos > testSplitStart ? testWriter : trainingWriter;
                }

                if (writer == testWriter) {
                    if (testCount >= testSplitSize) {
                        writer = trainingWriter;
                    } else {
                        testCount++;
                    }
                }
                if (writer == trainingWriter) {
                    trainCount++;
                }
                writer.write(line);
                writer.write('\n');
            }
        } finally {
            Closeables.close(reader, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    } else {
        SequenceFileIterator<Writable, Writable> iterator = new SequenceFileIterator<Writable, Writable>(
                inputFile, false, fs.getConf());
        SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile,
                iterator.getKeyClass(), iterator.getValueClass());
        SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile,
                iterator.getKeyClass(), iterator.getValueClass());
        try {
            int pos = 0;
            while (iterator.hasNext()) {
                pos++;

                SequenceFile.Writer writer;
                if (testRandomSelectionPct > 0) { // Randomly choose
                    writer = randomSel.get(pos) ? testWriter : trainingWriter;
                } else { // Choose based on location
                    writer = pos > testSplitStart ? testWriter : trainingWriter;
                }

                if (writer == testWriter) {
                    if (testCount >= testSplitSize) {
                        writer = trainingWriter;
                    } else {
                        testCount++;
                    }
                }
                if (writer == trainingWriter) {
                    trainCount++;
                }
                Pair<Writable, Writable> pair = iterator.next();
                writer.append(pair.getFirst(), pair.getSecond());
            }
        } finally {
            Closeables.close(iterator, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    }
    log.info("file: {}, input: {} train: {}, test: {} starting at {}", inputFile.getName(), lineCount, trainCount,
            testCount, testSplitStart);

    // testing
    if (callback != null) {
        callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
    }
}
From source file:com.netease.news.utils.SplitInput.java
License:Apache License
/**
 * Validates that the current instance is in a consistent state
 *
 * @throws IllegalArgumentException if settings violate class invariants.
 * @throws IOException if output directories do not exist or are not directories.
 */
public void validate() throws IOException {
    Preconditions.checkArgument(testSplitSize >= 1 || testSplitSize == -1, "Invalid testSplitSize", testSplitSize);
    Preconditions.checkArgument(splitLocation >= 0 && splitLocation <= 100 || splitLocation == -1,
            "Invalid splitLocation percentage", splitLocation);
    Preconditions.checkArgument(testSplitPct >= 0 && testSplitPct <= 100 || testSplitPct == -1,
            "Invalid testSplitPct percentage", testSplitPct);
    Preconditions.checkArgument(splitLocation >= 0 && splitLocation <= 100 || splitLocation == -1,
            "Invalid splitLocation percentage", splitLocation);
    Preconditions.checkArgument(
            testRandomSelectionPct >= 0 && testRandomSelectionPct <= 100 || testRandomSelectionPct == -1,
            "Invalid testRandomSelectionPct percentage", testRandomSelectionPct);

    Preconditions.checkArgument(trainingOutputDirectory != null || useMapRed,
            "No training output directory was specified");
    Preconditions.checkArgument(testOutputDirectory != null || useMapRed,
            "No test output directory was specified");

    // only one of the following may be set, one must be set.
    int count = 0;
    if (testSplitSize > 0) {
        count++;
    }
    if (testSplitPct > 0) {
        count++;
    }
    if (testRandomSelectionSize > 0) {
        count++;
    }
    if (testRandomSelectionPct > 0) {
        count++;
    }

    Preconditions.checkArgument(count == 1, "Exactly one of testSplitSize, testSplitPct, testRandomSelectionSize, "
            + "testRandomSelectionPct should be set");

    if (!useMapRed) {
        Configuration conf = getConf();
        FileSystem fs = trainingOutputDirectory.getFileSystem(conf);
        FileStatus trainingOutputDirStatus = fs.getFileStatus(trainingOutputDirectory);
        Preconditions.checkArgument(trainingOutputDirStatus != null && trainingOutputDirStatus.isDir(),
                "%s is not a directory", trainingOutputDirectory);
        FileStatus testOutputDirStatus = fs.getFileStatus(testOutputDirectory);
        Preconditions.checkArgument(testOutputDirStatus != null && testOutputDirStatus.isDir(),
                "%s is not a directory", testOutputDirectory);
    }
}
From source file:com.netflix.bdp.s3mper.listing.ConsistentListingAspect.java
License:Apache License
/**
 * Ensures that all the entries in the metastore also exist in the FileSystem listing.
 *
 * @param pjp
 * @return
 * @throws Throwable
 */
@Around("list() && !cflow(delete()) && !within(ConsistentListingAspect)")
public Object metastoreCheck(final ProceedingJoinPoint pjp) throws Throwable {
    FileSystem fs = (FileSystem) pjp.getThis();

    if (disabled) {
        return pjp.proceed();
    }

    Configuration conf = ((FileSystem) pjp.getTarget()).getConf();
    updateConfig(conf);

    FileStatus[] s3Listing = (FileStatus[]) pjp.proceed();
    FileStatus[] originalListing = null;
    if (darkload) {
        originalListing = s3Listing.clone();
    }

    List<Path> pathsToCheck = new ArrayList<Path>();

    Object pathArg = pjp.getArgs()[0];

    // Locate paths in the arguments
    if (pathArg instanceof Path) {
        pathsToCheck.add((Path) pathArg);
    } else if (pathArg instanceof List) {
        pathsToCheck.addAll((List) pathArg);
    } else if (pathArg.getClass().isArray()) {
        pathsToCheck.addAll(Arrays.asList((Path[]) pathArg));
    }

    // HACK: This is just to prevent the emr metrics from causing consistency failures
    for (StackTraceElement e : Thread.currentThread().getStackTrace()) {
        if (e.getClassName().contains("emr.metrics")) {
            log.debug("Ignoring EMR metrics listing for paths: " + pathsToCheck);
            return s3Listing;
        }
    }
    // END HACK

    long recheck = recheckCount;
    long delay = recheckPeriod;

    try {
        if (isTask(conf) && !checkTaskListings) {
            log.info("Skipping consistency check for task listing");
            return s3Listing;
        }

        if (isTask(conf)) {
            recheck = taskRecheckCount;
            delay = taskRecheckPeriod;
        }
    } catch (Exception e) {
        log.error("Error checking for task side listing", e);
    }

    try {
        List<FileInfo> metastoreListing = metastore.list(pathsToCheck);

        List<Path> missingPaths = ImmutableList.of();
        if (statOnMissingFile) {
            missingPaths = checkListing(metastoreListing, s3Listing);

            if (!missingPaths.isEmpty()) {
                List<FileStatus> fullListing = new ArrayList<FileStatus>();
                fullListing.addAll(Arrays.asList(s3Listing));
                for (Path path : missingPaths) {
                    FileStatus status = fs.getFileStatus(path);
                    fullListing.add(status);
                }
                s3Listing = fullListing.toArray(new FileStatus[0]);
            }
        } else {
            int checkAttempt;

            for (checkAttempt = 0; checkAttempt <= recheck; checkAttempt++) {
                missingPaths = checkListing(metastoreListing, s3Listing);

                if (delistDeleteMarkedFiles) {
                    s3Listing = delistDeletedPaths(metastoreListing, s3Listing);
                }

                if (missingPaths.isEmpty()) {
                    break;
                }

                // Check if acceptable threshold of data has been met. This is a little
                // ambiguous because S3 could potentially have more files than the
                // metastore (via out-of-band access) and throw off the ratio
                if (fileThreshold < 1 && metastoreListing.size() > 0) {
                    float ratio = s3Listing.length / (float) metastoreListing.size();

                    if (ratio > fileThreshold) {
                        log.info(format("Proceeding with incomplete listing at ratio %f (%f as acceptable). "
                                + "Still missing paths: %s", ratio, fileThreshold, missingPaths));

                        missingPaths.clear();
                        break;
                    }
                }

                if (recheck == 0) {
                    break;
                }

                log.info(format("Rechecking consistency in %d (ms). Files missing %d. Missing paths: %s", delay,
                        missingPaths.size(), missingPaths));
                Thread.sleep(delay);

                s3Listing = (FileStatus[]) pjp.proceed();
            }

            if (!missingPaths.isEmpty()) {
                alertDispatcher.alert(missingPaths);

                if (shouldFail(conf)) {
                    throw new S3ConsistencyException(
                            "Consistency check failed. See go/s3mper for details. Missing paths: " + missingPaths);
                } else {
                    log.error("Consistency check failed. See go/s3mper for details. Missing paths: " + missingPaths);
                }
            } else {
                if (checkAttempt > 0) {
                    log.info(format("Listing achieved consistency after %d attempts", checkAttempt));
                    alertDispatcher.recovered(pathsToCheck);
                }
            }
        }
    } catch (TimeoutException t) {
        log.error("Timeout occurred listing metastore paths: " + pathsToCheck, t);

        alertDispatcher.timeout("metastoreCheck", pathsToCheck);

        if (failOnTimeout) {
            throw t;
        }
    } catch (Exception e) {
        log.error("Failed to list metastore for paths: " + pathsToCheck, e);

        if (shouldFail(conf)) {
            throw e;
        }
    }

    return darkload ? originalListing : s3Listing;
}
From source file:com.netflix.bdp.s3mper.listing.ConsistentListingAspect.java
License:Apache License
/**
 * Deletes listing records based on a delete call from the FileSystem.
 *
 * @param pjp
 * @return
 * @throws Throwable
 */
@Around("delete() && !within(ConsistentListingAspect)")
public Object metastoreDelete(final ProceedingJoinPoint pjp) throws Throwable {
    if (disabled) {
        return pjp.proceed();
    }

    Configuration conf = ((FileSystem) pjp.getTarget()).getConf();
    updateConfig(conf);

    Path deletePath = (Path) pjp.getArgs()[0];

    boolean recursive = false;

    if (pjp.getArgs().length > 1) {
        recursive = (Boolean) pjp.getArgs()[1];
    }

    try {
        FileSystem s3fs = (FileSystem) pjp.getTarget();

        Set<Path> filesToDelete = new HashSet<Path>();
        filesToDelete.add(deletePath);

        List<FileInfo> metastoreFiles = metastore.list(Collections.singletonList(deletePath));

        for (FileInfo f : metastoreFiles) {
            filesToDelete.add(f.getPath());
        }

        try {
            if (s3fs.getFileStatus(deletePath).isDir() && recursive) {
                filesToDelete.addAll(recursiveList(s3fs, deletePath));
            }
        } catch (Exception e) {
            log.info("A problem occurred deleting path: " + deletePath + " " + e.getMessage());
        }

        for (Path path : filesToDelete) {
            metastore.delete(path);
        }
    } catch (TimeoutException t) {
        log.error("Timeout occurred deleting metastore path: " + deletePath, t);

        alertDispatcher.timeout("metastoreDelete", Collections.singletonList(deletePath));

        if (failOnTimeout) {
            throw t;
        }
    } catch (Exception e) {
        log.error("Error deleting paths from metastore: " + deletePath, e);

        if (shouldFail(conf)) {
            throw e;
        }
    }

    return pjp.proceed();
}
From source file:com.ning.sweeper.HdfsItem.java
License:Apache License
public HdfsItem(FileSystem fs, String path, ContentSummaryTypes contentSummaryType) throws IOException {
    this(fs, fs.getFileStatus(new Path(path)), contentSummaryType);
}
From source file:com.phantom.hadoop.examples.dancing.DistributedPentomino.java
License:Apache License
/**
 * Create the input file with all of the possible combinations of the given depth.
 *
 * @param fs
 *            the filesystem to write into
 * @param dir
 *            the directory to write the input file into
 * @param pent
 *            the puzzle
 * @param depth
 *            the depth to explore when generating prefixes
 */
private static long createInputDirectory(FileSystem fs, Path dir, Pentomino pent, int depth) throws IOException {
    fs.mkdirs(dir);
    List<int[]> splits = pent.getSplits(depth);
    Path input = new Path(dir, "part1");
    PrintWriter file = new PrintWriter(
            new OutputStreamWriter(new BufferedOutputStream(fs.create(input), 64 * 1024), Charsets.UTF_8));
    for (int[] prefix : splits) {
        for (int i = 0; i < prefix.length; ++i) {
            if (i != 0) {
                file.print(',');
            }
            file.print(prefix[i]);
        }
        file.print('\n');
    }
    file.close();
    return fs.getFileStatus(input).getLen();
}