Usage examples for org.apache.hadoop.fs.FileSystem.listStatus
public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException
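Before the project examples below, a minimal self-contained sketch of the single-path overload they all use. The directory path and the hidden-file PathFilter here are illustrative assumptions, not taken from any of the source files.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Illustrative directory; pass a real HDFS or local path as the first argument.
        Path dir = new Path(args.length > 0 ? args[0] : "/tmp");
        FileSystem fs = dir.getFileSystem(conf);

        // Skip hidden entries (names starting with '.' or '_'), the same
        // convention several of the examples below apply.
        PathFilter hiddenFilter = new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith(".") && !name.startsWith("_");
            }
        };

        // listStatus returns one FileStatus per direct child that passes the filter.
        for (FileStatus status : fs.listStatus(dir, hiddenFilter)) {
            System.out.println(status.getPath() + "\t" + status.getLen());
        }
    }
}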
From source file:com.netease.news.utils.SequenceFileDumper.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("substring", "b", "The number of chars to print out per value", false);
    addOption(buildOption("count", "c", "Report the count only", false, false, null));
    addOption("numItems", "n", "Output at most <n> key value pairs", false);
    addOption(buildOption("facets", "fa",
            "Output the counts per key. Note, if there are a lot of unique keys, "
                    + "this can take up a fair amount of memory", false, false, null));
    addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    Path input = getInputPath();
    FileSystem fs = input.getFileSystem(conf);
    if (fs.getFileStatus(input).isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
    } else {
        pathArr = new Path[1];
        pathArr[0] = input;
    }

    Writer writer;
    boolean shouldClose;
    if (hasOption("output")) {
        shouldClose = true;
        writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        for (Path path : pathArr) {
            if (!hasOption("quiet")) {
                writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
            }
            int sub = Integer.MAX_VALUE;
            if (hasOption("substring")) {
                sub = Integer.parseInt(getOption("substring"));
            }
            boolean countOnly = hasOption("count");
            SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true, conf);
            if (!hasOption("quiet")) {
                writer.append("Key class: ").append(iterator.getKeyClass().toString());
                writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
            }
            OpenObjectIntHashMap<String> facets = null;
            if (hasOption("facets")) {
                facets = new OpenObjectIntHashMap<String>();
            }
            long count = 0;
            if (countOnly) {
                while (iterator.hasNext()) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); // either insert or add 1
                    }
                    count++;
                }
                writer.append("Count: ").append(String.valueOf(count)).append('\n');
            } else {
                long numItems = Long.MAX_VALUE;
                if (hasOption("numItems")) {
                    numItems = Long.parseLong(getOption("numItems"));
                    if (!hasOption("quiet")) {
                        writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
                    }
                }
                while (iterator.hasNext() && count < numItems) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    writer.append("Key: ").append(key);
                    String str = record.getSecond().toString();
                    writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
                    writer.write('\n');
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); // either insert or add 1
                    }
                    count++;
                }
                if (!hasOption("quiet")) {
                    writer.append("Count: ").append(String.valueOf(count)).append('\n');
                }
            }
            if (facets != null) {
                List<String> keyList = Lists.newArrayListWithCapacity(facets.size());
                IntArrayList valueList = new IntArrayList(facets.size());
                facets.pairsSortedByKey(keyList, valueList);
                writer.append("-----Facets---\n");
                writer.append("Key\t\tCount\n");
                int i = 0;
                for (String key : keyList) {
                    writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
                }
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }
    return 0;
}
From source file:com.netease.news.utils.SplitInput.java
License:Apache License
public void splitDirectory(Configuration conf, Path inputDir)
        throws IOException, ClassNotFoundException, InterruptedException {
    FileSystem fs = inputDir.getFileSystem(conf);
    if (fs.getFileStatus(inputDir) == null) {
        throw new IOException(inputDir + " does not exist");
    }
    if (!fs.getFileStatus(inputDir).isDir()) {
        throw new IOException(inputDir + " is not a directory");
    }

    if (useMapRed) {
        SplitInputJob.run(conf, inputDir, mapRedOutputDirectory, keepPct, testRandomSelectionPct);
    } else {
        // input dir contains one file per category.
        FileStatus[] fileStats = fs.listStatus(inputDir, PathFilters.logsCRCFilter());
        for (FileStatus inputFile : fileStats) {
            if (!inputFile.isDir()) {
                splitFile(inputFile.getPath());
            }
        }
    }
}
From source file:com.netflix.bdp.s3.S3MultipartOutputCommitter.java
License:Apache License
/**
 * Lists the output of a task under the task attempt path. Subclasses can
 * override this method to change how output files are identified.
 * <p>
 * This implementation lists the files that are direct children of the output
 * path and filters hidden files (file names starting with '.' or '_').
 * <p>
 * The task attempt path is provided by
 * {@link #getTaskAttemptPath(TaskAttemptContext)}
 *
 * @param context this task's {@link TaskAttemptContext}
 * @return the output files produced by this task in the task attempt path
 * @throws IOException
 */
protected Iterable<FileStatus> getTaskOutput(TaskAttemptContext context) throws IOException {
    // get files on the local FS in the attempt path
    Path attemptPath = getTaskAttemptPath(context);
    FileSystem attemptFS = attemptPath.getFileSystem(context.getConfiguration());
    FileStatus[] stats = attemptFS.listStatus(attemptPath, HiddenPathFilter.get());
    return Arrays.asList(stats);
}
From source file:com.netflix.bdp.s3.S3MultipartOutputCommitter.java
License:Apache License
private List<S3Util.PendingUpload> getPendingUploads(JobContext context, boolean suppressExceptions)
        throws IOException {
    Path jobAttemptPath = wrappedCommitter.getJobAttemptPath(context);
    final FileSystem attemptFS = jobAttemptPath.getFileSystem(context.getConfiguration());
    FileStatus[] pendingCommitFiles = attemptFS.listStatus(jobAttemptPath, HiddenPathFilter.get());

    final List<S3Util.PendingUpload> pending = Lists.newArrayList();

    // try to read every pending file and add all results to pending.
    // in the case of a failure to read the file, exceptions are held until all
    // reads have been attempted.
    Tasks.foreach(pendingCommitFiles)
            .throwFailureWhenFinished(!suppressExceptions)
            .executeWith(getThreadPool(context))
            .run(new Task<FileStatus, IOException>() {
                @Override
                public void run(FileStatus pendingCommitFile) throws IOException {
                    pending.addAll(S3Util.readPendingCommits(attemptFS, pendingCommitFile.getPath()));
                }
            });

    return pending;
}
From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java
License:Apache License
/**
 * List input directories.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    List<IOException> errors = new ArrayList<IOException>();
    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        final SmilePathFilter filter = new SmilePathFilter();
        FileStatus[] matches = fs.globStatus(p, filter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    Collections.addAll(result, fs.listStatus(globStat.getPath(), filter));
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
From source file:com.peer2gear.nutch.xquery.ParseResult.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("Usage: %s [generic options] (<segment> ... | -dir <segments>) <output>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = new Job(getConf());

    for (int i = 0; i < args.length - 1; i++) {
        if ("-dir".equals(args[i])) {
            Path dir = new Path(args[++i]);
            FileSystem fs = dir.getFileSystem(getConf());
            FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
            Path[] segments = HadoopFSUtil.getPaths(fstats);
            for (Path segment : segments) {
                FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
            }
        } else {
            FileInputFormat.addInputPath(job, new Path(args[i], ParseData.DIR_NAME));
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(GetResultMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.redsqirl.workflow.server.datatype.MapRedDir.java
License:Open Source License
public String isPathValid(String path, List<String> shouldNotHaveExt, List<String> shouldHaveExt,
        boolean fileExtension) throws RemoteException {
    String error = null;
    HdfsFileChecker hCh = new HdfsFileChecker(path);
    if (shouldHaveExt != null && !shouldHaveExt.isEmpty()) {
        boolean found = false;
        for (String extCur : shouldHaveExt) {
            found |= path.endsWith(extCur);
        }
        if (!found) {
            error = LanguageManagerWF.getText("mapredtexttype.shouldhaveextcompresssile",
                    new Object[] { path, shouldHaveExt });
        }
    } else if (shouldNotHaveExt != null && !shouldNotHaveExt.isEmpty()) {
        boolean found = false;
        for (String extCur : shouldNotHaveExt) {
            found |= path.endsWith(extCur);
        }
        if (found) {
            error = LanguageManagerWF.getText("mapredtexttype.shouldnothaveextcompresssile",
                    new Object[] { path, shouldNotHaveExt });
        }
    }

    if (!hCh.isInitialized() || hCh.isFile()) {
        error = LanguageManagerWF.getText("mapredtexttype.dirisfile");
    } else if (isPathExist()) {
        FileSystem fs;
        try {
            fs = NameNodeVar.getFS();

            hCh.setPath(new Path(path).getParent());
            if (!hCh.isDirectory()) {
                error = LanguageManagerWF.getText("mapredtexttype.nodir",
                        new String[] { hCh.getPath().toString() });
            }

            FileStatus[] stat = null;
            if (error == null) {
                try {
                    stat = fs.listStatus(new Path(path), new PathFilter() {
                        @Override
                        public boolean accept(Path arg0) {
                            return !arg0.getName().startsWith("_") && !arg0.getName().startsWith(".");
                        }
                    });
                } catch (Exception e) {
                    stat = null;
                    error = LanguageManagerWF.getText("mapredtexttype.notmrdir", new Object[] { path });
                }
            }

            if (stat != null) {
                for (int i = 0; i < stat.length && error == null; ++i) {
                    if (stat[i].isDir()) {
                        error = LanguageManagerWF.getText("mapredtexttype.notmrdir", new Object[] { path });
                    } else {
                        if (fileExtension) {
                            if (shouldHaveExt != null && !shouldHaveExt.isEmpty()) {
                                boolean found = false;
                                for (String extCur : shouldHaveExt) {
                                    found |= stat[i].getPath().getName().endsWith(extCur);
                                }
                                if (!found) {
                                    error = LanguageManagerWF.getText("mapredtexttype.shouldhaveextcompresssile",
                                            new Object[] { path, shouldHaveExt });
                                }
                            } else if (shouldNotHaveExt != null && !shouldNotHaveExt.isEmpty()) {
                                boolean found = false;
                                for (String extCur : shouldNotHaveExt) {
                                    found |= stat[i].getPath().getName().endsWith(extCur);
                                }
                                if (found) {
                                    error = LanguageManagerWF.getText("mapredtexttype.shouldnothaveextcompresssile",
                                            new Object[] { path, shouldNotHaveExt });
                                }
                            }
                        }

                        try {
                            hdfsInt.select(stat[i].getPath().toString(), "", 1);
                        } catch (Exception e) {
                            error = LanguageManagerWF.getText("mapredtexttype.notmrdir");
                            logger.error(error, e);
                        }
                    }
                }
            }
        } catch (IOException e) {
            error = LanguageManagerWF.getText("unexpectedexception", new Object[] { e.getMessage() });
            logger.error(error, e);
        }
    }
    // hCh.close();

    return error;
}
From source file:com.redsqirl.workflow.server.datatype.MapRedDir.java
License:Open Source License
public List<String> selectLine(int maxToRead) throws RemoteException {
    List<String> ans = null;
    if (isPathExist()) {
        try {
            FileSystem fs = NameNodeVar.getFS();

            FileStatus[] stat = fs.listStatus(new Path(getPath()), new PathFilter() {
                @Override
                public boolean accept(Path arg0) {
                    return !arg0.getName().startsWith("_") && !arg0.getName().startsWith(".");
                }
            });

            if (stat != null && stat.length > 0) {
                ans = new ArrayList<String>(maxToRead);

                SortedSet<Map.Entry<FileStatus, Long>> filesSortedBySize = new TreeSet<Map.Entry<FileStatus, Long>>(
                        new Comparator<Map.Entry<FileStatus, Long>>() {
                            @Override
                            public int compare(Map.Entry<FileStatus, Long> e1, Map.Entry<FileStatus, Long> e2) {
                                return -e1.getValue().compareTo(e2.getValue());
                            }
                        });

                // We limit the number of file to be 100
                for (int k = 0; k < stat.length; ++k) {
                    filesSortedBySize.add(new AbstractMap.SimpleEntry<FileStatus, Long>(stat[k], stat[k].getLen()));
                }

                // Read the biggest files first
                Iterator<Map.Entry<FileStatus, Long>> fileIt = filesSortedBySize.iterator();
                int k = 0;
                while (fileIt.hasNext() && ans.size() < maxToRead && k < NB_FILE_TO_READ_MAX) {
                    Map.Entry<FileStatus, Long> cur = fileIt.next();
                    FileStatus file = cur.getKey();
                    logger.debug("Number of line already read: " + ans.size());
                    ans.addAll(hdfsInt.select(file.getPath().toString(), ",", maxToRead - ans.size()));
                    ++k;
                }

                logger.debug("Number of line read in " + getPath() + ": " + ans.size());
            }
        } catch (IOException e) {
            String error = "Unexpected error: " + e.getMessage();
            logger.error(error, e);
            ans = null;
        } catch (Exception e) {
            logger.error(e, e);
            ans = null;
        }
    }
    return ans;
}
From source file:com.redsqirl.workflow.server.OozieManager.java
License:Open Source License
/**
 * Clean the directory where the Job details are stored
 *
 * @param nameWf
 * @throws RemoteException
 */
public void cleanJobDirectory(final String nameWf) throws RemoteException {
    Path hdfsWfPath = new Path(WorkflowPrefManager.getHDFSPathJobs());
    FileSystem fs = null;
    int numberToKeep = WorkflowPrefManager.getNbOozieDirToKeep();
    try {
        fs = NameNodeVar.getFS();
        FileStatus[] children = fs.listStatus(hdfsWfPath, new PathFilter() {
            @Override
            public boolean accept(Path arg0) {
                return arg0.getName().startsWith(nameWf + "_");
            }
        });
        Arrays.sort(children, 0, children.length, new Comparator<FileStatus>() {
            @Override
            public int compare(FileStatus arg0, FileStatus arg1) {
                return (int) ((arg0.getModificationTime() - arg1.getModificationTime()) / 10000);
            }
        });
        for (int i = 0; i < children.length - numberToKeep; ++i) {
            fs.delete(children[i].getPath(), true);
        }
    } catch (Exception e1) {
        logger.error(e1);
    }
}
From source file:com.redsqirl.workflow.server.OozieManager.java
License:Open Source License
/**
 * Get a name for a directory to store all the jobs files and configuration
 *
 * @param df
 * @return The name for a directory to store all the jobs files and configuration
 * @throws RemoteException
 */
protected String buildFileName(DataFlow df) throws RemoteException {
    final String nameWf = df.getName();
    if (nameWf == null) {
        logger.warn("The workflow to run has no name");
        df.setName(RandomString.getRandomName(8));
    }
    String ans = null;
    Path hdfsWfPath = new Path(WorkflowPrefManager.getHDFSPathJobs());
    FileSystem fs = null;
    int number = -1;
    try {
        fs = NameNodeVar.getFS();
        FileStatus[] children = fs.listStatus(hdfsWfPath, new PathFilter() {
            @Override
            public boolean accept(Path arg0) {
                if (arg0.getName().startsWith(nameWf)) {
                    try {
                        @SuppressWarnings("unused")
                        int i = Integer.valueOf(arg0.getName().substring(nameWf.length() + 1));
                        return true;
                    } catch (Exception e) {
                    }
                }
                return false;
            }
        });
        if (children != null && children.length > 0) {
            for (FileStatus child : children) {
                number = Math.max(number,
                        Integer.valueOf(child.getPath().getName().substring(nameWf.length() + 1)));
            }
        }
    } catch (Exception e) {
        logger.error(e, e);
    }
    ans = nameWf + "_" + (number + 1);
    return ans;
}