Usage examples for org.apache.hadoop.fs.FileSystem#listStatus
public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException
From source file:com.datatorrent.stram.util.FSUtil.java
License:Apache License
public static void setPermission(FileSystem fs, Path dst, FsPermission permission) throws IOException { FileStatus contents[] = fs.listStatus(dst); for (int i = 0; i < contents.length; i++) { fs.setPermission(contents[i].getPath(), permission); }//from w w w. j ava2 s .c o m fs.setPermission(dst, permission); }
From source file:com.davidgildeh.hadoop.utils.FileUtils.java
License:Apache License
/** * Merges a list of input files in a directory to a single file under the * outputpath with a specified filename/*from ww w.ja v a 2s . c om*/ * * @param inputPath The input directory containing all the input files. E.g. /input/dir/on/hdfs/ * @param outputPath The output path to output the file. E.g. /output/dir/on/hdfs/filename * @throws IOException */ public static void mergeFiles(String inputPath, String outputPath) throws IOException { Path inputDir = new Path(inputPath); Path outputFile = new Path(outputPath); FileSystem fileSystem = getFileSystem(outputFile); checkFileExists(fileSystem, inputDir); // Check the input path is a directory if (!fileSystem.getFileStatus(inputDir).isDir()) { LOG.error("Path '" + inputDir.toString() + "' is not a directory."); throw new IOException("Path '" + inputDir.toString() + "' is not a directory."); } // Create Output File OutputStream out = fileSystem.create(outputFile); try { FileStatus contents[] = fileSystem.listStatus(inputDir); // Loop through all files in directory and merge them into one file for (int i = 0; i < contents.length; i++) { if (!contents[i].isDir()) { InputStream in = fileSystem.open(contents[i].getPath()); try { IOUtils.copyBytes(in, out, fileSystem.getConf(), false); } finally { in.close(); } } } } finally { out.close(); fileSystem.close(); LOG.info("Merged input files from '" + inputPath + "' to '" + outputPath + "'"); } }
From source file:com.digitalpebble.behemoth.gate.GATECorpusGenerator.java
License:Apache License
private void generateXMLdocs(String inputf, String outputf) throws IOException { Path input = new Path(inputf); File output = new File(outputf); if (output.exists() && output.isFile()) { System.err.println("Output " + outputf + " already exists"); return;//from w w w.j a va2 s . com } if (output.exists() == false) output.mkdirs(); FileSystem fs = input.getFileSystem(getConf()); FileStatus[] statuses = fs.listStatus(input); int count[] = { 0 }; for (int i = 0; i < statuses.length; i++) { FileStatus status = statuses[i]; Path suPath = status.getPath(); if (suPath.getName().equals("_SUCCESS")) continue; generateXMLdocs(suPath, output, count); } }
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("v", "vector", true, "input vector sequencefile"); options.addOption("l", "label", true, "input vector sequencefile"); options.addOption("o", "output", true, "output Behemoth corpus"); // parse the command line arguments CommandLine line = null;/* w w w. j a v a2 s .c o m*/ try { line = parser.parse(options, args); if (line.hasOption("help")) { formatter.printHelp("CorpusGenerator", options); return 0; } if (!line.hasOption("v") | !line.hasOption("o") | !line.hasOption("l")) { formatter.printHelp("CorpusGenerator", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusGenerator", options); } Path vectorPath = new Path(line.getOptionValue("v")); Path labelPath = new Path(line.getOptionValue("l")); String output = line.getOptionValue("o"); Path tempOutput = new Path(vectorPath.getParent(), "temp-" + System.currentTimeMillis()); // extracts the string representations from the vectors int retVal = vectorToString(vectorPath, tempOutput); if (retVal != 0) { HadoopUtil.delete(getConf(), tempOutput); return retVal; } Path tempOutput2 = new Path(vectorPath.getParent(), "temp-" + System.currentTimeMillis()); retVal = convert(tempOutput, labelPath, tempOutput2); // delete the temp output HadoopUtil.delete(getConf(), tempOutput); if (retVal != 0) { HadoopUtil.delete(getConf(), tempOutput2); return retVal; } // convert tempOutput to standard file BufferedWriter bow = new BufferedWriter(new FileWriter(new File(output))); // the label dictionary is not dumped to text int labelMaxIndex = 0; Map<String, Integer> labelIndex = new HashMap<String, Integer>(); Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); FileStatus[] fss = 
fs.listStatus(tempOutput2); try { for (FileStatus status : fss) { Path path = status.getPath(); // skips the _log or _SUCCESS files if (!path.getName().startsWith("part-") && !path.getName().equals(tempOutput2.getName())) continue; SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); // read the key + values in that file Text key = new Text(); Text value = new Text(); while (reader.next(key, value)) { String label = key.toString(); // replace the label by its index Integer indexLabel = labelIndex.get(label); if (indexLabel == null) { indexLabel = new Integer(labelMaxIndex); labelIndex.put(label, indexLabel); labelMaxIndex++; } String val = value.toString(); bow.append(indexLabel.toString()).append(val).append("\n"); } reader.close(); } bow.flush(); } catch (Exception e) { e.printStackTrace(); return -1; } finally { bow.close(); fs.delete(tempOutput2, true); } return 0; }
From source file:com.digitalpebble.behemoth.util.ContentExtractor.java
License:Apache License
private int generateDocs(String inputf, String outputf) throws IOException, ArchiveException { Path input = new Path(inputf); Path dirPath = new Path(outputf); FileSystem fsout = FileSystem.get(dirPath.toUri(), getConf()); if (fsout.exists(dirPath) == false) fsout.mkdirs(dirPath);/*from www .j a va 2 s . co m*/ else { System.err.println("Output " + outputf + " already exists"); return -1; } // index file Path indexPath = new Path(dirPath, "index"); if (fsout.exists(indexPath) == false) { fsout.createNewFile(indexPath); } maxNumEntriesInArchive = getConf().getInt(numEntriesPerArchiveParamName, 10000); index = fsout.create(indexPath); createArchive(dirPath); FileSystem fs = input.getFileSystem(getConf()); FileStatus[] statuses = fs.listStatus(input); int count[] = { 0 }; for (int i = 0; i < statuses.length; i++) { FileStatus status = statuses[i]; Path suPath = status.getPath(); if (suPath.getName().equals("_SUCCESS")) continue; generateDocs(suPath, dirPath, count); } if (index != null) index.close(); if (currentArchive != null) { currentArchive.finish(); currentArchive.close(); } return 0; }
From source file:com.digitalpebble.behemoth.util.CorpusReader.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("c", "displayContent", false, "display binary content in output"); options.addOption("t", "displayText", false, "display text in output"); options.addOption("a", "displayAnnotations", false, "display annotations in output"); options.addOption("m", "displayMetadata", false, "display metadata in output"); // parse the command line arguments CommandLine line = null;/*from w w w . j a v a 2 s .com*/ try { line = parser.parse(options, args); String input = line.getOptionValue("i"); if (line.hasOption("help")) { formatter.printHelp("CorpusReader", options); return 0; } if (input == null) { formatter.printHelp("CorpusReader", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusReader", options); return -1; } boolean showBinaryContent = line.hasOption("displayContent"); boolean showText = line.hasOption("displayText"); boolean showAnnotations = line.hasOption("displayAnnotations"); boolean showMD = line.hasOption("displayMetadata"); Path inputPath = new Path(line.getOptionValue("i")); Configuration conf = getConf(); FileSystem fs = inputPath.getFileSystem(conf); // filter input DocumentFilter filters = DocumentFilter.getFilters(conf); boolean doFilter = DocumentFilter.isRequired(conf); FileStatus[] fss = fs.listStatus(inputPath); for (FileStatus status : fss) { Path path = status.getPath(); // skips the _log or _SUCCESS files if (!path.getName().startsWith("part-") && !path.getName().equals(inputPath.getName())) continue; SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); Text key = new Text(); BehemothDocument value = new BehemothDocument(); while 
(reader.next(key, value)) { // skip this document? if (doFilter && filters.keep(value) == false) continue; System.out.println(value.toString(showBinaryContent, showAnnotations, showText, showMD)); } reader.close(); } return 0; }
From source file:com.facebook.hiveio.common.FileSystems.java
License:Apache License
/**
 * Moves {@code file} to its corresponding location under {@code dest}.
 * A regular file is renamed onto the destination, replacing whatever is
 * already there. A directory is recreated at the destination and each of
 * its listed children is moved recursively.
 *
 * @param fs FileSystem
 * @param file path to move (file or directory)
 * @param src path to source directory
 * @param dest path to destination directory
 * @throws IOException I/O problems
 * @throws IllegalArgumentException if an existing destination file cannot
 *         be removed or the rename fails
 */
public static void move(FileSystem fs, Path file, Path src, Path dest) throws IOException {
    Path target = pathInDestination(file, src, dest);

    if (fs.isFile(file)) {
        // delete is only attempted when something already occupies the target
        boolean staleTargetRemains = fs.exists(target) && !fs.delete(target, true);
        if (staleTargetRemains) {
            throw new IllegalArgumentException("Could not remove existing file " + target);
        }
        boolean renamed = fs.rename(file, target);
        if (!renamed) {
            throw new IllegalArgumentException("Could not move " + file + " to " + target);
        }
        return;
    }

    if (!fs.getFileStatus(file).isDir()) {
        return;
    }

    // list first, then create the destination directory
    FileStatus[] children = fs.listStatus(file);
    fs.mkdirs(target);
    if (children == null) {
        return;
    }
    for (FileStatus child : children) {
        move(fs, child.getPath(), src, dest);
    }
}
From source file:com.facebook.presto.hive.AbstractTestHiveClient.java
License:Apache License
/**
 * Recursively collects the string form of every file path found under
 * {@code path}. Returns an empty set when {@code path} does not exist.
 *
 * @param path root of the tree to scan
 * @return paths of all files below {@code path}
 * @throws IOException if the file system cannot be queried
 */
protected Set<String> listAllDataFiles(Path path) throws IOException {
    Set<String> dataFiles = new HashSet<>();
    FileSystem fileSystem = hdfsEnvironment.getFileSystem("user", path);
    if (!fileSystem.exists(path)) {
        return dataFiles;
    }

    FileStatus[] entries = fileSystem.listStatus(path);
    for (FileStatus entry : entries) {
        if (HadoopFileStatus.isFile(entry)) {
            dataFiles.add(entry.getPath().toString());
            continue;
        }
        if (HadoopFileStatus.isDirectory(entry)) {
            // descend and merge the files found below this directory
            dataFiles.addAll(listAllDataFiles(entry.getPath()));
        }
    }
    return dataFiles;
}
From source file:com.facebook.presto.hive.AbstractTestHiveClient.java
License:Apache License
private List<String> listDirectory(String user, Path path) throws IOException { FileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path); ImmutableList.Builder<String> result = ImmutableList.builder(); for (FileStatus fileStatus : fileSystem.listStatus(path)) { result.add(fileStatus.getPath().getName()); }//w w w . ja va2 s . c o m return result.build(); }
From source file:com.facebook.presto.hive.HiveSplitSourceProvider.java
License:Apache License
/**
 * Lists {@code path} on {@code fs}, passing any {@link IOException} to
 * {@code Throwables.propagate} so that callers do not have to declare a
 * checked exception.
 *
 * @param fs   file system to query
 * @param path path to list
 * @return the statuses reported by {@code fs.listStatus(path)}
 */
private static FileStatus[] listStatus(FileSystem fs, Path path) {
    FileStatus[] listing;
    try {
        listing = fs.listStatus(path);
    } catch (IOException ioFailure) {
        throw Throwables.propagate(ioFailure);
    }
    return listing;
}