List of usage examples for org.apache.hadoop.fs.FileSystem#getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
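Before the project-specific examples below, here is a minimal self-contained sketch of the basic call pattern. It assumes only the standard Hadoop client API; the class name GetFileStatusSketch and the fallback path /tmp/example.txt are illustrative placeholders, not taken from any of the projects listed here.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusSketch {
    public static void main(String[] args) throws IOException {
        // Placeholder path; pass a real HDFS or local path as the first argument.
        Path p = new Path(args.length > 0 ? args[0] : "/tmp/example.txt");
        FileSystem fs = p.getFileSystem(new Configuration());
        // getFileStatus() throws FileNotFoundException (an IOException) if the path does not exist.
        FileStatus status = fs.getFileStatus(p);
        System.out.println(status.getPath() + " length=" + status.getLen()
                + " isDirectory=" + status.isDirectory()
                + " modificationTime=" + status.getModificationTime());
    }
}

Most of the examples that follow use the same pattern to read a single attribute, typically getLen() for the file size, or the directory flag before deciding whether to recurse.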
From source file:de.rwth.i9.palm.analytics.algorithm.lda.CustomVectorDumper.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    /**
     * Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
     *     abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
     *     "The Sequence File containing the Vectors").withShortName("s").create();
     * Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
     *     abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
     *     .withDescription("The directory containing Sequence File of Vectors")
     *     .withShortName("d").create();
     */
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c", "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
            + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude "
            + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
            + " conjunction with -sort", false);
    addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
            + " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null));
    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
    } else {
        FileStatus[] inputPaths = fs.globStatus(input);
        pathArr = new Path[inputPaths.length];
        int i = 0;
        for (FileStatus fstatus : inputPaths) {
            pathArr[i++] = fstatus.getPath();
        }
    }

    String dictionaryType = getOption("dictionaryType", "text");
    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
        log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
        String dictFile = getOption("dictionary");
        if ("text".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
        } else if ("sequencefile".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
        } else {
            // TODO: support Lucene's FST as a dictionary type
            throw new IOException("Invalid dictionary type: " + dictionaryType);
        }
    }

    Set<String> filters;
    if (hasOption("filter")) {
        filters = Sets.newHashSet(getOptions("filter"));
    } else {
        filters = null;
    }

    boolean useCSV = hasOption("csv");
    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    boolean transposeKeyValue = hasOption("vectorAsKey");

    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
        shouldClose = true;
        log.info("Output file: {}", output);
        Files.createParentDirs(output);
        writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }

    try {
        boolean printKey = hasOption("printKey");
        if (useCSV && dictionary != null) {
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
                writer.write(dictionary[j]);
                if (j < dictionary.length - 1) {
                    writer.write(',');
                }
            }
            writer.write('\n');
        }

        Long numItems = null;
        if (hasOption("numItems")) {
            numItems = Long.parseLong(getOption("numItems"));
            if (quiet) {
                writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
            }
        }

        int maxIndexesPerVector = hasOption("vectorSize") ? Integer.parseInt(getOption("vectorSize"))
                : Integer.MAX_VALUE;
        long itemCount = 0;
        int fileCount = 0;
        for (Path path : pathArr) {
            if (numItems != null && numItems <= itemCount) {
                break;
            }
            if (quiet) {
                log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
            }
            SequenceFileIterable<Writable, Writable> iterable =
                    new SequenceFileIterable<Writable, Writable>(path, true, conf);
            Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
            long i = 0;
            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
                Pair<Writable, Writable> record = iterator.next();
                Writable keyWritable = record.getFirst();
                Writable valueWritable = record.getSecond();
                if (printKey) {
                    Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
                    writer.write(notTheVectorWritable.toString());
                    writer.write('\t');
                }
                Vector vector;
                try {
                    vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
                } catch (ClassCastException e) {
                    if ((transposeKeyValue ? keyWritable : valueWritable) instanceof WeightedPropertyVectorWritable) {
                        vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable : valueWritable))
                                .getVector();
                    } else {
                        throw e;
                    }
                }
                if (filters != null && vector instanceof NamedVector
                        && !filters.contains(((NamedVector) vector).getName())) {
                    // we are filtering out this item, skip
                    continue;
                }
                if (sizeOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write(":");
                    } else {
                        writer.write(String.valueOf(i++));
                        writer.write(":");
                    }
                    writer.write(String.valueOf(vector.size()));
                    writer.write('\n');
                } else if (nameOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write('\n');
                    }
                } else {
                    String fmtStr;
                    if (useCSV) {
                        fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
                    } else {
                        fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, sortVectors);
                    }
                    writer.write(fmtStr);
                    writer.write('\n');
                }
                itemCount++;
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }
    return 0;
}
From source file:de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.HdfsResource.java
License:Apache License
@SuppressWarnings("deprecation")
HdfsResource(Path path, FileSystem fs) {
    Assert.notNull(path, "a valid path is required");
    Assert.notNull(fs, "non null file system required");
    this.location = path.toString();
    this.fs = fs;
    this.path = path.makeQualified(fs);
    boolean exists = false;
    try {
        exists = fs.exists(path);
    } catch (final Exception ex) {
    }
    this.exists = exists;
    FileStatus status = null;
    try {
        status = fs.getFileStatus(path);
    } catch (final Exception ex) {
    }
    this.status = status;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileReader.java
License:Apache License
/**
 * Opens a file for reading. If the filename ends in `.gz`, it is automatically decompressed
 * on the fly.
 *
 * @param conf The Hadoop configuration.
 * @param filePath The Hadoop path to the file that should be read.
 * @throws IOException I/O exception
 */
public WARCFileReader(Configuration conf, Path filePath) throws IOException {
    FileSystem fs = filePath.getFileSystem(conf);
    this.fileSize = fs.getFileStatus(filePath).getLen();
    logger.info("Reading from " + filePath);
    CompressionCodec codec = filePath.getName().endsWith(".gz") ? WARCFileWriter.getGzipCodec(conf) : null;
    byteStream = new CountingInputStream(new BufferedInputStream(fs.open(filePath)));
    dataStream = new DataInputStream(codec == null ? byteStream : codec.createInputStream(byteStream));
}
From source file:edu.arizona.cs.hadoop.fs.irods.output.HirodsFileOutputCommitter.java
License:Apache License
private void moveTaskOutputsToIRODS(TaskAttemptContext context, FileSystem outfs, Path outDir,
        FileSystem workfs, Path workOutput) throws IOException {
    context.progress();
    if (workfs.isFile(workOutput)) {
        Path finalOutputPath = getFinalPath(outDir, workOutput, this.workPath);
        FSDataOutputStream irods_os = null;
        FSDataInputStream temp_is = null;
        try {
            // commit to iRODS
            irods_os = outfs.create(finalOutputPath, true);
            temp_is = workfs.open(workOutput);
            byte[] buffer = new byte[100 * 1024];
            int bytes_read = 0;
            while ((bytes_read = temp_is.read(buffer)) != -1) {
                irods_os.write(buffer, 0, bytes_read);
            }
        } finally {
            if (temp_is != null) {
                try {
                    temp_is.close();
                } catch (IOException ex) {
                    // ignore exceptions
                }
            }
            // remove temporary file
            try {
                workfs.delete(workOutput, true);
            } catch (IOException ex) {
                // ignore exceptions
            }
            if (irods_os != null) {
                irods_os.close();
            }
        }
        LOG.debug("Moved " + workOutput + " to " + finalOutputPath);
    } else if (workfs.getFileStatus(workOutput).isDir()) {
        FileStatus[] paths = workfs.listStatus(workOutput);
        Path finalOutputPath = getFinalPath(outDir, workOutput, this.workPath);
        outfs.mkdirs(finalOutputPath);
        if (paths != null) {
            for (FileStatus path : paths) {
                moveTaskOutputsToIRODS(context, outfs, outDir, workfs, path.getPath());
            }
        }
    }
}
From source file:edu.berkeley.chukwa_xtrace.TestXtrExtract.java
License:Apache License
public void testArchiving() throws Exception {
    System.out.println("starting archive test");
    Configuration conf = new Configuration();
    System.setProperty("hadoop.log.dir", System.getProperty("test.build.data", "/tmp"));
    MiniDFSCluster dfs = new MiniDFSCluster(conf, NUM_HADOOP_SLAVES, true, null);
    FileSystem fileSys = dfs.getFileSystem();
    fileSys.delete(OUTPUT_DIR, true); // nuke output dir

    writeASinkFile(conf, fileSys, INPUT_DIR, 1000);

    FileStatus fstat = fileSys.getFileStatus(INPUT_DIR);
    assertTrue(fstat.getLen() > 10);

    System.out.println("filesystem is " + fileSys.getUri());
    conf.set("fs.default.name", fileSys.getUri().toString());
    conf.setInt("io.sort.mb", 1);
    conf.setInt("io.sort.factor", 5);
    conf.setInt("mapred.tasktracker.map.tasks.maximum", 2);
    conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 2);

    MiniMRCluster mr = new MiniMRCluster(NUM_HADOOP_SLAVES, fileSys.getUri().toString(), 1);
    String[] archiveArgs = { INPUT_DIR.toString(), fileSys.getUri().toString() + OUTPUT_DIR.toString() };

    JobConf jc = mr.createJobConf(new JobConf(conf));
    assertEquals("true", jc.get("archive.groupByClusterName"));
    assertEquals(1, jc.getInt("io.sort.mb", 5));

    int returnVal = ToolRunner.run(jc, new XtrExtract(), archiveArgs);
    assertEquals(0, returnVal);

    fstat = fileSys.getFileStatus(new Path("/chukwa/archives/foocluster/HadoopLogProcessor_2008_05_29.arc"));
    assertTrue(fstat.getLen() > 10);

    Thread.sleep(1000);
    System.out.println("done!");
}
From source file:edu.brown.cs.mapreduce.BenchmarkBase.java
License:Open Source License
public JobConf getJobConf() {
    JobConf jobConf = new JobConf(this.conf, this.benchmarkClass);

    // Options
    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; i++) {
        try {
            // Print property and exit
            if ("-property".equals(args[i])) {
                String prop = jobConf.get(args[i + 1]);
                System.out.println(prop);
                System.exit(0);
            // # of Maps
            } else if ("-m".equals(args[i])) {
                this.num_of_maps = Integer.parseInt(args[++i]);
            // # of Reduces
            } else if ("-r".equals(args[i])) {
                this.num_of_reduces = Integer.parseInt(args[++i]);
            // Enable debug
            } else if ("-debug".equals(args[i])) {
                this.debug = true;
            // Enable single output file for results
            } else if ("-combine".equals(args[i])) {
                this.combine = true;
            // Tell jobs to compress their intermediate output files
            } else if ("-compress".equals(args[i])) {
                this.compress = true;
            // We're using TupleWritable (which has to be in a SequenceFile)
            } else if ("-tuple".equals(args[i])) {
                this.tuple_data = true;
                this.sequence_file = true;
            // Use SequenceFiles for initial input
            } else if ("-sequence".equals(args[i])) {
                this.sequence_file = true;
            // Recursively load directories
            } else if ("-recursive-dirs".equals(args[i])) {
                this.load_directories = true;
            // Job Basename
            } else if ("-basename".equals(args[i])) {
                this.job_name = args[++i];
            // Misc. Properties
            } else if ("-D".equals(args[i].substring(0, 2))) {
                String arg = args[i].substring(2);
                int pos = arg.indexOf('=');
                if (pos == -1) {
                    System.err.println("ERROR: Invalid properties option '" + arg + "'");
                    System.exit(1);
                }
                this.options.put(arg.substring(0, pos), arg.substring(pos + 1));
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.err.println("ERROR: Integer expected instead of " + args[i]);
            System.exit(1);
        } catch (ArrayIndexOutOfBoundsException except) {
            System.err.println("ERROR: Required parameter missing from " + args[i - 1]);
            System.exit(1);
        }
    } // FOR

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() < 2) {
        System.err.println("ERROR: Wrong number of parameters: " + otherArgs.size());
        System.exit(1);
    }

    // Set these flags so the jobs know about them
    if (this.getSequenceFile())
        this.options.put(PROPERTY_SEQUENCEFILE, "true");
    if (this.getTupleData())
        this.options.put(PROPERTY_TUPLEDATA, "true");
    if (this.getDebug())
        this.options.put(PROPERTY_DEBUG, "true");

    FileSystem fs = null;
    try {
        fs = FileSystem.get(conf);
    } catch (Exception ex) {
        ex.printStackTrace();
        System.exit(-1);
    }

    // Input Paths
    int cnt = otherArgs.size() - 1;
    this.input_paths = new ArrayList<Path>();
    for (int ctr = 0; ctr < cnt; ctr++) {
        Path new_path = new Path(otherArgs.get(ctr));
        try {
            if (this.load_directories && fs.getFileStatus(new_path).isDir()) {
                //int limit = 10;
                FileStatus paths[] = fs.listStatus(new_path);
                for (FileStatus p : paths) {
                    this.input_paths.add(p.getPath());
                    FileInputFormat.addInputPath(jobConf, p.getPath());
                    //if (limit-- <= 0) break;
                } // FOR
            } else {
                this.input_paths.add(new_path);
                FileInputFormat.addInputPath(jobConf, new_path);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
            System.exit(-1);
        }
    } // FOR
    if (this.input_paths.isEmpty()) {
        System.err.println("ERROR: No input paths were defined for '" + this.benchmarkClass.getSimpleName() + "'");
        System.exit(-1);
    }

    // Output Paths
    this.output_path = new Path(otherArgs.get(otherArgs.size() - 1));
    FileOutputFormat.setOutputPath(jobConf, this.output_path);

    jobConf.setJobName(this.job_name != null ? this.job_name : this.benchmarkClass.getSimpleName());
    if (this.num_of_maps >= 0)
        jobConf.setNumMapTasks(this.num_of_maps);
    if (this.num_of_reduces >= 0)
        jobConf.setNumReduceTasks(this.num_of_reduces);

    // Set all properties
    for (String key : this.options.keySet()) {
        jobConf.set(key, this.options.get(key));
    }

    return (jobConf);
}
From source file:edu.cmu.graphchi.toolkits.collaborative_filtering.yarn.ApplicationMaster.java
License:Apache License
public static LocalResource addLocalResource(String filePath) throws Exception {
    File file = new File(filePath);
    String fullFilePath = file.getAbsolutePath();

    FileSystem fs = FileSystem.get(IO.getConf());
    Path src = new Path(fullFilePath);
    String pathSuffix = "local-tmp/" + file.getName();
    Path dst = new Path(fs.getHomeDirectory(), pathSuffix);
    fs.copyFromLocalFile(false, true, src, dst);
    FileStatus destStatus = fs.getFileStatus(dst);

    LocalResource resource = Records.newRecord(LocalResource.class);
    resource.setType(LocalResourceType.FILE);
    resource.setVisibility(LocalResourceVisibility.APPLICATION);
    resource.setResource(ConverterUtils.getYarnUrlFromPath(dst));
    resource.setTimestamp(destStatus.getModificationTime());
    resource.setSize(destStatus.getLen());
    return resource;
}
From source file:edu.ecnu.idse.TrajStore.util.FileUtil.java
License:Open Source License
/**
 * Copies a part of a file from a remote file system (e.g., HDFS) to a local
 * file. Returns a path to a local temporary file.
 *
 * @param conf
 * @param split
 * @return
 * @throws IOException
 */
public static String copyFileSplit(Configuration conf, FileSplit split) throws IOException {
    FileSystem fs = split.getPath().getFileSystem(conf);

    // Special case of a local file. Skip copying the file
    if (fs instanceof LocalFileSystem && split.getStart() == 0)
        return split.getPath().toUri().getPath();

    // Length of input file. We do not depend on split.length because it is not
    // set by input format for performance reason. Setting it in the input
    // format would cost a lot of time because it runs on the client machine
    // while the record reader runs on slave nodes in parallel
    long length = fs.getFileStatus(split.getPath()).getLen();

    FSDataInputStream in = fs.open(split.getPath());
    in.seek(split.getStart());
    ReadableByteChannel rbc = Channels.newChannel(in);

    // Prepare output file for write
    File tempFile = File.createTempFile(split.getPath().getName(), "tmp");
    FileOutputStream out = new FileOutputStream(tempFile);

    out.getChannel().transferFrom(rbc, 0, length);

    rbc.close();
    out.close();
    return tempFile.getAbsolutePath();
}
From source file:edu.ecnu.idse.TrajStore.util.FileUtil.java
License:Open Source License
/**
 * Copies a file to the local file system given its path.
 *
 * @param conf
 * @param inFile
 * @return
 * @throws IOException
 */
public static String copyFile(Configuration conf, Path inFile) throws IOException {
    FileSystem fs = inFile.getFileSystem(conf);
    return copyFile(conf, fs.getFileStatus(inFile));
}
From source file:edu.ecnu.idse.TrajStore.util.FileUtil.java
License:Open Source License
/**
 * Get the actual size of all data in the given directory. If the input is
 * a single file, its size is returned immediately. If the input is a
 * directory, we return the total size of all data in that directory.
 * If there is a global index, the size is retrieved from that global index.
 * Otherwise, we add up all the sizes of single files.
 *
 * @param fs - the file system that contains the path
 * @param path - the path that contains the data
 * @return
 * @throws IOException
 */
public static long getPathSize(FileSystem fs, Path path) throws IOException {
    FileStatus fileStatus = fs.getFileStatus(path);
    // 1- Check if the path points to a file
    if (!fileStatus.isDir())
        return fileStatus.getLen();
    // 2- Check if the input is indexed and get the cached size
    GlobalIndex<Partition> gIndex = SpatialTemporalSite.getGlobalIndex(fs, path);
    if (gIndex != null) {
        long totalSize = 0;
        for (Partition partition : gIndex)
            totalSize += partition.size;
        return totalSize;
    }
    // 3- Get the total size of all non-hidden files
    long totalSize = 0;
    FileStatus[] allFiles = fs.listStatus(path, SpatialTemporalSite.NonHiddenFileFilter);
    for (FileStatus subFile : allFiles) {
        if (!subFile.isDir())
            totalSize += subFile.getLen();
    }
    return totalSize;
}