List of usage examples for org.apache.hadoop.fs.FileSystem#getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
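Before the project-specific examples below, here is a minimal self-contained sketch of the basic call pattern. It assumes only the standard Hadoop client API; the class name GetFileStatusSketch and the fallback path /tmp/example.txt are illustrative placeholders, not taken from any of the projects listed here.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusSketch {
    public static void main(String[] args) throws IOException {
        // Placeholder path; pass a real HDFS or local path as the first argument.
        Path p = new Path(args.length > 0 ? args[0] : "/tmp/example.txt");
        FileSystem fs = p.getFileSystem(new Configuration());
        // getFileStatus() throws FileNotFoundException (an IOException) if the path does not exist.
        FileStatus status = fs.getFileStatus(p);
        System.out.println(status.getPath() + " length=" + status.getLen()
                + " isDirectory=" + status.isDirectory()
                + " modificationTime=" + status.getModificationTime());
    }
}

Most of the examples that follow use the same pattern to read a single attribute, typically getLen() for the file size, or the directory flag before deciding whether to recurse.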
From source file:de.rwth.i9.palm.analytics.algorithm.lda.CustomVectorDumper.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    /**
     * Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
     *     abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
     *     "The Sequence File containing the Vectors").withShortName("s").create();
     * Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
     *     abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
     *     .withDescription("The directory containing Sequence File of Vectors")
     *     .withShortName("d").create();
     */
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c", "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
            + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude "
            + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
            + " conjunction with -sort", false);
    addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
            + " Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null));
    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
    } else {
        FileStatus[] inputPaths = fs.globStatus(input);
        pathArr = new Path[inputPaths.length];
        int i = 0;
        for (FileStatus fstatus : inputPaths) {
            pathArr[i++] = fstatus.getPath();
        }
    }

    String dictionaryType = getOption("dictionaryType", "text");
    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
        log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
        String dictFile = getOption("dictionary");
        if ("text".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
        } else if ("sequencefile".equals(dictionaryType)) {
            dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
        } else {
            // TODO: support Lucene's FST as a dictionary type
            throw new IOException("Invalid dictionary type: " + dictionaryType);
        }
    }

    Set<String> filters;
    if (hasOption("filter")) {
        filters = Sets.newHashSet(getOptions("filter"));
    } else {
        filters = null;
    }

    boolean useCSV = hasOption("csv");
    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    boolean transposeKeyValue = hasOption("vectorAsKey");

    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
        shouldClose = true;
        log.info("Output file: {}", output);
        Files.createParentDirs(output);
        writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }

    try {
        boolean printKey = hasOption("printKey");
        if (useCSV && dictionary != null) {
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
                writer.write(dictionary[j]);
                if (j < dictionary.length - 1) {
                    writer.write(',');
                }
            }
            writer.write('\n');
        }

        Long numItems = null;
        if (hasOption("numItems")) {
            numItems = Long.parseLong(getOption("numItems"));
            if (quiet) {
                writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
            }
        }

        int maxIndexesPerVector = hasOption("vectorSize") ? Integer.parseInt(getOption("vectorSize"))
                : Integer.MAX_VALUE;
        long itemCount = 0;
        int fileCount = 0;
        for (Path path : pathArr) {
            if (numItems != null && numItems <= itemCount) {
                break;
            }
            if (quiet) {
                log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
            }
            SequenceFileIterable<Writable, Writable> iterable =
                    new SequenceFileIterable<Writable, Writable>(path, true, conf);
            Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
            long i = 0;
            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
                Pair<Writable, Writable> record = iterator.next();
                Writable keyWritable = record.getFirst();
                Writable valueWritable = record.getSecond();
                if (printKey) {
                    Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
                    writer.write(notTheVectorWritable.toString());
                    writer.write('\t');
                }
                Vector vector;
                try {
                    vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
                } catch (ClassCastException e) {
                    if ((transposeKeyValue ? keyWritable : valueWritable) instanceof WeightedPropertyVectorWritable) {
                        vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable : valueWritable))
                                .getVector();
                    } else {
                        throw e;
                    }
                }
                if (filters != null && vector instanceof NamedVector
                        && !filters.contains(((NamedVector) vector).getName())) {
                    // we are filtering out this item, skip
                    continue;
                }
                if (sizeOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write(":");
                    } else {
                        writer.write(String.valueOf(i++));
                        writer.write(":");
                    }
                    writer.write(String.valueOf(vector.size()));
                    writer.write('\n');
                } else if (nameOnly) {
                    if (vector instanceof NamedVector) {
                        writer.write(((NamedVector) vector).getName());
                        writer.write('\n');
                    }
                } else {
                    String fmtStr;
                    if (useCSV) {
                        fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
                    } else {
                        fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector, sortVectors);
                    }
                    writer.write(fmtStr);
                    writer.write('\n');
                }
                itemCount++;
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }
    return 0;
}
From source file:de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.HdfsResource.java
License:Apache License
@SuppressWarnings("deprecation")
HdfsResource(Path path, FileSystem fs) {
    Assert.notNull(path, "a valid path is required");
    Assert.notNull(fs, "non null file system required");
    this.location = path.toString();
    this.fs = fs;
    this.path = path.makeQualified(fs);
    boolean exists = false;
    try {
        exists = fs.exists(path);
    } catch (final Exception ex) {
    }
    this.exists = exists;
    FileStatus status = null;
    try {
        status = fs.getFileStatus(path);
    } catch (final Exception ex) {
    }
    this.status = status;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileReader.java
License:Apache License
/**
 * Opens a file for reading. If the filename ends in `.gz`, it is automatically decompressed
 * on the fly.
 *
 * @param conf The Hadoop configuration.
 * @param filePath The Hadoop path to the file that should be read.
 * @throws IOException I/O exception
 */
public WARCFileReader(Configuration conf, Path filePath) throws IOException {
    FileSystem fs = filePath.getFileSystem(conf);
    this.fileSize = fs.getFileStatus(filePath).getLen();
    logger.info("Reading from " + filePath);
    CompressionCodec codec = filePath.getName().endsWith(".gz") ? WARCFileWriter.getGzipCodec(conf) : null;
    byteStream = new CountingInputStream(new BufferedInputStream(fs.open(filePath)));
    dataStream = new DataInputStream(codec == null ? byteStream : codec.createInputStream(byteStream));
}
From source file:edu.arizona.cs.hadoop.fs.irods.output.HirodsFileOutputCommitter.java
License:Apache License
private void moveTaskOutputsToIRODS(TaskAttemptContext context, FileSystem outfs, Path outDir,
        FileSystem workfs, Path workOutput) throws IOException {
    context.progress();
    if (workfs.isFile(workOutput)) {
        Path finalOutputPath = getFinalPath(outDir, workOutput, this.workPath);
        FSDataOutputStream irods_os = null;
        FSDataInputStream temp_is = null;
        try {
            // commit to iRODS
            irods_os = outfs.create(finalOutputPath, true);
            temp_is = workfs.open(workOutput);
            byte[] buffer = new byte[100 * 1024];
            int bytes_read = 0;
            while ((bytes_read = temp_is.read(buffer)) != -1) {
                irods_os.write(buffer, 0, bytes_read);
            }
        } finally {
            if (temp_is != null) {
                try {
                    temp_is.close();
                } catch (IOException ex) {
                    // ignore exceptions
                }
            }
            // remove temporary file
            try {
                workfs.delete(workOutput, true);
            } catch (IOException ex) {
                // ignore exceptions
            }
            if (irods_os != null) {
                irods_os.close();
            }
        }
        LOG.debug("Moved " + workOutput + " to " + finalOutputPath);
    } else if (workfs.getFileStatus(workOutput).isDir()) {
        FileStatus[] paths = workfs.listStatus(workOutput);
        Path finalOutputPath = getFinalPath(outDir, workOutput, this.workPath);
        outfs.mkdirs(finalOutputPath);
        if (paths != null) {
            for (FileStatus path : paths) {
                moveTaskOutputsToIRODS(context, outfs, outDir, workfs, path.getPath());
            }
        }
    }
}
From source file:edu.berkeley.chukwa_xtrace.TestXtrExtract.java
License:Apache License
public void testArchiving() throws Exception {
    System.out.println("starting archive test");
    Configuration conf = new Configuration();
    System.setProperty("hadoop.log.dir", System.getProperty("test.build.data", "/tmp"));
    MiniDFSCluster dfs = new MiniDFSCluster(conf, NUM_HADOOP_SLAVES, true, null);
    FileSystem fileSys = dfs.getFileSystem();
    fileSys.delete(OUTPUT_DIR, true); // nuke output dir

    writeASinkFile(conf, fileSys, INPUT_DIR, 1000);

    FileStatus fstat = fileSys.getFileStatus(INPUT_DIR);
    assertTrue(fstat.getLen() > 10);

    System.out.println("filesystem is " + fileSys.getUri());
    conf.set("fs.default.name", fileSys.getUri().toString());
    conf.setInt("io.sort.mb", 1);
    conf.setInt("io.sort.factor", 5);
    conf.setInt("mapred.tasktracker.map.tasks.maximum", 2);
    conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 2);

    MiniMRCluster mr = new MiniMRCluster(NUM_HADOOP_SLAVES, fileSys.getUri().toString(), 1);
    String[] archiveArgs = { INPUT_DIR.toString(), fileSys.getUri().toString() + OUTPUT_DIR.toString() };

    JobConf jc = mr.createJobConf(new JobConf(conf));
    assertEquals("true", jc.get("archive.groupByClusterName"));
    assertEquals(1, jc.getInt("io.sort.mb", 5));

    int returnVal = ToolRunner.run(jc, new XtrExtract(), archiveArgs);
    assertEquals(0, returnVal);

    fstat = fileSys.getFileStatus(new Path("/chukwa/archives/foocluster/HadoopLogProcessor_2008_05_29.arc"));
    assertTrue(fstat.getLen() > 10);

    Thread.sleep(1000);
    System.out.println("done!");
}
From source file:edu.brown.cs.mapreduce.BenchmarkBase.java
License:Open Source License
public JobConf getJobConf() {
    JobConf jobConf = new JobConf(this.conf, this.benchmarkClass);

    // Options
    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; i++) {
        try {
            // Print property and exit
            if ("-property".equals(args[i])) {
                String prop = jobConf.get(args[i + 1]);
                System.out.println(prop);
                System.exit(0);
            // # of Maps
            } else if ("-m".equals(args[i])) {
                this.num_of_maps = Integer.parseInt(args[++i]);
            // # of Reduces
            } else if ("-r".equals(args[i])) {
                this.num_of_reduces = Integer.parseInt(args[++i]);
            // Enable debug
            } else if ("-debug".equals(args[i])) {
                this.debug = true;
            // Enable single output file for results
            } else if ("-combine".equals(args[i])) {
                this.combine = true;
            // Tell jobs to compress their intermediate output files
            } else if ("-compress".equals(args[i])) {
                this.compress = true;
            // We're using TupleWritable (which has to be in a SequenceFile)
            } else if ("-tuple".equals(args[i])) {
                this.tuple_data = true;
                this.sequence_file = true;
            // Use SequenceFiles for initial input
            } else if ("-sequence".equals(args[i])) {
                this.sequence_file = true;
            // Recursively load directories
            } else if ("-recursive-dirs".equals(args[i])) {
                this.load_directories = true;
            // Job Basename
            } else if ("-basename".equals(args[i])) {
                this.job_name = args[++i];
            // Misc. Properties
            } else if ("-D".equals(args[i].substring(0, 2))) {
                String arg = args[i].substring(2);
                int pos = arg.indexOf('=');
                if (pos == -1) {
                    System.err.println("ERROR: Invalid properties option '" + arg + "'");
                    System.exit(1);
                }
                this.options.put(arg.substring(0, pos), arg.substring(pos + 1));
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.err.println("ERROR: Integer expected instead of " + args[i]);
            System.exit(1);
        } catch (ArrayIndexOutOfBoundsException except) {
            System.err.println("ERROR: Required parameter missing from " + args[i - 1]);
            System.exit(1);
        }
    } // FOR

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() < 2) {
        System.err.println("ERROR: Wrong number of parameters: " + otherArgs.size());
        System.exit(1);
    }

    // Set these flags so the jobs know about them
    if (this.getSequenceFile())
        this.options.put(PROPERTY_SEQUENCEFILE, "true");
    if (this.getTupleData())
        this.options.put(PROPERTY_TUPLEDATA, "true");
    if (this.getDebug())
        this.options.put(PROPERTY_DEBUG, "true");

    FileSystem fs = null;
    try {
        fs = FileSystem.get(conf);
    } catch (Exception ex) {
        ex.printStackTrace();
        System.exit(-1);
    }

    // Input Paths
    int cnt = otherArgs.size() - 1;
    this.input_paths = new ArrayList<Path>();
    for (int ctr = 0; ctr < cnt; ctr++) {
        Path new_path = new Path(otherArgs.get(ctr));
        try {
            if (this.load_directories && fs.getFileStatus(new_path).isDir()) {
                //int limit = 10;
                FileStatus paths[] = fs.listStatus(new_path);
                for (FileStatus p : paths) {
                    this.input_paths.add(p.getPath());
                    FileInputFormat.addInputPath(jobConf, p.getPath());
                    //if (limit-- <= 0) break;
                } // FOR
            } else {
                this.input_paths.add(new_path);
                FileInputFormat.addInputPath(jobConf, new_path);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
            System.exit(-1);
        }
    } // FOR
    if (this.input_paths.isEmpty()) {
        System.err.println("ERROR: No input paths were defined for '" + this.benchmarkClass.getSimpleName() + "'");
        System.exit(-1);
    }

    // Output Paths
    this.output_path = new Path(otherArgs.get(otherArgs.size() - 1));
    FileOutputFormat.setOutputPath(jobConf, this.output_path);

    jobConf.setJobName(this.job_name != null ? this.job_name : this.benchmarkClass.getSimpleName());
    if (this.num_of_maps >= 0)
        jobConf.setNumMapTasks(this.num_of_maps);
    if (this.num_of_reduces >= 0)
        jobConf.setNumReduceTasks(this.num_of_reduces);

    // Set all properties
    for (String key : this.options.keySet()) {
        jobConf.set(key, this.options.get(key));
    }

    return (jobConf);
}
From source file:edu.cmu.graphchi.toolkits.collaborative_filtering.yarn.ApplicationMaster.java
License:Apache License
public static LocalResource addLocalResource(String filePath) throws Exception {
    File file = new File(filePath);
    String fullFilePath = file.getAbsolutePath();

    FileSystem fs = FileSystem.get(IO.getConf());
    Path src = new Path(fullFilePath);
    String pathSuffix = "local-tmp/" + file.getName();
    Path dst = new Path(fs.getHomeDirectory(), pathSuffix);
    fs.copyFromLocalFile(false, true, src, dst);
    FileStatus destStatus = fs.getFileStatus(dst);

    LocalResource resource = Records.newRecord(LocalResource.class);
    resource.setType(LocalResourceType.FILE);
    resource.setVisibility(LocalResourceVisibility.APPLICATION);
    resource.setResource(ConverterUtils.getYarnUrlFromPath(dst));
    resource.setTimestamp(destStatus.getModificationTime());
    resource.setSize(destStatus.getLen());
    return resource;
}
From source file:edu.ecnu.idse.TrajStore.util.FileUtil.java
License:Open Source License
/**
 * Copies a part of a file from a remote file system (e.g., HDFS) to a local
 * file. Returns a path to a local temporary file.
 *
 * @param conf
 * @param split
 * @return
 * @throws IOException
 */
public static String copyFileSplit(Configuration conf, FileSplit split) throws IOException {
    FileSystem fs = split.getPath().getFileSystem(conf);

    // Special case of a local file. Skip copying the file
    if (fs instanceof LocalFileSystem && split.getStart() == 0)
        return split.getPath().toUri().getPath();

    // Length of input file. We do not depend on split.length because it is not
    // set by input format for performance reason. Setting it in the input
    // format would cost a lot of time because it runs on the client machine
    // while the record reader runs on slave nodes in parallel
    long length = fs.getFileStatus(split.getPath()).getLen();

    FSDataInputStream in = fs.open(split.getPath());
    in.seek(split.getStart());
    ReadableByteChannel rbc = Channels.newChannel(in);

    // Prepare output file for write
    File tempFile = File.createTempFile(split.getPath().getName(), "tmp");
    FileOutputStream out = new FileOutputStream(tempFile);

    out.getChannel().transferFrom(rbc, 0, length);

    rbc.close();
    out.close();
    return tempFile.getAbsolutePath();
}
From source file:edu.ecnu.idse.TrajStore.util.FileUtil.java
License:Open Source License
/**
 * Copies a file to the local file system given its path.
 *
 * @param conf
 * @param inFile
 * @return
 * @throws IOException
 */
public static String copyFile(Configuration conf, Path inFile) throws IOException {
    FileSystem fs = inFile.getFileSystem(conf);
    return copyFile(conf, fs.getFileStatus(inFile));
}
From source file:edu.ecnu.idse.TrajStore.util.FileUtil.java
License:Open Source License
/**
 * Get the actual size of all data in the given directory. If the input is
 * a single file, its size is returned immediately. If the input is a
 * directory, we return the total size of all data in that directory.
 * If there is a global index, the size is retrieved from that global index.
 * Otherwise, we add up all the sizes of single files.
 *
 * @param fs - the file system that contains the path
 * @param path - the path that contains the data
 * @return
 * @throws IOException
 */
public static long getPathSize(FileSystem fs, Path path) throws IOException {
    FileStatus fileStatus = fs.getFileStatus(path);
    // 1- Check if the path points to a file
    if (!fileStatus.isDir())
        return fileStatus.getLen();
    // 2- Check if the input is indexed and get the cached size
    GlobalIndex<Partition> gIndex = SpatialTemporalSite.getGlobalIndex(fs, path);
    if (gIndex != null) {
        long totalSize = 0;
        for (Partition partition : gIndex)
            totalSize += partition.size;
        return totalSize;
    }
    // 3- Get the total size of all non-hidden files
    long totalSize = 0;
    FileStatus[] allFiles = fs.listStatus(path, SpatialTemporalSite.NonHiddenFileFilter);
    for (FileStatus subFile : allFiles) {
        if (!subFile.isDir())
            totalSize += subFile.getLen();
    }
    return totalSize;
}