Example usage for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem#getFileStatus, drawn from open source projects.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
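
Before the longer examples below, here is a minimal, self-contained sketch of calling getFileStatus directly; the path is hypothetical. Note that getFileStatus throws FileNotFoundException when the path does not exist, so callers typically either probe with exists() first or catch the exception.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical path; substitute a file that exists on your cluster.
        Path path = new Path("/tmp/example.txt");
        FileSystem fs = path.getFileSystem(conf);

        // Throws FileNotFoundException if the path does not exist.
        FileStatus status = fs.getFileStatus(path);

        System.out.println("Length: " + status.getLen());
        System.out.println("Directory: " + status.isDir());
        System.out.println("Modification time: " + status.getModificationTime());
    }
}

Newer Hadoop releases deprecate isDir() in favor of isDirectory(); the examples on this page use the older accessor.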

Usage

From source file:com.kylinolap.job.hadoop.cube.CopySeq.java

License:Apache License

public static void copyTo64MB(String src, String dst) throws IOException {
    Configuration hconf = new Configuration();
    Path srcPath = new Path(src);
    Path dstPath = new Path(dst);

    FileSystem fs = FileSystem.get(hconf);
    long srcSize = fs.getFileStatus(srcPath).getLen();
    int copyTimes = (int) (67108864 / srcSize); // number of copies needed to reach 64 MB
    System.out.println("Copy " + copyTimes + " times");

    Reader reader = new Reader(hconf, SequenceFile.Reader.file(srcPath));
    Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), hconf);
    Text value = new Text();

    Writer writer = SequenceFile.createWriter(hconf, Writer.file(dstPath), Writer.keyClass(key.getClass()),
            Writer.valueClass(Text.class), Writer.compression(CompressionType.BLOCK, getLZOCodec(hconf)));

    int count = 0;
    while (reader.next(key, value)) {
        for (int i = 0; i < copyTimes; i++) {
            writer.append(key, value);
            count++;
        }
    }

    System.out.println("Len: " + writer.getLength());
    System.out.println("Rows: " + count);

    reader.close();
    writer.close();
}

From source file:com.liferay.hadoop.store.HDFSStore.java

License:Open Source License

@Override
public long getFileSize(long companyId, long repositoryId, String fileName)
        throws PortalException, SystemException {

    Path fullPath = HadoopManager.getFullVersionFilePath(companyId, repositoryId, fileName, VERSION_DEFAULT);

    try {
        FileSystem fileSystem = HadoopManager.getFileSystem();

        if (!fileSystem.exists(fullPath)) {
            throw new PortalException("File " + fullPath.toUri().toString() + " does not exist");
        }

        FileStatus fileStatus = fileSystem.getFileStatus(fullPath);

        return fileStatus.getLen();
    } catch (IOException ioe) {
        throw new SystemException(ioe);
    }
}

From source file:com.lightboxtechnologies.spectrum.HDFSArchiver.java

License:Apache License

protected static void traverse(FileSystem fs, Path p, ZipOutputStream zout, byte[] buf) throws IOException {
    final String relpath = relativize(p);

    final FileStatus pstat = fs.getFileStatus(p);
    if (pstat.isDir()) {
        handleDirectory(relpath, fs, p, zout, buf);
    } else {
        handleFile(relpath, fs, p, zout, buf);
    }
}

From source file:com.linkedin.cubert.io.rubix.RubixFile.java

License:Open Source License

@SuppressWarnings("unchecked")
public List<KeyData<K>> getKeyData() throws IOException, ClassNotFoundException {
    final FileSystem fs = FileSystem.get(conf);
    keyData = new ArrayList<KeyData<K>>();

    final long filesize = fs.getFileStatus(path).getLen();
    FSDataInputStream in = fs.open(path);

    /* The last long in the file is the start position of the trailer section */
    in.seek(filesize - 8);
    long metaDataStartPos = in.readLong();

    in.seek(metaDataStartPos);

    ObjectMapper mapper = new ObjectMapper();
    metadataJson = mapper.readValue(in.readUTF(), JsonNode.class);

    int keySectionSize = in.readInt();

    // load the key section
    byte[] keySection = new byte[keySectionSize];

    in.seek(filesize - keySectionSize - 8);
    in.readFully(keySection, 0, keySectionSize); // readFully avoids a short read
    in.close();

    ByteArrayInputStream bis = new ByteArrayInputStream(keySection);
    DataInput dataInput = new DataInputStream(bis);

    int numberOfBlocks = metadataJson.get("numberOfBlocks").getIntValue();

    // load the key section
    keyClass = (Class<K>) ClassCache.forName(JsonUtils.getText(metadataJson, "keyClass"));
    valueClass = (Class<V>) ClassCache.forName(JsonUtils.getText(metadataJson, "valueClass"));

    SerializationFactory serializationFactory = new SerializationFactory(conf);
    Deserializer<K> deserializer = serializationFactory.getDeserializer(keyClass);

    deserializer.open(bis);

    while (bis.available() > 0 && numberOfBlocks > 0) {
        K key = deserializer.deserialize(null);

        long offset = dataInput.readLong();
        long blockId = dataInput.readLong();
        long numRecords = dataInput.readLong();

        keyData.add(new KeyData<K>(key, offset, 0, numRecords, blockId));
        numberOfBlocks--;
    }

    // Assign length to each keydata entry
    int numEntries = keyData.size();
    for (int i = 1; i < numEntries; i++) {
        KeyData<K> prev = keyData.get(i - 1);
        KeyData<K> current = keyData.get(i);

        prev.setLength(current.getOffset() - prev.getOffset());
    }

    if (numEntries > 0) {
        KeyData<K> last = keyData.get(numEntries - 1);
        last.setLength(metaDataStartPos - last.offset);
    }

    return keyData;
}

From source file:com.linkedin.cubert.io.rubix.RubixFile.java

License:Open Source License

public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException,
        ParseException, InstantiationException, IllegalAccessException {
    final int VERBOSE_NUM_ROWS = 4;

    Options options = new Options();

    options.addOption("h", "help", false, "shows this message");
    options.addOption("v", "verbose", false, "print summary and first few rows of each block");
    options.addOption("m", "metadata", false, "show the metadata");
    options.addOption("d", "dump", false,
            "dump the contents of the rubix file. Use -f for specifying format, and -o for specifying output location");
    options.addOption("f", "format", true, "the data format for dumping data (AVRO or TEXT). Default: TEXT");
    options.addOption("e", "extract", true,
            "Extract one rubix block matching the block id. Use -o for specifying output location");
    options.addOption("o", true, "Store the output at the specified location");

    CommandLineParser parser = new BasicParser();

    // parse the command line arguments
    CommandLine line = parser.parse(options, args);

    // show the help message
    if (line.hasOption("h")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(
                "RubixFile <rubix file or dir> [options]\nIf no options are provided, print a summary of the blocks.",
                options);
        return;
    }

    // validate provided options
    if (line.hasOption("d") && line.hasOption("e")) {
        System.err.println("Cannot dump (-d) and extract (-e) at the same time!");
        return;
    }

    // obtain the list of rubix files
    String[] files = line.getArgs();
    if (files == null || files.length == 0) {
        System.err.println("Rubix file not specified");
        return;
    }

    Configuration conf = new JobConf();
    FileSystem fs = FileSystem.get(conf);

    Path path = new Path(files[0]);
    FileStatus[] allFiles;

    FileStatus status = fs.getFileStatus(path);
    if (status.isDir()) {
        allFiles = fs.listStatus(path, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().contains(RubixConstants.RUBIX_EXTENSION);
            }

        });
    } else {
        allFiles = new FileStatus[] { status };
    }

    // walk over all files and extract the trailer section
    List<RubixFile<Tuple, Object>> rfiles = new ArrayList<RubixFile<Tuple, Object>>();

    for (FileStatus s : allFiles) {
        Path p = s.getPath();

        RubixFile<Tuple, Object> rfile = new RubixFile<Tuple, Object>(conf, p);

        // if printing metadata, exit after the first file (since all files
        // have the same metadata)
        if (line.hasOption("m")) {
            rfile.getKeyData();

            System.out.println(new ObjectMapper().writer().writeValueAsString(rfile.metadataJson));
            break;
        }

        rfiles.add(rfile);
    }

    // dump the data
    if (line.hasOption("d")) {
        String format = line.getOptionValue("f");
        if (format == null)
            format = "TEXT";

        format = format.trim().toUpperCase();

        if (format.equals("AVRO")) {
            // dumpAvro(rfiles, line.getOptionValue("o"));
            throw new UnsupportedOperationException(
                    "Dumping to avro is not currently supporting. Please write a Cubert (map-only) script to store data in avro format");
        } else if (format.equals("TEXT")) {
            if (line.hasOption("o")) {
                System.err.println("Dumping TEXT format data *into a file* is not currently supported");
                return;
            }
            dumpText(rfiles, line.getOptionValue("o"), Integer.MAX_VALUE);
        } else {
            System.err.println("Invalid format [" + format + "] for dumping. Please use AVRO or TEXT");
            return;
        }
    } else if (line.hasOption("e")) // extract one rubix block
    {
        long blockId = Long.parseLong(line.getOptionValue("e"));
        extract(rfiles, blockId, line.getOptionValue("o"));
    } else
    // print summary
    {
        dumpText(rfiles, null, line.hasOption("v") ? VERBOSE_NUM_ROWS : 0);
    }
}

From source file:com.linkedin.cubert.io.rubix.RubixInputSplit.java

License:Open Source License

@Override
public String[] getLocations() throws IOException, InterruptedException {
    if (hostnames == null) {
        /* Obtain the FileSystem object and get the FileStatus objects for the split */
        FileSystem fileSystem = FileSystem.get(conf);
        FileStatus fileStatus = fileSystem.getFileStatus(filename);
        /*
         * Obtain the Block locations for the split. This also provides the offset and
         * length information for each block
         */
        final BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(fileStatus, offset, length);
        /**
         * Collect all hosts in a map and populate the number of bytes to be read from
         * each host
         */
        Long l;
        Map<String, Long> hostMap = new HashMap<String, Long>();
        for (BlockLocation bl : blockLocations) {
            final long start = bl.getOffset() < offset ? offset : bl.getOffset();
            final long end = (offset + length) < (bl.getOffset() + bl.getLength()) ? offset + length
                    : bl.getOffset() + bl.getLength();
            final long nRelevantBytes = end - start;
            for (String host : bl.getHosts()) {
                hostMap.put(host, ((l = hostMap.get(host)) == null ? 0 : l) + nRelevantBytes);
            }
        }
        /* Sort them in decreasing order of maximum number of relevant bytes */
        final Set<Map.Entry<String, Long>> entries = hostMap.entrySet();
        final Map.Entry<String, Long>[] hostLengthPairs = entries.toArray(new Map.Entry[entries.size()]);

        Arrays.sort(hostLengthPairs, new Comparator<Map.Entry<String, Long>>() {
            @Override
            public int compare(Map.Entry<String, Long> e1, Map.Entry<String, Long> e2) {
                return e2.getValue().compareTo(e1.getValue()); // subtracting longs and casting to int could overflow
            }
        });

        /* Populate the hostnames object */
        final int nHost = Math.min(hostLengthPairs.length, MAX_LOCATIONS);
        hostnames = new String[nHost];
        for (int i = 0; i < nHost; ++i) {
            hostnames[i] = hostLengthPairs[i].getKey();
        }
    }
    return hostnames;
}

From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorageUtils.java

License:Apache License

/**
 * Returns all non-hidden files recursively inside the base paths given
 *
 * @throws IOException
 */
public static Set<Path> getAllFilesRecursively(Set<Path> basePaths, Configuration conf) throws IOException {
    Set<Path> paths = new HashSet<Path>();
    for (Path path : basePaths) {
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        FileStatus f = fs.getFileStatus(path);
        if (f.isDir()) {
            getAllFilesInternal(f, conf, paths, fs);
        } else {
            paths.add(path);
        }
    }
    return paths;
}

From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorageUtils.java

License:Apache License

/**
 * Get the last file of an HDFS path if it is a directory,
 * or return the file itself if the path is a file.
 */
public static Path getLast(Path path, FileSystem fs) throws IOException {

    FileStatus status = fs.getFileStatus(path);
    if (!status.isDir()) {
        return path;
    }
    FileStatus[] statuses = fs.listStatus(path, PATH_FILTER);

    if (statuses.length == 0) {
        return null;
    } else {
        Arrays.sort(statuses);
        for (int i = statuses.length - 1; i >= 0; i--) {
            if (!statuses[i].isDir()) {
                return statuses[i].getPath();
            }
        }
        return null;
    }
}

From source file:com.linkedin.cubert.utils.AvroUtils.java

License:Open Source License

/**
 * Extracts the schema of an Avro file.
 *
 * @param conf the Hadoop configuration
 * @param path the Avro file, or a directory containing *.avro files
 * @return the schema of the first Avro file found
 * @throws IOException
 */
public static Schema getSchema(Configuration conf, Path path) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (fs.getFileStatus(path).isDir()) {
        Path globPath = new Path(path, "*.avro");
        FileStatus[] allFiles = fs.globStatus(globPath);
        if (allFiles.length == 0) {
            throw new IOException("there are no files in " + path.toString());
        }

        path = allFiles[0].getPath();
    }
    System.out.println("Obtaining schema of avro file " + path.toString());

    return getSchema(new FsInput(path, conf));
}

From source file:com.linkedin.cubert.utils.CommonUtils.java

License:Open Source License

public static Path getAFileInPath(Configuration conf, Path path, String suffix) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (fs.getFileStatus(path).isDir()) {
        Path globPath = new Path(path, "*." + suffix);
        FileStatus[] allFiles = fs.globStatus(globPath);
        if (allFiles.length == 0) {
            throw new IOException("there are no files in " + path.toString());
        }

        path = allFiles[0].getPath();
    }

    print.f("Obtaining schema of %s file %s", suffix, path.toString());

    return path;
}