Example usage for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem#getFileStatus.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
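
A minimal sketch of the typical call pattern (the path used here is hypothetical, not taken from the examples below): obtain the FileSystem that owns a Path, call getFileStatus, and read the attributes off the returned FileStatus. Note that getFileStatus throws FileNotFoundException if the path does not exist.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical path; getFileStatus throws FileNotFoundException for a missing path.
        Path path = new Path("/tmp/example.txt");
        FileSystem fs = path.getFileSystem(new Configuration());

        FileStatus status = fs.getFileStatus(path);
        System.out.println("length:   " + status.getLen());
        System.out.println("isDir:    " + status.isDirectory());
        System.out.println("modified: " + status.getModificationTime());
    }
}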

Usage

From source file:com.cloudera.impala.common.FileSystemUtil.java

License:Apache License

/**
 * Performs a non-recursive delete of all visible (non-hidden) files in a given
 * directory. Returns the number of files deleted as part of this operation.
 */
public static int deleteAllVisibleFiles(Path directory) throws IOException {
    FileSystem fs = directory.getFileSystem(CONF);
    Preconditions.checkState(fs.getFileStatus(directory).isDirectory());
    int numFilesDeleted = 0;
    for (FileStatus fStatus : fs.listStatus(directory)) {
        // Only delete files that are not hidden.
        if (fStatus.isFile() && !isHiddenFile(fStatus.getPath().getName())) {
            LOG.debug("Removing: " + fStatus.getPath());
            fs.delete(fStatus.getPath(), false);
            ++numFilesDeleted;
        }
    }
    return numFilesDeleted;
}

From source file:com.cloudera.impala.common.FileSystemUtil.java

License:Apache License

/**
 * Returns the total number of visible (non-hidden) files in a directory.
 */
public static int getTotalNumVisibleFiles(Path directory) throws IOException {
    FileSystem fs = directory.getFileSystem(CONF);
    Preconditions.checkState(fs.getFileStatus(directory).isDirectory());
    int numFiles = 0;
    for (FileStatus fStatus : fs.listStatus(directory)) {
        // Only count files that are not hidden.
        if (fStatus.isFile() && !isHiddenFile(fStatus.getPath().getName())) {
            ++numFiles;
        }
    }
    return numFiles;
}

From source file:com.cloudera.impala.util.FsPermissionChecker.java

License:Apache License

/**
 * Returns a Permissions object that can answer all access permission queries for the
 * given path.
 */
public Permissions getPermissions(FileSystem fs, Path path) throws IOException {
    Preconditions.checkNotNull(fs);
    Preconditions.checkNotNull(path);
    return new Permissions(fs.getFileStatus(path));
}
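
The Permissions class above is Impala-internal and not shown here. As a hedged sketch of what such a checker consults, the FileStatus returned by getFileStatus carries the owner, group, and FsPermission bits. The helper below (a hypothetical name, not part of Impala) answers a single owner-write query directly from that status.

import java.io.IOException;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;

public class OwnerWriteCheck {
    // Returns true if the owner's permission bits on the path include write access.
    static boolean ownerCanWrite(FileSystem fs, Path path) throws IOException {
        FileStatus status = fs.getFileStatus(path);
        FsPermission perm = status.getPermission();
        return perm.getUserAction().implies(FsAction.WRITE);
    }
}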

From source file:com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java

License:Open Source License

private void addOperatorInputs(Map<String, LocalResource> localResources) throws IOException {
    LOG.info("Inputs: " + operator.getInputFiles());
    FileSystem fs = FileSystem.get(conf);
    for (Entry<String, String> e : operator.getInputFiles().entrySet()) {
        if ((!e.getValue().startsWith("hdfs://")) && (!e.getValue().startsWith("$HDFS"))) {
            LOG.info("adding local resource: " + e);
            String inDir = dir;
            LocalResource rsrc = Records.newRecord(LocalResource.class);
            rsrc.setType(LocalResourceType.FILE);
            rsrc.setVisibility(LocalResourceVisibility.APPLICATION);
            LOG.info("Adding input: " + inDir + "/" + e.getValue());
            Path dst = new Path(inDir + "/" + e.getValue());
            dst = fs.makeQualified(dst);
            FileStatus stat = fs.getFileStatus(dst);
            rsrc.setSize(stat.getLen());
            rsrc.setTimestamp(stat.getModificationTime());
            rsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
            localResources.put(e.getKey(), rsrc);
        }
    }
    /*for(String in : operator.getArguments().split(" ")){
       LOG.info("Adding input: "+in);
       LocalResource nl = constructScriptResource();
       localResources.put(in, nl);
    }*/
}

From source file:com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java

License:Open Source License

private void configureLocalScriptResourceForPath(LocalResource rsrc, Path path) throws IOException {
    //System.out.println("URI: "+path.toUri());
    FileSystem fs = FileSystem.get(conf);

    Path dst = new Path(dir + "/" + path.getName());
    fs.moveFromLocalFile(path, dst);
    dst = fs.makeQualified(dst);

    FileStatus stat = fs.getFileStatus(dst);
    rsrc.setSize(stat.getLen());
    rsrc.setTimestamp(stat.getModificationTime());
    rsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
}

From source file:com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java

License:Open Source License

private void configureLocalResourceForPath(LocalResource rsrc, Path path) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileStatus stat = fs.getFileStatus(path);
    rsrc.setSize(stat.getLen());
    rsrc.setTimestamp(stat.getModificationTime());
    rsrc.setResource(ConverterUtils.getYarnUrlFromPath(path));
}

From source file:com.cloudera.oryx.ml.MLUpdate.java

License:Open Source License

@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp, JavaPairRDD<Object, M> newKeyMessageData,
        JavaPairRDD<Object, M> pastKeyMessageData, String modelDirString,
        TopicProducer<String, String> modelUpdateTopic) throws IOException, InterruptedException {

    Objects.requireNonNull(newKeyMessageData);

    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {
        });
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {
        });
    }

    List<HyperParamValues<?>> hyperParamValues = getHyperParameterValues();
    int valuesPerHyperParam = HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues, candidates,
            valuesPerHyperParam);

    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);

    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData, hyperParameterCombos,
            candidatesPath);

    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);

    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required because it's going to
                // be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }

            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }

            if (modelNeededForUpdates) {
                publishAdditionalModelData(sparkContext, bestModel, newData, pastData, finalPath,
                        modelUpdateTopic);
            }
        }
    }

    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}

From source file:com.cloudera.recordbreaker.analyzer.FormatAnalyzer.java

License:Open Source License

/**
 * Create a file-appropriate DataDescriptor instance.
 *
 * Right now we just use the file ending to figure out what to do,
 * but this will become unsatisfactory pretty quickly.
 *
 * @param f a <code>File</code> value
 * @return a <code>DataDescriptor</code> value
 */
public DataDescriptor describeData(FileSystem fs, Path p) throws IOException {
    FileStatus fstatus = fs.getFileStatus(p);
    String fname = p.getName();

    // Test to see if the file is one of a handful of known structured formats.
    if (CSVDataDescriptor.isCSV(fs, p)) {
        return new CSVDataDescriptor(p, fs);
    } else if (fname.endsWith(".xml")) {
        return new XMLDataDescriptor(p, fs);
    } else if (fname.endsWith(".avro")) {
        return new AvroDataDescriptor(p, fs);
    } else if (AvroSequenceFileDataDescriptor.isAvroSequenceFile(fs, p)) {
        return new AvroSequenceFileDataDescriptor(p, fs);
    } else if (SequenceFileDataDescriptor.isSequenceFile(fs, p)) {
        return new SequenceFileDataDescriptor(p, fs);
    } else if (ApacheDataDescriptor.isApacheLogFile(fs, p)) {
        return new ApacheDataDescriptor(p, fs);
    } else if (SyslogDataDescriptor.isSyslogFile(fs, p)) {
        return new SyslogDataDescriptor(p, fs);
    } else {
        // It's not one of the known formats, so apply LearnStructure 
        // to obtain the structure.
        if (UnknownTextDataDescriptor.isTextData(fs, p)) {
            try {
                return new UnknownTextDataDescriptor(fs, p, schemaDbDir);
            } catch (Exception iex) {
                //iex.printStackTrace();
            }
        }
        // If that doesn't work, then give up and call it unstructured.  You
        // can't run queries on data in this format.
        return new UnstructuredFileDescriptor(fs, p);
    }
}

From source file:com.cloudera.recordbreaker.analyzer.FSAnalyzer.java

License:Open Source License

/**
 * Add a single brand-new file to the system.  Parse it, obtain structure, etc, if needed.
 */
void addSingleFile(FileSystem fs, Path insertFile, long crawlId) throws IOException {
    FileStatus fstatus = fs.getFileStatus(insertFile);
    addFileMetadata(fstatus, crawlId);
    final boolean isDir = fstatus.isDir();

    if (!isDir) {
        final List<Long> typeGuesses = new ArrayList<Long>();
        DataDescriptor descriptor = formatAnalyzer.describeData(fs, insertFile);
        List<SchemaDescriptor> schemas = null;
        try {
            schemas = descriptor.getSchemaDescriptor();

            if (schemas == null || schemas.size() == 0) {
                typeGuesses.add(getCreateType(descriptor.getFileTypeIdentifier()));
                typeGuesses.add(getSingleFileSummary(descriptor.getFilename().toString()).getFid());
                typeGuesses.add(getCreateSchema(null));
            } else {
                for (SchemaDescriptor sd : schemas) {
                    typeGuesses.add(getCreateType(descriptor.getFileTypeIdentifier()));
                    typeGuesses.add(getSingleFileSummary(descriptor.getFilename().toString()).getFid());
                    typeGuesses.add(getCreateSchema(sd));
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        dbQueue.execute(new SQLiteJob<Object>() {
            protected Long job(SQLiteConnection db) throws SQLiteException {
                for (int i = 0; i < typeGuesses.size(); i += 3) {
                    long typeId = typeGuesses.get(i);
                    long fileId = typeGuesses.get(i + 1);
                    long schemaId = typeGuesses.get(i + 2);

                    SQLiteStatement stmt = db.prepare("INSERT into TypeGuesses VALUES(?, ?)");
                    try {
                        stmt.bind(1, fileId).bind(2, typeId);
                        stmt.step();
                    } finally {
                        stmt.dispose();
                    }
                }
                return null;
            }
        }).complete();

        dbQueue.execute(new SQLiteJob<Object>() {
            protected Long job(SQLiteConnection db) throws SQLiteException {
                for (int i = 0; i < typeGuesses.size(); i += 3) {
                    long typeId = typeGuesses.get(i);
                    long fileId = typeGuesses.get(i + 1);
                    long schemaId = typeGuesses.get(i + 2);

                    SQLiteStatement stmt = db.prepare("INSERT into SchemaGuesses VALUES(?, ?)");
                    try {
                        stmt.bind(1, fileId).bind(2, schemaId);
                        stmt.step();
                    } finally {
                        stmt.dispose();
                    }
                }
                return null;
            }
        }).complete();
    }
}

From source file:com.cloudera.recordbreaker.analyzer.FSCrawler.java

License:Open Source License

/**
 * Traverse an entire region of the filesystem, analyzing files.
 * This code should:
 * a) Navigate the directory hierarchy
 * b) Run analysis code to figure out the file details
 * c) Invoke addSingleFile() appropriately.
 */
protected void recursiveCrawlBuildList(FileSystem fs, Path p, int subdirDepth, long crawlId,
        List<Path> todoFileList, List<Path> todoDirList) throws IOException {
    FileStatus fstatus = fs.getFileStatus(p);
    if (!fstatus.isDir()) {
        todoFileList.add(p);
    } else {
        if (subdirDepth > 0 || subdirDepth < 0) {
            todoDirList.add(p);
            Path paths[] = new Path[1];
            paths[0] = p;
            for (FileStatus subfilestatus : fs.listStatus(p)) {
                Path subfile = subfilestatus.getPath();
                try {
                    recursiveCrawlBuildList(fs, subfile, subdirDepth - 1, crawlId, todoFileList, todoDirList);
                } catch (IOException iex) {
                    iex.printStackTrace();
                }
            }
        }
    }
}