List of usage examples for org.apache.hadoop.fs.FileSystem#getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
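Before the per-project examples, here is a minimal, self-contained sketch of the call itself. The configuration, the path "/tmp/example.txt", and the class name are illustrative placeholders, not taken from any of the source files below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);
        // getFileStatus() returns the file's metadata (length, modification time,
        // permissions, file/directory flag) and throws FileNotFoundException if
        // the path does not exist.
        FileStatus status = fs.getFileStatus(path);
        System.out.println("length=" + status.getLen()
                + " isDirectory=" + status.isDirectory()
                + " modified=" + status.getModificationTime());
    }
}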
From source file: com.cloudera.impala.common.FileSystemUtil.java
License: Apache License

/**
 * Performs a non-recursive delete of all visible (non-hidden) files in a given
 * directory. Returns the number of files deleted as part of this operation.
 */
public static int deleteAllVisibleFiles(Path directory) throws IOException {
    FileSystem fs = directory.getFileSystem(CONF);
    Preconditions.checkState(fs.getFileStatus(directory).isDirectory());
    int numFilesDeleted = 0;
    for (FileStatus fStatus : fs.listStatus(directory)) {
        // Only delete files that are not hidden.
        if (fStatus.isFile() && !isHiddenFile(fStatus.getPath().getName())) {
            LOG.debug("Removing: " + fStatus.getPath());
            fs.delete(fStatus.getPath(), false);
            ++numFilesDeleted;
        }
    }
    return numFilesDeleted;
}
From source file: com.cloudera.impala.common.FileSystemUtil.java
License: Apache License

/**
 * Returns the total number of visible (non-hidden) files in a directory.
 */
public static int getTotalNumVisibleFiles(Path directory) throws IOException {
    FileSystem fs = directory.getFileSystem(CONF);
    Preconditions.checkState(fs.getFileStatus(directory).isDirectory());
    int numFiles = 0;
    for (FileStatus fStatus : fs.listStatus(directory)) {
        // Only count files that are not hidden.
        if (fStatus.isFile() && !isHiddenFile(fStatus.getPath().getName())) {
            ++numFiles;
        }
    }
    return numFiles;
}
From source file: com.cloudera.impala.util.FsPermissionChecker.java
License: Apache License

/**
 * Returns a Permissions object that can answer all access permission queries for the
 * given path.
 */
public Permissions getPermissions(FileSystem fs, Path path) throws IOException {
    Preconditions.checkNotNull(fs);
    Preconditions.checkNotNull(path);
    return new Permissions(fs.getFileStatus(path));
}
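Impala's Permissions class itself is not shown above. As a rough, illustrative sketch (not the actual Impala implementation), a wrapper like that typically reads the permission, owner, and group fields carried by the FileStatus that getFileStatus() returns:

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;

// Illustrative only: not the real com.cloudera.impala.util.FsPermissionChecker.Permissions.
class SimplePermissions {
    private final FileStatus status;

    SimplePermissions(FileStatus status) {
        this.status = status;
    }

    // Answers whether the "other" bits of the POSIX-style permission allow the action.
    boolean othersCan(FsAction action) {
        FsPermission perm = status.getPermission();
        return perm.getOtherAction().implies(action);
    }

    String ownerAndGroup() {
        return status.getOwner() + ":" + status.getGroup();
    }
}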
From source file: com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java
License: Open Source License

private void addOperatorInputs(Map<String, LocalResource> localResources) throws IOException {
    LOG.info("Inputs: " + operator.getInputFiles());
    FileSystem fs = FileSystem.get(conf);
    for (Entry<String, String> e : operator.getInputFiles().entrySet()) {
        if ((!e.getValue().startsWith("hdfs://")) && (!e.getValue().startsWith("$HDFS"))) {
            LOG.info("adding local resource: " + e);
            String inDir = dir;
            LocalResource rsrc = Records.newRecord(LocalResource.class);
            rsrc.setType(LocalResourceType.FILE);
            rsrc.setVisibility(LocalResourceVisibility.APPLICATION);
            LOG.info("Adding input: " + inDir + "/" + e.getValue());
            Path dst = new Path(inDir + "/" + e.getValue());
            dst = fs.makeQualified(dst);
            FileStatus stat = fs.getFileStatus(dst);
            rsrc.setSize(stat.getLen());
            rsrc.setTimestamp(stat.getModificationTime());
            rsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
            localResources.put(e.getKey(), rsrc);
        }
    }
    /*for(String in : operator.getArguments().split(" ")){
        LOG.info("Adding input: "+in);
        LocalResource nl = constructScriptResource();
        localResources.put(in, nl);
    }*/
}
From source file: com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java
License: Open Source License

private void configureLocalScriptResourceForPath(LocalResource rsrc, Path path) throws IOException {
    //System.out.println("URI: "+path.toUri());
    FileSystem fs = FileSystem.get(conf);
    Path dst = new Path(dir + "/" + path.getName());
    fs.moveFromLocalFile(path, dst);
    dst = fs.makeQualified(dst);
    FileStatus stat = fs.getFileStatus(dst);
    rsrc.setSize(stat.getLen());
    rsrc.setTimestamp(stat.getModificationTime());
    rsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
}
From source file: com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java
License: Open Source License

private void configureLocalResourceForPath(LocalResource rsrc, Path path) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileStatus stat = fs.getFileStatus(path);
    rsrc.setSize(stat.getLen());
    rsrc.setTimestamp(stat.getModificationTime());
    rsrc.setResource(ConverterUtils.getYarnUrlFromPath(path));
}
From source file: com.cloudera.oryx.ml.MLUpdate.java
License: Apache License

@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp,
        JavaPairRDD<Object, M> newKeyMessageData, JavaPairRDD<Object, M> pastKeyMessageData,
        String modelDirString, TopicProducer<String, String> modelUpdateTopic)
        throws IOException, InterruptedException {
    Objects.requireNonNull(newKeyMessageData);

    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {
        });
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {
        });
    }

    List<HyperParamValues<?>> hyperParamValues = getHyperParameterValues();
    int valuesPerHyperParam = HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues, candidates,
            valuesPerHyperParam);

    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);

    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData, hyperParameterCombos,
            candidatesPath);

    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);

    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required because it's
                // going to be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }
            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }
            if (modelNeededForUpdates) {
                publishAdditionalModelData(sparkContext, bestModel, newData, pastData, finalPath,
                        modelUpdateTopic);
            }
        }
    }

    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}
From source file: com.cloudera.recordbreaker.analyzer.FormatAnalyzer.java
License: Open Source License

/**
 * Create a file-appropriate DataDescriptor instance.
 *
 * Right now we just use the file ending to figure out what to do,
 * but this will become unsatisfactory pretty quickly.
 *
 * @param fs a <code>FileSystem</code> value
 * @param p a <code>Path</code> value
 * @return a <code>DataDescriptor</code> value
 */
public DataDescriptor describeData(FileSystem fs, Path p) throws IOException {
    FileStatus fstatus = fs.getFileStatus(p);
    String fname = p.getName();

    // Test to see if the file is one of a handful of known structured formats.
    if (CSVDataDescriptor.isCSV(fs, p)) {
        return new CSVDataDescriptor(p, fs);
    } else if (fname.endsWith(".xml")) {
        return new XMLDataDescriptor(p, fs);
    } else if (fname.endsWith(".avro")) {
        return new AvroDataDescriptor(p, fs);
    } else if (AvroSequenceFileDataDescriptor.isAvroSequenceFile(fs, p)) {
        return new AvroSequenceFileDataDescriptor(p, fs);
    } else if (SequenceFileDataDescriptor.isSequenceFile(fs, p)) {
        return new SequenceFileDataDescriptor(p, fs);
    } else if (ApacheDataDescriptor.isApacheLogFile(fs, p)) {
        return new ApacheDataDescriptor(p, fs);
    } else if (SyslogDataDescriptor.isSyslogFile(fs, p)) {
        return new SyslogDataDescriptor(p, fs);
    } else {
        // It's not one of the known formats, so apply LearnStructure
        // to obtain the structure.
        if (UnknownTextDataDescriptor.isTextData(fs, p)) {
            try {
                return new UnknownTextDataDescriptor(fs, p, schemaDbDir);
            } catch (Exception iex) {
                //iex.printStackTrace();
            }
        }
        // If that doesn't work, then give up and call it unstructured. You
        // can't run queries on data in this format.
        return new UnstructuredFileDescriptor(fs, p);
    }
}
From source file: com.cloudera.recordbreaker.analyzer.FSAnalyzer.java
License: Open Source License

/**
 * Add a single brand-new file to the system. Parse it, obtain structure, etc, if needed.
 */
void addSingleFile(FileSystem fs, Path insertFile, long crawlId) throws IOException {
    FileStatus fstatus = fs.getFileStatus(insertFile);
    addFileMetadata(fstatus, crawlId);
    final boolean isDir = fstatus.isDir();

    if (!isDir) {
        final List<Long> typeGuesses = new ArrayList<Long>();
        DataDescriptor descriptor = formatAnalyzer.describeData(fs, insertFile);
        List<SchemaDescriptor> schemas = null;
        try {
            schemas = descriptor.getSchemaDescriptor();
            if (schemas == null || schemas.size() == 0) {
                typeGuesses.add(getCreateType(descriptor.getFileTypeIdentifier()));
                typeGuesses.add(getSingleFileSummary(descriptor.getFilename().toString()).getFid());
                typeGuesses.add(getCreateSchema(null));
            } else {
                for (SchemaDescriptor sd : schemas) {
                    typeGuesses.add(getCreateType(descriptor.getFileTypeIdentifier()));
                    typeGuesses.add(getSingleFileSummary(descriptor.getFilename().toString()).getFid());
                    typeGuesses.add(getCreateSchema(sd));
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        dbQueue.execute(new SQLiteJob<Object>() {
            protected Long job(SQLiteConnection db) throws SQLiteException {
                for (int i = 0; i < typeGuesses.size(); i += 3) {
                    long typeId = typeGuesses.get(i);
                    long fileId = typeGuesses.get(i + 1);
                    long schemaId = typeGuesses.get(i + 2);
                    SQLiteStatement stmt = db.prepare("INSERT into TypeGuesses VALUES(?, ?)");
                    try {
                        stmt.bind(1, fileId).bind(2, typeId);
                        stmt.step();
                    } finally {
                        stmt.dispose();
                    }
                }
                return null;
            }
        }).complete();

        dbQueue.execute(new SQLiteJob<Object>() {
            protected Long job(SQLiteConnection db) throws SQLiteException {
                for (int i = 0; i < typeGuesses.size(); i += 3) {
                    long typeId = typeGuesses.get(i);
                    long fileId = typeGuesses.get(i + 1);
                    long schemaId = typeGuesses.get(i + 2);
                    SQLiteStatement stmt = db.prepare("INSERT into SchemaGuesses VALUES(?, ?)");
                    try {
                        stmt.bind(1, fileId).bind(2, schemaId);
                        stmt.step();
                    } finally {
                        stmt.dispose();
                    }
                }
                return null;
            }
        }).complete();
    }
}
From source file: com.cloudera.recordbreaker.analyzer.FSCrawler.java
License: Open Source License

/**
 * Traverse an entire region of the filesystem, analyzing files.
 * This code should:
 * a) Navigate the directory hierarchy
 * b) Run analysis code to figure out the file details
 * c) Invoke addSingleFile() appropriately.
 */
protected void recursiveCrawlBuildList(FileSystem fs, Path p, int subdirDepth, long crawlId,
        List<Path> todoFileList, List<Path> todoDirList) throws IOException {
    FileStatus fstatus = fs.getFileStatus(p);
    if (!fstatus.isDir()) {
        todoFileList.add(p);
    } else {
        if (subdirDepth > 0 || subdirDepth < 0) {
            todoDirList.add(p);
            Path paths[] = new Path[1];
            paths[0] = p;
            for (FileStatus subfilestatus : fs.listStatus(p)) {
                Path subfile = subfilestatus.getPath();
                try {
                    recursiveCrawlBuildList(fs, subfile, subdirDepth - 1, crawlId, todoFileList, todoDirList);
                } catch (IOException iex) {
                    iex.printStackTrace();
                }
            }
        }
    }
}
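Note: the two recordbreaker examples above call FileStatus.isDir(), which is deprecated in current Hadoop releases; the equivalent non-deprecated checks on the object returned by getFileStatus() are isDirectory() and isFile(), as used in the earlier Impala examples.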