Example usage for org.apache.hadoop.fs FileSystem isFile

List of usage examples for org.apache.hadoop.fs FileSystem isFile

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem isFile.

Prototype

@Deprecated
public boolean isFile(Path f) throws IOException 

Document

True iff the named path is a regular file.
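
The method is marked @Deprecated. Below is a minimal sketch (the class name and path are placeholders, not taken from the examples that follow) showing the call next to the replacement suggested by the deprecation, FileStatus#isFile():

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsFileExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path("/tmp/example.txt"); // placeholder path

        // Deprecated convenience method: returns false if the path
        // does not exist or names a directory.
        boolean viaIsFile = fs.isFile(p);

        // Non-deprecated route: getFileStatus() throws FileNotFoundException
        // for a missing path, so guard with exists() first.
        boolean viaStatus = fs.exists(p) && fs.getFileStatus(p).isFile();

        System.out.println(viaIsFile + " " + viaStatus);
    }
}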

Usage

From source file: voldemort.store.readonly.mr.utils.HadoopUtils.java

License: Apache License

/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, then pull schemas off of all subfiles, and check that they are
 * all the same schema. If so, return that schema, otherwise throw an
 * exception
 * 
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @param checkSameSchema boolean flag to check all files in directory for
 *        same schema
 * @return The schema of this file or all its subfiles
 * @throws IOException
 */
public static JsonSchema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema)
        throws IOException {
    try {
        if (fs.isFile(path)) {
            // this is a normal file, get a schema from it
            Map<String, String> m = HadoopUtils.getMetadataFromSequenceFile(fs, path);
            if (!m.containsKey("value.schema") || !m.containsKey("key.schema"))
                throw new IllegalArgumentException("No schema found on file " + path.toString());
            return new JsonSchema(JsonTypeDefinition.fromJson(m.get("key.schema")),
                    JsonTypeDefinition.fromJson(m.get("value.schema")));
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
            } else {
                // this is wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
            }
            if (statuses == null || statuses.length == 0)
                throw new IllegalArgumentException("No files found in path pattern " + path.toUri().getPath());
            List<JsonSchema> schemas = new ArrayList<JsonSchema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    if (!checkSameSchema) {
                        // return first valid schema w/o checking all files
                        return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                    }
                    schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
                }
            }

            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                JsonSchema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++)
                    if (!schema.equals(schemas.get(i)))
                        throw new IllegalArgumentException("The directory " + path.toString()
                                + " contains heterogenous schemas: found both '" + schema.toString() + "' and '"
                                + schemas.get(i).toString() + "'.");

                return schema;
            } else {
                throw new IllegalArgumentException("No Valid metedata file found for Path:" + path.toString());
            }
        }
    } catch (Exception e) {
        logger.error("failed to get metadata from path:" + path);
        throw new RuntimeException(e);
    }
}

From source file: weka.distributed.hadoop.KMeansClustererHadoopJob.java

License: Open Source License

/**
 * If the data has been randomly shuffled into n chunks, then this selects
 * randomly chosen centers. If the data hasn't been randomly shuffled, then
 * rows are read sequentially from the first data file in the input directory.
 *
 * @param numRuns the number of runs of k-means
 * @param numClusters the number of clusters
 * @return a list of centers (as Instances objects)
 * @throws DistributedWekaException if a problem occurs
 */
protected List<Instances> initializeWithRandomCenters(int numRuns, int numClusters)
        throws DistributedWekaException {

    String csvConfig = getCSVMapTaskOptions();
    CSVToARFFHeaderMapTask csvTask = new CSVToARFFHeaderMapTask();
    Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(m_arffHeaderJob.getFinalHeader());
    Configuration conf = new Configuration();
    m_mrConfig.getHDFSConfig().configureForHadoop(conf, m_env);

    List<Instance> candidateList = new ArrayList<Instance>();
    int numRowsToGet = 2 * numRuns * numClusters;
    boolean ok = false;

    try {
        csvTask.setOptions(Utils.splitOptions(csvConfig));
        csvTask.initParserOnly(CSVToARFFHeaderMapTask.instanceHeaderToAttributeNameList(headerNoSummary));
    } catch (Exception e) {
        throw new DistributedWekaException(e);
    }
    if (getRandomlyShuffleData()) {
        String randomizedOutputPath = m_randomizeJob.getRandomizedChunkOutputPath();
        try {
            FileSystem fs = FileSystem.get(conf);
            // FileStatus[] contents = fs.listStatus(new
            // Path(randomizedOutputPath));

            int chunkNum = 0;

            while (!ok) {
                Path chunk = new Path(randomizedOutputPath + "/chunk" + chunkNum + "-r-00000");
                if (!fs.exists(chunk)) {
                    if (chunkNum == 0) {
                        // something bad has happened - there doesn't seem to be any
                        // chunk files
                        throw new DistributedWekaException("Unable to find any chunk files in the "
                                + "randomize job's output directory: " + randomizedOutputPath);
                    }
                    break; // run out of chunks
                }
                FSDataInputStream di = fs.open(chunk);
                BufferedReader br = null;
                try {
                    br = new BufferedReader(new InputStreamReader(di));

                    // get a few more than we need in order to avoid
                    // duplicates (hopefully)
                    int count = 0;
                    String line = null;
                    while ((line = br.readLine()) != null && count < numRowsToGet) {
                        String[] parsed = csvTask.parseRowOnly(line);
                        Instance inst = csvTask.makeInstance(headerNoSummary, false, parsed, false);
                        candidateList.add(inst);
                        count++;
                    }

                    if (count == numRowsToGet) {
                        ok = true;
                    } else {
                        chunkNum++;
                    }
                    br.close();
                    br = null;
                } catch (Exception ex) {
                    throw new DistributedWekaException(ex);
                } finally {
                    if (br != null) {
                        br.close();
                    }
                }
            }
        } catch (IOException ex) {
            throw new DistributedWekaException(ex);
        }
    } else {
        String inS = m_mrConfig.getInputPaths();
        String[] inputPaths = inS.split(",");
        BufferedReader br = null;
        try {
            FileSystem fs = FileSystem.get(conf);
            int count = 0;
            for (String inPath : inputPaths) {
                FileStatus[] contents = fs.listStatus(new Path(inPath));
                for (FileStatus s : contents) {
                    String nameOnly = s.getPath().toString();
                    nameOnly = nameOnly.substring(nameOnly.lastIndexOf("/") + 1, nameOnly.length());
                    if (!nameOnly.startsWith(".") && !nameOnly.startsWith("_") && fs.isFile(s.getPath())) {
                        FSDataInputStream di = fs.open(s.getPath());

                        br = new BufferedReader(new InputStreamReader(di));
                        String line = null;
                        while ((line = br.readLine()) != null && count < numRowsToGet) {
                            String[] parsed = csvTask.parseRowOnly(line);
                            Instance inst = csvTask.makeInstance(headerNoSummary, false, parsed, false);
                            candidateList.add(inst);
                            count++;
                        }

                        if (count == numRowsToGet) {
                            ok = true;
                            break;
                        }
                        br.close();
                        br = null;
                    }
                }
            }
        } catch (Exception ex) {
            throw new DistributedWekaException(ex);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    throw new DistributedWekaException(e);
                }
            }
        }
    }

    if (candidateList.size() < numRuns * numClusters) {
        throw new DistributedWekaException("Was unable to obtain enough initial start points " + "for "
                + numRuns + " runs with " + numClusters + " start points each.");
    }

    // make sure that start points and header have been through any filters
    KMeansMapTask forFilteringOnly = new KMeansMapTask();
    try {
        forFilteringOnly.setOptions(Utils.splitOptions(environmentSubstitute(getKMeansMapTaskOpts())));

        // initialize sketches
        forFilteringOnly.init(m_arffHeaderJob.getFinalHeader());

        for (int i = 0; i < candidateList.size(); i++) {
            Instance filtered = forFilteringOnly.applyFilters(candidateList.get(i));
            candidateList.set(i, filtered);
        }

        headerNoSummary = forFilteringOnly.applyFilters(headerNoSummary);
    } catch (Exception ex) {
        throw new DistributedWekaException(ex);
    }

    List<Instances> startPoints = KMeansMapTask.assignStartPointsFromList(numRuns, numClusters, candidateList,
            headerNoSummary);

    return startPoints;
}

From source file: weka.distributed.spark.SparkUtils.java

License: Open Source License

/**
 * Delete a directory (and all contents).
 *
 * @param path the path to the directory to delete
 * @throws IOException if the path is not a directory or a problem occurs
 */
public static void deleteDirectory(String path) throws IOException {
    String[] pathOnly = new String[1];
    Configuration conf = getFSConfigurationForPath(path, pathOnly);

    FileSystem fs = FileSystem.get(conf);
    Path p = new Path(pathOnly[0]);

    if (fs.isFile(p)) {
        throw new IOException("The path '" + pathOnly[0] + "' is not a directory!");
    }

    fs.delete(p, true);
}

From source file: weka.distributed.spark.SparkUtils.java

License: Open Source License

/**
 * Check that the named file exists on either the local file system or HDFS.
 *
 * @param file the file to check
 * @return true if the file exists on the local file system or in HDFS
 * @throws IOException if a problem occurs
 */
public static boolean checkFileExists(String file) throws IOException {
    if (file.toLowerCase().indexOf("://") > 0) {
        String[] pathOnly = new String[1];
        Configuration conf = getFSConfigurationForPath(file, pathOnly);

        FileSystem fs = FileSystem.get(conf);
        Path path = new Path(pathOnly[0]);

        return fs.exists(path) && fs.isFile(path);
    } else {
        File f = new File(file);
        return f.exists() && f.isFile();
    }
}
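
A hypothetical call of this helper (the paths are placeholders): a scheme-qualified path such as "hdfs://..." is resolved through the Hadoop FileSystem API and its isFile check, while a plain path is checked with java.io.File.

// assumed namenode address and file names, for illustration only
boolean onHdfs = SparkUtils.checkFileExists("hdfs://namenode:8020/data/train.csv");
boolean onLocal = SparkUtils.checkFileExists("/home/user/train.csv");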