List of usage examples for org.apache.hadoop.fs FileSystem isFile
@Deprecated public boolean isFile(Path f) throws IOException
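Because isFile(Path) is marked @Deprecated, newer code usually asks the FileStatus for the same information instead. The following is a minimal sketch of both forms, not taken from any of the projects quoted below; the class name IsFileExample and the path /tmp/example.txt are placeholders for illustration only.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsFileExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path("/tmp/example.txt"); // hypothetical path, for illustration only

        // Deprecated form, as used in the examples on this page
        boolean viaIsFile = fs.isFile(p);

        // Equivalent non-deprecated form: ask the FileStatus instead
        boolean viaStatus = fs.exists(p) && fs.getFileStatus(p).isFile();

        System.out.println("isFile: " + viaIsFile + ", via FileStatus: " + viaStatus);
    }
}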
From source file:voldemort.store.readonly.mr.utils.HadoopUtils.java
License:Apache License
/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, then pull schemas off of all subfiles, and check that they are
 * all the same schema. If so, return that schema, otherwise throw an
 * exception.
 *
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @param checkSameSchema boolean flag to check all files in directory for
 *        same schema
 * @return The schema of this file or all its subfiles
 * @throws IOException
 */
public static JsonSchema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema)
        throws IOException {
    try {
        if (fs.isFile(path)) {
            // this is a normal file, get a schema from it
            Map<String, String> m = HadoopUtils.getMetadataFromSequenceFile(fs, path);
            if (!m.containsKey("value.schema") || !m.containsKey("key.schema"))
                throw new IllegalArgumentException("No schema found on file " + path.toString());
            return new JsonSchema(JsonTypeDefinition.fromJson(m.get("key.schema")),
                                  JsonTypeDefinition.fromJson(m.get("value.schema")));
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
            } else {
                // this is a wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
            }
            if (statuses == null || statuses.length == 0)
                throw new IllegalArgumentException("No files found in path pattern "
                                                   + path.toUri().getPath());
            List<JsonSchema> schemas = new ArrayList<JsonSchema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    if (!checkSameSchema) {
                        // return first valid schema w/o checking all files
                        return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                    }
                    schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
                }
            }
            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                JsonSchema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++)
                    if (!schema.equals(schemas.get(i)))
                        throw new IllegalArgumentException("The directory " + path.toString()
                                + " contains heterogeneous schemas: found both '" + schema.toString()
                                + "' and '" + schemas.get(i).toString() + "'.");
                return schema;
            } else {
                throw new IllegalArgumentException("No valid metadata file found for path: "
                                                   + path.toString());
            }
        }
    } catch (Exception e) {
        logger.error("failed to get metadata from path: " + path);
        throw new RuntimeException(e);
    }
}
From source file:weka.distributed.hadoop.KMeansClustererHadoopJob.java
License:Open Source License
/**
 * If the data has been randomly shuffled into n chunks then this selects
 * randomly chosen centers. If the data hasn't been randomly shuffled then
 * rows are read sequentially from the first data file in the input directory.
 *
 * @param numRuns the number of runs of k-means
 * @param numClusters the number of clusters
 * @return a list of centers (as Instances objects)
 * @throws DistributedWekaException if a problem occurs
 */
protected List<Instances> initializeWithRandomCenters(int numRuns, int numClusters)
        throws DistributedWekaException {
    String csvConfig = getCSVMapTaskOptions();
    CSVToARFFHeaderMapTask csvTask = new CSVToARFFHeaderMapTask();
    Instances headerNoSummary =
            CSVToARFFHeaderReduceTask.stripSummaryAtts(m_arffHeaderJob.getFinalHeader());
    Configuration conf = new Configuration();
    m_mrConfig.getHDFSConfig().configureForHadoop(conf, m_env);
    List<Instance> candidateList = new ArrayList<Instance>();
    int numRowsToGet = 2 * numRuns * numClusters;
    boolean ok = false;

    try {
        csvTask.setOptions(Utils.splitOptions(csvConfig));
        csvTask.initParserOnly(
                CSVToARFFHeaderMapTask.instanceHeaderToAttributeNameList(headerNoSummary));
    } catch (Exception e) {
        throw new DistributedWekaException(e);
    }

    if (getRandomlyShuffleData()) {
        String randomizedOutputPath = m_randomizeJob.getRandomizedChunkOutputPath();
        try {
            FileSystem fs = FileSystem.get(conf);
            // FileStatus[] contents = fs.listStatus(new
            // Path(randomizedOutputPath));
            int chunkNum = 0;
            while (!ok) {
                Path chunk = new Path(randomizedOutputPath + "/chunk" + chunkNum + "-r-00000");
                if (!fs.exists(chunk)) {
                    if (chunkNum == 0) {
                        // something bad has happened - there doesn't seem to be any
                        // chunk files
                        throw new DistributedWekaException("Unable to find any chunk files in the "
                                + "randomize job's output directory: " + randomizedOutputPath);
                    }
                    break; // run out of chunks
                }
                FSDataInputStream di = fs.open(chunk);
                BufferedReader br = null;
                try {
                    br = new BufferedReader(new InputStreamReader(di));
                    // get a few more than we need in order to avoid
                    // duplicates (hopefully)
                    int count = 0;
                    String line = null;
                    while ((line = br.readLine()) != null && count < numRowsToGet) {
                        String[] parsed = csvTask.parseRowOnly(line);
                        Instance inst = csvTask.makeInstance(headerNoSummary, false, parsed, false);
                        candidateList.add(inst);
                        count++;
                    }
                    if (count == numRowsToGet) {
                        ok = true;
                    } else {
                        chunkNum++;
                    }
                    br.close();
                    br = null;
                } catch (Exception ex) {
                    throw new DistributedWekaException(ex);
                } finally {
                    if (br != null) {
                        br.close();
                    }
                }
            }
        } catch (IOException ex) {
            throw new DistributedWekaException(ex);
        }
    } else {
        String inS = m_mrConfig.getInputPaths();
        String[] inputPaths = inS.split(",");
        BufferedReader br = null;
        try {
            FileSystem fs = FileSystem.get(conf);
            int count = 0;
            for (String inPath : inputPaths) {
                FileStatus[] contents = fs.listStatus(new Path(inPath));
                for (FileStatus s : contents) {
                    String nameOnly = s.getPath().toString();
                    nameOnly = nameOnly.substring(nameOnly.lastIndexOf("/") + 1, nameOnly.length());
                    if (!nameOnly.startsWith(".") && !nameOnly.startsWith("_")
                            && fs.isFile(s.getPath())) {
                        FSDataInputStream di = fs.open(s.getPath());
                        br = new BufferedReader(new InputStreamReader(di));
                        String line = null;
                        while ((line = br.readLine()) != null && count < numRowsToGet) {
                            String[] parsed = csvTask.parseRowOnly(line);
                            Instance inst = csvTask.makeInstance(headerNoSummary, false, parsed, false);
                            candidateList.add(inst);
                            count++;
                        }
                        if (count == numRowsToGet) {
                            ok = true;
                            break;
                        }
                        br.close();
                        br = null;
                    }
                }
            }
        } catch (Exception ex) {
            throw new DistributedWekaException(ex);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    throw new DistributedWekaException(e);
                }
            }
        }
    }

    if (candidateList.size() < numRuns * numClusters) {
        throw new DistributedWekaException("Was unable to obtain enough initial start points "
                + "for " + numRuns + " runs with " + numClusters + " start points each.");
    }

    // make sure that start points and header have been through any filters
    KMeansMapTask forFilteringOnly = new KMeansMapTask();
    try {
        forFilteringOnly.setOptions(
                Utils.splitOptions(environmentSubstitute(getKMeansMapTaskOpts())));
        // initialize sketches
        forFilteringOnly.init(m_arffHeaderJob.getFinalHeader());
        for (int i = 0; i < candidateList.size(); i++) {
            Instance filtered = forFilteringOnly.applyFilters(candidateList.get(i));
            candidateList.set(i, filtered);
        }
        headerNoSummary = forFilteringOnly.applyFilters(headerNoSummary);
    } catch (Exception ex) {
        throw new DistributedWekaException(ex);
    }

    List<Instances> startPoints =
            KMeansMapTask.assignStartPointsFromList(numRuns, numClusters, candidateList, headerNoSummary);

    return startPoints;
}
From source file:weka.distributed.spark.SparkUtils.java
License:Open Source License
/**
 * Delete a directory (and all contents).
 *
 * @param path the path to the directory to delete
 * @throws IOException if the path is not a directory or a problem occurs
 */
public static void deleteDirectory(String path) throws IOException {
    String[] pathOnly = new String[1];
    Configuration conf = getFSConfigurationForPath(path, pathOnly);

    FileSystem fs = FileSystem.get(conf);
    Path p = new Path(pathOnly[0]);

    if (fs.isFile(p)) {
        throw new IOException("The path '" + pathOnly[0] + "' is not a directory!");
    }

    fs.delete(p, true);
}
From source file:weka.distributed.spark.SparkUtils.java
License:Open Source License
/**
 * Check that the named file exists on either the local file system or HDFS.
 *
 * @param file the file to check
 * @return true if the file exists on the local file system or in HDFS
 * @throws IOException if a problem occurs
 */
public static boolean checkFileExists(String file) throws IOException {
    if (file.toLowerCase().indexOf("://") > 0) {
        String[] pathOnly = new String[1];
        Configuration conf = getFSConfigurationForPath(file, pathOnly);

        FileSystem fs = FileSystem.get(conf);
        Path path = new Path(pathOnly[0]);

        return fs.exists(path) && fs.isFile(path);
    } else {
        File f = new File(file);

        return f.exists() && f.isFile();
    }
}