List of usage examples for org.apache.hadoop.fs.FileSystem#getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
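Before the per-project examples, here is a minimal, self-contained sketch of the call itself. The configuration, the path "/tmp/example.txt", and the class name are illustrative placeholders, not taken from any of the source files below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);
        // getFileStatus() returns the file's metadata (length, modification time,
        // permissions, file/directory flag) and throws FileNotFoundException if
        // the path does not exist.
        FileStatus status = fs.getFileStatus(path);
        System.out.println("length=" + status.getLen()
                + " isDirectory=" + status.isDirectory()
                + " modified=" + status.getModificationTime());
    }
}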
From source file: com.cloudera.impala.common.FileSystemUtil.java
License: Apache License

/**
 * Performs a non-recursive delete of all visible (non-hidden) files in a given
 * directory. Returns the number of files deleted as part of this operation.
 */
public static int deleteAllVisibleFiles(Path directory) throws IOException {
    FileSystem fs = directory.getFileSystem(CONF);
    Preconditions.checkState(fs.getFileStatus(directory).isDirectory());
    int numFilesDeleted = 0;
    for (FileStatus fStatus : fs.listStatus(directory)) {
        // Only delete files that are not hidden.
        if (fStatus.isFile() && !isHiddenFile(fStatus.getPath().getName())) {
            LOG.debug("Removing: " + fStatus.getPath());
            fs.delete(fStatus.getPath(), false);
            ++numFilesDeleted;
        }
    }
    return numFilesDeleted;
}
From source file: com.cloudera.impala.common.FileSystemUtil.java
License: Apache License

/**
 * Returns the total number of visible (non-hidden) files in a directory.
 */
public static int getTotalNumVisibleFiles(Path directory) throws IOException {
    FileSystem fs = directory.getFileSystem(CONF);
    Preconditions.checkState(fs.getFileStatus(directory).isDirectory());
    int numFiles = 0;
    for (FileStatus fStatus : fs.listStatus(directory)) {
        // Only count files that are not hidden.
        if (fStatus.isFile() && !isHiddenFile(fStatus.getPath().getName())) {
            ++numFiles;
        }
    }
    return numFiles;
}
From source file: com.cloudera.impala.util.FsPermissionChecker.java
License: Apache License

/**
 * Returns a Permissions object that can answer all access permission queries for the
 * given path.
 */
public Permissions getPermissions(FileSystem fs, Path path) throws IOException {
    Preconditions.checkNotNull(fs);
    Preconditions.checkNotNull(path);
    return new Permissions(fs.getFileStatus(path));
}
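Impala's Permissions class itself is not shown above. As a rough, illustrative sketch (not the actual Impala implementation), a wrapper like that typically reads the permission, owner, and group fields carried by the FileStatus that getFileStatus() returns:

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;

// Illustrative only: not the real com.cloudera.impala.util.FsPermissionChecker.Permissions.
class SimplePermissions {
    private final FileStatus status;

    SimplePermissions(FileStatus status) {
        this.status = status;
    }

    // Answers whether the "other" bits of the POSIX-style permission allow the action.
    boolean othersCan(FsAction action) {
        FsPermission perm = status.getPermission();
        return perm.getOtherAction().implies(action);
    }

    String ownerAndGroup() {
        return status.getOwner() + ":" + status.getGroup();
    }
}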
From source file: com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java
License: Open Source License

private void addOperatorInputs(Map<String, LocalResource> localResources) throws IOException {
    LOG.info("Inputs: " + operator.getInputFiles());
    FileSystem fs = FileSystem.get(conf);
    for (Entry<String, String> e : operator.getInputFiles().entrySet()) {
        if ((!e.getValue().startsWith("hdfs://")) && (!e.getValue().startsWith("$HDFS"))) {
            LOG.info("adding local resource: " + e);
            String inDir = dir;
            LocalResource rsrc = Records.newRecord(LocalResource.class);
            rsrc.setType(LocalResourceType.FILE);
            rsrc.setVisibility(LocalResourceVisibility.APPLICATION);
            LOG.info("Adding input: " + inDir + "/" + e.getValue());
            Path dst = new Path(inDir + "/" + e.getValue());
            dst = fs.makeQualified(dst);
            FileStatus stat = fs.getFileStatus(dst);
            rsrc.setSize(stat.getLen());
            rsrc.setTimestamp(stat.getModificationTime());
            rsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
            localResources.put(e.getKey(), rsrc);
        }
    }
    /*for(String in : operator.getArguments().split(" ")){
        LOG.info("Adding input: "+in);
        LocalResource nl = constructScriptResource();
        localResources.put(in, nl);
    }*/
}
From source file: com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java
License: Open Source License

private void configureLocalScriptResourceForPath(LocalResource rsrc, Path path) throws IOException {
    //System.out.println("URI: "+path.toUri());
    FileSystem fs = FileSystem.get(conf);
    Path dst = new Path(dir + "/" + path.getName());
    fs.moveFromLocalFile(path, dst);
    dst = fs.makeQualified(dst);
    FileStatus stat = fs.getFileStatus(dst);
    rsrc.setSize(stat.getLen());
    rsrc.setTimestamp(stat.getModificationTime());
    rsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
}
From source file: com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java
License: Open Source License

private void configureLocalResourceForPath(LocalResource rsrc, Path path) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileStatus stat = fs.getFileStatus(path);
    rsrc.setSize(stat.getLen());
    rsrc.setTimestamp(stat.getModificationTime());
    rsrc.setResource(ConverterUtils.getYarnUrlFromPath(path));
}
From source file: com.cloudera.oryx.ml.MLUpdate.java
License: Apache License

@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp,
        JavaPairRDD<Object, M> newKeyMessageData, JavaPairRDD<Object, M> pastKeyMessageData,
        String modelDirString, TopicProducer<String, String> modelUpdateTopic)
        throws IOException, InterruptedException {
    Objects.requireNonNull(newKeyMessageData);

    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {
        });
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {
        });
    }

    List<HyperParamValues<?>> hyperParamValues = getHyperParameterValues();
    int valuesPerHyperParam = HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues, candidates,
            valuesPerHyperParam);

    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);

    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData, hyperParameterCombos,
            candidatesPath);

    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);

    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required because it's
                // going to be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }
            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }
            if (modelNeededForUpdates) {
                publishAdditionalModelData(sparkContext, bestModel, newData, pastData, finalPath,
                        modelUpdateTopic);
            }
        }
    }

    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}
From source file: com.cloudera.recordbreaker.analyzer.FormatAnalyzer.java
License: Open Source License

/**
 * Create a file-appropriate DataDescriptor instance.
 *
 * Right now we just use the file ending to figure out what to do,
 * but this will become unsatisfactory pretty quickly.
 *
 * @param fs a <code>FileSystem</code> value
 * @param p a <code>Path</code> value
 * @return a <code>DataDescriptor</code> value
 */
public DataDescriptor describeData(FileSystem fs, Path p) throws IOException {
    FileStatus fstatus = fs.getFileStatus(p);
    String fname = p.getName();

    // Test to see if the file is one of a handful of known structured formats.
    if (CSVDataDescriptor.isCSV(fs, p)) {
        return new CSVDataDescriptor(p, fs);
    } else if (fname.endsWith(".xml")) {
        return new XMLDataDescriptor(p, fs);
    } else if (fname.endsWith(".avro")) {
        return new AvroDataDescriptor(p, fs);
    } else if (AvroSequenceFileDataDescriptor.isAvroSequenceFile(fs, p)) {
        return new AvroSequenceFileDataDescriptor(p, fs);
    } else if (SequenceFileDataDescriptor.isSequenceFile(fs, p)) {
        return new SequenceFileDataDescriptor(p, fs);
    } else if (ApacheDataDescriptor.isApacheLogFile(fs, p)) {
        return new ApacheDataDescriptor(p, fs);
    } else if (SyslogDataDescriptor.isSyslogFile(fs, p)) {
        return new SyslogDataDescriptor(p, fs);
    } else {
        // It's not one of the known formats, so apply LearnStructure
        // to obtain the structure.
        if (UnknownTextDataDescriptor.isTextData(fs, p)) {
            try {
                return new UnknownTextDataDescriptor(fs, p, schemaDbDir);
            } catch (Exception iex) {
                //iex.printStackTrace();
            }
        }
        // If that doesn't work, then give up and call it unstructured. You
        // can't run queries on data in this format.
        return new UnstructuredFileDescriptor(fs, p);
    }
}
From source file: com.cloudera.recordbreaker.analyzer.FSAnalyzer.java
License: Open Source License

/**
 * Add a single brand-new file to the system. Parse it, obtain structure, etc, if needed.
 */
void addSingleFile(FileSystem fs, Path insertFile, long crawlId) throws IOException {
    FileStatus fstatus = fs.getFileStatus(insertFile);
    addFileMetadata(fstatus, crawlId);
    final boolean isDir = fstatus.isDir();

    if (!isDir) {
        final List<Long> typeGuesses = new ArrayList<Long>();
        DataDescriptor descriptor = formatAnalyzer.describeData(fs, insertFile);
        List<SchemaDescriptor> schemas = null;
        try {
            schemas = descriptor.getSchemaDescriptor();
            if (schemas == null || schemas.size() == 0) {
                typeGuesses.add(getCreateType(descriptor.getFileTypeIdentifier()));
                typeGuesses.add(getSingleFileSummary(descriptor.getFilename().toString()).getFid());
                typeGuesses.add(getCreateSchema(null));
            } else {
                for (SchemaDescriptor sd : schemas) {
                    typeGuesses.add(getCreateType(descriptor.getFileTypeIdentifier()));
                    typeGuesses.add(getSingleFileSummary(descriptor.getFilename().toString()).getFid());
                    typeGuesses.add(getCreateSchema(sd));
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        dbQueue.execute(new SQLiteJob<Object>() {
            protected Long job(SQLiteConnection db) throws SQLiteException {
                for (int i = 0; i < typeGuesses.size(); i += 3) {
                    long typeId = typeGuesses.get(i);
                    long fileId = typeGuesses.get(i + 1);
                    long schemaId = typeGuesses.get(i + 2);
                    SQLiteStatement stmt = db.prepare("INSERT into TypeGuesses VALUES(?, ?)");
                    try {
                        stmt.bind(1, fileId).bind(2, typeId);
                        stmt.step();
                    } finally {
                        stmt.dispose();
                    }
                }
                return null;
            }
        }).complete();

        dbQueue.execute(new SQLiteJob<Object>() {
            protected Long job(SQLiteConnection db) throws SQLiteException {
                for (int i = 0; i < typeGuesses.size(); i += 3) {
                    long typeId = typeGuesses.get(i);
                    long fileId = typeGuesses.get(i + 1);
                    long schemaId = typeGuesses.get(i + 2);
                    SQLiteStatement stmt = db.prepare("INSERT into SchemaGuesses VALUES(?, ?)");
                    try {
                        stmt.bind(1, fileId).bind(2, schemaId);
                        stmt.step();
                    } finally {
                        stmt.dispose();
                    }
                }
                return null;
            }
        }).complete();
    }
}
From source file: com.cloudera.recordbreaker.analyzer.FSCrawler.java
License: Open Source License

/**
 * Traverse an entire region of the filesystem, analyzing files.
 * This code should:
 * a) Navigate the directory hierarchy
 * b) Run analysis code to figure out the file details
 * c) Invoke addSingleFile() appropriately.
 */
protected void recursiveCrawlBuildList(FileSystem fs, Path p, int subdirDepth, long crawlId,
        List<Path> todoFileList, List<Path> todoDirList) throws IOException {
    FileStatus fstatus = fs.getFileStatus(p);
    if (!fstatus.isDir()) {
        todoFileList.add(p);
    } else {
        if (subdirDepth > 0 || subdirDepth < 0) {
            todoDirList.add(p);
            Path paths[] = new Path[1];
            paths[0] = p;
            for (FileStatus subfilestatus : fs.listStatus(p)) {
                Path subfile = subfilestatus.getPath();
                try {
                    recursiveCrawlBuildList(fs, subfile, subdirDepth - 1, crawlId, todoFileList, todoDirList);
                } catch (IOException iex) {
                    iex.printStackTrace();
                }
            }
        }
    }
}
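Note: the two recordbreaker examples above call FileStatus.isDir(), which is deprecated in current Hadoop releases; the equivalent non-deprecated checks on the object returned by getFileStatus() are isDirectory() and isFile(), as used in the earlier Impala examples.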