List of usage examples for org.apache.hadoop.fs.FileSystem#deleteOnExit

FileSystem.deleteOnExit(Path) marks a path to be deleted when the FileSystem is closed. When the JVM shuts down cleanly, cached FileSystem instances are closed automatically and the marked paths are removed, which makes the method a convenient way to clean up temporary files created by a job.
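Before the project-specific examples, here is a minimal sketch of the pattern they share: create a scratch path, write to it, and mark it with deleteOnExit so the FileSystem removes it when it is closed. The class name and the path under /tmp are illustrative assumptions, not taken from any of the projects listed below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DeleteOnExitSketch {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Illustrative scratch location; real code would typically derive this
        // from hadoop.tmp.dir or a job-specific directory.
        Path tempFile = new Path("/tmp/deleteOnExit-demo-" + System.currentTimeMillis());

        FSDataOutputStream out = fs.create(tempFile);
        try {
            out.writeUTF("scratch data");
        } finally {
            out.close();
        }

        // Mark the path for deletion when this FileSystem is closed.
        // Cached FileSystem instances are closed on clean JVM shutdown,
        // so the file is also removed when the process exits normally.
        boolean marked = fs.deleteOnExit(tempFile);
        System.out.println("Marked for delete on exit: " + marked);

        fs.close(); // tempFile is deleted here
    }
}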
From source file:com.linkedin.oneclick.wordcount.WordCount.java
License:Apache License
static Path tempFile(Configuration conf, String name) {
    try {
        Path result = new Path(conf.get("hadoop.tmp.dir"), name);
        log.info("tempFile=" + result.toString());
        FileSystem fs = result.getFileSystem(conf);
        fs.mkdirs(result.getParent());
        fs.deleteOnExit(result);
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:com.ricemap.spateDB.core.SpatialSite.java
License:Apache License
public static void setCells(JobConf job, CellInfo[] cellsInfo) throws IOException {
    Path tempFile;
    FileSystem fs = FileSystem.get(job);
    do {
        tempFile = new Path(job.getJobName() + "_" + (int) (Math.random() * 1000000) + ".cells");
    } while (fs.exists(tempFile));
    FSDataOutputStream out = fs.create(tempFile);
    out.writeInt(cellsInfo.length);
    for (CellInfo cell : cellsInfo) {
        cell.write(out);
    }
    out.close();

    fs.deleteOnExit(tempFile);

    DistributedCache.addCacheFile(tempFile.toUri(), job);
    job.set(OUTPUT_CELLS, tempFile.getName());
    LOG.info("Partitioning file into " + cellsInfo.length + " cells");
}
From source file:com.skp.experiment.cf.als.hadoop.ParallelALSFactorizationJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("indexSizes", null, "index sizes Path", true);
    addOption("startIteration", null, "start iteration number", String.valueOf(0));
    addOption("oldM", null, "old M matrix Path.", null);
    addOption("largeUserFeatures", null, "true if user x feature matrix is too large for memory", String.valueOf(true));
    addOption("rmseCurve", null, "true if want to extract rmse curve", String.valueOf(true));
    addOption("cleanUp", null, "true if want to clean up temporary matrix", String.valueOf(true));
    addOption("useTransform", null, "true if using logarithm as transform", String.valueOf(true));
    addOption("rateIndex", null, "0 based index for rate column in input file.", String.valueOf(2));
    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    try {
        /** step 0: fetch dimension of training set matrix. */
        Map<String, String> indexSizesTmp = ALSMatrixUtil.fetchTextFiles(new Path(getOption("indexSizes")),
                DELIMETER, Arrays.asList(0), Arrays.asList(1));
        numFeatures = Integer.parseInt(parsedArgs.get("--numFeatures"));
        numIterations = Integer.parseInt(parsedArgs.get("--numIterations"));
        lambda = Double.parseDouble(parsedArgs.get("--lambda"));
        alpha = Double.parseDouble(parsedArgs.get("--alpha"));
        implicitFeedback = Boolean.parseBoolean(parsedArgs.get("--implicitFeedback"));

        numUsers = Integer.parseInt(indexSizesTmp.get("0"));
        numItems = Integer.parseInt(indexSizesTmp.get("1"));

        numTaskTrackers = HadoopClusterUtil.getNumberOfTaskTrackers(getConf()) * multiplyMapTasks;
        startIteration = Integer.parseInt(parsedArgs.get("--startIteration"));
        largeUserFeatures = Boolean.parseBoolean(getOption("largeUserFeatures"));
        useRMSECurve = Boolean.parseBoolean(getOption("rmseCurve"));
        cleanUp = Boolean.parseBoolean(getOption("cleanUp"));
        useTransform = Boolean.parseBoolean(getOption("useTransform"));
        rateIndex = Integer.parseInt(getOption("rateIndex"));

        FileSystem fs = FileSystem.get(getConf());
        if (!fs.exists(pathToTransformed())) {
            if (useTransform) {
                // transform price into rating
                Job transformJob = prepareJob(getInputPath(), pathToTransformed(), TextInputFormat.class,
                        TransformColumnValueMapper.class, NullWritable.class, Text.class,
                        TextOutputFormat.class);
                transformJob.waitForCompletion(true);
            } else {
                FileUtil.copy(FileSystem.get(getConf()), getInputPath(), FileSystem.get(getConf()),
                        pathToTransformed(), false, getConf());
            }
        }
        /*
        if (getOption("oldM") != null) {
            runOnetimeSolver(pathToTransformed(), getOutputPath("U"), new Path(getOption("oldM")));
            return 0;
        }
        */
        /*
         * compute the factorization A = U M'
         *
         * where A (users x items) is the matrix of known ratings
         *       U (users x features) is the representation of users in the feature space
         *       M (items x features) is the representation of items in the feature space
         */
        if (startIteration == 0) {
            if (!fs.exists(pathToItemRatings())) {
                // create A'
                Job itemRatings = prepareJob(pathToTransformed(), pathToItemRatings(), TextInputFormat.class,
                        ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class,
                        VectorSumReducer.class, IntWritable.class, VectorWritable.class,
                        SequenceFileOutputFormat.class);
                itemRatings.setCombinerClass(VectorSumReducer.class);
                long matrixSizeExp = (long) (8L * numUsers * numFeatures * SAFE_MARGIN);
                long memoryThreshold = HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT
                        / (long) HadoopClusterUtil.MAP_TASKS_PER_NODE;
                int numTaskPerDataNode = Math.max(1,
                        (int) (HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT / (double) matrixSizeExp));
                //log.info("matrix Size: " + matrixSizeExp + ", memorhThreshold: " + memoryThreshold + ", numTaskPerDataNode: " + numTaskPerDataNode);
                if (matrixSizeExp > memoryThreshold) {
                    //log.info("A: {}", numTaskPerDataNode * HadoopClusterUtil.getNumberOfTaskTrackers(getConf()));
                    int numReducer = Math.min(
                            numTaskPerDataNode * HadoopClusterUtil.getNumberOfTaskTrackers(getConf()),
                            HadoopClusterUtil.getMaxMapTasks(getConf()));
                    //log.info("Number Of Reducer: " + numReducer);
                    itemRatings.setNumReduceTasks(numReducer);
                }
                itemRatings.waitForCompletion(true);
            }

            if (!fs.exists(pathToUserRatings())) {
                Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
                        IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
                        VectorWritable.class);
                userRatings.setNumReduceTasks(HadoopClusterUtil.getNumberOfTaskTrackers(getConf()));
                userRatings.setCombinerClass(MergeVectorsCombiner.class);
                userRatings.setNumReduceTasks(HadoopClusterUtil.getMaxMapTasks(getConf()));
                userRatings.waitForCompletion(true);
            }

            if (!fs.exists(getOutputPath("userItemCnt"))) {
                // count item per user
                Job userItemCntsJob = prepareJob(pathToUserRatings(), getOutputPath("userItemCnt"),
                        SequenceFileInputFormat.class, UserItemCntsMapper.class, IntWritable.class,
                        IntWritable.class, SequenceFileOutputFormat.class);
                userItemCntsJob.setJobName("user ratings count");
                userItemCntsJob.waitForCompletion(true);
            }

            if (!fs.exists(getTempPath("averageRatings"))) {
                //TODO this could be fiddled into one of the upper jobs
                Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
                        AverageRatingMapper.class, IntWritable.class, VectorWritable.class,
                        MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
                averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
                averageItemRatings.waitForCompletion(true);
            }

            if (!fs.exists(new Path(pathToM(-1), "part-m-00000"))) {
                Vector averageRatings = ALSMatrixUtil.readFirstRow(getTempPath("averageRatings"), getConf());
                /** create an initial M */
                initializeM(averageRatings);
            }
        }

        for (int currentIteration = startIteration; currentIteration < numIterations; currentIteration++) {
            DistributedRowMatrix curM = new DistributedRowMatrix(pathToM(currentIteration - 1),
                    getTempPath("Mtemp/tmp-" + String.valueOf(currentIteration - 1) + "/M"), numItems,
                    numFeatures);
            curM.setConf(getConf());
            DistributedRowMatrix YtransposeY = curM.times(curM);
            /** broadcast M, read A row-wise, recompute U row-wise */
            log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
            runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
                    YtransposeY.getRowPath(), numItems, false);

            DistributedRowMatrix curU = new DistributedRowMatrix(pathToU(currentIteration),
                    getTempPath("Utmp/tmp-" + String.valueOf(currentIteration) + "/U"), numUsers, numFeatures);
            curU.setConf(getConf());
            DistributedRowMatrix XtransposeX = curU.times(curU);
            /** broadcast U, read A' row-wise, recompute M row-wise */
            log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
            runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration),
                    XtransposeX.getRowPath(), numUsers, largeUserFeatures);

            /** calculate rmse on each updated matrix U, M and decide to further iteration */
            if (currentIteration > startIteration && useRMSECurve) {
                Pair<Integer, Double> UsquaredError = calculateMatrixDistanceSquared(
                        pathToU(currentIteration - 1), pathToU(currentIteration), currentIteration);
                Pair<Integer, Double> MsquaredError = calculateMatrixDistanceSquared(
                        pathToM(currentIteration - 1), pathToM(currentIteration), currentIteration);
                String currentRMSE = currentIteration + DELIMETER + UsquaredError.getFirst() + DELIMETER
                        + UsquaredError.getSecond() + DELIMETER + MsquaredError.getFirst() + DELIMETER
                        + MsquaredError.getSecond() + DefaultOptionCreator.NEWLINE;
                rmsePerIteration += currentRMSE;
                log.info("iteration {}: {}", currentIteration, currentRMSE);
            }

            if (currentIteration >= startIteration + 2 && cleanUp) {
                fs.deleteOnExit(pathToU(currentIteration - 2));
                fs.deleteOnExit(pathToM(currentIteration - 2));
            }
        }
        return 0;
    } catch (Exception e) {
        e.printStackTrace();
        return -1;
    } finally {
        if (useRMSECurve) {
            HadoopClusterUtil.writeToHdfs(getConf(), getOutputPath("RMSE"), rmsePerIteration);
        }
    }
}
From source file:com.skp.experiment.cf.als.hadoop.SolveImplicitFeedbackMultithreadedMapper.java
License:Apache License
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    if (currentLockPath != null) {
        FileSystem fs = FileSystem.get(context.getConfiguration());
        fs.deleteOnExit(currentLockPath);
    }
}
From source file:com.splunk.shuttl.integration.hadoop.hbase.JobRunner.java
License:Apache License
/**
 * @param job
 * @param jobConfiguration
 * @param configuration
 * @throws Exception
 */
private boolean run(Job job) throws Exception {
    Configuration configuration = job.getConfiguration();
    Path inputPath = new Path(configuration.get(CONFIG_FILENAME));
    Path outputPath = new Path(configuration.get(CONFIG_OUTPUTPATH));
    FileSystem fSystem = FileSystem.get(configuration);

    CreateHBaseTableIfNotExists(configuration.get(CONFIG_TABLENAME));
    DeleteOutputPathIfExists(fSystem, outputPath);

    FileSystem fs = FileSystem.get(configuration);
    String headerString = readFirstLine(fs.open(new Path("")));
    configuration.set(JobConfigurationConstants.HEADER_STRING, headerString);

    HTable hTable = new HTable(job.getConfiguration(), configuration.get(CONFIG_TABLENAME));

    // Auto configure partitioner and reducer
    HFileOutputFormat.configureIncrementalLoad(job, hTable);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    boolean complete = job.waitForCompletion(true);
    if (complete) {
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(configuration);
        loader.doBulkLoad(outputPath, hTable);
    }
    fSystem.deleteOnExit(outputPath);
    return complete;
}
From source file:com.thinkbiganalytics.spark.io.HadoopUtil.java
License:Apache License
/**
 * Mark a path to be deleted when FileSystem is closed. Swallows any thrown IOException.
 *
 * @return {@code true} if deleteOnExit is successful, otherwise {@code false}
 */
public static boolean deleteOnExitSilently(@Nonnull final Path path, @Nonnull final FileSystem fs) {
    try {
        return fs.deleteOnExit(path);
    } catch (final IOException e) {
        log.debug("Failed to delete on exit silently: {}", path, e);
        return false;
    }
}
From source file:edu.umn.cs.spatialHadoop.core.Partitioner.java
License:Open Source License
/**
 * Sets the class and value of a partitioner in the given job
 * @param conf
 * @param partitioner
 * @throws IOException
 */
public static void setPartitioner(Configuration conf, Partitioner partitioner) throws IOException {
    conf.setClass(PartitionerClass, partitioner.getClass(), Partitioner.class);
    Path tempFile;
    FileSystem fs = FileSystem.get(conf);
    do {
        tempFile = new Path("cells_" + (int) (Math.random() * 1000000) + ".partitions");
    } while (fs.exists(tempFile));
    FSDataOutputStream out = fs.create(tempFile);
    partitioner.write(out);
    out.close();

    fs.deleteOnExit(tempFile);

    DistributedCache.addCacheFile(tempFile.toUri(), conf);
    conf.set(PartitionerValue, tempFile.getName());
}
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Ensures that the given class is in the class path of running jobs.
 * If the jar is not already in the class path, it is added to the
 * DistributedCache of the given job to ensure the associated job will
 * work fine.
 * @param conf
 * @param klass
 */
public static void addClassToPath(Configuration conf, Class<?> klass) {
    // Check if we need to add the containing jar to class path
    String klassJar = findContainingJar(klass);
    String shadoopJar = findContainingJar(SpatialSite.class);
    if (klassJar == null || (shadoopJar != null && klassJar.equals(shadoopJar)))
        return;
    Path containingJar = new Path(findContainingJar(klass));
    Path[] existingClassPaths = DistributedCache.getArchiveClassPaths(conf);
    if (existingClassPaths != null) {
        for (Path existingClassPath : existingClassPaths) {
            if (containingJar.getName().equals(existingClassPath.getName()))
                return;
        }
    }
    // The containing jar is a new one and needs to be copied to class path
    try {
        LOG.info("Adding JAR '" + containingJar.getName() + "' to job class path");
        FileSystem defaultFS = FileSystem.get(conf);
        Path libFolder;
        if (existingClassPaths != null && existingClassPaths.length > 0) {
            libFolder = existingClassPaths[0].getParent();
        } else {
            // First jar to be added like this. Create a new lib folder
            do {
                libFolder = new Path("lib_" + (int) (Math.random() * 100000));
            } while (defaultFS.exists(libFolder));
            defaultFS.mkdirs(libFolder);
            defaultFS.deleteOnExit(libFolder);
        }
        defaultFS.copyFromLocalFile(containingJar, libFolder);
        Path jarFullPath = new Path(libFolder, containingJar.getName()).makeQualified(defaultFS);
        jarFullPath = jarFullPath.makeQualified(defaultFS);
        DistributedCache.addArchiveToClassPath(jarFullPath, conf);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Set an array of cells in the job configuration. As the array might be
 * too large to store as one value, an alternative approach is used.
 * The cells are all written to a temporary file, and that file is added
 * to the DistributedCache of the job. Later on, a call to
 * {@link #getCells(Configuration)} will open the corresponding file from
 * DistributedCache and parse cells from that file.
 * @param conf
 * @param cellsInfo
 * @throws IOException
 */
public static void setCells(Configuration conf, CellInfo[] cellsInfo) throws IOException {
    Path tempFile;
    FileSystem fs = FileSystem.get(conf);
    do {
        tempFile = new Path("cells_" + (int) (Math.random() * 1000000) + ".cells");
    } while (fs.exists(tempFile));
    FSDataOutputStream out = fs.create(tempFile);
    out.writeInt(cellsInfo.length);
    for (CellInfo cell : cellsInfo) {
        cell.write(out);
    }
    out.close();

    fs.deleteOnExit(tempFile);

    DistributedCache.addCacheFile(tempFile.toUri(), conf);
    conf.set(OUTPUT_CELLS, tempFile.getName());
    LOG.info("Partitioning file into " + cellsInfo.length + " cells");
}
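The javadoc above refers to a getCells(Configuration) counterpart that reads the cells back inside tasks; its actual implementation lives in SpatialSite and is not reproduced on this page. What follows is only a hedged sketch of how such a reader could look up the cached file by the name stored under OUTPUT_CELLS and parse it back, assuming CellInfo is a Writable with a no-argument constructor.

// Hedged sketch only; the real SpatialSite.getCells(Configuration) may differ.
public static CellInfo[] getCells(Configuration conf) throws IOException {
    String cellsFileName = conf.get(OUTPUT_CELLS);
    if (cellsFileName == null)
        return null;
    // Look up the file that setCells() added to the DistributedCache.
    Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
    if (cacheFiles == null)
        return null;
    for (Path cacheFile : cacheFiles) {
        if (cacheFile.getName().contains(cellsFileName)) {
            FSDataInputStream in = FileSystem.getLocal(conf).open(cacheFile);
            try {
                int numCells = in.readInt();
                CellInfo[] cells = new CellInfo[numCells];
                for (int i = 0; i < numCells; i++) {
                    cells[i] = new CellInfo();   // assumes a no-arg constructor
                    cells[i].readFields(in);     // mirror of cell.write(out) above
                }
                return cells;
            } finally {
                in.close();
            }
        }
    }
    return null;
}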
From source file:edu.umn.cs.spatialHadoop.nasa.HDFPlot.java
License:Open Source License
public static void generateWaterMask(Path wmImage, OperationsParams params)
        throws IOException, InterruptedException, ClassNotFoundException {
    // Need to recover holes on write but the water mask is not set,
    // need to put it first
    Path wmPath = new Path(
            params.get(HDFRecordReader.WATER_MASK_PATH, "http://e4ftl01.cr.usgs.gov/MOLT/MOD44W.005/2000.02.24/"));
    wmPath = new Path(wmPath, "*.hdf");
    params.set("recover", "none");
    params.setBoolean("recoverholes", false);
    params.set("dataset", "water_mask");
    if (params.get("shape") == null) {
        // Set the default shape value
        params.setClass("shape", NASARectangle.class, Shape.class);
    } else if (!(params.getShape("shape") instanceof NASAShape)) {
        System.err.println("The specified shape " + params.get("shape") + " is not an instance of NASAShape");
        System.exit(1);
    }
    if (params.get("mbr") == null) {
        // Set to the same value as query rectangle or the whole world
        params.set("mbr", params.get("rect", "-180,-90,180,90"));
    }

    SingleLevelPlot.plot(new Path[] { wmPath }, wmImage, HDFRasterizeWaterMask.class, params);
    FileSystem outFS = wmImage.getFileSystem(params);
    outFS.deleteOnExit(wmImage);
}