Example usage for org.apache.hadoop.fs FileSystem deleteOnExit

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem.deleteOnExit from open-source projects.

Prototype

public boolean deleteOnExit(Path f) throws IOException

Document

Marks a path to be deleted when the FileSystem is closed or the JVM exits; marked paths are kept in a cache of files pending deletion.
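
A minimal, self-contained sketch of the typical pattern, with an illustrative class name and scratch path (not taken from the projects listed below): create a scratch file, mark it with deleteOnExit, and rely on the FileSystem being closed at JVM shutdown to remove it.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DeleteOnExitExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Write a scratch file that should not outlive this process.
        Path scratch = new Path("/tmp/deleteOnExit-example-" + System.currentTimeMillis());
        try (FSDataOutputStream out = fs.create(scratch)) {
            out.writeUTF("temporary data");
        }

        // Mark the path for deletion; it is removed when this FileSystem is
        // closed (cached FileSystem instances are closed on JVM shutdown).
        boolean marked = fs.deleteOnExit(scratch);
        System.out.println("marked for deletion: " + marked);
    }
}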

Usage

From source file:com.linkedin.oneclick.wordcount.WordCount.java

License:Apache License

static Path tempFile(Configuration conf, String name) {
    try {
        Path result = new Path(conf.get("hadoop.tmp.dir"), name);
        log.info("tempFile=" + result.toString());
        FileSystem fs = result.getFileSystem(conf);
        fs.mkdirs(result.getParent());
        fs.deleteOnExit(result);
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:com.ricemap.spateDB.core.SpatialSite.java

License:Apache License

public static void setCells(JobConf job, CellInfo[] cellsInfo) throws IOException {
    Path tempFile;
    FileSystem fs = FileSystem.get(job);
    do {
        tempFile = new Path(job.getJobName() + "_" + (int) (Math.random() * 1000000) + ".cells");
    } while (fs.exists(tempFile));
    FSDataOutputStream out = fs.create(tempFile);
    out.writeInt(cellsInfo.length);
    for (CellInfo cell : cellsInfo) {
        cell.write(out);
    }
    out.close();

    fs.deleteOnExit(tempFile);

    DistributedCache.addCacheFile(tempFile.toUri(), job);
    job.set(OUTPUT_CELLS, tempFile.getName());
    LOG.info("Partitioning file into " + cellsInfo.length + " cells");
}

From source file:com.skp.experiment.cf.als.hadoop.ParallelALSFactorizationJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("indexSizes", null, "index sizes Path", true);
    addOption("startIteration", null, "start iteration number", String.valueOf(0));
    addOption("oldM", null, "old M matrix Path.", null);
    addOption("largeUserFeatures", null, "true if user x feature matrix is too large for memory",
            String.valueOf(true));
    addOption("rmseCurve", null, "true if want to extract rmse curve", String.valueOf(true));
    addOption("cleanUp", null, "true if want to clean up temporary matrix", String.valueOf(true));
    addOption("useTransform", null, "true if using logarithm as transform", String.valueOf(true));
    addOption("rateIndex", null, "0 based index for rate column in input file.", String.valueOf(2));
    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    try {
        /** step 0: fetch dimensions of the training set matrix. */
        Map<String, String> indexSizesTmp = ALSMatrixUtil.fetchTextFiles(new Path(getOption("indexSizes")),
                DELIMETER, Arrays.asList(0), Arrays.asList(1));

        numFeatures = Integer.parseInt(parsedArgs.get("--numFeatures"));
        numIterations = Integer.parseInt(parsedArgs.get("--numIterations"));
        lambda = Double.parseDouble(parsedArgs.get("--lambda"));
        alpha = Double.parseDouble(parsedArgs.get("--alpha"));
        implicitFeedback = Boolean.parseBoolean(parsedArgs.get("--implicitFeedback"));
        numUsers = Integer.parseInt(indexSizesTmp.get("0"));
        numItems = Integer.parseInt(indexSizesTmp.get("1"));

        numTaskTrackers = HadoopClusterUtil.getNumberOfTaskTrackers(getConf()) * multiplyMapTasks;
        startIteration = Integer.parseInt(parsedArgs.get("--startIteration"));
        largeUserFeatures = Boolean.parseBoolean(getOption("largeUserFeatures"));
        useRMSECurve = Boolean.parseBoolean(getOption("rmseCurve"));
        cleanUp = Boolean.parseBoolean(getOption("cleanUp"));
        useTransform = Boolean.parseBoolean(getOption("useTransform"));
        rateIndex = Integer.parseInt(getOption("rateIndex"));
        FileSystem fs = FileSystem.get(getConf());
        if (!fs.exists(pathToTransformed())) {
            if (useTransform) {
                // transform price into rating
                Job transformJob = prepareJob(getInputPath(), pathToTransformed(), TextInputFormat.class,
                        TransformColumnValueMapper.class, NullWritable.class, Text.class,
                        TextOutputFormat.class);
                transformJob.waitForCompletion(true);
            } else {

                FileUtil.copy(FileSystem.get(getConf()), getInputPath(), FileSystem.get(getConf()),
                        pathToTransformed(), false, getConf());
            }
        }
        /*
        if (getOption("oldM") != null) {
          runOnetimeSolver(pathToTransformed(), getOutputPath("U"), new Path(getOption("oldM")));
          return 0;
        }
        */
        /*
         * compute the factorization A = U M'
         *
         * where A (users x items)    is the matrix of known ratings
         *       U (users x features) is the representation of users in the feature space
         *       M (items x features) is the representation of items in the feature space
         */
        if (startIteration == 0) {
            if (!fs.exists(pathToItemRatings())) {
                // create A' 
                Job itemRatings = prepareJob(pathToTransformed(), pathToItemRatings(), TextInputFormat.class,
                        ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class,
                        VectorSumReducer.class, IntWritable.class, VectorWritable.class,
                        SequenceFileOutputFormat.class);
                itemRatings.setCombinerClass(VectorSumReducer.class);
                long matrixSizeExp = (long) (8L * numUsers * numFeatures * SAFE_MARGIN);
                long memoryThreshold = HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT
                        / (long) HadoopClusterUtil.MAP_TASKS_PER_NODE;
                int numTaskPerDataNode = Math.max(1,
                        (int) (HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT / (double) matrixSizeExp));
                //log.info("matrix Size: " + matrixSizeExp + ", memorhThreshold: " + memoryThreshold + ", numTaskPerDataNode: " + numTaskPerDataNode);
                if (matrixSizeExp > memoryThreshold) {
                    //log.info("A: {}", numTaskPerDataNode * HadoopClusterUtil.getNumberOfTaskTrackers(getConf()));
                    int numReducer = Math.min(
                            numTaskPerDataNode * HadoopClusterUtil.getNumberOfTaskTrackers(getConf()),
                            HadoopClusterUtil.getMaxMapTasks(getConf()));
                    //log.info("Number Of Reducer: " + numReducer);
                    itemRatings.setNumReduceTasks(numReducer);
                }

                itemRatings.waitForCompletion(true);
            }

            if (!fs.exists(pathToUserRatings())) {
                Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
                        IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
                        VectorWritable.class);
                userRatings.setNumReduceTasks(HadoopClusterUtil.getNumberOfTaskTrackers(getConf()));
                userRatings.setCombinerClass(MergeVectorsCombiner.class);
                userRatings.setNumReduceTasks(HadoopClusterUtil.getMaxMapTasks(getConf()));
                userRatings.waitForCompletion(true);
            }
            if (!fs.exists(getOutputPath("userItemCnt"))) {
                // count item per user
                Job userItemCntsJob = prepareJob(pathToUserRatings(), getOutputPath("userItemCnt"),
                        SequenceFileInputFormat.class, UserItemCntsMapper.class, IntWritable.class,
                        IntWritable.class, SequenceFileOutputFormat.class);
                userItemCntsJob.setJobName("user ratings count");
                userItemCntsJob.waitForCompletion(true);
            }

            if (!fs.exists(getTempPath("averageRatings"))) {
                //TODO this could be fiddled into one of the upper jobs
                Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
                        AverageRatingMapper.class, IntWritable.class, VectorWritable.class,
                        MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
                averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
                averageItemRatings.waitForCompletion(true);
            }
            if (!fs.exists(new Path(pathToM(-1), "part-m-00000"))) {
                Vector averageRatings = ALSMatrixUtil.readFirstRow(getTempPath("averageRatings"), getConf());

                /** create an initial M */
                initializeM(averageRatings);
            }
        }

        for (int currentIteration = startIteration; currentIteration < numIterations; currentIteration++) {
            DistributedRowMatrix curM = new DistributedRowMatrix(pathToM(currentIteration - 1),
                    getTempPath("Mtemp/tmp-" + String.valueOf(currentIteration - 1) + "/M"), numItems,
                    numFeatures);
            curM.setConf(getConf());
            DistributedRowMatrix YtransposeY = curM.times(curM);
            /** broadcast M, read A row-wise, recompute U row-wise */
            log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
            runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
                    YtransposeY.getRowPath(), numItems, false);

            DistributedRowMatrix curU = new DistributedRowMatrix(pathToU(currentIteration),
                    getTempPath("Utmp/tmp-" + String.valueOf(currentIteration) + "/U"), numUsers, numFeatures);
            curU.setConf(getConf());
            DistributedRowMatrix XtransposeX = curU.times(curU);

            /** broadcast U, read A' row-wise, recompute M row-wise */
            log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
            runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration),
                    XtransposeX.getRowPath(), numUsers, largeUserFeatures);

            /** calculate RMSE on each updated matrix U, M and decide whether to iterate further */
            if (currentIteration > startIteration && useRMSECurve) {
                Pair<Integer, Double> UsquaredError = calculateMatrixDistanceSquared(
                        pathToU(currentIteration - 1), pathToU(currentIteration), currentIteration);
                Pair<Integer, Double> MsquaredError = calculateMatrixDistanceSquared(
                        pathToM(currentIteration - 1), pathToM(currentIteration), currentIteration);
                String currentRMSE = currentIteration + DELIMETER + UsquaredError.getFirst() + DELIMETER
                        + UsquaredError.getSecond() + DELIMETER + MsquaredError.getFirst() + DELIMETER
                        + MsquaredError.getSecond() + DefaultOptionCreator.NEWLINE;
                rmsePerIteration += currentRMSE;
                log.info("iteration {}: {}", currentIteration, currentRMSE);
            }
            if (currentIteration >= startIteration + 2 && cleanUp) {
                fs.deleteOnExit(pathToU(currentIteration - 2));
                fs.deleteOnExit(pathToM(currentIteration - 2));
            }
        }
        return 0;
    } catch (Exception e) {
        e.printStackTrace();
        return -1;
    } finally {
        if (useRMSECurve) {
            HadoopClusterUtil.writeToHdfs(getConf(), getOutputPath("RMSE"), rmsePerIteration);
        }
    }
}

From source file:com.skp.experiment.cf.als.hadoop.SolveImplicitFeedbackMultithreadedMapper.java

License:Apache License

@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    if (currentLockPath != null) {
        FileSystem fs = FileSystem.get(context.getConfiguration());
        fs.deleteOnExit(currentLockPath);
    }
}

From source file:com.splunk.shuttl.integration.hadoop.hbase.JobRunner.java

License:Apache License

/**
 * @param job the MapReduce job to run and bulk-load into HBase
 * @throws Exception
 */
private boolean run(Job job) throws Exception {

    Configuration configuration = job.getConfiguration();

    Path inputPath = new Path(configuration.get(CONFIG_FILENAME));
    Path outputPath = new Path(configuration.get(CONFIG_OUTPUTPATH));

    FileSystem fSystem = FileSystem.get(configuration);

    CreateHBaseTableIfNotExists(configuration.get(CONFIG_TABLENAME));

    DeleteOutputPathIfExists(fSystem, outputPath);

    FileSystem fs = FileSystem.get(configuration);
    String headerString = readFirstLine(fs.open(new Path("")));
    configuration.set(JobConfigurationConstants.HEADER_STRING, headerString);

    HTable hTable = new HTable(job.getConfiguration(), configuration.get(CONFIG_TABLENAME));

    // Auto configure partitioner and reducer
    HFileOutputFormat.configureIncrementalLoad(job, hTable);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    boolean complete = job.waitForCompletion(true);

    if (complete) {
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(configuration);
        loader.doBulkLoad(outputPath, hTable);
    }

    fSystem.deleteOnExit(outputPath);

    return complete;
}

From source file:com.thinkbiganalytics.spark.io.HadoopUtil.java

License:Apache License

/**
 * Mark a path to be deleted when the FileSystem is closed. Swallows any thrown IOException.
 *
 * @return {@code true} if deleteOnExit is successful, otherwise {@code false}
 */
public static boolean deleteOnExitSilently(@Nonnull final Path path, @Nonnull final FileSystem fs) {
    try {
        return fs.deleteOnExit(path);
    } catch (final IOException e) {
        log.debug("Failed to delete on exit silently: {}", path, e);
        return false;
    }
}
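
A hedged usage sketch for the helper above (the wrapper class, method name, and scratch path are illustrative assumptions, not part of the original project): the point of swallowing the IOException is that failing to register a path for cleanup should not abort the caller's primary work.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

class HadoopUtilUsageSketch {
    static void registerScratchCleanup(Configuration conf) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical scratch location used only for illustration.
        Path scratchDir = new Path("/tmp/spark-io-scratch");
        boolean registered = HadoopUtil.deleteOnExitSilently(scratchDir, fs);
        // If registration failed, the helper already logged it; continue either way.
    }
}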

From source file:edu.umn.cs.spatialHadoop.core.Partitioner.java

License:Open Source License

/**
 * Sets the class and value of a partitioner in the given job
 * @param conf
 * @param partitioner
 * @throws IOException
 */
public static void setPartitioner(Configuration conf, Partitioner partitioner) throws IOException {
    conf.setClass(PartitionerClass, partitioner.getClass(), Partitioner.class);
    Path tempFile;
    FileSystem fs = FileSystem.get(conf);
    do {
        tempFile = new Path("cells_" + (int) (Math.random() * 1000000) + ".partitions");
    } while (fs.exists(tempFile));
    FSDataOutputStream out = fs.create(tempFile);
    partitioner.write(out);
    out.close();

    fs.deleteOnExit(tempFile);

    DistributedCache.addCacheFile(tempFile.toUri(), conf);
    conf.set(PartitionerValue, tempFile.getName());
}

From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java

License:Open Source License

/**
 * Ensures that the given class is in the class path of running jobs.
 * If the jar is not already in the class path, it is added to the
 * DistributedCache of the given job so that the associated job runs
 * correctly.
 * @param conf
 * @param klass
 */
public static void addClassToPath(Configuration conf, Class<?> klass) {
    // Check if we need to add the containing jar to class path
    String klassJar = findContainingJar(klass);
    String shadoopJar = findContainingJar(SpatialSite.class);
    if (klassJar == null || (shadoopJar != null && klassJar.equals(shadoopJar)))
        return;
    Path containingJar = new Path(findContainingJar(klass));
    Path[] existingClassPaths = DistributedCache.getArchiveClassPaths(conf);
    if (existingClassPaths != null) {
        for (Path existingClassPath : existingClassPaths) {
            if (containingJar.getName().equals(existingClassPath.getName()))
                return;
        }
    }
    // The containing jar is a new one and needs to be copied to class path
    try {
        LOG.info("Adding JAR '" + containingJar.getName() + "' to job class path");
        FileSystem defaultFS = FileSystem.get(conf);
        Path libFolder;
        if (existingClassPaths != null && existingClassPaths.length > 0) {
            libFolder = existingClassPaths[0].getParent();
        } else {
            // First jar to be added like this. Create a new lib folder
            do {
                libFolder = new Path("lib_" + (int) (Math.random() * 100000));
            } while (defaultFS.exists(libFolder));
            defaultFS.mkdirs(libFolder);
            defaultFS.deleteOnExit(libFolder);
        }
        defaultFS.copyFromLocalFile(containingJar, libFolder);
        Path jarFullPath = new Path(libFolder, containingJar.getName()).makeQualified(defaultFS);
        DistributedCache.addArchiveToClassPath(jarFullPath, conf);
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java

License:Open Source License

/**
 * Set an array of cells in the job configuration. As the array might be
 * too large to store as a single value, an alternative approach is used.
 * The cells are all written to a temporary file, and that file is added
 * to the DistributedCache of the job. Later on, a call to
 * {@link #getCells(Configuration)} will open the corresponding file from
 * DistributedCache and parse cells from that file.
 * @param conf
 * @param cellsInfo
 * @throws IOException
 */
public static void setCells(Configuration conf, CellInfo[] cellsInfo) throws IOException {
    Path tempFile;
    FileSystem fs = FileSystem.get(conf);
    do {
        tempFile = new Path("cells_" + (int) (Math.random() * 1000000) + ".cells");
    } while (fs.exists(tempFile));
    FSDataOutputStream out = fs.create(tempFile);
    out.writeInt(cellsInfo.length);
    for (CellInfo cell : cellsInfo) {
        cell.write(out);
    }
    out.close();

    fs.deleteOnExit(tempFile);

    DistributedCache.addCacheFile(tempFile.toUri(), conf);
    conf.set(OUTPUT_CELLS, tempFile.getName());
    LOG.info("Partitioning file into " + cellsInfo.length + " cells");
}
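
The javadoc above points to getCells(Configuration) as the counterpart that reads the cells back inside a task. Below is a hedged sketch of that read-back, not the actual SpatialSite source; it assumes CellInfo is a Writable with a no-argument constructor, which its use with write(out) above suggests.

public static CellInfo[] getCellsSketch(Configuration conf) throws IOException {
    String cellsFileName = conf.get(OUTPUT_CELLS);
    if (cellsFileName == null)
        return null;
    // Files added via DistributedCache.addCacheFile are localized for each task.
    Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
    if (cacheFiles == null)
        return null;
    for (Path cached : cacheFiles) {
        if (cached.getName().equals(cellsFileName)) {
            FSDataInputStream in = FileSystem.getLocal(conf).open(cached);
            try {
                CellInfo[] cells = new CellInfo[in.readInt()];
                for (int i = 0; i < cells.length; i++) {
                    cells[i] = new CellInfo(); // assumes a no-arg constructor
                    cells[i].readFields(in);
                }
                return cells;
            } finally {
                in.close();
            }
        }
    }
    return null; // cells file not found in the local cache
}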

From source file:edu.umn.cs.spatialHadoop.nasa.HDFPlot.java

License:Open Source License

public static void generateWaterMask(Path wmImage, OperationsParams params)
        throws IOException, InterruptedException, ClassNotFoundException {
    // Need to recover holes on write but the water mask is not set,
    // need to put it first
    Path wmPath = new Path(params.get(HDFRecordReader.WATER_MASK_PATH,
            "http://e4ftl01.cr.usgs.gov/MOLT/MOD44W.005/2000.02.24/"));
    wmPath = new Path(wmPath, "*.hdf");
    params.set("recover", "none");
    params.setBoolean("recoverholes", false);
    params.set("dataset", "water_mask");
    if (params.get("shape") == null) {
        // Set the default shape value
        params.setClass("shape", NASARectangle.class, Shape.class);
    } else if (!(params.getShape("shape") instanceof NASAShape)) {
        System.err.println("The specified shape " + params.get("shape") + " is not an instance of NASAShape");
        System.exit(1);
    }

    if (params.get("mbr") == null) {
        // Set to the same value as query rectangle or the whole world
        params.set("mbr", params.get("rect", "-180,-90,180,90"));
    }

    SingleLevelPlot.plot(new Path[] { wmPath }, wmImage, HDFRasterizeWaterMask.class, params);
    FileSystem outFS = wmImage.getFileSystem(params);
    outFS.deleteOnExit(wmImage);
}