Example usage for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem#getFileStatus.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
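
A minimal, self-contained sketch of a call (the configuration and path are illustrative assumptions, not taken from the examples below):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf); // default filesystem from the configuration
        FileStatus status = fs.getFileStatus(new Path("/tmp/example.txt")); // hypothetical path
        System.out.println("length: " + status.getLen());
        System.out.println("directory: " + status.isDirectory());
        System.out.println("modified: " + status.getModificationTime());
    }
}

Note that getFileStatus throws FileNotFoundException when the path does not exist, which is why several of the examples below guard the call with fs.exists().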

Usage

From source file:be.ugent.intec.halvade.MapReduceRunner.java

License:Open Source License

protected void addInputFiles(String input, Configuration conf, Job job, String filter)
        throws URISyntaxException, IOException {
    FileSystem fs = FileSystem.get(new URI(input), conf);
    if (fs.getFileStatus(new Path(input)).isDirectory()) {
        // add every file in directory
        FileStatus[] files = fs.listStatus(new Path(input));
        for (FileStatus file : files) {
            if (!file.isDirectory() && file.getPath().getName().endsWith(filter)) {
                FileInputFormat.addInputPath(job, file.getPath());
            }
        }
    } else {
        FileInputFormat.addInputPath(job, new Path(input));
    }
}

From source file:be.ugent.intec.halvade.uploader.HalvadeUploader.java

License:Open Source License

private int processFiles() throws IOException, InterruptedException, URISyntaxException, Throwable {
    Timer timer = new Timer();
    timer.start();

    AWSUploader upl = null;
    FileSystem fs = null;
    // write to s3?
    boolean useAWS = false;
    if (outputDir.startsWith("s3")) {
        useAWS = true;
        String existingBucketName = outputDir.replace("s3://", "").split("/")[0];
        outputDir = outputDir.replace("s3://" + existingBucketName + "/", "");
        upl = new AWSUploader(existingBucketName, SSE, profile);
    } else {
        Configuration conf = getConf();
        fs = FileSystem.get(new URI(outputDir), conf);
        Path outpath = new Path(outputDir);
        if (fs.exists(outpath) && !fs.getFileStatus(outpath).isDirectory()) {
            Logger.DEBUG("please provide an output directory");
            return 1;
        }
    }

    FileReaderFactory factory = FileReaderFactory.getInstance(mthreads);
    if (manifest != null) {
        Logger.DEBUG("reading input files from " + manifest);
        // read from file
        BufferedReader br = new BufferedReader(new FileReader(manifest));
        String line;
        while ((line = br.readLine()) != null) {
            String[] files = line.split("\t");
            if (files.length == 2) {
                factory.addReader(files[0], files[1], false);
            } else if (files.length == 1) {
                factory.addReader(files[0], null, isInterleaved);
            }
        }
        br.close();
    } else if (file1 != null && file2 != null) {
        Logger.DEBUG("Paired-end read input in 2 files.");
        factory.addReader(file1, file2, false);
    } else if (file1 != null) {
        if (isInterleaved)
            Logger.DEBUG("Paired-end read input in 1 file.");
        else
            Logger.DEBUG("Single-end read input in 1 file.");
        factory.addReader(file1, null, isInterleaved);
    } else {
        Logger.DEBUG("Incorrect input, use either a manifest file or give both file1 and file2 as input.");
    }

    // start reading
    (new Thread(factory)).start();

    int bestThreads = mthreads;
    long maxFileSize = getBestFileSize();
    if (useAWS) {
        AWSInterleaveFiles[] fileThreads = new AWSInterleaveFiles[bestThreads];
        // start interleaveFile threads
        for (int t = 0; t < bestThreads; t++) {
            fileThreads[t] = new AWSInterleaveFiles(outputDir + "halvade_" + t + "_", maxFileSize, upl, t,
                    codec);
            fileThreads[t].start();
        }
        for (int t = 0; t < bestThreads; t++)
            fileThreads[t].join();
        if (upl != null)
            upl.shutDownNow();
    } else {

        HDFSInterleaveFiles[] fileThreads = new HDFSInterleaveFiles[bestThreads];
        // start interleaveFile threads
        for (int t = 0; t < bestThreads; t++) {
            fileThreads[t] = new HDFSInterleaveFiles(outputDir + "halvade_" + t + "_", maxFileSize, fs, t,
                    codec);
            fileThreads[t].start();
        }
        for (int t = 0; t < bestThreads; t++)
            fileThreads[t].join();
    }
    factory.finalize();
    timer.stop();
    Logger.DEBUG("Time to process data: " + timer.getFormattedCurrentTime());
    return 0;
}

From source file:be.ugent.intec.halvade.utils.HalvadeFileUtils.java

License:Open Source License

protected static boolean checkCorrectSize(String onHDFS, String onScratch, FileSystem fs) throws IOException {
    File f = new File(onScratch);
    return fs.getFileStatus(new Path(onHDFS)).getLen() == f.length();
}

From source file:be.ugent.intec.halvade.utils.HalvadeFileUtils.java

License:Open Source License

/**
 * @return 0 if successful, -1 if the file size is incorrect, and -2 if an exception occurred
 */
protected static int privateDownloadFileFromHDFS(TaskInputOutputContext context, FileSystem fs, String from,
        String to) {
    try {
        // check if file is present on local scratch
        File f = new File(to);
        if (!f.exists()) {
            Logger.DEBUG("attempting download of \"" + to + "\"");
            fs.copyToLocalFile(new Path(from), new Path(to));
            context.getCounter(HalvadeCounters.FIN_FROM_HDFS)
                    .increment(fs.getFileStatus(new Path(from)).getLen());
        } else {
            // check if filesize is correct
            if (fs.getFileStatus(new Path(from)).getLen() != f.length()) {
                // incorrect filesize, remove and download again
                Logger.DEBUG("incorrect filesize: " + f.length() + " =/= "
                        + fs.getFileStatus(new Path(from)).getLen());
                f.delete();
                fs.copyToLocalFile(new Path(from), new Path(to));
                context.getCounter(HalvadeCounters.FIN_FROM_HDFS)
                        .increment(fs.getFileStatus(new Path(from)).getLen());

            } else {
                Logger.DEBUG("file \"" + to + "\" exists");
            }
        }
        if (fs.getFileStatus(new Path(from)).getLen() != f.length())
            return -1;
        else
            return 0;
    } catch (IOException ex) {
        Logger.DEBUG("failed to download " + from + " from HDFS: " + ex.getLocalizedMessage());
        Logger.EXCEPTION(ex);
        return -2;
    }
}

From source file:be.ugent.intec.halvade.utils.HalvadeFileUtils.java

License:Open Source License

/**
 * @return 0 if successful, -1 if the file size is incorrect, and -2 if an exception occurred
 */
protected static int privateUploadFileToHDFS(TaskInputOutputContext context, FileSystem fs, String from,
        String to) {
    try {
        // check if file is present on HDFS
        Path toPath = new Path(to);
        Path fromPath = new Path(from);
        File f = new File(from);
        if (!fs.exists(toPath)) {
            fs.copyFromLocalFile(fromPath, toPath);
            context.getCounter(HalvadeCounters.FOUT_TO_HDFS).increment(f.length());
        } else {
            // check if filesize is correct
            if (fs.getFileStatus(toPath).getLen() != f.length()) {
                // incorrect file size, remove and upload again
                fs.delete(toPath, false);
                fs.copyFromLocalFile(fromPath, toPath);
                context.getCounter(HalvadeCounters.FOUT_TO_HDFS).increment(f.length());
            }
        }
        if (fs.getFileStatus(toPath).getLen() != f.length())
            return -1;
        else
            return 0;
    } catch (IOException ex) {
        Logger.DEBUG("failed to upload " + from + " to HDFS: " + ex.getLocalizedMessage());
        Logger.EXCEPTION(ex);
        return -2;
    }
}

From source file:bigfat.hadoop.HDFSDirInputStream.java

License:Apache License

/**
 * Creates an input stream that reads through all the files in one
 * directory. Note that the files are sorted by name, using the
 * comparator.
 *
 * @param fs   the filesystem containing the directory
 * @param dir  the directory (or single file) to read
 * @param comp the comparator used to sort the file names, or null for natural ordering
 * @throws IOException
 */
public HDFSDirInputStream(FileSystem fs, String dir, Comparator<String> comp) throws IOException {
    this.fs = fs;
    Path p = new Path(dir);
    FileStatus fstate = fs.getFileStatus(p);
    if (fstate.isDir()) {
        FileStatus[] child = fs.globStatus(new Path(dir + "/*"));
        LinkedList<String> s = new LinkedList<String>();
        Map<String, Path> map = new HashMap<String, Path>();
        for (FileStatus c : child) {
            if (c.isDir())
                continue;
            map.put(c.getPath().getName(), c.getPath());
            s.add(c.getPath().getName());
        }
        if (comp != null)
            Collections.sort(s, comp);
        else
            Collections.sort(s);
        Iterator<String> it = s.iterator();
        while (it.hasNext()) {
            String n = it.next();
            Path pr = map.get(n);
            this.appendFile(pr.toString());
        }
    } else {
        this.appendFile(dir);
    }
}
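
A hypothetical usage sketch (an assumption, not from the source: HDFSDirInputStream extends java.io.InputStream, and appendFile queues each file for sequential reading):

FileSystem fs = FileSystem.get(new Configuration());
// passing null for the comparator falls back to natural String ordering, per the branch above
HDFSDirInputStream in = new HDFSDirInputStream(fs, "/data/parts", null);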

From source file:cascading.flow.hadoop.util.HadoopUtil.java

License:Open Source License

/**
 * Copies paths from one local path to a remote path. If syncTimes is true, the remote modification time is
 * changed to match the local 'from' path.
 * <p/>
 * Returns a map of file-name to remote modification times if the remote time is different than the local time.
 *
 * @param config      the current configuration
 * @param commonPaths a map of local paths to their remote counterparts
 * @param syncTimes   whether to align remote modification times with the local files
 */
public static Map<String, Long> syncPaths(Configuration config, Map<Path, Path> commonPaths,
        boolean syncTimes) {
    if (commonPaths == null)
        return Collections.emptyMap();

    Map<String, Long> timestampMap = new HashMap<>();

    Map<Path, Path> copyPaths = getCopyPaths(config, commonPaths); // tests remote file existence or if stale

    LocalFileSystem localFS = getLocalFS(config);
    FileSystem remoteFS = getDefaultFS(config);

    for (Map.Entry<Path, Path> entry : copyPaths.entrySet()) {
        Path localPath = entry.getKey();
        Path remotePath = entry.getValue();

        try {
            LOG.info("copying from: {}, to: {}", localPath, remotePath);
            remoteFS.copyFromLocalFile(localPath, remotePath);

            if (!syncTimes) {
                timestampMap.put(remotePath.getName(),
                        remoteFS.getFileStatus(remotePath).getModificationTime());
                continue;
            }
        } catch (IOException exception) {
            throw new FlowException("unable to copy local: " + localPath + " to remote: " + remotePath,
                    exception);
        }

        FileStatus localFileStatus = null;

        try {
            // sync the modified times so we can lazily upload jars to hdfs after job is started
            // otherwise modified time will be local to hdfs
            localFileStatus = localFS.getFileStatus(localPath);
            remoteFS.setTimes(remotePath, localFileStatus.getModificationTime(), -1); // don't set the access time
        } catch (IOException exception) {
            LOG.info(
                    "unable to set local modification time on remote file: {}, 'dfs.namenode.accesstime.precision' may be set to 0 on HDFS.",
                    remotePath);

            if (localFileStatus != null)
                timestampMap.put(remotePath.getName(), localFileStatus.getModificationTime());
        }
    }

    return timestampMap;
}
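
A hedged usage sketch (the paths below are illustrative assumptions; commonPaths maps each local Path to its remote counterpart, as getCopyPaths in the next example confirms):

Configuration conf = new Configuration();
Map<Path, Path> commonPaths = new HashMap<>();
commonPaths.put(new Path("file:///tmp/libs/app.jar"), new Path("/staging/libs/app.jar"));
// copies missing or stale files, then aligns remote modification times with the local ones
Map<String, Long> remoteTimes = HadoopUtil.syncPaths(conf, commonPaths, true);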

From source file:cascading.flow.hadoop.util.HadoopUtil.java

License:Open Source License

private static Map<Path, Path> getCopyPaths(Configuration config, Map<Path, Path> commonPaths) {
    Map<Path, Path> copyPaths = new HashMap<Path, Path>();

    FileSystem remoteFS = getDefaultFS(config);
    FileSystem localFS = getLocalFS(config);

    for (Map.Entry<Path, Path> entry : commonPaths.entrySet()) {
        Path localPath = entry.getKey();
        Path remotePath = entry.getValue();

        try {
            boolean localExists = localFS.exists(localPath);
            boolean remoteExist = remoteFS.exists(remotePath);

            if (localExists && !remoteExist) {
                copyPaths.put(localPath, remotePath);
            } else if (localExists) {
                long localModTime = localFS.getFileStatus(localPath).getModificationTime();
                long remoteModTime = remoteFS.getFileStatus(remotePath).getModificationTime();

                if (localModTime > remoteModTime)
                    copyPaths.put(localPath, remotePath);
            }
        } catch (IOException exception) {
            throw new FlowException("unable to get handle to underlying filesystem", exception);
        }
    }

    return copyPaths;
}

From source file:cascading.flow.tez.util.TezUtil.java

License:Open Source License

public static Map<Path, Path> addToClassPath(Configuration config, String stagingRoot, String resourceSubPath,
        Collection<String> classpath, LocalResourceType resourceType, Map<String, LocalResource> localResources,
        Map<String, String> environment) {
    if (classpath == null)
        return null;

    // map each given classpath entry to a fully qualified local and remote path
    Map<String, Path> localPaths = new HashMap<>();
    Map<String, Path> remotePaths = new HashMap<>();

    HadoopUtil.resolvePaths(config, classpath, stagingRoot, resourceSubPath, localPaths, remotePaths);

    try {
        LocalFileSystem localFS = HadoopUtil.getLocalFS(config);

        for (String fileName : localPaths.keySet()) {
            Path artifact = localPaths.get(fileName);
            Path remotePath = remotePaths.get(fileName);

            if (remotePath == null)
                remotePath = artifact;

            addResource(localResources, environment, fileName, localFS.getFileStatus(artifact), remotePath,
                    resourceType);
        }

        FileSystem defaultFS = HadoopUtil.getDefaultFS(config);

        for (String fileName : remotePaths.keySet()) {
            Path artifact = remotePaths.get(fileName);
            Path localPath = localPaths.get(fileName);

            if (localPath != null)
                continue;

            addResource(localResources, environment, fileName, defaultFS.getFileStatus(artifact), artifact,
                    resourceType);
        }
    } catch (IOException exception) {
        throw new FlowException("unable to set remote resource paths", exception);
    }

    return getCommonPaths(localPaths, remotePaths);
}

From source file:cascading.tap.hadoop.Hadoop18TapUtil.java

License:Open Source License

private static void moveTaskOutputs(JobConf conf, FileSystem fs, Path jobOutputDir, Path taskOutput)
        throws IOException {
    String taskId = conf.get("mapred.task.id");

    if (fs.isFile(taskOutput)) {
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTaskOutputPath(conf));
        if (!fs.rename(taskOutput, finalOutputPath)) {
            if (!fs.delete(finalOutputPath, true)) {
                throw new IOException("Failed to delete earlier output of task: " + taskId);
            }
            if (!fs.rename(taskOutput, finalOutputPath)) {
                throw new IOException("Failed to save output of task: " + taskId);
            }
        }
        LOG.debug("Moved " + taskOutput + " to " + finalOutputPath);
    } else if (fs.getFileStatus(taskOutput).isDir()) {
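        // taskOutput is a directory: recreate it under the final output path and move its children recursively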
        FileStatus[] paths = fs.listStatus(taskOutput);
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTaskOutputPath(conf));
        fs.mkdirs(finalOutputPath);
        if (paths != null) {
            for (FileStatus path : paths) {
                moveTaskOutputs(conf, fs, jobOutputDir, path.getPath());
            }
        }
    }
}