Example usage for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem#getFileStatus.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
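
A minimal, self-contained sketch of a call (the configuration and path are illustrative assumptions, not taken from the examples below):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf); // default filesystem from the configuration
        FileStatus status = fs.getFileStatus(new Path("/tmp/example.txt")); // hypothetical path
        System.out.println("length: " + status.getLen());
        System.out.println("directory: " + status.isDirectory());
        System.out.println("modified: " + status.getModificationTime());
    }
}

Note that getFileStatus throws FileNotFoundException when the path does not exist, which is why several of the examples below guard the call with fs.exists().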

Usage

From source file:be.ugent.intec.halvade.MapReduceRunner.java

License:Open Source License

protected void addInputFiles(String input, Configuration conf, Job job, String filter)
        throws URISyntaxException, IOException {
    FileSystem fs = FileSystem.get(new URI(input), conf);
    if (fs.getFileStatus(new Path(input)).isDirectory()) {
        // add every file in directory
        FileStatus[] files = fs.listStatus(new Path(input));
        for (FileStatus file : files) {
            if (!file.isDirectory() && file.getPath().getName().endsWith(filter)) {
                FileInputFormat.addInputPath(job, file.getPath());
            }
        }
    } else {
        FileInputFormat.addInputPath(job, new Path(input));
    }
}

From source file:be.ugent.intec.halvade.uploader.HalvadeUploader.java

License:Open Source License

private int processFiles() throws IOException, InterruptedException, URISyntaxException, Throwable {
    Timer timer = new Timer();
    timer.start();

    AWSUploader upl = null;
    FileSystem fs = null;
    // write to s3?
    boolean useAWS = false;
    if (outputDir.startsWith("s3")) {
        useAWS = true;
        String existingBucketName = outputDir.replace("s3://", "").split("/")[0];
        outputDir = outputDir.replace("s3://" + existingBucketName + "/", "");
        upl = new AWSUploader(existingBucketName, SSE, profile);
    } else {
        Configuration conf = getConf();
        fs = FileSystem.get(new URI(outputDir), conf);
        Path outpath = new Path(outputDir);
        if (fs.exists(outpath) && !fs.getFileStatus(outpath).isDirectory()) {
            Logger.DEBUG("please provide an output directory");
            return 1;
        }
    }

    FileReaderFactory factory = FileReaderFactory.getInstance(mthreads);
    if (manifest != null) {
        Logger.DEBUG("reading input files from " + manifest);
        // read from file
        BufferedReader br = new BufferedReader(new FileReader(manifest));
        String line;
        while ((line = br.readLine()) != null) {
            String[] files = line.split("\t");
            if (files.length == 2) {
                factory.addReader(files[0], files[1], false);
            } else if (files.length == 1) {
                factory.addReader(files[0], null, isInterleaved);
            }
        }
        br.close();
    } else if (file1 != null && file2 != null) {
        Logger.DEBUG("Paired-end read input in 2 files.");
        factory.addReader(file1, file2, false);
    } else if (file1 != null) {
        if (isInterleaved)
            Logger.DEBUG("Paired-end read input in 1 file.");
        else
            Logger.DEBUG("Single-end read input in 1 file.");
        factory.addReader(file1, null, isInterleaved);
    } else {
        Logger.DEBUG("Incorrect input, use either a manifest file or give both file1 and file2 as input.");
    }

    // start reading
    (new Thread(factory)).start();

    int bestThreads = mthreads;
    long maxFileSize = getBestFileSize();
    if (useAWS) {
        AWSInterleaveFiles[] fileThreads = new AWSInterleaveFiles[bestThreads];
        // start interleaveFile threads
        for (int t = 0; t < bestThreads; t++) {
            fileThreads[t] = new AWSInterleaveFiles(outputDir + "halvade_" + t + "_", maxFileSize, upl, t,
                    codec);
            fileThreads[t].start();
        }
        for (int t = 0; t < bestThreads; t++)
            fileThreads[t].join();
        if (upl != null)
            upl.shutDownNow();
    } else {

        HDFSInterleaveFiles[] fileThreads = new HDFSInterleaveFiles[bestThreads];
        // start interleaveFile threads
        for (int t = 0; t < bestThreads; t++) {
            fileThreads[t] = new HDFSInterleaveFiles(outputDir + "halvade_" + t + "_", maxFileSize, fs, t,
                    codec);
            fileThreads[t].start();
        }
        for (int t = 0; t < bestThreads; t++)
            fileThreads[t].join();
    }
    factory.finalize();
    timer.stop();
    Logger.DEBUG("Time to process data: " + timer.getFormattedCurrentTime());
    return 0;
}

From source file:be.ugent.intec.halvade.utils.HalvadeFileUtils.java

License:Open Source License

protected static boolean checkCorrectSize(String onHDFS, String onScratch, FileSystem fs) throws IOException {
    File f = new File(onScratch);
    return fs.getFileStatus(new Path(onHDFS)).getLen() == f.length();
}

From source file:be.ugent.intec.halvade.utils.HalvadeFileUtils.java

License:Open Source License

/**
 * @return 0 if successful, -1 if the file size is incorrect, and -2 if an exception occurred
 */
protected static int privateDownloadFileFromHDFS(TaskInputOutputContext context, FileSystem fs, String from,
        String to) {
    try {
        // check if file is present on local scratch
        File f = new File(to);
        if (!f.exists()) {
            Logger.DEBUG("attempting download of \"" + to + "\"");
            fs.copyToLocalFile(new Path(from), new Path(to));
            context.getCounter(HalvadeCounters.FIN_FROM_HDFS)
                    .increment(fs.getFileStatus(new Path(from)).getLen());
        } else {
            // check if filesize is correct
            if (fs.getFileStatus(new Path(from)).getLen() != f.length()) {
                // incorrect filesize, remove and download again
                Logger.DEBUG("incorrect filesize: " + f.length() + " =/= "
                        + fs.getFileStatus(new Path(from)).getLen());
                f.delete();
                fs.copyToLocalFile(new Path(from), new Path(to));
                context.getCounter(HalvadeCounters.FIN_FROM_HDFS)
                        .increment(fs.getFileStatus(new Path(from)).getLen());

            } else {
                Logger.DEBUG("file \"" + to + "\" exists");
            }
        }
        if (fs.getFileStatus(new Path(from)).getLen() != f.length())
            return -1;
        else
            return 0;
    } catch (IOException ex) {
        Logger.DEBUG("failed to download " + from + " from HDFS: " + ex.getLocalizedMessage());
        Logger.EXCEPTION(ex);
        return -2;
    }
}

From source file:be.ugent.intec.halvade.utils.HalvadeFileUtils.java

License:Open Source License

/**
 * @return 0 if successful, -1 if the file size is incorrect, and -2 if an exception occurred
 */
protected static int privateUploadFileToHDFS(TaskInputOutputContext context, FileSystem fs, String from,
        String to) {
    try {
        // check if file is present on HDFS
        Path toPath = new Path(to);
        Path fromPath = new Path(from);
        File f = new File(from);
        if (!fs.exists(toPath)) {
            fs.copyFromLocalFile(fromPath, toPath);
            context.getCounter(HalvadeCounters.FOUT_TO_HDFS).increment(f.length());
        } else {
            // check if filesize is correct
            if (fs.getFileStatus(toPath).getLen() != f.length()) {
                // incorrect file size, remove and upload again
                fs.delete(toPath, false);
                fs.copyFromLocalFile(fromPath, toPath);
                context.getCounter(HalvadeCounters.FOUT_TO_HDFS).increment(f.length());
            }
        }
        if (fs.getFileStatus(toPath).getLen() != f.length())
            return -1;
        else
            return 0;
    } catch (IOException ex) {
        Logger.DEBUG("failed to upload " + from + " to HDFS: " + ex.getLocalizedMessage());
        Logger.EXCEPTION(ex);
        return -2;
    }
}

From source file:bigfat.hadoop.HDFSDirInputStream.java

License:Apache License

/**
 * Creates an input stream that reads through all the files in one
 * directory. Note that the files are sorted by name, using the
 * comparator.
 *
 * @param fs   the filesystem containing the directory
 * @param dir  the directory (or single file) to read
 * @param comp the comparator used to sort the file names, or null for natural ordering
 * @throws IOException
 */
public HDFSDirInputStream(FileSystem fs, String dir, Comparator<String> comp) throws IOException {
    this.fs = fs;
    Path p = new Path(dir);
    FileStatus fstate = fs.getFileStatus(p);
    if (fstate.isDir()) {
        FileStatus[] child = fs.globStatus(new Path(dir + "/*"));
        LinkedList<String> s = new LinkedList<String>();
        Map<String, Path> map = new HashMap<String, Path>();
        for (FileStatus c : child) {
            if (c.isDir())
                continue;
            map.put(c.getPath().getName(), c.getPath());
            s.add(c.getPath().getName());
        }
        if (comp != null)
            Collections.sort(s, comp);
        else
            Collections.sort(s);
        Iterator<String> it = s.iterator();
        while (it.hasNext()) {
            String n = it.next();
            Path pr = map.get(n);
            this.appendFile(pr.toString());
        }
    } else {
        this.appendFile(dir);
    }
}
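
A hypothetical usage sketch (an assumption, not from the source: HDFSDirInputStream extends java.io.InputStream, and appendFile queues each file for sequential reading):

FileSystem fs = FileSystem.get(new Configuration());
// passing null for the comparator falls back to natural String ordering, per the branch above
HDFSDirInputStream in = new HDFSDirInputStream(fs, "/data/parts", null);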

From source file:cascading.flow.hadoop.util.HadoopUtil.java

License:Open Source License

/**
 * Copies paths from one local path to a remote path. If syncTimes is true, the remote modification time is
 * changed to match the local 'from' path.
 * <p/>
 * Returns a map of file-name to remote modification times if the remote time is different than the local time.
 *
 * @param config      the current configuration
 * @param commonPaths a map of local paths to their remote counterparts
 * @param syncTimes   whether to align remote modification times with the local files
 */
public static Map<String, Long> syncPaths(Configuration config, Map<Path, Path> commonPaths,
        boolean syncTimes) {
    if (commonPaths == null)
        return Collections.emptyMap();

    Map<String, Long> timestampMap = new HashMap<>();

    Map<Path, Path> copyPaths = getCopyPaths(config, commonPaths); // tests remote file existence or if stale

    LocalFileSystem localFS = getLocalFS(config);
    FileSystem remoteFS = getDefaultFS(config);

    for (Map.Entry<Path, Path> entry : copyPaths.entrySet()) {
        Path localPath = entry.getKey();
        Path remotePath = entry.getValue();

        try {
            LOG.info("copying from: {}, to: {}", localPath, remotePath);
            remoteFS.copyFromLocalFile(localPath, remotePath);

            if (!syncTimes) {
                timestampMap.put(remotePath.getName(),
                        remoteFS.getFileStatus(remotePath).getModificationTime());
                continue;
            }
        } catch (IOException exception) {
            throw new FlowException("unable to copy local: " + localPath + " to remote: " + remotePath,
                    exception);
        }

        FileStatus localFileStatus = null;

        try {
            // sync the modified times so we can lazily upload jars to hdfs after job is started
            // otherwise modified time will be local to hdfs
            localFileStatus = localFS.getFileStatus(localPath);
            remoteFS.setTimes(remotePath, localFileStatus.getModificationTime(), -1); // don't set the access time
        } catch (IOException exception) {
            LOG.info(
                    "unable to set local modification time on remote file: {}, 'dfs.namenode.accesstime.precision' may be set to 0 on HDFS.",
                    remotePath);

            if (localFileStatus != null)
                timestampMap.put(remotePath.getName(), localFileStatus.getModificationTime());
        }
    }

    return timestampMap;
}
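
A hedged usage sketch (the paths below are illustrative assumptions; commonPaths maps each local Path to its remote counterpart, as getCopyPaths in the next example confirms):

Configuration conf = new Configuration();
Map<Path, Path> commonPaths = new HashMap<>();
commonPaths.put(new Path("file:///tmp/libs/app.jar"), new Path("/staging/libs/app.jar"));
// copies missing or stale files, then aligns remote modification times with the local ones
Map<String, Long> remoteTimes = HadoopUtil.syncPaths(conf, commonPaths, true);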

From source file:cascading.flow.hadoop.util.HadoopUtil.java

License:Open Source License

private static Map<Path, Path> getCopyPaths(Configuration config, Map<Path, Path> commonPaths) {
    Map<Path, Path> copyPaths = new HashMap<Path, Path>();

    FileSystem remoteFS = getDefaultFS(config);
    FileSystem localFS = getLocalFS(config);

    for (Map.Entry<Path, Path> entry : commonPaths.entrySet()) {
        Path localPath = entry.getKey();
        Path remotePath = entry.getValue();

        try {
            boolean localExists = localFS.exists(localPath);
            boolean remoteExist = remoteFS.exists(remotePath);

            if (localExists && !remoteExist) {
                copyPaths.put(localPath, remotePath);
            } else if (localExists) {
                long localModTime = localFS.getFileStatus(localPath).getModificationTime();
                long remoteModTime = remoteFS.getFileStatus(remotePath).getModificationTime();

                if (localModTime > remoteModTime)
                    copyPaths.put(localPath, remotePath);
            }
        } catch (IOException exception) {
            throw new FlowException("unable to get handle to underlying filesystem", exception);
        }
    }

    return copyPaths;
}

From source file:cascading.flow.tez.util.TezUtil.java

License:Open Source License

public static Map<Path, Path> addToClassPath(Configuration config, String stagingRoot, String resourceSubPath,
        Collection<String> classpath, LocalResourceType resourceType, Map<String, LocalResource> localResources,
        Map<String, String> environment) {
    if (classpath == null)
        return null;

    // map each given classpath entry to a fully qualified local and remote path
    Map<String, Path> localPaths = new HashMap<>();
    Map<String, Path> remotePaths = new HashMap<>();

    HadoopUtil.resolvePaths(config, classpath, stagingRoot, resourceSubPath, localPaths, remotePaths);

    try {
        LocalFileSystem localFS = HadoopUtil.getLocalFS(config);

        for (String fileName : localPaths.keySet()) {
            Path artifact = localPaths.get(fileName);
            Path remotePath = remotePaths.get(fileName);

            if (remotePath == null)
                remotePath = artifact;

            addResource(localResources, environment, fileName, localFS.getFileStatus(artifact), remotePath,
                    resourceType);
        }

        FileSystem defaultFS = HadoopUtil.getDefaultFS(config);

        for (String fileName : remotePaths.keySet()) {
            Path artifact = remotePaths.get(fileName);
            Path localPath = localPaths.get(fileName);

            if (localPath != null)
                continue;

            addResource(localResources, environment, fileName, defaultFS.getFileStatus(artifact), artifact,
                    resourceType);
        }
    } catch (IOException exception) {
        throw new FlowException("unable to set remote resource paths", exception);
    }

    return getCommonPaths(localPaths, remotePaths);
}

From source file:cascading.tap.hadoop.Hadoop18TapUtil.java

License:Open Source License

private static void moveTaskOutputs(JobConf conf, FileSystem fs, Path jobOutputDir, Path taskOutput)
        throws IOException {
    String taskId = conf.get("mapred.task.id");

    if (fs.isFile(taskOutput)) {
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTaskOutputPath(conf));
        if (!fs.rename(taskOutput, finalOutputPath)) {
            if (!fs.delete(finalOutputPath, true)) {
                throw new IOException("Failed to delete earlier output of task: " + taskId);
            }
            if (!fs.rename(taskOutput, finalOutputPath)) {
                throw new IOException("Failed to save output of task: " + taskId);
            }
        }
        LOG.debug("Moved " + taskOutput + " to " + finalOutputPath);
    } else if (fs.getFileStatus(taskOutput).isDir()) {
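        // taskOutput is a directory: recreate it under the final output path and move its children recursively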
        FileStatus[] paths = fs.listStatus(taskOutput);
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTaskOutputPath(conf));
        fs.mkdirs(finalOutputPath);
        if (paths != null) {
            for (FileStatus path : paths) {
                moveTaskOutputs(conf, fs, jobOutputDir, path.getPath());
            }
        }
    }
}