List of usage examples for org.apache.hadoop.fs FileSystem getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
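Before the real-world examples below, a minimal standalone sketch of the call itself. It assumes a default-configured Configuration and a hypothetical path "/tmp/example.txt"; getFileStatus throws FileNotFoundException (a subclass of IOException) when the path does not exist.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // getFileStatus returns the metadata for a single path:
        // length, directory flag, modification time, permissions, ...
        FileStatus status = fs.getFileStatus(new Path("/tmp/example.txt")); // hypothetical path
        System.out.println("length: " + status.getLen());
        System.out.println("isDirectory: " + status.isDirectory());
        System.out.println("modified: " + status.getModificationTime());
    }
}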
From source file:be.ugent.intec.halvade.MapReduceRunner.java
License:Open Source License
protected void addInputFiles(String input, Configuration conf, Job job, String filter)
        throws URISyntaxException, IOException {
    FileSystem fs = FileSystem.get(new URI(input), conf);
    if (fs.getFileStatus(new Path(input)).isDirectory()) {
        // add every file in the directory that matches the filter
        FileStatus[] files = fs.listStatus(new Path(input));
        for (FileStatus file : files) {
            if (!file.isDirectory() && file.getPath().getName().endsWith(filter)) {
                FileInputFormat.addInputPath(job, file.getPath());
            }
        }
    } else {
        FileInputFormat.addInputPath(job, new Path(input));
    }
}
From source file:be.ugent.intec.halvade.uploader.HalvadeUploader.java
License:Open Source License
private int processFiles() throws IOException, InterruptedException, URISyntaxException, Throwable {
    Timer timer = new Timer();
    timer.start();
    AWSUploader upl = null;
    FileSystem fs = null;
    // write to S3?
    boolean useAWS = false;
    if (outputDir.startsWith("s3")) {
        useAWS = true;
        String existingBucketName = outputDir.replace("s3://", "").split("/")[0];
        outputDir = outputDir.replace("s3://" + existingBucketName + "/", "");
        upl = new AWSUploader(existingBucketName, SSE, profile);
    } else {
        Configuration conf = getConf();
        fs = FileSystem.get(new URI(outputDir), conf);
        Path outpath = new Path(outputDir);
        if (fs.exists(outpath) && !fs.getFileStatus(outpath).isDirectory()) {
            Logger.DEBUG("please provide an output directory");
            return 1;
        }
    }
    FileReaderFactory factory = FileReaderFactory.getInstance(mthreads);
    if (manifest != null) {
        Logger.DEBUG("reading input files from " + manifest);
        // read the list of input files from the manifest
        BufferedReader br = new BufferedReader(new FileReader(manifest));
        String line;
        while ((line = br.readLine()) != null) {
            String[] files = line.split("\t");
            if (files.length == 2) {
                factory.addReader(files[0], files[1], false);
            } else if (files.length == 1) {
                factory.addReader(files[0], null, isInterleaved);
            }
        }
    } else if (file1 != null && file2 != null) {
        Logger.DEBUG("Paired-end read input in 2 files.");
        factory.addReader(file1, file2, false);
    } else if (file1 != null) {
        if (isInterleaved)
            Logger.DEBUG("Single-end read input in 1 file.");
        else
            Logger.DEBUG("Paired-end read input in 1 file.");
        factory.addReader(file1, null, isInterleaved);
    } else {
        Logger.DEBUG("Incorrect input, use either a manifest file or give both file1 and file2 as input.");
    }
    // start reading
    (new Thread(factory)).start();
    int bestThreads = mthreads;
    long maxFileSize = getBestFileSize();
    if (useAWS) {
        AWSInterleaveFiles[] fileThreads = new AWSInterleaveFiles[bestThreads];
        // start the interleaving threads
        for (int t = 0; t < bestThreads; t++) {
            fileThreads[t] = new AWSInterleaveFiles(outputDir + "halvade_" + t + "_", maxFileSize, upl, t, codec);
            fileThreads[t].start();
        }
        for (int t = 0; t < bestThreads; t++)
            fileThreads[t].join();
        if (upl != null)
            upl.shutDownNow();
    } else {
        HDFSInterleaveFiles[] fileThreads = new HDFSInterleaveFiles[bestThreads];
        // start the interleaving threads
        for (int t = 0; t < bestThreads; t++) {
            fileThreads[t] = new HDFSInterleaveFiles(outputDir + "halvade_" + t + "_", maxFileSize, fs, t, codec);
            fileThreads[t].start();
        }
        for (int t = 0; t < bestThreads; t++)
            fileThreads[t].join();
    }
    factory.finalize();
    timer.stop();
    Logger.DEBUG("Time to process data: " + timer.getFormattedCurrentTime());
    return 0;
}
From source file:be.ugent.intec.halvade.utils.HalvadeFileUtils.java
License:Open Source License
protected static boolean checkCorrectSize(String onHDFS, String onScratch, FileSystem fs) throws IOException {
    File f = new File(onScratch);
    return fs.getFileStatus(new Path(onHDFS)).getLen() == f.length();
}
From source file:be.ugent.intec.halvade.utils.HalvadeFileUtils.java
License:Open Source License
/**
 * @return 0 if successful, -1 if the file size is incorrect, -2 if an exception occurred
 */
protected static int privateDownloadFileFromHDFS(TaskInputOutputContext context, FileSystem fs, String from,
        String to) {
    try {
        // check if the file is present on local scratch
        File f = new File(to);
        if (!f.exists()) {
            Logger.DEBUG("attempting download of \"" + to + "\"");
            fs.copyToLocalFile(new Path(from), new Path(to));
            context.getCounter(HalvadeCounters.FIN_FROM_HDFS)
                    .increment(fs.getFileStatus(new Path(from)).getLen());
        } else {
            // check if the file size is correct
            if (fs.getFileStatus(new Path(from)).getLen() != f.length()) {
                // incorrect file size, remove and download again
                Logger.DEBUG("incorrect filesize: " + f.length() + " =/= "
                        + fs.getFileStatus(new Path(from)).getLen());
                f.delete();
                fs.copyToLocalFile(new Path(from), new Path(to));
                context.getCounter(HalvadeCounters.FIN_FROM_HDFS)
                        .increment(fs.getFileStatus(new Path(from)).getLen());
            } else {
                Logger.DEBUG("file \"" + to + "\" exists");
            }
        }
        if (fs.getFileStatus(new Path(from)).getLen() != f.length())
            return -1;
        else
            return 0;
    } catch (IOException ex) {
        Logger.DEBUG("failed to download " + from + " from HDFS: " + ex.getLocalizedMessage());
        Logger.EXCEPTION(ex);
        return -2;
    }
}
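Since this helper signals failure through return codes rather than exceptions, the caller has to branch on the result. A minimal sketch of that pattern, with a hypothetical wrapper method and retry budget (the same convention applies to privateUploadFileToHDFS below):

// hypothetical wrapper around the helper above; context and fs come from the Hadoop task
static void downloadWithRetry(TaskInputOutputContext context, FileSystem fs,
        String hdfsPath, String scratchPath) throws IOException {
    int attempts = 3; // hypothetical retry budget
    int result = -2;
    while (attempts-- > 0) {
        result = privateDownloadFileFromHDFS(context, fs, hdfsPath, scratchPath);
        if (result == 0)
            return; // sizes match, download complete
        // -1: size mismatch after the copy, -2: IOException inside the helper; retry either way
    }
    throw new IOException("could not download " + hdfsPath + ", last code: " + result);
}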
From source file:be.ugent.intec.halvade.utils.HalvadeFileUtils.java
License:Open Source License
/**
 * @return 0 if successful, -1 if the file size is incorrect, -2 if an exception occurred
 */
protected static int privateUploadFileToHDFS(TaskInputOutputContext context, FileSystem fs, String from,
        String to) {
    try {
        // check if the file is already present on HDFS
        Path toPath = new Path(to);
        Path fromPath = new Path(from);
        File f = new File(from);
        if (!fs.exists(toPath)) {
            fs.copyFromLocalFile(fromPath, toPath);
            context.getCounter(HalvadeCounters.FOUT_TO_HDFS).increment(f.length());
        } else {
            // check if the file size is correct
            if (fs.getFileStatus(toPath).getLen() != f.length()) {
                // incorrect file size, remove and upload again
                fs.delete(toPath, false);
                fs.copyFromLocalFile(fromPath, toPath);
                context.getCounter(HalvadeCounters.FOUT_TO_HDFS).increment(f.length());
            }
        }
        if (fs.getFileStatus(toPath).getLen() != f.length())
            return -1;
        else
            return 0;
    } catch (IOException ex) {
        Logger.DEBUG("failed to upload " + from + " to HDFS: " + ex.getLocalizedMessage());
        Logger.EXCEPTION(ex);
        return -2;
    }
}
From source file:bigfat.hadoop.HDFSDirInputStream.java
License:Apache License
/**
 * Creates an input stream that reads through all the files in one directory.
 * Note that the files are sorted by name, using the given comparator.
 *
 * @param fs
 * @param dir
 * @param comp
 * @throws IOException
 */
public HDFSDirInputStream(FileSystem fs, String dir, Comparator<String> comp) throws IOException {
    this.fs = fs;
    Path p = new Path(dir);
    FileStatus fstate = fs.getFileStatus(p);
    if (fstate.isDir()) {
        FileStatus[] child = fs.globStatus(new Path(dir + "/*"));
        LinkedList<String> s = new LinkedList<String>();
        Map<String, Path> map = new HashMap<String, Path>();
        for (FileStatus c : child) {
            if (c.isDir())
                continue;
            map.put(c.getPath().getName(), c.getPath());
            s.add(c.getPath().getName());
        }
        if (comp != null)
            Collections.sort(s, comp);
        else
            Collections.sort(s);
        Iterator<String> it = s.iterator();
        while (it.hasNext()) {
            String n = it.next();
            Path pr = map.get(n);
            this.appendFile(pr.toString());
        }
    } else {
        this.appendFile(dir);
    }
}
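A short usage sketch, assuming HDFSDirInputStream extends java.io.InputStream (the appendFile calls above suggest it concatenates the per-file streams); the directory path is hypothetical:

FileSystem fs = FileSystem.get(new Configuration());
// a null comparator falls back to natural (lexicographic) name order
try (BufferedReader reader = new BufferedReader(
        new InputStreamReader(new HDFSDirInputStream(fs, "/data/parts", null)))) { // hypothetical directory
    String line;
    while ((line = reader.readLine()) != null)
        System.out.println(line);
}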
From source file:cascading.flow.hadoop.util.HadoopUtil.java
License:Open Source License
/**
 * Copies paths from one local path to a remote path. If syncTimes is true, both modification and access time are
 * changed to match the local 'from' path.
 * <p/>
 * Returns a map of file-name to remote modification times if the remote time is different than the local time.
 *
 * @param config
 * @param commonPaths
 * @param syncTimes
 */
public static Map<String, Long> syncPaths(Configuration config, Map<Path, Path> commonPaths, boolean syncTimes) {
    if (commonPaths == null)
        return Collections.emptyMap();
    Map<String, Long> timestampMap = new HashMap<>();
    Map<Path, Path> copyPaths = getCopyPaths(config, commonPaths); // tests remote file existence or if stale
    LocalFileSystem localFS = getLocalFS(config);
    FileSystem remoteFS = getDefaultFS(config);
    for (Map.Entry<Path, Path> entry : copyPaths.entrySet()) {
        Path localPath = entry.getKey();
        Path remotePath = entry.getValue();
        try {
            LOG.info("copying from: {}, to: {}", localPath, remotePath);
            remoteFS.copyFromLocalFile(localPath, remotePath);
            if (!syncTimes) {
                timestampMap.put(remotePath.getName(), remoteFS.getFileStatus(remotePath).getModificationTime());
                continue;
            }
        } catch (IOException exception) {
            throw new FlowException("unable to copy local: " + localPath + " to remote: " + remotePath,
                    exception);
        }
        FileStatus localFileStatus = null;
        try {
            // sync the modified times so we can lazily upload jars to hdfs after job is started
            // otherwise modified time will be local to hdfs
            localFileStatus = localFS.getFileStatus(localPath);
            remoteFS.setTimes(remotePath, localFileStatus.getModificationTime(), -1); // don't set the access time
        } catch (IOException exception) {
            LOG.info(
                    "unable to set local modification time on remote file: {}, 'dfs.namenode.accesstime.precision' may be set to 0 on HDFS.",
                    remotePath);
            if (localFileStatus != null)
                timestampMap.put(remotePath.getName(), localFileStatus.getModificationTime());
        }
    }
    return timestampMap;
}
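A brief sketch of consuming the returned map, with hypothetical staging paths; per the javadoc above, an entry is only present when the remote modification time could not be made to match the local one, so those values should be kept for later staleness comparisons instead of trusting local times:

// hypothetical staging of a single jar; 'config' is the job Configuration
Map<Path, Path> commonPaths = new HashMap<>();
commonPaths.put(new Path("file:///tmp/app.jar"), new Path("hdfs:///staging/app.jar"));
Map<String, Long> remoteTimes = HadoopUtil.syncPaths(config, commonPaths, true);
for (Map.Entry<String, Long> entry : remoteTimes.entrySet())
    System.out.println(entry.getKey() + " remote mtime: " + entry.getValue());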
From source file:cascading.flow.hadoop.util.HadoopUtil.java
License:Open Source License
private static Map<Path, Path> getCopyPaths(Configuration config, Map<Path, Path> commonPaths) {
    Map<Path, Path> copyPaths = new HashMap<Path, Path>();
    FileSystem remoteFS = getDefaultFS(config);
    FileSystem localFS = getLocalFS(config);
    for (Map.Entry<Path, Path> entry : commonPaths.entrySet()) {
        Path localPath = entry.getKey();
        Path remotePath = entry.getValue();
        try {
            boolean localExists = localFS.exists(localPath);
            boolean remoteExists = remoteFS.exists(remotePath);
            if (localExists && !remoteExists) {
                copyPaths.put(localPath, remotePath);
            } else if (localExists) {
                // both exist: copy only when the local file is newer than the remote one
                long localModTime = localFS.getFileStatus(localPath).getModificationTime();
                long remoteModTime = remoteFS.getFileStatus(remotePath).getModificationTime();
                if (localModTime > remoteModTime)
                    copyPaths.put(localPath, remotePath);
            }
        } catch (IOException exception) {
            throw new FlowException("unable to get handle to underlying filesystem", exception);
        }
    }
    return copyPaths;
}
From source file:cascading.flow.tez.util.TezUtil.java
License:Open Source License
public static Map<Path, Path> addToClassPath(Configuration config, String stagingRoot, String resourceSubPath,
        Collection<String> classpath, LocalResourceType resourceType, Map<String, LocalResource> localResources,
        Map<String, String> environment) {
    if (classpath == null)
        return null;
    // given to fully qualified
    Map<String, Path> localPaths = new HashMap<>();
    Map<String, Path> remotePaths = new HashMap<>();
    HadoopUtil.resolvePaths(config, classpath, stagingRoot, resourceSubPath, localPaths, remotePaths);
    try {
        LocalFileSystem localFS = HadoopUtil.getLocalFS(config);
        for (String fileName : localPaths.keySet()) {
            Path artifact = localPaths.get(fileName);
            Path remotePath = remotePaths.get(fileName);
            if (remotePath == null)
                remotePath = artifact;
            addResource(localResources, environment, fileName, localFS.getFileStatus(artifact), remotePath,
                    resourceType);
        }
        FileSystem defaultFS = HadoopUtil.getDefaultFS(config);
        for (String fileName : remotePaths.keySet()) {
            Path artifact = remotePaths.get(fileName);
            Path localPath = localPaths.get(fileName);
            if (localPath != null)
                continue;
            addResource(localResources, environment, fileName, defaultFS.getFileStatus(artifact), artifact,
                    resourceType);
        }
    } catch (IOException exception) {
        throw new FlowException("unable to set remote resource paths", exception);
    }
    return getCommonPaths(localPaths, remotePaths);
}
From source file:cascading.tap.hadoop.Hadoop18TapUtil.java
License:Open Source License
private static void moveTaskOutputs(JobConf conf, FileSystem fs, Path jobOutputDir, Path taskOutput)
        throws IOException {
    String taskId = conf.get("mapred.task.id");
    if (fs.isFile(taskOutput)) {
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTaskOutputPath(conf));
        if (!fs.rename(taskOutput, finalOutputPath)) {
            // rename failed, likely because an earlier output already exists: delete it and retry
            if (!fs.delete(finalOutputPath, true)) {
                throw new IOException("Failed to delete earlier output of task: " + taskId);
            }
            if (!fs.rename(taskOutput, finalOutputPath)) {
                throw new IOException("Failed to save output of task: " + taskId);
            }
        }
        LOG.debug("Moved " + taskOutput + " to " + finalOutputPath);
    } else if (fs.getFileStatus(taskOutput).isDir()) {
        FileStatus[] paths = fs.listStatus(taskOutput);
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTaskOutputPath(conf));
        fs.mkdirs(finalOutputPath);
        if (paths != null) {
            for (FileStatus path : paths) {
                moveTaskOutputs(conf, fs, jobOutputDir, path.getPath());
            }
        }
    }
}