List of usage examples for org.apache.hadoop.fs.FileSystem.getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
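getFileStatus returns a single FileStatus describing the file or directory at the given path and throws FileNotFoundException (an IOException subclass) if nothing exists there. Before the project-specific examples below, here is a minimal sketch of the typical call pattern, assuming a default Configuration and a hypothetical path (not taken from any of the examples):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path("/tmp/example.txt");      // hypothetical path
        FileStatus status = fs.getFileStatus(p);    // throws FileNotFoundException if p does not exist
        System.out.println("length=" + status.getLen()
                + " modified=" + status.getModificationTime()
                + " isDir=" + status.isDirectory()); // isDir() on older Hadoop 1.x releases
    }
}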
From source file:eu.edisonproject.training.tfidf.mapreduce.TermWordFrequency.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration jobconf = getConf();
    FileSystem fs = FileSystem.get(jobconf);
    fs.delete(new Path(args[1]), true);
    Path in = new Path(args[0]);
    Path inHdfs = in;

    if (!jobconf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        inHdfs = new Path(in.getName());
        fs.delete(inHdfs, true);
        fs.copyFromLocalFile(in, inHdfs);
        fs.deleteOnExit(inHdfs);
        FileStatus inHdfsStatus = fs.getFileStatus(inHdfs);
        // Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Copied: {0} to: {1}",
        //         new Object[]{in.toUri(), inHdfsStatus.getPath().toUri()});
    }

    Job job = Job.getInstance(jobconf);

    Path stopwordsLocal = new Path(args[3]);
    stopwords = new Path(stopwordsLocal.getName());
    fs.delete(stopwords, true);
    fs.copyFromLocalFile(stopwordsLocal, stopwords);
    fs.deleteOnExit(stopwords);
    FileStatus stopwordsStatus = fs.getFileStatus(stopwords);
    stopwords = stopwordsStatus.getPath();
    job.addCacheFile(stopwords.toUri());

    Path localDocs = new Path(args[2]);
    Path hdfsDocs = new Path(localDocs.getName());
    fs.mkdirs(hdfsDocs);
    hdfsDocs = fs.getFileStatus(hdfsDocs).getPath();
    fs.delete(hdfsDocs, true);

    // FileStatus[] stats = fs.listStatus(localDocs);
    File[] stats = new File(localDocs.toString()).listFiles();
    for (File stat : stats) {
        // for (FileStatus stat : stats) {
        Path filePath = new Path(stat.getAbsolutePath());
        if (FilenameUtils.getExtension(filePath.getName()).endsWith("txt")) {
            Path dest = new Path(hdfsDocs.toUri() + "/" + filePath.getName());
            fs.copyFromLocalFile(filePath, dest);
        }
    }
    job.addCacheFile(hdfsDocs.toUri());

    job.setJarByClass(TermWordFrequency.class);
    job.setJobName("Word Frequency Term Driver");

    FileInputFormat.setInputPaths(job, inHdfs);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // job.setInputFormatClass(TextInputFormat.class);
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, inHdfs);
    NLineInputFormat.setNumLinesPerSplit(job, Integer.valueOf(args[4]));
    NLineInputFormat.setMaxInputSplitSize(job, 500);
    Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Num. of lines: {0}",
            NLineInputFormat.getNumLinesPerSplit(job));

    job.setMapperClass(TermWordFrequencyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(TermWordFrequencyReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:eu.edisonproject.training.tfidf.mapreduce.TFIDFTermsDriver.java
License:Apache License
@Override
public void executeTFIDF(String inputPath) {
    try {
        String[] args1 = { inputPath, OUTPUT_PATH1, TEXT_FILES_DIR_PATH, STOPWORDS_PATH, NUM_OF_LINES };
        ToolRunner.run(new TermWordFrequency(), args1);
        String[] args2 = { INPUT_PATH2, OUTPUT_PATH2 };
        ToolRunner.run(new WordCountsForDocsDriver(), args2);

        File docs = new File(TEXT_FILES_DIR_PATH);
        File[] files = docs.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".txt");
            }
        });
        String[] args3 = { INPUT_PATH3, OUTPUT_PATH3, String.valueOf(files.length) };
        ToolRunner.run(new WordsInCorpusTFIDFDriver(), args3);
        String[] args4 = { INPUT_PATH4, OUTPUT_PATH4 };
        ToolRunner.run(new WordsGroupByTitleDriver(), args4);

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path hdfsRes = new Path(OUTPUT_PATH4 + File.separator + "part-r-00000");
        hdfsRes = fs.getFileStatus(hdfsRes).getPath();
        readTFIDFResult(fs, hdfsRes);

        List<Double> sum = computeSum(transactionValues);
        for (int i = 0; i < sum.size(); i++) {
            wordTfidf.put(allWords.get(i), sum.get(i));
        }
        computeMean();
        // Resize the hashmap wordTfidf
        wordTfidf = resizeVector(wordTfidf);
        writeResizedOutputIntoCSV(OUT, wordTfidf);
    } catch (Exception ex) {
        Logger.getLogger(TFIDFTermsDriver.class.getName()).log(Level.SEVERE, "TFIDF fail", ex);
    }
}
From source file:eu.scape_project.archiventory.hadoop.ArcRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit is, TaskAttemptContext tac) throws IOException, InterruptedException {
    // throw new UnsupportedOperationException("Unused.");
    FileSplit fileSplit = (FileSplit) is;
    try {
        Path path = fileSplit.getPath();
        FileSystem fileSystem = path.getFileSystem(tac.getConfiguration());
        FSDataInputStream fileInputStream = fileSystem.open(path);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        fileLength = fileStatus.getLen();
        ArchiveReader reader = ArchiveReaderFactory.get(path.getName(), fileInputStream, true);
        recordIterator = reader.iterator();
        currentKey = new Text();
        currentArcRecord = new ArcRecord();
    } catch (IOException ex) {
        Logger.getLogger(ArcRecordReader.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:eu.scape_project.arcunpacker.mapreduce.ArcRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit is, TaskAttemptContext tac) throws IOException, InterruptedException {
    // throw new UnsupportedOperationException("Unused.");
    FileSplit fileSplit = (FileSplit) is;
    try {
        Path path = fileSplit.getPath();
        FileSystem fileSystem = path.getFileSystem(tac.getConfiguration());
        FSDataInputStream fileInputStream = fileSystem.open(path);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        long fileLength = fileStatus.getLen();
        archiveReaderDelegate = new HeritrixWrapper(path.getName(), fileInputStream, fileLength);
        key = new Text();
        value = new HadoopArcRecord();
    } catch (IOException ex) {
        Logger.getLogger(ArcRecordReader.class.getName()).log(Level.SEVERE, null, ex);
        throw new IOException(ex);
    }
}
From source file:eu.scape_project.pt.mapred.input.ControlFileInputFormat.java
License:Apache License
/**
 * Returns the hosts that hold blocks of the given input files, sorted in
 * descending order by how many of those blocks each host stores.
 *
 * @param fs Hadoop filesystem handle
 * @param inFiles array of input files
 * @return host names sorted by block count, most blocks first
 */
public static String[] getSortedHosts(FileSystem fs, Path[] inFiles) throws IOException {
    final Map<String, Integer> hostMap = new HashMap<String, Integer>();
    for (Path inFile : inFiles) {
        FileStatus s = fs.getFileStatus(inFile);
        BlockLocation[] locations = fs.getFileBlockLocations(s, 0, s.getLen());
        for (BlockLocation location : locations) {
            String[] hosts = location.getHosts();
            for (String host : hosts) {
                if (!hostMap.containsKey(host)) {
                    hostMap.put(host, 1);
                    continue;
                }
                hostMap.put(host, hostMap.get(host) + 1);
            }
        }
    }
    // sort hosts by number of references to blocks of input files
    List<String> hosts = new ArrayList<String>();
    hosts.addAll(hostMap.keySet());
    Collections.sort(hosts, new Comparator<String>() {
        @Override
        public int compare(String host1, String host2) {
            return hostMap.get(host2) - hostMap.get(host1);
        }
    });
    return hosts.toArray(new String[0]);
}
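A possible call site for this helper, sketched under the assumption that a Configuration named conf is already in scope; the input paths are hypothetical and only getSortedHosts itself comes from the source above:

FileSystem fs = FileSystem.get(conf);                                        // conf: existing Configuration (assumption)
Path[] inputs = { new Path("/data/in/a.dat"), new Path("/data/in/b.dat") };  // hypothetical input paths
String[] hosts = ControlFileInputFormat.getSortedHosts(fs, inputs);
// hosts[0] is the node storing the most blocks of the input files,
// e.g. a candidate for a split's preferred location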
From source file:eu.stratosphere.hadoopcompatibility.FileOutputCommitterWrapper.java
License:Apache License
private void moveTaskOutputs(JobConf conf, TaskAttemptID taskAttemptID, FileSystem fs, Path jobOutputDir,
        Path taskOutput) throws IOException {
    if (fs.isFile(taskOutput)) {
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput,
                getTempTaskOutputPath(conf, taskAttemptID));
        if (!fs.rename(taskOutput, finalOutputPath)) {
            if (!fs.delete(finalOutputPath, true)) {
                throw new IOException("Failed to delete earlier output of task: " + taskAttemptID);
            }
            if (!fs.rename(taskOutput, finalOutputPath)) {
                throw new IOException("Failed to save output of task: " + taskAttemptID);
            }
        }
        LOG.debug("Moved " + taskOutput + " to " + finalOutputPath);
    } else if (fs.getFileStatus(taskOutput).isDir()) {
        FileStatus[] paths = fs.listStatus(taskOutput);
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput,
                getTempTaskOutputPath(conf, taskAttemptID));
        fs.mkdirs(finalOutputPath);
        if (paths != null) {
            for (FileStatus path : paths) {
                moveTaskOutputs(conf, taskAttemptID, fs, jobOutputDir, path.getPath());
            }
        }
    }
}
From source file:eu.stratosphere.hadoopcompatibility.mapreduce.HadoopOutputFormat.java
License:Apache License
/**
 * Commit the task by moving the output file out from the temporary directory.
 * @throws IOException
 */
@Override
public void close() throws IOException {
    try {
        this.recordWriter.close(this.context);
    } catch (InterruptedException e) {
        throw new IOException("Could not close RecordReader.", e);
    }
    if (this.fileOutputCommitter.needsTaskCommit(this.context)) {
        this.fileOutputCommitter.commitTask(this.context);
    }
    this.fileOutputCommitter.commitJob(this.context);

    // rename tmp-* files to final name
    FileSystem fs = FileSystem.get(this.configuration);
    Path outputPath = new Path(this.configuration.get("mapred.output.dir"));

    final Pattern p = Pattern.compile("tmp-(.)-([0-9]+)");

    // isDirectory does not work in hadoop 1
    if (fs.getFileStatus(outputPath).isDir()) {
        FileStatus[] files = fs.listStatus(outputPath);
        for (FileStatus f : files) {
            Matcher m = p.matcher(f.getPath().getName());
            if (m.matches()) {
                int part = Integer.valueOf(m.group(2));
                fs.rename(f.getPath(), new Path(outputPath.toString() + "/" + part));
            }
        }
    }
}
From source file:eu.stratosphere.yarn.Utils.java
License:Apache License
public static void registerLocalResource(FileSystem fs, Path remoteRsrcPath, LocalResource localResource)
        throws IOException {
    FileStatus jarStat = fs.getFileStatus(remoteRsrcPath);
    localResource.setResource(ConverterUtils.getYarnUrlFromURI(remoteRsrcPath.toUri()));
    localResource.setSize(jarStat.getLen());
    localResource.setTimestamp(jarStat.getModificationTime());
    localResource.setType(LocalResourceType.FILE);
    localResource.setVisibility(LocalResourceVisibility.PUBLIC);
}
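A hedged sketch of how a helper like this might be wired into a YARN container launch; the resource name, the path variable, and the surrounding setup are assumptions for illustration, not part of the source above:

// assumes fs (FileSystem) and remoteJarPath (Path to an already-uploaded jar) are set up elsewhere
LocalResource appJar = Records.newRecord(LocalResource.class);     // org.apache.hadoop.yarn.util.Records
Utils.registerLocalResource(fs, remoteJarPath, appJar);

Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
localResources.put("app.jar", appJar);                             // hypothetical resource name
// later: containerLaunchContext.setLocalResources(localResources);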
From source file:fi.tkk.ics.hadoop.bam.BAMRecordReader.java
License:Open Source License
@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    // This method should only be called once (see Hadoop API). However,
    // there seems to be disagreement between implementations that call
    // initialize() and Hadoop-BAM's own code that relies on
    // {@link BAMInputFormat} to call initialize() when the reader is
    // created. Therefore we add this check for the time being.
    if (isInitialized)
        close();
    isInitialized = true;

    final Configuration conf = ContextUtil.getConfiguration(ctx);

    final FileVirtualSplit split = (FileVirtualSplit) spl;
    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(conf);

    this.stringency = SAMHeaderReader.getValidationStringency(conf);

    final FSDataInputStream in = fs.open(file);
    codec = new BAMRecordCodec(SAMHeaderReader.readSAMHeaderFrom(in, conf));
    in.seek(0);
    bci = new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

    final long virtualStart = split.getStartVirtualOffset();

    fileStart = virtualStart >>> 16;
    virtualEnd = split.getEndVirtualOffset();

    bci.seek(virtualStart);
    codec.setInputStream(bci);

    if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
        final long recordStart = virtualStart & 0xffff;
        System.err.println("XXX inizialized BAMRecordReader byte offset: " + fileStart
                + " record offset: " + recordStart);
    }
}
From source file:fi.tkk.ics.hadoop.bam.BCFRecordReader.java
License:Open Source License
@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    isBGZF = spl instanceof FileVirtualSplit;
    if (isBGZF) {
        final FileVirtualSplit split = (FileVirtualSplit) spl;

        final Path file = split.getPath();
        final FileSystem fs = file.getFileSystem(ContextUtil.getConfiguration(ctx));

        final FSDataInputStream inFile = fs.open(file);
        bci = new BlockCompressedInputStream(inFile);
        in = new PositionalBufferedStream(bci);
        initContigDict();

        inFile.seek(0);
        bci = new BlockCompressedInputStream(
                new WrapSeekable<FSDataInputStream>(inFile, fs.getFileStatus(file).getLen(), file));

        final long virtualStart = split.getStartVirtualOffset(), virtualEnd = split.getEndVirtualOffset();

        this.fileStart = virtualStart >>> 16;
        this.length = (virtualEnd >>> 16) - fileStart;

        bci.seek(virtualStart);

        // Since PositionalBufferedStream does its own buffering, we have to
        // prevent it from going too far by using a BGZFLimitingStream. It
        // also allows nextKeyValue() to simply check for EOF instead of
        // looking at virtualEnd.
        in = new PositionalBufferedStream(new BGZFLimitingStream(bci, virtualEnd));
    } else {
        final FileSplit split = (FileSplit) spl;

        this.fileStart = split.getStart();
        this.length = split.getLength();

        final Path file = split.getPath();

        in = new PositionalBufferedStream(file.getFileSystem(ContextUtil.getConfiguration(ctx)).open(file));
        initContigDict();

        in.skip(fileStart - in.getPosition());
    }
}