Example usage for org.apache.hadoop.fs FileSystem getFileStatus

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem getFileStatus.

Prototype

public abstract FileStatus getFileStatus(Path f) throws IOException;

Document

Return a file status object that represents the path.
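
For orientation, here is a minimal, self-contained sketch of calling getFileStatus against the default file system. It is not taken from any of the source files on this page; the fallback path and the printed fields are illustrative, and the call assumes the path exists (otherwise getFileStatus throws a FileNotFoundException).

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Resolve the default file system (local or HDFS, depending on fs.defaultFS).
        FileSystem fs = FileSystem.get(conf);

        // Placeholder path; pass a real file or directory as the first argument.
        Path path = new Path(args.length > 0 ? args[0] : "/tmp/example.txt");

        // getFileStatus returns the metadata for the path in a single call.
        FileStatus status = fs.getFileStatus(path);
        System.out.println("path:   " + status.getPath());
        System.out.println("length: " + status.getLen());
        System.out.println("isDir:  " + status.isDirectory());
        System.out.println("mtime:  " + status.getModificationTime());
    }
}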

Usage

From source file: eu.edisonproject.training.tfidf.mapreduce.TermWordFrequency.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration jobconf = getConf();

    FileSystem fs = FileSystem.get(jobconf);
    fs.delete(new Path(args[1]), true);
    Path in = new Path(args[0]);
    Path inHdfs = in;
    if (!jobconf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        inHdfs = new Path(in.getName());
        fs.delete(inHdfs, true);
        fs.copyFromLocalFile(in, inHdfs);
        fs.deleteOnExit(inHdfs);
        FileStatus inHdfsStatus = fs.getFileStatus(inHdfs);
        //            Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Copied: {0} to: {1}", new Object[]{in.toUri(), inHdfsStatus.getPath().toUri()});
    }

    Job job = Job.getInstance(jobconf);
    Path stopwordsLocal = new Path(args[3]);
    stopwords = new Path(stopwordsLocal.getName());
    fs.delete(stopwords, true);
    fs.copyFromLocalFile(stopwordsLocal, stopwords);
    fs.deleteOnExit(stopwords);

    FileStatus stopwordsStatus = fs.getFileStatus(stopwords);
    stopwords = stopwordsStatus.getPath();
    job.addCacheFile(stopwords.toUri());

    Path localDocs = new Path(args[2]);
    Path hdfsDocs = new Path(localDocs.getName());
    fs.mkdirs(hdfsDocs);
    hdfsDocs = fs.getFileStatus(hdfsDocs).getPath();
    fs.delete(hdfsDocs, true);
    //        FileStatus[] stats = fs.listStatus(localDocs);
    File[] stats = new File(localDocs.toString()).listFiles();

    for (File stat : stats) {
        //        for (FileStatus stat : stats) {
        Path filePath = new Path(stat.getAbsolutePath());
        if (FilenameUtils.getExtension(filePath.getName()).endsWith("txt")) {
            Path dest = new Path(hdfsDocs.toUri() + "/" + filePath.getName());
            fs.copyFromLocalFile(filePath, dest);
        }
    }

    job.addCacheFile(hdfsDocs.toUri());

    job.setJarByClass(TermWordFrequency.class);
    job.setJobName("Word Frequency Term Driver");

    FileInputFormat.setInputPaths(job, inHdfs);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    //         job.setInputFormatClass(TextInputFormat.class);
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, inHdfs);
    NLineInputFormat.setNumLinesPerSplit(job, Integer.valueOf(args[4]));
    NLineInputFormat.setMaxInputSplitSize(job, 500);
    Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Num. of lines: {0}",
            NLineInputFormat.getNumLinesPerSplit(job));

    job.setMapperClass(TermWordFrequencyMapper.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(TermWordFrequencyReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);

}

From source file: eu.edisonproject.training.tfidf.mapreduce.TFIDFTermsDriver.java

License: Apache License

@Override
public void executeTFIDF(String inputPath) {
    try {

        String[] args1 = { inputPath, OUTPUT_PATH1, TEXT_FILES_DIR_PATH, STOPWORDS_PATH, NUM_OF_LINES };
        ToolRunner.run(new TermWordFrequency(), args1);
        String[] args2 = { INPUT_PATH2, OUTPUT_PATH2 };
        ToolRunner.run(new WordCountsForDocsDriver(), args2);

        File docs = new File(TEXT_FILES_DIR_PATH);
        File[] files = docs.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".txt");
            }
        });

        String[] args3 = { INPUT_PATH3, OUTPUT_PATH3, String.valueOf(files.length) };
        ToolRunner.run(new WordsInCorpusTFIDFDriver(), args3);

        String[] args4 = { INPUT_PATH4, OUTPUT_PATH4 };
        ToolRunner.run(new WordsGroupByTitleDriver(), args4);

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path hdfsRes = new Path(OUTPUT_PATH4 + File.separator + "part-r-00000");
        hdfsRes = fs.getFileStatus(hdfsRes).getPath();

        readTFIDFResult(fs, hdfsRes);

        List<Double> sum = computeSum(transactionValues);
        for (int i = 0; i < sum.size(); i++) {
            wordTfidf.put(allWords.get(i), sum.get(i));
        }

        computeMean();
        // Resize the hashmap wordtfidf
        wordTfidf = resizeVector(wordTfidf);
        writeResizedOutputIntoCSV(OUT, wordTfidf);

    } catch (Exception ex) {
        Logger.getLogger(TFIDFTermsDriver.class.getName()).log(Level.SEVERE, "TFIDF fail", ex);
    }

}

From source file: eu.scape_project.archiventory.hadoop.ArcRecordReader.java

License: Apache License

@Override
public void initialize(InputSplit is, TaskAttemptContext tac) throws IOException, InterruptedException {
    //throw new UnsupportedOperationException("Unused.");

    FileSplit fileSplit = (FileSplit) is;
    try {
        Path path = fileSplit.getPath();

        FileSystem fileSystem = path.getFileSystem(tac.getConfiguration());

        FSDataInputStream fileInputStream = fileSystem.open(path);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        fileLength = fileStatus.getLen();
        ArchiveReader reader = ArchiveReaderFactory.get(path.getName(), fileInputStream, true);
        recordIterator = reader.iterator();

        currentKey = new Text();
        currentArcRecord = new ArcRecord();
    } catch (IOException ex) {
        Logger.getLogger(ArcRecordReader.class.getName()).log(Level.SEVERE, null, ex);
    }

}

From source file: eu.scape_project.arcunpacker.mapreduce.ArcRecordReader.java

License: Apache License

@Override
public void initialize(InputSplit is, TaskAttemptContext tac) throws IOException, InterruptedException {
    //throw new UnsupportedOperationException("Unused.");

    FileSplit fileSplit = (FileSplit) is;
    try {
        Path path = fileSplit.getPath();

        FileSystem fileSystem = path.getFileSystem(tac.getConfiguration());

        FSDataInputStream fileInputStream = fileSystem.open(path);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        long fileLength = fileStatus.getLen();

        archiveReaderDelegate = new HeritrixWrapper(path.getName(), fileInputStream, fileLength);
        key = new Text();
        value = new HadoopArcRecord();

    } catch (IOException ex) {
        Logger.getLogger(ArcRecordReader.class.getName()).log(Level.SEVERE, null, ex);
        throw new IOException(ex);
    }

}

From source file: eu.scape_project.pt.mapred.input.ControlFileInputFormat.java

License: Apache License

/**
 * Gets block locations of input files sorted
 * by the total number of occurrences.
 *
 * @param fs Hadoop filesystem handle
 * @param inFiles array of input files
 * @return sorted String array
 */
public static String[] getSortedHosts(FileSystem fs, Path[] inFiles) throws IOException {
    final Map<String, Integer> hostMap = new HashMap<String, Integer>();
    for (Path inFile : inFiles) {
        FileStatus s = fs.getFileStatus(inFile);
        BlockLocation[] locations = fs.getFileBlockLocations(s, 0, s.getLen());
        for (BlockLocation location : locations) {
            String[] hosts = location.getHosts();
            for (String host : hosts) {
                if (!hostMap.containsKey(host)) {
                    hostMap.put(host, 1);
                    continue;
                }
                hostMap.put(host, hostMap.get(host) + 1);
            }
        }
    }
    // sort hosts by number of references to blocks of input files
    List<String> hosts = new ArrayList<String>();
    hosts.addAll(hostMap.keySet());
    Collections.sort(hosts, new Comparator<String>() {
        @Override
        public int compare(String host1, String host2) {
            return hostMap.get(host2) - hostMap.get(host1);
        }
    });
    return hosts.toArray(new String[0]);

}
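
The helper above uses getFileStatus only to obtain each file's length before requesting its block locations. A hypothetical, self-contained call site might look like the following sketch (the class name and the command-line paths are placeholders, not part of the project):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import eu.scape_project.pt.mapred.input.ControlFileInputFormat;

public class SortedHostsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Placeholder paths; in practice these are the job's input files.
        Path[] inFiles = new Path[args.length];
        for (int i = 0; i < args.length; i++) {
            inFiles[i] = new Path(args[i]);
        }

        // Hosts that store the most blocks of the given files come first.
        for (String host : ControlFileInputFormat.getSortedHosts(fs, inFiles)) {
            System.out.println(host);
        }
    }
}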

From source file: eu.stratosphere.hadoopcompatibility.FileOutputCommitterWrapper.java

License: Apache License

private void moveTaskOutputs(JobConf conf, TaskAttemptID taskAttemptID, FileSystem fs, Path jobOutputDir,
        Path taskOutput) throws IOException {
    if (fs.isFile(taskOutput)) {
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput,
                getTempTaskOutputPath(conf, taskAttemptID));
        if (!fs.rename(taskOutput, finalOutputPath)) {
            if (!fs.delete(finalOutputPath, true)) {
                throw new IOException("Failed to delete earlier output of task: " + taskAttemptID);
            }
            if (!fs.rename(taskOutput, finalOutputPath)) {
                throw new IOException("Failed to save output of task: " + taskAttemptID);
            }
        }
        LOG.debug("Moved " + taskOutput + " to " + finalOutputPath);
    } else if (fs.getFileStatus(taskOutput).isDir()) {
        FileStatus[] paths = fs.listStatus(taskOutput);
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput,
                getTempTaskOutputPath(conf, taskAttemptID));
        fs.mkdirs(finalOutputPath);
        if (paths != null) {
            for (FileStatus path : paths) {
                moveTaskOutputs(conf, taskAttemptID, fs, jobOutputDir, path.getPath());
            }
        }
    }
}

From source file: eu.stratosphere.hadoopcompatibility.mapreduce.HadoopOutputFormat.java

License: Apache License

/**
 * commit the task by moving the output file out from the temporary directory.
 * @throws IOException
 */
@Override
public void close() throws IOException {
    try {
        this.recordWriter.close(this.context);
    } catch (InterruptedException e) {
        throw new IOException("Could not close RecordReader.", e);
    }

    if (this.fileOutputCommitter.needsTaskCommit(this.context)) {
        this.fileOutputCommitter.commitTask(this.context);
    }
    this.fileOutputCommitter.commitJob(this.context);

    // rename tmp-* files to final name
    FileSystem fs = FileSystem.get(this.configuration);

    Path outputPath = new Path(this.configuration.get("mapred.output.dir"));

    final Pattern p = Pattern.compile("tmp-(.)-([0-9]+)");

    // isDirectory does not work in hadoop 1
    if (fs.getFileStatus(outputPath).isDir()) {
        FileStatus[] files = fs.listStatus(outputPath);

        for (FileStatus f : files) {
            Matcher m = p.matcher(f.getPath().getName());
            if (m.matches()) {
                int part = Integer.valueOf(m.group(2));
                fs.rename(f.getPath(), new Path(outputPath.toString() + "/" + part));
            }
        }
    }
}

From source file: eu.stratosphere.yarn.Utils.java

License: Apache License

public static void registerLocalResource(FileSystem fs, Path remoteRsrcPath, LocalResource localResource)
        throws IOException {
    FileStatus jarStat = fs.getFileStatus(remoteRsrcPath);
    localResource.setResource(ConverterUtils.getYarnUrlFromURI(remoteRsrcPath.toUri()));
    localResource.setSize(jarStat.getLen());
    localResource.setTimestamp(jarStat.getModificationTime());
    localResource.setType(LocalResourceType.FILE);
    localResource.setVisibility(LocalResourceVisibility.PUBLIC);
}

From source file: fi.tkk.ics.hadoop.bam.BAMRecordReader.java

License: Open Source License

@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    // This method should only be called once (see Hadoop API). However,
    // there seems to be disagreement between implementations that call
    // initialize() and Hadoop-BAM's own code that relies on
    // {@link BAMInputFormat} to call initialize() when the reader is
    // created. Therefore we add this check for the time being. 
    if (isInitialized)
        close();
    isInitialized = true;

    final Configuration conf = ContextUtil.getConfiguration(ctx);

    final FileVirtualSplit split = (FileVirtualSplit) spl;
    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(conf);

    this.stringency = SAMHeaderReader.getValidationStringency(conf);

    final FSDataInputStream in = fs.open(file);

    codec = new BAMRecordCodec(SAMHeaderReader.readSAMHeaderFrom(in, conf));

    in.seek(0);
    bci = new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

    final long virtualStart = split.getStartVirtualOffset();

    fileStart = virtualStart >>> 16;
    virtualEnd = split.getEndVirtualOffset();

    bci.seek(virtualStart);
    codec.setInputStream(bci);

    if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
        final long recordStart = virtualStart & 0xffff;
        System.err.println(
                "XXX inizialized BAMRecordReader byte offset: " + fileStart + " record offset: " + recordStart);
    }
}

From source file: fi.tkk.ics.hadoop.bam.BCFRecordReader.java

License: Open Source License

@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    isBGZF = spl instanceof FileVirtualSplit;
    if (isBGZF) {
        final FileVirtualSplit split = (FileVirtualSplit) spl;

        final Path file = split.getPath();
        final FileSystem fs = file.getFileSystem(ContextUtil.getConfiguration(ctx));

        final FSDataInputStream inFile = fs.open(file);

        bci = new BlockCompressedInputStream(inFile);
        in = new PositionalBufferedStream(bci);
        initContigDict();

        inFile.seek(0);
        bci = new BlockCompressedInputStream(
                new WrapSeekable<FSDataInputStream>(inFile, fs.getFileStatus(file).getLen(), file));

        final long virtualStart = split.getStartVirtualOffset(), virtualEnd = split.getEndVirtualOffset();

        this.fileStart = virtualStart >>> 16;
        this.length = (virtualEnd >>> 16) - fileStart;

        bci.seek(virtualStart);

        // Since PositionalBufferedStream does its own buffering, we have to
        // prevent it from going too far by using a BGZFLimitingStream. It
        // also allows nextKeyValue() to simply check for EOF instead of
        // looking at virtualEnd.
        in = new PositionalBufferedStream(new BGZFLimitingStream(bci, virtualEnd));
    } else {
        final FileSplit split = (FileSplit) spl;

        this.fileStart = split.getStart();
        this.length = split.getLength();

        final Path file = split.getPath();

        in = new PositionalBufferedStream(file.getFileSystem(ContextUtil.getConfiguration(ctx)).open(file));

        initContigDict();

        in.skip(fileStart - in.getPosition());
    }
}