List of usage examples for org.apache.hadoop.fs.FileSystem.getFileStatus
public abstract FileStatus getFileStatus(Path f) throws IOException;
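getFileStatus returns a single FileStatus describing the file or directory at the given path and throws FileNotFoundException (an IOException subclass) if nothing exists there. Before the project-specific examples below, here is a minimal sketch of the typical call pattern, assuming a default Configuration and a hypothetical path (not taken from any of the examples):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileStatusSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path("/tmp/example.txt");      // hypothetical path
        FileStatus status = fs.getFileStatus(p);    // throws FileNotFoundException if p does not exist
        System.out.println("length=" + status.getLen()
                + " modified=" + status.getModificationTime()
                + " isDir=" + status.isDirectory()); // isDir() on older Hadoop 1.x releases
    }
}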
From source file:eu.edisonproject.training.tfidf.mapreduce.TermWordFrequency.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration jobconf = getConf();
    FileSystem fs = FileSystem.get(jobconf);
    fs.delete(new Path(args[1]), true);
    Path in = new Path(args[0]);
    Path inHdfs = in;

    if (!jobconf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        inHdfs = new Path(in.getName());
        fs.delete(inHdfs, true);
        fs.copyFromLocalFile(in, inHdfs);
        fs.deleteOnExit(inHdfs);
        FileStatus inHdfsStatus = fs.getFileStatus(inHdfs);
        // Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Copied: {0} to: {1}",
        //         new Object[]{in.toUri(), inHdfsStatus.getPath().toUri()});
    }

    Job job = Job.getInstance(jobconf);

    Path stopwordsLocal = new Path(args[3]);
    stopwords = new Path(stopwordsLocal.getName());
    fs.delete(stopwords, true);
    fs.copyFromLocalFile(stopwordsLocal, stopwords);
    fs.deleteOnExit(stopwords);
    FileStatus stopwordsStatus = fs.getFileStatus(stopwords);
    stopwords = stopwordsStatus.getPath();
    job.addCacheFile(stopwords.toUri());

    Path localDocs = new Path(args[2]);
    Path hdfsDocs = new Path(localDocs.getName());
    fs.mkdirs(hdfsDocs);
    hdfsDocs = fs.getFileStatus(hdfsDocs).getPath();
    fs.delete(hdfsDocs, true);

    // FileStatus[] stats = fs.listStatus(localDocs);
    File[] stats = new File(localDocs.toString()).listFiles();
    for (File stat : stats) {
        // for (FileStatus stat : stats) {
        Path filePath = new Path(stat.getAbsolutePath());
        if (FilenameUtils.getExtension(filePath.getName()).endsWith("txt")) {
            Path dest = new Path(hdfsDocs.toUri() + "/" + filePath.getName());
            fs.copyFromLocalFile(filePath, dest);
        }
    }
    job.addCacheFile(hdfsDocs.toUri());

    job.setJarByClass(TermWordFrequency.class);
    job.setJobName("Word Frequency Term Driver");

    FileInputFormat.setInputPaths(job, inHdfs);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // job.setInputFormatClass(TextInputFormat.class);
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, inHdfs);
    NLineInputFormat.setNumLinesPerSplit(job, Integer.valueOf(args[4]));
    NLineInputFormat.setMaxInputSplitSize(job, 500);
    Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Num. of lines: {0}",
            NLineInputFormat.getNumLinesPerSplit(job));

    job.setMapperClass(TermWordFrequencyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(TermWordFrequencyReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:eu.edisonproject.training.tfidf.mapreduce.TFIDFTermsDriver.java
License:Apache License
@Override
public void executeTFIDF(String inputPath) {
    try {
        String[] args1 = { inputPath, OUTPUT_PATH1, TEXT_FILES_DIR_PATH, STOPWORDS_PATH, NUM_OF_LINES };
        ToolRunner.run(new TermWordFrequency(), args1);
        String[] args2 = { INPUT_PATH2, OUTPUT_PATH2 };
        ToolRunner.run(new WordCountsForDocsDriver(), args2);

        File docs = new File(TEXT_FILES_DIR_PATH);
        File[] files = docs.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".txt");
            }
        });
        String[] args3 = { INPUT_PATH3, OUTPUT_PATH3, String.valueOf(files.length) };
        ToolRunner.run(new WordsInCorpusTFIDFDriver(), args3);
        String[] args4 = { INPUT_PATH4, OUTPUT_PATH4 };
        ToolRunner.run(new WordsGroupByTitleDriver(), args4);

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path hdfsRes = new Path(OUTPUT_PATH4 + File.separator + "part-r-00000");
        hdfsRes = fs.getFileStatus(hdfsRes).getPath();
        readTFIDFResult(fs, hdfsRes);

        List<Double> sum = computeSum(transactionValues);
        for (int i = 0; i < sum.size(); i++) {
            wordTfidf.put(allWords.get(i), sum.get(i));
        }
        computeMean();
        // Resize the hashmap wordTfidf
        wordTfidf = resizeVector(wordTfidf);
        writeResizedOutputIntoCSV(OUT, wordTfidf);
    } catch (Exception ex) {
        Logger.getLogger(TFIDFTermsDriver.class.getName()).log(Level.SEVERE, "TFIDF fail", ex);
    }
}
From source file:eu.scape_project.archiventory.hadoop.ArcRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit is, TaskAttemptContext tac) throws IOException, InterruptedException {
    // throw new UnsupportedOperationException("Unused.");
    FileSplit fileSplit = (FileSplit) is;
    try {
        Path path = fileSplit.getPath();
        FileSystem fileSystem = path.getFileSystem(tac.getConfiguration());
        FSDataInputStream fileInputStream = fileSystem.open(path);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        fileLength = fileStatus.getLen();
        ArchiveReader reader = ArchiveReaderFactory.get(path.getName(), fileInputStream, true);
        recordIterator = reader.iterator();
        currentKey = new Text();
        currentArcRecord = new ArcRecord();
    } catch (IOException ex) {
        Logger.getLogger(ArcRecordReader.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:eu.scape_project.arcunpacker.mapreduce.ArcRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit is, TaskAttemptContext tac) throws IOException, InterruptedException {
    // throw new UnsupportedOperationException("Unused.");
    FileSplit fileSplit = (FileSplit) is;
    try {
        Path path = fileSplit.getPath();
        FileSystem fileSystem = path.getFileSystem(tac.getConfiguration());
        FSDataInputStream fileInputStream = fileSystem.open(path);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        long fileLength = fileStatus.getLen();
        archiveReaderDelegate = new HeritrixWrapper(path.getName(), fileInputStream, fileLength);
        key = new Text();
        value = new HadoopArcRecord();
    } catch (IOException ex) {
        Logger.getLogger(ArcRecordReader.class.getName()).log(Level.SEVERE, null, ex);
        throw new IOException(ex);
    }
}
From source file:eu.scape_project.pt.mapred.input.ControlFileInputFormat.java
License:Apache License
/**
 * Returns the hosts that hold blocks of the given input files, sorted in
 * descending order by how many of those blocks each host stores.
 *
 * @param fs Hadoop filesystem handle
 * @param inFiles array of input files
 * @return host names sorted by block count, most blocks first
 */
public static String[] getSortedHosts(FileSystem fs, Path[] inFiles) throws IOException {
    final Map<String, Integer> hostMap = new HashMap<String, Integer>();
    for (Path inFile : inFiles) {
        FileStatus s = fs.getFileStatus(inFile);
        BlockLocation[] locations = fs.getFileBlockLocations(s, 0, s.getLen());
        for (BlockLocation location : locations) {
            String[] hosts = location.getHosts();
            for (String host : hosts) {
                if (!hostMap.containsKey(host)) {
                    hostMap.put(host, 1);
                    continue;
                }
                hostMap.put(host, hostMap.get(host) + 1);
            }
        }
    }
    // sort hosts by number of references to blocks of input files
    List<String> hosts = new ArrayList<String>();
    hosts.addAll(hostMap.keySet());
    Collections.sort(hosts, new Comparator<String>() {
        @Override
        public int compare(String host1, String host2) {
            return hostMap.get(host2) - hostMap.get(host1);
        }
    });
    return hosts.toArray(new String[0]);
}
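A possible call site for this helper, sketched under the assumption that a Configuration named conf is already in scope; the input paths are hypothetical and only getSortedHosts itself comes from the source above:

FileSystem fs = FileSystem.get(conf);                                        // conf: existing Configuration (assumption)
Path[] inputs = { new Path("/data/in/a.dat"), new Path("/data/in/b.dat") };  // hypothetical input paths
String[] hosts = ControlFileInputFormat.getSortedHosts(fs, inputs);
// hosts[0] is the node storing the most blocks of the input files,
// e.g. a candidate for a split's preferred location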
From source file:eu.stratosphere.hadoopcompatibility.FileOutputCommitterWrapper.java
License:Apache License
private void moveTaskOutputs(JobConf conf, TaskAttemptID taskAttemptID, FileSystem fs, Path jobOutputDir,
        Path taskOutput) throws IOException {
    if (fs.isFile(taskOutput)) {
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput,
                getTempTaskOutputPath(conf, taskAttemptID));
        if (!fs.rename(taskOutput, finalOutputPath)) {
            if (!fs.delete(finalOutputPath, true)) {
                throw new IOException("Failed to delete earlier output of task: " + taskAttemptID);
            }
            if (!fs.rename(taskOutput, finalOutputPath)) {
                throw new IOException("Failed to save output of task: " + taskAttemptID);
            }
        }
        LOG.debug("Moved " + taskOutput + " to " + finalOutputPath);
    } else if (fs.getFileStatus(taskOutput).isDir()) {
        FileStatus[] paths = fs.listStatus(taskOutput);
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput,
                getTempTaskOutputPath(conf, taskAttemptID));
        fs.mkdirs(finalOutputPath);
        if (paths != null) {
            for (FileStatus path : paths) {
                moveTaskOutputs(conf, taskAttemptID, fs, jobOutputDir, path.getPath());
            }
        }
    }
}
From source file:eu.stratosphere.hadoopcompatibility.mapreduce.HadoopOutputFormat.java
License:Apache License
/**
 * Commit the task by moving the output file out from the temporary directory.
 * @throws IOException
 */
@Override
public void close() throws IOException {
    try {
        this.recordWriter.close(this.context);
    } catch (InterruptedException e) {
        throw new IOException("Could not close RecordReader.", e);
    }
    if (this.fileOutputCommitter.needsTaskCommit(this.context)) {
        this.fileOutputCommitter.commitTask(this.context);
    }
    this.fileOutputCommitter.commitJob(this.context);

    // rename tmp-* files to final name
    FileSystem fs = FileSystem.get(this.configuration);
    Path outputPath = new Path(this.configuration.get("mapred.output.dir"));

    final Pattern p = Pattern.compile("tmp-(.)-([0-9]+)");

    // isDirectory does not work in hadoop 1
    if (fs.getFileStatus(outputPath).isDir()) {
        FileStatus[] files = fs.listStatus(outputPath);
        for (FileStatus f : files) {
            Matcher m = p.matcher(f.getPath().getName());
            if (m.matches()) {
                int part = Integer.valueOf(m.group(2));
                fs.rename(f.getPath(), new Path(outputPath.toString() + "/" + part));
            }
        }
    }
}
From source file:eu.stratosphere.yarn.Utils.java
License:Apache License
public static void registerLocalResource(FileSystem fs, Path remoteRsrcPath, LocalResource localResource)
        throws IOException {
    FileStatus jarStat = fs.getFileStatus(remoteRsrcPath);
    localResource.setResource(ConverterUtils.getYarnUrlFromURI(remoteRsrcPath.toUri()));
    localResource.setSize(jarStat.getLen());
    localResource.setTimestamp(jarStat.getModificationTime());
    localResource.setType(LocalResourceType.FILE);
    localResource.setVisibility(LocalResourceVisibility.PUBLIC);
}
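A hedged sketch of how a helper like this might be wired into a YARN container launch; the resource name, the path variable, and the surrounding setup are assumptions for illustration, not part of the source above:

// assumes fs (FileSystem) and remoteJarPath (Path to an already-uploaded jar) are set up elsewhere
LocalResource appJar = Records.newRecord(LocalResource.class);     // org.apache.hadoop.yarn.util.Records
Utils.registerLocalResource(fs, remoteJarPath, appJar);

Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
localResources.put("app.jar", appJar);                             // hypothetical resource name
// later: containerLaunchContext.setLocalResources(localResources);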
From source file:fi.tkk.ics.hadoop.bam.BAMRecordReader.java
License:Open Source License
@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    // This method should only be called once (see Hadoop API). However,
    // there seems to be disagreement between implementations that call
    // initialize() and Hadoop-BAM's own code that relies on
    // {@link BAMInputFormat} to call initialize() when the reader is
    // created. Therefore we add this check for the time being.
    if (isInitialized)
        close();
    isInitialized = true;

    final Configuration conf = ContextUtil.getConfiguration(ctx);

    final FileVirtualSplit split = (FileVirtualSplit) spl;
    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(conf);

    this.stringency = SAMHeaderReader.getValidationStringency(conf);

    final FSDataInputStream in = fs.open(file);
    codec = new BAMRecordCodec(SAMHeaderReader.readSAMHeaderFrom(in, conf));
    in.seek(0);
    bci = new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

    final long virtualStart = split.getStartVirtualOffset();

    fileStart = virtualStart >>> 16;
    virtualEnd = split.getEndVirtualOffset();

    bci.seek(virtualStart);
    codec.setInputStream(bci);

    if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
        final long recordStart = virtualStart & 0xffff;
        System.err.println("XXX inizialized BAMRecordReader byte offset: " + fileStart
                + " record offset: " + recordStart);
    }
}
From source file:fi.tkk.ics.hadoop.bam.BCFRecordReader.java
License:Open Source License
@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    isBGZF = spl instanceof FileVirtualSplit;
    if (isBGZF) {
        final FileVirtualSplit split = (FileVirtualSplit) spl;

        final Path file = split.getPath();
        final FileSystem fs = file.getFileSystem(ContextUtil.getConfiguration(ctx));

        final FSDataInputStream inFile = fs.open(file);
        bci = new BlockCompressedInputStream(inFile);
        in = new PositionalBufferedStream(bci);
        initContigDict();

        inFile.seek(0);
        bci = new BlockCompressedInputStream(
                new WrapSeekable<FSDataInputStream>(inFile, fs.getFileStatus(file).getLen(), file));

        final long virtualStart = split.getStartVirtualOffset(), virtualEnd = split.getEndVirtualOffset();

        this.fileStart = virtualStart >>> 16;
        this.length = (virtualEnd >>> 16) - fileStart;

        bci.seek(virtualStart);

        // Since PositionalBufferedStream does its own buffering, we have to
        // prevent it from going too far by using a BGZFLimitingStream. It
        // also allows nextKeyValue() to simply check for EOF instead of
        // looking at virtualEnd.
        in = new PositionalBufferedStream(new BGZFLimitingStream(bci, virtualEnd));
    } else {
        final FileSplit split = (FileSplit) spl;

        this.fileStart = split.getStart();
        this.length = split.getLength();

        final Path file = split.getPath();

        in = new PositionalBufferedStream(file.getFileSystem(ContextUtil.getConfiguration(ctx)).open(file));
        initContigDict();

        in.skip(fileStart - in.getPosition());
    }
}