Example usage for org.apache.hadoop.fs FileSystem getFileChecksum

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem getFileChecksum.

Prototype

public FileChecksum getFileChecksum(Path f) throws IOException 

Document

Get the checksum of a file, if the FS supports checksums.
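
A minimal, hedged sketch of a typical call (the path used here is a hypothetical placeholder, not taken from the examples below): obtain a FileSystem for the path, request the checksum, and handle the null return that signals the filesystem does not support checksums.

import java.io.IOException;
import java.math.BigInteger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileChecksumExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical path; replace with a real file on your cluster.
        Path file = new Path("/tmp/example.txt");
        FileSystem fs = file.getFileSystem(conf);

        FileChecksum checksum = fs.getFileChecksum(file);
        if (checksum == null) {
            // Null means this filesystem (e.g. the raw local FS) does not
            // expose checksums for the file.
            System.out.println("Checksum not supported for " + file);
        } else {
            System.out.println(checksum.getAlgorithmName() + ": "
                    + new BigInteger(1, checksum.getBytes()).toString(16));
        }
    }
}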

Usage

From source file: com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License: Apache License

/**
 * @param indexDescriptor the index descriptor for the base file
 * @param context the job context
 * @return true if the current version of the base file's checksum
 * matches what was stored in the indexDescriptor.
 * @throws IOException
 */
protected static boolean verifyInputFileCheckSum(FileIndexDescriptor indexDescriptor, JobContext context)
        throws IOException {

    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.get(conf);

    Path file = new Path(indexDescriptor.getSourcePath());
    FileChecksum oldChecksum = indexDescriptor.getChecksum();

    // check InputFile Checksum.
    org.apache.hadoop.fs.FileChecksum cksum = fs.getFileChecksum(file);
    if (cksum != null) {
        FileChecksum newCksum = new FileChecksum(cksum.getAlgorithmName(), ByteBuffer.wrap(cksum.getBytes()),
                cksum.getLength());
        return (newCksum.equals(oldChecksum));
    }
    return true;
}
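
Note that the method above deliberately returns true when getFileChecksum yields null, so verification degrades gracefully on filesystems without checksum support. The same null-tolerant policy can be applied when comparing two live files; the following is a sketch under that assumption (the helper name is hypothetical):

static boolean sameChecksum(FileSystem fs, Path a, Path b) throws IOException {
    org.apache.hadoop.fs.FileChecksum ca = fs.getFileChecksum(a);
    org.apache.hadoop.fs.FileChecksum cb = fs.getFileChecksum(b);
    // A null checksum means the filesystem cannot compute one; treat that
    // as a pass, matching the lenient policy of verifyInputFileCheckSum.
    return ca == null || cb == null || ca.equals(cb);
}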

From source file: com.wipro.ats.bdre.dq.DQDriver.java

License: Apache License

@Override
public int run(String[] arg) throws Exception {
    String processId = arg[0];
    String sPath = arg[1];
    String destDir = arg[2];

    Properties props = new GetProperties().getProperties(processId, "dq");
    LOGGER.debug("props=" + props);
    Configuration conf = getConf();

    conf.set("dq.process.id", processId);
    Job job = Job.getInstance(conf);
    job.setJobName("Data Quality " + processId);
    job.setJarByClass(DQDriver.class);
    job.setMapperClass(DQMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    //Reducer is not required
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    Path inputFilePath = new Path(sPath);
    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, removeIfExistAndSetOutputPath(conf, destDir));
    MultipleOutputs.addNamedOutput(job, DQConstants.GOOD_RECORDS_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.BAD_RECORDS_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.FILE_REPORT_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);

    if (!job.waitForCompletion(true)) {
        return 1;
    }

    Path outputDir = new Path(destDir);
    FileSystem srcFs = outputDir.getFileSystem(getConf());
    FileSystem destFs = outputDir.getFileSystem(getConf());

    //Valid Records
    Path goodFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_GOOD_RECORD_OUTPUT_DIR);
    //Input and quality-filtered files should have the same name (but different paths)
    Path goodDestFile = new Path(destDir + "/" + inputFilePath.getName());
    if (srcFs.exists(goodFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, goodFilesSrcDir, destFs, goodDestFile, true, conf, "");
    }
    // Invalid Records
    Path badFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_BAD_RECORD_OUTPUT_DIR);
    Path badDestFile = new Path(destDir + "/" + DQConstants.BAD_RECORDS_FILE);
    if (srcFs.exists(badFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, badFilesSrcDir, destFs, badDestFile, true, conf, "");
    }

    // Preparing report aggregation job
    Job fileReportAggregationJob = Job.getInstance(conf);
    fileReportAggregationJob.setJobName("File Report Computing " + processId);
    fileReportAggregationJob.setJarByClass(DQMain.class);

    fileReportAggregationJob.setMapperClass(DQFileReportMapper.class);
    fileReportAggregationJob.setMapOutputKeyClass(Text.class);
    fileReportAggregationJob.setMapOutputValueClass(IntWritable.class);

    fileReportAggregationJob.setReducerClass(DQFileReportReducer.class);
    fileReportAggregationJob.setOutputKeyClass(Text.class);
    fileReportAggregationJob.setOutputValueClass(Text.class);

    fileReportAggregationJob.setNumReduceTasks(1);

    Path fileReportDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_REPORT_OUTPUT_DIR);
    Path fileReportOutputDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);

    FileInputFormat.addInputPath(fileReportAggregationJob, fileReportDir);
    FileOutputFormat.setOutputPath(fileReportAggregationJob, fileReportOutputDir);

    if (!fileReportAggregationJob.waitForCompletion(true)) {
        return 1;
    }

    // Merge Report Records MR stuffs
    Path reportsSrcDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);
    Path reportsDestFile = new Path(destDir + "/" + DQConstants.FILE_REPORT_FILE);
    FileUtil.copyMerge(srcFs, reportsSrcDir, destFs, reportsDestFile, true, conf, "");

    Path reportDestFile = new Path(outputDir.toString() + "/" + DQConstants.FILE_REPORT_FILE);
    //Read the report file from HDFS and report the percentage
    DQStats dqStats = getQualityStats(getConf(), reportDestFile);
    LOGGER.info("Percentage of good records :" + dqStats.getGoodPercent());
    props = new GetProperties().getProperties(processId, "dq");
    String strThreshold = props.getProperty("min.pass.threshold.percent");
    float threshold = Float.parseFloat(strThreshold);
    dqStats.setThreshold(threshold);
    //Update the result in metadata
    logResult(dqStats, processId, 0L);
    if (dqStats.getGoodPercent() < threshold) {
        LOGGER.error("DQ check did not pass");
        throw new DQValidationException(dqStats);
    }
    LOGGER.info(dqStats);
    FileChecksum hdfsChecksum = destFs.getFileChecksum(goodDestFile);
    String fileHash = hdfsChecksum == null ? "0" : hdfsChecksum.toString();
    //Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash(fileHash);
    registerFileInfo.setFileSize(destFs.getFileStatus(goodDestFile).getLen());
    registerFileInfo.setPath(goodDestFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);

    return 0;
}

From source file: fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsMapperHadoopModule.java

License: LGPL

/**
 * Compute the checksum of a ZIP file or use the HDFS checksum if available.
 * @param file the zip input file
 * @param conf The Hadoop configuration
 * @return the checksum as a string
 * @throws IOException if an error occurs while creating the checksum
 */
static String computeZipCheckSum(final DataFile file, final Configuration conf) throws IOException {

    final Path path = new Path(file.getSource());

    FileSystem fs = FileSystem.get(path.toUri(), conf);
    final FileChecksum checksum = fs.getFileChecksum(path);

    // If it exists, use the checksum provided by the file system
    if (checksum != null) {
        return new BigInteger(1, checksum.getBytes()).toString(16);
    }

    // Fallback solution
    return computeZipCheckSum(file.open());
}
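
The stream-based overload computeZipCheckSum(InputStream) used as the fallback is not shown on this page. A minimal sketch of what such a fallback could look like, assuming an MD5 digest over the stream (an assumption for illustration, not necessarily what Eoulsan does):

import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

// Hypothetical fallback: digest the whole stream with MD5 and render the
// result as hex, mirroring the BigInteger conversion above.
static String computeZipCheckSum(final InputStream in) throws IOException {
    try {
        final MessageDigest md = MessageDigest.getInstance("MD5");
        final byte[] buffer = new byte[8192];
        int n;
        while ((n = in.read(buffer)) != -1) {
            md.update(buffer, 0, n);
        }
        return new BigInteger(1, md.digest()).toString(16);
    } catch (NoSuchAlgorithmException e) {
        throw new IOException(e);
    } finally {
        in.close();
    }
}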

From source file: fr.ens.biologie.genomique.eoulsan.modules.mgmt.hadoop.DistCp.java

License: LGPL

/**
 * Check whether the contents of src and dst are the same. Return false if
 * dstpath does not exist. If the files have different sizes, return false.
 * If the files have the same size, their checksums are compared. When file
 * checksums are not supported by either file system, two files are
 * considered the same if they have the same size.
 */
static private boolean sameFile(final FileSystem srcfs, final FileStatus srcstatus, final FileSystem dstfs,
        final Path dstpath) throws IOException {
    FileStatus dststatus;
    try {
        dststatus = dstfs.getFileStatus(dstpath);
    } catch (FileNotFoundException fnfe) {
        return false;
    }

    // same length?
    if (srcstatus.getLen() != dststatus.getLen()) {
        return false;
    }

    // get src checksum
    final FileChecksum srccs;
    try {
        srccs = srcfs.getFileChecksum(srcstatus.getPath());
    } catch (FileNotFoundException fnfe) {
        /*
         * Two possible cases: (1) src existed once but was deleted between the
         * time period that srcstatus was obtained and the try block above. (2)
         * srcfs does not support file checksum and (incorrectly) throws FNFE,
         * e.g. some previous versions of HftpFileSystem. For case (1), it is okay
         * to return true since src was already deleted. For case (2), true should
         * be returned.
         */
        return true;
    }

    // compare checksums
    try {
        final FileChecksum dstcs = dstfs.getFileChecksum(dststatus.getPath());
        // return true if checksum is not supported
        // (i.e. one of the checksums is null)
        return srccs == null || dstcs == null || srccs.equals(dstcs);
    } catch (FileNotFoundException fnfe) {
        return false;
    }
}
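
One caveat worth noting: checksums from different filesystems are only meaningful to compare when their algorithms match. On HDFS the name returned by getAlgorithmName() encodes the cluster's checksum configuration (e.g. "MD5-of-262144MD5-of-512CRC32C"), so clusters with different block or chunk settings can report unequal checksums for identical contents. A hedged helper (name hypothetical) to test comparability first:

static boolean comparableChecksums(FileChecksum a, FileChecksum b) {
    // Unequal algorithm names (common across clusters with different
    // bytes-per-CRC or block sizes) make a byte comparison meaningless.
    return a != null && b != null
            && a.getAlgorithmName().equals(b.getAlgorithmName());
}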

From source file: org.jd.copier.mapred.DistCp.java

License: Apache License

/**
 * Check whether the contents of src and dst are the same.
 *
 * Return false if dstpath does not exist.
 *
 * If the files have different sizes, return false.
 *
 * If the files have the same size, their checksums are compared.
 *
 * When file checksums are not supported by either file system,
 * two files are considered the same if they have the same size.
 */
static private boolean sameFile(FileSystem srcfs, FileStatus srcstatus, FileSystem dstfs, Path dstpath,
        boolean skipCRCCheck) throws IOException {
    FileStatus dststatus;
    try {
        dststatus = dstfs.getFileStatus(dstpath);
    } catch (FileNotFoundException fnfe) {
        return false;
    }

    //same length?
    if (srcstatus.getLen() != dststatus.getLen()) {
        return false;
    }

    if (skipCRCCheck) {
        LOG.debug("Skipping CRC Check");
        return true;
    }

    //get src checksum
    final FileChecksum srccs;
    try {
        srccs = srcfs.getFileChecksum(srcstatus.getPath());
    } catch (FileNotFoundException fnfe) {
        /*
         * Two possible cases:
         * (1) src existed once but was deleted between the time period that
         *     srcstatus was obtained and the try block above.
         * (2) srcfs does not support file checksum and (incorrectly) throws
         *     FNFE, e.g. some previous versions of HftpFileSystem.
         * For case (1), it is okay to return true since src was already deleted.
         * For case (2), true should be returned.  
         */
        return true;
    }

    //compare checksums
    try {
        final FileChecksum dstcs = dstfs.getFileChecksum(dststatus.getPath());
        //return true if checksum is not supported
        //(i.e. one of the checksums is null)
        return srccs == null || dstcs == null || srccs.equals(dstcs);
    } catch (FileNotFoundException fnfe) {
        return false;
    }
}