List of usage examples for org.apache.hadoop.fs.FileSystem#getFileChecksum
public FileChecksum getFileChecksum(Path f) throws IOException
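Before the project-specific examples below, a minimal sketch of the call itself may help. The class name and the path /user/demo/data.txt are hypothetical, and the default Configuration is assumed to point at a file system (such as HDFS) that supports checksums; getFileChecksum may return null otherwise.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileChecksumExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/user/demo/data.txt"); // hypothetical path
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        // May be null if the underlying file system does not supply checksums
        FileChecksum checksum = fs.getFileChecksum(path);
        if (checksum != null) {
            System.out.println(checksum.getAlgorithmName() + ": " + checksum);
        } else {
            System.out.println("Checksum not supported for " + path);
        }
    }
}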
From source file: com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License: Apache License

/**
 * @param indexDescriptor
 * @param context
 * @return true if the current version of the base file's checksum
 *         matches what was stored in the indexDescriptor.
 * @throws IOException
 */
protected static boolean verifyInputFileCheckSum(FileIndexDescriptor indexDescriptor, JobContext context)
        throws IOException {
    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    Path file = new Path(indexDescriptor.getSourcePath());
    FileChecksum oldChecksum = indexDescriptor.getChecksum();

    // check InputFile Checksum.
    org.apache.hadoop.fs.FileChecksum cksum = fs.getFileChecksum(file);
    if (cksum != null) {
        FileChecksum newCksum = new FileChecksum(cksum.getAlgorithmName(),
                ByteBuffer.wrap(cksum.getBytes()), cksum.getLength());
        return (newCksum.equals(oldChecksum));
    }
    return true;
}
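Two details stand out in this example. The unqualified FileChecksum is ElephantTwin's own index descriptor type, which is why the Hadoop result is referenced by its fully qualified name org.apache.hadoop.fs.FileChecksum. And when the file system returns no checksum, the method returns true, so verification degrades to a no-op rather than failing the job.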
From source file: com.wipro.ats.bdre.dq.DQDriver.java
License: Apache License

@Override
public int run(String[] arg) throws Exception {
    String processId = arg[0];
    String sPath = arg[1];
    String destDir = arg[2];

    Properties props = new GetProperties().getProperties(processId, "dq");
    LOGGER.debug("props=" + props);

    Configuration conf = getConf();
    conf.set("dq.process.id", processId);
    Job job = Job.getInstance(conf);
    job.setJobName("Data Quality " + processId);
    job.setJarByClass(DQDriver.class);
    job.setMapperClass(DQMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    // Reducer is not required
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    Path inputFilePath = new Path(sPath);
    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, removeIfExistAndSetOutputPath(conf, destDir));
    MultipleOutputs.addNamedOutput(job, DQConstants.GOOD_RECORDS_FILE, TextOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.BAD_RECORDS_FILE, TextOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.FILE_REPORT_FILE, TextOutputFormat.class,
            Text.class, NullWritable.class);
    if (!job.waitForCompletion(true)) {
        return 1;
    }

    Path outputDir = new Path(destDir);
    FileSystem srcFs = outputDir.getFileSystem(getConf());
    FileSystem destFs = outputDir.getFileSystem(getConf());

    // Valid records
    Path goodFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_GOOD_RECORD_OUTPUT_DIR);
    // The input and the quality-filtered file should have the same name (but different paths)
    Path goodDestFile = new Path(destDir + "/" + inputFilePath.getName());
    if (srcFs.exists(goodFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, goodFilesSrcDir, destFs, goodDestFile, true, conf, "");
    }

    // Invalid records
    Path badFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_BAD_RECORD_OUTPUT_DIR);
    Path badDestFile = new Path(destDir + "/" + DQConstants.BAD_RECORDS_FILE);
    if (srcFs.exists(badFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, badFilesSrcDir, destFs, badDestFile, true, conf, "");
    }

    // Prepare the report aggregation job
    Job fileReportAggregationJob = Job.getInstance(conf);
    fileReportAggregationJob.setJobName("File Report Computing " + processId);
    fileReportAggregationJob.setJarByClass(DQMain.class);
    fileReportAggregationJob.setMapperClass(DQFileReportMapper.class);
    fileReportAggregationJob.setMapOutputKeyClass(Text.class);
    fileReportAggregationJob.setMapOutputValueClass(IntWritable.class);
    fileReportAggregationJob.setReducerClass(DQFileReportReducer.class);
    fileReportAggregationJob.setOutputKeyClass(Text.class);
    fileReportAggregationJob.setOutputValueClass(Text.class);
    fileReportAggregationJob.setNumReduceTasks(1);

    Path fileReportDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_REPORT_OUTPUT_DIR);
    Path fileReportOutputDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);
    FileInputFormat.addInputPath(fileReportAggregationJob, fileReportDir);
    FileOutputFormat.setOutputPath(fileReportAggregationJob, fileReportOutputDir);
    if (!fileReportAggregationJob.waitForCompletion(true)) {
        return 1;
    }

    // Merge the report records
    Path reportsSrcDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);
    Path reportsDestFile = new Path(destDir + "/" + DQConstants.FILE_REPORT_FILE);
    FileUtil.copyMerge(srcFs, reportsSrcDir, destFs, reportsDestFile, true, conf, "");

    Path reportDestFile = new Path(outputDir.toString() + "/" + DQConstants.FILE_REPORT_FILE);
    // Read the report file from HDFS and report the percentage
    DQStats dqStats = getQualityStats(getConf(), reportDestFile);
    LOGGER.info("Percentage of good records: " + dqStats.getGoodPercent());
    props = new GetProperties().getProperties(processId, "dq");
    String strThreshold = props.getProperty("min.pass.threshold.percent");
    float threshold = Float.parseFloat(strThreshold);
    dqStats.setThreshold(threshold);
    // Update the result in metadata
    logResult(dqStats, processId, 0L);
    if (dqStats.getGoodPercent() < threshold) {
        LOGGER.error("DQ check did not pass");
        throw new DQValidationException(dqStats);
    }
    LOGGER.info(dqStats);

    FileChecksum hdfsChecksum = destFs.getFileChecksum(goodDestFile);
    String fileHash = hdfsChecksum == null ? "0" : hdfsChecksum.toString();
    // Return the file info as Oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash(fileHash);
    registerFileInfo.setFileSize(destFs.getFileStatus(goodDestFile).getLen());
    registerFileInfo.setPath(goodDestFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}
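Here getFileChecksum is used to fingerprint the merged output file: the checksum's toString() value is stored as the file hash in the metadata record, with "0" as a sentinel when the destination file system provides no checksum.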
From source file: fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsMapperHadoopModule.java
License: LGPL

/**
 * Compute the checksum of a ZIP file, or use the HDFS checksum if available.
 * @param file the ZIP input file
 * @param conf the Hadoop configuration
 * @return the checksum as a string
 * @throws IOException if an error occurs while creating the checksum
 */
static String computeZipCheckSum(final DataFile file, final Configuration conf) throws IOException {
    final Path path = new Path(file.getSource());
    FileSystem fs = FileSystem.get(path.toUri(), conf);
    final FileChecksum checksum = fs.getFileChecksum(path);

    // If it exists, use the checksum provided by the file system
    if (checksum != null) {
        return new BigInteger(1, checksum.getBytes()).toString(16);
    }

    // Fallback solution
    return computeZipCheckSum(file.open());
}
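The signum argument 1 in new BigInteger(1, checksum.getBytes()) forces the raw checksum bytes to be read as a non-negative value, and toString(16) renders that value as hexadecimal; note that leading zero bytes are dropped from the resulting string.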
From source file: fr.ens.biologie.genomique.eoulsan.modules.mgmt.hadoop.DistCp.java
License: LGPL

/**
 * Check whether the contents of src and dst are the same.
 * Return false if dstpath does not exist.
 * If the files have different sizes, return false.
 * If the files have the same sizes, the file checksums will be compared.
 * When the file checksum is not supported by either file system,
 * two files are considered the same if they have the same size.
 */
static private boolean sameFile(final FileSystem srcfs, final FileStatus srcstatus, final FileSystem dstfs,
        final Path dstpath) throws IOException {
    FileStatus dststatus;
    try {
        dststatus = dstfs.getFileStatus(dstpath);
    } catch (FileNotFoundException fnfe) {
        return false;
    }

    // Same length?
    if (srcstatus.getLen() != dststatus.getLen()) {
        return false;
    }

    // Get the src checksum
    final FileChecksum srccs;
    try {
        srccs = srcfs.getFileChecksum(srcstatus.getPath());
    } catch (FileNotFoundException fnfe) {
        /*
         * Two possible cases:
         * (1) src existed once but was deleted between the time period that
         *     srcstatus was obtained and the try block above.
         * (2) srcfs does not support file checksums and (incorrectly) throws
         *     FNFE, e.g. some previous versions of HftpFileSystem.
         * For case (1), it is okay to return true since src was already deleted.
         * For case (2), true should be returned.
         */
        return true;
    }

    // Compare checksums
    try {
        final FileChecksum dstcs = dstfs.getFileChecksum(dststatus.getPath());
        // Return true if checksums are not supported
        // (i.e. one of the checksums is null)
        return srccs == null || dstcs == null || srccs.equals(dstcs);
    } catch (FileNotFoundException fnfe) {
        return false;
    }
}
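Two caveats are worth noting for this comparison. getFileChecksum returns null on file systems that do not implement it (the FileSystem base class simply returns null), and HDFS's default MD5-of-MD5-of-CRC32 checksum depends on the bytes-per-checksum and block-size settings, so checksums from differently configured clusters can differ even for identical content. Treating a null checksum as a match is therefore the conservative default here.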
From source file: org.jd.copier.mapred.DistCp.java
License: Apache License

/**
 * Check whether the contents of src and dst are the same.
 *
 * Return false if dstpath does not exist.
 *
 * If the files have different sizes, return false.
 *
 * If the files have the same sizes, the file checksums will be compared.
 *
 * When the file checksum is not supported by either file system,
 * two files are considered the same if they have the same size.
 */
static private boolean sameFile(FileSystem srcfs, FileStatus srcstatus, FileSystem dstfs, Path dstpath,
        boolean skipCRCCheck) throws IOException {
    FileStatus dststatus;
    try {
        dststatus = dstfs.getFileStatus(dstpath);
    } catch (FileNotFoundException fnfe) {
        return false;
    }

    // Same length?
    if (srcstatus.getLen() != dststatus.getLen()) {
        return false;
    }

    if (skipCRCCheck) {
        LOG.debug("Skipping CRC Check");
        return true;
    }

    // Get the src checksum
    final FileChecksum srccs;
    try {
        srccs = srcfs.getFileChecksum(srcstatus.getPath());
    } catch (FileNotFoundException fnfe) {
        /*
         * Two possible cases:
         * (1) src existed once but was deleted between the time period that
         *     srcstatus was obtained and the try block above.
         * (2) srcfs does not support file checksums and (incorrectly) throws
         *     FNFE, e.g. some previous versions of HftpFileSystem.
         * For case (1), it is okay to return true since src was already deleted.
         * For case (2), true should be returned.
         */
        return true;
    }

    // Compare checksums
    try {
        final FileChecksum dstcs = dstfs.getFileChecksum(dststatus.getPath());
        // Return true if checksums are not supported
        // (i.e. one of the checksums is null)
        return srccs == null || dstcs == null || srccs.equals(dstcs);
    } catch (FileNotFoundException fnfe) {
        return false;
    }
}
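This variant differs from the previous DistCp example only in the skipCRCCheck flag, which short-circuits right after the length comparison; this mirrors DistCp's -skipcrccheck option for copies between file systems whose checksums are not comparable.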