Java tutorial
/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.wipro.ats.bdre.dq; import com.wipro.ats.bdre.md.api.GetProperties; import com.wipro.ats.bdre.md.api.ProcessLog; import com.wipro.ats.bdre.md.beans.ProcessLogInfo; import com.wipro.ats.bdre.md.beans.RegisterFileInfo; import com.wipro.ats.bdre.util.OozieUtil; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.*; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.log4j.Logger; import java.io.IOException; import java.io.OutputStream; import java.sql.Timestamp; import java.util.Date; import java.util.Properties; /** * @author Satish Kumar * <p/> * MapReduce program that uses the Drool API to validate the records based on rules defined in DroolFile. * <p/> * To run: bin/hadoop jar DQValidator.jar com.wipro.ats.bdre.dq.DataQualityCheckJob * <input-file-path> <output-dir-path> <drl-file-path> */ public class DQDriver extends Configured implements Tool { private static final Logger LOGGER = Logger.getLogger(DQDriver.class); @Override public int run(String[] arg) throws Exception { String processId = arg[0]; String sPath = arg[1]; String destDir = arg[2]; Properties props = new GetProperties().getProperties(processId, "dq"); LOGGER.debug("props=" + props); Configuration conf = getConf(); conf.set("dq.process.id", processId); Job job = Job.getInstance(conf); job.setJobName("Data Quality " + processId); job.setJarByClass(DQDriver.class); job.setMapperClass(DQMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); //Reducer is not required job.setNumReduceTasks(0); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); Path inputFilePath = new Path(sPath); FileInputFormat.addInputPath(job, inputFilePath); FileOutputFormat.setOutputPath(job, removeIfExistAndSetOutputPath(conf, destDir)); MultipleOutputs.addNamedOutput(job, DQConstants.GOOD_RECORDS_FILE, TextOutputFormat.class, Text.class, NullWritable.class); MultipleOutputs.addNamedOutput(job, DQConstants.BAD_RECORDS_FILE, TextOutputFormat.class, Text.class, NullWritable.class); MultipleOutputs.addNamedOutput(job, DQConstants.FILE_REPORT_FILE, TextOutputFormat.class, Text.class, NullWritable.class); if (!job.waitForCompletion(true)) { return 1; } Path outputDir = new Path(destDir); FileSystem srcFs = outputDir.getFileSystem(getConf()); FileSystem destFs = outputDir.getFileSystem(getConf()); //Valid Records Path goodFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_GOOD_RECORD_OUTPUT_DIR); //Input and quality filtered file should have same name (but different path) Path goodDestFile = new Path(destDir + "/" + inputFilePath.getName()); if (srcFs.exists(goodFilesSrcDir)) { FileUtil.copyMerge(srcFs, goodFilesSrcDir, destFs, goodDestFile, true, conf, ""); } // Invalid Records Path badFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_BAD_RECORD_OUTPUT_DIR); Path badDestFile = new Path(destDir + "/" + DQConstants.BAD_RECORDS_FILE); if (srcFs.exists(badFilesSrcDir)) { FileUtil.copyMerge(srcFs, badFilesSrcDir, destFs, badDestFile, true, conf, ""); } // Preparing report aggregation job Job fileReportAggregationJob = Job.getInstance(conf); fileReportAggregationJob.setJobName("File Report Computing " + processId); fileReportAggregationJob.setJarByClass(DQMain.class); fileReportAggregationJob.setMapperClass(DQFileReportMapper.class); fileReportAggregationJob.setMapOutputKeyClass(Text.class); fileReportAggregationJob.setMapOutputValueClass(IntWritable.class); fileReportAggregationJob.setReducerClass(DQFileReportReducer.class); fileReportAggregationJob.setOutputKeyClass(Text.class); fileReportAggregationJob.setOutputValueClass(Text.class); fileReportAggregationJob.setNumReduceTasks(1); Path fileReportDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_REPORT_OUTPUT_DIR); Path fileReportOutputDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER); FileInputFormat.addInputPath(fileReportAggregationJob, fileReportDir); FileOutputFormat.setOutputPath(fileReportAggregationJob, fileReportOutputDir); if (!fileReportAggregationJob.waitForCompletion(true)) { return 1; } // Merge Report Records MR stuffs Path reportsSrcDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER); Path reportsDestFile = new Path(destDir + "/" + DQConstants.FILE_REPORT_FILE); FileUtil.copyMerge(srcFs, reportsSrcDir, destFs, reportsDestFile, true, conf, ""); Path reportDestFile = new Path(outputDir.toString() + "/" + DQConstants.FILE_REPORT_FILE); //Read the report file from HDFS and report the percentage DQStats dqStats = getQualityStats(getConf(), reportDestFile); LOGGER.info("Percentage of good records :" + dqStats.getGoodPercent()); props = new GetProperties().getProperties(processId, "dq"); String strThreshold = props.getProperty("min.pass.threshold.percent"); float threshold = Float.parseFloat(strThreshold); dqStats.setThreshold(threshold); //Update the result in metadata logResult(dqStats, processId, 0L); if (dqStats.getGoodPercent() < threshold) { LOGGER.error("DQ check did not pass"); throw new DQValidationException(dqStats); } LOGGER.info(dqStats); FileChecksum hdfsChecksum = destFs.getFileChecksum(goodDestFile); String fileHash = hdfsChecksum == null ? "0" : hdfsChecksum.toString(); //Return file info oozie params RegisterFileInfo registerFileInfo = new RegisterFileInfo(); registerFileInfo.setBatchId(null); registerFileInfo.setCreationTs(new Timestamp(new Date().getTime())); registerFileInfo.setFileHash(fileHash); registerFileInfo.setFileSize(destFs.getFileStatus(goodDestFile).getLen()); registerFileInfo.setPath(goodDestFile.toString()); registerFileInfo.setSubProcessId(Integer.parseInt(processId)); OozieUtil oozieUtil = new OozieUtil(); oozieUtil.persistBeanData(registerFileInfo, false); return 0; } private Path removeIfExistAndSetOutputPath(Configuration conf, String path) throws IOException { FileSystem fileSystem = FileSystem.get(conf); Path outputPath = new Path(path); fileSystem.delete(outputPath); return outputPath; } private DQStats getQualityStats(Configuration conf, Path outputDir) throws IOException { int goodRecords = 0; int badRecords = 0; FileSystem destFs = outputDir.getFileSystem(getConf()); FSDataInputStream in = destFs.open(outputDir); OutputStream out = new OutputStream() { private StringBuilder string = new StringBuilder(); @Override public void write(int x) throws IOException { this.string.append((char) x); } @Override public String toString() { return this.string.toString(); } }; IOUtils.copyBytes(in, out, conf, false); String raw = out.toString(); for (String str : raw.split("\n")) { String[] tokens = str.toString().split("\t"); if (tokens[0].toString().equals(DQConstants.GOOD_RECORDS_FILE.trim())) { goodRecords = Integer.parseInt(tokens[1]); } else if (tokens[0].toString().equals(DQConstants.BAD_RECORDS_FILE.trim())) { badRecords = Integer.parseInt(tokens[1]); } } DQStats dqStats = new DQStats(); dqStats.setGoodPercent((goodRecords * 100.0F) / (goodRecords + badRecords)); dqStats.setNumBad(badRecords); dqStats.setNumGood(goodRecords); return dqStats; } private void logResult(DQStats dqStats, String processId, Long instanceRef) { ProcessLog processLog = new ProcessLog(); ProcessLogInfo processLogInfo = new ProcessLogInfo(); processLogInfo.setProcessId(new Integer(processId)); processLogInfo.setAddTs(new Date()); processLogInfo.setInstanceRef(instanceRef); processLogInfo.setLogCategory("dq"); //Logging num good records processLogInfo.setMessageId("good.records.count"); processLogInfo.setMessage(dqStats.getNumGood() + ""); LOGGER.debug("Process id is :" + processLogInfo.getProcessId() + " TS is " + processLogInfo.getAddTs() + "instance ref" + processLogInfo.getInstanceRef() + "" + "message" + processLogInfo.getMessage()); processLog.log(processLogInfo); //Logging num bad records processLogInfo.setMessageId("rejected.records.count"); processLogInfo.setMessage(dqStats.getNumBad() + ""); processLog.log(processLogInfo); //Logging percent processLogInfo.setMessageId("good.records.percent"); processLogInfo.setMessage(dqStats.getGoodPercent() + "%"); processLog.log(processLogInfo); //Logging threshold processLogInfo.setMessageId("good.records.threshold"); processLogInfo.setMessage(dqStats.getThreshold() + "%"); processLog.log(processLogInfo); //Logging status processLogInfo.setMessageId("did.dq.pass"); processLogInfo.setMessage((dqStats.getGoodPercent() >= dqStats.getThreshold()) + ""); processLog.log(processLogInfo); } }