List of usage examples for org.apache.hadoop.mapreduce.Job.getInstance
@Deprecated public static Job getInstance(Cluster ignored) throws IOException
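Note that the prototype above is the deprecated Cluster overload; the examples on this page use the non-deprecated Job.getInstance() and Job.getInstance(Configuration conf) overloads, which return a new Job backed by a copy of the supplied Configuration. Below is a minimal, self-contained sketch of the typical driver pattern; the WordCountDriver, WordCountMapper, and WordCountReducer names and the argument layout are illustrative placeholders, not taken from any example on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {  // hypothetical driver class
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);              // non-deprecated overload used throughout this page
        job.setJobName("word-count");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(WordCountMapper.class);    // hypothetical Mapper<LongWritable, Text, Text, IntWritable>
        job.setCombinerClass(WordCountReducer.class); // hypothetical Reducer<Text, IntWritable, Text, IntWritable>
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because getInstance copies the supplied Configuration, properties set on conf after the call are not visible to the job; set job-specific properties through job.getConfiguration() instead.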
From source file: com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormatTest.java
License: Apache License
/**
 * Verify listing files before a maximum age.
 */
@Test
public void listStatusMaxFileAge() throws IOException {
    // Create temp files with different modification times
    final File file1 = tempFolder.newFile("file1");
    Assert.assertTrue(file1.setLastModified(currentTimeMillis));
    final File file2 = tempFolder.newFile("file2");
    Assert.assertTrue(file2.setLastModified(currentTimeMillis - 2000));
    final File file3 = tempFolder.newFile("file3");
    Assert.assertTrue(file3.setLastModified(currentTimeMillis - 1000));

    // Test listing files with high water mark
    final Job job = Job.getInstance(new Configuration(false));
    HighWaterMarkInputFormat.setInputPaths(job, tempFolder.getRoot().getAbsolutePath());
    HighWaterMarkInputFormat.setMaxFileAge(job, 1000);

    final HighWaterMarkInputFormat inputFormat = new MockHighWaterMarkInputFormat();
    final List<FileStatus> files = inputFormat.listStatus(job);
    Collections.sort(files, new FileStatusComparator());
    Assert.assertEquals(new Path(file1.toURI()), files.get(0).getPath());
    Assert.assertEquals(new Path(file3.toURI()), files.get(1).getPath());
    Assert.assertEquals(2, files.size());
}
From source file: com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormatTest.java
License: Apache License
/**
 * Verify listing files after a minimum age.
 */
@Test
public void listStatusMinFileAge() throws IOException {
    // Create temp files with different modification times
    final File file1 = tempFolder.newFile("file1");
    Assert.assertTrue(file1.setLastModified(currentTimeMillis));
    final File file2 = tempFolder.newFile("file2");
    Assert.assertTrue(file2.setLastModified(currentTimeMillis - 2000));
    final File file3 = tempFolder.newFile("file3");
    Assert.assertTrue(file3.setLastModified(currentTimeMillis - 1000));

    // Test listing files with high water mark
    final Job job = Job.getInstance(new Configuration(false));
    HighWaterMarkInputFormat.setInputPaths(job, tempFolder.getRoot().getAbsolutePath());
    HighWaterMarkInputFormat.setMinFileAge(job, 1000);

    final HighWaterMarkInputFormat inputFormat = new MockHighWaterMarkInputFormat();
    final List<FileStatus> files = inputFormat.listStatus(job);
    Collections.sort(files, new FileStatusComparator());
    Assert.assertEquals(new Path(file2.toURI()), files.get(0).getPath());
    Assert.assertEquals(new Path(file3.toURI()), files.get(1).getPath());
    Assert.assertEquals(2, files.size());
}
From source file: com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormatTest.java
License: Apache License
@Test(expected = IOException.class)
public void listStatusMinAfterMax() throws IOException {
    final Job job = Job.getInstance(new Configuration(false));
    HighWaterMarkInputFormat.setMaxFileAge(job, 0);
    HighWaterMarkInputFormat.setMinFileAge(job, 1);

    final HighWaterMarkInputFormat inputFormat = new MockHighWaterMarkInputFormat();
    inputFormat.listStatus(job);
}
From source file: com.thinkbiganalytics.kylo.catalog.spark.sources.spark.SparkDataSetContext.java
License: Apache License
/**
 * Resolves the specified URIs by removing files that have been previously read.
 *
 * @throws KyloCatalogException if a data set option is invalid
 * @throws IOException if an I/O error occurs
 */
@Nonnull
@SuppressWarnings({ "squid:HiddenFieldCheck", "squid:S1192" })
private List<String> resolveHighWaterMarkPaths(@Nonnull final List<String> uris) throws IOException {
    // Get configuration
    final Configuration conf = delegate.getHadoopConfiguration(client);
    final String highWaterMarkName = SparkUtil.getOrElse(getOption(HighWaterMarkInputFormat.HIGH_WATER_MARK),
            SparkUtil.getOrElse(getOption(HIGH_WATER_MARK_OPTION), null));
    final Job job = Job.getInstance(conf);

    final String highWaterMarkValue = client.getHighWaterMarks().get(highWaterMarkName);
    if (highWaterMarkValue != null) {
        try {
            HighWaterMarkInputFormat.setHighWaterMark(job, Long.parseLong(highWaterMarkValue));
        } catch (final NumberFormatException e) {
            throw new KyloCatalogException("Invalid " + HIGH_WATER_MARK_OPTION + " value: " + highWaterMarkValue, e);
        }
    }

    final String maxFileAge = SparkUtil.getOrElse(getOption(HighWaterMarkInputFormat.MAX_FILE_AGE),
            SparkUtil.getOrElse(getOption(MAX_AGE_OPTION), null));
    if (maxFileAge != null) {
        try {
            HighWaterMarkInputFormat.setMaxFileAge(job, Long.parseLong(maxFileAge));
        } catch (final NumberFormatException e) {
            throw new KyloCatalogException("Invalid " + MAX_AGE_OPTION + " value: " + maxFileAge, e);
        }
    }

    final String minFileAge = SparkUtil.getOrElse(getOption(HighWaterMarkInputFormat.MIN_FILE_AGE),
            SparkUtil.getOrElse(getOption(MIN_AGE_OPTION), null));
    if (minFileAge != null) {
        try {
            HighWaterMarkInputFormat.setMinFileAge(job, Long.parseLong(minFileAge));
        } catch (final NumberFormatException e) {
            throw new KyloCatalogException("Invalid " + MIN_AGE_OPTION + " value: " + minFileAge, e);
        }
    }

    // Convert URIs to Paths
    final Path[] paths = new Path[uris.size()];
    for (int i = 0; i < uris.size(); ++i) {
        final Path path = new Path(uris.get(i));
        final FileSystem fs = path.getFileSystem(conf);
        paths[i] = path.makeQualified(fs.getUri(), fs.getWorkingDirectory());
    }
    HighWaterMarkInputFormat.setInputPaths(job, paths);

    // Get high water mark paths
    final HighWaterMarkInputFormat inputFormat = new HighWaterMarkInputFormat();
    final List<FileStatus> files = inputFormat.listStatus(job);
    client.setHighWaterMarks(
            Collections.singletonMap(highWaterMarkName, Long.toString(inputFormat.getLastHighWaterMark())));

    // Return resolved paths
    final List<String> resolvedPaths = new ArrayList<>(files.size());
    if (files.isEmpty()) {
        resolvedPaths.add("file:/dev/null");
    } else {
        for (final FileStatus file : files) {
            resolvedPaths.add(file.getPath().toString());
        }
    }
    return resolvedPaths;
}
From source file: com.trexinhca.TrexinHCATest.java
License: Apache License
public static void main(String[] args) throws Exception {
    ks = KieServices.Factory.get();
    kContainer = ks.getKieClasspathContainer();
    ksession = TrexinHCATest.kContainer.newKieSession("MapReduceKS");

    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: TrexinHCATest <in> [<in>...] <out>");
        System.exit(2);
    }

    Job job = Job.getInstance(conf);
    job.setJobName("HCATest");
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(TrexinHCAReducer.class);
    job.setReducerClass(TrexinHCAReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    job.setJarByClass(TrexinHCATest.class);
    job.waitForCompletion(true);
}
From source file: com.uber.hoodie.hadoop.HoodieInputFormat.java
License: Apache License
@Override
public FileStatus[] listStatus(JobConf job) throws IOException {
    // Get all the file status from FileInputFormat and then do the filter
    FileStatus[] fileStatuses = super.listStatus(job);
    Map<HoodieTableMetaClient, List<FileStatus>> groupedFileStatus = groupFileStatus(fileStatuses);
    LOG.info("Found a total of " + groupedFileStatus.size() + " groups");

    List<FileStatus> returns = new ArrayList<>();
    for (Map.Entry<HoodieTableMetaClient, List<FileStatus>> entry : groupedFileStatus.entrySet()) {
        HoodieTableMetaClient metadata = entry.getKey();
        if (metadata == null) {
            // Add all the paths which are not hoodie specific
            returns.addAll(entry.getValue());
            continue;
        }

        FileStatus[] statuses = entry.getValue().toArray(new FileStatus[entry.getValue().size()]);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Hoodie Metadata initialized with completed commit Ts as :" + metadata);
        }
        String tableName = metadata.getTableConfig().getTableName();
        String mode = HoodieHiveUtil.readMode(Job.getInstance(job), tableName);

        // Get all commits, delta commits, compactions, as all of them produce a base parquet file today
        HoodieTimeline timeline = metadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
        TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metadata, timeline, statuses);

        if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) {
            // this is of the form commitTs_partition_sequenceNumber
            String lastIncrementalTs = HoodieHiveUtil.readStartCommitTime(Job.getInstance(job), tableName);
            // Total number of commits to return in this batch. Set this to -1 to get all the commits.
            Integer maxCommits = HoodieHiveUtil.readMaxCommits(Job.getInstance(job), tableName);
            LOG.info("Last Incremental timestamp was set as " + lastIncrementalTs);
            List<String> commitsToReturn = timeline.findInstantsAfter(lastIncrementalTs, maxCommits)
                    .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
            List<HoodieDataFile> filteredFiles = roView.getLatestDataFilesInRange(commitsToReturn)
                    .collect(Collectors.toList());
            for (HoodieDataFile filteredFile : filteredFiles) {
                LOG.info("Processing incremental hoodie file - " + filteredFile.getPath());
                filteredFile = checkFileStatus(filteredFile);
                returns.add(filteredFile.getFileStatus());
            }
            LOG.info("Total paths to process after hoodie incremental filter " + filteredFiles.size());
        } else {
            // Filter files on the latest commit found
            List<HoodieDataFile> filteredFiles = roView.getLatestDataFiles().collect(Collectors.toList());
            LOG.info("Total paths to process after hoodie filter " + filteredFiles.size());
            for (HoodieDataFile filteredFile : filteredFiles) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Processing latest hoodie file - " + filteredFile.getPath());
                }
                filteredFile = checkFileStatus(filteredFile);
                returns.add(filteredFile.getFileStatus());
            }
        }
    }
    return returns.toArray(new FileStatus[returns.size()]);
}
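HoodieInputFormat is written against the old mapred API and therefore receives a JobConf; because JobConf extends Configuration, each Job.getInstance(job) call above wraps a copy of that configuration in a new-API Job so the HoodieHiveUtil helpers can read properties through the mapreduce API. A minimal sketch of the same wrapping follows; the property name is hypothetical, and the fragment is assumed to run in a context that may throw IOException.

JobConf jobConf = new JobConf();                        // old (mapred) API configuration
jobConf.set("hoodie.example.scan.mode", "INCREMENTAL"); // hypothetical property
Job wrapped = Job.getInstance(jobConf);                 // JobConf extends Configuration; the settings are copied into a new-API Job
String mode = wrapped.getConfiguration().get("hoodie.example.scan.mode");

Each call to Job.getInstance(job) produces a fresh copy of the configuration, which is why the example can call it several times without side effects on the original JobConf.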
From source file: com.uber.hoodie.utilities.HDFSParquetImporter.java
License: Apache License
protected JavaRDD<HoodieRecord<HoodieRecordPayload>> buildHoodieRecordsForImport(JavaSparkContext jsc,
        String schemaStr) throws IOException {
    Job job = Job.getInstance(jsc.hadoopConfiguration());
    // Allow recursive directories to be found
    job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
    // To parallelize reading file status.
    job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024");
    AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr)));
    ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class));

    return jsc
            .newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class,
                    job.getConfiguration())
            // Coalesce to reduce the large number of tasks.
            .coalesce(16 * cfg.parallelism)
            .map(entry -> {
                GenericRecord genericRecord = ((Tuple2<Void, GenericRecord>) entry)._2();
                Object partitionField = genericRecord.get(cfg.partitionKey);
                if (partitionField == null) {
                    throw new HoodieIOException("partition key is missing. :" + cfg.partitionKey);
                }
                Object rowField = genericRecord.get(cfg.rowKey);
                if (rowField == null) {
                    throw new HoodieIOException("row field is missing. :" + cfg.rowKey);
                }
                String partitionPath = partitionField.toString();
                logger.info("Row Key : " + rowField + ", Partition Path is (" + partitionPath + ")");
                if (partitionField instanceof Number) {
                    try {
                        long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L);
                        partitionPath = PARTITION_FORMATTER.format(new Date(ts));
                    } catch (NumberFormatException nfe) {
                        logger.warn("Unable to parse date from partition field. Assuming partition as ("
                                + partitionField + ")");
                    }
                }
                return new HoodieRecord<>(new HoodieKey((String) rowField, partitionPath),
                        new HoodieJsonPayload(genericRecord.toString()));
            });
}
From source file: com.wipro.ats.bdre.datagen.mr.Driver.java
License: Apache License
/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = getConf();
    GetGeneralConfig generalConfig = new GetGeneralConfig();
    GeneralConfig gc = generalConfig.byConigGroupAndKey("imconfig", "common.default-fs-name");
    conf.set("fs.defaultFS", gc.getDefaultVal());

    String processId = args[0];
    Path outputDir = new Path(ResolvePath.replaceVars(args[1]));
    Properties dataProps = Config.getDataProperties(processId);
    Properties tableProps = Config.getTableProperties(processId);
    TableUtil tableUtil = new TableUtil();
    Table table = tableUtil.formTableFromConfig(processId);
    FileSystem fs = FileSystem.get(conf);
    LOGGER.info("Default FS =" + conf.get("fs.defaultFS"));

    // Set in the conf for mappers to use
    conf.set(Config.SEPARATOR_KEY, tableProps.getProperty("separator"));
    conf.set(Config.PID_KEY, processId);
    conf.setLong(Config.NUM_ROWS_KEY, Long.parseLong(dataProps.getProperty("numRows")));
    conf.setInt(Config.NUM_SPLITS_KEY, Integer.parseInt(dataProps.getProperty("numSplits")));

    Job job = Job.getInstance(conf);
    Path mrOutputPath = new Path(outputDir.toString() + "/MROUT/" + table.getTableName());
    FileOutputFormat.setOutputPath(job, mrOutputPath);
    job.setJobName("Datagen-" + table.getTableName());
    job.setJarByClass(Driver.class);
    job.setMapperClass(RecordGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.waitForCompletion(true);

    // Merge and create a single file
    Path srcDir = mrOutputPath;
    Path destFile = new Path(outputDir.toString() + "/" + table.getTableName());
    FileUtil.copyMerge(fs, srcDir, fs, destFile, true, conf, "");

    // Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash("0");
    registerFileInfo.setFileSize(0L);
    registerFileInfo.setPath(destFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}
From source file: com.wipro.ats.bdre.dq.DQDriver.java
License: Apache License
@Override
public int run(String[] arg) throws Exception {
    String processId = arg[0];
    String sPath = arg[1];
    String destDir = arg[2];

    Properties props = new GetProperties().getProperties(processId, "dq");
    LOGGER.debug("props=" + props);
    Configuration conf = getConf();
    conf.set("dq.process.id", processId);

    Job job = Job.getInstance(conf);
    job.setJobName("Data Quality " + processId);
    job.setJarByClass(DQDriver.class);
    job.setMapperClass(DQMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    // Reducer is not required
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    Path inputFilePath = new Path(sPath);
    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, removeIfExistAndSetOutputPath(conf, destDir));
    MultipleOutputs.addNamedOutput(job, DQConstants.GOOD_RECORDS_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.BAD_RECORDS_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.FILE_REPORT_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);

    if (!job.waitForCompletion(true)) {
        return 1;
    }

    Path outputDir = new Path(destDir);
    FileSystem srcFs = outputDir.getFileSystem(getConf());
    FileSystem destFs = outputDir.getFileSystem(getConf());

    // Valid records
    Path goodFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_GOOD_RECORD_OUTPUT_DIR);
    // Input and quality-filtered file should have the same name (but a different path)
    Path goodDestFile = new Path(destDir + "/" + inputFilePath.getName());
    if (srcFs.exists(goodFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, goodFilesSrcDir, destFs, goodDestFile, true, conf, "");
    }

    // Invalid records
    Path badFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_BAD_RECORD_OUTPUT_DIR);
    Path badDestFile = new Path(destDir + "/" + DQConstants.BAD_RECORDS_FILE);
    if (srcFs.exists(badFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, badFilesSrcDir, destFs, badDestFile, true, conf, "");
    }

    // Prepare the report aggregation job
    Job fileReportAggregationJob = Job.getInstance(conf);
    fileReportAggregationJob.setJobName("File Report Computing " + processId);
    fileReportAggregationJob.setJarByClass(DQMain.class);
    fileReportAggregationJob.setMapperClass(DQFileReportMapper.class);
    fileReportAggregationJob.setMapOutputKeyClass(Text.class);
    fileReportAggregationJob.setMapOutputValueClass(IntWritable.class);
    fileReportAggregationJob.setReducerClass(DQFileReportReducer.class);
    fileReportAggregationJob.setOutputKeyClass(Text.class);
    fileReportAggregationJob.setOutputValueClass(Text.class);
    fileReportAggregationJob.setNumReduceTasks(1);

    Path fileReportDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_REPORT_OUTPUT_DIR);
    Path fileReportOutputDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);
    FileInputFormat.addInputPath(fileReportAggregationJob, fileReportDir);
    FileOutputFormat.setOutputPath(fileReportAggregationJob, fileReportOutputDir);
    if (!fileReportAggregationJob.waitForCompletion(true)) {
        return 1;
    }

    // Merge report records
    Path reportsSrcDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);
    Path reportsDestFile = new Path(destDir + "/" + DQConstants.FILE_REPORT_FILE);
    FileUtil.copyMerge(srcFs, reportsSrcDir, destFs, reportsDestFile, true, conf, "");

    // Read the report file from HDFS and report the percentage
    Path reportDestFile = new Path(outputDir.toString() + "/" + DQConstants.FILE_REPORT_FILE);
    DQStats dqStats = getQualityStats(getConf(), reportDestFile);
    LOGGER.info("Percentage of good records :" + dqStats.getGoodPercent());
    props = new GetProperties().getProperties(processId, "dq");
    String strThreshold = props.getProperty("min.pass.threshold.percent");
    float threshold = Float.parseFloat(strThreshold);
    dqStats.setThreshold(threshold);

    // Update the result in metadata
    logResult(dqStats, processId, 0L);
    if (dqStats.getGoodPercent() < threshold) {
        LOGGER.error("DQ check did not pass");
        throw new DQValidationException(dqStats);
    }
    LOGGER.info(dqStats);

    FileChecksum hdfsChecksum = destFs.getFileChecksum(goodDestFile);
    String fileHash = hdfsChecksum == null ? "0" : hdfsChecksum.toString();

    // Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash(fileHash);
    registerFileInfo.setFileSize(destFs.getFileStatus(goodDestFile).getLen());
    registerFileInfo.setPath(goodDestFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}
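As the DQDriver example shows, it is common to build several jobs from the same Configuration. Since Job.getInstance(Configuration) snapshots the passed-in configuration, each Job gets an independent copy. A minimal sketch (the property key is hypothetical):

Configuration conf = new Configuration();
Job mainJob = Job.getInstance(conf);
Job reportJob = Job.getInstance(conf);
mainJob.getConfiguration().set("example.flag", "true");  // hypothetical key, visible only to mainJob
// reportJob is unaffected: getInstance copied conf before the property was set
assert reportJob.getConfiguration().get("example.flag") == null;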
From source file: com.xoriant.kafkaProducer.MyConsumer.java
License: Apache License
public static void main(String[] args) throws IOException {
    // System.setProperty("spark.executor.memory", "8g");
    System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

    // Create the context with a 500 ms batch size
    SparkConf sparkConf = new SparkConf();
    // final Configuration config = new Configuration();
    Configuration hadoopConfig = new Configuration();
    hadoopConfig.set("mapreduce.output.textoutputformat.separator", ",");
    sparkConf.setMaster("local[2]");
    sparkConf.setAppName("Insurance");
    JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(javaSparkContext, new Duration(500));

    int numThreads = Integer.parseInt(args[3]);
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    String[] topics = args[2].split(",");
    for (String topic : topics) {
        topicMap.put(topic, numThreads);
    }

    // 3. Create connection with HBase
    Configuration config = null;
    try {
        config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "192.168.1.114");
        config.set("hbase.zookeeper.property.clientPort", "2181");
        // config.set("mapreduce.job.output.key.class", Text.class.getName());
        // config.set("mapreduce.job.output.value.class", IntWritable.class.getName());
        // config.set("mapreduce.outputformat.class", TableOutputFormat.class.getName());
        // config.set("hbase.master", "127.0.0.1:60000");
        HBaseAdmin.checkHBaseAvailable(config);
        System.out.println("HBase is running!");
    } catch (MasterNotRunningException e) {
        System.out.println("HBase is not running!");
        System.exit(1);
    } catch (Exception ce) {
        System.out.println("here.....");
        ce.printStackTrace();
    }
    // config.set(TableInputFormat.INPUT_TABLE, rawTableName);

    // 4. New Hadoop API configuration
    final Job newAPIJobConfigurationState = Job.getInstance(config);
    newAPIJobConfigurationState.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, stateTable);
    newAPIJobConfigurationState.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);

    final Job newAPIJobConfigurationUser = Job.getInstance(config);
    newAPIJobConfigurationUser.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "user_total_stream");
    newAPIJobConfigurationUser.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);

    final Job paymentHistoryConfig = Job.getInstance(config);
    paymentHistoryConfig.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "payment_history_stream");
    paymentHistoryConfig.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);

    /*
     * Set<String> topics = new HashSet<String>();
     * topics.add("test");
     *
     * Map<String, String> kafkaParams = new HashMap<String, String>();
     * kafkaParams.put("metadata.broker.list", "10.20.0.199:9092");
     */
    /*
     * JavaPairInputDStream<String, String> stream = KafkaUtils
     *         .createDirectStream(javaStreamingContext, String.class, String.class,
     *                 StringDecoder.class, StringDecoder.class, kafkaParams, topics);
     */
    JavaPairReceiverInputDStream<String, String> stream = KafkaUtils.createStream(javaStreamingContext, args[0],
            args[1], topicMap);
    System.out.println("Got my DStream! connecting to zookeeper " + args[0] + " group " + args[1] + " topics"
            + topicMap);
    stream.count().print();

    JavaDStream<Tuple11<String, String, String, String, String, String, String, String, String, String, String>> records = stream
            .map(new Function<Tuple2<String, String>, Tuple11<String, String, String, String, String, String, String, String, String, String, String>>() {

                private static final long serialVersionUID = 1L;

                public Tuple11<String, String, String, String, String, String, String, String, String, String, String> call(
                        Tuple2<String, String> defaultKeyAndRecords) throws Exception {
                    String[] fields = defaultKeyAndRecords._2().split(",");
                    return new Tuple11<String, String, String, String, String, String, String, String, String, String, String>(
                            fields[0], fields[1], fields[2], fields[3], fields[4], fields[5], fields[6],
                            fields[7], fields[8], fields[9], fields[10]);
                }
            });

    records.foreachRDD(
            new Function<JavaRDD<Tuple11<String, String, String, String, String, String, String, String, String, String, String>>, Void>() {

                private static final long serialVersionUID = -3333697808496161495L;

                public Void call(
                        JavaRDD<Tuple11<String, String, String, String, String, String, String, String, String, String, String>> rdd)
                        throws Exception {
                    saveToHBasePaymentHistory(rdd, paymentHistoryConfig.getConfiguration());
                    return null;
                }
            });

    JavaPairDStream<String, String> window = records.mapToPair(
            new PairFunction<Tuple11<String, String, String, String, String, String, String, String, String, String, String>, String, String>() {

                private static final long serialVersionUID = -8849699432349098738L;

                public Tuple2<String, String> call(
                        Tuple11<String, String, String, String, String, String, String, String, String, String, String> arg0)
                        throws Exception {
                    String str = arg0._2() + "," + arg0._3() + "," + arg0._4() + "," + arg0._5() + ","
                            + arg0._6() + "," + arg0._7() + "," + arg0._8() + "," + arg0._9() + ","
                            + arg0._10() + "," + arg0._11();
                    return new Tuple2<String, String>(arg0._1(), str);
                }
            }).window(new Duration(60000), new Duration(60000));

    window.saveAsNewAPIHadoopFiles("hdfs://192.168.1.114/user/hadoop/StreamingData/Insurancedata", "",
            Text.class, Text.class, TextOutputFormat.class, hadoopConfig);

    JavaPairDStream<String, Integer> recordsMapState = records.mapToPair(
            new PairFunction<Tuple11<String, String, String, String, String, String, String, String, String, String, String>, String, Integer>() {

                private static final long serialVersionUID = 1L;

                public Tuple2<String, Integer> call(
                        Tuple11<String, String, String, String, String, String, String, String, String, String, String> arg0)
                        throws Exception {
                    String key = arg0._10();
                    Integer value = new Integer(arg0._7());
                    return new Tuple2<String, Integer>(key, value);
                }
            });

    JavaPairDStream<String, Integer> recordsMapUser = records.mapToPair(
            new PairFunction<Tuple11<String, String, String, String, String, String, String, String, String, String, String>, String, Integer>() {

                private static final long serialVersionUID = 1L;

                public Tuple2<String, Integer> call(
                        Tuple11<String, String, String, String, String, String, String, String, String, String, String> arg0)
                        throws Exception {
                    String key = arg0._1();
                    Integer value = new Integer(arg0._7());
                    return new Tuple2<String, Integer>(key, value);
                }
            });

    JavaPairDStream<String, Integer> reduceByKeyAndWindowState = recordsMapState
            .reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {

                private static final long serialVersionUID = 197675516004789269L;

                public Integer call(Integer val1, Integer val2) throws Exception {
                    return val1 + val2;
                }
            }, new Duration(86400000), new Duration(10000));

    JavaPairDStream<String, Integer> reduceByKeyAndWindowUser = recordsMapUser
            .reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {

                private static final long serialVersionUID = 197675516004789269L;

                public Integer call(Integer val1, Integer val2) throws Exception {
                    return val1 + val2;
                }
            }, new Duration(86400000), new Duration(60000));

    // reduce.count();
    reduceByKeyAndWindowState.print();

    reduceByKeyAndWindowState.foreachRDD(new Function<JavaPairRDD<String, Integer>, Void>() {

        private static final long serialVersionUID = 8534726505385048702L;

        public Void call(JavaPairRDD<String, Integer> rdd) throws Exception {
            saveToHBase(rdd, newAPIJobConfigurationState.getConfiguration());
            return null;
        }
    });

    reduceByKeyAndWindowUser.foreachRDD(new Function<JavaPairRDD<String, Integer>, Void>() {

        private static final long serialVersionUID = 8534726505385048702L;

        public Void call(JavaPairRDD<String, Integer> rdd) throws Exception {
            saveToHBase(rdd, newAPIJobConfigurationUser.getConfiguration());
            return null;
        }
    });

    javaStreamingContext.start();
    javaStreamingContext.awaitTermination();
}
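In this last example, the three Job instances act purely as holders for HBase TableOutputFormat settings, and the (unshown) saveToHBase helper presumably passes job.getConfiguration() to a Spark write such as JavaPairRDD.saveAsNewAPIHadoopDataset. A minimal sketch of that final step, under the assumption that each (key, count) pair becomes one HBase Put; the "d" column family and "total" qualifier are hypothetical, and this is not code from MyConsumer:

// Inside a hypothetical saveToHBase(JavaPairRDD<String, Integer> rdd, Configuration conf):
JavaPairRDD<ImmutableBytesWritable, Put> hbasePuts = rdd.mapToPair(tuple -> {
    Put put = new Put(Bytes.toBytes(tuple._1()));
    // "d" / "total" are a hypothetical column family and qualifier
    put.addColumn(Bytes.toBytes("d"), Bytes.toBytes("total"), Bytes.toBytes(String.valueOf(tuple._2())));
    return new Tuple2<>(new ImmutableBytesWritable(), put);
});
// conf carries TableOutputFormat.OUTPUT_TABLE and the output format class set on the Job above
hbasePuts.saveAsNewAPIHadoopDataset(conf);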