Example usage for org.apache.hadoop.conf Configuration set

List of usage examples for org.apache.hadoop.conf Configuration set

Introduction

On this page you can find usage examples for the set method of org.apache.hadoop.conf.Configuration.

Prototype

public void set(String name, String value) 

Document

Set the value of the name property.
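
For reference, here is a minimal, self-contained sketch of the call. It is not taken from any of the sources below; it assumes only hadoop-common on the classpath and uses illustrative property names:

import org.apache.hadoop.conf.Configuration;

public class ConfigurationSetExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // set stores the property value as a plain string
        conf.set("fs.defaultFS", "hdfs://localhost:8020");

        // typed convenience setters (setBoolean, setInt, ...) also store strings internally
        conf.setBoolean("mapreduce.map.speculative", false);
        conf.setInt("mapreduce.job.reduces", 4);

        // get returns the stored value; for unknown keys it returns null or the supplied default
        System.out.println(conf.get("fs.defaultFS"));
        System.out.println(conf.get("some.unknown.key", "fallback"));
    }
}

The usage examples below follow the same pattern: build or obtain a Configuration, call set for each string-valued property, then hand the configuration to a job, file system, or input format.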

Usage

From source file: co.cask.cdap.test.ConfigurableTestBase.java

License: Apache License

private static void initialize(@Nullable Map<String, String> additionalConfiguration) throws Exception {
    if (startCount++ > 0) {
        return;
    }
    File localDataDir = tmpFolder.newFolder();

    cConf = createCConf(localDataDir, additionalConfiguration);

    org.apache.hadoop.conf.Configuration hConf = new org.apache.hadoop.conf.Configuration();
    hConf.addResource("mapred-site-local.xml");
    hConf.reloadConfiguration();
    hConf.set(Constants.CFG_LOCAL_DATA_DIR, localDataDir.getAbsolutePath());
    hConf.set(Constants.AppFabric.OUTPUT_DIR, cConf.get(Constants.AppFabric.OUTPUT_DIR));
    hConf.set("hadoop.tmp.dir",
            new File(localDataDir, cConf.get(Constants.AppFabric.TEMP_DIR)).getAbsolutePath());

    // Windows specific requirements
    if (OSDetector.isWindows()) {
        File tmpDir = tmpFolder.newFolder();
        File binDir = new File(tmpDir, "bin");
        Assert.assertTrue(binDir.mkdirs());

        copyTempFile("hadoop.dll", tmpDir);
        copyTempFile("winutils.exe", binDir);
        System.setProperty("hadoop.home.dir", tmpDir.getAbsolutePath());
        System.load(new File(tmpDir, "hadoop.dll").getAbsolutePath());
    }

    Injector injector = Guice.createInjector(createDataFabricModule(),
            new DataSetsModules().getStandaloneModules(), new DataSetServiceModules().getInMemoryModules(),
            new ConfigModule(cConf, hConf), new IOModule(), new LocationRuntimeModule().getInMemoryModules(),
            new DiscoveryRuntimeModule().getInMemoryModules(),
            new AppFabricServiceRuntimeModule().getInMemoryModules(),
            new ServiceStoreModules().getInMemoryModules(),
            new InMemoryProgramRunnerModule(LocalStreamWriter.class), new AbstractModule() {
                @Override
                protected void configure() {
                    bind(StreamHandler.class).in(Scopes.SINGLETON);
                    bind(StreamFetchHandler.class).in(Scopes.SINGLETON);
                    bind(AbstractNamespaceClient.class).to(LocalNamespaceClient.class).in(Scopes.SINGLETON);
                    bind(StreamFileJanitorService.class).to(LocalStreamFileJanitorService.class)
                            .in(Scopes.SINGLETON);
                    bind(StreamWriterSizeCollector.class).to(BasicStreamWriterSizeCollector.class)
                            .in(Scopes.SINGLETON);
                    bind(StreamCoordinatorClient.class).to(InMemoryStreamCoordinatorClient.class)
                            .in(Scopes.SINGLETON);
                }
            },
            // todo: do we need handler?
            new MetricsHandlerModule(), new MetricsClientRuntimeModule().getInMemoryModules(),
            new LoggingModules().getInMemoryModules(), new ExploreRuntimeModule().getInMemoryModules(),
            new ExploreClientModule(), new NotificationFeedServiceRuntimeModule().getInMemoryModules(),
            new NotificationServiceRuntimeModule().getInMemoryModules(), new AbstractModule() {
                @Override
                @SuppressWarnings("deprecation")
                protected void configure() {
                    install(new FactoryModuleBuilder()
                            .implement(ApplicationManager.class, DefaultApplicationManager.class)
                            .build(ApplicationManagerFactory.class));
                    install(new FactoryModuleBuilder().implement(StreamWriter.class, DefaultStreamWriter.class)
                            .build(StreamWriterFactory.class));
                    install(new FactoryModuleBuilder()
                            .implement(StreamManager.class, DefaultStreamManager.class)
                            .build(StreamManagerFactory.class));
                    bind(TemporaryFolder.class).toInstance(tmpFolder);
                }
            });

    txService = injector.getInstance(TransactionManager.class);
    txService.startAndWait();
    dsOpService = injector.getInstance(DatasetOpExecutor.class);
    dsOpService.startAndWait();
    datasetService = injector.getInstance(DatasetService.class);
    datasetService.startAndWait();
    metricsQueryService = injector.getInstance(MetricsQueryService.class);
    metricsQueryService.startAndWait();
    metricsCollectionService = injector.getInstance(MetricsCollectionService.class);
    metricsCollectionService.startAndWait();
    schedulerService = injector.getInstance(SchedulerService.class);
    schedulerService.startAndWait();
    if (cConf.getBoolean(Constants.Explore.EXPLORE_ENABLED)) {
        exploreExecutorService = injector.getInstance(ExploreExecutorService.class);
        exploreExecutorService.startAndWait();
        exploreClient = injector.getInstance(ExploreClient.class);
    }
    streamCoordinatorClient = injector.getInstance(StreamCoordinatorClient.class);
    streamCoordinatorClient.startAndWait();
    testManager = injector.getInstance(UnitTestManager.class);
    // we use MetricStore directly, until RuntimeStats API changes
    RuntimeStats.metricStore = injector.getInstance(MetricStore.class);
    namespaceAdmin = injector.getInstance(NamespaceAdmin.class);
    namespaceAdmin.createNamespace(Constants.DEFAULT_NAMESPACE_META);
}

From source file: co.cask.cdap.test.TestBase.java

License: Apache License

@BeforeClass
public static void initialize() throws Exception {
    if (startCount++ > 0) {
        return;
    }
    File localDataDir = TMP_FOLDER.newFolder();

    cConf = createCConf(localDataDir);

    org.apache.hadoop.conf.Configuration hConf = new org.apache.hadoop.conf.Configuration();
    hConf.addResource("mapred-site-local.xml");
    hConf.reloadConfiguration();
    hConf.set(Constants.CFG_LOCAL_DATA_DIR, localDataDir.getAbsolutePath());
    hConf.set(Constants.AppFabric.OUTPUT_DIR, cConf.get(Constants.AppFabric.OUTPUT_DIR));
    hConf.set("hadoop.tmp.dir",
            new File(localDataDir, cConf.get(Constants.AppFabric.TEMP_DIR)).getAbsolutePath());

    // Windows specific requirements
    if (OSDetector.isWindows()) {
        File tmpDir = TMP_FOLDER.newFolder();
        File binDir = new File(tmpDir, "bin");
        Assert.assertTrue(binDir.mkdirs());

        copyTempFile("hadoop.dll", tmpDir);
        copyTempFile("winutils.exe", binDir);
        System.setProperty("hadoop.home.dir", tmpDir.getAbsolutePath());
        System.load(new File(tmpDir, "hadoop.dll").getAbsolutePath());
    }

    Injector injector = Guice.createInjector(createDataFabricModule(), new TransactionExecutorModule(),
            new DataSetsModules().getStandaloneModules(), new DataSetServiceModules().getInMemoryModules(),
            new ConfigModule(cConf, hConf), new IOModule(), new LocationRuntimeModule().getInMemoryModules(),
            new DiscoveryRuntimeModule().getInMemoryModules(),
            new AppFabricServiceRuntimeModule().getInMemoryModules(),
            new ServiceStoreModules().getInMemoryModules(),
            new InMemoryProgramRunnerModule(LocalStreamWriter.class), new AbstractModule() {
                @Override
                protected void configure() {
                    bind(StreamHandler.class).in(Scopes.SINGLETON);
                    bind(StreamFetchHandler.class).in(Scopes.SINGLETON);
                    bind(StreamViewHttpHandler.class).in(Scopes.SINGLETON);
                    bind(StreamFileJanitorService.class).to(LocalStreamFileJanitorService.class)
                            .in(Scopes.SINGLETON);
                    bind(StreamWriterSizeCollector.class).to(BasicStreamWriterSizeCollector.class)
                            .in(Scopes.SINGLETON);
                    bind(StreamCoordinatorClient.class).to(InMemoryStreamCoordinatorClient.class)
                            .in(Scopes.SINGLETON);
                    bind(MetricsManager.class).toProvider(MetricsManagerProvider.class);
                }
            },
            // todo: do we need handler?
            new MetricsHandlerModule(), new MetricsClientRuntimeModule().getInMemoryModules(),
            new LoggingModules().getInMemoryModules(), new ExploreRuntimeModule().getInMemoryModules(),
            new ExploreClientModule(), new NotificationFeedServiceRuntimeModule().getInMemoryModules(),
            new NotificationServiceRuntimeModule().getInMemoryModules(),
            new NamespaceClientRuntimeModule().getStandaloneModules(),
            new NamespaceStoreModule().getStandaloneModules(), new AuthorizationModule(), new AbstractModule() {
                @Override
                @SuppressWarnings("deprecation")
                protected void configure() {
                    install(new FactoryModuleBuilder()
                            .implement(ApplicationManager.class, DefaultApplicationManager.class)
                            .build(ApplicationManagerFactory.class));
                    install(new FactoryModuleBuilder()
                            .implement(ArtifactManager.class, DefaultArtifactManager.class)
                            .build(ArtifactManagerFactory.class));
                    install(new FactoryModuleBuilder()
                            .implement(StreamManager.class, DefaultStreamManager.class)
                            .build(StreamManagerFactory.class));
                    bind(TemporaryFolder.class).toInstance(TMP_FOLDER);
                    bind(AuthorizationHandler.class).in(Scopes.SINGLETON);
                }
            });

    txService = injector.getInstance(TransactionManager.class);
    txService.startAndWait();
    dsOpService = injector.getInstance(DatasetOpExecutor.class);
    dsOpService.startAndWait();
    datasetService = injector.getInstance(DatasetService.class);
    datasetService.startAndWait();
    metricsQueryService = injector.getInstance(MetricsQueryService.class);
    metricsQueryService.startAndWait();
    metricsCollectionService = injector.getInstance(MetricsCollectionService.class);
    metricsCollectionService.startAndWait();
    schedulerService = injector.getInstance(SchedulerService.class);
    schedulerService.startAndWait();
    if (cConf.getBoolean(Constants.Explore.EXPLORE_ENABLED)) {
        exploreExecutorService = injector.getInstance(ExploreExecutorService.class);
        exploreExecutorService.startAndWait();
        exploreClient = injector.getInstance(ExploreClient.class);
    }
    streamCoordinatorClient = injector.getInstance(StreamCoordinatorClient.class);
    streamCoordinatorClient.startAndWait();
    testManager = injector.getInstance(UnitTestManager.class);
    metricsManager = injector.getInstance(MetricsManager.class);
    authorizerInstantiatorService = injector.getInstance(AuthorizerInstantiatorService.class);
    authorizerInstantiatorService.startAndWait();
    // This is needed so the logged-in user can successfully create the default namespace
    if (cConf.getBoolean(Constants.Security.Authorization.ENABLED)) {
        InstanceId instance = new InstanceId(cConf.get(Constants.INSTANCE_NAME));
        Principal principal = new Principal(SecurityRequestContext.getUserId(), Principal.PrincipalType.USER);
        authorizerInstantiatorService.get().grant(instance, principal, ImmutableSet.of(Action.ADMIN));
    }
    namespaceAdmin = injector.getInstance(NamespaceAdmin.class);
    namespaceAdmin.create(NamespaceMeta.DEFAULT);
}

From source file: co.cask.hydrator.plugin.batch.ETLMapReduceTestRun.java

License: Apache License

@Test
public void testS3toTPFS() throws Exception {
    String testPath = "s3n://test/";
    String testFile1 = "2015-06-17-00-00-00.txt";
    String testData1 = "Sample data for testing.";

    String testFile2 = "abc.txt";
    String testData2 = "Sample data for testing.";

    S3NInMemoryFileSystem fs = new S3NInMemoryFileSystem();
    Configuration conf = new Configuration();
    conf.set("fs.s3n.impl", S3NInMemoryFileSystem.class.getName());
    fs.initialize(URI.create("s3n://test/"), conf);
    fs.createNewFile(new Path(testPath));

    try (FSDataOutputStream fos1 = fs.create(new Path(testPath + testFile1))) {
        fos1.write(testData1.getBytes());
        fos1.flush();
    }

    try (FSDataOutputStream fos2 = fs.create(new Path(testPath + testFile2))) {
        fos2.write(testData2.getBytes());
        fos2.flush();
    }

    Method method = FileSystem.class.getDeclaredMethod("addFileSystemForTesting", URI.class,
            Configuration.class, FileSystem.class);
    method.setAccessible(true);
    method.invoke(FileSystem.class, URI.create("s3n://test/"), conf, fs);
    ETLStage source = new ETLStage("source", new ETLPlugin("S3", BatchSource.PLUGIN_TYPE,
            ImmutableMap.<String, String>builder().put(Constants.Reference.REFERENCE_NAME, "S3TestSource")
                    .put(Properties.S3.ACCESS_KEY, "key").put(Properties.S3.ACCESS_ID, "ID")
                    .put(Properties.S3.PATH, testPath).put(Properties.S3.FILE_REGEX, "abc.*").build(),
            null));
    ETLStage sink = new ETLStage("sink",
            new ETLPlugin("TPFSAvro", BatchSink.PLUGIN_TYPE,
                    ImmutableMap.of(Properties.TimePartitionedFileSetDataset.SCHEMA,
                            FileBatchSource.DEFAULT_SCHEMA.toString(),
                            Properties.TimePartitionedFileSetDataset.TPFS_NAME, "TPFSsink"),
                    null));
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *").addStage(source).addStage(sink)
            .addConnection(source.getName(), sink.getName()).build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(ETLBATCH_ARTIFACT, etlConfig);
    Id.Application appId = Id.Application.from(Id.Namespace.DEFAULT, "S3ToTPFS");
    ApplicationManager appManager = deployApplication(appId, appRequest);

    MapReduceManager mrManager = appManager.getMapReduceManager(ETLMapReduce.NAME);
    mrManager.start();
    mrManager.waitForFinish(2, TimeUnit.MINUTES);

    DataSetManager<TimePartitionedFileSet> fileSetManager = getDataset("TPFSsink");
    try (TimePartitionedFileSet fileSet = fileSetManager.get()) {
        List<GenericRecord> records = readOutput(fileSet, FileBatchSource.DEFAULT_SCHEMA);
        // Two input files, each with one record, were specified. However, only one file matches the regex,
        // so only one record should be found in the output.
        Assert.assertEquals(1, records.size());
        Assert.assertEquals(testData1, records.get(0).get("body").toString());
    }
}

From source file: co.cask.hydrator.plugin.batch.source.BatchCassandraSource.java

License: Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    Configuration conf = new Configuration();
    conf.clear();

    ConfigHelper.setInputColumnFamily(conf, config.keyspace, config.columnFamily);
    ConfigHelper.setInputInitialAddress(conf, config.initialAddress);
    ConfigHelper.setInputPartitioner(conf, config.partitioner);
    ConfigHelper.setInputRpcPort(conf, (config.port == null) ? "9160" : Integer.toString(config.port));
    Preconditions
            .checkArgument(!(Strings.isNullOrEmpty(config.username) ^ Strings.isNullOrEmpty(config.password)),
                    "You must either set both username and password or neither username nor password. "
                            + "Currently, they are username: " + config.username + " and password: "
                            + config.password);
    if (!Strings.isNullOrEmpty(config.username)) {
        ConfigHelper.setInputKeyspaceUserNameAndPassword(conf, config.username, config.password);
    }

    if (!Strings.isNullOrEmpty(config.properties)) {
        for (String pair : config.properties.split(",")) {
            // the key and value of properties might have spaces so remove only leading and trailing ones
            conf.set(CharMatcher.WHITESPACE.trimFrom(pair.split(":")[0]),
                    CharMatcher.WHITESPACE.trimFrom(pair.split(":")[1]));
        }
    }
    CqlConfigHelper.setInputCql(conf, config.query);
    context.setInput(Input.of(config.referenceName, new SourceInputFormatProvider(CqlInputFormat.class, conf)));
}

From source file: co.cask.hydrator.plugin.batch.source.ExcelInputFormat.java

License: Apache License

public static void setConfigurations(Job job, String filePattern, String sheetName, boolean reprocess,
        int sheetNo, String columnList, boolean skipFirstRow, String terminateIfEmptyRow, String rowLimit,
        String ifErrorRecord, String processedFiles) {

    Configuration configuration = job.getConfiguration();
    configuration.set(FILE_PATTERN, filePattern);
    configuration.set(SHEET_NAME, sheetName);
    configuration.setBoolean(RE_PROCESS, reprocess);
    configuration.setInt(SHEET_NO, sheetNo);
    configuration.set(COLUMN_LIST, columnList);
    configuration.setBoolean(SKIP_FIRST_ROW, skipFirstRow);
    configuration.set(TERMINATE_IF_EMPTY_ROW, terminateIfEmptyRow);

    if (!Strings.isNullOrEmpty(rowLimit)) {
        configuration.set(ROWS_LIMIT, rowLimit);
    }

    configuration.set(IF_ERROR_RECORD, ifErrorRecord);
    configuration.set(PROCESSED_FILES, processedFiles);
}

From source file: co.cask.hydrator.plugin.batch.source.FileBatchSource.java

License: Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    //SimpleDateFormat needs to be local because it is not threadsafe
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-HH");

    //calculate date one hour ago, rounded down to the nearest hour
    prevHour = new Date(context.getLogicalStartTime() - TimeUnit.HOURS.toMillis(1));
    Calendar cal = Calendar.getInstance();
    cal.setTime(prevHour);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    prevHour = cal.getTime();

    Job job = JobUtils.createInstance();
    Configuration conf = job.getConfiguration();

    Map<String, String> properties = GSON.fromJson(config.fileSystemProperties, MAP_STRING_STRING_TYPE);
    //noinspection ConstantConditions
    for (Map.Entry<String, String> entry : properties.entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }

    conf.set(INPUT_REGEX_CONFIG, config.fileRegex);
    conf.set(INPUT_NAME_CONFIG, config.path);

    if (config.timeTable != null) {
        table = context.getDataset(config.timeTable);
        datesToRead = Bytes.toString(table.read(LAST_TIME_READ));
        if (datesToRead == null) {
            List<Date> firstRun = Lists.newArrayList(new Date(0));
            datesToRead = GSON.toJson(firstRun, ARRAYLIST_DATE_TYPE);
        }
        List<Date> attempted = Lists.newArrayList(prevHour);
        String updatedDatesToRead = GSON.toJson(attempted, ARRAYLIST_DATE_TYPE);
        if (!updatedDatesToRead.equals(datesToRead)) {
            table.write(LAST_TIME_READ, updatedDatesToRead);
        }
        conf.set(LAST_TIME_READ, datesToRead);
    }

    conf.set(CUTOFF_READ_TIME, dateFormat.format(prevHour));
    FileInputFormat.setInputPathFilter(job, BatchFileFilter.class);
    FileInputFormat.addInputPath(job, new Path(config.path));
    if (config.maxSplitSize != null) {
        FileInputFormat.setMaxInputSplitSize(job, config.maxSplitSize);
    }
    context.setInput(
            Input.of(config.referenceName, new SourceInputFormatProvider(config.inputFormatClass, conf)));
}

From source file: co.cask.hydrator.plugin.batch.source.HiveBatchSource.java

License: Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    // Load the VersionInfo class here so that it is available in the HCatInputFormat.setInput call below. This is
    // needed to support CDAP 3.2, where only the classes of the plugin jar were exposed, not the resources.
    LOG.trace("Hadoop version: {}", VersionInfo.getVersion());
    Job job = JobUtils.createInstance();
    Configuration conf = job.getConfiguration();

    conf.set(HiveConf.ConfVars.METASTOREURIS.varname, config.metaStoreURI);

    if (UserGroupInformation.isSecurityEnabled()) {
        conf.set(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL.varname, "true");
        conf.set("hive.metastore.token.signature", HiveAuthFactory.HS2_CLIENT_TOKEN);
    }
    // Use the current thread's classloader to ensure that when setInput is called it can access VersionInfo class
    // loaded above. This is needed to support CDAP 3.2 where we were just exposing classes to plugin jars and not
    // resources.
    ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
    try {
        Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
        HCatInputFormat.setInput(conf, config.dbName, config.tableName, config.partitions);
    } finally {
        Thread.currentThread().setContextClassLoader(classLoader);
    }

    HCatSchema hCatSchema = HCatInputFormat.getTableSchema(conf);
    if (config.schema != null) {
        // If the user provided a schema, use it to read the table. This allows the user to drop non-primitive
        // types and still read the table.
        hCatSchema = HiveSchemaConverter.toHiveSchema(Schema.parseJson(config.schema), hCatSchema);
        HCatInputFormat.setOutputSchema(job, hCatSchema);
    }
    HiveSchemaStore.storeHiveSchema(context, config.dbName, config.tableName, hCatSchema);
    context.setInput(
            Input.of(config.referenceName, new SourceInputFormatProvider(HCatInputFormat.class, conf)));
}

From source file: co.cask.hydrator.plugin.batch.source.XMLReaderBatchSource.java

License: Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    Job job = JobUtils.createInstance();
    Configuration conf = job.getConfiguration();
    conf.set(XMLInputFormat.XML_INPUTFORMAT_PATH_NAME, config.path);
    conf.set(XMLInputFormat.XML_INPUTFORMAT_NODE_PATH, config.nodePath);
    if (StringUtils.isNotEmpty(config.pattern)) {
        conf.set(XMLInputFormat.XML_INPUTFORMAT_PATTERN, config.pattern);
    }
    conf.set(XMLInputFormat.XML_INPUTFORMAT_FILE_ACTION, config.actionAfterProcess);
    if (StringUtils.isNotEmpty(config.targetFolder)) {
        conf.set(XMLInputFormat.XML_INPUTFORMAT_TARGET_FOLDER, config.targetFolder);
    }

    setFileTrackingInfo(context, conf);

    //Create a temporary directory, in which XMLRecordReader will add file tracking information.
    fileSystem = FileSystem.get(conf);
    long startTime = context.getLogicalStartTime();
    //Create temp file name using start time to make it unique.
    String tempDirectory = config.tableName + startTime;
    tempDirectoryPath = new Path(config.temporaryFolder, tempDirectory);
    fileSystem.mkdirs(tempDirectoryPath);
    fileSystem.deleteOnExit(tempDirectoryPath);
    conf.set(XMLInputFormat.XML_INPUTFORMAT_PROCESSED_DATA_TEMP_FOLDER, tempDirectoryPath.toUri().toString());

    XMLInputFormat.setInputPathFilter(job, BatchXMLFileFilter.class);
    XMLInputFormat.addInputPath(job, new Path(config.path));
    context.setInput(Input.of(config.referenceName, new SourceInputFormatProvider(XMLInputFormat.class, conf)));
}

From source file: co.cask.hydrator.plugin.batch.source.XMLReaderBatchSource.java

License: Apache License

/**
 * Sets file tracking information in the configuration.
 */
private void setFileTrackingInfo(BatchSourceContext context, Configuration conf) {
    //When reprocessing is not required, pass the names of already-processed files to the configuration.
    processedFileTrackingTable = context.getDataset(config.tableName);
    if (processedFileTrackingTable != null && !config.isReprocessingRequired()) {
        List<String> processedFiles = new ArrayList<String>();
        Calendar cal = Calendar.getInstance();
        cal.add(Calendar.DATE, -Integer.valueOf(config.tableExpiryPeriod));
        Date expiryDate = cal.getTime();

        try (CloseableIterator<KeyValue<byte[], byte[]>> iterator = processedFileTrackingTable.scan(null,
                null)) {
            while (iterator.hasNext()) {
                KeyValue<byte[], byte[]> keyValue = iterator.next();
                //Delete records that were processed before the expiry date
                Long time = Bytes.toLong(keyValue.getValue());
                Date processedDate = new Date(time);
                if (processedDate.before(expiryDate)) {
                    processedFileTrackingTable.delete(keyValue.getKey());
                } else {
                    processedFiles.add(Bytes.toString(keyValue.getKey()));
                }
            }
        }
        //File names are used by BatchXMLFileFilter to filter out already-processed files.
        conf.set(XMLInputFormat.XML_INPUTFORMAT_PROCESSED_FILES,
                GSON.toJson(processedFiles, ARRAYLIST_PREPROCESSED_FILES));
    }
}

From source file: co.cask.hydrator.plugin.batchSource.KafkaInputFormat.java

License: Apache License

public static List<KafkaRequest> saveKafkaRequests(Configuration conf, String topic,
        Map<String, Integer> brokers, Set<Integer> partitions, Map<TopicAndPartition, Long> initOffsets,
        KeyValueTable table) throws Exception {
    ArrayList<KafkaRequest> finalRequests;
    HashMap<LeaderInfo, ArrayList<TopicAndPartition>> offsetRequestInfo = new HashMap<>();

    // Get Metadata for all topics
    List<TopicMetadata> topicMetadataList = getKafkaMetadata(brokers, topic);

    for (TopicMetadata topicMetadata : topicMetadataList) {
        for (PartitionMetadata partitionMetadata : topicMetadata.partitionsMetadata()) {
            LeaderInfo leader = new LeaderInfo(
                    new URI("tcp://" + partitionMetadata.leader().connectionString()),
                    partitionMetadata.leader().id());
            if (partitions.isEmpty() || partitions.contains(partitionMetadata.partitionId())) {
                if (offsetRequestInfo.containsKey(leader)) {
                    ArrayList<TopicAndPartition> topicAndPartitions = offsetRequestInfo.get(leader);
                    topicAndPartitions
                            .add(new TopicAndPartition(topicMetadata.topic(), partitionMetadata.partitionId()));
                    offsetRequestInfo.put(leader, topicAndPartitions);
                } else {
                    ArrayList<TopicAndPartition> topicAndPartitions = new ArrayList<>();
                    topicAndPartitions
                            .add(new TopicAndPartition(topicMetadata.topic(), partitionMetadata.partitionId()));
                    offsetRequestInfo.put(leader, topicAndPartitions);
                }
            }
        }
    }

    // Get the latest offsets and generate the KafkaRequests
    finalRequests = fetchLatestOffsetAndCreateKafkaRequests(offsetRequestInfo, initOffsets, table);

    Collections.sort(finalRequests, new Comparator<KafkaRequest>() {
        @Override
        public int compare(KafkaRequest r1, KafkaRequest r2) {
            return r1.getTopic().compareTo(r2.getTopic());
        }
    });

    Map<KafkaRequest, KafkaKey> offsetKeys = new HashMap<>();
    for (KafkaRequest request : finalRequests) {
        KafkaKey key = offsetKeys.get(request);

        if (key != null) {
            request.setOffset(key.getOffset());
            request.setAvgMsgSize(key.getMessageSize());
        }

        if (request.getEarliestOffset() > request.getOffset()
                || request.getOffset() > request.getLastOffset()) {

            boolean offsetUnset = request.getOffset() == KafkaRequest.DEFAULT_OFFSET;
            // When the offset is unset, this is a new topic/partition, so consume from the earliest offset
            if (offsetUnset) {
                request.setOffset(request.getEarliestOffset());
                offsetKeys.put(request, new KafkaKey(request.getTopic(), request.getLeaderId(),
                        request.getPartition(), 0, request.getOffset()));
            }
        }
    }
    conf.set(KAFKA_REQUEST, new Gson().toJson(finalRequests));
    return finalRequests;
}