org.trellisldp.rosid.file.streaming.FileProcessingPipeline.java Source code

Introduction

Here is the source code for org.trellisldp.rosid.file.streaming.FileProcessingPipeline.java. The class builds an Apache Beam pipeline that reads LDP membership and containment updates from Kafka, optionally aggregates repeated cache writes over a short window, updates the file-based resource cache, and publishes notifications to an event topic.

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.trellisldp.rosid.file.streaming;

import static java.lang.Long.parseLong;
import static java.util.stream.Collectors.toMap;
import static org.slf4j.LoggerFactory.getLogger;
import static org.trellisldp.rosid.common.RosidConstants.TOPIC_CACHE;
import static org.trellisldp.rosid.common.RosidConstants.TOPIC_CACHE_AGGREGATE;
import static org.trellisldp.rosid.common.RosidConstants.TOPIC_EVENT;
import static org.trellisldp.rosid.common.RosidConstants.TOPIC_LDP_CONTAINMENT_ADD;
import static org.trellisldp.rosid.common.RosidConstants.TOPIC_LDP_CONTAINMENT_DELETE;
import static org.trellisldp.rosid.common.RosidConstants.TOPIC_LDP_MEMBERSHIP_ADD;
import static org.trellisldp.rosid.common.RosidConstants.TOPIC_LDP_MEMBERSHIP_DELETE;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.Properties;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.kafka.KafkaIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;

import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.kafka.common.serialization.StringSerializer;

import org.joda.time.Duration;
import org.slf4j.Logger;

import org.trellisldp.vocabulary.LDP;

/**
 * @author acoburn
 */
public class FileProcessingPipeline {

    private static final Logger LOGGER = getLogger(FileProcessingPipeline.class);

    private static final String PARTITION_PREFIX = "trellis.partitions.";
    private static final String DATA_SUFFIX = ".data";
    private static final String BASE_URL_SUFFIX = ".baseUrl";

    private final Map<String, String> dataConfiguration;
    private final Map<String, String> baseUrlConfiguration;
    private final String bootstrapServers;
    private final long aggregateSeconds;

    /**
     * Build a file processing pipeline
     * @param configuration the configuration
     */
    public FileProcessingPipeline(final Properties configuration) {
        this.bootstrapServers = configuration.getProperty("kafka.bootstrapServers");
        this.aggregateSeconds = parseLong(configuration.getProperty("trellis.aggregateSeconds", "1"));
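        // Map each "trellis.partitions.<name>.data" property to an entry of
        // <name> -> path, and each ".baseUrl" property to <name> -> base URL.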
        this.dataConfiguration = configuration.stringPropertyNames().stream()
                .filter(k -> k.startsWith(PARTITION_PREFIX) && k.endsWith(DATA_SUFFIX))
                .collect(toMap(k -> k.substring(PARTITION_PREFIX.length(), k.length() - DATA_SUFFIX.length()),
                        configuration::getProperty));
        this.baseUrlConfiguration = configuration.stringPropertyNames().stream()
                .filter(k -> k.startsWith(PARTITION_PREFIX) && k.endsWith(BASE_URL_SUFFIX))
                .collect(toMap(k -> k.substring(PARTITION_PREFIX.length(), k.length() - BASE_URL_SUFFIX.length()),
                        configuration::getProperty));

        if (!this.dataConfiguration.keySet().equals(this.baseUrlConfiguration.keySet())) {
            LOGGER.error("Invalid configuration values: data and baseUrl values do not align");
            LOGGER.error("Data partitions: {}", this.dataConfiguration.keySet());
            LOGGER.error("BaseUrl partitions: {}", this.baseUrlConfiguration.keySet());
            throw new IllegalArgumentException("Data and BaseUrl values are missing for some partitions!");
        }
    }

    /**
     * Get the Beam pipeline
     * @return the pipeline
     */
    public Pipeline getPipeline() {

        LOGGER.debug("Building Beam Pipeline");
        final PipelineOptions options = PipelineOptionsFactory.create();
        final Pipeline p = Pipeline.create(options);

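        // Each of the four LDP activity topics below feeds the cache-aggregation
        // topic; after optional windowed aggregation, the cache topic drives the
        // file-based cache writer and the event notifications.
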
        // Add membership triples
        p.apply(getKafkaReader(bootstrapServers).withTopic(TOPIC_LDP_MEMBERSHIP_ADD).withoutMetadata())
                .apply(ParDo.of(new BeamProcessor(dataConfiguration, LDP.PreferMembership.getIRIString(), true)))
                .apply(getKafkaWriter(bootstrapServers).withTopic(TOPIC_CACHE_AGGREGATE));

        // Delete membership triples
        p.apply(getKafkaReader(bootstrapServers).withTopic(TOPIC_LDP_MEMBERSHIP_DELETE).withoutMetadata())
                .apply(ParDo.of(new BeamProcessor(dataConfiguration, LDP.PreferMembership.getIRIString(), false)))
                .apply(getKafkaWriter(bootstrapServers).withTopic(TOPIC_CACHE_AGGREGATE));

        // Add containment triples
        p.apply(getKafkaReader(bootstrapServers).withTopic(TOPIC_LDP_CONTAINMENT_ADD).withoutMetadata())
                .apply(ParDo.of(new BeamProcessor(dataConfiguration, LDP.PreferContainment.getIRIString(), true)))
                .apply(getKafkaWriter(bootstrapServers).withTopic(TOPIC_CACHE_AGGREGATE));

        // Delete containment triples
        p.apply(getKafkaReader(bootstrapServers).withTopic(TOPIC_LDP_CONTAINMENT_DELETE).withoutMetadata())
                .apply(ParDo.of(new BeamProcessor(dataConfiguration, LDP.PreferContainment.getIRIString(), false)))
                .apply(getKafkaWriter(bootstrapServers).withTopic(TOPIC_CACHE_AGGREGATE));

        if (aggregateSeconds > 0) {
            // Aggregate cache writes
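            // Group cache writes into fixed windows of aggregateSeconds and fire
            // each pane after a processing-time delay past its first element,
            // keeping a single representative value per key so repeated writes
            // for the same resource collapse into one.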
            p.apply(getKafkaReader(bootstrapServers).withTopic(TOPIC_CACHE_AGGREGATE).withoutMetadata())
                    .apply(Window
                            .<KV<String, String>>into(FixedWindows.of(Duration.standardSeconds(aggregateSeconds)))
                            .triggering(AfterProcessingTime.pastFirstElementInPane()
                                    .plusDelayOf(Duration.standardSeconds(aggregateSeconds)))
                            .discardingFiredPanes().withAllowedLateness(Duration.ZERO))
                    .apply(Combine.perKey(x -> x.iterator().next()))
                    .apply(getKafkaWriter(bootstrapServers).withTopic(TOPIC_CACHE));
        } else {
            // Skip aggregation
            p.apply(getKafkaReader(bootstrapServers).withTopic(TOPIC_CACHE_AGGREGATE).withoutMetadata())
                    .apply(getKafkaWriter(bootstrapServers).withTopic(TOPIC_CACHE));
        }

        // Write to cache and dispatch to the event bus
        p.apply(getKafkaReader(bootstrapServers).withTopic(TOPIC_CACHE).withoutMetadata())
                .apply(ParDo.of(new CacheWriter(dataConfiguration)))
                .apply(ParDo.of(new EventProcessor(baseUrlConfiguration)))
                .apply(getKafkaWriter(bootstrapServers).withTopic(TOPIC_EVENT));

        return p;
    }

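    // Build a Kafka source that reads String keys and values from the given brokers.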
    private static KafkaIO.Read<String, String> getKafkaReader(final String bootstrapServers) {
        return KafkaIO.<String, String>read().withBootstrapServers(bootstrapServers)
                .withKeyDeserializer(StringDeserializer.class).withValueDeserializer(StringDeserializer.class);
    }

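    // Build a Kafka sink that writes String keys and values to the given brokers.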
    private static KafkaIO.Write<String, String> getKafkaWriter(final String bootstrapServers) {
        return KafkaIO.<String, String>write().withBootstrapServers(bootstrapServers)
                .withKeySerializer(StringSerializer.class).withValueSerializer(StringSerializer.class);
    }

    /**
     * Load a FileProcessingPipeline
     * @param args the arguments
     * @return the Beam pipeline
     * @throws IOException in the event of an error loading the configuration
     */
    public static Pipeline loadPipeline(final String[] args) throws IOException {
        if (args.length >= 1) {
            LOGGER.debug("Loading Beam Pipeline with {}", args[0]);
            return new FileProcessingPipeline(loadConfiguration(args[0])).getPipeline();
        }
        LOGGER.error("No configuration file provided");
        throw new IOException("No configuration file provided");
    }

    /**
     * Load the configuration for the pipeline
     * @param filename the configuration filename
     * @return the configuration properties
     * @throws IOException in the event of an error loading the configuration
     */
    public static Properties loadConfiguration(final String filename) throws IOException {
        final Properties config = new Properties();
        try (final InputStream input = new FileInputStream(filename)) {
            LOGGER.debug("Loading configuration from {}", filename);
            config.load(input);
            return config;
        }
    }
}
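
Usage

For context, here is a minimal sketch of how this pipeline might be driven. The property keys mirror the ones parsed by the FileProcessingPipeline constructor above, but the broker address, the "demo" partition name, the file path, the base URL, and the example class itself are illustrative assumptions rather than values taken from this project.

import java.util.Properties;

import org.apache.beam.sdk.Pipeline;

import org.trellisldp.rosid.file.streaming.FileProcessingPipeline;

public class FileProcessingPipelineExample {

    public static void main(final String[] args) {
        // Hypothetical configuration: the keys match those read by the
        // FileProcessingPipeline constructor; the values are placeholders.
        final Properties config = new Properties();
        config.setProperty("kafka.bootstrapServers", "localhost:9092");
        config.setProperty("trellis.aggregateSeconds", "2");
        config.setProperty("trellis.partitions.demo.data", "/var/lib/trellis/demo");
        config.setProperty("trellis.partitions.demo.baseUrl", "http://localhost:8080/");

        // Build the Beam pipeline and execute it with whatever runner is on the
        // classpath (e.g. the direct runner for local testing).
        final Pipeline pipeline = new FileProcessingPipeline(config).getPipeline();
        pipeline.run().waitUntilFinish();
    }
}

Alternatively, loadPipeline(args) reads the same properties from a configuration file named as the first command-line argument.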