com.mozilla.fhr.consumer.FHRConsumer.java Source code

Java tutorial

Introduction

Here is the source code for com.mozilla.fhr.consumer.FHRConsumer.java

Source

/*
 * Copyright 2012 Mozilla Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mozilla.fhr.consumer;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Properties;
import java.util.concurrent.Callable;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import kafka.consumer.KafkaStream;
import kafka.message.Message;
import kafka.message.MessageAndMetadata;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.log4j.Logger;

import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.protobuf.ByteString;
import com.google.protobuf.InvalidProtocolBufferException;
import com.maxmind.geoip.Location;
import com.maxmind.geoip.LookupService;
import com.mozilla.bagheera.BagheeraProto.BagheeraMessage;
import com.mozilla.bagheera.BagheeraProto.BagheeraMessage.Operation;
import com.mozilla.bagheera.cli.OptionFactory;
import com.mozilla.bagheera.consumer.KafkaConsumer;
import com.mozilla.bagheera.metrics.MetricsManager;
import com.mozilla.bagheera.sink.KeyValueSink;
import com.mozilla.bagheera.sink.KeyValueSinkFactory;
import com.mozilla.bagheera.sink.SinkConfiguration;
import com.mozilla.bagheera.util.ShutdownHook;
import com.mozilla.bagheera.sink.HBaseSink;
import com.yammer.metrics.Metrics;
import com.yammer.metrics.core.Meter;
import com.yammer.metrics.core.MetricName;

/**
 * Kafka consumer that decodes {@link BagheeraMessage}s from the subscribed streams, adds a
 * {@code geoCountry} field to each JSON payload based on a GeoIP lookup of the message's IP
 * address, and stores or deletes the document through the {@link KeyValueSink} selected by the
 * message namespace.
 */
public class FHRConsumer extends KafkaConsumer {

    private static final Logger LOG = Logger.getLogger(FHRConsumer.class);

    /** Name of the JSON field that receives the resolved country code. */
    private static final String GEO_COUNTRY_FIELD = "geoCountry";
    /** Value written to {@link #GEO_COUNTRY_FIELD} when no usable country code is available. */
    private static final String UNKNOWN_COUNTRY_CODE = "--";

    private final ObjectMapper jsonMapper;
    private final LookupService geoIpLookupService;

    /** Marked once for every payload that fails JSON parsing or mapping. */
    protected Meter invalidJsonMeter;
    /** Marked once for every stored document whose country could not be resolved. */
    protected Meter unknownGeoIpMeter;

    /**
     * Creates a consumer using {@code DEFAULT_NUM_THREADS} worker threads.
     *
     * @param topic topic passed to the parent consumer and used in the metric names
     * @param props consumer properties; must contain {@code maxmind.db.path}
     */
    public FHRConsumer(String topic, Properties props) {
        this(topic, props, DEFAULT_NUM_THREADS);
    }

    /**
     * Creates a consumer, loads the GeoIP database named by the {@code maxmind.db.path} property
     * and registers the invalid-JSON and unknown-GeoIP meters for the topic.
     *
     * @param topic      topic passed to the parent consumer and used in the metric names
     * @param props      consumer properties; must contain {@code maxmind.db.path}
     * @param numThreads thread count passed to the parent consumer
     * @throws RuntimeException wrapping the {@link IOException} if the GeoIP database cannot be loaded
     */
    public FHRConsumer(String topic, Properties props, int numThreads) {
        super(topic, props, numThreads);
        jsonMapper = new ObjectMapper();
        String maxmindPath = props.getProperty("maxmind.db.path");
        try {
            geoIpLookupService = new LookupService(maxmindPath, LookupService.GEOIP_MEMORY_CACHE);
        } catch (IOException e) {
            LOG.error("Failed to load geoip database", e);
            throw new RuntimeException(e);
        }

        invalidJsonMeter = Metrics.newMeter(new MetricName("bagheera", "consumer", topic + ".json.invalid"),
                "messages", TimeUnit.SECONDS);
        unknownGeoIpMeter = Metrics.newMeter(new MetricName("bagheera", "consumer", topic + ".geoip.unknown"),
                "messages", TimeUnit.SECONDS);
    }

    /**
     * Closes the parent consumer and then the GeoIP lookup service. The lookup service is
     * released even if closing the parent throws.
     */
    @Override
    public void close() {
        try {
            super.close();
        } finally {
            if (geoIpLookupService != null) {
                geoIpLookupService.close();
            }
        }
    }

    /**
     * Submits one {@link FHRConsumerWorker} per stream and blocks until at least one of them has
     * finished (or the calling thread is interrupted), then logs any failure reported by the
     * workers that are done. If an interrupt was received, the thread's interrupt status is
     * restored before returning so callers can observe it.
     */
    @Override
    public void poll() {
        final CountDownLatch latch = new CountDownLatch(streams.size());
        workers = new ArrayList<Future<Void>>(streams.size());
        for (final KafkaStream<Message> stream : streams) {
            workers.add(executor.submit(new FHRConsumerWorker(stream, latch)));
        }

        boolean interrupted = false;

        // Wait for all tasks to complete which in the normal case they will
        // run indefinitely unless killed
        try {
            while (true) {
                // wake up every 10 seconds to check whether any worker has counted down
                latch.await(10, TimeUnit.SECONDS);
                if (latch.getCount() != streams.size()) {
                    // we have a dead thread and should exit
                    break;
                }
            }
        } catch (InterruptedException e) {
            LOG.info("Interrupted during polling", e);
            interrupted = true;
        }

        // Spit out errors if there were any
        for (Future<Void> worker : workers) {
            try {
                if (worker.isDone() && !worker.isCancelled()) {
                    worker.get(1, TimeUnit.SECONDS);
                }
            } catch (InterruptedException e) {
                LOG.error("Thread was interrupted:", e);
                interrupted = true;
            } catch (ExecutionException e) {
                LOG.error("Exception occured in thread:", e);
            } catch (TimeoutException e) {
                LOG.error("Timed out waiting for thread result:", e);
            } catch (CancellationException e) {
                LOG.error("Thread has been canceled: ", e);
            }
        }

        // Re-assert the interrupt only after error reporting so the loop above is not cut short.
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Returns the {@link KafkaConsumer#getOptions()} options extended with the HBase sink options
     * (table, family, qualifier, prefixdate, batchsize). Only {@code table} is marked required here.
     *
     * <p>This static method hides the one in {@link KafkaConsumer}; static methods cannot be
     * overridden, so {@code @Override} does not apply.
     *
     * @return the combined command-line options
     */
    public static Options getOptions() {
        OptionFactory optFactory = OptionFactory.getInstance();
        Options options = KafkaConsumer.getOptions();
        options.addOption(optFactory.create("tbl", "table", true, "HBase table name.").required());
        options.addOption(optFactory.create("f", "family", true, "Column family."));
        options.addOption(optFactory.create("q", "qualifier", true, "Column qualifier."));
        options.addOption(optFactory.create("pd", "prefixdate", false, "Prefix key with salted date."));
        options.addOption(
                optFactory.create("b", "batchsize", true, "Batch size (number of messages per HBase flush)."));
        return options;
    }

    /**
     * Builds an {@link FHRConsumer} from parsed command-line options.
     *
     * <p>Properties are loaded from the file named by the {@code properties} option and the
     * {@code groupid} option is copied into them. If the file cannot be found or read, the error
     * is logged and construction proceeds with whatever properties were loaded. The thread count
     * comes from the {@code consumer.threads} property (default {@code DEFAULT_NUM_THREADS}) and
     * is overridden by the {@code numthreads} option when present.
     *
     * <p>Like {@link #getOptions()}, this is a static method and therefore cannot carry
     * {@code @Override}.
     *
     * @param cmd parsed command line
     * @return a new consumer for the {@code topic} option
     */
    public static KafkaConsumer fromOptions(CommandLine cmd) {
        Properties props = new Properties();
        String propsFilePath = cmd.getOptionValue("properties");
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(propsFilePath)));
            props.load(reader);
            props.setProperty("groupid", cmd.getOptionValue("groupid"));
        } catch (FileNotFoundException e) {
            LOG.error("Could not find properties file", e);
        } catch (IOException e) {
            LOG.error("Error reading properties file", e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    LOG.error("Error closing properties file", e);
                }
            }
        }

        int numThreads = props.containsKey("consumer.threads")
                ? Integer.parseInt(props.getProperty("consumer.threads"))
                : DEFAULT_NUM_THREADS;
        // if numthreads specified on command-line then override
        if (cmd.hasOption("numthreads")) {
            numThreads = Integer.parseInt(cmd.getOptionValue("numthreads"));
        }

        return new FHRConsumer(cmd.getOptionValue("topic"), props, numThreads);
    }

    /**
     * Tells whether a GeoIP country code is usable: non-null, not the unknown marker and not blank.
     *
     * @param countryCode code taken from a GeoIP {@link Location}; may be {@code null}
     * @return {@code true} if the code should be written to the document as-is
     */
    private static boolean isKnownCountryCode(String countryCode) {
        return countryCode != null
                && !UNKNOWN_COUNTRY_CODE.equals(countryCode)
                && countryCode.trim().length() > 0;
    }

    /**
     * Consumes a single Kafka stream. Any exception that escapes the message loop ends the worker;
     * the latch is always counted down on exit so that {@link FHRConsumer#poll()} notices.
     */
    private class FHRConsumerWorker implements Callable<Void> {

        private final KafkaStream<Message> stream;
        private final CountDownLatch latch;

        /**
         * @param stream stream to iterate over
         * @param latch  latch counted down once when this worker stops
         */
        public FHRConsumerWorker(KafkaStream<Message> stream, CountDownLatch latch) {
            this.stream = stream;
            this.latch = latch;
        }

        /**
         * Processes every message of the stream: CREATE_UPDATE messages with an id and payload are
         * parsed as JSON, tagged with a country code and stored; DELETE messages with an id are
         * deleted from the sink. Payloads that are not valid JSON are counted, logged and skipped.
         *
         * @return always {@code null}
         * @throws Exception any unchecked or undeclared failure from message handling
         */
        @Override
        public Void call() throws Exception {
            try {
                for (MessageAndMetadata<Message> mam : stream) {
                    BagheeraMessage bmsg = BagheeraMessage.parseFrom(ByteString.copyFrom(mam.message().payload()));
                    // get the sink for this message's namespace
                    // (typically only one sink unless a regex pattern was used to listen to multiple topics)
                    KeyValueSink sink = sinkFactory.getSink(bmsg.getNamespace());
                    if (bmsg.getOperation() == Operation.CREATE_UPDATE && bmsg.hasId() && bmsg.hasPayload()) {
                        String payloadString = bmsg.getPayload().toStringUtf8();
                        try {
                            ObjectNode document = jsonMapper.readValue(payloadString, ObjectNode.class);
                            // do a geoip lookup on the IP if we have one
                            Location location = null;
                            if (bmsg.hasIpAddr()) {
                                location = geoIpLookupService
                                        .getLocation(InetAddress.getByAddress(bmsg.getIpAddr().toByteArray()));
                            }
                            if (location != null && isKnownCountryCode(location.countryCode)) {
                                document.put(GEO_COUNTRY_FIELD, location.countryCode);
                            } else {
                                // no IP, no lookup result, or an unusable (null/blank/"--") code
                                unknownGeoIpMeter.mark();
                                document.put(GEO_COUNTRY_FIELD, UNKNOWN_COUNTRY_CODE);
                            }
                            LOG.info("CONSUMER_PUT " + bmsg.getNamespace() + " " + bmsg.getId());
                            // store the document
                            if (bmsg.hasTimestamp()) {
                                sink.store(bmsg.getId(), jsonMapper.writeValueAsBytes(document),
                                        bmsg.getTimestamp());
                            } else {
                                sink.store(bmsg.getId(), jsonMapper.writeValueAsBytes(document));
                            }
                        } catch (JsonParseException e) {
                            invalidJsonMeter.mark();
                            LOG.error("Invalid JSON", e);
                            LOG.debug(payloadString);
                        } catch (JsonMappingException e) {
                            invalidJsonMeter.mark();
                            LOG.error("Invalid JSON", e);
                            LOG.debug(payloadString);
                        }
                    } else if (bmsg.getOperation() == Operation.DELETE && bmsg.hasId()) {
                        LOG.info("CONSUMER_DELETE " + bmsg.getNamespace() + " " + bmsg.getId());
                        sink.delete(bmsg.getId());
                    }
                    consumed.mark();
                }
            } catch (InvalidProtocolBufferException e) {
                LOG.error("Invalid protocol buffer in data stream", e);
            } catch (UnsupportedEncodingException e) {
                LOG.error("Message ID was not in UTF-8 encoding", e);
            } catch (IOException e) {
                LOG.error("IO error while storing to data sink", e);
            } finally {
                // signal poll() that this worker has stopped, whatever the reason
                latch.countDown();
            }

            return null;
        }
    }

    /**
     * Command-line entry point: parses the options, builds the consumer and its HBase sink
     * factory, registers both with the shutdown hook and starts polling. On a command-line
     * parse error the error is logged and usage help is printed.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        Options options = FHRConsumer.getOptions();
        CommandLineParser parser = new GnuParser();
        ShutdownHook sh = ShutdownHook.getInstance();
        try {
            // Parse command line options
            CommandLine cmd = parser.parse(options, args);

            final FHRConsumer consumer = (FHRConsumer) FHRConsumer.fromOptions(cmd);
            sh.addFirst(consumer);

            // Set the sink for consumer storage
            SinkConfiguration sinkConfig = new SinkConfiguration();
            if (cmd.hasOption("numthreads")) {
                sinkConfig.setInt("hbasesink.hbase.numthreads", Integer.parseInt(cmd.getOptionValue("numthreads")));
            }
            if (cmd.hasOption("batchsize")) {
                sinkConfig.setInt("hbasesink.hbase.batchsize", Integer.parseInt(cmd.getOptionValue("batchsize")));
            }

            sinkConfig.setString("hbasesink.hbase.tablename", cmd.getOptionValue("table"));
            sinkConfig.setString("hbasesink.hbase.column.family", cmd.getOptionValue("family", "data"));
            sinkConfig.setString("hbasesink.hbase.column.qualifier", cmd.getOptionValue("qualifier", "json"));
            // "prefixdate" is declared without an argument, so it carries no option value;
            // its presence on the command line is what enables the feature.
            sinkConfig.setBoolean("hbasesink.hbase.rowkey.prefixdate", cmd.hasOption("prefixdate"));
            KeyValueSinkFactory sinkFactory = KeyValueSinkFactory.getInstance(HBaseSink.class, sinkConfig);
            sh.addLast(sinkFactory);
            consumer.setSinkFactory(sinkFactory);

            // Initialize metrics collection, reporting, etc. (the returned manager is not needed here)
            MetricsManager.getDefaultMetricsManager();

            // Begin polling
            consumer.poll();
        } catch (ParseException e) {
            LOG.error("Error parsing command line options", e);
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(FHRConsumer.class.getName(), options);
        }
    }
}