org.calrissian.accumulorecipes.thirdparty.pig.loader.EventLoader.java Source code

Introduction

Here is the source code for org.calrissian.accumulorecipes.thirdparty.pig.loader.EventLoader.java

Source

/*
* Copyright (C) 2014 The Calrissian Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.calrissian.accumulorecipes.thirdparty.pig.loader;

import com.google.common.base.Preconditions;
import com.google.common.collect.Iterators;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import groovy.lang.Binding;
import groovy.lang.GroovyShell;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.mapreduce.AbstractInputFormat;
import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat;
import org.apache.accumulo.core.client.mapreduce.lib.impl.ConfiguratorBase;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.calrissian.accumulorecipes.commons.hadoop.EventWritable;
import org.calrissian.accumulorecipes.commons.hadoop.RecordReaderValueIterator;
import org.calrissian.accumulorecipes.commons.support.GettableTransform;
import org.calrissian.accumulorecipes.commons.util.UriUtils;
import org.calrissian.accumulorecipes.eventstore.hadoop.EventInputFormat;
import org.calrissian.accumulorecipes.thirdparty.pig.support.AttributeStoreIterator;
import org.calrissian.mango.criteria.builder.QueryBuilder;
import org.calrissian.mango.domain.event.Event;
import org.calrissian.mango.types.SimpleTypeEncoders;
import org.calrissian.mango.types.TypeRegistry;
import org.joda.time.DateTime;

import java.io.IOException;
import java.io.Serializable;
import java.util.Collection;
import java.util.Iterator;
import java.util.Set;

/**
 * Allows events to be streamed from the event store into Pig tuples. This class is essentially a
 * proxy for the {@link EventInputFormat} which also decomposes the attributes of an event into
 * tuples of (id, timestamp, key, datatype, value) that can be manipulated easily through Pig
 * statements.
 */
public class EventLoader extends LoadFunc implements Serializable {

    public static final String USAGE = "Usage: event://indexTable/shardTable?user=&pass=&inst=&zk=&start=&end=&auths=[&fields=][&types=]";
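
    // For example (all values hypothetical), a fully-populated location URI might look like:
    //   event://eventIndex/eventShard?user=root&pass=secret&inst=myInstance&zk=localhost:2181&start=2014-01-01&end=2014-02-01&auths=U&fields=name,age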

    protected transient AttributeStoreIterator<Event> itr;
    protected final TypeRegistry<String> registry = SimpleTypeEncoders.SIMPLE_TYPES;

    protected final QueryBuilder qb;

    public EventLoader(String query) {
        Preconditions.checkNotNull(query);
        Preconditions.checkArgument(!query.isEmpty());

        try {
            // call groovy expressions from Java code
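            // The query string is a Groovy expression evaluated against the bound
            // QueryBuilder 'q', e.g. (hypothetical criteria) "q.and().eq('type','error').end()"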
            Binding binding = new Binding();
            binding.setVariable("q", QueryBuilder.create());
            GroovyShell shell = new GroovyShell(binding);
            qb = (QueryBuilder) shell.evaluate(query);
        } catch (Exception e) {
            throw new RuntimeException("There was an error parsing the groovy query string.", e);
        }
    }

    @Override
    public void setLocation(String uri, Job job) throws IOException {

        Configuration conf = job.getConfiguration();
        if (!ConfiguratorBase.isConnectorInfoSet(AccumuloInputFormat.class, conf)) {
            String path = uri.substring(uri.indexOf("://") + 3, uri.indexOf("?"));

            String[] indexAndShardTable = StringUtils.splitPreserveAllTokens(path, "/");
            if (indexAndShardTable.length != 2)
                throw new IOException("Path portion of URI must contain the index and shard tables. " + USAGE);

            if (uri.startsWith("event://")) {
                String queryPortion = uri.substring(uri.indexOf("?") + 1);
                Multimap<String, String> queryParams = UriUtils.splitQuery(queryPortion);

                String accumuloUser = getProp(queryParams, "user");
                String accumuloPass = getProp(queryParams, "pass");
                String accumuloInst = getProp(queryParams, "inst");
                String zookeepers = getProp(queryParams, "zk");
                if (accumuloUser == null || accumuloPass == null || accumuloInst == null || zookeepers == null)
                    throw new IOException(
                            "Some Accumulo connection information is missing. Must supply username, password, instance, and zookeepers. "
                                    + USAGE);

                String startTime = getProp(queryParams, "start");
                String endTime = getProp(queryParams, "end");
                if (startTime == null || endTime == null)
                    throw new IOException("Start and end times are required. " + USAGE);

                String auths = getProp(queryParams, "auths");
                if (auths == null)
                    auths = ""; // default auths to empty
                String selectFields = getProp(queryParams, "fields");

                String types = getProp(queryParams, "types");

                Set<String> fields = selectFields != null
                        ? Sets.newHashSet(StringUtils.splitPreserveAllTokens(selectFields, ","))
                        : null;
                Set<String> finalTypes = types != null
                        ? Sets.newHashSet(StringUtils.splitPreserveAllTokens(types, ","))
                        : null;

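                // Joda-Time's DateTime(String) constructor parses ISO-8601 timestamps
                // (e.g. start=2014-01-01T00:00:00Z, a hypothetical value).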
                DateTime startDT = new DateTime(startTime);
                DateTime endDT = new DateTime(endTime);

                AbstractInputFormat.setZooKeeperInstance(job, accumuloInst, zookeepers);
                try {
                    EventInputFormat.setInputInfo(job, accumuloUser, accumuloPass.getBytes(),
                            new Authorizations(auths.getBytes()));
                } catch (AccumuloSecurityException e) {
                    throw new RuntimeException(e);
                }
                try {
                    EventInputFormat.setQueryInfo(job, startDT.toDate(), endDT.toDate(), finalTypes, qb.build());
                    if (fields != null)
                        EventInputFormat.setSelectFields(conf, fields);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            } else {
                throw new IOException("Location uri must begin with event://");
            }
        }
    }

    private String getProp(Multimap<String, String> queryParams, String propKey) {
        Collection<String> props = queryParams.get(propKey);
        if (!props.isEmpty())
            return props.iterator().next();
        return null;
    }

    @Override
    public InputFormat getInputFormat() throws IOException {
        return new EventInputFormat();
    }

    @Override
    public void prepareToRead(RecordReader recordReader, PigSplit pigSplit) throws IOException {
        RecordReaderValueIterator<Key, EventWritable> rri = new RecordReaderValueIterator<Key, EventWritable>(
                recordReader);
        Iterator<Event> xformed = Iterators.transform(rri, new GettableTransform<Event>());
        itr = new AttributeStoreIterator<Event>(xformed);
    }

    @Override
    public Tuple getNext() throws IOException {

        if (!itr.hasNext())
            return null;

        org.calrissian.mango.domain.Attribute eventAttribute = itr.next();

        /*
         * Create the Pig tuple and hydrate it with the event details. The format of the tuple is:
         * (id, timestamp, key, datatype, value)
         */
        Tuple t = TupleFactory.getInstance().newTuple();
        t.append(itr.getTopStore().getId());
        t.append(itr.getTopStore().getTimestamp());
        t.append(eventAttribute.getKey());
        t.append(registry.getAlias(eventAttribute.getValue()));
        t.append(registry.encode(eventAttribute.getValue()));
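        // The resulting tuple looks like, e.g. (hypothetical values):
        // ("event-id-1", 1401234567890L, "name", "string", "John Doe")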

        return t;
    }
}
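
Example usage

In a Pig script this loader would typically be invoked with something like LOAD 'event://...' USING org.calrissian.accumulorecipes.thirdparty.pig.loader.EventLoader('<groovy query>'). The following is a minimal Java sketch of driving the same configuration path directly; the table names, connection parameters, dates, and query criteria are all hypothetical placeholders.

import org.apache.hadoop.mapreduce.Job;
import org.calrissian.accumulorecipes.thirdparty.pig.loader.EventLoader;

public class EventLoaderExample {
    public static void main(String[] args) throws Exception {
        // Groovy query string, evaluated against the bound QueryBuilder 'q'
        // (the criteria here are hypothetical).
        EventLoader loader = new EventLoader("q.and().eq('type','error').end()");

        // Configure the underlying EventInputFormat the same way Pig would when
        // it calls setLocation(). All connection values below are placeholders.
        Job job = Job.getInstance();
        loader.setLocation(
                "event://eventIndex/eventShard?user=root&pass=secret&inst=myInstance"
                        + "&zk=localhost:2181&start=2014-01-01&end=2014-02-01&auths=U",
                job);
    }
}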