com.cloudera.science.pig.Combinatorial.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.science.pig.Combinatorial.java

Source

/**
 * Copyright 2011 Cloudera Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.science.pig;

import java.io.IOException;

import java.util.Arrays;
import java.util.List;
import java.util.Set;

import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

/**
 * A pig function that can be used to generate all subsets of a certain size
 * from a bag.
 * 
 * <p>It expects that the bag's tuple contains only a single field, and
 * that the type of that field implements {@link Comparable}. The returned
 * value is a bag of {@link Tuple} values with N-fields, where N is the arity
 * specified in the constructor.
 * 
 */
public class Combinatorial extends EvalFunc<DataBag> {

    private final TupleFactory tupleFactory = TupleFactory.getInstance();
    private final BagFactory bagFactory = BagFactory.getInstance();
    private final int arity;

    public Combinatorial(String arity) {
        this.arity = Integer.valueOf(arity);
    }

    @Override
    public DataBag exec(Tuple input) throws IOException {
        try {
            DataBag output = bagFactory.newDefaultBag();
            Object o = input.get(0);
            if (!(o instanceof DataBag)) {
                throw new IOException("Expected input to be a bag, but got: " + o.getClass().getName());
            }
            DataBag inputBag = (DataBag) o;
            Set<Comparable> uniqs = Sets.newTreeSet();
            for (Tuple t : inputBag) {
                if (t != null && t.get(0) != null) {
                    uniqs.add((Comparable) t.get(0));
                }
            }
            if (uniqs.size() < arity) {
                return output;
            }
            List<Comparable> values = Lists.newArrayList(uniqs);
            Comparable[] subset = new Comparable[arity];

            process(values, subset, 0, 0, output);

            return output;
        } catch (ExecException e) {
            throw new IOException(e);
        }
    }

    private void process(List<Comparable> values, Comparable[] subset, int curSubsetSize, int nextIndex,
            DataBag output) {
        if (curSubsetSize == subset.length) {
            output.add(tupleFactory.newTuple(Arrays.asList(subset)));
        } else {
            for (int i = nextIndex; i < values.size(); i++) {
                subset[curSubsetSize] = values.get(i);
                process(values, subset, curSubsetSize + 1, i + 1, output);
            }
        }
    }

    private boolean isComparable(byte pigType) {
        return DataType.isAtomic(pigType) || pigType == DataType.GENERIC_WRITABLECOMPARABLE;
    }

    @Override
    /**
     * This UDF is given a Bag of Tuples of Comparables.  
     *    Describe output: ({name:chararray})
     * 
     * We want to output a Bag of Tuples of X of Comparables.  X being equal to arity
     *    Describe should output: ({name0:chararray, name1:chararray})s
     */
    public Schema outputSchema(Schema input) {
        if (input.size() != 1) {
            throw new IllegalArgumentException("Expected a bag; input has > 1 field");
        }
        try {

            //Run some error checking
            if (input.getField(0).type != DataType.BAG) {
                throw new IllegalArgumentException(
                        "Expected a bag; found: " + DataType.findTypeName(input.getField(0).type));
            }
            if (input.getField(0).schema.getField(0).type != DataType.TUPLE) {
                throw new IllegalArgumentException(
                        "Expected a tuple in a bag; found: " + DataType.findTypeName(input.getField(0).type));
            }
            if (input.getField(0).schema.size() != 1) {
                throw new IllegalArgumentException("The bag must contain a single field");
            }

            //just to bucket schemas because we will be going 3 levels deep
            Schema bagSchema = input.getField(0).schema;
            Schema tupleSchema = bagSchema.getField(0).schema;

            byte fieldType = tupleSchema.getField(0).type;

            if (!isComparable(fieldType)) {
                throw new IllegalArgumentException("The bag's Tulple's field must be a comparable type");
            }

            FieldSchema inputField = tupleSchema.getField(0);
            String inputName = inputField.alias;

            //Define how many fields will be in the tuple
            List<FieldSchema> fields = Lists.newArrayList();
            for (int i = 0; i < arity; i++) {
                fields.add(new FieldSchema(inputName + i, inputField.type));
            }
            Schema newTupleSchema = new Schema(fields);

            //Define the tuple
            FieldSchema tupleFieldSchema = new FieldSchema(inputName + "tuple", newTupleSchema, DataType.TUPLE);

            //Define Bag
            Schema newBagSchema = new Schema(tupleFieldSchema);
            //bagSchema.setTwoLevelAccessRequired(true); // This was deprecated.  TODO why was this there.
            Schema.FieldSchema bagFieldSchema = new Schema.FieldSchema(inputName + "bag", newBagSchema,
                    DataType.BAG);

            return new Schema(bagFieldSchema);

        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}