com.cloudera.crunch.impl.mem.collect.MemGroupedTable.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.crunch.impl.mem.collect.MemGroupedTable.java

Source

/**
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.crunch.impl.mem.collect;

import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.util.ReflectionUtils;

import com.cloudera.crunch.CombineFn;
import com.cloudera.crunch.GroupingOptions;
import com.cloudera.crunch.PCollection;
import com.cloudera.crunch.PGroupedTable;
import com.cloudera.crunch.PTable;
import com.cloudera.crunch.Pair;
import com.cloudera.crunch.Pipeline;
import com.cloudera.crunch.Target;
import com.cloudera.crunch.type.PTableType;
import com.cloudera.crunch.type.PType;
import com.cloudera.crunch.type.PTypeFamily;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * In-memory {@link PGroupedTable} that groups the pairs of a parent {@link MemTable} by key.
 *
 * <p>The grouping is computed once, in the constructor, by materializing the parent table and
 * collecting every value under its key. The resulting list of {@code (key, values)} pairs is
 * handed to the {@link MemCollection} superclass.
 *
 * @param <K> key type of the grouped table
 * @param <V> value type of the grouped table
 */
class MemGroupedTable<K, V> extends MemCollection<Pair<K, Iterable<V>>> implements PGroupedTable<K, V> {

    /** The ungrouped table this grouping was built from. */
    private final MemTable<K, V> parent;

    /**
     * Selects the map implementation used to accumulate values per key.
     *
     * <ul>
     * <li>If {@code options} names a sort comparator class, that class is instantiated through
     * {@link ReflectionUtils} with the pipeline's configuration and used to order a
     * {@link TreeMap}.</li>
     * <li>Otherwise, if the key type's class implements {@link Comparable}, a {@link TreeMap}
     * with natural ordering is used.</li>
     * <li>Otherwise a hash map is returned.</li>
     * </ul>
     *
     * @param keyType  type of the grouping key; may be {@code null}
     * @param options  grouping options; may be {@code null}
     * @param pipeline pipeline whose configuration is passed to the comparator instance
     * @return an empty, mutable map from key to the collection of values for that key
     */
    private static <S, T> Map<S, Collection<T>> createMapFor(PType<S> keyType, GroupingOptions options,
            Pipeline pipeline) {
        if (options != null && options.getSortComparatorClass() != null) {
            RawComparator<S> rc = ReflectionUtils.newInstance(options.getSortComparatorClass(),
                    pipeline.getConfiguration());
            return new TreeMap<S, Collection<T>>(rc);
        } else if (keyType != null && Comparable.class.isAssignableFrom(keyType.getTypeClass())) {
            return new TreeMap<S, Collection<T>>();
        }
        return Maps.newHashMap();
    }

    /**
     * Groups every pair of {@code parent} by key.
     *
     * @param parent  table whose materialized pairs are grouped
     * @param options grouping options forwarded to {@link #createMapFor}; may be {@code null}
     * @return one {@code (key, values)} pair per distinct key, in the iteration order of the
     *         map chosen by {@link #createMapFor}
     */
    private static <S, T> Iterable<Pair<S, Iterable<T>>> buildMap(MemTable<S, T> parent, GroupingOptions options) {
        PType<S> keyType = parent.getKeyType();
        Map<S, Collection<T>> map = createMapFor(keyType, options, parent.getPipeline());

        for (Pair<S, T> pair : parent.materialize()) {
            S key = pair.first();
            // One lookup per record: stored values are never null, so a null result
            // means the key has not been seen yet.
            Collection<T> group = map.get(key);
            if (group == null) {
                group = Lists.<T>newArrayList();
                map.put(key, group);
            }
            group.add(pair.second());
        }

        List<Pair<S, Iterable<T>>> values = Lists.newArrayList();
        for (Map.Entry<S, Collection<T>> e : map.entrySet()) {
            values.add(Pair.of(e.getKey(), (Iterable<T>) e.getValue()));
        }
        return values;
    }

    /**
     * Builds the grouped view of {@code parent}; the grouping is computed here, at construction.
     *
     * @param parent  table to group
     * @param options grouping options controlling key ordering; may be {@code null}
     */
    public MemGroupedTable(MemTable<K, V> parent, GroupingOptions options) {
        super(buildMap(parent, options));
        this.parent = parent;
    }

    /**
     * Not supported for grouped in-memory tables.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public PCollection<Pair<K, Iterable<V>>> union(PCollection<Pair<K, Iterable<V>>>... collections) {
        throw new UnsupportedOperationException();
    }

    /**
     * Writes the ungrouped parent table to {@code target} through the pipeline.
     *
     * @param target destination handed to the pipeline
     * @return this grouped table
     */
    @Override
    public PCollection<Pair<K, Iterable<V>>> write(Target target) {
        getPipeline().write(this.ungroup(), target);
        return this;
    }

    /**
     * Returns the grouped-table type derived from the parent's table type.
     *
     * @return the grouped type, or {@code null} when the parent has no table type
     */
    @Override
    public PType<Pair<K, Iterable<V>>> getPType() {
        PTableType<K, V> parentType = parent.getPTableType();
        if (parentType != null) {
            return parentType.getGroupedTableType();
        }
        return null;
    }

    /** Returns the type family of the parent table. */
    @Override
    public PTypeFamily getTypeFamily() {
        return parent.getTypeFamily();
    }

    /**
     * Returns the size reported by the parent table. The value describes the ungrouped
     * parent; it is not recomputed from the groups.
     */
    @Override
    public long getSize() {
        return parent.getSize();
    }

    /** Returns the parent's name wrapped as {@code MemGrouped(<parent name>)}. */
    @Override
    public String getName() {
        return "MemGrouped(" + parent.getName() + ")";
    }

    /**
     * Applies {@code combineFn} to this grouped table via {@code parallelDo}, producing a table
     * typed with the parent's table type.
     *
     * @param combineFn function applied to the grouped pairs
     * @return the table produced by {@code parallelDo}
     */
    @Override
    public PTable<K, V> combineValues(CombineFn<K, V> combineFn) {
        return parallelDo(combineFn, parent.getPTableType());
    }

    /** Returns the parent table this grouping was built from. */
    @Override
    public PTable<K, V> ungroup() {
        return parent;
    }
}