com.liveramp.cascading_ext.CascadingUtil.java Source code

Introduction

Here is the source code for com.liveramp.cascading_ext.CascadingUtil.java, a utility class from LiveRamp's cascading_ext library for building Cascading FlowConnectors and Hadoop JobConfs with shared default properties, serializations, and flow step strategies.

Source

/**
 *  Copyright 2012 LiveRamp
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package com.liveramp.cascading_ext;

import cascading.flow.FlowConnector;
import cascading.flow.FlowProcess;
import cascading.flow.FlowStepStrategy;
import cascading.flow.hadoop.HadoopFlowProcess;
import com.liveramp.cascading_ext.bloom.BloomAssemblyStrategy;
import com.liveramp.cascading_ext.bloom.BloomProps;
import com.liveramp.cascading_ext.flow.LoggingFlowConnector;
import com.liveramp.cascading_ext.flow_step_strategy.FlowStepStrategyFactory;
import com.liveramp.cascading_ext.flow_step_strategy.MultiFlowStepStrategy;
import com.liveramp.cascading_ext.flow_step_strategy.RenameJobStrategy;
import com.liveramp.cascading_ext.flow_step_strategy.SimpleFlowStepStrategyFactory;
import com.liveramp.cascading_ext.util.OperationStatsUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.mapred.JobConf;

import java.util.*;

public class CascadingUtil {

    public static final String CASCADING_RUN_ID = "cascading_ext.cascading.run.id";

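    // Process-wide singleton; the constructor is protected so subclasses can install their own defaults.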
    private static final CascadingUtil INSTANCE = new CascadingUtil();

    public static CascadingUtil get() {
        return INSTANCE;
    }

    protected CascadingUtil() {
        addDefaultFlowStepStrategy(RenameJobStrategy.class);
        addDefaultFlowStepStrategy(BloomAssemblyStrategy.class);

        defaultProperties.putAll(BloomProps.getDefaultProperties());
    }

    private final Map<Object, Object> defaultProperties = new HashMap<Object, Object>();
    private final List<FlowStepStrategyFactory<JobConf>> defaultFlowStepStrategies = new ArrayList<FlowStepStrategyFactory<JobConf>>();
    private final Set<Class<? extends Serialization>> serializations = new HashSet<Class<? extends Serialization>>();
    private final Map<Integer, Class<?>> serializationTokens = new HashMap<Integer, Class<?>>();

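    // Lazily built, cached JobConf; reset to null whenever default properties or serializations change so it is rebuilt.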
    private transient JobConf conf = null;

    public void setDefaultProperty(Object key, Object value) {
        defaultProperties.put(key, value);
        conf = null;
    }

    public void addDefaultFlowStepStrategy(FlowStepStrategyFactory<JobConf> flowStepStrategyFactory) {
        defaultFlowStepStrategies.add(flowStepStrategyFactory);
    }

    public void addDefaultFlowStepStrategy(Class<? extends FlowStepStrategy<JobConf>> klass) {
        defaultFlowStepStrategies.add(new SimpleFlowStepStrategyFactory(klass));
    }

    public void clearDefaultFlowStepStrategies() {
        defaultFlowStepStrategies.clear();
    }

    public void addSerialization(Class<? extends Serialization> serialization) {
        serializations.add(serialization);
        conf = null;
    }

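    // Assigns a compact numeric token to a class for Cascading tuple serialization ("cascading.serialization.tokens").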
    public void addSerializationToken(int token, Class<?> klass) {
        if (token < 128) {
            throw new IllegalArgumentException(
                    "Serialization tokens must be >= 128 (lower numbers are reserved by Cascading)");
        }

        if (serializationTokens.containsKey(token) && !serializationTokens.get(token).equals(klass)) {
            throw new IllegalArgumentException(
                    "Token " + token + " is already assigned to class " + serializationTokens.get(token));
        }

        serializationTokens.put(token, klass);
    }

    private Map<String, String> getSerializationsProperty() {
        // Get the existing serializations
        List<String> strings = new ArrayList<String>();

        String existing = new JobConf().get("io.serializations");
        if (existing != null) {
            strings.add(existing);
        }

        // Append our custom serializations
        for (Class<? extends Serialization> klass : serializations) {
            strings.add(klass.getName());
        }

        return Collections.singletonMap("io.serializations", StringUtils.join(strings, ","));
    }

    private Map<String, String> getSerializationTokensProperty() {
        List<String> strings = new ArrayList<String>();
        for (Map.Entry<Integer, Class<?>> entry : serializationTokens.entrySet()) {
            strings.add(entry.getKey() + "=" + entry.getValue().getName());
        }
        if (strings.isEmpty()) {
            return Collections.emptyMap();
        } else {
            return Collections.singletonMap("cascading.serialization.tokens", StringUtils.join(strings, ","));
        }
    }

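    // Merges the serialization settings with the user-supplied defaults; explicitly set default properties win on key conflicts.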
    public Map<Object, Object> getDefaultProperties() {
        Map<Object, Object> properties = new HashMap<Object, Object>();
        properties.putAll(getSerializationsProperty());
        properties.putAll(getSerializationTokensProperty());
        properties.putAll(defaultProperties);
        return properties;
    }

    public JobConf getJobConf() {
        if (conf == null) {
            conf = new JobConf();
            setAll(conf, getSerializationsProperty());
            setAll(conf, getSerializationTokensProperty());
        }
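        // Return a copy so callers cannot mutate the cached configuration.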
        return new JobConf(conf);
    }

    public FlowConnector getFlowConnector() {
        return realGetFlowConnector(Collections.<Object, Object>emptyMap(),
                Collections.<FlowStepStrategy<JobConf>>emptyList());
    }

    public FlowConnector getFlowConnector(Map<Object, Object> properties) {
        return realGetFlowConnector(properties, Collections.<FlowStepStrategy<JobConf>>emptyList());
    }

    public FlowConnector getFlowConnector(List<FlowStepStrategy<JobConf>> flowStepStrategies) {
        return realGetFlowConnector(Collections.<Object, Object>emptyMap(), flowStepStrategies);
    }

    public FlowConnector getFlowConnector(Map<Object, Object> properties,
            List<FlowStepStrategy<JobConf>> flowStepStrategies) {
        return realGetFlowConnector(properties, flowStepStrategies);
    }

    // All getFlowConnector overloads delegate to this method so that the default flow name,
    // derived from the caller's stack position, is computed at a consistent stack depth.
    private FlowConnector realGetFlowConnector(Map<Object, Object> properties,
            List<FlowStepStrategy<JobConf>> flowStepStrategies) {
        // Add in the default properties
        Map<Object, Object> combinedProperties = getDefaultProperties();
        combinedProperties.putAll(properties);

        // Add in the default flow step strategies
        List<FlowStepStrategy<JobConf>> combinedStrategies = new ArrayList<FlowStepStrategy<JobConf>>(
                flowStepStrategies);
        for (FlowStepStrategyFactory<JobConf> flowStepStrategyFactory : defaultFlowStepStrategies) {
            combinedStrategies.add(flowStepStrategyFactory.getFlowStepStrategy());
        }

        return new LoggingFlowConnector(combinedProperties, new MultiFlowStepStrategy(combinedStrategies),
                OperationStatsUtils.formatStackPosition(OperationStatsUtils.getStackPosition(2)));
    }

    public FlowProcess<JobConf> getFlowProcess() {
        return getFlowProcess(getJobConf());
    }

    public FlowProcess<JobConf> getFlowProcess(JobConf jobConf) {
        return new HadoopFlowProcess(jobConf);
    }

    private void setAll(Configuration conf, Map<String, String> properties) {
        for (Map.Entry<String, String> property : properties.entrySet()) {
            conf.set(property.getKey(), property.getValue());
        }
    }
}
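
Usage

Below is a minimal, hypothetical usage sketch (not part of the original file). It assumes cascading_ext, Cascading, and Hadoop are on the classpath; the property key passed to setDefaultProperty is only illustrative, and the Tap/Pipe wiring needed to run a real flow is omitted.

// Hypothetical example class; illustrates the CascadingUtil API shown above.
import cascading.flow.FlowConnector;
import cascading.flow.FlowProcess;
import com.liveramp.cascading_ext.CascadingUtil;
import org.apache.hadoop.io.serializer.WritableSerialization;
import org.apache.hadoop.mapred.JobConf;

public class CascadingUtilExample {
    public static void main(String[] args) {
        CascadingUtil util = CascadingUtil.get();

        // Register an additional Hadoop serialization; it is appended to io.serializations.
        util.addSerialization(WritableSerialization.class);

        // Illustrative property key only; any Hadoop/Cascading property can be set as a default.
        util.setDefaultProperty("mapred.reduce.tasks", "10");

        // getJobConf() returns a copy, so local changes do not leak back into the cached conf.
        JobConf conf = util.getJobConf();

        // FlowConnector carrying the default properties and the default flow step strategies
        // (job renaming and bloom-join support).
        FlowConnector connector = util.getFlowConnector();

        // FlowProcess backed by the same configuration, e.g. for opening Taps directly.
        FlowProcess<JobConf> flowProcess = util.getFlowProcess();
    }
}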