Java tutorial
/** * Copyright 2012 LiveRamp * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.liveramp.cascading_ext; import cascading.flow.FlowConnector; import cascading.flow.FlowProcess; import cascading.flow.FlowStepStrategy; import cascading.flow.hadoop.HadoopFlowProcess; import com.liveramp.cascading_ext.bloom.BloomAssemblyStrategy; import com.liveramp.cascading_ext.bloom.BloomProps; import com.liveramp.cascading_ext.flow.LoggingFlowConnector; import com.liveramp.cascading_ext.flow_step_strategy.FlowStepStrategyFactory; import com.liveramp.cascading_ext.flow_step_strategy.MultiFlowStepStrategy; import com.liveramp.cascading_ext.flow_step_strategy.RenameJobStrategy; import com.liveramp.cascading_ext.flow_step_strategy.SimpleFlowStepStrategyFactory; import com.liveramp.cascading_ext.util.OperationStatsUtils; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.serializer.Serialization; import org.apache.hadoop.mapred.JobConf; import java.util.*; public class CascadingUtil { public static final String CASCADING_RUN_ID = "cascading_ext.cascading.run.id"; private static final CascadingUtil INSTANCE = new CascadingUtil(); public static CascadingUtil get() { return INSTANCE; } protected CascadingUtil() { addDefaultFlowStepStrategy(RenameJobStrategy.class); addDefaultFlowStepStrategy(BloomAssemblyStrategy.class); defaultProperties.putAll(BloomProps.getDefaultProperties()); } private final Map<Object, Object> defaultProperties = new HashMap<Object, Object>(); private final List<FlowStepStrategyFactory<JobConf>> defaultFlowStepStrategies = new ArrayList<FlowStepStrategyFactory<JobConf>>(); private final Set<Class<? extends Serialization>> serializations = new HashSet<Class<? extends Serialization>>(); private final Map<Integer, Class<?>> serializationTokens = new HashMap<Integer, Class<?>>(); private transient JobConf conf = null; public void setDefaultProperty(Object key, Object value) { defaultProperties.put(key, value); conf = null; } public void addDefaultFlowStepStrategy(FlowStepStrategyFactory<JobConf> flowStepStrategyFactory) { defaultFlowStepStrategies.add(flowStepStrategyFactory); } public void addDefaultFlowStepStrategy(Class<? extends FlowStepStrategy<JobConf>> klass) { defaultFlowStepStrategies.add(new SimpleFlowStepStrategyFactory(klass)); } public void clearDefaultFlowStepStrategies() { defaultFlowStepStrategies.clear(); } public void addSerialization(Class<? extends Serialization> serialization) { serializations.add(serialization); conf = null; } public void addSerializationToken(int token, Class<?> klass) { if (token < 128) { throw new IllegalArgumentException( "Serialization tokens must be >= 128 (lower numbers are reserved by Cascading)"); } if (serializationTokens.containsKey(token) && !serializationTokens.get(token).equals(klass)) { throw new IllegalArgumentException( "Token " + token + " is already assigned to class " + serializationTokens.get(token)); } serializationTokens.put(token, klass); } private Map<String, String> getSerializationsProperty() { // Get the existing serializations List<String> strings = new ArrayList<String>(); String existing = new JobConf().get("io.serializations"); if (existing != null) { strings.add(existing); } // Append our custom serializations for (Class<? extends Serialization> klass : serializations) { strings.add(klass.getName()); } return Collections.singletonMap("io.serializations", StringUtils.join(strings, ",")); } private Map<String, String> getSerializationTokensProperty() { List<String> strings = new ArrayList<String>(); for (Map.Entry<Integer, Class<?>> entry : serializationTokens.entrySet()) { strings.add(entry.getKey() + "=" + entry.getValue().getName()); } if (strings.isEmpty()) { return Collections.emptyMap(); } else { return Collections.singletonMap("cascading.serialization.tokens", StringUtils.join(strings, ",")); } } public Map<Object, Object> getDefaultProperties() { Map<Object, Object> properties = new HashMap<Object, Object>(); properties.putAll(getSerializationsProperty()); properties.putAll(getSerializationTokensProperty()); properties.putAll(defaultProperties); return properties; } public JobConf getJobConf() { if (conf == null) { conf = new JobConf(); setAll(conf, getSerializationsProperty()); setAll(conf, getSerializationTokensProperty()); } return new JobConf(conf); } public FlowConnector getFlowConnector() { return realGetFlowConnector(Collections.<Object, Object>emptyMap(), Collections.<FlowStepStrategy<JobConf>>emptyList()); } public FlowConnector getFlowConnector(Map<Object, Object> properties) { return realGetFlowConnector(properties, Collections.<FlowStepStrategy<JobConf>>emptyList()); } public FlowConnector getFlowConnector(List<FlowStepStrategy<JobConf>> flowStepStrategies) { return realGetFlowConnector(Collections.<Object, Object>emptyMap(), flowStepStrategies); } public FlowConnector getFlowConnector(Map<Object, Object> properties, List<FlowStepStrategy<JobConf>> flowStepStrategies) { return realGetFlowConnector(properties, flowStepStrategies); } // We extract this method so that the default name based on the stack position makes sense private FlowConnector realGetFlowConnector(Map<Object, Object> properties, List<FlowStepStrategy<JobConf>> flowStepStrategies) { //Add in default properties Map<Object, Object> combinedProperties = getDefaultProperties(); combinedProperties.putAll(properties); //Add in default flow step strategies List<FlowStepStrategy<JobConf>> combinedStrategies = new ArrayList<FlowStepStrategy<JobConf>>( flowStepStrategies); for (FlowStepStrategyFactory<JobConf> flowStepStrategyFactory : defaultFlowStepStrategies) { combinedStrategies.add(flowStepStrategyFactory.getFlowStepStrategy()); } return new LoggingFlowConnector(combinedProperties, new MultiFlowStepStrategy(combinedStrategies), OperationStatsUtils.formatStackPosition(OperationStatsUtils.getStackPosition(2))); } public FlowProcess<JobConf> getFlowProcess() { return getFlowProcess(getJobConf()); } public FlowProcess<JobConf> getFlowProcess(JobConf jobConf) { return new HadoopFlowProcess(jobConf); } private void setAll(Configuration conf, Map<String, String> properties) { for (Map.Entry<String, String> property : properties.entrySet()) { conf.set(property.getKey(), property.getValue()); } } }