// Java tutorial
/*********************************************************************************************************************** * * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu) * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. * **********************************************************************************************************************/ package eu.stratosphere.core.testing; import java.io.Closeable; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.IdentityHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.junit.Assert; import com.esotericsoftware.kryo.util.UnsafeUtil; import eu.stratosphere.api.common.Plan; import eu.stratosphere.api.common.io.FileInputFormat; import eu.stratosphere.api.common.io.FileOutputFormat; import eu.stratosphere.api.common.io.GenericInputFormat; import eu.stratosphere.api.common.operators.Operator; import eu.stratosphere.api.common.operators.base.FileDataSinkBase; import eu.stratosphere.api.common.operators.base.FileDataSourceBase; import eu.stratosphere.api.common.operators.base.GenericDataSinkBase; import eu.stratosphere.api.common.operators.base.GenericDataSourceBase; import eu.stratosphere.client.LocalExecutor; import eu.stratosphere.configuration.ConfigConstants; import eu.stratosphere.configuration.Configuration; import 
eu.stratosphere.configuration.GlobalConfiguration; import eu.stratosphere.core.fs.FileStatus; import eu.stratosphere.core.fs.FileSystem; import eu.stratosphere.core.fs.Path; import eu.stratosphere.core.testing.io.SequentialInputFormat; import eu.stratosphere.core.testing.io.SequentialOutputFormat; import eu.stratosphere.nephele.services.memorymanager.UnsafeMemorySegment; import eu.stratosphere.nephele.taskmanager.TaskManager; import eu.stratosphere.util.StringUtils; import eu.stratosphere.util.Visitor; /** * Base class for type-specific test plans. See {@link TestPlan} for the core {@link Record} implementation. * * @param <T> * the record type * @param <Records> * record type-specific GenericTestRecords */ public abstract class GenericTestPlan<T, Records extends GenericTestRecords<T>> implements Closeable { private final Map<GenericDataSinkBase<T>, Records> actualOutputs = new IdentityHashMap<GenericDataSinkBase<T>, Records>(); private final Operator<?>[] contracts; private static final Log LOG = LogFactory.getLog(GenericTestPlan.class); private int degreeOfParallelism = 1; private final Map<GenericDataSinkBase<T>, Records> expectedOutputs = new IdentityHashMap<GenericDataSinkBase<T>, Records>(); private final Map<GenericDataSourceBase<?, ?>, Records> inputs = new IdentityHashMap<GenericDataSourceBase<?, ?>, Records>(); private final List<GenericDataSinkBase<T>> sinks = new ArrayList<GenericDataSinkBase<T>>(); private final List<GenericDataSourceBase<?, ?>> sources = new ArrayList<GenericDataSourceBase<?, ?>>(); private TypeConfig<T> defaultConfig; /** * Initializes TestPlan with the given {@link Operator<?>}s. Like the original {@link Plan}, the contracts may be * {@link GenericDataSinkBase<T>}s. However, it * is also possible to add arbitrary Operators, to which GenericDataSinkOperators * are automatically added. 
* * @param defaultConfig * the {@link TypeConfig} that is used for the plan if no specific config is given * @param contracts * a list of Operators with at least one element. */ public GenericTestPlan(final TypeConfig<T> defaultConfig, final Collection<? extends Operator<?>> contracts) { this(defaultConfig, contracts.toArray(new Operator<?>[contracts.size()])); } /** * Initializes TestPlan with the given {@link Operator<?>}s. Like the original {@link Plan}, the contracts may be * {@link GenericDataSinkBase<T>}s. However, it * is also possible to add arbitrary Operators, to which GenericDataSinkOperators * are automatically added. * * @param defaultConfig * the {@link TypeConfig} that is used for the plan if no specific config is given * @param contracts * a list of Operators with at least one element. */ public GenericTestPlan(final TypeConfig<T> defaultConfig, final Operator<?>... contracts) { if (contracts.length == 0) throw new IllegalArgumentException(); this.defaultConfig = defaultConfig; final Configuration config = new Configuration(); config.setString(ConfigConstants.DEFAULT_INSTANCE_TYPE, "standard,1,1,200,1,1"); GlobalConfiguration.includeConfiguration(config); this.contracts = new InputOutputAdder<T>(defaultConfig).process(contracts); this.findSinksAndSources(); this.configureSinksAndSources(); } @Override public void close() throws IOException { final ClosableManager closableManager = new ClosableManager(); for (final Records pairs : this.inputs.values()) closableManager.add(pairs); for (final Records pairs : this.actualOutputs.values()) closableManager.add(pairs); for (final Records pairs : this.expectedOutputs.values()) closableManager.add(pairs); closableManager.close(); } /** * Returns the first output {@link GenericTestRecords} of the TestPlan associated with the * given sink. This is the recommended method to get output records for more * complex TestPlans.<br> * The values are only meaningful after a {@link #run()}. 
* * @return the output {@link GenericTestRecords} of the TestPlan associated with the * first sink */ public Records getActualOutput() { return this.getActualOutput(0); } /** * Returns the output {@link GenericTestRecords} of the TestPlan associated with the * given sink. This is the recommended method to get output records for more * complex TestPlans.<br> * The values are only meaningful after a {@link #run()}. * * @param sink * the sink of which the associated output GenericTestRecords should be * returned * @return the output {@link GenericTestRecords} of the TestPlan associated with the * given sink */ public Records getActualOutput(final GenericDataSinkBase<T> sink) { return this.getActualOutput(sink, null); } /** * Returns the output {@link GenericTestRecords} of the TestPlan associated with the * given sink. This is the recommended method to get output records for more * complex TestPlans.<br> * The values are only meaningful after a {@link #run()}. * * @param typeConfig * the {@link TypeConfig} that is used for this output * @param sink * the sink of which the associated output GenericTestRecords should be * returned * @return the output {@link GenericTestRecords} of the TestPlan associated with the * given sink */ public Records getActualOutput(final GenericDataSinkBase<T> sink, final TypeConfig<T> typeConfig) { Records values = this.actualOutputs.get(sink); if (values == null) this.actualOutputs.put(sink, values = this.createTestRecords(typeConfig)); else if (typeConfig != null) values.setTypeConfig(typeConfig); return values; } /** * Returns the output {@link GenericTestRecords} associated with the <i>i</i>th * output of the TestPlan. If multiple contracts are tested in the TestPlan, * it is recommended to use the {@link #getActualOutput(GenericDataSinkBase<T>)} method to unambiguously get the * values.<br> * The values are only meaningful after a {@link #run()}. * * @param number * the number of the output. 
* @return the <i>i</i>th output of the TestPlan */ public Records getActualOutput(final int number) { return this.getActualOutput(this.getDataSinks().get(number)); } /** * Returns the output {@link GenericTestRecords} of the TestPlan associated with the * <i>i</i>th sink.<br> * The values are only meaningful after a {@link #run()}. * * @param typeConfig * the {@link TypeConfig} that is used for this output * @param sinkNumber * the <i>i</i>th sink of which the associated output GenericTestRecords should be * returned * @return the <i>i</i>th output of the TestPlan */ public Records getActualOutput(final int sinkNumber, final TypeConfig<T> typeConfig) { return this.getActualOutput(this.sinks.get(sinkNumber), typeConfig); } /** * Returns the first output {@link GenericTestRecords} of the TestPlan associated with the * given sink. This is the recommended method to get output records for more * complex TestPlans.<br> * The values are only meaningful after a {@link #run()}. * * @param typeConfig * the {@link TypeConfig} that is used for this output * @return the output {@link GenericTestRecords} of the TestPlan associated with the * first sink */ public Records getActualOutput(final TypeConfig<T> typeConfig) { return this.getActualOutput(0, typeConfig); } /** * Returns the degreeOfParallelism. * * @return the degreeOfParallelism */ public int getDegreeOfParallelism() { return this.degreeOfParallelism; } /** * Returns the expected output {@link GenericTestRecords} with the given {@link TypeConfig} of the TestPlan * associated with the given sink. This is the recommended method to set expected * output records for more complex TestPlans. 
* * @param sink * the sink of which the associated expected output GenericTestRecords * should be returned * @param typeConfig * the TypeConfig that should be used to create a new GenericTestRecords if needed * @return the expected output {@link GenericTestRecords} of the TestPlan associated * with the given sink */ public Records getExpectedOutput(final GenericDataSinkBase<T> sink, final TypeConfig<T> typeConfig) { Records values = this.expectedOutputs.get(sink); if (values == null) { this.expectedOutputs.put(sink, values = this.createTestRecords(typeConfig)); final Records actualOutput = this.getActualOutput(sink); actualOutput.setTypeConfig(typeConfig); } else if (typeConfig != null) values.setTypeConfig(typeConfig); return values; } /** * Returns the expected output {@link GenericTestRecords} associated with the * <i>i</i>th expected output of the TestPlan. If multiple contracts are * tested in the TestPlan, it is recommended to use the {@link #getExpectedOutput(GenericDataSinkBase<T>, * TypeConfig)} method to * unambiguously set the values. * * @param typeConfig * the TypeConfig that should be used to create a new GenericTestRecords if needed * @param number * the number of the expected output. * @return the <i>i</i>th expected output of the TestPlan */ public Records getExpectedOutput(final int number, final TypeConfig<T> typeConfig) { return this.getExpectedOutput(this.getDataSinks().get(number), typeConfig); } /** * Returns the first expected output {@link GenericTestRecords} of the TestPlan. If * multiple contracts are tested in the TestPlan, it is recommended to use * the {@link #getExpectedOutput(GenericDataSinkBase<T>, TypeConfig)} method to unambiguously * set the values. * * @param typeConfig * @return the first expected output of the TestPlan */ public Records getExpectedOutput(final TypeConfig<T> typeConfig) { return this.getExpectedOutput(0, typeConfig); } /** * Returns the first input {@link GenericTestRecords} of the TestPlan. 
If multiple * contracts are tested in the TestPlan, it is recommended to use the {@link #getInput(GenericDataSource)} method * to unambiguously set the * values. * * @return the first input of the TestPlan */ public Records getInput() { return this.getInput(0); } /** * Returns the input {@link GenericTestRecords} of the TestPlan associated with the * given source. This is the recommended method to set input records for more * complex TestPlans. * * @param source * the source of which the associated input GenericTestRecords should be * returned * @return the input {@link GenericTestRecords} of the TestPlan associated with the * given source */ public Records getInput(final Operator<?> source) { return this.getInput(source, null); } /** * Returns the input {@link GenericTestRecords} of the TestPlan associated with the * given source. This is the recommended method to set input records for more * complex TestPlans. * * @param typeConfig * the TypeConfig that should be used to create a new GenericTestRecords if needed * @param source * the source of which the associated input GenericTestRecords should be * returned * @return the input {@link GenericTestRecords} of the TestPlan associated with the * given source */ public Records getInput(final Operator<?> source, final TypeConfig<T> typeConfig) { Records values = this.inputs.get(source); if (values == null) this.inputs.put((GenericDataSourceBase<?, ?>) source, values = this.createTestRecords(typeConfig)); else if (typeConfig != null) values.setTypeConfig(typeConfig); return values; } /** * Returns the input {@link GenericTestRecords} associated with the <i>i</i>th input * of the TestPlan. If multiple contracts are tested in the TestPlan, it is * recommended to use the {@link #getInput(GenericDataSource)} method to * unambiguously set the values. * * @param number * the number of the input. 
* @return the <i>i</i>th input of the TestPlan */ public Records getInput(final int number) { return this.getInput(number, null); } /** * Returns the input {@link GenericTestRecords} associated with the <i>i</i>th input * of the TestPlan. If multiple contracts are tested in the TestPlan, it is * recommended to use the {@link #getInput(GenericDataSource, TypeConfig)} method to * unambiguously set the values. * * @param number * the number of the input. * @param typeConfig * the TypeConfig that should be used to create a new GenericTestRecords if needed * @return the <i>i</i>th input of the TestPlan */ public Records getInput(final int number, final TypeConfig<T> typeConfig) { return this.getInput(this.getDataSources().get(number), typeConfig); } /** * Traverses the test plan and returns the first contracts that process the * data of the given contract. * * @param contract * the contract of which one preceding contracts should be * returned * @return returns the first contract that process the data of the given * contract */ public Operator<?> getOutputOfOperator(final Operator<?> contract) { return this.getOutputsOfOperator(contract)[0]; } /** * Traverses the test plan and returns all contracts that process the data * of the given contract. 
* * @param contract * the contract of which preceding contracts should be returned * @return returns all contracts that process the data of the given contract */ public Operator<?>[] getOutputsOfOperator(final Operator<?> contract) { final ArrayList<Operator<?>> outputs = new ArrayList<Operator<?>>(); for (final Operator<?> sink : this.sinks) sink.accept(new Visitor<Operator<?>>() { LinkedList<Operator<?>> outputStack = new LinkedList<Operator<?>>(); @Override public void postVisit(final Operator<?> visitable) { } @Override public boolean preVisit(final Operator<?> visitable) { if (visitable == contract) outputs.add(this.outputStack.peek()); this.outputStack.push(visitable); return true; } }); return outputs.toArray(new Operator<?>[outputs.size()]); } /** * Returns all {@link GenericDataSinkBase<T>}s of this test plan. * * @return the sinks */ public List<GenericDataSinkBase<T>> getSinks() { return this.sinks; } /** * Returns the sources. * * @return the sources */ public List<GenericDataSourceBase<?, ?>> getSources() { return this.sources; } /** * Compiles the plan to an {@link Plan} and executes it. If * expected values have been specified, the actual outputs values are * compared to the expected values. 
*/ public void run() { try { final Plan plan = this.buildPlanWithReadableSinks(); this.syncDegreeOfParallelism(plan); this.initAdhocInputs(); // Configuration memoryLimit = new Configuration(); // memoryLimit.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, (int) (Runtime.getRuntime().totalMemory() / 2)); // GlobalConfiguration.includeConfiguration(memoryLimit); byte[] reservedSpace = new byte[20 * 1024 * 1024]; LocalExecutor.execute(plan); LOG.trace("Reserving " + reservedSpace.length + " bytes for Sopremo"); } catch (final Exception e) { Assert.fail("plan scheduling: " + e.getMessage() + "\n" + StringUtils.stringifyException(e)); } try { this.validateResults(); } finally { try { this.close(); } catch (final IOException e) { } } } /** * Sets the degreeOfParallelism to the specified value. * * @param degreeOfParallelism * the degreeOfParallelism to set */ public void setDegreeOfParallelism(final int degreeOfParallelism) { this.degreeOfParallelism = degreeOfParallelism; } /** * Creates the actual plan for the given sinks. * * @param wrappedSinks * the sinks * @return the plan */ protected Plan createPlan(final Collection<GenericDataSinkBase<T>> wrappedSinks) { return new Plan(wrappedSinks); } /** * Creates the concrete implementation of {@link GenericTestRecords} for this plan. * * @param typeConfig * the {@link TypeConfig} to use * @return the test records */ protected abstract Records createTestRecords(final TypeConfig<T> typeConfig); /** * Returns the defaultConfig. * * @return the defaultConfig */ protected TypeConfig<T> getDefaultConfig() { return this.defaultConfig; } /** * Sets the defaultConfig to the specified value. 
* * @param defaultConfig * the defaultConfig to set */ protected void setDefaultConfig(final TypeConfig<T> defaultConfig) { if (defaultConfig == null) throw new NullPointerException("defaultConfig must not be null"); this.defaultConfig = defaultConfig; } /** * Actually builds the plan but guarantees that the output can be read * without additional knowledge. Currently the {@link SequentialOutputFormat} is used for a guaranteed * deserializable * output.<br> * If a data source is not {@link SequentialOutputFormat}, it is replaced by * a {@link SplittingOutputFormat}, with two outputs: the original one and * one {@link SequentialOutputFormat}. */ private Plan buildPlanWithReadableSinks() { final Collection<GenericDataSinkBase<T>> existingSinks = this.getDataSinks(); final Collection<GenericDataSinkBase<T>> wrappedSinks = new ArrayList<GenericDataSinkBase<T>>(); for (final GenericDataSinkBase<T> fileSink : existingSinks) { final Configuration inputConfig = new Configuration(); final TypeConfig<T> typeConfig = this.getActualOutput(fileSink, this.defaultConfig).getTypeConfig(); SequentialInputFormat.configureSequentialFormat(inputConfig) .typeSerializer(typeConfig.getTypeSerializerFactory()); // need a format which is deserializable without configuration if (!fileSink.getFormatWrapper().getUserCodeClass().equals(SequentialOutputFormat.class)) { final Records expectedValues = this.expectedOutputs.get(fileSink); final FileDataSinkBase<T> safeSink = createDefaultSink(fileSink.getName(), typeConfig); safeSink.setInput(fileSink.getInput()); wrappedSinks.add(fileSink); wrappedSinks.add(safeSink); // only add to expected outputs if we need to check for values if (expectedValues != null) this.expectedOutputs.put(safeSink, expectedValues); this.actualOutputs.put(safeSink, this.getActualOutput(fileSink)); this.getActualOutput(fileSink).load(SequentialInputFormat.class, safeSink.getFilePath(), inputConfig); } else { wrappedSinks.add(fileSink); 
this.getActualOutput(fileSink).load(SequentialInputFormat.class, ((FileDataSinkBase<T>) fileSink).getFilePath(), inputConfig); // make sure that the type serializer is set final Configuration outputConfig = new Configuration(); SequentialOutputFormat.configureSequentialFormat(outputConfig) .typeSerializer(typeConfig.getTypeSerializerFactory()); fileSink.getParameters().addAll(outputConfig); } } return this.createPlan(wrappedSinks); } /** * */ private void configureSinksAndSources() { for (final GenericDataSinkBase<T> sink : this.sinks) if (sink.getFormatWrapper().getUserCodeObject() instanceof FileOutputFormat<?>) ((FileOutputFormat<?>) sink.getFormatWrapper().getUserCodeObject()).setOpenTimeout(0); for (final GenericDataSourceBase<?, ?> source : this.sources) if (source.getFormatWrapper().getUserCodeObject() instanceof FileInputFormat<?>) ((FileInputFormat<?>) source.getFormatWrapper().getUserCodeObject()).setOpenTimeout(0); } /** * Traverses the plan for all sinks and sources. */ @SuppressWarnings({ "unchecked", "rawtypes" }) private void findSinksAndSources() { for (final Operator<?> contract : this.contracts) contract.accept(new Visitor<Operator<?>>() { @Override public void postVisit(final Operator<?> visitable) { } @Override public boolean preVisit(final Operator<?> visitable) { if (visitable instanceof GenericDataSinkBase && !GenericTestPlan.this.sinks.contains(visitable)) GenericTestPlan.this.sinks.add((GenericDataSinkBase<T>) visitable); if (visitable instanceof GenericDataSourceBase<?, ?> && !GenericTestPlan.this.sources.contains(visitable)) GenericTestPlan.this.sources.add((GenericDataSourceBase<?, ?>) visitable); return true; } }); for (final GenericDataSourceBase<?, ?> source : this.sources) if (source instanceof FileDataSourceBase) this.getInput(source).load( (Class<? extends FileInputFormat>) source.getFormatWrapper().getUserCodeClass(), ((FileDataSourceBase) source).getFilePath(), source.getParameters()); else this.getInput(source).load( (Class<? 
extends GenericInputFormat>) source.getFormatWrapper().getUserCodeClass(), source.getParameters()); } private List<GenericDataSinkBase<T>> getDataSinks() { return this.sinks; } private List<? extends GenericDataSourceBase<?, ?>> getDataSources() { return this.sources; } private void initAdhocInputs() throws IOException { for (final GenericDataSourceBase<?, ?> source : this.sources) { final Records input = this.getInput(source, this.defaultConfig); if (source.getFormatWrapper().getUserCodeClass().equals(SequentialInputFormat.class)) SequentialInputFormat.configureSequentialFormat(source) .typeSerializer(input.getTypeConfig().getTypeSerializerFactory()); if (input.isAdhoc() && source instanceof FileDataSourceBase) input.saveToFile(((FileDataSourceBase<?>) source).getFilePath()); } } /** * Sets the degree of parallelism for every node in the plan. */ private void syncDegreeOfParallelism(final Plan plan) { plan.accept(new Visitor<Operator<?>>() { @Override public void postVisit(final Operator<?> visitable) { } @Override public boolean preVisit(final Operator<?> visitable) { int degree = GenericTestPlan.this.getDegreeOfParallelism(); if (visitable instanceof GenericDataSourceBase<?, ?>) degree = 1; else if (degree > 1 && visitable instanceof FileDataSinkBase) try { final Path path = new Path(((FileDataSinkBase<?>) visitable).getFilePath()); final FileSystem fs = path.getFileSystem(); final FileStatus f = fs.getFileStatus(path); if (!f.isDir()) { fs.delete(path, false); fs.mkdirs(path); } } catch (final IOException e) { e.printStackTrace(); } if (visitable.getDegreeOfParallelism() == -1) visitable.setDegreeOfParallelism(degree); return true; } }); } private void validateResults() { for (final GenericDataSinkBase<T> sinkOperator : this.getDataSinks()) { final Records expectedValues = this.expectedOutputs.get(sinkOperator); // need a format which is deserializable without configuration if (sinkOperator.getFormatWrapper().getUserCodeClass() == (Class<?>) 
SequentialOutputFormat.class && expectedValues != null && expectedValues.isInitialized()) { final Records actualValues = this.getActualOutput(sinkOperator); try { actualValues.assertEquals(expectedValues); } catch (final AssertionError e) { final AssertionError assertionError = new AssertionError( sinkOperator.getName() + ": " + e.getMessage()); assertionError.initCause(e.getCause()); throw assertionError; } finally { actualValues.close(); } } } } /** * Creates a default sink with the given name. This sink may be used with ad-hoc values added to the corresponding * {@link GenericTestRecords}. * * @param name * the name of the sink * @param typeConfig * the {@link TypeConfig} used in {@link SequentialOutputFormat} * @return the created sink */ public static <T> FileDataSinkBase<T> createDefaultSink(final String name, final TypeConfig<T> typeConfig) { @SuppressWarnings("unchecked") final FileDataSinkBase<T> sink = new FileDataSinkBase<T>((FileOutputFormat<T>) new SequentialOutputFormat(), typeConfig.getSink(), getTestPlanFile("output"), name); SequentialOutputFormat.configureSequentialFormat(sink) .typeSerializer(typeConfig.getTypeSerializerFactory()); return sink; } /** * Creates a default source with the given name. This sink may be used with ad-hoc values added to the corresponding * {@link GenericTestRecords}. 
* * @param name * the name of the source * @param typeConfig * the {@link TypeConfig} used in {@link SequentialInputFormat} * @return the created source */ @SuppressWarnings({ "unchecked", "rawtypes" }) public static FileDataSourceBase createDefaultSource(final String name, final TypeConfig<?> typeConfig) { final FileDataSourceBase source = new FileDataSourceBase(new SequentialInputFormat(), typeConfig.getSource(), getTestPlanFile("input"), name); SequentialInputFormat.configureSequentialFormat(source) .typeSerializer(typeConfig.getTypeSerializerFactory()); return source; } static String getTestPlanFile(final String prefix) { return createTemporaryFile("testPlan", prefix); } private static String createTemporaryFile(final String suffix, final String prefix) { try { final File tempFile = File.createTempFile(suffix, prefix); tempFile.deleteOnExit(); return tempFile.toURI().toString(); } catch (final IOException e) { throw new IllegalStateException("Cannot create temporary file for prefix " + prefix, e); } } }