Java tutorial
/*
 * Copyright 2012 Alex Holmes
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.alexholmes.hadooputils.test;

import com.alexholmes.hadooputils.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.junit.Assert.assertEquals;

/**
 * A class that helps with testing MapReduce jobs with the
 * {@link org.apache.hadoop.mapred.LocalJobRunner}, which is an in-memory MapReduce
 * implementation.
 * <p/>
 * It allows the user to create input files, and then provides some utility methods to help
 * test the output file contents generated by a MapReduce job.
 */
public class TextIOJobBuilder {

    /**
     * The default key/value separator for input files.
     */
    private String inputSeparator = "\t";

    /**
     * The default key/value separator for output files.
     */
    private String outputSeparator = "\t";

    /**
     * The input directory for the MapReduce job.
     */
    private final Path inputPath;

    /**
     * The output directory for the MapReduce job.
     */
    private final Path outputPath;

    /**
     * The (local) filesystem.
     */
    private final FileSystem fs;

    /**
     * The input strings which are written to the directory specified in {@link #inputPath}.
     */
    private List<String> inputs = new ArrayList<String>();

    /**
     * The expected output strings which are read from the directory specified in
     * {@link #outputPath}.
     */
    private List<String> expectedOutputs = new ArrayList<String>();

    /**
     * Constructor which instantiates input/output paths.
     *
     * @param fs         the file system within which input directory and
     *                   files are created
     * @param inputPath  the input directory where input files will be created
     * @param outputPath the output directory that the MapReduce job will write to
     * @throws IOException if something goes wrong
     */
    public TextIOJobBuilder(final FileSystem fs, final Path inputPath, final Path outputPath)
            throws IOException {
        this.fs = fs;
        if (inputPath == null) {
            this.inputPath = new Path("/input");
        } else {
            this.inputPath = inputPath;
        }
        if (outputPath == null) {
            this.outputPath = new Path("/output");
        } else {
            this.outputPath = outputPath;
        }
    }

    /**
     * Constructor which instantiates input/output paths.
     *
     * @param config     the Hadoop configuration
     * @param inputPath  the input directory where input files will be created
     * @param outputPath the output directory that the MapReduce job will write to
     * @throws IOException if something goes wrong
     */
    public TextIOJobBuilder(final Configuration config, final Path inputPath, final Path outputPath)
            throws IOException {
        this(FileSystem.get(config), inputPath, outputPath);
    }

    /**
     * Constructor which instantiates input/output paths.
     *
     * @param config the Hadoop configuration
     * @throws IOException if something goes wrong
     */
    public TextIOJobBuilder(final Configuration config) throws IOException {
        this(FileSystem.get(config), new Path("/input"), new Path("/output"));
    }

    /**
     * Constructor which instantiates input/output paths.
     *
     * @param config     the Hadoop configuration
     * @param fileSystem the Hadoop file system
     * @throws IOException if something goes wrong
     */
    public TextIOJobBuilder(final Configuration config, final FileSystem fileSystem)
            throws IOException {
        this(fileSystem, null, null);
    }

    /**
     * Constructor which instantiates input/output paths.
     *
     * @param fileSystem the Hadoop file system
     * @throws IOException if something goes wrong
     */
    public TextIOJobBuilder(final FileSystem fileSystem) throws IOException {
        this(fileSystem, null, null);
    }

    /**
     * Set the input file key/value separator.
     *
     * @param separator the separator
     * @return a reference to this object
     */
    public TextIOJobBuilder setInputSeparator(final String separator) {
        this.inputSeparator = separator;
        return this;
    }

    /**
     * Get the input file key/value separator.
     *
     * @return the separator
     */
    public String getInputSeparator() {
        return inputSeparator;
    }

    /**
     * Set the output file key/value separator.
     *
     * @param separator the separator
     * @return a reference to this object
     */
    public TextIOJobBuilder setOutputSeparator(final String separator) {
        this.outputSeparator = separator;
        return this;
    }

    /**
     * Get the output file key/value separator.
     *
     * @return the separator
     */
    public String getOutputSeparator() {
        return outputSeparator;
    }

    /**
     * Add a line to the inputs.
     *
     * @param line an input line
     * @return a reference to this object
     */
    public TextIOJobBuilder addInput(final String line) {
        inputs.add(line);
        return this;
    }

    /**
     * Add a single line to the inputs, where each part is separated by
     * {@link #getInputSeparator()}.
     *
     * @param parts varargs/array of tokens
     * @return a reference to this object
     */
    public TextIOJobBuilder addInput(final String... parts) {
        inputs.add(StringUtils.join(parts, inputSeparator));
        return this;
    }

    /**
     * Add a line to the expected outputs.
     *
     * @param line an output line
     * @return a reference to this object
     */
    public TextIOJobBuilder addExpectedOutput(final String line) {
        expectedOutputs.add(line);
        return this;
    }

    /**
     * Add a single line to the expected outputs, where each part is separated by
     * {@link #getOutputSeparator()}.
     *
     * @param parts varargs/array of tokens
     * @return a reference to this object
     */
    public TextIOJobBuilder addExpectedOutput(final String... parts) {
        expectedOutputs.add(StringUtils.join(parts, outputSeparator));
        return this;
    }

    /**
     * Gathers all the inputs buffered by calls to {@link #addInput(String)} or
     * {@link #addInput(String...)} and writes them to the input directory, in
     * preparation for running the MapReduce job.
     *
     * @return a reference to this object
     * @throws IOException if something goes wrong
     */
    public TextIOJobBuilder writeInputs() throws IOException {

        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        if (fs.exists(inputPath)) {
            fs.delete(inputPath, true);
        }

        fs.mkdirs(inputPath);

        DataOutputStream stream = fs.create(new Path(inputPath, "part-0"));

        IOUtils.writeLines(inputs, String.format("%n"), stream);

        stream.close();

        return this;
    }

    /**
     * Called after the MapReduce job has completed, to verify that the outputs
     * generated by the MapReduce job align with the expected outputs that were
     * set with calls to {@link #addExpectedOutput(String)} and
     * {@link #addExpectedOutput(String...)}.
     *
     * @return a reference to this object
     * @throws IOException if something goes wrong
     */
    public TextIOJobBuilder verifyResults() throws IOException {

        FileStatus[] outputFiles = fs.listStatus(outputPath, new PathFilter() {
            @Override
            public boolean accept(final Path path) {
                return path.getName().startsWith("part");
            }
        });

        System.out.println("Output files: " + StringUtils.join(outputFiles));

        int i = 0;
        for (FileStatus file : outputFiles) {
            List<String> actualLines = FileUtils.readLines(fs, file.getPath());

            for (String actualLine : actualLines) {
                String expectedLine = expectedOutputs.get(i++);
                assertEquals(expectedLine, actualLine);
            }
        }

        assertEquals(expectedOutputs.size(), i);

        return this;
    }

    /**
     * Gets the input path.
     *
     * @return the input path
     */
    public Path getInputPath() {
        return inputPath;
    }

    /**
     * Gets the output path.
     *
     * @return the output path
     */
    public Path getOutputPath() {
        return outputPath;
    }

    /**
     * Get the file system.
     *
     * @return the file system
     */
    public FileSystem getFs() {
        return fs;
    }
}
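
For readers following along, the sketch below shows how a test might exercise this builder end to end: stage inputs, run a job against the in-memory LocalJobRunner, then verify the part files. It is a minimal sketch, assuming an identity map/reduce job wired up with the old org.apache.hadoop.mapred API and Hadoop 1.x-style configuration keys; the test class name, the target/test/* paths, and the job setup are illustrative assumptions and are not defined by TextIOJobBuilder itself.

// Usage sketch only: the identity-job wiring and paths below are assumptions,
// not part of TextIOJobBuilder.
import com.alexholmes.hadooputils.test.TextIOJobBuilder;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.junit.Test;

public class TextIOJobBuilderUsageTest {

    @Test
    public void identityJobMirrorsItsInput() throws Exception {
        JobConf conf = new JobConf();
        conf.set("mapred.job.tracker", "local"); // run with the in-memory LocalJobRunner
        conf.set("fs.default.name", "file:///"); // resolve paths against the local filesystem

        // Stage the input file and record the lines the job is expected to emit.
        // The target/test/* directories are an assumption; any writable path works.
        FileSystem fs = FileSystem.getLocal(conf);
        TextIOJobBuilder builder = new TextIOJobBuilder(fs,
                new Path("target/test/input"), new Path("target/test/output"))
                .addInput("key1", "value1")
                .addInput("key2", "value2")
                .addExpectedOutput("key1", "value1")
                .addExpectedOutput("key2", "value2")
                .writeInputs();

        // Identity job: the old-API defaults (IdentityMapper/IdentityReducer) pass
        // records through untouched, so the output should mirror the input.
        conf.setInputFormat(KeyValueTextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(conf, builder.getInputPath());
        FileOutputFormat.setOutputPath(conf, builder.getOutputPath());

        JobClient.runJob(conf);

        // Compare every line of the job's part files against the expected lines.
        builder.verifyResults();
    }
}

Because every add and set method returns this, the staging calls chain fluently, and writeInputs() deletes any stale input/output directories before writing a single part-0 file, so a test like this can be rerun without manual cleanup.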