org.apache.hadoop.streaming.TestMultipleArchiveFiles.java Source code

Introduction

Here is the source code for org.apache.hadoop.streaming.TestMultipleArchiveFiles.java, a test that ships two zip archives to a Hadoop Streaming job with the -cacheArchive option and compares the job's output with the expected output.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.streaming;

import java.io.File;
import java.io.IOException;
import java.io.DataOutputStream;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.hdfs.MiniDFSCluster;

/**
 * This class tests the cacheArchive option of Hadoop Streaming.
 * The test creates two archive files, ships them with the streaming
 * job via -cacheArchive, and compares the job output with the
 * expected output.
 */
public class TestMultipleArchiveFiles extends TestStreaming {
    private static final Log LOG = LogFactory.getLog(TestMultipleArchiveFiles.class);

    private StreamJob job;
    private String INPUT_DIR = "multiple-archive-files/";
    private String INPUT_FILE = INPUT_DIR + "input.txt";
    private String CACHE_ARCHIVE_1 = INPUT_DIR + "cacheArchive1.zip";
    private File CACHE_FILE_1 = null;
    private String CACHE_ARCHIVE_2 = INPUT_DIR + "cacheArchive2.zip";
    private File CACHE_FILE_2 = null;
    private String expectedOutput = null;
    private String OUTPUT_DIR = "out";
    private Configuration conf = null;
    private MiniDFSCluster dfs = null;
    private MiniMRCluster mr = null;
    private FileSystem fileSys = null;
    private String namenode = null;

    public TestMultipleArchiveFiles() throws Exception {
        CACHE_FILE_1 = new File("cacheArchive1");
        CACHE_FILE_2 = new File("cacheArchive2");
        input = "HADOOP";
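        // Streaming treats each mapper output line as the key; TextOutputFormat
        // then writes "key<TAB>" plus an empty value, hence the trailing tabs.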
        expectedOutput = "HADOOP\t\nHADOOP\t\n";
        conf = new Configuration();
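        // Bring up a single-node HDFS mini-cluster and a one-tracker
        // MapReduce mini-cluster backed by it.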
        dfs = new MiniDFSCluster.Builder(conf).build();
        fileSys = dfs.getFileSystem();
        namenode = fileSys.getUri().getAuthority();
        mr = new MiniMRCluster(1, namenode, 1);

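        // XARGS_CAT treats each input line as a file name and emits that
        // file's contents; CAT simply copies its input to its output.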
        map = XARGS_CAT;
        reduce = CAT;
    }

    @Override
    protected void setInputOutput() {
        inputFile = INPUT_FILE;
        outDir = OUTPUT_DIR;
    }

    protected void createInput() throws IOException {
        fileSys.delete(new Path(INPUT_DIR), true);
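        // Each input line names a file as seen through the symlinks that
        // the -cacheArchive option creates in the task's working directory.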
        DataOutputStream dos = fileSys.create(new Path(INPUT_FILE));
        String inputFileString = "symlink1" + File.separator + "cacheArchive1\nsymlink2" + File.separator
                + "cacheArchive2";
        dos.write(inputFileString.getBytes("UTF-8"));
        dos.close();

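        // Write the first zip archive straight to the test filesystem; its
        // single entry holds the test input string.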
        DataOutputStream out = fileSys.create(new Path(CACHE_ARCHIVE_1));
        ZipOutputStream zos = new ZipOutputStream(out);
        ZipEntry ze = new ZipEntry(CACHE_FILE_1.toString());
        zos.putNextEntry(ze);
        zos.write(input.getBytes("UTF-8"));
        zos.closeEntry();
        zos.close();

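        // The second archive is identical apart from its entry name.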
        out = fileSys.create(new Path(CACHE_ARCHIVE_2));
        zos = new ZipOutputStream(out);
        ze = new ZipEntry(CACHE_FILE_2.toString());
        zos.putNextEntry(ze);
        zos.write(input.getBytes("UTF-8"));
        zos.closeEntry();
        zos.close();
    }

    protected String[] genArgs() {
        String workDir = fileSys.getWorkingDirectory().toString() + "/";
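        // The "#name" suffix tells the distributed cache to expose each
        // unpacked archive under that symlink in the task's working directory.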
        String cache1 = workDir + CACHE_ARCHIVE_1 + "#symlink1";
        String cache2 = workDir + CACHE_ARCHIVE_2 + "#symlink2";

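        // Forward the mini-cluster's configuration to the streaming job.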
        for (Map.Entry<String, String> entry : mr.createJobConf()) {
            args.add("-jobconf");
            args.add(entry.getKey() + "=" + entry.getValue());
        }
        args.add("-jobconf");
        args.add("mapreduce.job.reduces=1");
        args.add("-cacheArchive");
        args.add(cache1);
        args.add("-cacheArchive");
        args.add(cache2);
        args.add("-jobconf");
        args.add("mapred.jar=" + STREAMING_JAR);
        return super.genArgs();
    }

    protected void checkOutput() throws IOException {
        StringBuilder output = new StringBuilder(256);
        Path[] fileList = FileUtil.stat2Paths(fileSys.listStatus(new Path(OUTPUT_DIR)));
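        // Concatenate every file in the output directory (the part files)
        // before comparing against the expected output.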
        for (int i = 0; i < fileList.length; i++) {
            LOG.info("Adding output from file: " + fileList[i]);
            output.append(StreamUtil.slurpHadoop(fileList[i], fileSys));
        }
        assertOutput(expectedOutput, output.toString());
    }
}
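
Example

The mapper and reducer commands come from constants inherited from the TestStreaming base class. As a rough illustration of what the mapper does (a minimal sketch, not part of the Hadoop code base; the class name is hypothetical), the program below reads one file name per input line and writes that file's contents to standard output. In this test the input lines name the archive entries exposed through symlink1 and symlink2, each of which contains the single word HADOOP:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

// Hypothetical stand-in for an "xargs cat"-style streaming mapper:
// each line on stdin names a file whose contents are copied to stdout.
public class XargsCatSketch {
    public static void main(String[] args) throws IOException {
        BufferedReader in = new BufferedReader(
                new InputStreamReader(System.in, StandardCharsets.UTF_8));
        String line;
        while ((line = in.readLine()) != null) {
            // Emit every line of the named file, e.g. symlink1/cacheArchive1.
            for (String fileLine : Files.readAllLines(Paths.get(line.trim()))) {
                System.out.println(fileLine);
            }
        }
    }
}

Because each archive entry contains HADOOP, the job emits one line per input line, and TextOutputFormat appends a tab and an empty value to each, matching the expectedOutput string above.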