com.matthewrathbone.hadoop.MRTester.java Source code

Java tutorial

Introduction

Here is the source code for com.matthewrathbone.hadoop.MRTester.java

Source

// Copyright 2013 Matthew Rathbone

//    Licensed under the Apache License, Version 2.0 (the "License");
//    you may not use this file except in compliance with the License.
//    You may obtain a copy of the License at

//        http://www.apache.org/licenses/LICENSE-2.0

//    Unless required by applicable law or agreed to in writing, software
//    distributed under the License is distributed on an "AS IS" BASIS,
//    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//    See the License for the specific language governing permissions and
//    limitations under the License.

package com.matthewrathbone.hadoop;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.JobConf;

/**
 * Test harness for running Hadoop jobs against a {@code file://} base
 * directory.
 *
 * <p>An instance owns a {@link JobConf} tuned for small test runs and a
 * {@link FileSystem} rooted at the base directory. It offers helpers to stage
 * input ({@link #registerDirectory()}, {@link #registerString(String)}) and to
 * read job output back ({@link #collectStrings(Path)},
 * {@link #readSequenceFile(Path, Class, Class)}).
 */
public class MRTester {
    private final File base;
    private final Path baseDir;
    private final JobConf conf = new JobConf();
    private final FileSystem fileSystem;

    /** Bundle of the configuration and filesystem handed to a {@link TestJob}. */
    public static class JobArgs {
        public Configuration conf;
        public FileSystem fs;

        /**
         * @param conf configuration the job should use
         * @param fs   filesystem the job should read from and write to
         */
        public JobArgs(Configuration conf, FileSystem fs) {
            this.conf = conf;
            this.fs = fs;
        }
    }

    /** A unit of work executed through {@link MRTester#run(TestJob)}. */
    public static abstract class TestJob {
        /**
         * Executes the job.
         *
         * @param args configuration and filesystem supplied by the tester
         * @throws Exception if the job fails
         */
        public abstract void run(JobArgs args) throws Exception;
    }

    /**
     * Creates a tester rooted at {@code target/test}.
     *
     * @throws Exception if the filesystem cannot be obtained or the working
     *                   directories cannot be created
     */
    public MRTester() throws Exception {
        this("target/test");
    }

    /**
     * Creates a tester rooted at the given base directory.
     *
     * <p>Besides populating the job configuration, this sets the
     * {@code hadoop.log.dir} JVM system property, which is process-wide.
     *
     * @param b path of the base directory; resolved to an absolute path
     * @throws Exception if the filesystem cannot be obtained or the working
     *                   directories cannot be created
     */
    public MRTester(String b) throws Exception {
        this.base = new File(b);
        this.baseDir = new Path("file://" + base.getAbsolutePath());
        String mrDir = "mapred/local";
        String logs = "logs";

        conf.set("mapred.local.dir", new File(base, mrDir).getAbsolutePath());
        conf.set("mapred.job.tracker.handle.count", "2");
        conf.set("tasktracker.http.threads", "2");
        // avoid heap space issues during testing -- default is 100
        conf.set("io.sort.mb", "20");
        conf.set("mapred.map.max.attempts", "2");
        conf.set("mapred.reduce.max.attempts", "2");
        conf.set("jobclient.completion.poll.interval", "30");
        System.getProperties().setProperty("hadoop.log.dir", new File(base, logs).getAbsolutePath());
        this.fileSystem = FileSystem.get(baseDir.toUri(), conf);

        // These calls must not live inside an assert statement: assertions are
        // disabled unless the JVM runs with -ea, which would skip the mkdirs.
        mkdirsOrFail(new Path(baseDir, mrDir));
        mkdirsOrFail(new Path(baseDir, logs));
    }

    /** Creates {@code dir} (and parents) or throws if the filesystem reports failure. */
    private void mkdirsOrFail(Path dir) throws IOException {
        if (!fileSystem.mkdirs(dir)) {
            throw new IOException("Could not create directory " + dir);
        }
    }

    /**
     * Reserves a uniquely named (random UUID) path under the base directory.
     *
     * <p>Anything already present at that path is deleted recursively first.
     *
     * @param create whether to create the directory, or only return its path
     * @return the reserved path
     * @throws Exception if the filesystem operations fail
     */
    public Path registerDirectory(boolean create) throws Exception {
        String dirName = UUID.randomUUID().toString();
        Path destination = new Path(baseDir, dirName);
        fileSystem.delete(destination, true);
        if (create) {
            fileSystem.mkdirs(destination);
        }
        return destination;
    }

    /**
     * Reserves and creates a uniquely named directory under the base directory.
     *
     * @return the created directory
     * @throws Exception if the filesystem operations fail
     */
    public Path registerDirectory() throws Exception {
        return registerDirectory(true);
    }

    /**
     * Writes {@code value} to a file named {@code file} inside a freshly
     * registered directory.
     *
     * @param value text to write
     * @return the directory containing the file (not the file itself)
     * @throws Exception if the directory or file cannot be written
     */
    public Path registerString(String value) throws Exception {
        Path dir = registerDirectory();
        Path file = new Path(dir, "file");
        // Encode explicitly as UTF-8 so the bytes match what collectStrings
        // decodes, independent of the platform default charset.
        BufferedWriter br = new BufferedWriter(
                new OutputStreamWriter(fileSystem.create(file, true), "UTF-8"));
        try {
            br.write(value);
        } finally {
            br.close();
        }
        return dir;
    }

    /**
     * Reads every key/value record of a sequence file into memory.
     *
     * <p>A fresh key and value instance is created for each record, so the
     * returned tuples do not share objects.
     *
     * @param path sequence file to read
     * @param acls key class; instantiated via {@link Class#newInstance()}
     * @param bcls value class; instantiated via {@link Class#newInstance()}
     * @return the records in file order
     * @throws Exception if the file cannot be read or a key/value cannot be
     *                   instantiated
     */
    public <A extends Writable, B extends Writable> List<Tuple<A, B>> readSequenceFile(Path path, Class<A> acls,
            Class<B> bcls) throws Exception {

        SequenceFile.Reader reader = new SequenceFile.Reader(this.conf, SequenceFile.Reader.file(path));
        try {
            List<Tuple<A, B>> results = new ArrayList<Tuple<A, B>>();
            A key = acls.newInstance();
            B value = bcls.newInstance();
            while (reader.next(key, value)) {
                results.add(new Tuple<A, B>(key, value));
                // The reader fills the instances it is handed, so allocate new
                // ones rather than overwriting those already stored.
                key = acls.newInstance();
                value = bcls.newInstance();
            }
            return results;
        } finally {
            // Always release the underlying file handle.
            reader.close();
        }
    }

    /**
     * Collects the lines of every entry directly under {@code location}.
     *
     * <p>Entries whose name starts with {@code _} are skipped. An entry is
     * decompressed when the codec factory recognises a codec for its path.
     * Content is decoded as UTF-8 and split on {@code \n}; an empty file
     * therefore contributes a single empty string.
     *
     * @param location directory to list
     * @return all lines, in listing order; empty if the listing is {@code null}
     * @throws Exception if listing or reading fails
     */
    public List<String> collectStrings(Path location) throws Exception {
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        FileStatus[] items = fileSystem.listStatus(location);
        List<String> results = new ArrayList<String>();
        if (items == null) {
            return results;
        }
        for (FileStatus item : items) {
            if (item.getPath().getName().startsWith("_")) {
                continue;
            }

            CompressionCodec codec = factory.getCodec(item.getPath());
            InputStream stream = fileSystem.open(item.getPath());
            try {
                // check if we have a compression codec
                if (codec != null) {
                    stream = codec.createInputStream(stream);
                }

                StringWriter writer = new StringWriter();
                IOUtils.copy(stream, writer, "UTF-8");
                for (String str : writer.toString().split("\n")) {
                    results.add(str);
                }
            } finally {
                // Closes the codec stream when one was created, otherwise the
                // raw file stream.
                stream.close();
            }
        }
        return results;
    }

    /**
     * Runs {@code job} with this tester's configuration and filesystem.
     *
     * @param job the job to execute
     * @throws Exception whatever the job throws
     */
    public void run(TestJob job) throws Exception {
        JobArgs args = new JobArgs(conf, fileSystem);
        job.run(args);
    }
}