org.apache.mahout.df.mapreduce.partial.PartialBuilderTest.java Source code


Introduction

Here is the source code for org.apache.mahout.df.mapreduce.partial.PartialBuilderTest.java
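
The class under test, PartialBuilder, grows a Mahout decision forest with a MapReduce job in which each map task builds a subset of the trees from its own partition of the data. The test below checks two things: that the job's (TreeID, MapredOutput) output can be read back into a forest (testProcessOutput), and that the builder configures the job with the expected parameters (testConfigure). For orientation, here is a minimal, hypothetical sketch of how such a builder is wired up. Only the four-argument constructor is confirmed by the test itself (PartialBuilderChecker calls it via super); the package placement, the file paths, and the commented-out build(...) call are assumptions that may differ across Mahout versions.

package org.apache.mahout.df.mapreduce.partial;

import org.apache.hadoop.fs.Path;
import org.apache.mahout.df.builder.DefaultTreeBuilder;
import org.apache.mahout.df.builder.TreeBuilder;

public class PartialBuilderSketch {
    public static void main(String[] args) {
        TreeBuilder treeBuilder = new DefaultTreeBuilder();
        Path dataPath = new Path("/path/to/data.csv");      // hypothetical training data
        Path datasetPath = new Path("/path/to/data.info");  // hypothetical dataset descriptor
        Long seed = 5L;                                      // fixed seed for reproducible forests

        // Each map task grows part of the forest on its partition of dataPath.
        PartialBuilder builder = new PartialBuilder(treeBuilder, dataPath, datasetPath, seed);
        // builder.build(...);   // assumed entry point inherited from Builder; check your Mahout version
    }
}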

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.df.mapreduce.partial;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.df.builder.DefaultTreeBuilder;
import org.apache.mahout.df.builder.TreeBuilder;
import org.apache.mahout.df.callback.PredictionCallback;
import org.apache.mahout.df.mapreduce.MapredOutput;
import org.apache.mahout.df.node.Leaf;
import org.apache.mahout.df.node.Node;

public class PartialBuilderTest extends MahoutTestCase {

    private static final int numMaps = 5;

    private static final int numTrees = 32;

    /** instances per partition */
    private static final int numInstances = 20;

    public void testProcessOutput() throws Exception {
        Configuration conf = new Configuration();
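        // simulate a job that ran with numMaps map tasks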
        conf.setInt("mapred.map.tasks", numMaps);

        Random rng = RandomUtils.getRandom();

        // prepare the output
        TreeID[] keys = new TreeID[numTrees];
        MapredOutput[] values = new MapredOutput[numTrees];
        int[] firstIds = new int[numMaps];
        randomKeyValues(rng, keys, values, firstIds);

        // store the output in a sequence file
        Path base = getTestTempDirPath("testdata");
        FileSystem fs = base.getFileSystem(conf);

        Path outputFile = new Path(base, "PartialBuilderTest.seq");
        Writer writer = SequenceFile.createWriter(fs, conf, outputFile, TreeID.class, MapredOutput.class);

        for (int index = 0; index < numTrees; index++) {
            writer.append(keys[index], values[index]);
        }
        writer.close();

        // load the output and make sure it's valid
        TreeID[] newKeys = new TreeID[numTrees];
        Node[] newTrees = new Node[numTrees];

        PartialBuilder.processOutput(new Job(conf), base, firstIds, newKeys, newTrees,
                new TestCallback(keys, values));

        // check the forest
        for (int tree = 0; tree < numTrees; tree++) {
            assertEquals(values[tree].getTree(), newTrees[tree]);
        }

        assertTrue("keys not equal", Arrays.deepEquals(keys, newKeys));
    }

    /**
     * Make sure that the builder passes the correct parameters to the job.
     */
    public void testConfigure() {
        TreeBuilder treeBuilder = new DefaultTreeBuilder();
        Path dataPath = new Path("notUsedDataPath");
        Path datasetPath = new Path("notUsedDatasetPath");
        Long seed = 5L;

        new PartialBuilderChecker(treeBuilder, dataPath, datasetPath, seed);
    }

    /**
     * Generates random (key, value) pairs and shuffles the partitions' order.
     *
     * @param rng random number generator
     * @param keys (output) generated keys, one per tree
     * @param values (output) generated values, one per tree
     * @param firstIds (output) each partition's first instance id, in Hadoop's order
     */
    private static void randomKeyValues(Random rng, TreeID[] keys, MapredOutput[] values, int[] firstIds) {
        int index = 0;
        int firstId = 0;
        List<Integer> partitions = new ArrayList<Integer>();

        for (int p = 0; p < numMaps; p++) {
            // select a random partition that has not been selected yet
            int partition;
            do {
                partition = rng.nextInt(numMaps);
            } while (partitions.contains(partition));

            partitions.add(partition);

            int nbTrees = Step1Mapper.nbTrees(numMaps, numTrees, partition);

            for (int treeId = 0; treeId < nbTrees; treeId++) {
                Node tree = new Leaf(rng.nextInt(100));

                keys[index] = new TreeID(partition, treeId);
                values[index] = new MapredOutput(tree, nextIntArray(rng, numInstances));

                index++;
            }

            firstIds[p] = firstId;
            firstId += numInstances;
        }

    }

    private static int[] nextIntArray(Random rng, int size) {
        int[] array = new int[size];
        for (int index = 0; index < size; index++) {
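            // rng.nextInt(101) - 1 yields values in [-1, 99]; presumably -1 marks an
            // instance that received no out-of-bag prediction from this tree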
            array[index] = rng.nextInt(101) - 1;
        }

        return array;
    }

    static class PartialBuilderChecker extends PartialBuilder {

        private final Long seed;

        private final TreeBuilder treeBuilder;

        private final Path datasetPath;

        PartialBuilderChecker(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed) {
            super(treeBuilder, dataPath, datasetPath, seed);

            this.seed = seed;
            this.treeBuilder = treeBuilder;
            this.datasetPath = datasetPath;
        }

        @Override
        protected boolean runJob(Job job) throws IOException {
            // no need to run the job, just check if the params are correct

            Configuration conf = job.getConfiguration();

            assertEquals(seed, getRandomSeed(conf));

            // PartialBuilder should detect the 'local' mode and override the number
            // of map tasks
            assertEquals(1, conf.getInt("mapred.map.tasks", -1));

            assertEquals(numTrees, getNbTrees(conf));

            assertFalse(isOutput(conf));
            assertTrue(isOobEstimate(conf));

            assertEquals(treeBuilder, getTreeBuilder(conf));

            assertEquals(datasetPath, getDistributedCacheFile(conf, 0));

            return true;
        }

    }

    /**
     * Mock callback. Makes sure that the callback receives the correct predictions.
     * The index arithmetic it relies on is spelled out after the listing.
     */
    static class TestCallback implements PredictionCallback {

        private final TreeID[] keys;

        private final MapredOutput[] values;

        TestCallback(TreeID[] keys, MapredOutput[] values) {
            this.keys = keys;
            this.values = values;
        }

        @Override
        public void prediction(int treeId, int instanceId, int prediction) {
            int partition = instanceId / numInstances;

            TreeID key = new TreeID(partition, treeId);
            int index = ArrayUtils.indexOf(keys, key);
            assertTrue("key not found", index >= 0);

            assertEquals(values[index].getPredictions()[instanceId % numInstances], prediction);
        }

    }
}
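
The mock callback above maps a global instance id back to the partition that produced it and to its offset inside that partition, because instance ids are assigned sequentially and every partition in this test holds exactly numInstances (20) instances. The stand-alone sketch below spells out that arithmetic; the class and method names are illustrative only and are not part of Mahout.

public class InstanceIndexDemo {

    private static final int NUM_INSTANCES = 20;   // same per-partition size as the test

    /** partition (map task) a global instance id belongs to */
    static int partitionOf(int instanceId) {
        return instanceId / NUM_INSTANCES;
    }

    /** offset of the instance inside its partition's predictions array */
    static int localIndexOf(int instanceId) {
        return instanceId % NUM_INSTANCES;
    }

    public static void main(String[] args) {
        int instanceId = 47;
        // prints: instance 47 -> partition 2, local index 7
        System.out.println("instance " + instanceId
                + " -> partition " + partitionOf(instanceId)
                + ", local index " + localIndexOf(instanceId));
    }
}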