org.apache.mahout.classifier.chi_rwcs.mapreduce.Builder.java Source code

Introduction

Here is the source code for org.apache.mahout.classifier.chi_rwcs.mapreduce.Builder.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.mahout.classifier.chi_rwcs.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.mahout.classifier.chi_rwcs.builder.Fuzzy_ChiCSBuilder;
import org.apache.mahout.classifier.chi_rwcs.*;
import org.apache.mahout.classifier.chi_rwcs.data.Dataset;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.Comparator;

public abstract class Builder {

    private static final Logger log = LoggerFactory.getLogger(Builder.class);

    private final Fuzzy_ChiCSBuilder fuzzy_ChiCSBuilder;
    private final Path dataPath;
    private final Path datasetPath;
    private final Configuration conf;
    private String outputDirName = "output";

    protected Builder(Fuzzy_ChiCSBuilder fuzzy_ChiCSBuilder, Path dataPath, Path datasetPath, Configuration conf) {
        this.fuzzy_ChiCSBuilder = fuzzy_ChiCSBuilder;
        this.dataPath = dataPath;
        this.datasetPath = datasetPath;
        this.conf = new Configuration(conf);
    }

    protected Fuzzy_ChiCSBuilder getFuzzy_ChiCSBuilder() {
        return fuzzy_ChiCSBuilder;
    }

    protected Path getDataPath() {
        return dataPath;
    }

    /**
     * Returns the random seed
     * 
     * @param conf
     *          configuration
     * @return null if no seed is available
     */
    public static Long getRandomSeed(Configuration conf) {
        String seed = conf.get("mahout.rf.random.seed");
        if (seed == null) {
            return null;
        }

        return Long.valueOf(seed);
    }

    /**
     * Return the value of "mapred.map.tasks".
     * 
     * @param conf
     *          configuration
     * @return number of map tasks
     */
    public static int getNumMaps(Configuration conf) {
        return conf.getInt("mapred.map.tasks", -1);
    }

    /**
     * Used only for DEBUG purposes. if false, the mappers doesn't output anything, so the builder has nothing
     * to process
     * 
     * @param conf
     *          configuration
     * @return true if the builder has to return output. false otherwise
     */
    protected static boolean isOutput(Configuration conf) {
        return conf.getBoolean("debug.mahout.fc.output", true);
    }

    public static Fuzzy_ChiCSBuilder getFuzzy_ChiCSBuilder(Configuration conf) {
        String string = conf.get("mahout.fc.fuzzy_ChiCSBuilder");
        if (string == null) {
            return null;
        }

        return StringUtils.fromString(string);
    }

    private static void setFuzzy_ChiCSBuilder(Configuration conf, Fuzzy_ChiCSBuilder fuzzy_ChiCSBuilder) {
        conf.set("mahout.fc.fuzzy_ChiCSBuilder", StringUtils.toString(fuzzy_ChiCSBuilder));
    }

    /**
     * Sets the Output directory name, will be creating in the working directory
     * 
     * @param name
     *          output dir. name
     */
    public void setOutputDirName(String name) {
        outputDirName = name;
    }

    /**
     * Output Directory name
     * 
     * @param conf
     *          configuration
     * @return output dir. path (%WORKING_DIRECTORY%/OUTPUT_DIR_NAME%)
     * @throws IOException
     *           if we cannot get the default FileSystem
     */
    protected Path getOutputPath(Configuration conf) throws IOException {
        // the output directory is accessed only by this class, so use the default
        // file system
        FileSystem fs = FileSystem.get(conf);
        return new Path(fs.getWorkingDirectory(), outputDirName);
    }

    /**
     * Helper method. Get a path from the DistributedCache
     * 
     * @param conf
     *          configuration
     * @param index
     *          index of the path in the DistributedCache files
     * @return path from the DistributedCache
     * @throws IOException
     *           if no path is found
     */
    public static Path getDistributedCacheFile(Configuration conf, int index) throws IOException {
        URI[] files = DistributedCache.getCacheFiles(conf);

        if (files == null || files.length <= index) {
            throw new IOException("path not found in the DistributedCache");
        }

        return new Path(files[index].getPath());
    }

    /**
     * Helper method. Load a Dataset stored in the DistributedCache
     * 
     * @param conf
     *          configuration
     * @return loaded Dataset
     * @throws IOException
     *           if we cannot retrieve the Dataset path from the DistributedCache, or the Dataset could not be
     *           loaded
     */
    public static Dataset loadDataset(Configuration conf) throws IOException {
        Path datasetPath = getDistributedCacheFile(conf, 0);

        return Dataset.load(conf, datasetPath);
    }

    /**
     * Used by the inheriting classes to configure the job
     * 
     *
     * @param job
     *          Hadoop's Job
     * @throws IOException
     *           if anything goes wrong while configuring the job
     */
    protected abstract void configureJob(Job job) throws IOException;

    /**
     * Sequential implementation should override this method to simulate the job execution
     * 
     * @param job
     *          Hadoop's job
     * @return true is the job succeeded
     */
    protected boolean runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {
        return job.waitForCompletion(true);
    }

    /**
     * Parse the output files to extract the model and pass the predictions to the callback
     * 
     * @param job
     *          Hadoop's job
     * @return Built DecisionForest
     * @throws IOException
     *           if anything goes wrong while parsing the output
     */
    protected abstract RuleBase parseOutput(Job job) throws IOException;

    public RuleBase build() throws IOException, ClassNotFoundException, InterruptedException {

        Path outputPath = getOutputPath(conf);
        FileSystem fs = outputPath.getFileSystem(conf);

        // check the output
        if (fs.exists(outputPath)) {
            throw new IOException("Output path already exists : " + outputPath);
        }

        setFuzzy_ChiCSBuilder(conf, fuzzy_ChiCSBuilder);

        // put the dataset into the DistributedCache
        DistributedCache.addCacheFile(datasetPath.toUri(), conf);

        Job job = new Job(conf, "fuzzy_ChiCS builder");

        log.debug("ChiCS: Configuring the job...");
        configureJob(job);

        log.debug("ChiCS: Running the job...");
        if (!runJob(job)) {
            log.error("ChiCS: Job failed!");
            return null;
        }

        if (isOutput(conf)) {
            log.debug("Parsing the output...");
            RuleBase ruleBase = parseOutput(job);
            HadoopUtil.delete(conf, outputPath);
            return ruleBase;
        }

        return null;
    }

    /**
     * sort the splits into order based on size, so that the biggest go first.<br>
     * This is the same code used by Hadoop's JobClient.
     * 
     * @param splits
     *          input splits
     */
    public static void sortSplits(InputSplit[] splits) {
        Arrays.sort(splits, new Comparator<InputSplit>() {
            @Override
            public int compare(InputSplit a, InputSplit b) {
                try {
                    long left = a.getLength();
                    long right = b.getLength();
                    if (left == right) {
                        return 0;
                    } else if (left < right) {
                        return 1;
                    } else {
                        return -1;
                    }
                } catch (IOException ie) {
                    throw new IllegalStateException("Problem getting input split size", ie);
                } catch (InterruptedException ie) {
                    throw new IllegalStateException("Problem getting input split size", ie);
                }
            }
        });
    }
}