org.apache.mahout.feature.mrmr.MRMRDriver.java Source code

Introduction

Here is the source code for org.apache.mahout.feature.mrmr.MRMRDriver.java
Source

/* Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.mahout.feature.mrmr;

import org.apache.mahout.feature.mrmr.common.commandline.DefaultOptionCreator;
import org.apache.mahout.feature.mrmr.common.mapreduce.MaxCombiner;
import org.apache.mahout.feature.mrmr.common.mapreduce.MaxReducer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.util.ToolRunner;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.filecache.DistributedCache;

import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.VectorWritable;

import com.google.common.collect.Lists;

import java.lang.NumberFormatException;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.IOException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line driver that chains Hadoop jobs to select features with MRMR.
 *
 * <p>The driver runs {@code featureNumber} stages. Each stage consists of two jobs:
 * <ol>
 *   <li>a candidate job ({@code MRMRMapper} / {@code MRMRReducer}) that reads the text input and
 *       writes a sequence file into a per-stage temporary directory;</li>
 *   <li>a selection job (identity {@code Mapper}, {@code MaxCombiner}, {@code MaxReducer}) that
 *       reads that sequence file and writes text output.</li>
 * </ol>
 * From the second stage on, the {@code part-r-00000} file written by the previous stage's
 * selection job is added to the distributed cache of both jobs. The selection job of the last
 * stage writes to the user-supplied output path instead of a temporary directory.
 */
public class MRMRDriver extends AbstractJob {

    private static final Logger log = LoggerFactory.getLogger(MRMRDriver.class);

    /** Returns the temporary output location of the candidate job of the given stage. */
    private String featureJobTempUri(Path basedir, int iteration) {
        return basedir.toUri() + Path.SEPARATOR + iteration + "_feature";
    }

    /** Returns the temporary output location of the selection job of the given stage. */
    private String maxJobTempUri(Path basedir, int iteration) {
        return basedir.toUri() + Path.SEPARATOR + iteration + "_max";
    }

    /**
     * Returns the location of the file produced by the selection job of the stage preceding
     * {@code iteration}.
     *
     * <p>NOTE(review): the fixed name {@code part-r-00000} presumes that job ran with a single
     * reducer -- confirm against the job setup.
     */
    private String cacheFileUri(Path basedir, int iteration) {
        return basedir.toUri() + Path.SEPARATOR + (iteration - 1) + "_max" + Path.SEPARATOR + "part-r-00000";
    }

    /**
     * Adds the previous stage's selection output to the distributed cache of {@code job}.
     * The first stage has no predecessor, so nothing is added for it.
     */
    private void attachPreviousSelection(Job job, Path basedir, int iteration) {
        if (iteration > 0) {
            DistributedCache.addCacheFile(new Path(cacheFileUri(basedir, iteration)).toUri(),
                    job.getConfiguration());
        }
    }

    /**
     * Best-effort removal of an intermediate directory. A failure is logged and otherwise
     * ignored because it does not affect the result of the selection.
     */
    private void deleteQuietly(Path path, Configuration conf) {
        try {
            // Resolve the file system from the path itself so a non-default scheme is honoured.
            FileSystem fs = path.getFileSystem(conf);
            if (!fs.delete(path, true)) {
                log.warn("Could not delete temporary path {}", path);
            }
        } catch (IOException e) {
            log.warn("Could not delete temporary path " + path, e);
        }
    }

    /** Entry point: runs the driver through {@link ToolRunner}. */
    public static void main(String[] args) throws Exception {
        ToolRunner.run(new Configuration(), new MRMRDriver(), args);
    }

    /**
     * Parses the command line and runs all selection stages.
     *
     * @param args command-line arguments (input, output, target column, row number, column number
     *             and feature number options)
     * @return {@code 0} when every job succeeded; {@code -1} when the arguments could not be
     *         parsed or any job failed
     * @throws Exception if option values are not valid integers or a Hadoop job cannot be
     *                   set up or executed
     */
    @Override
    public int run(String[] args) throws Exception {

        addInputOption();
        addOutputOption();
        addOption(DefaultOptionCreator.targetColumnOption().create());
        addOption(DefaultOptionCreator.rowNumberOption().create());
        addOption(DefaultOptionCreator.columnNumberOption().create());
        addOption(DefaultOptionCreator.featureNumberOption().create());

        // parseArguments yields null when the options cannot be used (help requested or a
        // parse error); nothing below can work without them.
        Map<String, List<String>> parsedArgs = parseArguments(args);
        if (parsedArgs == null) {
            return -1;
        }

        Path input = getInputPath();
        Path output = getOutputPath();
        Path temp = getTempPath();

        int targetColumn = Integer.parseInt(getOption(DefaultOptionCreator.TARGET_COLUMN));
        int rowNumber = Integer.parseInt(getOption(DefaultOptionCreator.ROW_NUMBER));
        int columnNumber = Integer.parseInt(getOption(DefaultOptionCreator.COLUMN_NUMBER));
        int featureNumber = Integer.parseInt(getOption(DefaultOptionCreator.FEATURE_NUMBER));

        log.info("Feature selection algorithm: MRMR");

        // Settings shared by every job of every stage. They are written to a private copy so the
        // tool's own configuration is not modified as a side effect of running the driver.
        Configuration jobConf = new Configuration(getConf());
        // presumably converts a 1-based column number into a 0-based index -- TODO confirm
        jobConf.set(DefaultOptionCreator.TARGET_INDEX, Integer.toString(targetColumn - 1));
        jobConf.set(DefaultOptionCreator.COLUMN_NUMBER, Integer.toString(columnNumber));
        jobConf.set(DefaultOptionCreator.ROW_NUMBER, Integer.toString(rowNumber));

        for (int i = 0; i < featureNumber; i++) {

            // Candidate generation job
            log.info("Generating candidates at stage {}th", i);

            Path tempFeature = new Path(featureJobTempUri(temp, i));

            Job jobFeature = HadoopUtil.prepareJob(input, tempFeature, TextInputFormat.class, MRMRMapper.class,
                    IntWritable.class, Text.class, MRMRReducer.class, LongWritable.class, Text.class,
                    SequenceFileOutputFormat.class, jobConf);
            jobFeature.setJobName("Feature Candidate stage " + i + "th");
            attachPreviousSelection(jobFeature, temp, i);

            if (!jobFeature.waitForCompletion(true)) {
                return -1;
            }

            // Selecting best candidate job
            log.info("Selecting the best candidate at stage {}th", i);

            // The last stage writes straight to the requested output location.
            boolean lastStage = i == featureNumber - 1;
            Path tempMax = lastStage ? output : new Path(maxJobTempUri(temp, i));

            Job jobMax = HadoopUtil.prepareJob(tempFeature, tempMax, SequenceFileInputFormat.class, Mapper.class,
                    LongWritable.class, Text.class, MaxReducer.class, LongWritable.class, Text.class,
                    TextOutputFormat.class, jobConf);
            jobMax.setJobName("Best candidate at stage " + i + "th");
            jobMax.setCombinerClass(MaxCombiner.class);
            attachPreviousSelection(jobMax, temp, i);

            if (!jobMax.waitForCompletion(true)) {
                return -1;
            }

            // The candidate file is only needed by the selection job that just finished.
            deleteQuietly(tempFeature, jobConf);
        }
        return 0;
    }
}