minor_MapReduce.C4_5.java Source code

Java tutorial

Introduction

Here is the source code for minor_MapReduce.C4_5.java

Source

/**
 * This file is part of an implementation of C4.5 by Yohann Jardin.
 * 
 * This implementation of C4.5 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This implementation of C4.5 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this implementation of C4.5. If not, see <http://www.gnu.org/licenses/>.
 */

package minor_MapReduce;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

/**
 * Command-line driver for a MapReduce-based C4.5 run.
 *
 * <p>{@link #main(String[])} runs one Hadoop job that writes a summary of the input into a
 * temporary directory, loads that summary into memory, deletes the temporary directory and
 * then walks a work list of attribute/value conditions. The two per-condition steps
 * ({@link #calcAttributesInfo(Map)} and {@link #findBestAttribute()}) are empty in this
 * version, so no classification rule is produced yet.
 *
 * <p>All state is kept in static fields; the class is not designed to be instantiated or
 * used from several threads.
 */
public class C4_5 {

    /** Name given to the summarizing Hadoop job. */
    private static final String SUMMARIZE_JOB_NAME = "C4.5_summarizeData";

    /** Directory (under the tmp path) that receives the summarizing job's output. */
    private static final String SUMMARY_DIR_NAME = "summarized_data";

    /** Output file of the single reducer, read back by {@link #summarizeData()}. */
    private static final String SUMMARY_PART_FILE = "part-r-00000";

    /** Input location, taken from the first command-line argument. */
    private static Path input_path;

    /** Scratch location, taken from the second command-line argument; deleted by main. */
    private static Path tmp_path;

    /** Output directory of the summarizing job, located under {@link #tmp_path}. */
    private static Path summarized_data_path;

    /**
     * In-memory copy of the summarizing job's output: one entry per record read back.
     *
     * <p>NOTE(review): the keys are arrays, which hash and compare by identity in Java, so
     * this map can be iterated but cannot be queried by array content.
     */
    private static Map<String[], Integer> summarized_data;

    /**
     * Entry point.
     *
     * @param args exactly two values: the input path and a temporary path. The temporary
     *             path is deleted recursively once the summary has been loaded.
     * @throws Exception if the Hadoop job, the file system access or the read-back fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: main/C4_5 <input path> <tmp path>");
            System.exit(-1);
        }

        input_path = new Path(args[0]);
        tmp_path = new Path(args[1]);
        summarized_data_path = new Path(tmp_path, SUMMARY_DIR_NAME);

        // Put each unique line of data associated with its count into summarized_data.
        summarizeData();

        // Resolve the file system from the path itself so that a tmp path carrying an
        // explicit scheme/authority is deleted on the file system it actually lives on.
        tmp_path.getFileSystem(new Configuration()).delete(tmp_path, true);

        // Stores rules, each associating a set of attribute/value pairs to a class.
        Map<Map<String, String>, String> classification = new HashMap<Map<String, String>, String>();

        // Work list of condition sets still to be examined; starts with the empty set.
        Deque<Map<String, String>> conditions_to_test = new ArrayDeque<Map<String, String>>();
        conditions_to_test.add(new HashMap<String, String>());

        String exceptions_conditions = "";

        while (!conditions_to_test.isEmpty()) {
            Map<String, String> conditions = conditions_to_test.pop();
            calcAttributesInfo(conditions);
            findBestAttribute();
        }

        printClassifications(classification);
        System.out.println(exceptions_conditions);
    }

    /**
     * Runs the summarizing job and loads its output into {@link #summarized_data}.
     *
     * <p>The job is forced to a single reducer because only {@code part-r-00000} is read
     * back; with more reducers the remaining part files would be silently ignored.
     *
     * @throws IllegalStateException if the Hadoop job reports failure
     * @throws Exception             if job submission or reading the output fails
     */
    private static void summarizeData() throws Exception {
        Job job = Job.getInstance();
        job.setJarByClass(C4_5.class);
        job.setJobName(SUMMARIZE_JOB_NAME);

        FileInputFormat.addInputPath(job, input_path);
        FileOutputFormat.setOutputPath(job, summarized_data_path);

        job.setMapperClass(SummarizeMapper.class);
        job.setReducerClass(SummarizeReducer.class);
        job.setNumReduceTasks(1);

        job.setOutputKeyClass(TextArrayWritable.class);
        job.setOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // Fail here with a clear message rather than later on a missing output file.
        if (!job.waitForCompletion(false)) {
            throw new IllegalStateException("Hadoop job " + SUMMARIZE_JOB_NAME + " failed");
        }

        /* Store it locally */
        Option optPath = SequenceFile.Reader.file(new Path(summarized_data_path, SUMMARY_PART_FILE));
        SequenceFile.Reader reader = new SequenceFile.Reader(new Configuration(), optPath);

        // Fill a local map first so the static field is never left half-populated.
        Map<String[], Integer> loaded = new HashMap<String[], Integer>();
        try {
            TextArrayWritable key = new TextArrayWritable();
            IntWritable val = new IntWritable();
            while (reader.next(key, val)) {
                loaded.put(key.toStrings(), val.get());
            }
        } finally {
            // Always release the underlying stream, even when a read fails.
            reader.close();
        }

        summarized_data = loaded;
    }

    /**
     * Placeholder for the per-condition attribute statistics step; currently does nothing.
     *
     * @param conditions attribute/value pairs describing the subset under consideration
     * @throws Exception declared for the future implementation; never thrown at present
     */
    private static void calcAttributesInfo(Map<String, String> conditions) throws Exception {

    }

    /**
     * Placeholder for the attribute selection step; currently does nothing.
     *
     * @throws Exception declared for the future implementation; never thrown at present
     */
    private static void findBestAttribute() throws Exception {

    }

    /**
     * Prints one line per rule to standard output, in sorted order.
     *
     * <p>Each line lists the rule's {@code attribute=value} pairs ordered by attribute name,
     * each followed by {@code ", "}, and ends with {@code CLASSIFICATION: <class>}. The
     * finished lines are sorted before printing so the output order is deterministic.
     *
     * @param classification rules mapping a set of attribute/value pairs to a class
     */
    private static void printClassifications(Map<Map<String, String>, String> classification) {
        List<String> msgs = new ArrayList<String>();

        for (Map.Entry<Map<String, String>, String> rule : classification.entrySet()) {
            Map<String, String> conditions = rule.getKey();

            List<String> key_sorted = new ArrayList<String>(conditions.keySet());
            Collections.sort(key_sorted);

            StringBuilder msg = new StringBuilder();
            for (String attribute : key_sorted) {
                msg.append(attribute).append("=").append(conditions.get(attribute)).append(", ");
            }
            msg.append("CLASSIFICATION: ").append(rule.getValue());

            msgs.add(msg.toString());
        }

        Collections.sort(msgs);

        for (String line : msgs) {
            System.out.println(line);
        }
    }
}