edu.cooper.cloud.Normalize.java Source code

Introduction

Here is the source code for edu.cooper.cloud.Normalize.java, a Hadoop MapReduce job that computes the per-column mean and standard deviation of a numeric dataset in a single pass. For every record, the mapper emits each column's contribution to the mean (x / N) and to the mean of squares (x^2 / N), a combiner pre-aggregates those partial sums, and the reducer converts the aggregated sums into means and standard deviations.
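
The job relies on the single-pass identity stddev = sqrt(E[x^2] - mean^2). As a stand-alone, purely illustrative sketch of that identity (plain Java, independent of the Hadoop job, with names invented for this example), the same computation over one in-memory column looks like this:

public class SinglePassStats {
    public static void main(String[] args) {
        double[] column = {2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0};
        long n = column.length;
        double mean = 0.0;          // running sum of x / N, as the mapper below emits it
        double meanOfSquares = 0.0; // running sum of x^2 / N
        for (double x : column) {
            mean += x / n;
            meanOfSquares += (x * x) / n;
        }
        // Population standard deviation, as computed in NormalizeReducer below.
        double stdDev = Math.sqrt(meanOfSquares - mean * mean);
        System.out.println("mean = " + mean + ", stdDev = " + stdDev); // mean = 5.0, stdDev = 2.0
    }
}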

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.cooper.cloud;

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Normalize {

    /**
     * For every input record, emits each column's contribution to the running mean (x / N)
     * and to the running mean of squares (x^2 / N), all under a single constant key so the
     * reducer sees every partial sum together. Expects DoubleArrayWritable input values, so a
     * matching InputFormat (not shown here) must be configured for the job.
     */
    public static class NormalizeMapper
            extends Mapper<LongWritable, DoubleArrayWritable, IntWritable, DoubleArrayWritable> {

        private int dataDimFrom;
        private int dataDimTo;
        private long samplesCount;
        private int universeSize;

        @Override
        protected void setup(Context context) throws IOException {
            Configuration conf = context.getConfiguration();
            dataDimFrom = conf.getInt("dataDimFrom", 0);
            dataDimTo = conf.getInt("dataDimTo", 0);
            samplesCount = conf.getLong("samplesCount", 0);
            universeSize = dataDimTo - dataDimFrom + 1;
        }

        @Override
        public void map(LongWritable key, DoubleArrayWritable array, Context context)
                throws IOException, InterruptedException {
            DoubleWritable[] outArray = new DoubleWritable[universeSize * 2];
            // First half of the array: contribution to the mean, x / N.
            for (int c = 0; c < universeSize; c++) {
                outArray[c] = new DoubleWritable(array.get(c + dataDimFrom).get() / samplesCount);
            }
            // Second half of the array: contribution to the mean of squares, x^2 / N.
            for (int c = universeSize; c < universeSize * 2; c++) {
                double val = array.get(c - universeSize + dataDimFrom).get();
                outArray[c] = new DoubleWritable((val * val) / samplesCount);
            }
            context.write(new IntWritable(1), new DoubleArrayWritable(outArray));
        }

    }

    /**
     * Local pre-aggregation: element-wise sums the partial arrays emitted by the mappers on
     * the same node, so less data is shuffled to the reducer.
     */
    public static class NormalizeCombiner
            extends Reducer<IntWritable, DoubleArrayWritable, IntWritable, DoubleArrayWritable> {

        private int dataDimFrom;
        private int dataDimTo;
        private int universeSize;

        @Override
        protected void setup(Context context) throws IOException {
            Configuration conf = context.getConfiguration();
            dataDimFrom = conf.getInt("dataDimFrom", 0);
            dataDimTo = conf.getInt("dataDimTo", 0);
            universeSize = dataDimTo - dataDimFrom + 1;
        }

        @Override
        public void reduce(IntWritable column, Iterable<DoubleArrayWritable> partialSums, Context context)
                throws IOException, InterruptedException {
            DoubleWritable[] outArray = new DoubleWritable[universeSize * 2];
            boolean isFirst = true;
            for (DoubleArrayWritable partialSum : partialSums) {
                for (int i = 0; i < universeSize * 2; i++) {
                    if (!isFirst) {
                        outArray[i].set(outArray[i].get() + partialSum.get(i).get());
                    } else {
                        outArray[i] = new DoubleWritable(partialSum.get(i).get());
                    }
                }
                isFirst = false;
            }
            context.write(column, new DoubleArrayWritable(outArray));
        }

    }

    /**
     * Element-wise sums all partial arrays, then replaces the second half (the aggregated
     * mean of squares) with the standard deviation sqrt(E[x^2] - mean^2), leaving the output
     * as [mean_1 .. mean_k, stddev_1 .. stddev_k].
     */
    public static class NormalizeReducer
            extends Reducer<IntWritable, DoubleArrayWritable, IntWritable, DoubleArrayWritable> {

        private int dataDimFrom;
        private int dataDimTo;
        private int universeSize;

        @Override
        protected void setup(Context context) throws IOException {
            Configuration conf = context.getConfiguration();
            dataDimFrom = conf.getInt("dataDimFrom", 0);
            dataDimTo = conf.getInt("dataDimTo", 0);
            universeSize = dataDimTo - dataDimFrom + 1;
        }

        @Override
        public void reduce(IntWritable column, Iterable<DoubleArrayWritable> partialSums, Context context)
                throws IOException, InterruptedException {
            DoubleWritable[] outArray = new DoubleWritable[universeSize * 2];
            boolean isFirst = true;
            for (DoubleArrayWritable partialSum : partialSums) {
                // Element-wise sum over both halves: running means and running means of squares.
                for (int i = 0; i < universeSize * 2; i++) {
                    if (!isFirst) {
                        outArray[i].set(outArray[i].get() + partialSum.get(i).get());
                    } else {
                        outArray[i] = new DoubleWritable(partialSum.get(i).get());
                    }
                }
                isFirst = false;
            }
            // Turn the aggregated mean of squares into the standard deviation: sqrt(E[x^2] - mean^2).
            for (int i = universeSize; i < universeSize * 2; i++) {
                double mean = outArray[i - universeSize].get();
                outArray[i].set(Math.sqrt(outArray[i].get() - mean * mean));
            }
            context.write(column, new DoubleArrayWritable(outArray));
        }

    }

    public static void main(String[] args) throws Exception {

        String input = "datasets/train_subject01.csv";
        String output = "output/trainX2.csv";

        Configuration conf = new Configuration();
        Map<String, String> env = System.getenv();
        Path coreSiteXml = new Path(env.get("HADOOP_CONF_DIR") + "/core-site.xml");
        Path hdfsSiteXml = new Path(env.get("HADOOP_CONF_DIR") + "/hdfs-site.xml");
        Path yarnSiteXml = new Path(env.get("HADOOP_CONF_DIR") + "/yarn-site.xml");
        Path mapredSiteXml = new Path(env.get("HADOOP_CONF_DIR") + "/mapred-site.xml");
        conf.addResource(coreSiteXml);
        conf.addResource(hdfsSiteXml);
        conf.addResource(yarnSiteXml);
        conf.addResource(mapredSiteXml);

        //        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        //        if (otherArgs.length != 2) {
        //            System.err.println("Usage: wordcount <in> <out>");
        //            System.exit(2);
        //        }

        Job job = Job.getInstance(conf, "normalize");
        job.setJarByClass(Normalize.class);
        job.setMapperClass(NormalizeMapper.class);
        job.setCombinerClass(NormalizeCombiner.class);
        job.setReducerClass(NormalizeReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(DoubleArrayWritable.class);
        // NOTE: no InputFormat is configured for this job; the mapper expects DoubleArrayWritable
        // values, so a custom InputFormat that produces them would have to be set here, e.g.
        //        job.setInputFormatClass(DoubleArrayInputFormat.class); // hypothetical class, not shown

        Path inputPath = new Path(input);
        System.out.println(inputPath);
        Path outputPath = new Path(output);
        System.out.println(outputPath);

        NLineInputFormat.addInputPath(job, inputPath);
        //        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        System.exit(job.waitForCompletion(true) ? 0 : 1);

        //        Use means and std dev to normalize the data

    }
}
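
The listing also depends on a DoubleArrayWritable class that is not part of the Hadoop API and is not shown on this page. A minimal sketch of what such a wrapper could look like, assuming it follows the usual ArrayWritable-subclass pattern and lives in the same package, is:

package edu.cooper.cloud;

import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.DoubleWritable;

// Assumed helper, not taken from the original project: wraps a DoubleWritable[] and adds the
// get(int) accessor used by the mappers and reducers above.
public class DoubleArrayWritable extends ArrayWritable {

    public DoubleArrayWritable() {
        super(DoubleWritable.class);          // no-arg constructor required for deserialization
    }

    public DoubleArrayWritable(DoubleWritable[] values) {
        super(DoubleWritable.class, values);
    }

    public DoubleWritable get(int index) {
        return (DoubleWritable) get()[index]; // ArrayWritable.get() returns the backing Writable[]
    }
}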