org.apache.mahout.clustering.spectral.eigencuts.EigencutsAffinityCutsJob.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.mahout.clustering.spectral.eigencuts.EigencutsAffinityCutsJob.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.spectral.eigencuts;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.clustering.spectral.common.VertexWritable;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * MapReduce job that combines an affinity matrix with a "cut" matrix and
 * writes a new affinity matrix, one sparse row vector per row index.
 *
 * <p>The job wires together three nested classes: a mapper that explodes
 * each matrix row into per-cell {@link VertexWritable}s keyed by the
 * unordered index pair, a combiner that decides per pair whether the cell
 * is cut, and a reducer that reassembles the rows.</p>
 *
 * <p>This is a static utility holder and cannot be instantiated.</p>
 */
public final class EigencutsAffinityCutsJob {

    private static final Logger log = LoggerFactory.getLogger(EigencutsAffinityCutsJob.class);

    /**
     * Type tag placed on vertices emitted by the combiner. The reducer in
     * this class never reads the tag, so its value carries no meaning.
     */
    private static final String UNUSED_TYPE = "unimportant";

    private EigencutsAffinityCutsJob() {
    }

    /** Hadoop counter group used to report how many cuts were made. */
    enum CUTSCOUNTER {
        NUM_CUTS
    }

    /**
     * Runs a single iteration of defining cluster boundaries, based on
     * previous calculations and the formation of the "cut matrix".
     *
     * <p>Side effect: this method stores the names of {@code currentAffinity}
     * and {@code cutMatrix} in the supplied {@code conf} (under
     * {@link EigencutsKeys#AFFINITY_PATH} and
     * {@link EigencutsKeys#CUTMATRIX_PATH}) before the job is created, so the
     * caller's configuration object is modified.</p>
     *
     * @param currentAffinity Path to the current affinity matrix.
     * @param cutMatrix Path to the sensitivity matrix.
     * @param nextAffinity Output path for the new affinity matrix.
     * @param conf Hadoop configuration used to create the job; modified as
     *        described above.
     * @return the value of the {@link CUTSCOUNTER#NUM_CUTS} counter after the
     *         job has completed.
     * @throws IllegalStateException if the job reports failure.
     * @throws IOException propagated from job setup or execution.
     * @throws ClassNotFoundException propagated from job execution.
     * @throws InterruptedException propagated from job execution.
     */
    public static long runjob(Path currentAffinity, Path cutMatrix, Path nextAffinity, Configuration conf)
            throws IOException, ClassNotFoundException, InterruptedException {

        // these options allow us to differentiate between the two vectors
        // in the mapper and reducer - we'll know from the working path
        // which SequenceFile we're accessing
        conf.set(EigencutsKeys.AFFINITY_PATH, currentAffinity.getName());
        conf.set(EigencutsKeys.CUTMATRIX_PATH, cutMatrix.getName());

        Job job = new Job(conf, "EigencutsAffinityCutsJob");
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(VertexWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(VectorWritable.class);
        job.setMapperClass(EigencutsAffinityCutsMapper.class);
        // NOTE(review): the class installed here rewrites keys from "i_j" to a
        // single row index, and the reducer can only parse the latter. Hadoop
        // documents combiners as an optional optimisation that may run zero or
        // more times on partial groups - confirm this wiring is intended.
        job.setCombinerClass(EigencutsAffinityCutsCombiner.class);
        job.setReducerClass(EigencutsAffinityCutsReducer.class);

        // NOTE(review): only cutMatrix is registered as an input; the affinity
        // matrix contributes nothing but its name (set on conf above). The
        // combiner takes (i, j, value) exclusively from affinity-tagged
        // vertices, so confirm whether the line below was disabled on purpose.
        //FileInputFormat.addInputPath(job, currentAffinity);
        FileInputFormat.addInputPath(job, cutMatrix);
        FileOutputFormat.setOutputPath(job, nextAffinity);

        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }

        return job.getCounters().findCounter(CUTSCOUNTER.NUM_CUTS).getValue();
    }

    /**
     * Emits one {@link VertexWritable} per element of each input row, keyed
     * by the string {@code "<max(row, col)>_<min(row, col)>"} so that cells
     * (i, j) and (j, i) share a key.
     */
    public static class EigencutsAffinityCutsMapper
            extends Mapper<IntWritable, VectorWritable, Text, VertexWritable> {

        @Override
        protected void map(IntWritable key, VectorWritable row, Context context)
                throws IOException, InterruptedException {

            // all this method does is construct a bunch of vertices, mapping those
            // together which have the same *combination* of indices; for example,
            // (1, 3) will have the same key as (3, 1) but a different key from (1, 1)
            // and (3, 3) (which, incidentally, will also not be grouped together)

            // NOTE(review): the tag is the name of the job's working directory,
            // whereas runjob stores the *input path* names for the combiner to
            // compare against. Presumably the tag should come from the record's
            // input split instead - confirm before relying on the tag.
            String type = context.getWorkingDirectory().getName();
            int rowIndex = key.get();
            Vector vector = row.get();
            for (Vector.Element e : vector) {
                int colIndex = e.index();
                String newkey = Math.max(rowIndex, colIndex) + "_" + Math.min(rowIndex, colIndex);
                context.write(new Text(newkey), new VertexWritable(rowIndex, colIndex, e.get(), type));
            }
        }
    }

    /**
     * Processes all vertices sharing one index-pair key and re-emits them
     * keyed by a single row index (as text).
     *
     * <p>Vertices whose type equals the configured
     * {@link EigencutsKeys#AFFINITY_PATH} supply the (row, col, value) triple;
     * any other vertex with a non-zero value marks the pair as cut. When a
     * group holds exactly two vertices it is treated as a diagonal entry and
     * passed through unchanged. Otherwise, if the pair is cut, the counter
     * {@link CUTSCOUNTER#NUM_CUTS} is incremented, zero-valued entries are
     * written for the off-diagonal cells and the value is written to both
     * diagonal cells; if it is not cut, the value is written to both
     * off-diagonal cells.</p>
     */
    public static class EigencutsAffinityCutsCombiner extends Reducer<Text, VertexWritable, Text, VertexWritable> {

        @Override
        protected void reduce(Text t, Iterable<VertexWritable> vertices, Context context)
                throws IOException, InterruptedException {
            // there should be exactly 4 items in the iterable; two from the
            // first Path source, and two from the second with matching (i, j) indices

            // the idea here is that we want the two vertices of the "cut" matrix,
            // and if either of them has a non-zero value, we want to:
            //
            // 1) zero out the two affinity vertices, and
            // 2) add their former values to the (i, i) and (j, j) coordinates
            //
            // though obviously we want to perform these steps in reverse order

            // the affinity tag is constant for the whole job: read it once
            // rather than once per vertex
            String affinityName = context.getConfiguration().get(EigencutsKeys.AFFINITY_PATH);
            log.debug("{}", t);
            boolean zero = false;
            int i = -1;
            int j = -1;
            double k = 0;
            int count = 0;
            for (VertexWritable v : vertices) {
                count++;
                if (v.getType().equals(affinityName)) {
                    i = v.getRow();
                    j = v.getCol();
                    k = v.getValue();
                } else if (v.getValue() != 0.0) {
                    zero = true;
                }
            }
            // if there are only two vertices, we have a diagonal
            // we want to preserve whatever is currently in the diagonal,
            // since this is acting as a running sum of all other values
            // that have been "cut" so far - simply return this element as is
            if (count == 2) {
                VertexWritable vw = new VertexWritable(i, j, k, UNUSED_TYPE);
                context.write(new Text(String.valueOf(i)), vw);
                return;
            }

            // do we zero out the values?
            VertexWritable outI = new VertexWritable();
            VertexWritable outJ = new VertexWritable();
            if (zero) {
                // increment the cut counter
                context.getCounter(CUTSCOUNTER.NUM_CUTS).increment(1);

                // we want the values to exist on the diagonal
                outI.setCol(i);
                outJ.setCol(j);

                // also, set the old values to zero
                VertexWritable zeroI = new VertexWritable();
                VertexWritable zeroJ = new VertexWritable();
                zeroI.setCol(j);
                zeroI.setValue(0);
                zeroJ.setCol(i);
                zeroJ.setValue(0);
                zeroI.setType(UNUSED_TYPE);
                zeroJ.setType(UNUSED_TYPE);
                context.write(new Text(String.valueOf(i)), zeroI);
                context.write(new Text(String.valueOf(j)), zeroJ);
            } else {
                outI.setCol(j);
                outJ.setCol(i);
            }

            // set the values and write them
            outI.setValue(k);
            outJ.setValue(k);
            outI.setType(UNUSED_TYPE);
            outJ.setType(UNUSED_TYPE);
            context.write(new Text(String.valueOf(i)), outI);
            context.write(new Text(String.valueOf(j)), outJ);
        }
    }

    /**
     * Assembles all vertices received for one row index into a sparse row
     * vector. Entries on the diagonal (column equal to the row index) are
     * summed; any other entry overwrites whatever was stored for its column.
     */
    public static class EigencutsAffinityCutsReducer
            extends Reducer<Text, VertexWritable, IntWritable, VectorWritable> {

        @Override
        protected void reduce(Text row, Iterable<VertexWritable> entries, Context context)
                throws IOException, InterruptedException {
            // now to assemble the vectors
            RandomAccessSparseVector output = new RandomAccessSparseVector(
                    context.getConfiguration().getInt(EigencutsKeys.AFFINITY_DIMENSIONS, Integer.MAX_VALUE), 100);

            // The key must be a plain row index as produced by the combiner;
            // a mapper-style "i_j" key cannot be parsed. Fail with the same
            // exception type as before, but say which key was rejected and why.
            final int rownum;
            try {
                rownum = Integer.parseInt(row.toString());
            } catch (NumberFormatException e) {
                NumberFormatException explained = new NumberFormatException(
                        "Expected an integer row index as reducer key but got '" + row
                                + "'; keys of the form i_j reach the reducer when "
                                + "EigencutsAffinityCutsCombiner has not been applied to them");
                explained.initCause(e);
                throw explained;
            }

            for (VertexWritable entry : entries) {
                int col = entry.getCol();
                // first, are we setting a diagonal?
                if (col == rownum) {
                    // add to what's already present
                    output.setQuick(col, output.getQuick(col) + entry.getValue());
                } else {
                    // simply set the value
                    output.setQuick(col, entry.getValue());
                }
            }
            context.write(new IntWritable(rownum), new VectorWritable(output));
        }
    }
}