org.apache.mahout.math.stats.entropy.ConditionalEntropy.java Source code

Introduction

Here is the source code for org.apache.mahout.math.stats.entropy.ConditionalEntropy.java, from the Apache Mahout project.
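
The class Javadoc in the listing describes it as a Hadoop job that computes the conditional entropy H(Value|Key) for a sequence file. The run() method does this in three chained MapReduce passes: groupAndCountByKeyAndValue() counts the (key, value) pairs, calculateSpecificConditionalEntropy() turns those counts into a per-key conditional entropy term, and calculateConditionalEntropy() sums the terms. In standard information-theoretic notation the quantity being computed is

    H(Value | Key) = \sum_{k} p(k) H(Value | Key = k) = -\sum_{k,v} p(k, v) \log p(v | k)

where the probabilities are estimated from the pair counts, normalized by the total number of input records; the log base used by the implementation determines the unit (bits for base 2, nats for the natural log).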

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.math.stats.entropy;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.math.VarIntWritable;

import java.io.IOException;

/**
 * A Hadoop job to compute the conditional entropy H(Value|Key) for a sequence file.
 * <ul>
 * <li>-i The input sequence file</li>
 * <li>-o The output sequence file</li>
 * </ul>
 */
public final class ConditionalEntropy extends AbstractJob {

    private long numberItems;

    private Path keyValueCountPath;
    private Path specificConditionalEntropyPath;

    private static final String KEY_VALUE_COUNT_FILE = "key_value_count";
    private static final String SPECIFIC_CONDITIONAL_ENTROPY_FILE = "specific_conditional_entropy";
    static final String NUMBER_ITEMS_PARAM = "items.number";

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new ConditionalEntropy(), args);
    }

    @Override
    public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        prepareArguments(args);
        groupAndCountByKeyAndValue();
        calculateSpecificConditionalEntropy();
        calculateConditionalEntropy();
        return 0;
    }

    /**
     * Prepares and sets the arguments.
     */
    private void prepareArguments(String[] args) throws IOException {
        addInputOption();
        addOutputOption();
        parseArguments(args);
        keyValueCountPath = new Path(getTempPath(), KEY_VALUE_COUNT_FILE + '-' + System.currentTimeMillis());
        specificConditionalEntropyPath = new Path(getTempPath(),
                SPECIFIC_CONDITIONAL_ENTROPY_FILE + '_' + System.currentTimeMillis());
    }

    /**
     * Groups and counts by key and value.
     * SQL-like: SELECT key, value, COUNT(*) FROM x GROUP BY key, value
     */
    private void groupAndCountByKeyAndValue() throws IOException, ClassNotFoundException, InterruptedException {

        Job job = prepareJob(getInputPath(), keyValueCountPath, SequenceFileInputFormat.class,
                GroupAndCountByKeyAndValueMapper.class, StringTuple.class, VarIntWritable.class,
                VarIntSumReducer.class, StringTuple.class, VarIntWritable.class, SequenceFileOutputFormat.class);
        job.setCombinerClass(VarIntSumReducer.class);
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }

        // Total number of (key, value) records seen by the mappers; the next job
        // uses it to normalize the counts into probabilities.
        numberItems = job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS")
                .getValue();

    }

    /**
     * Calculates the specific conditional entropy, i.e. the per-key term whose sum over all keys
     * gives H(Value|Key). Needs the total number of items for normalizing the counts.
     */
    private void calculateSpecificConditionalEntropy()
            throws IOException, ClassNotFoundException, InterruptedException {

        Job job = prepareJob(keyValueCountPath, specificConditionalEntropyPath, SequenceFileInputFormat.class,
                SpecificConditionalEntropyMapper.class, Text.class, VarIntWritable.class,
                SpecificConditionalEntropyReducer.class, Text.class, DoubleWritable.class,
                SequenceFileOutputFormat.class);
        job.getConfiguration().set(NUMBER_ITEMS_PARAM, String.valueOf(numberItems));
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }

    }

    /**
     * Sums the per-key specific conditional entropy terms to obtain H(Value|Key).
     * The output is written in the value; the key is NullWritable.
     */
    private void calculateConditionalEntropy() throws IOException, ClassNotFoundException, InterruptedException {

        Job job = prepareJob(specificConditionalEntropyPath, getOutputPath(), SequenceFileInputFormat.class,
                CalculateSpecificConditionalEntropyMapper.class, NullWritable.class, DoubleWritable.class,
                DoubleSumReducer.class, NullWritable.class, DoubleWritable.class, SequenceFileOutputFormat.class);
        job.setCombinerClass(DoubleSumReducer.class);
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }

    }

}
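
Usage

Below is a minimal driver sketch showing how the job could be launched programmatically. The HDFS paths are hypothetical placeholders, and the sketch assumes the Mahout job classes and the Hadoop configuration are available on the classpath; -i and -o are the options documented in the class Javadoc above.

import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.math.stats.entropy.ConditionalEntropy;

public class ConditionalEntropyDriver {

    public static void main(String[] args) throws Exception {
        // Hypothetical paths: replace with real sequence-file locations in HDFS.
        String[] jobArgs = {
            "-i", "/tmp/entropy/input",   // input sequence file of (key, value) pairs
            "-o", "/tmp/entropy/output"   // output sequence file holding H(Value|Key) in its value
        };
        // AbstractJob implements Hadoop's Tool interface, so ToolRunner parses the
        // generic Hadoop options and then invokes the job's run() method with the
        // remaining arguments.
        int exitCode = ToolRunner.run(new ConditionalEntropy(), jobArgs);
        System.exit(exitCode);
    }
}

The same -i and -o options could also be passed on the command line when the class is started directly through hadoop jar with a suitable Mahout job jar.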