org.apache.mahout.math.stats.entropy.Entropy.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.mahout.math.stats.entropy.Entropy.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.math.stats.entropy;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.math.VarIntWritable;

import java.io.IOException;
import java.util.List;
import java.util.Map;

/**
 * A Hadoop job to compute the entropy of keys or values in a {@link SequenceFile}. Format has to be {@link Text} for
 * key or value.
 * <p>
 * Command line options:
 * <ul>
 * <li>-i The input sequence file</li>
 * <li>-o The output sequence file</li>
 * <li>-s The source. Can be {@code key} or {@code value}. Default is {@code key}</li>
 * </ul>
 */
public final class Entropy extends AbstractJob {

    /** Location of the intermediate (item, count) output that is written by the first job and read by the second. */
    private Path tempPath;
    /** Number of input records seen by the first job; only meaningful after {@link #run(String[])}. */
    private long numberItems;
    /** Value of the {@code source} option; {@code "key"} selects the keys, anything else selects the values. */
    private String source;

    private static final String TEMP_FILE = "temp";
    /** Configuration key under which the total number of items is handed to the entropy calculation job. */
    static final String NUMBER_ITEMS_PARAM = "number.items";

    /**
     * Command line entry point; runs the job through {@link ToolRunner}.
     *
     * @param args the command line arguments
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        ToolRunner.run(new Entropy(), args);
    }

    /**
     * Returns the number of elements in the file. Only works after run.
     *
     * @return The number of processed items
     */
    public long getNumberItems() {
        return numberItems;
    }

    /**
     * Parses the arguments, then runs the counting job followed by the entropy job.
     *
     * @param args the command line arguments
     * @return {@code 0} on success, {@code -1} if the arguments could not be parsed (no job is started in that case)
     * @throws IOException            if a job cannot be set up or run
     * @throws ClassNotFoundException if a job class cannot be found
     * @throws InterruptedException   if waiting for a job is interrupted
     * @throws IllegalStateException  if one of the two jobs does not complete successfully
     */
    @Override
    public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Without parsed arguments neither source nor tempPath is set, so the jobs must not be started.
        if (!prepareArguments(args)) {
            return -1;
        }
        groupAndCount();
        calculateEntropy();

        // The return value is used as an exit code, so success has to be reported as 0.
        return 0;
    }

    /**
     * Registers the supported options, parses the arguments and stores the resulting settings.
     *
     * @param args the command line arguments
     * @return {@code true} if the arguments were parsed and the settings are initialised, {@code false} if
     *         {@code parseArguments} returned {@code null}
     * @throws IOException if the arguments cannot be processed
     */
    private boolean prepareArguments(String[] args) throws IOException {

        addInputOption();
        addOutputOption();
        addOption("source", "s",
                "Sets, if the entropy is calculated for the keys or the values. Can be <key> or <value>", "key");

        Map<String, List<String>> arguments = parseArguments(args);
        if (arguments == null) {
            // Presumably a parse error or a help request; nothing was configured.
            return false;
        }
        source = getOption("source");
        // The timestamp suffix keeps the intermediate directories of separate runs apart.
        tempPath = new Path(getTempPath(), TEMP_FILE + '-' + System.currentTimeMillis());
        return true;
    }

    /**
     * Groups the items and counts the occurrences of each of them.
     * SQL-like: SELECT item, COUNT(*) FROM x GROUP BY item
     * <p>
     * The result is written to the temporary path and the number of map input records is remembered as the total
     * number of items.
     *
     * @throws IOException            if the job cannot be set up or run
     * @throws ClassNotFoundException if a job class cannot be found
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws IllegalStateException  if the job does not complete successfully
     */
    private void groupAndCount() throws IOException, ClassNotFoundException, InterruptedException {

        // Any source other than "key" falls back to counting the values.
        Class<? extends Mapper> mapper = "key".equals(source) ? KeyCounterMapper.class : ValueCounterMapper.class;

        Job job = prepareJob(getInputPath(), tempPath, SequenceFileInputFormat.class, mapper, Text.class,
                VarIntWritable.class, VarIntSumReducer.class, Text.class, VarIntWritable.class,
                SequenceFileOutputFormat.class);
        // The reducer is reused as combiner to pre-aggregate the counts.
        job.setCombinerClass(VarIntSumReducer.class);
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }

        // The number of map input records is taken as the total number of items n.
        numberItems = job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS")
                .getValue();

    }

    /**
     * Calculates the entropy with
     * <pre>
     * H(X) = -sum_i(x_i/n * log_2(x_i/n))  WITH n = sum_i(x_i)
     *      = -sum_i(x_i/n * (log_2(x_i) - log_2(n)))
     *      = -sum_i(x_i/n * log_2(x_i)) + sum_i(x_i/n * log_2(n))
     *      = (n * log_2(n) - sum_i(x_i * log_2(x_i))) / n
     *      = log_2(n) - sum_i(x_i * log_2(x_i)) / n
     *      = (log(n) - sum_i(x_i * log(x_i)) / n) / log(2)
     * </pre>
     * The job reads the counts from the temporary path and receives n through {@link #NUMBER_ITEMS_PARAM}.
     *
     * @throws IOException            if the job cannot be set up or run
     * @throws ClassNotFoundException if a job class cannot be found
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws IllegalStateException  if the job does not complete successfully
     */
    private void calculateEntropy() throws IOException, ClassNotFoundException, InterruptedException {

        Job job = prepareJob(tempPath, getOutputPath(), SequenceFileInputFormat.class, CalculateEntropyMapper.class,
                NullWritable.class, DoubleWritable.class, CalculateEntropyReducer.class, NullWritable.class,
                DoubleWritable.class, SequenceFileOutputFormat.class);
        // Hand the total number of items, counted by the first job, to the tasks of this job.
        job.getConfiguration().set(NUMBER_ITEMS_PARAM, String.valueOf(numberItems));
        job.setCombinerClass(DoubleSumReducer.class);
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }

    }

}