cc.slda.DisplayTopic.java Source code

Java tutorial

Introduction

Here is the source code for cc.slda.DisplayTopic.java

Source

/*
 * Ivory: A Hadoop toolkit for Web-scale information retrieval
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0 
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package cc.slda;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.google.common.base.Preconditions;

import edu.umd.cloud9.io.map.HMapIDW;
import edu.umd.cloud9.io.pair.PairOfIntFloat;

public class DisplayTopic extends Configured implements Tool {
    public static final String TOP_DISPLAY_OPTION = "topdisplay";
    public static final int TOP_DISPLAY = 10;

    @SuppressWarnings("unchecked")
    public int run(String[] args) throws Exception {
        Options options = new Options();

        options.addOption(Settings.HELP_OPTION, false, "print the help message");
        options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg()
                .withDescription("input beta file").create(Settings.INPUT_OPTION));
        options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg()
                .withDescription("term index file").create(ParseCorpus.INDEX));
        options.addOption(OptionBuilder.withArgName(Settings.INTEGER_INDICATOR).hasArg()
                .withDescription("display top terms only (default - 10)").create(TOP_DISPLAY_OPTION));

        String betaString = null;
        String indexString = null;
        int topDisplay = TOP_DISPLAY;

        CommandLineParser parser = new GnuParser();
        HelpFormatter formatter = new HelpFormatter();
        try {
            CommandLine line = parser.parse(options, args);

            if (line.hasOption(Settings.HELP_OPTION)) {
                formatter.printHelp(ParseCorpus.class.getName(), options);
                System.exit(0);
            }

            if (line.hasOption(Settings.INPUT_OPTION)) {
                betaString = line.getOptionValue(Settings.INPUT_OPTION);
            } else {
                throw new ParseException("Parsing failed due to " + Settings.INPUT_OPTION + " not initialized...");
            }

            if (line.hasOption(ParseCorpus.INDEX)) {
                indexString = line.getOptionValue(ParseCorpus.INDEX);
            } else {
                throw new ParseException("Parsing failed due to " + ParseCorpus.INDEX + " not initialized...");
            }

            if (line.hasOption(TOP_DISPLAY_OPTION)) {
                topDisplay = Integer.parseInt(line.getOptionValue(TOP_DISPLAY_OPTION));
            }
        } catch (ParseException pe) {
            System.err.println(pe.getMessage());
            formatter.printHelp(ParseCorpus.class.getName(), options);
            System.exit(0);
        } catch (NumberFormatException nfe) {
            System.err.println(nfe.getMessage());
            System.exit(0);
        }

        JobConf conf = new JobConf(DisplayTopic.class);
        FileSystem fs = FileSystem.get(conf);

        Path indexPath = new Path(indexString);
        Preconditions.checkArgument(fs.exists(indexPath) && fs.isFile(indexPath), "Invalid index path...");

        Path betaPath = new Path(betaString);
        Preconditions.checkArgument(fs.exists(betaPath) && fs.isFile(betaPath), "Invalid beta path...");

        SequenceFile.Reader sequenceFileReader = null;
        try {
            IntWritable intWritable = new IntWritable();
            Text text = new Text();
            Map<Integer, String> termIndex = new HashMap<Integer, String>();
            sequenceFileReader = new SequenceFile.Reader(fs, indexPath, conf);
            while (sequenceFileReader.next(intWritable, text)) {
                termIndex.put(intWritable.get(), text.toString());
            }

            PairOfIntFloat pairOfIntFloat = new PairOfIntFloat();
            // HMapIFW hmap = new HMapIFW();
            HMapIDW hmap = new HMapIDW();
            TreeMap<Double, Integer> treeMap = new TreeMap<Double, Integer>();
            sequenceFileReader = new SequenceFile.Reader(fs, betaPath, conf);
            while (sequenceFileReader.next(pairOfIntFloat, hmap)) {
                treeMap.clear();

                System.out.println("==============================");
                System.out.println(
                        "Top ranked " + topDisplay + " terms for Topic " + pairOfIntFloat.getLeftElement());
                System.out.println("==============================");

                Iterator<Integer> itr1 = hmap.keySet().iterator();
                int temp1 = 0;
                while (itr1.hasNext()) {
                    temp1 = itr1.next();
                    treeMap.put(-hmap.get(temp1), temp1);
                    if (treeMap.size() > topDisplay) {
                        treeMap.remove(treeMap.lastKey());
                    }
                }

                Iterator<Double> itr2 = treeMap.keySet().iterator();
                double temp2 = 0;
                while (itr2.hasNext()) {
                    temp2 = itr2.next();
                    if (termIndex.containsKey(treeMap.get(temp2))) {
                        System.out.println(termIndex.get(treeMap.get(temp2)) + "\t\t" + -temp2);
                    } else {
                        System.out.println("How embarrassing! Term index not found...");
                    }
                }
            }
        } finally {
            IOUtils.closeStream(sequenceFileReader);
        }

        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new DisplayTopic(), args);
        System.exit(res);
    }
}