ivory.preprocess.GetTermCount.java Source code

Introduction

Here is the source code for ivory.preprocess.GetTermCount.java
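
This class implements a Hadoop MapReduce job from the Ivory toolkit that scans a collection of term document vectors and computes, for each term, its document frequency (df) and collection frequency (cf). The mapper emits a (1, tf) pair for every term in a document; a combiner and reducer sum these into per-term (df, cf) totals, and the reducer discards terms whose df falls outside the configured [MinDf, MaxDf] range. The job also tallies the number of documents, the number of surviving terms, and the total collection length via counters.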

Source

/*
 * Ivory: A Hadoop toolkit for Web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.preprocess;

import ivory.data.TermDocVector;
import ivory.util.Constants;
import ivory.util.RetrievalEnvironment;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import edu.umd.cloud9.io.pair.PairOfIntLong;
import edu.umd.cloud9.util.PowerTool;

@SuppressWarnings("deprecation")
public class GetTermCount extends PowerTool {
    private static final Logger sLogger = Logger.getLogger(GetTermCount.class);

    // Hadoop counters for global collection statistics
    protected enum Statistics {
        Docs, Terms, SumOfDocLengths
    }

    // Mapper: for every term in a document, emits (term, (1, tf)), where tf is
    // the term's frequency in that document
    private static class MyMapper extends MapReduceBase
            implements Mapper<IntWritable, TermDocVector, Text, PairOfIntLong> {

        // reusable output objects, to avoid allocating a new object per emit
        private static Text sTerm = new Text();
        private static PairOfIntLong sPair = new PairOfIntLong();

        public void configure(JobConf job) {
            sLogger.setLevel(Level.WARN);
        }

        public void map(IntWritable key, TermDocVector doc, OutputCollector<Text, PairOfIntLong> output,
                Reporter reporter) throws IOException {

            TermDocVector.Reader r = doc.getReader();
            reporter.setStatus("d" + key.get());

            int dl = 0; // document length, i.e., the sum of term frequencies
            int tf;
            while (r.hasMoreTerms()) {
                sTerm.set(r.nextTerm());
                tf = r.getTf();
                dl += tf;
                // emit (term, (1, tf)): this document contributes 1 to the
                // term's df and tf occurrences to its cf
                sPair.set(1, tf);
                output.collect(sTerm, sPair);
            }

            // global counters, aggregated by the framework across all map tasks
            reporter.incrCounter(Statistics.Docs, 1);
            reporter.incrCounter(Statistics.SumOfDocLengths, dl);
        }
    }

    // Combiner: pre-aggregates partial (df, cf) counts on the map side to cut
    // down the volume of intermediate data shuffled to the reducers
    private static class MyCombiner extends MapReduceBase
            implements Reducer<Text, PairOfIntLong, Text, PairOfIntLong> {

        private static PairOfIntLong sPair = new PairOfIntLong();

        public void reduce(Text key, Iterator<PairOfIntLong> values, OutputCollector<Text, PairOfIntLong> output,
                Reporter reporter) throws IOException {
            int df = 0;
            long cf = 0;
            // read each partial pair into a local variable rather than
            // overwriting the reusable output object
            while (values.hasNext()) {
                PairOfIntLong pair = values.next();
                df += pair.getLeftElement();
                cf += pair.getRightElement();
            }

            sPair.set(df, cf);
            output.collect(key, sPair);
        }
    }

    // Reducer: sums the (df, cf) counts for each term, applies the df cutoffs,
    // and emits the surviving (term, (df, cf)) pairs
    private static class MyReducer extends MapReduceBase
            implements Reducer<Text, PairOfIntLong, Text, PairOfIntLong> {

        private int minDf, maxDf;

        public void configure(JobConf job) {
            // document-frequency cutoffs, read from the job configuration
            minDf = job.getInt("Ivory.MinDf", 2);
            maxDf = job.getInt("Ivory.MaxDf", Integer.MAX_VALUE);
        }

        // reusable output object
        private PairOfIntLong dfcf = new PairOfIntLong();

        public void reduce(Text key, Iterator<PairOfIntLong> values, OutputCollector<Text, PairOfIntLong> output,
                Reporter reporter) throws IOException {
            int df = 0;
            long cf = 0;
            while (values.hasNext()) {
                PairOfIntLong pair = values.next();
                df += pair.getLeftElement();
                cf += pair.getRightElement();
            }

            // discard terms whose document frequency falls outside [minDf, maxDf]
            if (df < minDf || df > maxDf) {
                return;
            }

            reporter.incrCounter(Statistics.Terms, 1);
            dfcf.set(df, cf);
            output.collect(key, dfcf);
        }
    }

    public static final String[] RequiredParameters = { Constants.NumMapTasks, Constants.CollectionName,
            Constants.IndexPath, Constants.MinDf, Constants.MaxDf };

    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    public GetTermCount(Configuration conf) {
        super(conf);
    }

    public int runTool() throws Exception {
        // create a new JobConf, inheriting from the configuration of this
        // PowerTool
        JobConf conf = new JobConf(getConf(), GetTermCount.class);
        FileSystem fs = FileSystem.get(conf);

        String indexPath = conf.get(Constants.IndexPath);
        RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

        int mapTasks = conf.getInt(Constants.NumMapTasks, 0);
        int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);

        String collectionName = env.readCollectionName();
        String termDocVectorsPath = env.getTermDocVectorsDirectory();
        String termDfCfPath = env.getTermDfCfDirectory();

        if (!fs.exists(new Path(indexPath))) {
            sLogger.info("index path doesn't exist: skipping!");
            return 0;
        }

        sLogger.info("PowerTool: GetTermCount");
        sLogger.info(" - CollectionName: " + collectionName);
        sLogger.info(" - NumMapTasks: " + mapTasks);
        sLogger.info(" - NumReduceTasks: " + reduceTasks);
        sLogger.info(" - MinDf: " + conf.getInt(Constants.MinDf, 0));
        sLogger.info(" - MaxDf: " + conf.getInt(Constants.MaxDf, Integer.MAX_VALUE));

        Path outputPath = new Path(termDfCfPath);
        if (fs.exists(outputPath)) {
            sLogger.error("TermDfCf directory already exists: skipping!");
            return 0;
        }

        conf.setJobName("GetTermCount:" + collectionName);

        conf.setNumMapTasks(mapTasks);
        conf.setNumReduceTasks(reduceTasks);
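        // give each task's child JVM a 2 GB heap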
        conf.set("mapred.child.java.opts", "-Xmx2048m");

        FileInputFormat.setInputPaths(conf, new Path(termDocVectorsPath));
        FileOutputFormat.setOutputPath(conf, outputPath);

        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(PairOfIntLong.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(PairOfIntLong.class);

        conf.setMapperClass(MyMapper.class);
        conf.setCombinerClass(MyCombiner.class);
        conf.setReducerClass(MyReducer.class);

        long startTime = System.currentTimeMillis();
        RunningJob job = JobClient.runJob(conf);
        sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

        Counters counters = job.getCounters();

        // write out the number of unique terms in the collection; note that
        // this is not the same as the number of postings, since terms can be
        // discarded (e.g., non-English terms, or as a result of the df cutoffs)
        int collectionTermCount = (int) counters.findCounter(Statistics.Terms).getCounter();
        env.writeCollectionTermCount(collectionTermCount);

        long collectionLength = counters.findCounter(Statistics.SumOfDocLengths).getCounter();
        env.writeCollectionLength(collectionLength);
        return 0;
    }
}
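
Example usage

For context, here is a minimal driver sketch showing how this tool might be invoked. It assumes that edu.umd.cloud9.util.PowerTool exposes a run() method that validates the keys returned by getRequiredParameters() before calling runTool(); the class name GetTermCountDriver and the collection name, index path, and task counts below are placeholders, not values from the original source.

import ivory.preprocess.GetTermCount;
import ivory.util.Constants;

import org.apache.hadoop.conf.Configuration;

public class GetTermCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // placeholder values; substitute your own collection and index path
        conf.set(Constants.CollectionName, "example-collection");
        conf.set(Constants.IndexPath, "/path/to/index");
        conf.setInt(Constants.NumMapTasks, 10);
        conf.setInt(Constants.NumReduceTasks, 10);
        conf.setInt(Constants.MinDf, 2);
        conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);

        // assumes PowerTool exposes run(), which checks the required
        // parameters and then dispatches to runTool()
        new GetTermCount(conf).run();
    }
}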