Java tutorial: building a forward index of int document vectors with Ivory and Hadoop MapReduce

The source below is ivory.preprocess.BuildIntDocVectorsForwardIndex from the Ivory toolkit. It runs a MapReduce job over the SequenceFiles of IntDocVectors and writes a compact forward index: for every document id, the part file and byte offset at which its vector can be found.
/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.preprocess;

import ivory.data.IntDocVector;
import ivory.util.RetrievalEnvironment;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.log4j.Logger;

import edu.umd.cloud9.util.PowerTool;

/**
 * Builds a forward index over a collection of {@link IntDocVector}s: for each
 * document id, the index records which SequenceFile part holds the vector and
 * at what byte offset, packed together into a single long.
 */
@SuppressWarnings("deprecation")
public class BuildIntDocVectorsForwardIndex extends PowerTool {
  private static final Logger sLogger = Logger.getLogger(BuildIntDocVectorsForwardIndex.class);

  // Counter for the number of doc vectors processed.
  protected static enum Dictionary { Size };

  /**
   * Mapper, implemented as a MapRunnable so it can drive the raw RecordReader
   * and capture byte positions: for every doc vector, emits
   * (docno, "fileNo\tbytePos").
   */
  private static class MyMapRunner implements
      MapRunnable<IntWritable, IntDocVector, IntWritable, Text> {
    private String mInputFile;
    private Text outputValue = new Text();

    public void configure(JobConf job) {
      mInputFile = job.get("map.input.file");
    }

    public void run(RecordReader<IntWritable, IntDocVector> input,
        OutputCollector<IntWritable, Text> output, Reporter reporter) throws IOException {
      IntWritable key = input.createKey();
      IntDocVector value = input.createValue();

      // Extract the part number from the input file name, e.g. "part-00042" -> 42.
      int fileNo = Integer.parseInt(mInputFile.substring(mInputFile.lastIndexOf("-") + 1));

      // Capture the byte position *before* each read, so pos points at the
      // start of the record just consumed.
      long pos = input.getPos();
      while (input.next(key, value)) {
        outputValue.set(fileNo + "\t" + pos);
        output.collect(key, outputValue);
        reporter.incrCounter(Dictionary.Size, 1);

        pos = input.getPos();
      }
      sLogger.info("last docno: " + key + " (" + fileNo + ", " + pos + ")");
    }
  }

  // Multiplier for packing (fileNo, filePos) into one long; assumes no part
  // file is larger than 10^9 bytes.
  public static final long BigNumber = 1000000000;

  /**
   * Single reducer: receives (docno, "fileNo\tbytePos") pairs in ascending
   * docno order and writes one packed long per document to the index file.
   */
  private static class MyReducer extends MapReduceBase implements
      Reducer<IntWritable, Text, Text, Text> {
    FSDataOutputStream mOut;
    int mCollectionDocumentCount;
    int mCurDoc = 0;

    public void configure(JobConf job) {
      FileSystem fs;
      try {
        fs = FileSystem.get(job);
      } catch (Exception e) {
        throw new RuntimeException("Error opening the FileSystem!");
      }

      String indexPath = job.get("Ivory.IndexPath");

      RetrievalEnvironment env = null;
      try {
        env = new RetrievalEnvironment(indexPath, fs);
      } catch (IOException e) {
        throw new RuntimeException("Unable to create RetrievalEnvironment!");
      }

      boolean buildWeighted = job.getBoolean("Ivory.BuildWeighted", false);
      String forwardIndexPath = (buildWeighted ? env.getWeightedIntDocVectorsForwardIndex()
          : env.getIntDocVectorsForwardIndex());
      mCollectionDocumentCount = env.readCollectionDocumentCount();

      // Write the header: docno offset followed by the document count.
      try {
        mOut = fs.create(new Path(forwardIndexPath), true);
        mOut.writeInt(env.readDocnoOffset());
        mOut.writeInt(mCollectionDocumentCount);
      } catch (Exception e) {
        throw new RuntimeException("Error in creating files!");
      }
    }

    public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
      String[] s = values.next().toString().split("\\s+");

      // Each docno must appear exactly once across the collection.
      if (values.hasNext())
        throw new RuntimeException("There shouldn't be more than one value, key=" + key);

      int fileNo = Integer.parseInt(s[0]);
      long filePos = Long.parseLong(s[1]);

      // Pack part number and byte offset into a single long.
      long pos = BigNumber * fileNo + filePos;

      mCurDoc++;

      mOut.writeLong(pos);
    }

    public void close() throws IOException {
      mOut.close();

      if (mCurDoc != mCollectionDocumentCount) {
        throw new IOException("Expected " + mCollectionDocumentCount + " docs, actually got "
            + mCurDoc + " docs!");
      }
    }
  }

  public BuildIntDocVectorsForwardIndex(Configuration conf) {
    super(conf);
  }

  public static final String[] RequiredParameters = { "Ivory.IndexPath", "Ivory.NumMapTasks" };

  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);

    String collectionName = env.readCollectionName();
    boolean buildWeighted = conf.getBoolean("Ivory.BuildWeighted", false);

    sLogger.info("Tool: BuildIntDocVectorsForwardIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - BuildWeighted: " + buildWeighted);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    String intDocVectorsPath;
    String forwardIndexPath;
    if (buildWeighted) {
      intDocVectorsPath = env.getWeightedIntDocVectorsDirectory();
      forwardIndexPath = env.getWeightedIntDocVectorsForwardIndex();
    } else {
      intDocVectorsPath = env.getIntDocVectorsDirectory();
      forwardIndexPath = env.getIntDocVectorsForwardIndex();
    }

    if (!fs.exists(new Path(intDocVectorsPath))) {
      sLogger.info("Error: IntDocVectors don't exist!");
      return 0;
    }

    if (fs.exists(new Path(forwardIndexPath))) {
      sLogger.info("IntDocVectorIndex already exists: skipping!");
      return 0;
    }

    conf.setJobName("BuildIntDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(intDocVectorsPath);
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1); // a single reducer writes the index file sequentially

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class); // index is written directly, not via the OutputCollector

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
  }
}
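A note on the packed positions: the reducer combines the SequenceFile part number and the byte offset within that part into a single long via pos = BigNumber * fileNo + filePos, which only works if no part file grows beyond 10^9 bytes (just under 1 GB). Below is a minimal round-trip sketch of this scheme; the encode/decode helpers are illustrative and not part of Ivory.

// Illustrative only: round-trips the packed (fileNo, filePos) encoding
// used by BuildIntDocVectorsForwardIndex.
public class PackedPositionDemo {
  static final long BIG_NUMBER = 1000000000L; // same value as BigNumber above

  static long encode(int fileNo, long filePos) {
    return BIG_NUMBER * fileNo + filePos; // as written by MyReducer
  }

  static int fileNo(long pos) {
    return (int) (pos / BIG_NUMBER); // recover the part number
  }

  static long filePos(long pos) {
    return pos % BIG_NUMBER; // recover the byte offset within the part
  }

  public static void main(String[] args) {
    long pos = encode(42, 123456L);
    System.out.println(fileNo(pos) + ", " + filePos(pos)); // prints: 42, 123456
  }
}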
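The file the reducer produces has a simple layout: a header of two ints (the docno offset and the collection document count) followed by one packed long per document, written in ascending docno order by the single reducer. In Ivory proper a dedicated forward-index class consumes this file; the stand-alone reader below is a hypothetical sketch that assumes docnos are consecutive starting at docnoOffset + 1 (an assumption, not confirmed by this file).

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical reader for the forward-index file written above.
public class ForwardIndexReaderSketch {
  private static final long BIG_NUMBER = 1000000000L;
  private static final int HEADER_BYTES = 8; // two ints: docnoOffset, docCount

  private final FSDataInputStream in;
  private final int docnoOffset;
  private final int docCount;

  public ForwardIndexReaderSketch(FileSystem fs, Path indexFile) throws IOException {
    in = fs.open(indexFile);
    docnoOffset = in.readInt();
    docCount = in.readInt();
  }

  // Returns {fileNo, bytePos} for a docno; assumes docnos run from
  // docnoOffset + 1 through docnoOffset + docCount without gaps.
  public long[] locate(int docno) throws IOException {
    int entry = docno - docnoOffset - 1;
    if (entry < 0 || entry >= docCount) {
      throw new IllegalArgumentException("docno out of range: " + docno);
    }
    in.seek(HEADER_BYTES + 8L * entry);
    long pos = in.readLong();
    return new long[] { pos / BIG_NUMBER, pos % BIG_NUMBER };
  }
}

From there a caller would open the corresponding SequenceFile part (e.g. String.format("part-%05d", fileNo), assuming Hadoop's default part naming), seek to the returned byte offset, and deserialize the IntDocVector.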