ivory.core.index.BuildIntPostingsForwardIndex.java Source code

Introduction

Here is the source code for ivory.core.index.BuildIntPostingsForwardIndex.java
Source

/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.core.index;

import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.document.IntDocVector;
import ivory.core.data.index.IntPostingsForwardIndex;
import ivory.core.preprocess.PositionalSequenceFileRecordReader;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.log4j.Logger;

import edu.umd.cloud9.util.PowerTool;

public class BuildIntPostingsForwardIndex extends PowerTool {
    private static final Logger LOG = Logger.getLogger(BuildIntPostingsForwardIndex.class);

    protected static enum Dictionary {
        Size
    };

    private static class MyMapper extends Mapper<IntWritable, IntDocVector, IntWritable, Text> {
        @Override
        public void run(Context context) throws IOException, InterruptedException {
            String file = ((FileSplit) context.getInputSplit()).getPath().getName();
            LOG.info("Input file: " + file);

            IntWritable key = new IntWritable();
            Text outputValue = new Text();

            PositionalSequenceFileRecordReader<IntWritable, IntDocVector> reader = new PositionalSequenceFileRecordReader<IntWritable, IntDocVector>();
            reader.initialize(context.getInputSplit(), context);

            int fileNo = Integer.parseInt(file.substring(file.lastIndexOf("-") + 1));
            long pos = reader.getPosition();
            while (reader.nextKeyValue()) {
                key = reader.getCurrentKey();
                outputValue.set(fileNo + "\t" + pos);

                context.write(key, outputValue);
                context.getCounter(Dictionary.Size).increment(1);

                pos = reader.getPosition();
            }
            reader.close();
        }
    }

    private static class MyReducer extends Reducer<IntWritable, Text, Text, Text> {
        private String positionsFile;
        private int collectionTermCount;
        private FSDataOutputStream out;
        private int curKeyIndex = 0;

        @Override
        public void setup(Context context) {
            Configuration conf = context.getConfiguration();
            FileSystem fs;
            try {
                fs = FileSystem.get(conf);
            } catch (Exception e) {
                throw new RuntimeException("Error opening the FileSystem!");
            }

            String indexPath = conf.get(Constants.IndexPath);

            RetrievalEnvironment env = null;
            try {
                env = new RetrievalEnvironment(indexPath, fs);
            } catch (IOException e) {
                throw new RuntimeException("Unable to create RetrievalEnvironment!");
            }

            positionsFile = env.getPostingsIndexData();
            collectionTermCount = env.readCollectionTermCount();

            LOG.info("Ivory.PostingsPositionsFile: " + positionsFile);
            LOG.info("Ivory.CollectionTermCount: " + collectionTermCount);

            try {
                out = fs.create(new Path(positionsFile), true);
                out.writeInt(collectionTermCount);
            } catch (Exception e) {
                throw new RuntimeException("Error in creating files!");
            }
        }

        @Override
        public void reduce(IntWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            Iterator<Text> iter = values.iterator();
            String[] s = iter.next().toString().split("\\s+");

            if (iter.hasNext()) {
                throw new RuntimeException("There shouldn't be more than one value, key=" + key);
            }

            int fileNo = Integer.parseInt(s[0]);
            long filePos = Long.parseLong(s[1]);
            long pos = IntPostingsForwardIndex.BigNumber * fileNo + filePos;

            curKeyIndex++;

            // This is subtle point: Ivory.CollectionTermCount specifies the number of terms in the
            // collection, computed by BuildTermIdMap. However, this number is sometimes greater than the
            // number of postings that are in the index (as is the case for ClueWeb09). Here's what
            // happens: when creating TermDocVectors, the vocabulary gets populated with tokens that
            // contain special symbols. The vocabulary then gets compressed with front-coding. However,
            // for whatever reason, the current implementation cannot properly handle these special
            // characters. So when we convert from TermDocVectors to IntDocVectors, we can't find the term
            // id for these special tokens. As a result, no postings list get created for them. So there
            // are assigned term ids for which there are no postings. Since IntPostingsForwardIndex
            // assumes consecutive term ids (since it loads position offsets into an array), we must
            // insert "padding".
            // - Jimmy, 5/29/2010

            while (curKeyIndex < key.get()) {
                out.writeLong(-1);
                curKeyIndex++;
            }

            out.writeLong(pos);
        }

        @Override
        public void cleanup(Context context) throws IOException {
            // Insert padding at the end.
            while (curKeyIndex < collectionTermCount) {
                out.writeLong(-1);
                curKeyIndex++;
            }

            out.close();

            if (curKeyIndex != collectionTermCount) {
                throw new IOException(String.format("Expected %d terms, actually got %d terms!",
                        collectionTermCount, curKeyIndex));
            }
        }
    }

    public BuildIntPostingsForwardIndex(Configuration conf) {
        super(conf);
    }

    public static final String[] RequiredParameters = { Constants.IndexPath };

    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    public int runTool() throws Exception {
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);

        int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
        String indexPath = conf.get(Constants.IndexPath);

        RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
        String collectionName = env.readCollectionName();

        LOG.info("Tool: " + BuildIntPostingsForwardIndex.class.getCanonicalName());
        LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
        LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));

        Job job = new Job(getConf(), BuildIntPostingsForwardIndex.class.getSimpleName() + ":" + collectionName);
        job.setJarByClass(BuildIntPostingsForwardIndex.class);

        Path inputPath = new Path(env.getPostingsDirectory());
        FileInputFormat.setInputPaths(job, inputPath);

        Path postingsIndexPath = new Path(env.getPostingsIndexData());

        if (fs.exists(postingsIndexPath)) {
            LOG.info("Postings forward index path already exists!");
            return 0;
        }
        job.setNumReduceTasks(1);

        conf.setInt("mapred.min.split.size", minSplitSize);
        conf.set("mapred.child.java.opts", "-Xmx2048m");

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(NullOutputFormat.class);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        job.waitForCompletion(true);

        return 0;
    }
}