org.terrier.structures.indexing.singlepass.hadoop.BitPostingIndexInputFormat.java Source code

Introduction

Here is the source code for org.terrier.structures.indexing.singlepass.hadoop.BitPostingIndexInputFormat.java, part of the Terrier IR platform. The class is a Hadoop MapReduce InputFormat for a BitPostingIndex: it splits the main posting file into InputSplits according to the block size of the underlying file, so the number of entries (posting lists) per split can vary.
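
Before the full listing, here is a minimal driver-side sketch of wiring the class into a job, modelled on the --splits branch of the main method at the end of the listing. It is an illustrative sketch, not a definitive recipe: the index location /path/to/index, the prefix data, and the class name SplitExample are placeholder assumptions.

import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.terrier.structures.Index;
import org.terrier.structures.IndexOnDisk;
import org.terrier.structures.indexing.singlepass.hadoop.BitPostingIndexInputFormat;
import org.terrier.utility.io.HadoopPlugin;
import org.terrier.utility.io.HadoopUtility;

public class SplitExample {
    public static void main(String[] args) throws Exception {
        // load index structures with the lightweight profile, as getSplits() itself does
        Index.setIndexLoadingProfileAsRetrieval(false);
        // "/path/to/index" and "data" are placeholders for a real on-disk index
        IndexOnDisk index = Index.createIndex("/path/to/index", "data");

        JobConf job = HadoopPlugin.getJobFactory("SplitExample").newJob();
        // record the index location in the job configuration
        HadoopUtility.toHConfiguration(index, job);
        // split the "inverted" structure; the "lexicon" stream supplies the BitIndexPointers
        BitPostingIndexInputFormat.setStructures(job, "inverted", "lexicon");
        index.close();

        InputSplit[] splits = new BitPostingIndexInputFormat().getSplits(job, 100);
        System.out.println(splits.length + " splits");
    }
}

Note that the numSplits hint (100 here) is effectively ignored: as the listing shows, getSplits derives split boundaries from the file system block size of the posting file, not from the requested split count.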

Source

/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org/
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is BitPostingIndexInputFormat.java
 *
 * The Original Code is Copyright (C) 2004-2014 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original contributor)
 */
package org.terrier.structures.indexing.singlepass.hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Logger;
import org.terrier.structures.BitIndexPointer;
import org.terrier.structures.DocumentIndex;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.Index;
import org.terrier.structures.IndexOnDisk;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.bit.BitPostingIndexInputStream;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.Wrapper.IntObjectWrapper;
import org.terrier.utility.io.HadoopPlugin;
import org.terrier.utility.io.HadoopUtility;

/** An InputFormat, i.e. a MapReduce input reader, for a BitPostingIndex. Splits the main posting
 * file into generic InputSplits according to the block size of the underlying file - hence the 
 * number of entries (posting lists), and indeed of postings, can vary per split. 
 * The following JobConf properties are used:
 * <ul>
 * <li><tt>mapred.index.path</tt> and <tt>mapred.index.prefix</tt> - where to find the index.</li>
 * <li><tt>mapred.bitpostingindex.structure</tt> - which structure are we splitting?</li>
 * <li><tt>mapred.bitpostingindex.lookup.structure</tt> - which structure's input stream is the Iterator of BitIndexPointers?</li>
 * </ul>
 */
@SuppressWarnings("deprecation")
public class BitPostingIndexInputFormat extends FileInputFormat<IntWritable, IntObjectWrapper<IterablePosting>> {

    final static Logger logger = Logger.getLogger(BitPostingIndexInputFormat.class);
    final static String BITPOSTING_STRUCTURE_KEY = "mapred.bitpostingindex.structure";
    final static String BITPOSTING_LOOKUP_STRUCTURE_KEY = "mapred.bitpostingindex.lookup.structure";

    final static boolean REPLACE_DOCUMENT_INDEX = true;

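    /** A stub DocumentIndex that knows only the number of documents.
     * When REPLACE_DOCUMENT_INDEX is set, getRecordReader() forces this stub in
     * place of the real "document" structure, presumably to spare each map task
     * the cost of loading the full document index. */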
    static class NullDocumentIndex implements DocumentIndex {
        int docs;

        public NullDocumentIndex(int numDocs) {
            this.docs = numDocs;
        }

        @Override
        public DocumentIndexEntry getDocumentEntry(int docid) throws IOException {
            return null;
        }

        @Override
        public int getDocumentLength(int docid) throws IOException {
            return 0;
        }

        @Override
        public int getNumberOfDocuments() {
            return docs;
        }

    }

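    /** A FileSplit that additionally records which entries (i.e. posting lists) of the
     * bit structure fall within its byte range: the index of the first entry and the
     * number of entries, both serialised as VInts alongside the FileSplit fields. */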
    static class BitPostingIndexInputSplit extends FileSplit {
        /** start entry of split */
        int startingEntryIndex;
        /** number of entries in split */
        int entryCount;

        /** Constructor for a split of a BitPosting structures, 
         * where the start and number of entries are specified */
        public BitPostingIndexInputSplit(Path file, long start, long length, String[] hosts,
                int _startingEntryIndex, int _entryCount) {
            super(file, start, length, hosts);
            startingEntryIndex = _startingEntryIndex;
            entryCount = _entryCount;
            logger.debug(
                    "new BitPostingIndexInputSplit: start at " + startingEntryIndex + " entries " + _entryCount);
        }

        /** default constructor, for serialization */
        public BitPostingIndexInputSplit() {
            super(null, (long) 0, (long) 0, new String[0]);
        }

        /** Start entry of the split */
        public int getStartingEntryIndex() {
            return startingEntryIndex;
        }

        /** Number of entries in split */
        public int getEntryCount() {
            return entryCount;
        }

        @Override
        public String toString() {
            return super.toString() + ", " + entryCount + " entries starting at " + startingEntryIndex;
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            super.readFields(in);
            startingEntryIndex = WritableUtils.readVInt(in);
            entryCount = WritableUtils.readVInt(in);
        }

        @Override
        public void write(DataOutput out) throws IOException {
            super.write(out);
            WritableUtils.writeVInt(out, startingEntryIndex);
            WritableUtils.writeVInt(out, entryCount);
        }
    }

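    /** A RecordReader that walks the split's portion of a BitPostingIndexInputStream,
     * emitting each entry's id as the key, and its posting list wrapped together with
     * the number of postings as the value. */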
    static class BitPostingIndexRecordReader
            implements RecordReader<IntWritable, IntObjectWrapper<IterablePosting>> {
        /** id of first entry */
        int startingEntryIndex;
        /** id of entry we're currently at */
        int entryIndex = 0;
        /** number of entries in our split */
        int entryCount = 0;
        /** actual posting stream */
        BitPostingIndexInputStream postingStream;

        BitPostingIndexRecordReader(BitPostingIndexInputStream _postingStream, int _entryIndex, int _entryCount) {
            this.postingStream = _postingStream;
            this.startingEntryIndex = this.entryIndex = _entryIndex;
            this.entryCount = _entryCount;
            logger.info("new BitPostingIndexRecordReader: start at index " + entryIndex + " process " + _entryCount
                    + " entries");
        }

        public void close() throws IOException {
            this.postingStream.close();
            logger.info("BitPostingIndexRecordReader: closing: started at " + startingEntryIndex + " now, at "
                    + entryIndex);
        }

        public IntWritable createKey() {
            return new IntWritable();
        }

        public IntObjectWrapper<IterablePosting> createValue() {
            return new IntObjectWrapper<IterablePosting>();
        }

        public long getPos() throws IOException {
            return postingStream.getPos().getOffset();
        }

        public float getProgress() throws IOException {
            /* TODO: could we calculate progress in terms of bytes of the target structure, as this
             * would be more accurate than entries */
            //progress can be greater than 1, because of trailing empty entries
            final float progress = (float) (entryIndex - startingEntryIndex) / (float) entryCount;
            return progress > 1.0f ? 1.0f : progress;
        }

        public boolean next(IntWritable docid, IntObjectWrapper<IterablePosting> wrapperPostingList)
                throws IOException {
            //check if entryCount entries have been read
            //count can be greater than entry count due to entry skipping
            if ((entryIndex - startingEntryIndex) >= entryCount)
                return false;
            if (!postingStream.hasNext())
                return false;
            IterablePosting rtr = postingStream.next();

            //System.err.println("skipped=" + postingStream.getEntriesSkipped());
            entryIndex += postingStream.getEntriesSkipped();

            if (rtr == null) {
                entryIndex++;
                //this entry should be trailing
                logger.warn("No posting list for trailing entry " + entryIndex);
                return next(docid, wrapperPostingList); //TODO recursion is BAD
            }
            docid.set(entryIndex++);
            wrapperPostingList.setObject(rtr);
            wrapperPostingList.setInt(postingStream.getNumberOfCurrentPostings());
            return true;
        }
    }

    /** Get a record reader for the specified split */
    public RecordReader<IntWritable, IntObjectWrapper<IterablePosting>> getRecordReader(final InputSplit _split,
            final JobConf job, final Reporter reporter) throws IOException {
        HadoopUtility.loadTerrierJob(job);
        final BitPostingIndexInputSplit split = (BitPostingIndexInputSplit) _split;
        Index.setIndexLoadingProfileAsRetrieval(false);
        final IndexOnDisk index = HadoopUtility.fromHConfiguration(job);
        if (index == null)
            throw new IOException("Index not found in JobConf:" + Index.getLastIndexLoadError());
        if (REPLACE_DOCUMENT_INDEX)
            IndexUtil.forceStructure(index, "document",
                    new NullDocumentIndex(index.getCollectionStatistics().getNumberOfDocuments()));
        final String bitPostingStructureName = job.get(BITPOSTING_STRUCTURE_KEY);

        final BitPostingIndexInputStream postingStream = (BitPostingIndexInputStream) index
                .getIndexStructureInputStream(bitPostingStructureName);
        postingStream.skip(split.getStartingEntryIndex());
        logger.info("BitPostingIndexRecordReader for structure " + bitPostingStructureName + " start entry "
                + split.getStartingEntryIndex() + " split size " + split.getEntryCount());
        return new BitPostingIndexRecordReader(postingStream, split.getStartingEntryIndex(), split.getEntryCount());
    }

    /** Returns the block size of the specified file. Intended to be overridden only for testing. */
    protected long getBlockSize(Path path, FileStatus fss) {
        return fss.getBlockSize();
    }

    /** 
     * {@inheritDoc} 
     * Makes the splits of the index structure. Bit structures split across multiple files are supported.
     */
    @SuppressWarnings("unchecked")
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        HadoopUtility.loadTerrierJob(job);

        final String lookupStructureName = job.get(BITPOSTING_LOOKUP_STRUCTURE_KEY);
        final String bitPostingStructureName = job.get(BITPOSTING_STRUCTURE_KEY);
        Index.setIndexLoadingProfileAsRetrieval(false);
        final IndexOnDisk index = HadoopUtility.fromHConfiguration(job);

        final byte fileCount = Byte
                .parseByte(index.getIndexProperty("index." + bitPostingStructureName + ".data-files", "1"));
        final Path bitPostingStructureFiles[] = new Path[fileCount];
        final FileStatus[] fss = new FileStatus[fileCount];
        final long[] bitPostingStructureFSBlockSizes = new long[fileCount];

        logger.info("Calculating splits of structure " + bitPostingStructureName);
        FileSystem fs = FileSystem.get(job);
        for (byte i = 0; i < fileCount; i++) {
            bitPostingStructureFiles[i] = new Path(
                    BitPostingIndexInputStream.getFilename(index, bitPostingStructureName, fileCount, i));
            fss[i] = fs.getFileStatus(bitPostingStructureFiles[i]);
            bitPostingStructureFSBlockSizes[i] = getBlockSize(bitPostingStructureFiles[i], fss[i]);
            logger.info("File " + i + " approx splits="
                    + ((double) fss[i].getLen() / (double) bitPostingStructureFSBlockSizes[i]));
        }

        //this smells of a hack, because we don't have a strategy for naming the streams of the various index structures
        final Iterator<? extends BitIndexPointer> offsetIterator = index
                .hasIndexStructureInputStream(lookupStructureName + "-entry")
                        ? (Iterator<? extends BitIndexPointer>) index
                                .getIndexStructureInputStream(lookupStructureName + "-entry")
                        : (Iterator<? extends BitIndexPointer>) index
                                .getIndexStructureInputStream(lookupStructureName);

        if (offsetIterator == null)
            throw new IOException("No such stream structure called " + lookupStructureName + "-entry or "
                    + lookupStructureName + " found in index");
        final List<InputSplit> splitList = new ArrayList<InputSplit>();

        int currentId = 0;

        //size of the current split of each file
        final long[] blockSize = new long[fileCount];
        //location of the last split for each file
        final long[] bitPostingStructureSplitEndOffsets = new long[fileCount];

        //how many entries will be in this split, for each file
        final int[] entriesInBlock = new int[fileCount];
        //what is the starting id of the next entry split, for each file
        final int[] firstEntryOfNextSplit = new int[fileCount];

        //number of splits per file, for logging only
        final int[] splitsPerFile = new int[fileCount];

        Arrays.fill(firstEntryOfNextSplit, Integer.MAX_VALUE);

        BitIndexPointer currentPointer = null;
        //iterate through the lookup iterator
        //split the target bit posting index structure into chunks of size bitPostingStructureFSBlockSize
        while (offsetIterator.hasNext()) {
            //ok, where is the next pointer to
            currentPointer = offsetIterator.next();
            final byte fileId = currentPointer.getFileNumber();

            //what is the first entry of the next split of this file?
            firstEntryOfNextSplit[fileId] = Math.min(currentId, firstEntryOfNextSplit[fileId]);
            //this split will have one more entry
            entriesInBlock[fileId]++;

            //what is our current offset?
            long offset = currentPointer.getOffset();
            //System.err.println("Offset" + offset);
            //if we made the split here, how big would it be?
            blockSize[fileId] = offset - bitPostingStructureSplitEndOffsets[fileId];
            //is this block large enough?
            if (blockSize[fileId] > bitPostingStructureFSBlockSizes[fileId]) {
                //yes, it's big enough
                //block will be from bitPostingStructureSplitEndOffsets[fileId] to offset, which is blockSize[fileId]
                BlockLocation[] blkLocations = fs.getFileBlockLocations(fss[fileId],
                        bitPostingStructureSplitEndOffsets[fileId], blockSize[fileId]);
                splitList.add(new BitPostingIndexInputSplit(bitPostingStructureFiles[fileId], //path
                        bitPostingStructureSplitEndOffsets[fileId], //start
                        blockSize[fileId], //length
                        blkLocations[0].getHosts(), //hosts
                        firstEntryOfNextSplit[fileId], //first entry in this split
                        entriesInBlock[fileId]) //number of entries in this split
                );
                logger.info("File " + fileId + " split " + (splitList.size() - 1) + " "
                        + splitList.get(splitList.size() - 1).toString());
                //record another split for this file (for logging only)
                splitsPerFile[fileId]++;
                //update recording of last offset for this file
                bitPostingStructureSplitEndOffsets[fileId] = offset;
                //reset size of split for this file
                blockSize[fileId] = 0;
                //reset counter of entries in split of this file
                entriesInBlock[fileId] = 0;
                //reset the first offset of this split
                firstEntryOfNextSplit[fileId] = Integer.MAX_VALUE;
            }

            //ids always increment
            currentId++;
        }
        IndexUtil.close(offsetIterator);
        //find any files which have trailing blocks
        for (byte fileId = 0; fileId < fileCount; fileId++) {
            if (entriesInBlock[fileId] == 0)
                continue;
            assert (firstEntryOfNextSplit[fileId] != Integer.MAX_VALUE);

            //block will be from bitPostingStructureSplitEndOffsets[fileId], with length blockSize[fileId]
            BlockLocation[] blkLocations = fs.getFileBlockLocations(fss[fileId],
                    bitPostingStructureSplitEndOffsets[fileId], blockSize[fileId]);
            splitList.add(new BitPostingIndexInputSplit(bitPostingStructureFiles[fileId], //path of file for split
                    bitPostingStructureSplitEndOffsets[fileId], //start offset of this split
                    blockSize[fileId], //size of this split
                    blkLocations[0].getHosts(), //hosts for this split
                    firstEntryOfNextSplit[fileId], //first entry id for this split
                    entriesInBlock[fileId]) //number of entries in this split
            );
            logger.info("File " + fileId + " trailing split " + (splitList.size() - 1) + " "
                    + splitList.get(splitList.size() - 1).toString());

            //record another split for this file (for logging only)
            splitsPerFile[fileId]++;
        }

        logger.info("Split " + bitPostingStructureName + " (of " + currentId + " entries) into " + splitList.size()
                + " splits");
        if (fileCount > 1) {
            logger.info("Multiple files of " + bitPostingStructureName + " were split as follows: "
                    + ArrayUtils.join(splitsPerFile, ","));
        }
        assert (splitList.size() > 0);
        index.close();
        return splitList.toArray(new InputSplit[splitList.size()]);
    }

    /** Checks to see if required keys are present */
    public void validateInput(JobConf job) throws IOException {
        for (String k : new String[] { BITPOSTING_LOOKUP_STRUCTURE_KEY, BITPOSTING_STRUCTURE_KEY }) {
            if (job.get(k, null) == null)
                throw new IOException("Required key " + k + " not defined in job");
        }
    }

    /** Provides the starting entry id for the specified split */
    public static int getSplit_StartingEntryIndex(InputSplit s) {
        return ((BitPostingIndexInputSplit) s).getStartingEntryIndex();
    }

    /** Returns the number of entries in specified split */
    public static int getSplit_EntryCount(InputSplit s) {
        return ((BitPostingIndexInputSplit) s).getEntryCount();
    }

    /** Saves in the JobConf the names of the bit structure and the pointer lookup structure that this InputFormat should use */
    public static void setStructures(JobConf jc, String bitStructureName, String lookupStructureName) {
        jc.setInputFormat(BitPostingIndexInputFormat.class);
        jc.set(BITPOSTING_STRUCTURE_KEY, bitStructureName);
        jc.set(BITPOSTING_LOOKUP_STRUCTURE_KEY, lookupStructureName);
    }

    /** Test method: computes splits for the inverted/lexicon structures of the index named on the command line, or reads back a single specified split */
    public static void main(String[] args) throws Exception {
        Index.setIndexLoadingProfileAsRetrieval(false);
        IndexOnDisk index = Index.createIndex(args[1], args[2]);
        if (args[0].equals("--splits")) {
            JobConf job = HadoopPlugin.getJobFactory(BitPostingIndexInputFormat.class.getSimpleName()).newJob();
            HadoopUtility.toHConfiguration(index, job);
            setStructures(job, "inverted", "lexicon");
            index.close();
            new BitPostingIndexInputFormat().getSplits(job, 100);
        } else {
            JobConf job = HadoopPlugin.getJobFactory(BitPostingIndexInputFormat.class.getSimpleName()).newJob();
            setStructures(job, "linksin", "linksin-lookup");
            HadoopUtility.toHConfiguration(index, job);
            index.close();
            InputSplit s = new BitPostingIndexInputSplit(new Path(args[3]), Long.parseLong(args[4]),
                    Long.parseLong(args[5]), new String[0], Integer.parseInt(args[6]), Integer.parseInt(args[7]));
            RecordReader<IntWritable, IntObjectWrapper<IterablePosting>> rr = new BitPostingIndexInputFormat()
                    .getRecordReader(s, job, new Reporter() {
                        public InputSplit getInputSplit() throws UnsupportedOperationException {
                            return null;
                        }

                        @SuppressWarnings({ "rawtypes" })
                        public void incrCounter(Enum arg0, long arg1) {
                        }

                        public void incrCounter(String arg0, String arg1, long arg2) {
                        }

                        @SuppressWarnings({ "rawtypes" })
                        public org.apache.hadoop.mapred.Counters.Counter getCounter(Enum arg0) {
                            return null;
                        }

                        public org.apache.hadoop.mapred.Counters.Counter getCounter(String arg0, String arg1) {
                            return null;
                        }

                        public void setStatus(String arg0) {
                        }

                        public void progress() {
                        }
                    });
            IntWritable key = rr.createKey();
            IntObjectWrapper<IterablePosting> value = rr.createValue();
            long pointers = 0;
            int lastId = 0;
            int nonZeroEntryCount = 0;
            float maxProgress = 0;
            while (rr.next(key, value)) {
                IterablePosting ip = value.getObject();
                lastId = key.get();
                while (ip.next() != IterablePosting.EOL) {
                    pointers++;
                }
                nonZeroEntryCount++;
                if (rr.getProgress() > maxProgress)
                    maxProgress = rr.getProgress();
            }
            rr.close();
            System.out.println("maxProgress=" + maxProgress + " Lastid=" + lastId + " nonZeroEntryCount="
                    + nonZeroEntryCount + " postings=" + pointers);
        }
    }

}
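
Example: reading a split

For completeness, here is a hedged sketch of reading back one split with the record reader, following the second branch of the main method above. Because BitPostingIndexInputSplit is package-private, such code would have to live in the org.terrier.structures.indexing.singlepass.hadoop package. The index location, the structure names, the file name data.linksin.bf, and the split geometry (byte range, entry range) are all placeholder assumptions; Reporter.NULL stands in for the anonymous no-op Reporter used in the listing.

package org.terrier.structures.indexing.singlepass.hadoop;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.terrier.structures.Index;
import org.terrier.structures.IndexOnDisk;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.utility.Wrapper.IntObjectWrapper;
import org.terrier.utility.io.HadoopPlugin;
import org.terrier.utility.io.HadoopUtility;

public class ReadSplitExample {
    public static void main(String[] args) throws Exception {
        Index.setIndexLoadingProfileAsRetrieval(false);
        // placeholder index location and prefix
        IndexOnDisk index = Index.createIndex("/path/to/index", "data");
        JobConf job = HadoopPlugin.getJobFactory("ReadSplitExample").newJob();
        BitPostingIndexInputFormat.setStructures(job, "linksin", "linksin-lookup");
        HadoopUtility.toHConfiguration(index, job);
        index.close();

        // one split: bytes [0, 1024) of a placeholder posting file, covering entries [0, 100)
        InputSplit s = new BitPostingIndexInputFormat.BitPostingIndexInputSplit(
                new Path("/path/to/index/data.linksin.bf"), 0L, 1024L, new String[0], 0, 100);
        RecordReader<IntWritable, IntObjectWrapper<IterablePosting>> rr =
                new BitPostingIndexInputFormat().getRecordReader(s, job, Reporter.NULL);

        IntWritable key = rr.createKey();
        IntObjectWrapper<IterablePosting> value = rr.createValue();
        long pointers = 0;
        while (rr.next(key, value)) {
            // key holds the entry id; value wraps the posting list and its length
            IterablePosting ip = value.getObject();
            while (ip.next() != IterablePosting.EOL)
                pointers++; // count the postings in this entry's list
        }
        rr.close();
        System.out.println("postings=" + pointers);
    }
}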