org.terrier.structures.indexing.CompressingMetaIndexBuilder.java Source code

Introduction

Here is the source code for org.terrier.structures.indexing.CompressingMetaIndexBuilder.java
Source

/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org/
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is CompressingMetaIndexBuilder.java
 *
 * The Original Code is Copyright (C) 2004-2014 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original contributor)
 */
package org.terrier.structures.indexing;

import gnu.trove.TObjectIntHashMap;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.Flushable;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.zip.Deflater;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.log4j.Logger;

import org.terrier.structures.Index;
import org.terrier.structures.CompressingMetaIndex.CompressingMetaIndexInputFormat;
import org.terrier.structures.IndexOnDisk;
import org.terrier.structures.collections.FSOrderedMapFile;
import org.terrier.structures.collections.FSOrderedMapFile.MapFileWriter;
import org.terrier.structures.collections.FSOrderedMapFile.MultiFSOMapWriter;
import org.terrier.structures.seralization.FixedSizeIntWritableFactory;
import org.terrier.structures.seralization.FixedSizeTextFactory;
import org.terrier.structures.seralization.FixedSizeWriteableFactory;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.Files;
import org.terrier.utility.MemoryChecker;
import org.terrier.utility.RuntimeMemoryChecker;
import org.terrier.utility.Wrapper;
import org.terrier.utility.io.HadoopPlugin;
import org.terrier.utility.io.HadoopUtility;

/** Creates a metaindex structure that compresses all values using Deflator. 
 * <b>Properties:</b>
 * <ul>
 * <li><tt>metaindex.compressed.max.data.in-mem.mb</tt> - maximum size that a meta index .zdata file will be kept in memory. Defaults to 400(mb). </li>
 * <li><tt>metaindex.compressed.max.index.in-mem.mb</tt> - maximum size that a meta index .zdata file will be kept in memory. Defaults to 100(mb).</li>
 * <li><tt>metaindex.compressed.reverse.allow.duplicates</tt> - set this property to true to suppress errors when a reverse meta value is not unique. Default false.</li>
 * <li><tt>metaindex.compressed.crop.long</tt> - set this property to suppress errors with overlong Document metadata, while will instead be cropped.</li>
 * @since 3.0
 * @author Craig Macdonald &amp; Vassilis Plachouras 
 */
@SuppressWarnings("deprecation")
public class CompressingMetaIndexBuilder extends MetaIndexBuilder implements Flushable {
    protected static final Logger logger = Logger.getLogger(CompressingMetaIndexBuilder.class);
    protected static final int MAX_MB_IN_MEM_RETRIEVAL = Integer
            .parseInt(ApplicationSetup.getProperty("metaindex.compressed.max.data.in-mem.mb", "400"));
    protected static final int MAX_INDEX_MB_IN_MEM_RETRIEVAL = Integer
            .parseInt(ApplicationSetup.getProperty("metaindex.compressed.max.index.in-mem.mb", "100"));
    protected static final boolean REVERSE_ALLOW_DUPS = Boolean
            .parseBoolean(ApplicationSetup.getProperty("metaindex.compressed.reverse.allow.duplicates", "false"));
    protected static final boolean CROP_LONG = Boolean
            .parseBoolean(ApplicationSetup.getProperty("metaindex.compressed.crop.long", "false"));

    protected static final int REVERSE_KEY_LOOKUP_WRITING_BUFFER_SIZE = 20000;
    protected static final int DOCS_PER_CHECK = ApplicationSetup.DOCS_CHECK_SINGLEPASS;
    protected static final int ZIP_COMPRESSION_LEVEL = 5;//TODO (auto)configure? 

    protected final TObjectIntHashMap<String> key2Index;
    protected DataOutputStream dataOutput = null;
    protected final String[] keyNames;
    protected final int keyCount;
    protected Deflater zip = new Deflater();
    protected ByteArrayOutputStream baos = new ByteArrayOutputStream();
    protected DataOutputStream indexOutput = null;
    protected byte[] compressedBuffer = new byte[1024];
    protected IndexOnDisk index;
    protected int[] valueLensChars;
    protected int[] valueLensBytes;

    protected byte[] spaces;
    protected int entryLengthBytes = 0;
    protected long currentOffset = 0;
    protected long currentIndexOffset = 0;
    protected int entryCount = 0;

    protected int[] forwardKeys;
    protected String[] forwardKeyNames;

    protected MapFileWriter[] forwardWriters;
    protected boolean[] forwardKeyValuesSorted;
    protected String[] lastValues;
    protected MemoryChecker memCheck = new RuntimeMemoryChecker();
    protected FixedSizeWriteableFactory<Text>[] keyFactories;
    protected String structureName;

    /**
     * constructor
     * @param _index
     * @param _keyNames
     * @param _valueLens
     * @param _forwardKeys
     */
    public CompressingMetaIndexBuilder(IndexOnDisk _index, String[] _keyNames, int[] _valueLens,
            String[] _forwardKeys) {
        this(_index, "meta", _keyNames, _valueLens, _forwardKeys);
    }

    /**
     * constructor
     * @param _index
     * @param _structureName
     * @param _keyNames
     * @param _valueLens
     * @param _forwardKeys
     */
    @SuppressWarnings("unchecked")
    public CompressingMetaIndexBuilder(IndexOnDisk _index, String _structureName, String[] _keyNames,
            int[] _valueLens, String[] _forwardKeys) {
        this.index = _index;
        this.structureName = _structureName;
        this.keyNames = _keyNames;
        this.valueLensChars = _valueLens;
        if (this.keyNames.length != this.valueLensChars.length)
            throw new IllegalArgumentException(
                    "CompressingMetaIndexBuilder configuration incorrect: number of keys and number of value lengths are unequal: "
                            + Arrays.toString(keyNames) + " vs " + Arrays.toString(_valueLens));
        this.key2Index = new TObjectIntHashMap<String>(keyNames.length);
        this.keyCount = keyNames.length;
        for (int i = 0; i < keyCount; i++)
            this.key2Index.put(keyNames[i], i);
        logger.debug("Initialising CompressingMetaIndexBuilder");
        try {
            this.dataOutput = new DataOutputStream(Files
                    .writeFileStream(_index.getPath() + "/" + _index.getPrefix() + "." + structureName + ".zdata"));
            this.indexOutput = new DataOutputStream(Files
                    .writeFileStream(_index.getPath() + "/" + _index.getPrefix() + "." + structureName + ".idx"));
        } catch (IOException ioe) {
            throw new IllegalArgumentException(ioe);
        }

        this.zip.setLevel(ZIP_COMPRESSION_LEVEL);

        if (_forwardKeys.length == 1 && _forwardKeys[0].length() == 0)
            _forwardKeys = new String[0];

        this.forwardKeyNames = _forwardKeys;
        this.forwardKeys = new int[_forwardKeys.length];
        int i = 0;
        for (String fwdKey : _forwardKeys) {
            if (!key2Index.contains(fwdKey))
                throw new IllegalArgumentException("Reverse key " + fwdKey
                        + " must also be a forward meta index key. Add it to indexer.meta.forward.keys");
            forwardKeys[i++] = key2Index.get(fwdKey);
        }

        this.forwardWriters = new MultiFSOMapWriter[forwardKeys.length];
        this.keyFactories = new FixedSizeWriteableFactory[forwardKeys.length];
        this.forwardKeyValuesSorted = new boolean[forwardKeys.length];
        this.lastValues = new String[forwardKeys.length];

        for (i = 0; i < forwardKeys.length; i++) {
            forwardWriters[i] = new MultiFSOMapWriter(
                    _index.getPath() + "/" + _index.getPrefix() + "." + structureName + "-" + i
                            + FSOrderedMapFile.USUAL_EXTENSION,
                    REVERSE_KEY_LOOKUP_WRITING_BUFFER_SIZE,
                    keyFactories[i] = new FixedSizeTextFactory(valueLensChars[forwardKeys[i]]),
                    new FixedSizeIntWritableFactory(), REVERSE_ALLOW_DUPS);
            forwardKeyValuesSorted[i] = true;
        }

        this.valueLensBytes = new int[keyNames.length];
        assert (this.valueLensBytes.length > 0);
        for (i = 0; i < keyNames.length; i++) {
            this.valueLensBytes[i] = FixedSizeTextFactory.getMaximumTextLength(this.valueLensChars[i]);
            this.entryLengthBytes += this.valueLensBytes[i];
        }
        this.spaces = new byte[entryLengthBytes];//for padding
    }

    /** {@inheritDoc} */
    @Override
    public void writeDocumentEntry(Map<String, String> data) throws IOException {
        String[] values = new String[keyCount];
        int i = 0;
        for (String keyName : keyNames) {
            values[i++] = data.get(keyName);
        }
        writeDocumentEntry(values);
    }

    /** {@inheritDoc} */
    @Override
    public void writeDocumentEntry(String[] data) throws IOException {
        int i = 0;
        for (String value : data) {
            if (value == null)
                value = "";
            else if (value.length() > valueLensChars[i])
                if (CROP_LONG)
                    value = value.substring(0, valueLensChars[i] - 1);
                else
                    throw new IllegalArgumentException("Data (" + value + ") of string length " + value.length()
                            + " for key " + keyNames[i] + " exceeds max string length of " + valueLensChars[i]
                            + "(byte length of " + valueLensBytes[i]
                            + "). Crop in the Document, increase indexer.meta.forward.keylens, or set metaindex.compressed.crop.long");

            final byte[] b = Text.encode(value).array();
            final int numberOfBytesToWrite = b.length;
            if (numberOfBytesToWrite > valueLensBytes[i])
                throw new IllegalArgumentException("Data (" + value + ") of byte length " + numberOfBytesToWrite
                        + " for key " + keyNames[i] + " exceeds max byte length of " + valueLensBytes[i]
                        + "(string length of " + valueLensChars[i]
                        + "). Crop in the Document, or increase indexer.meta.forward.keylens");
            baos.write(b);
            if (numberOfBytesToWrite < valueLensBytes[i])
                baos.write(spaces, 0, valueLensBytes[i] - numberOfBytesToWrite);
            i++;
        }
        zip.reset();
        zip.setInput(baos.toByteArray());
        zip.finish();
        baos.reset();
        indexOutput.writeLong(currentOffset);
        currentIndexOffset += 8;
        int compressedEntrySize = 0;
        while (!zip.finished()) {
            final int numOfCompressedBytes = zip.deflate(compressedBuffer);
            dataOutput.write(compressedBuffer, 0, numOfCompressedBytes);
            compressedEntrySize += numOfCompressedBytes;
        }
        currentOffset += compressedEntrySize;
        for (i = 0; i < forwardKeys.length; i++) {
            Text key = keyFactories[i].newInstance();
            key.set(data[forwardKeys[i]]);
            IntWritable value = new IntWritable();
            value.set(entryCount);
            forwardWriters[i].write(key, value);
            if (lastValues[i] != null && data[forwardKeys[i]].compareTo(lastValues[i]) < 1)
                forwardKeyValuesSorted[i] = false;
            lastValues[i] = data[forwardKeys[i]];
        }
        entryCount++;

        //check for low memory, and flush if necessary
        if (entryCount % DOCS_PER_CHECK == 0 && memCheck.checkMemory()) {
            flush();
            memCheck.reset();
        }
    }

    /** 
     * {@inheritDoc} 
     */
    public void flush() throws IOException {
        //logger.info("CompressingMetaIndexBuilder flush");
        for (MapFileWriter w : forwardWriters)
            ((Flushable) w).flush();

    }

    /** 
     * {@inheritDoc} 
     */
    public void close() throws IOException {
        dataOutput.close();
        indexOutput.close();
        index.addIndexStructure(structureName, "org.terrier.structures.CompressingMetaIndex",
                "org.terrier.structures.IndexOnDisk,java.lang.String", "index,structureName");
        index.addIndexStructureInputStream(structureName, "org.terrier.structures.CompressingMetaIndex$InputStream",
                "org.terrier.structures.IndexOnDisk,java.lang.String", "index,structureName");
        index.setIndexProperty("index." + structureName + ".entries", "" + entryCount);
        index.setIndexProperty("index." + structureName + ".compression-level", "" + ZIP_COMPRESSION_LEVEL);
        index.setIndexProperty("index." + structureName + ".key-names", ArrayUtils.join(keyNames, ","));
        index.setIndexProperty("index." + structureName + ".value-lengths", ArrayUtils.join(valueLensChars, ","));
        index.setIndexProperty("index." + structureName + ".entry-length", "" + entryLengthBytes);
        index.setIndexProperty("index." + structureName + ".data-source",
                currentOffset > MAX_MB_IN_MEM_RETRIEVAL * (long) 1024 * (long) 1024 ? "file" : "fileinmem");
        index.setIndexProperty("index." + structureName + ".index-source",
                currentIndexOffset > MAX_INDEX_MB_IN_MEM_RETRIEVAL * (long) 1024 * (long) 1024 ? "file"
                        : "fileinmem");
        //TODO emit warnings
        index.flush();

        for (int i = 0; i < forwardKeys.length; i++) {
            if (forwardKeyValuesSorted[i]) {
                logger.info("Key " + forwardKeyNames[i]
                        + " values are sorted in meta index, consider binary searching zdata file");
                forwardWriters[i].close();
            } else {
                forwardWriters[i].close();
            }
        }
        index.setIndexProperty("index." + structureName + ".reverse-key-names",
                ArrayUtils.join(forwardKeyNames, ","));
        index.flush();

    }

    public static void main(String[] args) throws Exception {
        if (args.length == 0) {
            System.err.println("Usage: -Dterrier.index.path=hdfs://path/to/index "
                    + CompressingMetaIndexBuilder.class.getName() + " docno");
            return;
        }
        Index.setIndexLoadingProfileAsRetrieval(false);
        IndexOnDisk index = Index.createIndex();
        reverseAsMapReduceJob(index, "meta", args);
    }

    /**
     * reverseAsMapReduceJob
     * @param index
     * @param structureName
     * @param keys
     * @throws Exception
     */
    public static void reverseAsMapReduceJob(IndexOnDisk index, String structureName, String[] keys)
            throws Exception {
        final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("TerrierIndexingMeta");
        if (jf == null)
            throw new Exception("Could not get JobFactory from HadoopPlugin");
        reverseAsMapReduceJob(index, structureName, keys, jf);
        jf.close();
    }

    /**
     * reverseAsMapReduceJob
     * @param index
     * @param structureName
     * @param keys
     * @param jf
     * @throws Exception
     */
    //@SuppressWarnings("deprecation")
    public static void reverseAsMapReduceJob(IndexOnDisk index, String structureName, String[] keys,
            HadoopPlugin.JobFactory jf) throws Exception {
        long time = System.currentTimeMillis();
        final JobConf conf = jf.newJob();
        conf.setJobName("Reverse MetaIndex");
        conf.setMapOutputKeyClass(KeyValueTuple.class);
        conf.setMapOutputValueClass(IntWritable.class);
        conf.setMapperClass(MapperReducer.class);
        conf.setReducerClass(MapperReducer.class);
        conf.setNumReduceTasks(keys.length);
        conf.setPartitionerClass(KeyedPartitioner.class);
        conf.setInputFormat(CompressingMetaIndexInputFormat.class);
        conf.setReduceSpeculativeExecution(false);
        conf.set("MetaIndexInputStreamRecordReader.structureName", structureName);
        conf.setInt("CompressingMetaIndexBuilder.reverse.keyCount", keys.length);
        conf.set("CompressingMetaIndexBuilder.reverse.keys", ArrayUtils.join(keys, ","));
        conf.set("CompressingMetaIndexBuilder.forward.valueLengths",
                index.getIndexProperty("index." + structureName + ".value-lengths", ""));
        conf.set("CompressingMetaIndexBuilder.forward.keys",
                index.getIndexProperty("index." + structureName + ".key-names", ""));
        FileOutputFormat.setOutputPath(conf, new Path(index.getPath()));
        HadoopUtility.toHConfiguration(index, conf);

        conf.setOutputFormat(NullOutputFormat.class);
        try {
            RunningJob rj = JobClient.runJob(conf);
            rj.getID();
            HadoopUtility.finishTerrierJob(conf);
        } catch (Exception e) {
            throw new Exception("Problem running job to reverse metadata", e);
        }
        //only update the index from the controlling process, so that we dont have locking/concurrency issues
        index.setIndexProperty("index." + structureName + ".reverse-key-names", ArrayUtils.join(keys, ","));
        index.flush();
        logger.info("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    }

    /**
     * class KeyedPartitioner
     *
     */
    //@SuppressWarnings("deprecation")
    public static class KeyedPartitioner implements Partitioner<KeyValueTuple, IntWritable> {
        protected int keyCount;
        protected TObjectIntHashMap<String> key2reverseOffset = null;

        /** 
         * {@inheritDoc} 
         */
        public int getPartition(KeyValueTuple kv, IntWritable docid, int numReducers) {
            if (numReducers == 1)
                return 0;
            final String key = kv.getKeyName();
            final int keyIndex = key2reverseOffset.get(key);
            return keyIndex % numReducers;
        }

        /** 
         * {@inheritDoc} 
         */
        public void configure(JobConf jc) {
            keyCount = jc.getInt("CompressingMetaIndexBuilder.reverse.keyCount", 0);
            key2reverseOffset = new TObjectIntHashMap<String>();
            String[] keys = jc.get("CompressingMetaIndexBuilder.reverse.keys", "").split("\\s*,\\s*");
            int i = 0;
            for (String k : keys) {
                key2reverseOffset.put(k, i++);
            }
        }

    }

    static class KeyValueTuple implements WritableComparable<KeyValueTuple> {
        String k;
        String v;

        public KeyValueTuple(String key, String value) {
            k = key;
            v = value;
        }

        public KeyValueTuple() {
        }

        public String getKeyName() {
            return k;
        }

        public String getValue() {
            return v;
        }

        public void readFields(DataInput in) throws IOException {
            k = in.readUTF();
            v = in.readUTF();
        }

        public void write(DataOutput out) throws IOException {
            out.writeUTF(k);
            out.writeUTF(v);
        }

        public int compareTo(KeyValueTuple o) {
            final int rtr = k.compareTo(o.getKeyName());
            if (rtr != 0)
                return rtr;
            return v.compareTo(o.getValue());
        }

        @Override
        public boolean equals(Object other) {
            if (!(other instanceof KeyValueTuple))
                return false;
            return this.compareTo((KeyValueTuple) other) == 0;
        }

        @Override
        public int hashCode() {
            return k.hashCode() + v.hashCode();
        }
    }

    static class MapperReducer extends
            HadoopUtility.MapReduceBase<IntWritable, Wrapper<String[]>, KeyValueTuple, IntWritable, Object, Object> {
        String[] reverseKeyNames;
        int[] reverseKeyIndices;
        int reverseKeyCount;

        @Override
        protected void configureMap() throws IOException {
            reverseKeyCount = jc.getInt("CompressingMetaIndexBuilder.reverse.keyCount", 0);
            reverseKeyNames = jc.get("CompressingMetaIndexBuilder.reverse.keys", "").split("\\s*,\\s*");
            final TObjectIntHashMap<String> key2forwardOffset = new TObjectIntHashMap<String>(reverseKeyCount);
            final String[] forwardKeyNames = jc.get("CompressingMetaIndexBuilder.forward.keys", "")
                    .split("\\s*,\\s*");
            int i = 0;
            for (String k : forwardKeyNames) {
                key2forwardOffset.put(k, i++);
            }
            reverseKeyIndices = new int[reverseKeyNames.length];
            i = 0;
            for (String k : reverseKeyNames) {
                reverseKeyIndices[i] = key2forwardOffset.get(k);
            }
        }

        public void map(IntWritable docid, Wrapper<String[]> _metadata,
                OutputCollector<KeyValueTuple, IntWritable> collector, Reporter reporter) throws IOException {
            String[] metadata = _metadata.getObject();
            reporter.setStatus("Processing metadata for document " + docid.get());
            for (int i = 0; i < reverseKeyCount; i++) {
                collector.collect(new KeyValueTuple(reverseKeyNames[i], metadata[i]), docid);
            }
            reporter.progress();
        }

        @Override
        protected void closeMap() throws IOException {
        }

        String currentReducingKey = null;
        MapFileWriter currentReducingOutput;
        IndexOnDisk index;
        Path reduceTaskFileDestinations;
        TObjectIntHashMap<String> key2reverseOffset = null;
        TObjectIntHashMap<String> key2valuelength = null;
        FixedSizeWriteableFactory<Text> keyFactory;
        int duplicateKeyCount = 0;
        int currentKeyTupleCount = 0;

        @Override
        protected void configureReduce() throws IOException {
            Index.setIndexLoadingProfileAsRetrieval(false);
            index = HadoopUtility.fromHConfiguration(jc);
            reduceTaskFileDestinations = FileOutputFormat.getWorkOutputPath(jc);
            Files.mkdir(reduceTaskFileDestinations.toString());

            String structureName = jc.get("MetaIndexInputStreamRecordReader.structureName", "");
            reverseKeyCount = jc.getInt("CompressingMetaIndexBuilder.reverse.keyCount", 0);
            reverseKeyNames = jc.get("CompressingMetaIndexBuilder.reverse.keys", "").split("\\s*,\\s*");
            key2reverseOffset = new TObjectIntHashMap<String>(reverseKeyCount);
            int i = 0;
            for (String k : reverseKeyNames) {
                key2reverseOffset.put(k, i++);
            }
            key2valuelength = new TObjectIntHashMap<String>(reverseKeyCount);
            final String[] allKeys = index.getIndexProperty("index." + structureName + ".key-names", "")
                    .split("\\s*,\\s*");
            final String[] allValueLens = index.getIndexProperty("index." + structureName + ".value-lengths", "")
                    .split("\\s*,\\s*");
            i = 0;
            for (String k : allKeys) {
                logger.debug("Key " + k + " value length=" + allValueLens[i]);
                key2valuelength.put(k, Integer.parseInt(allValueLens[i++]));
            }
        }

        /** Reduce function. Input Key: (meta Key name, meta Key value) Value: list of matching docids. */
        public void reduce(KeyValueTuple metaTuple, Iterator<IntWritable> docids,
                OutputCollector<Object, Object> arg2, Reporter reporter) throws IOException {
            if (currentReducingKey == null || !metaTuple.getKeyName().equals(currentReducingKey)) {
                if (currentReducingKey != null) {
                    logger.info("currentKey was " + currentReducingKey + " (" + currentKeyTupleCount
                            + " entries) new Key is " + metaTuple.getKeyName() + " : force closed");
                    currentReducingOutput.close();
                    if (duplicateKeyCount > 0) {
                        logger.warn("MetaIndex key " + currentReducingKey + " had " + duplicateKeyCount
                                + " distinct values with duplicated associated document ids");
                    }
                    currentReducingOutput = null;
                }
                currentKeyTupleCount = 0;
                duplicateKeyCount = 0;
                currentReducingKey = metaTuple.getKeyName();
                currentReducingOutput = openMapFileWriter(currentReducingKey);
                logger.info("Opening new MapFileWriter for key " + currentReducingKey);
            }
            final IntWritable docid = docids.next();
            final Text key = keyFactory.newInstance();
            key.set(metaTuple.getValue());
            currentReducingOutput.write(key, docid);
            currentKeyTupleCount++;
            int extraCount = 0;
            while (docids.hasNext()) {
                docids.next();
                extraCount++;
            }
            reporter.progress();
            if (extraCount > 0) {
                //logger.warn("Key "+currentReducingKey + " value "+ metaTuple.getValue() + " had "+ extraCount +" extra documents. First document selected.");
                duplicateKeyCount++;
            }
            reporter.setStatus("Reducing metadata value " + metaTuple.getValue());
        }

        @Override
        protected void closeReduce() throws IOException {
            if (currentKeyTupleCount > 0) {
                logger.info("Finished reducing for " + currentReducingKey + ", with " + currentKeyTupleCount
                        + " entries");
            }
            if (duplicateKeyCount > 0) {
                logger.warn("MetaIndex key " + currentReducingKey + " had " + duplicateKeyCount
                        + " distinct values with duplicated associated document ids");
            }
            if (currentReducingOutput != null)
                currentReducingOutput.close();
        }

        /* open a MapFileWriter for the specified key. This will automatically promoted to the index folder when the job is finished */
        protected MapFileWriter openMapFileWriter(String keyName) throws IOException {
            final int metaKeyIndex = key2reverseOffset.get(keyName);
            final int valueLength = key2valuelength.get(keyName);
            keyFactory = new FixedSizeTextFactory(valueLength);
            logger.info("Opening MapFileWriter for key " + keyName + " - index " + metaKeyIndex);
            return FSOrderedMapFile.mapFileWrite(reduceTaskFileDestinations.toString() /*index.getPath()*/
                    + "/" + ((IndexOnDisk) index).getPrefix() + "."
                    + jc.get("MetaIndexInputStreamRecordReader.structureName") + "-" + metaKeyIndex
                    + FSOrderedMapFile.USUAL_EXTENSION);
            //
            /*return new MultiFSOMapWriter(
                  reduceTaskFileDestinations.toString()
              + "/" + index.getPrefix() + "."
              + jc.get("MetaIndexInputStreamRecordReader.structureName")
              + "-"+metaKeyIndex+FSOrderedMapFile.USUAL_EXTENSION, 
                  REVERSE_KEY_LOOKUP_WRITING_BUFFER_SIZE, 
                  keyFactory = new FixedSizeTextFactory(valueLength), 
                  new FixedSizeIntWritableFactory()
               );*/
        }
    }

}