Java tutorial
package it.unimi.di.big.mg4j.tool;

/*
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2006-2015 Sebastiano Vigna
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.di.big.mg4j.index.BitStreamHPIndexWriter;
import it.unimi.di.big.mg4j.index.BitStreamIndex;
import it.unimi.di.big.mg4j.index.BitStreamIndexWriter;
import it.unimi.di.big.mg4j.index.CachingOutputBitStream;
import it.unimi.di.big.mg4j.index.CompressionFlags;
import it.unimi.di.big.mg4j.index.CompressionFlags.Coding;
import it.unimi.di.big.mg4j.index.CompressionFlags.Component;
import it.unimi.di.big.mg4j.index.DiskBasedIndex;
import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.index.IndexIterator;
import it.unimi.di.big.mg4j.index.IndexReader;
import it.unimi.di.big.mg4j.index.IndexWriter;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter;
import it.unimi.di.big.mg4j.index.SkipBitStreamIndexWriter;
import it.unimi.di.big.mg4j.index.cluster.ContiguousDocumentalStrategy;
import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
import it.unimi.di.big.mg4j.index.cluster.DocumentalConcatenatedCluster;
import it.unimi.di.big.mg4j.index.cluster.DocumentalMergedCluster;
import it.unimi.di.big.mg4j.index.cluster.DocumentalPartitioningStrategy;
import it.unimi.di.big.mg4j.index.cluster.DocumentalStrategies;
import it.unimi.di.big.mg4j.index.cluster.IndexCluster;
import it.unimi.di.big.mg4j.index.payload.Payload;
import it.unimi.di.big.mg4j.io.IOFactory;
import it.unimi.di.big.mg4j.tool.Combine.IndexType;
import it.unimi.dsi.Util;
import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.fastutil.ints.IntBigList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.BloomFilter;
import it.unimi.dsi.util.ImmutableExternalPrefixMap;
import it.unimi.dsi.util.PrefixMap;
import it.unimi.dsi.util.Properties;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.util.StringMap;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.lang.reflect.InvocationTargetException;
import java.net.URISyntaxException;
import java.nio.ByteOrder;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.ConfigurationMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

/** Partitions an index documentally.
 *
 * <p>A global index is partitioned documentally by providing a {@link DocumentalPartitioningStrategy}
 * that specifies a destination local index for each document, and a local document pointer. The global index
 * is scanned, and the postings are partitioned among the local indices using the provided strategy. For instance,
 * a {@link ContiguousDocumentalStrategy} divides an index into blocks of contiguous documents.
 *
 * <p>Since each local index contains a (proper) subset of the original set of documents, it contains in general a (proper)
 * subset of the terms in the global index. Thus, the local term numbers and the global term numbers will not in general coincide.
 * As a result, when a set of local indices is accessed transparently as a single index
 * using a {@link it.unimi.di.big.mg4j.index.cluster.DocumentalCluster},
 * a call to {@link it.unimi.di.big.mg4j.index.Index#documents(long)} will throw an {@link java.lang.UnsupportedOperationException},
 * because there is no way to map the global term numbers to local term numbers.
 *
 * <p>On the other hand, a call to {@link it.unimi.di.big.mg4j.index.Index#documents(CharSequence)} will be passed to each local index to
 * build a global iterator. To speed up this phase for not-so-frequent terms, when partitioning an index you can require
 * the construction of {@linkplain BloomFilter Bloom filters} that will be used to try to avoid
 * querying indices that do not contain a term. The precision of the filters is settable.
 *
 * <p>The property file will use a {@link it.unimi.di.big.mg4j.index.cluster.DocumentalMergedCluster} unless you provide
 * a {@link ContiguousDocumentalStrategy}, in which case a
 * {@link it.unimi.di.big.mg4j.index.cluster.DocumentalConcatenatedCluster} will be used instead. Note that there might
 * be other cases in which the latter is appropriate, in which case you can edit the property file manually.
 *
 * <p><strong>Important</strong>: this class just partitions the index. No auxiliary files (most notably, {@linkplain StringMap term maps}
 * or {@linkplain PrefixMap prefix maps}) will be generated. Please refer to a {@link StringMap} implementation (e.g.,
 * {@link ShiftAddXorSignedStringMap} or {@link ImmutableExternalPrefixMap}).
 *
 * <p><strong>Warning</strong>: variable quanta are not supported by this class, as it is impossible to predict accurately
 * the number of bits used for positions when partitioning documentally. If you want to use variable quanta, use a
 * simple interleaved index without skips as an intermediate step, and pass it through {@link Combine}.
 *
 * <h2>Sizes</h2>
 *
 * <p>Partitioning the file containing document sizes is a tricky issue. For the time being this class
 * implements a very simple policy: if {@link DocumentalPartitioningStrategy#numberOfDocuments(int)} returns the number of
 * documents of the global index, the size file for a local index is generated by replacing all sizes of documents not
 * belonging to the index with a zero. Otherwise, the file is generated by appending in order the sizes of the documents
 * belonging to the index. This simple strategy works well with contiguous splitting and with splittings that do not
 * change the document numbers (e.g., the inverse operation of a {@link Merge}). However, more complex splittings might give rise
 * to inconsistent size files.
 *
 * <h2>Write-once output and distributed index partitioning</h2>
 *
 * Please see {@link it.unimi.di.big.mg4j.tool.PartitionLexically}&mdash;the same comments apply.
 *
 * @author Alessandro Arrabito
 * @author Sebastiano Vigna
 *
 * @since 1.0.1
 */
public class PartitionDocumentally {
	private final static Logger LOGGER = LoggerFactory.getLogger(PartitionDocumentally.class);

	/** The default buffer size for all involved indices. */
	public final static int DEFAULT_BUFFER_SIZE = 1024 * 1024;

	/** The number of local indices. */
	private final int numIndices;
	/** The output basename. */
	private final String outputBasename;
	/** The array of local output basenames. */
	private final String[] localBasename;
	/** The input basename. */
	private final String inputBasename;
	/** The properties of the input index. */
	private final Properties inputProperties;
	/** The size of I/O buffers. */
	private final int bufferSize;
	/** The filename of the strategy used to partition the index. */
	private final String strategyFilename;
	/** The strategy used to perform the partitioning. */
	private final DocumentalPartitioningStrategy strategy;
	/** The additional local properties of each local index. */
	private final Properties[] strategyProperties;
	/** The logging interval. */
	private final long logInterval;
	/** The global index to be partitioned. */
	private final Index globalIndex;
	/** A reader on {@link #globalIndex}. */
	private final IndexReader indexReader;
	/** A reader for the terms of the global index. */
	private final FastBufferedReader terms;
	/** An index writer for each local index. */
	private final IndexWriter[] indexWriter;
	/** A copy of {@link #indexWriter} which is non-<code>null</code> if {@link #indexWriter} is an instance of {@link QuasiSuccinctIndexWriter}[]. */
	private QuasiSuccinctIndexWriter[] quasiSuccinctIndexWriter;
	/** Whether each {@link #indexWriter} has counts. */
	private final boolean haveCounts;
	/** Whether each {@link #indexWriter} has positions. */
	private final boolean havePositions;
	/** Whether each {@link #indexWriter} has payloads. */
	private final boolean havePayloads;
	/** A print writer for the terms of each local index. */
	private final PrintWriter[] localTerms;
	/** The maximum size of a document in each local index. */
	private final int[] maxDocSize;
	/** The maximum number of positions in each local index. */
	private final int[] maxDocPos;
	/** The number of terms in each local index. */
	private final long[] numTerms;
	/** The number of postings in each local index. */
	private final long[] numPostings;
	/** The number of occurrences in each local index. */
	private final long[] numOccurrences;
	/** The global count for each local index. */
	private final long[] occurrencies;
	/** The required precision for Bloom filters (0 means no filter). */
	private final int bloomFilterPrecision;

	public PartitionDocumentally(final String inputBasename, final String outputBasename, final DocumentalPartitioningStrategy strategy,
			final String strategyFilename, final int bloomFilterPrecision, final int bufferSize, final Map<Component, Coding> writerFlags,
			IndexType indexType, boolean skips, final int quantum, final int height, final int skipBufferOrCacheSize, final long logInterval)
			throws ConfigurationException, IOException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException,
			URISyntaxException, InvocationTargetException, NoSuchMethodException {
		this.inputBasename = inputBasename;
		this.outputBasename = outputBasename;
		this.strategy = strategy;
		this.strategyFilename = strategyFilename;
		this.strategyProperties = strategy.properties();
		this.bufferSize = bufferSize;
		this.logInterval = logInterval;
		this.bloomFilterPrecision = bloomFilterPrecision;

		numIndices = strategy.numberOfLocalIndices();

		final Coding positionCoding = writerFlags.get(Component.POSITIONS);
		inputProperties = new Properties(inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION);
		globalIndex = Index.getInstance(inputBasename, false, positionCoding == Coding.GOLOMB || positionCoding == Coding.INTERPOLATIVE, false);
		indexReader = globalIndex.getReader();

		localBasename = new String[numIndices];
		for (int i = 0; i < numIndices; i++) localBasename[i] = outputBasename + "-" + i;

		localTerms = new PrintWriter[numIndices];
		maxDocSize = new int[numIndices];
		maxDocPos = new int[numIndices];
		numTerms = new long[numIndices];
		occurrencies = new long[numIndices];
		numOccurrences = new long[numIndices];
		numPostings = new long[numIndices];
		indexWriter = new IndexWriter[numIndices];
		quasiSuccinctIndexWriter = new QuasiSuccinctIndexWriter[numIndices];

		if ((havePayloads = writerFlags.containsKey(Component.PAYLOADS)) && !globalIndex.hasPayloads)
			throw new IllegalArgumentException("You requested payloads, but the global index does not contain them.");
		if ((haveCounts = writerFlags.containsKey(Component.COUNTS)) && !globalIndex.hasCounts)
			throw new IllegalArgumentException("You requested counts, but the global index does not contain them.");
		if ((havePositions = writerFlags.containsKey(Component.POSITIONS)) && !globalIndex.hasPositions)
			throw new IllegalArgumentException("You requested positions, but the global index does not contain them.");
		if (indexType == IndexType.HIGH_PERFORMANCE && !havePositions)
			throw new IllegalArgumentException("You cannot disable positions for high-performance indices.");
		if (indexType != IndexType.INTERLEAVED && havePayloads)
			throw new IllegalArgumentException("Payloads are available in interleaved indices only.");

		skips |= indexType == IndexType.HIGH_PERFORMANCE;
		if (skips && (quantum <= 0 || height < 0))
			throw new IllegalArgumentException("You must specify a positive quantum and a nonnegative height (variable quanta are not available when partitioning documentally).");

		for (int i = 0; i < numIndices; i++) {
			switch (indexType) {
			case INTERLEAVED:
				if (!skips) indexWriter[i] = new BitStreamIndexWriter(IOFactory.FILESYSTEM_FACTORY, localBasename[i], strategy.numberOfDocuments(i), true, writerFlags);
				else indexWriter[i] = new SkipBitStreamIndexWriter(IOFactory.FILESYSTEM_FACTORY, localBasename[i], strategy.numberOfDocuments(i), true, skipBufferOrCacheSize, writerFlags, quantum, height);
				break;
			case HIGH_PERFORMANCE:
				indexWriter[i] = new BitStreamHPIndexWriter(localBasename[i], strategy.numberOfDocuments(i), true, skipBufferOrCacheSize, writerFlags, quantum, height);
				break;
			case QUASI_SUCCINCT:
				quasiSuccinctIndexWriter[i] = (QuasiSuccinctIndexWriter) (indexWriter[i] = new QuasiSuccinctIndexWriter(IOFactory.FILESYSTEM_FACTORY, localBasename[i],
						strategy.numberOfDocuments(i), Fast.mostSignificantBit(quantum < 0 ? QuasiSuccinctIndex.DEFAULT_QUANTUM : quantum),
						skipBufferOrCacheSize, writerFlags, ByteOrder.nativeOrder()));
			}

			localTerms[i] = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(localBasename[i] + DiskBasedIndex.TERMS_EXTENSION), "UTF-8")));
		}

		terms = new FastBufferedReader(new InputStreamReader(new FileInputStream(inputBasename + DiskBasedIndex.TERMS_EXTENSION), "UTF-8"));
	}

	private void partitionSizes() throws IOException {
		final File sizesFile = new File(inputBasename + DiskBasedIndex.SIZES_EXTENSION);
		if (sizesFile.exists()) {
			LOGGER.info("Partitioning sizes...");
			final InputBitStream sizes = new InputBitStream(sizesFile);
			final OutputBitStream[] localSizes = new OutputBitStream[numIndices];
			for (int i = 0; i < numIndices; i++) localSizes[i] = new OutputBitStream(localBasename[i] + DiskBasedIndex.SIZES_EXTENSION);

			// ALERT: for the time being, we decide whether to "fill the gaps" in sizes using as sole indicator the equality between global and local number of documents.
			int size, localIndex;
			if (globalIndex.numberOfDocuments == strategy.numberOfDocuments(0)) {
				for (int i = 0; i < globalIndex.numberOfDocuments; i++) {
					localSizes[localIndex = strategy.localIndex(i)].writeGamma(size = sizes.readGamma());
					if (maxDocSize[localIndex] < size) maxDocSize[localIndex] = size;
					for (int l = numIndices; l-- != 0;) if (l != localIndex) localSizes[l].writeGamma(0);
				}
			}
			else {
				for (int i = 0; i < globalIndex.numberOfDocuments; i++) {
					localSizes[localIndex = strategy.localIndex(i)].writeGamma(size = sizes.readGamma());
					if (maxDocSize[localIndex] < size) maxDocSize[localIndex] = size;
				}
			}

			sizes.close();
			for (int i = 0; i < numIndices; i++) localSizes[i].close();
		}
	}

	public void run() throws Exception {
		final ProgressLogger pl = new ProgressLogger(LOGGER, logInterval, TimeUnit.MILLISECONDS);
		final IntBigList sizeList = globalIndex.sizes;
		partitionSizes();

		final int[] position = new int[Math.max(0, globalIndex.maxCount)];
		final long[] localFrequency = new long[numIndices];
		final long[] sumMaxPos = new long[numIndices];
		final int[] usedIndex = new int[numIndices];
		final InputBitStream[] direct = new InputBitStream[numIndices];
		final InputBitStream[] indirect = new InputBitStream[numIndices];
		@SuppressWarnings("unchecked")
		final BloomFilter<Void>[] bloomFilter = bloomFilterPrecision != 0 ? new BloomFilter[numIndices] : null;
		final File[] tempFile = new File[numIndices];
		final CachingOutputBitStream[] temp = new CachingOutputBitStream[numIndices];
		IndexIterator indexIterator;

		for (int i = 0; i < numIndices; i++) {
			tempFile[i] = new File(localBasename[i] + ".temp");
			temp[i] = new CachingOutputBitStream(tempFile[i], bufferSize);
			direct[i] = new InputBitStream(temp[i].buffer());
			indirect[i] = new InputBitStream(tempFile[i]);
			if (bloomFilterPrecision != 0) bloomFilter[i] = BloomFilter.create(globalIndex.numberOfTerms, bloomFilterPrecision);
		}

		int usedIndices;
		MutableString currentTerm = new MutableString();
		Payload payload = null;
		long frequency, globalPointer, localPointer;
		int localIndex, count = -1;

		pl.expectedUpdates = globalIndex.numberOfPostings;
		pl.itemsName = "postings";
		pl.logInterval = logInterval;
		pl.start("Partitioning index...");

		for (long t = 0; t < globalIndex.numberOfTerms; t++) {
			terms.readLine(currentTerm);
			indexIterator = indexReader.nextIterator();
			usedIndices = 0;
			frequency = indexIterator.frequency();

			for (long j = 0; j < frequency; j++) {
				globalPointer = indexIterator.nextDocument();
				localIndex = strategy.localIndex(globalPointer);

				if (localFrequency[localIndex] == 0) {
					// First time we see a document for this index.
					currentTerm.println(localTerms[localIndex]);
					numTerms[localIndex]++;
					usedIndex[usedIndices++] = localIndex;
					if (bloomFilterPrecision != 0) bloomFilter[localIndex].add(currentTerm);
				}

				/* Store temporarily posting data; note that we save the global pointer as we
				 * will have to access the size list. */
				localFrequency[localIndex]++;
				numPostings[localIndex]++;
				temp[localIndex].writeLongGamma(globalPointer);

				if (globalIndex.hasPayloads) payload = indexIterator.payload();
				if (havePayloads) payload.write(temp[localIndex]);

				if (haveCounts) {
					count = indexIterator.count();
					temp[localIndex].writeGamma(count);
					occurrencies[localIndex] += count;
					if (maxDocPos[localIndex] < count) maxDocPos[localIndex] = count;

					if (havePositions) {
						int pos = indexIterator.nextPosition(), prevPos = pos;
						temp[localIndex].writeDelta(pos);
						for (int p = 1; p < count; p++) {
							temp[localIndex].writeDelta((pos = indexIterator.nextPosition()) - prevPos - 1);
							prevPos = pos;
						}
						sumMaxPos[localIndex] += pos;
					}
				}
			}

			// We now run through the indices used by this term and copy from the temporary buffer.
			OutputBitStream obs;
			for (int k = 0; k < usedIndices; k++) {
				final int i = usedIndex[k];
				if (haveCounts) numOccurrences[i] += occurrencies[i];
				InputBitStream ibs;

				if (quasiSuccinctIndexWriter[i] != null) quasiSuccinctIndexWriter[i].newInvertedList(localFrequency[i], occurrencies[i], sumMaxPos[i]);
				else indexWriter[i].newInvertedList();
				occurrencies[i] = 0;

				temp[i].align();
				if (temp[i].buffer() != null) ibs = direct[i];
				else {
					// We cannot read directly from the internal buffer.
					ibs = indirect[i];
					ibs.flush();
					temp[i].flush();
				}
				ibs.position(0);

				indexWriter[i].writeFrequency(localFrequency[i]);
				for (long j = 0; j < localFrequency[i]; j++) {
					obs = indexWriter[i].newDocumentRecord();
					globalPointer = ibs.readLongGamma();
					localPointer = strategy.localPointer(globalPointer);
					indexWriter[i].writeDocumentPointer(obs, localPointer);
					if (havePayloads) {
						payload.read(ibs);
						indexWriter[i].writePayload(obs, payload);
					}
					if (haveCounts) indexWriter[i].writePositionCount(obs, count = ibs.readGamma());
					if (havePositions) {
						ibs.readDeltas(position, count);
						for (int p = 1; p < count; p++) position[p] += position[p - 1] + 1;
						indexWriter[i].writeDocumentPositions(obs, position, 0, count, sizeList != null ? sizeList.getInt(globalPointer) : -1);
					}
				}

				temp[i].position(0);
				temp[i].writtenBits(0);
				localFrequency[i] = 0;
				sumMaxPos[i] = 0;
			}

			usedIndices = 0;
			pl.count += frequency - 1;
			pl.update();
		}

		pl.done();

		Properties globalProperties = new Properties();
		globalProperties.setProperty(Index.PropertyKeys.FIELD, inputProperties.getProperty(Index.PropertyKeys.FIELD));
		globalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR, inputProperties.getProperty(Index.PropertyKeys.TERMPROCESSOR));

		for (int i = 0; i < numIndices; i++) {
			localTerms[i].close();
			indexWriter[i].close();
			if (bloomFilterPrecision != 0) BinIO.storeObject(bloomFilter[i], localBasename[i] + DocumentalCluster.BLOOM_EXTENSION);
			temp[i].close();
			tempFile[i].delete();

			Properties localProperties = indexWriter[i].properties();
			localProperties.addAll(globalProperties);
			localProperties.setProperty(Index.PropertyKeys.MAXCOUNT, String.valueOf(maxDocPos[i]));
			localProperties.setProperty(Index.PropertyKeys.MAXDOCSIZE, maxDocSize[i]);
			localProperties.setProperty(Index.PropertyKeys.FIELD, globalProperties.getProperty(Index.PropertyKeys.FIELD));
			localProperties.setProperty(Index.PropertyKeys.OCCURRENCES, haveCounts ? numOccurrences[i] : -1);
			localProperties.setProperty(Index.PropertyKeys.POSTINGS, numPostings[i]);
			localProperties.setProperty(Index.PropertyKeys.TERMS, numTerms[i]);
			if (havePayloads) localProperties.setProperty(Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName());
			if (strategyProperties[i] != null) localProperties.addAll(strategyProperties[i]);
			localProperties.save(localBasename[i] + DiskBasedIndex.PROPERTIES_EXTENSION);
		}

		if (strategyFilename != null) globalProperties.setProperty(IndexCluster.PropertyKeys.STRATEGY, strategyFilename);
		for (int i = 0; i < numIndices; i++) globalProperties.addProperty(IndexCluster.PropertyKeys.LOCALINDEX, localBasename[i]);
		globalProperties.setProperty(DocumentalCluster.PropertyKeys.BLOOM, bloomFilterPrecision != 0);
		// If we partition an index with a single term, by definition we have a flat cluster.
		globalProperties.setProperty(DocumentalCluster.PropertyKeys.FLAT, inputProperties.getLong(Index.PropertyKeys.TERMS) <= 1);
		globalProperties.setProperty(Index.PropertyKeys.MAXCOUNT, inputProperties.getProperty(Index.PropertyKeys.MAXCOUNT));
		globalProperties.setProperty(Index.PropertyKeys.MAXDOCSIZE, inputProperties.getProperty(Index.PropertyKeys.MAXDOCSIZE));
		globalProperties.setProperty(Index.PropertyKeys.POSTINGS, inputProperties.getProperty(Index.PropertyKeys.POSTINGS));
		globalProperties.setProperty(Index.PropertyKeys.OCCURRENCES, inputProperties.getProperty(Index.PropertyKeys.OCCURRENCES));
		globalProperties.setProperty(Index.PropertyKeys.DOCUMENTS, inputProperties.getProperty(Index.PropertyKeys.DOCUMENTS));
		globalProperties.setProperty(Index.PropertyKeys.TERMS, inputProperties.getProperty(Index.PropertyKeys.TERMS));
		if (havePayloads) globalProperties.setProperty(Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName());

		/* For the general case, we must rely on a merged cluster. However, if we detect a contiguous
		 * strategy we can optimise a bit. */
		globalProperties.setProperty(Index.PropertyKeys.INDEXCLASS,
				strategy instanceof ContiguousDocumentalStrategy ? DocumentalConcatenatedCluster.class.getName() : DocumentalMergedCluster.class.getName());
		globalProperties.save(outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION);
		LOGGER.debug("Properties for clustered index " + outputBasename + ": " + new ConfigurationMap(globalProperties));
	}

	public static void main(final String[] arg) throws ConfigurationException, IOException, URISyntaxException, ClassNotFoundException, Exception {
		SimpleJSAP jsap = new SimpleJSAP(PartitionDocumentally.class.getName(), "Partitions an index documentally.",
				new Parameter[] {
					new FlaggedOption("bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize(DEFAULT_BUFFER_SIZE), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer."),
					new FlaggedOption("logInterval", JSAP.LONG_PARSER, Long.toString(ProgressLogger.DEFAULT_LOG_INTERVAL), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds."),
					new FlaggedOption("strategy", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "strategy", "A serialised documental partitioning strategy."),
					new FlaggedOption("uniformStrategy", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "uniform", "Requires a uniform partitioning in the given number of parts."),
					new FlaggedOption("bloom", JSAP.INTEGER_PARSER, "0", JSAP.NOT_REQUIRED, 'B', "bloom", "Generates Bloom filters with given precision."),
					new FlaggedOption("comp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag for the index (may be specified several times).").setAllowMultipleDeclarations(true),
					new Switch("noSkips", JSAP.NO_SHORTFLAG, "no-skips", "Disables skips."),
					new Switch("interleaved", JSAP.NO_SHORTFLAG, "interleaved", "Forces an interleaved index."),
					new Switch("highPerformance", 'h', "high-performance", "Forces a high-performance index."),
					new FlaggedOption("cacheSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize(QuasiSuccinctIndexWriter.DEFAULT_CACHE_SIZE), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "cache-size", "The size of the bit cache used while creating a quasi-succinct index."),
					new FlaggedOption("quantum", JSAP.INTSIZE_PARSER, "32", JSAP.NOT_REQUIRED, 'Q', "quantum", "The skip quantum."),
					new FlaggedOption("height", JSAP.INTSIZE_PARSER, Integer.toString(BitStreamIndex.DEFAULT_HEIGHT), JSAP.NOT_REQUIRED, 'H', "height", "The skip height."),
					new FlaggedOption("skipBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize(SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "skip-buffer-size", "The size of the internal temporary buffer used while creating an index with skips."),
					new UnflaggedOption("inputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the global index."),
					new UnflaggedOption("outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the local indices.")
				});

		JSAPResult jsapResult = jsap.parse(arg);
		if (jsap.messagePrinted()) return;

		String inputBasename = jsapResult.getString("inputBasename");
		String outputBasename = jsapResult.getString("outputBasename");
		String strategyFilename = jsapResult.getString("strategy");
		DocumentalPartitioningStrategy strategy = null;

		if (jsapResult.userSpecified("uniformStrategy")) {
			strategy = DocumentalStrategies.uniform(jsapResult.getInt("uniformStrategy"), Index.getInstance(inputBasename).numberOfDocuments);
			BinIO.storeObject(strategy, strategyFilename = outputBasename + IndexCluster.STRATEGY_DEFAULT_EXTENSION);
		}
		else if (strategyFilename != null) strategy = (DocumentalPartitioningStrategy) BinIO.loadObject(strategyFilename);
		else throw new IllegalArgumentException("You must specify a partitioning strategy");

		final boolean skips = !jsapResult.getBoolean("noSkips");
		final boolean interleaved = jsapResult.getBoolean("interleaved");
		final boolean highPerformance = jsapResult.getBoolean("highPerformance");
		if (!skips && !interleaved) throw new IllegalArgumentException("You can disable skips only for interleaved indices");
		if (interleaved && highPerformance) throw new IllegalArgumentException("You cannot specify both --interleaved and --high-performance.");
		if (!skips && (jsapResult.userSpecified("quantum") || jsapResult.userSpecified("height"))) throw new IllegalArgumentException("You specified quantum or height, but you also disabled skips.");

		final IndexType indexType = interleaved ? IndexType.INTERLEAVED : highPerformance ? IndexType.HIGH_PERFORMANCE : IndexType.QUASI_SUCCINCT;
		final Map<Component, Coding> compressionFlags = indexType == IndexType.QUASI_SUCCINCT
				? CompressionFlags.valueOf(jsapResult.getStringArray("comp"), CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX)
				: CompressionFlags.valueOf(jsapResult.getStringArray("comp"), CompressionFlags.DEFAULT_STANDARD_INDEX);

		new PartitionDocumentally(inputBasename, outputBasename, strategy, strategyFilename, jsapResult.getInt("bloom"), jsapResult.getInt("bufferSize"),
				compressionFlags, indexType, skips, jsapResult.getInt("quantum"), jsapResult.getInt("height"),
				indexType == IndexType.QUASI_SUCCINCT ? jsapResult.getInt("cacheSize") : jsapResult.getInt("skipBufferSize"),
				jsapResult.getLong("logInterval")).run();
	}
}
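As a usage illustration, the sketch below drives the class programmatically in the same way main() handles the --uniform option: it splits an existing global index into three local indices with a uniform documental strategy and writes quasi-succinct output. The basenames (index/text, index/text-part), the number of parts, and the literal quantum, height and cache-size values are illustrative assumptions, not values mandated by the class, and the global index is assumed to have been built with counts and positions.

import it.unimi.di.big.mg4j.index.BitStreamIndex;
import it.unimi.di.big.mg4j.index.CompressionFlags;
import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.index.cluster.DocumentalPartitioningStrategy;
import it.unimi.di.big.mg4j.index.cluster.DocumentalStrategies;
import it.unimi.di.big.mg4j.index.cluster.IndexCluster;
import it.unimi.di.big.mg4j.tool.Combine.IndexType;
import it.unimi.di.big.mg4j.tool.PartitionDocumentally;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.logging.ProgressLogger;

public class PartitionExample {
	public static void main(final String[] args) throws Exception {
		final String inputBasename = "index/text";        // assumed existing global index basename
		final String outputBasename = "index/text-part";  // assumed basename for the local indices

		// Split the collection uniformly into three local indices and serialise the
		// strategy, mirroring what PartitionDocumentally.main() does for --uniform 3.
		final DocumentalPartitioningStrategy strategy =
				DocumentalStrategies.uniform(3, Index.getInstance(inputBasename).numberOfDocuments);
		final String strategyFilename = outputBasename + IndexCluster.STRATEGY_DEFAULT_EXTENSION;
		BinIO.storeObject(strategy, strategyFilename);

		// Quasi-succinct local indices with default compression flags, no skips and no Bloom filters.
		new PartitionDocumentally(inputBasename, outputBasename, strategy, strategyFilename,
				0,                                        // Bloom filter precision (0 = no filters)
				PartitionDocumentally.DEFAULT_BUFFER_SIZE,
				CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX,
				IndexType.QUASI_SUCCINCT,
				false,                                    // skips (not used for quasi-succinct indices)
				32, BitStreamIndex.DEFAULT_HEIGHT,        // quantum and height (illustrative)
				16 * 1024 * 1024,                         // cache size for the quasi-succinct writer (illustrative)
				ProgressLogger.DEFAULT_LOG_INTERVAL).run();
	}
}

The command-line equivalent would be along the lines of "java it.unimi.di.big.mg4j.tool.PartitionDocumentally -u 3 index/text index/text-part". Once the local term maps discussed in the class comment have been built, the resulting cluster should be addressable as a single index, e.g. via Index.getInstance("index/text-part").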