it.unimi.di.big.mg4j.tool.PartitionLexically.java Source code

Introduction

Here is the source code for it.unimi.di.big.mg4j.tool.PartitionLexically.java
Source

package it.unimi.di.big.mg4j.tool;

/*       
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2006-2015 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.di.big.mg4j.index.BitStreamHPIndex;
import it.unimi.di.big.mg4j.index.BitStreamIndex;
import it.unimi.di.big.mg4j.index.DiskBasedIndex;
import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndexWriter.LongWordOutputBitStream;
import it.unimi.di.big.mg4j.index.cluster.ContiguousLexicalStrategy;
import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
import it.unimi.di.big.mg4j.index.cluster.IndexCluster;
import it.unimi.di.big.mg4j.index.cluster.LexicalCluster;
import it.unimi.di.big.mg4j.index.cluster.LexicalPartitioningStrategy;
import it.unimi.di.big.mg4j.index.cluster.LexicalStrategies;
import it.unimi.di.big.mg4j.search.score.BM25Scorer;
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.BloomFilter;
import it.unimi.dsi.util.ImmutableExternalPrefixMap;
import it.unimi.dsi.util.PrefixMap;
import it.unimi.dsi.util.Properties;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.util.StringMap;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.util.concurrent.TimeUnit;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.ConfigurationMap;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

/** Partitions an index lexically.
 * 
 * <p>A global index is partitioned lexically by providing a {@link LexicalPartitioningStrategy}
 * that specifies a destination local index for each term, and a local term number. The global index
 * is read directly at the bit level, and the posting lists are divided among the 
 * local indices using the provided strategy. For instance,
 * an {@link ContiguousLexicalStrategy} divides an index into 
 * contiguous blocks (of terms) specified by the given strategy.
 * 
 * <p>By choice, document pointers are not remapped. Thus, it may happen that one of the local indices 
 * contains <em>no</em> posting with a certain document. However, computing the subset of documents contained
 * in each local index to remap them in a contiguous interval is not a good idea, as usually the subset
 * of documents appearing in the postings of each local index is large.
 *
 * <p>To speed up the search of the right local index of a not-so-frequent term (in
 * particular with a {@linkplain it.unimi.di.big.mg4j.index.cluster.ChainedLexicalClusteringStrategy chained strategy}), 
 * after partitioning an index you can create {@linkplain BloomFilter Bloom filters} that will be used to try to avoid
 * inquiring indices that do not contain a term. The filters will be automatically loaded
 * by {@link it.unimi.di.big.mg4j.index.cluster.IndexCluster#getInstance(CharSequence, boolean, boolean)}.
 * 
 * <p>Note that the size file is the same for each local index and <em>is not copied</em>. Please use
 * standard operating system features such as symbolic links to provide size files to 
 * local indices. 
 * 
 * <p>If you plan to {@linkplain LexicalCluster cluster} the partitioned indices and you need document sizes 
 * (e.g., for {@linkplain BM25Scorer BM25 scoring}), you can use the index property 
 * {@link it.unimi.di.big.mg4j.index.Index.UriKeys#SIZES} to load the original size file.  
 * 
 * If you plan on partitioning an index requiring
 * document sizes, you should consider a custom index loading scheme 
 * that shares the {@linkplain it.unimi.di.big.mg4j.index.BitStreamIndex#sizes size list}
 * among all local indices.
 *
 * <strong>Important</strong>: this class just partitions the index. No auxiliary files (most notably, {@linkplain StringMap term maps} 
 * or {@linkplain PrefixMap prefix maps}) will be generated. Please refer to a {@link StringMap} implementation (e.g.,
 * {@link ShiftAddXorSignedStringMap} or {@link ImmutableExternalPrefixMap}).
 *
 * <h2>Write-once output and distributed index partitioning</h2>
 * 
 * <p>The partitioning process writes each index file sequentially exactly once, so index partitioning
 * can output its results to <em>pipes</em>, which in
 * turn can spill their content, for instance, through the network. In other words, albeit this
 * class theoretically creates a number of local indices on disk, those indices can be
 * substituted with suitable pipes creating remote local indices without affecting the partitioning process.
 * For instance, the following <samp>bash</samp> code creates three sets of pipes:
 * <pre style="margin: 1em 0">
 * for i in 0 1 2; do
 *   for e in frequencies occurrencies index offsets properties sizes terms; do 
 *     mkfifo pipe-$i.$e
 *   done
 * done
 * </pre> 
 * 
 * <p>Each pipe must be emptied elsewhere, for instance (assuming
 * you want local indices <samp>index0</samp>, <samp>index1</samp> and <samp>index2</samp> on <samp>example.com</samp>):
 * <pre style="margin: 1em 0">
 * for i in 0 1 2; do 
 *   for e in frequencies occurrencies index offsets properties sizes terms; do 
 *     (cat pipe-$i.$e | ssh -x example.com "cat >index-$i.$e" &)
 *   done
 * done
 * </pre> 
 * <p>If we now start a partitioning process generating three local indices named <samp>pipe-0</samp>,
 * <samp>pipe-1</samp> and <samp>pipe-2</samp>
 * all pipes will be written to by the process, and the data will create remotely
 * indices <samp>index-0</samp>, <samp>index-1</samp> and <samp>index-2</samp>.
 *
 * @author Sebastiano Vigna
 * 
 * @since 1.0.1
 */

public class PartitionLexically {
    private static final Logger LOGGER = LoggerFactory.getLogger(PartitionLexically.class);

    /**  The default buffer size for all involved indices. */
    public final static int DEFAULT_BUFFER_SIZE = 1024 * 1024;

    protected final static class LongWordInputBitStream {
        /** The underlying channel. */
        private final FileChannel fileChannel;
        /** The buffer. */
        private final ByteBuffer byteBuffer;
        /** The 64-bit buffer, whose lower {@link #filled} bits contain data. */
        private long buffer;
        /** The number of lower used bits {@link #buffer} (less than {@link Long#SIZE}). */
        private int filled;

        @SuppressWarnings("resource")
        public LongWordInputBitStream(final String filename, final int bufferSize, final ByteOrder byteOrder)
                throws FileNotFoundException {
            fileChannel = new FileInputStream(filename).getChannel();
            byteBuffer = ByteBuffer.allocateDirect(bufferSize).order(byteOrder);
            byteBuffer.flip();
        }

        public long extract(final int width) throws IOException {
            if (width <= filled) {
                long result = buffer & (1L << width) - 1;
                filled -= width;
                buffer >>>= width;
                return result;
            } else {
                if (!byteBuffer.hasRemaining()) {
                    byteBuffer.clear();
                    fileChannel.read(byteBuffer);
                    byteBuffer.flip();
                }

                if (filled == 0 && width == Long.SIZE)
                    return byteBuffer.getLong();

                long result = buffer;
                final int remainder = width - filled;
                buffer = byteBuffer.getLong();
                result |= (buffer & (1L << remainder) - 1) << filled;
                buffer >>>= remainder;
                filled = Long.SIZE - remainder;
                return result;
            }
        }

        public void close() throws IOException {
            fileChannel.close();
        }

    }

    /** The number of local indices. */
    private final int numIndices;
    /** The output basenames. */
    private final String outputBasename;
    /** The array of local output basenames. */
    private final String[] localBasename;
    /** The input basename. */
    private final String inputBasename;
    /** The size of I/O buffers. */
    private final int bufferSize;
    /** The filename of the strategy used to partition the index. */
    private final String strategyFilename;
    /** The strategy used to partition the index. */
    private final LexicalPartitioningStrategy strategy;
    /** The additional local properties of each local index. */
    private final Properties[] strategyProperties;
    /** The logging interval. */
    private final long logInterval;

    public PartitionLexically(final String inputBasename, final String outputBasename,
            final LexicalPartitioningStrategy strategy, final String strategyFilename, final int bufferSize,
            final long logInterval) {

        this.inputBasename = inputBasename;
        this.outputBasename = outputBasename;
        this.strategy = strategy;
        this.strategyFilename = strategyFilename;
        this.bufferSize = bufferSize & ~(Long.SIZE - 1);
        this.logInterval = logInterval;
        numIndices = strategy.numberOfLocalIndices();
        strategyProperties = strategy.properties();
        localBasename = new String[numIndices];
        for (int i = 0; i < numIndices; i++)
            localBasename[i] = outputBasename + "-" + i;
    }

    public void runTermsOnly() throws IOException {
        final ProgressLogger pl = new ProgressLogger(LOGGER, logInterval, TimeUnit.MILLISECONDS);

        final PrintWriter[] localTerms = new PrintWriter[numIndices];
        final long numTerms[] = new long[numIndices];
        @SuppressWarnings("resource")
        final FastBufferedReader terms = new FastBufferedReader(new InputStreamReader(
                new FileInputStream(inputBasename + DiskBasedIndex.TERMS_EXTENSION), "UTF-8"));

        for (int i = 0; i < numIndices; i++)
            localTerms[i] = new PrintWriter(
                    new OutputStreamWriter(
                            new FastBufferedOutputStream(
                                    new FileOutputStream(localBasename[i] + DiskBasedIndex.TERMS_EXTENSION)),
                            "UTF-8"));

        // The current term
        final MutableString currTerm = new MutableString();

        pl.itemsName = "terms";
        pl.logInterval = logInterval;
        pl.start("Partitioning index terms...");

        long termNumber = 0;
        int k;

        while (terms.readLine(currTerm) != null) {
            k = strategy.localIndex(termNumber); // The local index for this term
            if (numTerms[k] != strategy.localNumber(termNumber))
                throw new IllegalStateException();
            numTerms[k]++;
            currTerm.println(localTerms[k]);
            pl.update();
            termNumber++;
        }

        terms.close();
        for (int i = 0; i < numIndices; i++)
            localTerms[i].close();

        pl.done();
    }

    @SuppressWarnings("resource")
    public void run() throws ConfigurationException, IOException, ClassNotFoundException {
        final ProgressLogger pl = new ProgressLogger(LOGGER, logInterval, TimeUnit.MILLISECONDS);
        final byte[] buffer = new byte[bufferSize];

        final Properties properties = new Properties(inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION);
        final long numberOfTerms = properties.getLong(Index.PropertyKeys.TERMS);
        final Class<?> indexClass = Class.forName(properties.getString(Index.PropertyKeys.INDEXCLASS));
        final boolean isHighPerformance = BitStreamHPIndex.class.isAssignableFrom(indexClass);
        final boolean isQuasiSuccinct = QuasiSuccinctIndex.class.isAssignableFrom(indexClass);
        final ByteOrder byteOrder = isQuasiSuccinct
                ? DiskBasedIndex.byteOrder(properties.getString(QuasiSuccinctIndex.PropertyKeys.BYTEORDER))
                : null;

        final OutputBitStream[] localIndex = new OutputBitStream[numIndices];
        final OutputBitStream[] localPositions = new OutputBitStream[numIndices];
        final OutputBitStream[] localOffsets = new OutputBitStream[numIndices];

        final LongWordOutputBitStream[] localQSPointers = isQuasiSuccinct ? new LongWordOutputBitStream[numIndices]
                : null;
        final LongWordOutputBitStream[] localQSCounts = isQuasiSuccinct ? new LongWordOutputBitStream[numIndices]
                : null;
        final LongWordOutputBitStream[] localQSPositions = isQuasiSuccinct ? new LongWordOutputBitStream[numIndices]
                : null;
        final OutputBitStream[] localCountsOffsets = isQuasiSuccinct ? new OutputBitStream[numIndices] : null;
        final OutputBitStream[] localPositionsOffsets = isQuasiSuccinct ? new OutputBitStream[numIndices] : null;

        final OutputBitStream[] localPosNumBits = new OutputBitStream[numIndices];
        final OutputBitStream[] localSumsMaxPos = new OutputBitStream[numIndices];
        final OutputBitStream[] localFrequencies = new OutputBitStream[numIndices];
        final OutputBitStream[] localOccurrencies = new OutputBitStream[numIndices];
        final PrintWriter[] localTerms = new PrintWriter[numIndices];
        final long numTerms[] = new long[numIndices];
        final long localOccurrences[] = new long[numIndices];
        final long numberOfPostings[] = new long[numIndices];

        final InputBitStream globalIndex = isQuasiSuccinct ? null
                : new InputBitStream(inputBasename + DiskBasedIndex.INDEX_EXTENSION, bufferSize);
        final long globalPositionsLength = new File(inputBasename + DiskBasedIndex.POSITIONS_EXTENSION).length();
        final InputBitStream globalPositions = isHighPerformance
                ? new InputBitStream(inputBasename + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize)
                : null;
        final FastBufferedReader terms = new FastBufferedReader(new InputStreamReader(
                new FileInputStream(inputBasename + DiskBasedIndex.TERMS_EXTENSION), "UTF-8"));
        final InputBitStream globalOffsets = new InputBitStream(inputBasename
                + (isQuasiSuccinct ? DiskBasedIndex.POINTERS_EXTENSIONS + DiskBasedIndex.OFFSETS_POSTFIX
                        : DiskBasedIndex.OFFSETS_EXTENSION));
        final InputBitStream globalPositionsOffsets = isQuasiSuccinct ? new InputBitStream(
                inputBasename + DiskBasedIndex.POSITIONS_EXTENSION + DiskBasedIndex.OFFSETS_POSTFIX, bufferSize)
                : null;
        final InputBitStream globalCountsOffsets = isQuasiSuccinct ? new InputBitStream(
                inputBasename + DiskBasedIndex.COUNTS_EXTENSION + DiskBasedIndex.OFFSETS_POSTFIX, bufferSize)
                : null;

        final LongWordInputBitStream globalPointersIbs = isQuasiSuccinct
                ? new LongWordInputBitStream(inputBasename + DiskBasedIndex.POINTERS_EXTENSIONS, bufferSize,
                        byteOrder)
                : null;
        final LongWordInputBitStream globalCountsIbs = isQuasiSuccinct
                ? new LongWordInputBitStream(inputBasename + DiskBasedIndex.COUNTS_EXTENSION, bufferSize, byteOrder)
                : null;
        final LongWordInputBitStream globalPositionsIbs = isQuasiSuccinct
                ? new LongWordInputBitStream(inputBasename + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize,
                        byteOrder)
                : null;

        final File posNumBitsFile = new File(inputBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION);
        final InputBitStream posNumBits = posNumBitsFile.exists()
                ? new InputBitStream(inputBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION)
                : null;
        final File sumsMaxPosFile = new File(inputBasename + DiskBasedIndex.SUMS_MAX_POSITION_EXTENSION);
        final InputBitStream sumsMaxPos = sumsMaxPosFile.exists() ? new InputBitStream(sumsMaxPosFile) : null;
        final InputBitStream frequencies = new InputBitStream(inputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION);
        final InputBitStream occurrencies = new InputBitStream(
                inputBasename + DiskBasedIndex.OCCURRENCIES_EXTENSION);
        globalOffsets.readGamma();
        if (globalCountsOffsets != null)
            globalCountsOffsets.readGamma();
        if (globalPositionsOffsets != null)
            globalPositionsOffsets.readGamma();

        for (int i = 0; i < numIndices; i++) {
            if (!isQuasiSuccinct)
                localIndex[i] = new OutputBitStream(localBasename[i] + DiskBasedIndex.INDEX_EXTENSION, bufferSize);
            if (isHighPerformance)
                localPositions[i] = new OutputBitStream(localBasename[i] + DiskBasedIndex.POSITIONS_EXTENSION,
                        bufferSize);
            if (isQuasiSuccinct) {
                localQSPointers[i] = new LongWordOutputBitStream(
                        new FileOutputStream(localBasename[i] + DiskBasedIndex.POINTERS_EXTENSIONS).getChannel(),
                        byteOrder);
                localQSCounts[i] = new LongWordOutputBitStream(
                        new FileOutputStream(localBasename[i] + DiskBasedIndex.COUNTS_EXTENSION).getChannel(),
                        byteOrder);
                localQSPositions[i] = new LongWordOutputBitStream(
                        new FileOutputStream(localBasename[i] + DiskBasedIndex.POSITIONS_EXTENSION).getChannel(),
                        byteOrder);
            }
            localFrequencies[i] = new OutputBitStream(localBasename[i] + DiskBasedIndex.FREQUENCIES_EXTENSION);
            localOccurrencies[i] = new OutputBitStream(localBasename[i] + DiskBasedIndex.OCCURRENCIES_EXTENSION);
            localTerms[i] = new PrintWriter(
                    new OutputStreamWriter(
                            new FastBufferedOutputStream(
                                    new FileOutputStream(localBasename[i] + DiskBasedIndex.TERMS_EXTENSION)),
                            "UTF-8"));
            localOffsets[i] = new OutputBitStream(localBasename[i]
                    + (isQuasiSuccinct ? DiskBasedIndex.POINTERS_EXTENSIONS + DiskBasedIndex.OFFSETS_POSTFIX
                            : DiskBasedIndex.OFFSETS_EXTENSION));
            if (posNumBits != null)
                localPosNumBits[i] = new OutputBitStream(
                        localBasename[i] + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION);
            if (sumsMaxPos != null)
                localSumsMaxPos[i] = new OutputBitStream(
                        localBasename[i] + DiskBasedIndex.SUMS_MAX_POSITION_EXTENSION);
            localOffsets[i].writeGamma(0);
            if (isQuasiSuccinct) {
                localCountsOffsets[i] = new OutputBitStream(
                        localBasename[i] + DiskBasedIndex.COUNTS_EXTENSION + DiskBasedIndex.OFFSETS_POSTFIX);
                localPositionsOffsets[i] = new OutputBitStream(
                        localBasename[i] + DiskBasedIndex.POSITIONS_EXTENSION + DiskBasedIndex.OFFSETS_POSTFIX);
                localCountsOffsets[i].writeGamma(0);
                localPositionsOffsets[i].writeGamma(0);
            }
        }

        // The current term
        final MutableString currTerm = new MutableString();

        pl.expectedUpdates = numberOfTerms;
        pl.itemsName = "bits";
        pl.logInterval = logInterval;
        pl.start("Partitioning index...");

        long termNumber = 0;
        int k, prevK = -1;
        long previousHeaderLength = 0, newHeaderLength = 0;
        long length, positionsOffset = 0;

        while (terms.readLine(currTerm) != null) {
            k = strategy.localIndex(termNumber); // The local index for this term
            if (numTerms[k] != strategy.localNumber(termNumber))
                throw new IllegalStateException();
            numTerms[k]++;

            if (isHighPerformance) {
                final long temp = globalIndex.readBits();
                positionsOffset = globalIndex.readLongDelta();
                previousHeaderLength = (int) (globalIndex.readBits() - temp);
                if (prevK != -1) {
                    length = positionsOffset - globalPositions.readBits();
                    copy(buffer, globalPositions, localPositions[prevK], length);
                }
                newHeaderLength = localIndex[k].writeLongDelta(localPositions[k].writtenBits());
            }

            final long frequency = frequencies.readLongGamma();
            localFrequencies[k].writeLongGamma(frequency);
            numberOfPostings[k] += frequency;

            if (posNumBits != null)
                localPosNumBits[k].writeLongGamma(posNumBits.readLongGamma());
            if (sumsMaxPos != null)
                localSumsMaxPos[k].writeLongDelta(sumsMaxPos.readLongDelta());

            final long occurrency = occurrencies.readLongGamma();
            localOccurrences[k] += occurrency;
            localOccurrencies[k].writeLongGamma(occurrency);

            currTerm.println(localTerms[k]);

            if (isQuasiSuccinct) {
                localOffsets[k].writeLongGamma(length = globalOffsets.readLongGamma());
                copy(globalPointersIbs, localQSPointers[k], length);
                localCountsOffsets[k].writeLongGamma(length = globalCountsOffsets.readLongGamma());
                copy(globalCountsIbs, localQSCounts[k], length);
                localPositionsOffsets[k].writeLongGamma(length = globalPositionsOffsets.readLongGamma());
                copy(globalPositionsIbs, localQSPositions[k], length);
            } else {
                length = globalOffsets.readLongGamma() - previousHeaderLength;
                localOffsets[k].writeLongGamma(length + newHeaderLength);

                copy(buffer, globalIndex, localIndex[k], length);
            }

            pl.update();
            prevK = k;
            termNumber++;
        }

        // We pour the last piece of positions
        if (isHighPerformance) {
            if (prevK != -1) {
                length = globalPositionsLength * 8 - globalPositions.readBits();
                copy(buffer, globalPositions, localPositions[prevK], length);
            }
        }

        pl.done();

        terms.close();
        globalOffsets.close();
        if (globalIndex != null)
            globalIndex.close();
        if (globalPointersIbs != null)
            globalPointersIbs.close();
        if (globalCountsIbs != null)
            globalCountsIbs.close();
        if (globalCountsOffsets != null)
            globalCountsOffsets.close();
        if (globalPositionsIbs != null)
            globalPositionsIbs.close();
        if (globalPositionsOffsets != null)
            globalPositionsOffsets.close();

        if (globalPositions != null)
            globalPositions.close();
        frequencies.close();
        occurrencies.close();
        if (posNumBits != null)
            posNumBits.close();
        if (sumsMaxPos != null)
            sumsMaxPos.close();
        if (isHighPerformance)
            globalPositions.close();

        // We copy the relevant properties from the original 
        Properties globalProperties = new Properties();
        if (strategyFilename != null)
            globalProperties.setProperty(IndexCluster.PropertyKeys.STRATEGY, strategyFilename);
        globalProperties.setProperty(DocumentalCluster.PropertyKeys.BLOOM, false);
        globalProperties.setProperty(Index.PropertyKeys.INDEXCLASS, LexicalCluster.class.getName());
        for (int i = 0; i < numIndices; i++)
            globalProperties.addProperty(IndexCluster.PropertyKeys.LOCALINDEX, localBasename[i]);
        globalProperties.setProperty(Index.PropertyKeys.FIELD, properties.getProperty(Index.PropertyKeys.FIELD));
        globalProperties.setProperty(Index.PropertyKeys.POSTINGS,
                properties.getProperty(Index.PropertyKeys.POSTINGS));
        globalProperties.setProperty(Index.PropertyKeys.OCCURRENCES,
                properties.getProperty(Index.PropertyKeys.OCCURRENCES));
        globalProperties.setProperty(Index.PropertyKeys.DOCUMENTS,
                properties.getProperty(Index.PropertyKeys.DOCUMENTS));
        globalProperties.setProperty(Index.PropertyKeys.TERMS, properties.getProperty(Index.PropertyKeys.TERMS));
        globalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
                properties.getProperty(Index.PropertyKeys.TERMPROCESSOR));
        globalProperties.setProperty(Index.PropertyKeys.MAXCOUNT,
                properties.getProperty(Index.PropertyKeys.MAXCOUNT));
        globalProperties.setProperty(Index.PropertyKeys.MAXDOCSIZE,
                properties.getProperty(Index.PropertyKeys.MAXDOCSIZE));
        globalProperties.save(outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION);
        LOGGER.debug(
                "Properties for clustered index " + outputBasename + ": " + new ConfigurationMap(globalProperties));

        for (int i = 0; i < numIndices; i++) {
            if (isQuasiSuccinct) {
                localQSPointers[i].close();
                if (localQSCounts != null) {
                    localCountsOffsets[i].close();
                    localQSCounts[i].close();
                }
                if (localQSPositions != null) {
                    localPositionsOffsets[i].close();
                    localQSPositions[i].close();
                }
            } else {
                localIndex[i].close();
                if (isHighPerformance)
                    localPositions[i].close();
            }

            localOffsets[i].close();

            if (posNumBits != null)
                localPosNumBits[i].close();
            if (sumsMaxPos != null)
                localSumsMaxPos[i].close();

            localFrequencies[i].close();
            localOccurrencies[i].close();
            localTerms[i].close();
            final InputStream input = new FileInputStream(inputBasename + DiskBasedIndex.SIZES_EXTENSION);
            final OutputStream output = new FileOutputStream(localBasename[i] + DiskBasedIndex.SIZES_EXTENSION);
            IOUtils.copy(input, output);
            input.close();
            output.close();
            Properties localProperties = new Properties();
            localProperties.addAll(globalProperties);
            localProperties.setProperty(Index.PropertyKeys.TERMS, numTerms[i]);
            localProperties.setProperty(Index.PropertyKeys.OCCURRENCES, localOccurrences[i]);
            localProperties.setProperty(Index.PropertyKeys.POSTINGS, numberOfPostings[i]);
            localProperties.setProperty(Index.PropertyKeys.POSTINGS, numberOfPostings[i]);
            localProperties.setProperty(Index.PropertyKeys.INDEXCLASS,
                    properties.getProperty(Index.PropertyKeys.INDEXCLASS));
            localProperties.setProperty(QuasiSuccinctIndex.PropertyKeys.BYTEORDER,
                    properties.getProperty(QuasiSuccinctIndex.PropertyKeys.BYTEORDER));
            localProperties.addProperties(Index.PropertyKeys.CODING,
                    properties.getStringArray(Index.PropertyKeys.CODING));
            localProperties.setProperty(BitStreamIndex.PropertyKeys.SKIPQUANTUM,
                    properties.getProperty(BitStreamIndex.PropertyKeys.SKIPQUANTUM));
            localProperties.setProperty(BitStreamIndex.PropertyKeys.SKIPHEIGHT,
                    properties.getProperty(BitStreamIndex.PropertyKeys.SKIPHEIGHT));
            if (strategyProperties != null && strategyProperties[i] != null)
                localProperties.addAll(strategyProperties[i]);
            localProperties.save(localBasename[i] + DiskBasedIndex.PROPERTIES_EXTENSION);
            LOGGER.debug("Post-partitioning properties for index " + localBasename[i] + ": "
                    + new ConfigurationMap(localProperties));
        }
    }

    private static void copy(final byte[] buffer, final InputBitStream from, final OutputBitStream to, long length)
            throws IOException {
        int res;
        final int bufferSize = buffer.length;
        while (length > 0) {
            res = (int) Math.min(bufferSize * 8, length);
            from.read(buffer, res);
            to.write(buffer, res);
            length -= res;
        }
    }

    private static void copy(final LongWordInputBitStream from, final LongWordOutputBitStream to, long length)
            throws IOException {
        while (length > 0) {
            final int width = (int) Math.min(Long.SIZE, length);
            to.append(from.extract(width), width);
            length -= width;
        }
    }

    public static void main(final String[] arg) throws JSAPException, ConfigurationException, IOException,
            ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException {

        SimpleJSAP jsap = new SimpleJSAP(PartitionLexically.class.getName(), "Partitions an index lexically.",
                new Parameter[] {
                        new FlaggedOption("bufferSize", JSAP.INTSIZE_PARSER,
                                Util.formatBinarySize(DEFAULT_BUFFER_SIZE), JSAP.NOT_REQUIRED, 'b', "buffer-size",
                                "The size of an I/O buffer."),
                        new FlaggedOption("logInterval", JSAP.LONG_PARSER,
                                Long.toString(ProgressLogger.DEFAULT_LOG_INTERVAL), JSAP.NOT_REQUIRED, 'l',
                                "log-interval", "The minimum time interval between activity logs in milliseconds."),
                        new FlaggedOption("strategy", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's',
                                "strategy", "A serialised lexical partitioning strategy."),
                        new FlaggedOption("uniformStrategy", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT,
                                JSAP.NOT_REQUIRED, 'u', "uniform",
                                "Requires a uniform partitioning in the given number of parts."),
                        new Switch("termsOnly", 't', "terms-only", "Just partition the term list."),
                        new UnflaggedOption("inputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED,
                                "The basename of the global index."),
                        new UnflaggedOption("outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED,
                                "The basename of the local indices.") });

        JSAPResult jsapResult = jsap.parse(arg);
        if (jsap.messagePrinted())
            return;
        String inputBasename = jsapResult.getString("inputBasename");
        String outputBasename = jsapResult.getString("outputBasename");
        String strategyFilename = jsapResult.getString("strategy");
        LexicalPartitioningStrategy strategy = null;

        if (jsapResult.userSpecified("uniformStrategy")) {
            strategy = LexicalStrategies.uniform(jsapResult.getInt("uniformStrategy"),
                    DiskBasedIndex.getInstance(inputBasename, false, false, true));
            BinIO.storeObject(strategy,
                    strategyFilename = outputBasename + IndexCluster.STRATEGY_DEFAULT_EXTENSION);
        } else if (strategyFilename != null)
            strategy = (LexicalPartitioningStrategy) BinIO.loadObject(strategyFilename);
        else
            throw new IllegalArgumentException("You must specify a splitting strategy");

        final PartitionLexically partitionLexically = new PartitionLexically(inputBasename, outputBasename,
                strategy, strategyFilename, jsapResult.getInt("bufferSize"), jsapResult.getLong("logInterval"));

        if (jsapResult.getBoolean("termsOnly"))
            partitionLexically.runTermsOnly();
        else
            partitionLexically.run();
    }
}