package it.unimi.di.big.mg4j.document;

/*
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2009-2015 Sebastiano Vigna
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

import it.unimi.di.big.mg4j.document.DocumentFactory.FieldType;
import it.unimi.di.big.mg4j.tool.Scan.VirtualDocumentFragment;
import it.unimi.di.big.mg4j.util.parser.callback.AnchorExtractor;
import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.AbstractLongIterator;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMaps;
import it.unimi.dsi.io.ByteBufferInputStream;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.NullInputStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.sux4j.util.EliasFanoMonotoneLongBigList;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.nio.channels.FileChannel.MapMode;
import java.util.NoSuchElementException;
import java.util.zip.ZipFile;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.io.IOUtils;

import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;

/** A basic, compressed document collection that can be easily built at indexing time.
 *
 * <p>Instances of this class record virtual and non-text fields just like {@link ZipDocumentCollection}—that is,
 * in a zip file. However, text fields are recorded in a simple but highly efficient format. Terms (and nonterms) are numbered globally
 * in an increasing way as they are met. While we scan each document, we keep track of frequencies for a limited number of terms:
 * terms are encoded with their frequency rank if we know their statistics, or by a special code derived from their
 * global number if we have no statistics about them. Every number involved is written in delta code.
 *
 * <p>A collection can be <em>exact</em> or <em>approximated</em>: in the latter case, nonwords will not be recorded, and will
 * be turned into spaces when decompressing.
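 *
 * <p>For example, a collection could be opened and scanned as follows (a minimal sketch: the
 * filename is hypothetical, and exception handling is omitted):
 *
 * <pre>
 * DocumentCollection collection = (DocumentCollection)AbstractDocumentSequence.load("foo.collection");
 * for (long i = 0; i &lt; collection.size(); i++) {
 *     Document document = collection.document(i);
 *     System.out.println(document.title());
 *     document.close();
 * }
 * collection.close();
 * </pre>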
 *
 * <p>An instance of this collection will be, like any other collection, serialised to a file, but it will refer to several other files
 * that are derived from the instance basename. Please use {@link AbstractDocumentSequence#load(CharSequence)}
 * to load instances of this collection.
 *
 * <p>This class suffers from the same scalability problem as {@link ZipDocumentCollection} if you compress non-text or virtual fields. Text
 * compression, on the other hand, is extremely efficient and scalable.
 *
 * @author Sebastiano Vigna
 */

public class SimpleCompressedDocumentCollection extends AbstractDocumentCollection implements Serializable {
    private static final long serialVersionUID = 1L;
    private static final boolean DEBUG = false;
    protected static final boolean ASSERTS = false;

    /** Standard extension for the file containing encoded documents. */
    public static final String DOCUMENTS_EXTENSION = ".documents";
    /** Standard extension for the file containing document offsets stored as δ-encoded gaps. */
    public static final String DOCUMENT_OFFSETS_EXTENSION = ".docoffsets";
    /** Standard extension for the file containing terms in {@link MutableString#writeSelfDelimUTF8(java.io.OutputStream)} format. */
    public static final String TERMS_EXTENSION = ".terms";
    /** Standard extension for the file containing term offsets stored as δ-encoded gaps. */
    public static final String TERM_OFFSETS_EXTENSION = ".termoffsets";
    /** Standard extension for the file containing nonterms in {@link MutableString#writeSelfDelimUTF8(java.io.OutputStream)} format. */
    public static final String NONTERMS_EXTENSION = ".nonterms";
    /** Standard extension for the file containing nonterm offsets stored as δ-encoded gaps. */
    public static final String NONTERM_OFFSETS_EXTENSION = ".nontermoffsets";
    /** Standard extension for the stats file. */
    public static final String STATS_EXTENSION = ".stats";

    /** The basename of this collection. */
    private final String basename;
    /** Whether this collection is exact (i.e., whether it stores nonwords). */
    private final boolean exact;
    /** The number of documents in this collection. */
    private final long documents;
    /** The number of terms in this collection. */
    private final long terms;
    /** The number of nonterms in this collection, or -1 if {@link #exact} is false. */
    private final long nonTerms;
    /** The document offsets. */
    private transient EliasFanoMonotoneLongBigList docOffsets;
    /** The term offsets. */
    private transient EliasFanoMonotoneLongBigList termOffsets;
    /** The nonterm offsets, or <code>null</code> if {@link #exact} is false. */
    private transient EliasFanoMonotoneLongBigList nonTermOffsets;
    /** The input bit stream for documents. */
    private transient InputBitStream documentsInputBitStream;
    /** The input stream for terms. */
    private transient FastBufferedInputStream termsInputStream;
    /** The input stream for nonterms, or <code>null</code> if {@link #exact} is false. */
    private transient FastBufferedInputStream nonTermsInputStream;
    /** A frequency keeper used to decompress document terms. */
    private transient FrequencyCodec termsFrequencyKeeper;
    /** A frequency keeper used to decompress document nonterms, or <code>null</code> if {@link #exact} is false. */
    private transient FrequencyCodec nonTermsFrequencyKeeper;
    /** The underlying factory. */
    private final DocumentFactory factory;
    /** Whether this collection contains non-text or virtual fields. */
    private final boolean hasNonText;
    /** The zip file used to store non-text and virtual fields if {@link #hasNonText} is true, or <code>null</code> if this collection does not store such fields. */
    private transient ZipFile zipFile;
    /** The input stream obtained by memory-mapping the file containing documents, or <code>null</code>. */
    private transient ByteBufferInputStream documentsByteBufferInputStream;
    /** The input stream obtained by memory-mapping the file containing terms, or <code>null</code>. */
    private transient ByteBufferInputStream termsByteBufferInputStream;
    /** The input stream obtained by memory-mapping the file containing nonterms, or <code>null</code>. */
    private transient ByteBufferInputStream nonTermsByteBufferInputStream;
    /** True if all ancillary files have been correctly opened. */
    private boolean fileOpenOk;
    /** True if all memory mappings have been obtained. */
    private boolean fileMappingOk;

    /** An iterator used to load δ-encoded offset gaps. */
    private static final class OffsetsLongIterator extends AbstractLongIterator {
        private final long numberOfItems;
        private long currIndex;
        private long currValue;
        private final InputBitStream ibs;

        public OffsetsLongIterator(InputBitStream ibs, long numberOfItems) {
            this.ibs = ibs;
            this.numberOfItems = numberOfItems;
        }

        public boolean hasNext() {
            return currIndex < numberOfItems;
        }

        @Override
        public long nextLong() {
            if (!hasNext()) throw new NoSuchElementException();
            try {
                currIndex++;
                return currValue += ibs.readDelta();
            }
            catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /** A simple codec for integers that remaps frequent numbers to smaller numbers. */
    protected static class FrequencyCodec {
        /** The size of the symbol queue. */
        private final static int MAX_QUEUE_SIZE = 2048;
        /** The symbol queue. */
        private final int[] queue;
        /** An array parallel to {@link #queue} containing frequencies. */
        private final int[] freq;
        /** A map from input symbols to positions in {@link #queue}. */
        private final Int2IntOpenHashMap code2Pos;
        /** The current size of {@link #queue}. */
        private int queueSize;

        public FrequencyCodec() {
            code2Pos = new Int2IntOpenHashMap();
            code2Pos.defaultReturnValue(-1);
            queue = new int[MAX_QUEUE_SIZE];
            freq = new int[MAX_QUEUE_SIZE];
        }

        /** Empties the queue and the symbol-to-position map. */
        public void reset() {
            queueSize = 0;
            code2Pos.clear();
        }
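
        /* A round-trip sketch (illustrative only, not part of the original code): the
         * encoding and decoding sides each keep their own FrequencyCodec, and the two
         * instances stay synchronised as long as they process the same symbol sequence:
         *
         *   FrequencyCodec coder = new FrequencyCodec(), decoder = new FrequencyCodec();
         *   for (int x : new int[] { 3, 5, 3, 3, 5 })
         *       assert decoder.decode(coder.encode(x)) == x;
         */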
        private final void newSymbol(final int symbol) {
            if (queueSize == MAX_QUEUE_SIZE) {
                // Queue filled up. First, we guarantee that there are elements with frequency one.
                if (freq[MAX_QUEUE_SIZE - 1] != 1) for (int j = MAX_QUEUE_SIZE; j-- != 0;) freq[j] /= freq[MAX_QUEUE_SIZE - 1];
                // Then, we remove half of them.
                int j = MAX_QUEUE_SIZE;
                while (j-- != 0) if (freq[j] > 1) break;
                for (int k = j + (MAX_QUEUE_SIZE - j) / 2; k < MAX_QUEUE_SIZE; k++) {
                    if (ASSERTS) assert freq[k] == 1;
                    code2Pos.remove(queue[k]);
                }
                queueSize = j + (MAX_QUEUE_SIZE - j) / 2;
                // The evicted symbols have been dropped from code2Pos, too, so if they
                // reappear they will simply be encoded as new symbols again.
            }
            // Now we know that we have space.
            if (ASSERTS) assert queueSize < MAX_QUEUE_SIZE;
            code2Pos.put(symbol, queueSize);
            queue[queueSize] = symbol;
            freq[queueSize] = 1;
            queueSize++;
        }

        private final void oldSymbol(final int pos) {
            // Term already in list. Find the term to exchange for the change of frequency.
            int ex = pos;
            while (ex >= 0 && freq[ex] == freq[pos]) ex--;
            ++ex;
            freq[pos]++;
            // Exchange
            int t = queue[pos];
            queue[pos] = queue[ex];
            queue[ex] = t;
            t = freq[pos];
            freq[pos] = freq[ex];
            freq[ex] = t;
            code2Pos.put(queue[ex], ex);
            code2Pos.put(queue[pos], pos);
        }

        /** Encodes a symbol, returning a (hopefully smaller) symbol.
         *
         * @param symbol the input symbol.
         * @return the output symbol.
         */
        public int encode(final int symbol) {
            final int pos = code2Pos.get(symbol);
            if (pos == -1) {
                final int result = queueSize + symbol;
                newSymbol(symbol);
                return result;
            }
            else {
                if (DEBUG) System.err.println("Symbol " + symbol + " in list; writing " + pos + " " + code2Pos + " " + IntArrayList.wrap(queue, queueSize) + " " + IntArrayList.wrap(freq, queueSize));
                oldSymbol(pos);
                return pos;
            }
        }

        /** Decodes a symbol, returning the original symbol.
         *
         * @param symbol a symbol from an encoded file.
         * @return the corresponding original input symbol.
         */
        public int decode(final int symbol) {
            if (symbol < queueSize) {
                final int result = queue[symbol];
                oldSymbol(symbol);
                return result;
            }
            else {
                int term = symbol - queueSize;
                newSymbol(term);
                return term;
            }
        }
    }

    private SimpleCompressedDocumentCollection(String basename,
            DocumentFactory factory,
            EliasFanoMonotoneLongBigList docOffsets,
            EliasFanoMonotoneLongBigList termOffsets,
            EliasFanoMonotoneLongBigList nonTermOffsets,
            ByteBufferInputStream documentsByteBufferInputStream,
            ByteBufferInputStream termsByteBufferInputStream,
            ByteBufferInputStream nonTermsByteBufferInputStream) {
        this.basename = basename;
        this.documents = docOffsets.size64() - 1;
        this.terms = termOffsets.size64() - 1;
        this.exact = nonTermOffsets != null;
        this.nonTerms = exact ? nonTermOffsets.size64() - 1 : -1;
        this.docOffsets = docOffsets;
        this.termOffsets = termOffsets;
        this.nonTermOffsets = nonTermOffsets;
        this.factory = factory;
        this.termsFrequencyKeeper = new FrequencyCodec();
        this.nonTermsFrequencyKeeper = exact ? new FrequencyCodec() : null;
        this.documentsByteBufferInputStream = documentsByteBufferInputStream;
        this.termsByteBufferInputStream = termsByteBufferInputStream;
        this.nonTermsByteBufferInputStream = nonTermsByteBufferInputStream;
        this.hasNonText = hasNonText(factory);
    }

    protected SimpleCompressedDocumentCollection(final String basename, final long documents, final long terms, final long nonTerms, final boolean exact, final DocumentFactory factory) {
        this.hasNonText = hasNonText(factory);
        this.basename = basename;
        this.documents = documents;
        this.terms = terms;
        this.nonTerms = nonTerms;
        this.exact = exact;
        this.factory = factory;
        this.termsFrequencyKeeper = null;
        this.nonTermsFrequencyKeeper = null;
        docOffsets = termOffsets = nonTermOffsets = null;
        documentsInputBitStream = null;
        termsInputStream = nonTermsInputStream = null;
        zipFile = null;
        try {
            super.close();
        }
        catch (IOException cantHappen) {
            throw new RuntimeException(cantHappen);
        }
    }

    private static boolean hasNonText(final DocumentFactory factory) {
        boolean hasNonText = false;
        for (int i = factory.numberOfFields(); i-- != 0;) hasNonText |= factory.fieldType(i) != FieldType.TEXT;
        return hasNonText;
    }

    private void initMappings(final String basename, final boolean rethrow) throws IOException {
        try {
            // TODO: This is too risky: we will have to make it optional at some point
            // documentsByteBufferInputStream = ByteBufferInputStream.map(new FileInputStream(basename + DOCUMENTS_EXTENSION).getChannel(), MapMode.READ_ONLY);
            termsByteBufferInputStream = ByteBufferInputStream.map(new FileInputStream(basename + TERMS_EXTENSION).getChannel(), MapMode.READ_ONLY);
            nonTermsByteBufferInputStream = nonTermOffsets != null ? ByteBufferInputStream.map(new FileInputStream(basename + NONTERMS_EXTENSION).getChannel(), MapMode.READ_ONLY) : null;
            fileMappingOk = true;
        }
        catch (IOException e) {
            // We leave the possibility for a filename() to fix the problem and map the files.
            if (rethrow) throw e;
        }
    }

    private void loadOffsets(final String basename, final boolean rethrow) throws IOException {
        try {
            docOffsets = loadOffsetsSuccinctly(basename + DOCUMENT_OFFSETS_EXTENSION, documents, new File(basename + DOCUMENTS_EXTENSION).length() * Byte.SIZE + 1);
            termOffsets = loadOffsetsSuccinctly(basename + TERM_OFFSETS_EXTENSION, terms, new File(basename + TERMS_EXTENSION).length() + 1);
            nonTermOffsets = nonTerms < 0 ? null : loadOffsetsSuccinctly(basename + NONTERM_OFFSETS_EXTENSION, nonTerms, new File(basename + NONTERMS_EXTENSION).length() + 1);
        }
        catch (IOException e) {
            // We leave the possibility for a filename() to fix the problem and load the right files.
            if (rethrow) throw e;
        }
    }

    private void initFiles(final String basename, final boolean rethrow) throws IOException {
        try {
            documentsInputBitStream = documentsByteBufferInputStream != null ? new InputBitStream(documentsByteBufferInputStream) : new InputBitStream(basename + DOCUMENTS_EXTENSION);
            termsInputStream = new FastBufferedInputStream(termsByteBufferInputStream != null ? termsByteBufferInputStream : new FileInputStream(basename + TERMS_EXTENSION));
            nonTermsInputStream = exact ? new FastBufferedInputStream(nonTermsByteBufferInputStream != null ? nonTermsByteBufferInputStream : new FileInputStream(basename + NONTERMS_EXTENSION)) : null;
            zipFile = hasNonText ? new ZipFile(basename + ZipDocumentCollection.ZIP_EXTENSION) : null;
            fileOpenOk = true;
        }
        catch (IOException e) {
            // We leave the possibility for a filename() to fix the problem and load the right files.
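            // (When deserialising, readObject() calls this method with rethrow == false;
            // filename() will retry later with rethrow == true if opening failed here.)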
            if (rethrow) throw e;
        }
    }

    private void ensureFiles() {
        if (!fileOpenOk) throw new IllegalStateException("Some of the files used by this " + SimpleCompressedDocumentCollection.class.getSimpleName() + " have not been loaded correctly; please use " + AbstractDocumentSequence.class.getName() + ".load() or call filename() after deserialising this instance");
    }

    private static EliasFanoMonotoneLongBigList loadOffsetsSuccinctly(final CharSequence filename, final long numberOfItems, final long upperBound) throws IOException {
        final InputBitStream ibs = new InputBitStream(filename.toString());
        final EliasFanoMonotoneLongBigList offsets = new EliasFanoMonotoneLongBigList(numberOfItems + 1, upperBound, new OffsetsLongIterator(ibs, numberOfItems + 1));
        ibs.close();
        return offsets;
    }

    @Override
    public void filename(CharSequence filename) throws IOException {
        if (!fileMappingOk) initMappings(new File(new File(filename.toString()).getParentFile(), basename).toString(), true);
        if (!fileOpenOk) {
            loadOffsets(new File(new File(filename.toString()).getParentFile(), basename).toString(), true);
            initFiles(new File(new File(filename.toString()).getParentFile(), basename).toString(), true);
        }
    }

    public DocumentCollection copy() {
        ensureFiles();
        try {
            SimpleCompressedDocumentCollection copy = new SimpleCompressedDocumentCollection(basename,
                    factory.copy(),
                    docOffsets,
                    termOffsets,
                    nonTermOffsets,
                    documentsByteBufferInputStream != null ? documentsByteBufferInputStream.copy() : null,
                    termsByteBufferInputStream != null ? termsByteBufferInputStream.copy() : null,
                    nonTermsByteBufferInputStream != null ? nonTermsByteBufferInputStream.copy() : null);
            copy.loadOffsets(basename, true);
            copy.initFiles(basename, true);
            return copy;
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private static MutableString readSelfDelimitedUtf8String(final InputBitStream ibs, final MutableString s) throws IOException {
        s.length(0);
        for (int length = ibs.readDelta(); length-- != 0;) s.append((char) ibs.readZeta(7));
        return s;
    }

    public Document document(long index) throws IOException {
        ensureDocumentIndex(index);
        ensureFiles();
        documentsInputBitStream.position(docOffsets.getLong(index));
        @SuppressWarnings("resource")
        final DataInputStream nonTextDataInputStream = hasNonText ? new DataInputStream(new FastBufferedInputStream(zipFile.getInputStream(zipFile.getEntry(Long.toString(index))))) : null;
        final MutableString uri = readSelfDelimitedUtf8String(documentsInputBitStream, new MutableString());
        final MutableString title = readSelfDelimitedUtf8String(documentsInputBitStream, new MutableString());

        return new AbstractDocument() {
            final MutableString fieldContent = new MutableString();
            @SuppressWarnings("unchecked")
            final Document fakeDocument = factory.getDocument(NullInputStream.getInstance(), Reference2ObjectMaps.EMPTY_MAP);
            int nextField = 0;

            public Object content(int field) throws IOException {
                FieldType fieldType = factory.fieldType(field);
                if (nextField > field) throw new IllegalStateException();
                // Skip fields
                final MutableString s = new MutableString();
                int len;
                while (nextField < field) {
                    switch (factory.fieldType(nextField)) {
                    case TEXT:
                        len = documentsInputBitStream.readDelta();
                        if (exact) len *= 2;
                        documentsInputBitStream.skipDeltas(len);
                        break;
                    case VIRTUAL:
                        final int nfrag = nonTextDataInputStream.readInt();
                        for (int i = 0; i < 2 * nfrag; i++) MutableString.skipSelfDelimUTF8(nonTextDataInputStream);
                        break;
                    default:
                        try {
                            new ObjectInputStream(nonTextDataInputStream).readObject();
                        }
                        catch (ClassNotFoundException e) {
                            throw new RuntimeException(e);
                        }
                    }
                    nextField++;
                }

                // Read field
                nextField++;
                switch (fieldType) {
                case TEXT:
                    len = documentsInputBitStream.readDelta();
                    fieldContent.length(0);
                    termsFrequencyKeeper.reset();
                    if (exact) nonTermsFrequencyKeeper.reset();
                    while (len-- != 0) {
                        termsInputStream.position(termOffsets.getLong(termsFrequencyKeeper.decode(documentsInputBitStream.readDelta())));
                        s.readSelfDelimUTF8(termsInputStream);
                        fieldContent.append(s);
                        if (exact) {
                            nonTermsInputStream.position(nonTermOffsets.getLong(nonTermsFrequencyKeeper.decode(documentsInputBitStream.readDelta())));
                            s.readSelfDelimUTF8(nonTermsInputStream);
                            fieldContent.append(s);
                        }
                        else fieldContent.append(' ');
                    }
                    return new FastBufferedReader(fieldContent);
                case VIRTUAL:
                    final int nfrag = nonTextDataInputStream.readInt();
                    MutableString doc = new MutableString();
                    MutableString text = new MutableString();
                    VirtualDocumentFragment[] fragArray = new VirtualDocumentFragment[nfrag];
                    for (int i = 0; i < nfrag; i++) {
                        doc.readSelfDelimUTF8((InputStream) nonTextDataInputStream);
                        text.readSelfDelimUTF8((InputStream) nonTextDataInputStream);
                        fragArray[i] = new AnchorExtractor.Anchor(doc.copy(), text.copy());
                    }
                    return new ObjectArrayList<VirtualDocumentFragment>(fragArray);
                default:
                    try {
                        return new ObjectInputStream(nonTextDataInputStream).readObject();
                    }
                    catch (ClassNotFoundException e) {
                        throw new RuntimeException(e);
                    }
                }
            }

            public CharSequence title() {
                return title;
            }
            public CharSequence uri() {
                return uri.length() == 0 ? null : uri;
            }

            public WordReader wordReader(int field) {
                switch (factory.fieldType(field)) {
                case TEXT:
                case VIRTUAL:
                    return fakeDocument.wordReader(field);
                default:
                    return null;
                }
            }

            public void close() throws IOException {
                super.close();
                if (hasNonText) nonTextDataInputStream.close();
            }
        };
    }

    public Reference2ObjectMap<Enum<?>, Object> metadata(long index) throws IOException {
        throw new UnsupportedOperationException();
    }

    public long size() {
        return documents;
    }

    public InputStream stream(long index) throws IOException {
        throw new UnsupportedOperationException();
    }

    public void close() throws IOException {
        super.close();
        if (documentsInputBitStream != null) documentsInputBitStream.close();
        IOUtils.closeQuietly(termsInputStream);
        IOUtils.closeQuietly(nonTermsInputStream);
    }

    public DocumentFactory factory() {
        return factory;
    }

    private void readObject(final ObjectInputStream s) throws IOException, ClassNotFoundException {
        s.defaultReadObject();
        loadOffsets(basename, false);
        initMappings(basename, false);
        initFiles(basename, false);
        termsFrequencyKeeper = new FrequencyCodec();
        if (exact) nonTermsFrequencyKeeper = new FrequencyCodec();
    }

    /** Renumbers terms (and nonterms) in order of decreasing frequency. Unfinished, experimental method. */
    @SuppressWarnings("resource")
    public static void optimize(final CharSequence basename) throws IOException, ClassNotFoundException {
        final SimpleCompressedDocumentCollection collection = (SimpleCompressedDocumentCollection) AbstractDocumentCollection.load(basename);
        final long[] termFrequency = new long[(int) collection.terms];
        final long[] nonTermFrequency = collection.exact ? new long[(int) collection.nonTerms] : null;
        final InputBitStream documentsIbs = collection.documentsInputBitStream;
        final DocumentFactory factory = collection.factory;
        final boolean exact = collection.exact;
        final MutableString s = new MutableString();

        // First pass: accumulate term (and nonterm) frequencies.
        documentsIbs.position(0);
        for (int i = (int) collection.documents; i-- != 0;) {
            readSelfDelimitedUtf8String(documentsIbs, s); // Skip URI
            readSelfDelimitedUtf8String(documentsIbs, s); // Skip title
            for (int f = factory.numberOfFields() - 1; f-- != 0;) {
                int len = documentsIbs.readDelta();
                while (len-- != 0) {
                    termFrequency[documentsIbs.readDelta()]++;
                    if (exact) nonTermFrequency[documentsIbs.readDelta()]++;
                }
            }
        }

        // Sort terms by decreasing frequency.
        int[] termPerm = new int[termFrequency.length];
        for (int i = termPerm.length; i-- != 0;) termPerm[i] = i;
        IntArrays.quickSort(termPerm, 0, termPerm.length, new AbstractIntComparator() {
            public int compare(int arg0, int arg1) {
                return termFrequency[arg1] - termFrequency[arg0] < 0 ? -1 : termFrequency[arg1] == termFrequency[arg0] ? 0 : 1;
            }
        });
        int[] invTermPerm = new int[termFrequency.length];
        for (int i = invTermPerm.length; i-- != 0;) invTermPerm[termPerm[i]] = i;

        int[] nonTermPerm = null, invNonTermPerm = null;
        if (exact) {
            // Sort nonterms by decreasing frequency.
            nonTermPerm = new int[nonTermFrequency.length];
            for (int i = nonTermPerm.length; i-- != 0;) nonTermPerm[i] = i;
            IntArrays.quickSort(nonTermPerm, 0, nonTermPerm.length, new AbstractIntComparator() {
                public int compare(int arg0, int arg1) {
                    return nonTermFrequency[arg1] - nonTermFrequency[arg0] < 0 ? -1 : nonTermFrequency[arg1] == nonTermFrequency[arg0] ? 0 : 1;
                }
            });
            invNonTermPerm = new int[nonTermFrequency.length];
            for (int i = invNonTermPerm.length; i-- != 0;) invNonTermPerm[nonTermPerm[i]] = i;
        }

        // Second pass: rewrite the documents stream, renumbering terms and nonterms.
        File newDocumentsFile = File.createTempFile(SimpleCompressedDocumentCollection.class.getSimpleName(), "temp", new File(basename.toString()).getParentFile());
        OutputBitStream newDocumentsObs = new OutputBitStream(newDocumentsFile);
        documentsIbs.position(0);
        for (int i = (int) collection.documents; i-- != 0;) {
            readSelfDelimitedUtf8String(documentsIbs, s); // Copy URI
            SimpleCompressedDocumentCollectionBuilder.writeSelfDelimitedUtf8String(newDocumentsObs, s);
            readSelfDelimitedUtf8String(documentsIbs, s); // Copy title
            SimpleCompressedDocumentCollectionBuilder.writeSelfDelimitedUtf8String(newDocumentsObs, s);
            for (int f = factory.numberOfFields() - 1; f-- != 0;) {
                int len = documentsIbs.readDelta();
                newDocumentsObs.writeDelta(len);
                while (len-- != 0) {
                    newDocumentsObs.writeDelta(invTermPerm[documentsIbs.readDelta()]);
                    if (exact) newDocumentsObs.writeDelta(invNonTermPerm[documentsIbs.readDelta()]);
                }
            }
        }
        newDocumentsObs.close();
        new File(basename + DOCUMENTS_EXTENSION).delete();
        newDocumentsFile.renameTo(new File(basename + DOCUMENTS_EXTENSION));
        newDocumentsObs = null;
        invTermPerm = invNonTermPerm = null;

        // Read back the terms so that they can be permuted.
        FastBufferedInputStream termsStream = new FastBufferedInputStream(new FileInputStream(basename + TERMS_EXTENSION));
        MutableString term[] = new MutableString[(int) collection.terms];
        for (int i = 0; i < term.length; i++) term[i] = new MutableString().readSelfDelimUTF8(termsStream);
        termsStream.close();
        // Unfinished: the permuted terms should be written back to the file opened here.
        new FastBufferedOutputStream(new FileOutputStream(basename + TERMS_EXTENSION));
    }

    public static void main(final String[] arg) throws IOException, JSAPException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, ConfigurationException, ClassNotFoundException {
        SimpleJSAP jsap = new SimpleJSAP(SimpleCompressedDocumentCollection.class.getName(), "Optimises a simple compressed document collection.",
                new Parameter[] {
                    new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename of the collection."),
                });

        JSAPResult jsapResult = jsap.parse(arg);
        if (jsap.messagePrinted()) return;
        optimize(jsapResult.getString("basename"));
    }
}
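
// A hypothetical invocation of the optimiser from the command line (the collection
// filename is an example; it must point to a serialised instance of this class):
//
//   java it.unimi.di.big.mg4j.document.SimpleCompressedDocumentCollection mycollection.collection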