com.yahoo.glimmer.indexing.generator.Index.java Source code

Java tutorial

Introduction

Here is the source code for com.yahoo.glimmer.indexing.generator.Index.java

Source

package com.yahoo.glimmer.indexing.generator;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 *  Unless required by applicable law or agreed to in writing, software distributed under the License is 
 *  distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and limitations under the License.
 *  See accompanying LICENSE file.
 */

import it.unimi.di.big.mg4j.index.BitStreamIndexWriter;
import it.unimi.di.big.mg4j.index.CompressionFlags;
import it.unimi.di.big.mg4j.index.CompressionFlags.Coding;
import it.unimi.di.big.mg4j.index.CompressionFlags.Component;
import it.unimi.di.big.mg4j.index.DiskBasedIndex;
import it.unimi.di.big.mg4j.index.IndexWriter;
import it.unimi.di.big.mg4j.io.HadoopFileSystemIOFactory;
import it.unimi.di.big.mg4j.io.IOFactory;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.util.Properties;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Map;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.yahoo.glimmer.indexing.CombinedTermProcessor;
import com.yahoo.glimmer.indexing.ResourceRefTermProcessor;

/**
 * Bundles the output files that make up one MG4J index written to a Hadoop {@link FileSystem}:
 * the terms file, the properties file, the (optional, lazily created) document sizes file and
 * the {@link IndexWriter} itself.
 *
 * <p>Usage: construct, call {@link #open()}, write through the exposed writers, then call
 * {@link #close(long)} exactly once.</p>
 */
public class Index {
    private PrintWriter terms;
    private OutputStream properties;
    private OutputBitStream docSizes;
    private IndexWriter indexWriter;

    private final FileSystem fs;
    private final Path outputDir;
    private final String name;
    private final long numDocs;

    private final boolean positions;
    private final String hashValuePrefix;

    /** ID of the last document for which a size has been written, or -1 if none has been written yet. */
    private long docSizesLastDocument = -1;

    /**
     * @param fs the file system the index files are created on.
     * @param outputDir the directory the index files are created in.
     * @param indexName the index name; every '-' is replaced with '_' to form the actual name.
     * @param numDocs the number of documents passed to the index writer and used to pad the sizes file.
     * @param positions whether the index stores positions (and counts).
     * @param hashValuePrefix the value stored under the resource reference prefix property key.
     * @param indexWriterCacheSize currently unused; it was only consumed by the QuasiSuccinctIndexWriter
     *            code path, which is disabled. Kept so that existing callers keep compiling.
     */
    public Index(FileSystem fs, Path outputDir, String indexName, long numDocs, boolean positions,
            String hashValuePrefix, int indexWriterCacheSize) {
        this.fs = fs;
        this.outputDir = outputDir;
        // It seems like MG4J doesn't like index names with the '-' char
        this.name = indexName.replace('-', '_');
        this.numDocs = numDocs;
        this.positions = positions;
        this.hashValuePrefix = hashValuePrefix;
    }

    /**
     * Creates the terms and properties files and the index writer. The files are created with
     * overwrite disabled.
     *
     * @throws IOException if one of the output files can't be created.
     */
    public void open() throws IOException {
        String basename = new Path(outputDir, name).toString();

        Path termsPath = new Path(outputDir, name + DiskBasedIndex.TERMS_EXTENSION);
        terms = new PrintWriter(new BufferedWriter(new OutputStreamWriter(fs.create(termsPath, false), "UTF-8")));

        Path propertiesPath = new Path(outputDir, name + DiskBasedIndex.PROPERTIES_EXTENSION);
        properties = fs.create(propertiesPath, false);

        // Start from a private copy of the defaults so that the shared default map is never modified.
        Map<Component, Coding> defaultStandardIndexFlags = new Object2ObjectOpenHashMap<Component, Coding>(
                CompressionFlags.DEFAULT_STANDARD_INDEX);
        if (!positions) {
            defaultStandardIndexFlags.remove(CompressionFlags.Component.POSITIONS);
            // NOTE: per the original author's remark, quasi-succinct indexes always need counts, so
            // dropping COUNTS would have to be revisited if a QuasiSuccinctIndexWriter is used here.
            defaultStandardIndexFlags.remove(CompressionFlags.Component.COUNTS);
        }

        IOFactory ioFactory = new HadoopFileSystemIOFactory(fs);
        indexWriter = new BitStreamIndexWriter(ioFactory, basename, numDocs, true, defaultStandardIndexFlags);
    }

    /** @return the writer for the terms file, or <code>null</code> if {@link #open()} hasn't been called. */
    public PrintWriter getTermsWriter() {
        return terms;
    }

    /** @return the <code>positions</code> flag given to the constructor. */
    public boolean hasPositions() {
        return positions;
    }

    /** @return the index writer, or <code>null</code> if {@link #open()} hasn't been called. */
    public IndexWriter getIndexWriter() {
        return indexWriter;
    }

    /** @return the stream of the properties file, or <code>null</code> if {@link #open()} hasn't been called. */
    public OutputStream getPropertiesStream() {
        return properties;
    }

    /**
     * Appends the size of the given document to the sizes file, creating the file on first use.
     * Document IDs must be strictly increasing between calls; a size of 0 is written for every
     * skipped document ID.
     *
     * @param document the document ID.
     * @param size the size of the document.
     * @throws IllegalArgumentException if <code>document</code> isn't bigger than the ID given in the previous call.
     * @throws IOException if the sizes file can't be created or written.
     */
    public void writeDocSize(long document, int size) throws IOException {
        if (document <= docSizesLastDocument) {
            throw new IllegalArgumentException("Given document ID " + document
                    + " isn't bigger than the document ID given in the previous call " + docSizesLastDocument);
        }

        if (docSizes == null) {
            // Only create the file when needed.  writeDocSize() shouldn't be called for vertical indexes.
            Path docSizesPath = new Path(outputDir, name + DiskBasedIndex.SIZES_EXTENSION);
            docSizes = new OutputBitStream(fs.create(docSizesPath, false));
        }

        // Step past the last written document, then zero-fill any gap up to the given document.
        for (docSizesLastDocument++; docSizesLastDocument < document; docSizesLastDocument++) {
            docSizes.writeGamma(0);
        }
        docSizes.writeGamma(size);
    }

    /**
     * Saves the index properties, pads the sizes file (if one was created) to <code>numDocs</code>
     * entries and closes every output. All outputs are closed even when an earlier step fails.
     *
     * @param writtenOccurrences the occurrences count stored in the properties when the index has positions.
     * @throws IOException if writing or closing any of the outputs fails. A failure to save the
     *             properties caused by a {@link ConfigurationException} is rethrown as an
     *             <code>IOException</code> with the original exception as its cause.
     */
    public void close(long writtenOccurrences) throws IOException {
        try {
            try {
                saveProperties(writtenOccurrences);
            } finally {
                if (properties != null) {
                    properties.close();
                }
            }

            if (docSizes != null) {
                // docSizesLastDocument is the ID of the last document already written, so advance past
                // it before padding; otherwise one size too many (numDocs + 1 in total) is written.
                for (docSizesLastDocument++; docSizesLastDocument < numDocs; docSizesLastDocument++) {
                    docSizes.writeGamma(0);
                }
            }
        } finally {
            closeRemainingOutputs();
        }
    }

    /** Fills in and saves the index writer's properties to the properties stream. */
    private void saveProperties(long writtenOccurrences) throws IOException {
        try {
            Properties props = indexWriter.properties();
            System.out.println("Closing index " + name + " which has "
                    + props.getProperty(it.unimi.di.big.mg4j.index.Index.PropertyKeys.TERMS) + " terms ");
            if (positions) {
                props.setProperty(it.unimi.di.big.mg4j.index.Index.PropertyKeys.OCCURRENCES, writtenOccurrences);
            }
            props.setProperty(it.unimi.di.big.mg4j.index.Index.PropertyKeys.MAXCOUNT, -1);
            props.setProperty(it.unimi.di.big.mg4j.index.Index.PropertyKeys.FIELD, name);
            props.setProperty(it.unimi.di.big.mg4j.index.Index.PropertyKeys.TERMPROCESSOR,
                    CombinedTermProcessor.getInstance());
            props.addProperty(ResourceRefTermProcessor.PropertyKeys.REF_PREFIX, hashValuePrefix);

            props.save(properties);
        } catch (ConfigurationException e) {
            // Keep the original exception as the cause so its stack trace isn't lost.
            throw new IOException(e.getMessage(), e);
        }
    }

    /** Closes the sizes stream, the terms writer and the index writer, in that order, attempting each one. */
    private void closeRemainingOutputs() throws IOException {
        try {
            if (docSizes != null) {
                docSizes.close();
            }
        } finally {
            try {
                if (terms != null) {
                    terms.close();
                }
            } finally {
                if (indexWriter != null) {
                    indexWriter.close();
                }
            }
        }
    }

    /** @return the index name, with every '-' of the name given to the constructor replaced by '_'. */
    public String getName() {
        return name;
    }
}