Java tutorial

The listing below, Indexing.java from the trec-entity-tool project, shows how to index Sindice-ED entity dumps with SIREn: each entity becomes a Lucene document with subject, type, outgoing-triple and incoming-triple fields.
/**
 * Copyright 2011, Campinas Stephane
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/**
 * @project trec-entity-tool
 * @author Campinas Stephane [ 3 Jun 2011 ]
 * @link stephane.campinas@deri.org
 */
package org.sindice.siren.index;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.zip.GZIPInputStream;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import org.sindice.siren.analysis.TupleAnalyzer;
import org.sindice.siren.analysis.TupleAnalyzer.URINormalisation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Index a list of entities, creating incoming-triples, outgoing-triples,
 * subject and type fields. The type field is a grouping of the rdf:type
 * objects of the entity.<br>
 * Outgoing triples are stored as n-tuples where a predicate has all its
 * related values. Incoming triples are also stored as n-tuples, the
 * difference being that a predicate possesses its related subject URIs.
 */
public abstract class Indexing implements Iterator<Entity> {

  protected final Logger logger = LoggerFactory.getLogger(Indexing.class);

  /* Perform a commit by batch of COMMIT documents */
  public static int COMMIT = 10000;
  /* Whether to store the triple fields */
  public static boolean STORE = true;
  /* Skip the dump files numbered below SKIP_TO */
  public static int SKIP_TO = 0;

  // Field names used in the index
  public static final String INCOMING_TRIPLE = "incoming-triple";
  public static final String OUTGOING_TRIPLE = "outgoing-triple";
  public static final String SUBJECT = "subject";
  public static final String TYPE = "type";

  /* The dataset files */
  protected final File[] input;
  protected int inputPos = 0;

  /* The current reader into the compressed archive */
  protected TarArchiveInputStream reader = null;
  /* A file entry in the archive */
  protected TarArchiveEntry tarEntry;

  /* SIREn index */
  protected final Directory indexDir;
  protected final IndexWriter writer;

  /**
   * Create a SIREn index at indexDir, taking the files at inputDir as input.
   * @param inputDir the folder containing the dataset archives
   * @param dir the directory where the index is written
   * @throws IOException
   */
  public Indexing(final File inputDir, final Directory dir)
  throws IOException {
    this.input = inputDir.listFiles(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        if (name.matches(getPattern())) {
          final int dump = Integer.valueOf(name.substring(3, name.indexOf('.')));
          return dump >= SKIP_TO; // discard any dump file numbered below SKIP_TO
        }
        return false;
      }
    });
    /*
     * Sort by filename: important because in the Sindice-ED dataset, two
     * consecutive dumps can store a same entity
     */
    Arrays.sort(this.input);
    if (this.input.length == 0) {
      throw new RuntimeException("No archive files in the folder: " +
                                 inputDir.getAbsolutePath());
    }
    this.indexDir = dir;
    this.writer = initializeIndexWriter(this.indexDir);
    reader = getTarInputStream(this.input[0]);
    logger.info("Creating index from input located at {} ({} files)",
      inputDir.getAbsolutePath(), input.length);
    logger.info("Reading dump: {}", this.input[0]);
  }

  /**
   * The regular expression that the input files must match.
   * @return the filename pattern
   */
  protected abstract String getPattern();

  /**
   * Create a buffered tar input stream from the file in.
   * @param in a gzip-compressed tar archive
   * @return a stream over the archive entries
   * @throws FileNotFoundException
   * @throws IOException
   */
  private TarArchiveInputStream getTarInputStream(final File in)
  throws FileNotFoundException, IOException {
    return new TarArchiveInputStream(
      new BufferedInputStream(new GZIPInputStream(new FileInputStream(in))));
  }

  @Override
  public boolean hasNext() {
    return this.hasNext(null);
  }

  /**
   * Move to the next tar entry.
   * @param rootDir an entry path
   * @return true if a next tar entry can be read, or if this entry name is a
   *         sub-folder of rootDir
   */
  protected boolean hasNext(final String rootDir) {
    try {
      /*
       * If reader.available() is not equal to 0, then this entry has been
       * loaded, but not read yet.
       */
      while (reader.available() == 0 &&
             (tarEntry = reader.getNextTarEntry()) == null) { // next tar entry
        if (++inputPos >= input.length) { // no more archives
          reader.close();
          return false;
        }
        // Move to the next archive file
        reader.close();
        logger.info("Reading dump: {}", this.input[inputPos]);
        reader = getTarInputStream(input[inputPos]);
      }
    } catch (IOException e) {
      logger.error("Error while reading the input: {}\n{}", input[inputPos], e);
      return false; // abort the iteration instead of reusing a stale entry
    }
    /*
     * When returning from this method, the input stream is positioned at a
     * regular file, i.e., metadata, outgoing-triples.nt or incoming-triples.nt.
     */
    if (tarEntry.isDirectory()) {
      return hasNext(rootDir);
    }
    return rootDir == null || tarEntry.getName().startsWith(rootDir);
  }

  /**
   * Create an index writer that uses a {@link TupleAnalyzer} on the triple
   * fields, with a tokenisation of the URI's localname, and the default
   * {@link WhitespaceAnalyzer} on the others.
   * @param dir the directory where the index is created
   * @return the configured index writer
   * @throws IOException
   */
  @SuppressWarnings("deprecation")
  private IndexWriter initializeIndexWriter(final Directory dir)
  throws IOException {
    final Analyzer defaultAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_31);
    final Map<String, Analyzer> fieldAnalyzers = new HashMap<String, Analyzer>();
    final TupleAnalyzer tuple = new TupleAnalyzer(new StandardAnalyzer(Version.LUCENE_31));

    // Tokenise the localname of URIs in the two triple fields
    tuple.setURINormalisation(URINormalisation.LOCALNAME);
    fieldAnalyzers.put(OUTGOING_TRIPLE, tuple);
    fieldAnalyzers.put(INCOMING_TRIPLE, tuple);

    final IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31,
      new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers));

    // Disable compound files
    ((LogMergePolicy) config.getMergePolicy()).setUseCompoundFile(false);
    // Increase the merge factor to 20 - more adapted to batch creation
    ((LogMergePolicy) config.getMergePolicy()).setMergeFactor(20);

    config.setRAMBufferSizeMB(256);
    config.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
    config.setMaxBufferedDeleteTerms(IndexWriterConfig.DISABLE_AUTO_FLUSH);

    final IndexWriter writer = new IndexWriter(dir, config);
    writer.setMaxFieldLength(Integer.MAX_VALUE);
    return writer;
  }

  /**
   * Creates the entity index.
   * @throws CorruptIndexException
   * @throws IOException
   */
  public void indexIt() throws CorruptIndexException, IOException {
    Entity entity = null;
    long counter = 0;

    while (hasNext()) { // for each entity
      entity = next();
      Document doc = new Document();
      doc.add(new Field(SUBJECT, entity.subject, Store.YES,
        Index.NOT_ANALYZED_NO_NORMS));
      doc.add(new Field(TYPE, Utils.toString(entity.type), Store.YES,
        Index.ANALYZED_NO_NORMS));
      doc.add(new Field(OUTGOING_TRIPLE, entity.getTriples(true),
        STORE ? Store.YES : Store.NO, Index.ANALYZED_NO_NORMS));
      doc.add(new Field(INCOMING_TRIPLE, entity.getTriples(false),
        STORE ? Store.YES : Store.NO, Index.ANALYZED_NO_NORMS));
      writer.addDocument(doc);
      counter = commit(true, counter, entity.subject);
    }
    if (entity != null) { // guard against an input without any entity
      commit(false, counter, entity.subject); // Commit what is left
    }
    writer.optimize();
  }

  /**
   * Commits the documents by batch.
   * @param indexing true while indexing, false for the final commit
   * @param counter the number of entities indexed so far
   * @param subject the subject of the last indexed entity
   * @return the updated counter
   * @throws CorruptIndexException
   * @throws IOException
   */
  private long commit(boolean indexing, long counter, String subject)
  throws CorruptIndexException, IOException {
    if (!indexing || (++counter % COMMIT) == 0) { // Index by batch
      writer.commit();
      logger.info("Committed {} entities. Last entity: {}",
        (indexing ? COMMIT : counter), subject);
    }
    return counter;
  }

  /**
   * Close resources.
   * @throws CorruptIndexException
   * @throws IOException
   */
  public void close() throws CorruptIndexException, IOException {
    try {
      writer.close();
    } finally {
      indexDir.close();
    }
  }

  @Override
  public void remove() {
  }

}
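Indexing is abstract: a concrete subclass must supply the archive filename pattern through getPattern() and the entity parsing through next() (the Entity and Utils classes come from the same org.sindice.siren.index package). The driver below is a minimal sketch of how the class could be used; SindiceEDIndexing is a hypothetical subclass name and the command-line layout is an assumption, not part of the listing above.

import java.io.File;
import java.io.IOException;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.sindice.siren.index.Indexing;

public class IndexingExample {

  public static void main(final String[] args) throws IOException {
    // args[0]: folder containing the gzipped tar dumps
    // args[1]: folder where the SIREn index is created
    final File inputDir = new File(args[0]);
    final Directory dir = FSDirectory.open(new File(args[1]));

    // SindiceEDIndexing is a hypothetical concrete subclass implementing
    // getPattern() and next(); it is not shown in this tutorial.
    final Indexing indexing = new SindiceEDIndexing(inputDir, dir);
    try {
      indexing.indexIt(); // index all entities, committing every COMMIT documents
    } finally {
      indexing.close();   // closes both the IndexWriter and the Directory
    }
  }

}

Note that close() also closes the Directory, so the caller must not reuse dir afterwards. The COMMIT, STORE and SKIP_TO statics can be adjusted before calling indexIt() to tune the commit batch size, whether the triple fields are stored, and the first dump file to process.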