edu.southampton.wais.crowd.terrier.applications.Indexing.java Source code

Java tutorial

Introduction

Here is the source code for edu.southampton.wais.crowd.terrier.applications.Indexing.java

Source

/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.ac.gla.uk
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is TRECIndexing.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Gianni Amati <gba{a.}fub.it> (original author)
 *   Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>
 *   Ben He <ben{a.}dcs.gla.ac.uk>
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> 
 */
package edu.southampton.wais.crowd.terrier.applications;

import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;

import org.terrier.indexing.BasicSinglePassIndexer;
import org.terrier.indexing.BlockSinglePassIndexer;
import org.terrier.indexing.Collection;
import org.terrier.indexing.CollectionFactory;
import org.terrier.indexing.Indexer;
import org.terrier.structures.Index;

/**
 * This class creates the indices for a test collection.
 * <p>
 * <b>Properties:</b>
 * <ul>
 * <li><tt>trec.indexer.class</tt> - name of the class to use as the indexer. This only applies to the Index method.</li>
 * <li><tt>trec.collection.class</tt> - name of the class to use as the Collection.</li>
 * </ul>
 * @author Gianni Amati, Vassilis Plachouras, Ben He, Craig Macdonald
 */
public class Indexing {
    /** The logger used. */
    private static final Logger logger = Logger.getLogger(Indexing.class);

    /** Package prefix of indexer class names from before the rename to <tt>org.terrier</tt>. */
    private static final String LEGACY_PACKAGE_PREFIX = "uk.ac.gla.terrier";
    /** Package prefix that replaces {@link #LEGACY_PACKAGE_PREFIX}. */
    private static final String CURRENT_PACKAGE_PREFIX = "org.terrier";

    /** The collection to index. Remains null until {@link #setTRECIndexing} loads it successfully. */
    Collection collectionTREC;

    /** Path of the index, as given to {@link #setTRECIndexing}. */
    String path;
    /** Prefix of the index files, as given to {@link #setTRECIndexing}. */
    String prefix;

    /** The indexer object. Remains null until {@link #setTRECIndexing} instantiates it successfully. */
    Indexer indexer;

    /** Index path taken from <tt>MyApplicationSetup.TERRIER_INDEX_PATH</tt> at construction time. */
    private final String terrierIndexPath;
    /** Index prefix taken from <tt>MyApplicationSetup.TERRIER_INDEX_PREFIX</tt> at construction time. */
    private final String terrierIndexPrefix;

    /**
     * Initialises the data structures to use for indexing: loads the
     * collection named by <tt>trec.collection.class</tt> and instantiates
     * the indexer named by <tt>trec.indexer.class</tt> through its
     * <tt>(String, String)</tt> constructor.
     * <p>
     * Failures are logged, not thrown. After a failure the corresponding
     * field ({@link #collectionTREC} or {@link #indexer}) is null, and the
     * indexing methods of this class refuse to run.
     * @param _path Absolute path to where the index should be created
     * @param _prefix Prefix of the index files, usually "data"
     */
    public void setTRECIndexing(String _path, String _prefix) {
        path = _path;
        prefix = _prefix;

        //load the appropriate collection
        final String collectionName = MyApplicationSetup.getProperty("trec.collection.class", "TRECCollection");
        collectionTREC = CollectionFactory.loadCollection(collectionName);
        if (collectionTREC == null) {
            // Deliberately no early return: createInvertedFile() only needs the indexer.
            logger.fatal("Collection class named " + collectionName + " not found, aborting");
        }

        //load the appropriate indexer
        String indexerName = MyApplicationSetup.getProperty("trec.indexer.class",
                MyApplicationSetup.BLOCK_INDEXING ? "BlockIndexer" : "BasicIndexer");
        if (indexerName.indexOf('.') == -1) {
            // An unqualified name is resolved inside the default indexing package.
            indexerName = CURRENT_PACKAGE_PREFIX + ".indexing." + indexerName;
        } else if (indexerName.startsWith(LEGACY_PACKAGE_PREFIX)) {
            // Rewrite only the leading package prefix, literally. The previous replaceAll()
            // treated the dots as regex wildcards and rewrote every match in the name.
            indexerName = CURRENT_PACKAGE_PREFIX + indexerName.substring(LEGACY_PACKAGE_PREFIX.length());
        }

        // Drop any indexer from an earlier call so a failure below cannot leave
        // a stale indexer that is bound to a different path/prefix.
        indexer = null;
        try {
            indexer = (Indexer) Class.forName(indexerName).getConstructor(String.class, String.class)
                    .newInstance(path, prefix);
        } catch (ClassNotFoundException e) {
            logger.fatal("Indexer class named " + indexerName + " not found", e);
        } catch (InstantiationException ie) {
            logger.fatal("Error while instantiating Indexer class named " + indexerName + " : " + ie.getCause(),
                    ie);
        } catch (Exception e) {
            logger.fatal("Indexer class named " + indexerName + " problem", e);
        }
    }

    /**
     * A default constructor that records the index path and prefix
     * configured in <tt>MyApplicationSetup</tt>. It does not load the
     * collection or the indexer; call {@link #setTRECIndexing} for that.
     */
    public Indexing() {
        terrierIndexPath = MyApplicationSetup.TERRIER_INDEX_PATH;
        terrierIndexPrefix = MyApplicationSetup.TERRIER_INDEX_PREFIX;
    }

    /**
     * Calls the method index(Collection[]) of the
     * class Indexer in order to build the data
     * structures for the loaded collection.
     * <p>
     * Does nothing (apart from logging) if an index already exists at
     * the configured path and prefix, or if the indexer or the
     * collection has not been initialised. The collection is closed
     * afterwards, even when indexing throws.
     */
    public void index() {
        if (Index.existsIndex(path, prefix)) {
            logger.fatal("Cannot index while an index exists at " + path + "," + prefix);
            return;
        }
        if (indexer == null || collectionTREC == null) {
            logger.fatal("Cannot index: the indexer or the collection is not initialised");
            return;
        }
        try {
            indexer.index(new Collection[] { collectionTREC });
        } finally {
            closeCollection();
        }
    }

    /**
     * Building the inverted file.
     * <p>
     * Requires an index at the configured path and prefix that has a
     * "direct" structure and no "inverted" structure yet. If the index
     * exists but cannot be loaded, the structure checks are skipped with
     * a warning and the build is still attempted.
     */
    public void createInvertedFile() {
        if (!Index.existsIndex(path, prefix)) {
            logger.fatal("Cannot create an inverted structure without an index at " + path + "," + prefix);
            return;
        }
        // Open the index that was just tested. The no-argument createIndex() used
        // before takes no path/prefix, so it was not tied to this location.
        final Index existing = Index.createIndex(path, prefix);
        try {
            if (existing == null) {
                logger.warn("Index at " + path + "," + prefix
                        + " could not be loaded; skipping structure checks");
            } else if (existing.hasIndexStructure("inverted")) {
                logger.fatal(
                        "Cannot create an inverted structure while an index with a inverted structure exists at "
                                + path + "," + prefix);
                return;
            } else if (!existing.hasIndexStructure("direct")) {
                logger.fatal("Cannot create an inverted structure without a direct structure in the index at "
                        + path + "," + prefix);
                return;
            }
        } finally {
            // The index was opened for inspection only.
            closeIndex(existing);
        }
        if (indexer == null) {
            logger.fatal("Cannot create an inverted structure: the indexer is not initialised");
            return;
        }

        if (logger.isInfoEnabled()) {
            logger.info("Started building the inverted index...");
        }
        final long beginTimestamp = System.currentTimeMillis();
        indexer.createInvertedIndex();
        final long endTimestamp = System.currentTimeMillis();
        if (logger.isInfoEnabled()) {
            logger.info("Finished building the inverted index...");
            final double seconds = (endTimestamp - beginTimestamp) / 1000.0d;
            logger.info("Time elapsed for inverted file: " + seconds);
        }
    }

    /**
     * Builds the direct file and lexicon by handing the loaded
     * collection to the indexer's createDirectIndex method.
     * <p>
     * Refuses to run if an index with a "direct" structure already
     * exists at the configured path and prefix, or if the indexer or
     * the collection has not been initialised. The collection is closed
     * afterwards, even when the build throws.
     */
    public void createDirectFile() {
        if (Index.existsIndex(path, prefix)) {
            // Inspect the index that was just tested (see createInvertedFile()).
            final Index existing = Index.createIndex(path, prefix);
            try {
                if (existing == null) {
                    logger.warn("Index at " + path + "," + prefix
                            + " could not be loaded; skipping structure checks");
                } else if (existing.hasIndexStructure("direct")) {
                    logger.fatal("Cannot create a direct structure while an index with a direct structure exists at "
                            + path + "," + prefix);
                    return;
                }
            } finally {
                closeIndex(existing);
            }
        }
        if (indexer == null || collectionTREC == null) {
            logger.fatal("Cannot create a direct structure: the indexer or the collection is not initialised");
            return;
        }

        final long startTime = System.currentTimeMillis();
        try {
            indexer.createDirectIndex(new Collection[] { collectionTREC });
            final long endTime = System.currentTimeMillis();
            if (logger.isInfoEnabled()) {
                logger.info("Direct index built in " + ((endTime - startTime) / 1000.0D) + " seconds.");
            }
        } finally {
            closeCollection();
        }
    }

    /**
     * Builds the inverted file from scratch, single pass method.
     * <p>
     * Uses a BlockSinglePassIndexer when
     * <tt>MyApplicationSetup.BLOCK_INDEXING</tt> is set and a
     * BasicSinglePassIndexer otherwise. The index is built at the path
     * and prefix given to {@link #setTRECIndexing}, which is the same
     * location that is checked for an existing inverted structure.
     */
    public void createSinglePass() {
        if (Index.existsIndex(path, prefix)) {
            final Index existing = Index.createIndex(path, prefix);
            try {
                // createIndex may return null (the sibling methods test for it), so guard the dereference.
                if (existing == null) {
                    logger.warn("Index at " + path + "," + prefix
                            + " could not be loaded; skipping structure checks");
                } else if (existing.hasIndexStructure("inverted")) {
                    logger.fatal(
                            "Cannot create an inverted structure while an index with a inverted structure exists at "
                                    + path + "," + prefix);
                    return;
                }
            } finally {
                closeIndex(existing);
            }
        }
        if (collectionTREC == null) {
            logger.fatal("Cannot create an inverted structure: the collection is not initialised");
            return;
        }

        System.err.println("Starting building the inverted file "
                + (MyApplicationSetup.BLOCK_INDEXING ? "(with blocks)" : "") + "...");
        final long beginTimestamp = System.currentTimeMillis();
        // Build where the existence check looked. Previously the check used path/prefix
        // while the indexer was always created at the MyApplicationSetup defaults.
        final BasicSinglePassIndexer singlePassIndexer = MyApplicationSetup.BLOCK_INDEXING
                ? new BlockSinglePassIndexer(path, prefix)
                : new BasicSinglePassIndexer(path, prefix);
        try {
            singlePassIndexer.index(new Collection[] { collectionTREC });
            final long endTimestamp = System.currentTimeMillis();
            System.err.println("Finished building the inverted index...");
            final double seconds = (endTimestamp - beginTimestamp) / 1000.0d;
            System.err.println("Time elapsed for inverted file: " + seconds);
        } finally {
            closeCollection();
        }
    }

    /** Closes the collection if one is loaded; a failure to close is logged, not propagated. */
    private void closeCollection() {
        if (collectionTREC == null) {
            return;
        }
        try {
            collectionTREC.close();
        } catch (Exception e) {
            logger.warn("problem closing collection", e);
        }
    }

    /** Closes an index that was opened for inspection only; tolerates null and logs any failure. */
    private static void closeIndex(Index index) {
        if (index == null) {
            return;
        }
        try {
            index.close();
        } catch (Exception e) {
            logger.warn("problem closing index", e);
        }
    }

    /** 
     * Used for testing purposes: empties the configured index directory
     * and then builds a fresh index of the configured collection.
     * @param args the command line arguments (unused).
     */
    public static void main(String[] args) {
        final long startTime = System.currentTimeMillis();

        final Indexing t = new Indexing();
        t.setTRECIndexing(t.terrierIndexPath, t.terrierIndexPrefix);

        logger.info("Cleaning the dirs with previous index..");
        final File indexDirectory = new File(t.terrierIndexPath);
        if (indexDirectory.isDirectory()) {
            try {
                FileUtils.cleanDirectory(indexDirectory);
                logger.info("Cleaning done");
            } catch (IOException e) {
                // Keep going: index() refuses to run if a previous index is still in place.
                logger.error("Exception : " + e.getMessage(), e);
            }
        } else {
            // cleanDirectory rejects a path that is not an existing directory; there is nothing to remove.
            logger.warn("Nothing to clean, not an existing directory: " + indexDirectory);
        }

        t.index();

        final long endTime = System.currentTimeMillis();
        if (logger.isInfoEnabled()) {
            logger.info("Elapsed time=" + ((endTime - startTime) / 1000.0D));
        }
    }

}