// Java tutorial
/* * Terrier - Terabyte Retriever * Webpage: http://terrier.org * Contact: terrier{a.}dcs.gla.ac.uk * University of Glasgow - School of Computing Science * http://www.ac.gla.uk * * The contents of this file are subject to the Mozilla Public License * Version 1.1 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See * the License for the specific language governing rights and limitations * under the License. * * The Original Code is TRECIndexing.java. * * The Original Code is Copyright (C) 2004-2011 the University of Glasgow. * All Rights Reserved. * * Contributor(s): * Gianni Amati <gba{a.}fub.it> (original author) * Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk> * Ben He <ben{a.}dcs.gla.ac.uk> * Craig Macdonald <craigm{a.}dcs.gla.ac.uk> */ package edu.southampton.wais.crowd.terrier.applications; import java.io.File; import java.io.IOException; import org.apache.commons.io.FileUtils; import org.apache.log4j.Logger; import org.terrier.indexing.BasicSinglePassIndexer; import org.terrier.indexing.BlockSinglePassIndexer; import org.terrier.indexing.Collection; import org.terrier.indexing.CollectionFactory; import org.terrier.indexing.Indexer; import org.terrier.structures.Index; /** * This class creates the indices for a test collection. * <p> * <b>Properties:</b> * <ul> * <li><tt>trec.indexer.class</tt> - name of the class to use as the indexer. This only applies to the Index method.</li> * <li><tt>trec.collection.class</tt> - name of the class to use as the Collection.</li> * </ul> * @author Gianni Amati, Vassilis Plachouras, Ben He, Craig Macdonald */ public class Indexing { /** The logger used */ private static Logger logger = Logger.getLogger(Indexing.class); /** The collection to index. 
*/ Collection collectionTREC; String path; String prefix; /** The indexer object.*/ Indexer indexer; private String terrierIndexPath; private String terrierIndexPrefix; /** * A constructor that initialised the data structures * to use for indexing. * @param _path Absolute path to where the index should be created * @param _prefix Prefix of the index files, usually "data" */ public void setTRECIndexing(String _path, String _prefix) { path = _path; prefix = _prefix; //load the appropriate collection final String collectionName = MyApplicationSetup.getProperty("trec.collection.class", "TRECCollection"); collectionTREC = CollectionFactory.loadCollection(collectionName); if (collectionTREC == null) { logger.fatal("Collection class named " + collectionName + " not found, aborting"); } //load the appropriate indexer String indexerName = MyApplicationSetup.getProperty("trec.indexer.class", MyApplicationSetup.BLOCK_INDEXING ? "BlockIndexer" : "BasicIndexer"); if (indexerName.indexOf('.') == -1) indexerName = "org.terrier.indexing." + indexerName; else if (indexerName.startsWith("uk.ac.gla.terrier")) indexerName = indexerName.replaceAll("uk.ac.gla.terrier", "org.terrier"); try { indexer = (Indexer) Class.forName(indexerName).getConstructor(String.class, String.class) .newInstance(path, prefix); } catch (ClassNotFoundException e) { logger.fatal("Indexer class named " + indexerName + " not found", e); } catch (InstantiationException ie) { logger.fatal("Error while instantiating Indexer class named " + indexerName + " : " + ie.getCause(), ie); } catch (Exception e) { logger.fatal("Indexer class named " + indexerName + "problem", e); } } /** * A default constructor that initialised the data structures * to use for indexing. 
*/ public Indexing() { terrierIndexPath = MyApplicationSetup.TERRIER_INDEX_PATH; terrierIndexPrefix = MyApplicationSetup.TERRIER_INDEX_PREFIX; } /** * Calls the method index(Collection[]) of the * class Indexer in order to build the data * structures for a set of collections. This * particular method of the Indexer uses a * set of builders for a subset of the collection * and builds separate data structures, which are * later merged. */ public void index() { if (Index.existsIndex(path, prefix)) { logger.fatal("Cannot index while an index exists at " + path + "," + prefix); return; } indexer.index(new Collection[] { collectionTREC }); try { collectionTREC.close(); } catch (Exception e) { logger.warn("problem closing collection", e); } } /** * Building the inverted file. */ public void createInvertedFile() { if (Index.existsIndex(path, prefix)) { Index i = Index.createIndex(); if (i == null) { } else if (i.hasIndexStructure("inverted")) { logger.fatal( "Cannot create an inverted structure while an index with a inverted structure exists at " + path + "," + prefix); return; } else if (!i.hasIndexStructure("direct")) { logger.fatal("Cannot create an inverted structure without a direct structure in the index at " + path + "," + prefix); return; } } else { logger.fatal("Cannot create an inverted structure without an index at " + path + "," + prefix); return; } if (logger.isInfoEnabled()) logger.info("Started building the inverted index..."); long beginTimestamp = System.currentTimeMillis(); indexer.createInvertedIndex(); long endTimestamp = System.currentTimeMillis(); if (logger.isInfoEnabled()) { logger.info("Finished building the inverted index..."); double seconds = (endTimestamp - beginTimestamp) / 1000.0d; logger.info("Time elapsed for inverted file: " + seconds); } } /** * Builds the direct file and lexicon. 
This method goes through the * input files specified in the <tt>collections.spec</tt> file * and processes them in groups of n documents, where n is specified * by the property <tt>bundle.size</tt>. Then, it merges the * temporary lexicon files. If it necessary, it calls for the * optimisation of the identifiers assigned to terms. */ public void createDirectFile() { if (Index.existsIndex(path, prefix)) { Index i = Index.createIndex(); if (i == null) { } else if (i.hasIndexStructure("direct")) { logger.fatal("Cannot create a direct structure while an index with a direct structure exists at " + path + "," + prefix); return; } } long startTime = System.currentTimeMillis(); indexer.createDirectIndex(new Collection[] { collectionTREC }); long endTime = System.currentTimeMillis(); if (logger.isInfoEnabled()) logger.info("Direct index built in " + ((endTime - startTime) / 1000.0D) + " seconds."); try { collectionTREC.close(); } catch (Exception e) { logger.warn("problem closing collection", e); } } /** * Builds the inverted file from scratch, single pass method */ public void createSinglePass() { if (Index.existsIndex(path, prefix)) { Index i = Index.createIndex(path, prefix); if (i.hasIndexStructure("inverted")) { logger.fatal( "Cannot create an inverted structure while an index with a inverted structure exists at " + path + "," + prefix); return; } } System.err.println("Starting building the inverted file " + (MyApplicationSetup.BLOCK_INDEXING ? 
"(with blocks)" : "") + "..."); final long beginTimestamp = System.currentTimeMillis(); BasicSinglePassIndexer _indexer; if (MyApplicationSetup.BLOCK_INDEXING) _indexer = new BlockSinglePassIndexer(MyApplicationSetup.TERRIER_INDEX_PATH, MyApplicationSetup.TERRIER_INDEX_PREFIX); else _indexer = new BasicSinglePassIndexer(MyApplicationSetup.TERRIER_INDEX_PATH, MyApplicationSetup.TERRIER_INDEX_PREFIX); _indexer.index(new Collection[] { collectionTREC }); long endTimestamp = System.currentTimeMillis(); System.err.println("Finished building the inverted index..."); double seconds = (endTimestamp - beginTimestamp) / 1000.0d; System.err.println("Time elapsed for inverted file: " + seconds); try { collectionTREC.close(); } catch (Exception e) { logger.warn("problem closing collection", e); } } /** * Used for testing purposes. * @param args the command line arguments. */ public static void main(String[] args) { long startTime = System.currentTimeMillis(); Indexing t = new Indexing(); t.setTRECIndexing(t.terrierIndexPath, t.terrierIndexPrefix); logger.info("Cleaning the dirs with previous index.."); try { FileUtils.cleanDirectory(new File(t.terrierIndexPath)); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); logger.info("Exception : " + e.getMessage()); } logger.info("Cleaning done"); t.index(); long endTime = System.currentTimeMillis(); if (logger.isInfoEnabled()) logger.info("Elapsed time=" + ((endTime - startTime) / 1000.0D)); } }