// Java tutorial
package it.unimi.di.big.mg4j.index.cluster; /* * MG4J: Managing Gigabytes for Java (big) * * Copyright (C) 2006-2015 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 3 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ import it.unimi.di.big.mg4j.index.DiskBasedIndex; import it.unimi.di.big.mg4j.index.Index; import it.unimi.di.big.mg4j.index.TermProcessor; import it.unimi.di.big.mg4j.index.payload.Payload; import it.unimi.di.big.mg4j.search.score.BM25Scorer; import it.unimi.di.big.mg4j.util.MG4JClassParser; import it.unimi.dsi.fastutil.ints.IntBigList; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.util.BloomFilter; import it.unimi.dsi.util.Properties; import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.net.URISyntaxException; import java.util.EnumMap; import org.apache.commons.configuration.ConfigurationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.martiansoftware.jsap.ParseException; /** An abstract index cluster. An index cluster is an index * exposing transparently a list of <em>local indices</em> as a single * <em>global index</em>. A cluster usually is generated by * partitioning an index {@linkplain it.unimi.di.big.mg4j.tool.PartitionLexically lexically} * or {@linkplain it.unimi.di.big.mg4j.tool.PartitionDocumentally documentally}, but nothing * prevents the creation of hand-made clusters. 
* * <p>Note that, upon creation of an instance, the main index key * of all {@linkplain #localIndex local indices} is * {@linkplain it.unimi.di.big.mg4j.index.Index#keyIndex(Index) set} to that instance. * * <p>An index cluster is defined by a property file. The only properties common * to all index clusters are <samp>localindex</samp>, which can be specified multiple * times (order is relevant) and contains the URIs of the local indices of the cluster, * and <samp>strategy</samp>, which contains the filename of a serialised {@link it.unimi.di.big.mg4j.index.cluster.ClusteringStrategy}. * The indices will be loaded using {@link it.unimi.di.big.mg4j.index.Index#getInstance(CharSequence,boolean,boolean)}, * so there is no restriction on the URIs that can be used (e.g., you can cluster * a set of remote indices). * * <p>Alternatively, the property <samp>strategyclass</samp> can be used to specify a class name (the class will * be loaded using {@link MG4JClassParser}, so you can omit the package if the class is in MG4J). The class * must provide a constructor with a signature like that of * {@link ChainedLexicalClusteringStrategy#ChainedLexicalClusteringStrategy(Index[], BloomFilter[])}). * * <p>If you plan to use global document sizes (e.g., for {@linkplain BM25Scorer BM25 scoring}) you will need * to load them explicitly using the property {@link it.unimi.di.big.mg4j.index.Index.UriKeys#SIZES}, which must specify * a size file for the <em>whole collection</em>. If you are clustering a partitioned index, * this is usually the original size file. * * <p>Optionally, an index cluster may provide {@linkplain BloomFilter Bloom filters} * to reduce useless access to local indices that do not contain a term. The filters * have the standard extension {@link #BLOOM_EXTENSION}. 
* * <p>This class exposes a {@linkplain #getInstance(CharSequence, boolean, boolean, EnumMap) static factory method} * that uses the <samp>indexclass</samp> property to load the appropriate implementing subclass; * Bloom filters are loaded automatically. */ public abstract class IndexCluster extends Index { private static final long serialVersionUID = 1L; private static final Logger LOGGER = LoggerFactory.getLogger(IndexCluster.class); /** Symbolic names for properties of an {@link it.unimi.di.big.mg4j.index.cluster.IndexCluster}. */ public static enum PropertyKeys { /** A local index (usually used multiple times). */ LOCALINDEX, /** The clustering strategy, specified as a serialised object. */ STRATEGY, /** The clustering strategy, specified as a class name (the class must provide the standard constructors described in {@link IndexCluster}). */ STRATEGYCLASS, /** A Boolean: whether the cluster has Bloom term filters. */ BLOOM, /** A Boolean: whether the cluster is flat (i.e., it is documental and all term lists are the same). */ FLAT } /** The default extension of a strategy. */ public static final String STRATEGY_DEFAULT_EXTENSION = ".strategy"; /** The default extension for Bloom term filters. */ public static final String BLOOM_EXTENSION = ".bloom"; /** The local indices of this cluster. */ protected final Index[] localIndex; /** An array of Bloom filter to reduce index access, or <code>null</code>. 
*/ protected final BloomFilter<Void>[] termFilter; protected IndexCluster(final Index[] localIndex, final BloomFilter<Void>[] termFilter, final int numberOfDocuments, final int numberOfTerms, final long numberOfPostings, final long numberOfOccurrences, final int maxCount, final Payload payload, final boolean hasCounts, final boolean hasPositions, final TermProcessor termProcessor, final String field, final IntBigList sizes, final Properties properties) { super(numberOfDocuments, numberOfTerms, numberOfPostings, numberOfOccurrences, maxCount, payload, hasCounts, hasPositions, termProcessor, field, null, null, sizes, properties); this.localIndex = localIndex; this.termFilter = termFilter; for (int i = 0; i < localIndex.length; i++) localIndex[i].keyIndex(this); } /** Returns a new index cluster. * * <p>This method uses the <samp>LOCALINDEX</samp> property to locate the local indices, * loads them (passing on <code>randomAccess</code>) and * builds a new index cluster using the appropriate implementing subclass. * * <p>Note that <code>documentSizes</code> is just passed to the local indices. This can be useful * in {@linkplain DocumentalCluster documental clusters}, as it allows local scoring, but it is useless in * {@linkplain LexicalCluster lexical clusters}, as scoring is necessarily centralised. In the * latter case, the property {@link it.unimi.di.big.mg4j.index.Index.UriKeys#SIZES} can be used to specify a global sizes file (which * usually comes from an original global index). * * @param basename the basename. * @param randomAccess whether the index should be accessible randomly. * @param documentSizes if true, document sizes will be loaded (note that sometimes document sizes * might be loaded anyway because the compression method for positions requires it). * @param queryProperties a map containing associations between {@link it.unimi.di.big.mg4j.index.Index.UriKeys} and values, or <code>null</code>. 
*/ @SuppressWarnings("unchecked") static public Index getInstance(final CharSequence basename, final boolean randomAccess, final boolean documentSizes, final EnumMap<UriKeys, String> queryProperties) throws ConfigurationException, IOException, ClassNotFoundException, SecurityException, URISyntaxException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { final Properties properties = new Properties(basename + DiskBasedIndex.PROPERTIES_EXTENSION); ClusteringStrategy strategy = null; Class<? extends ClusteringStrategy> strategyClass = null; if (properties.containsKey(PropertyKeys.STRATEGY)) strategy = (ClusteringStrategy) BinIO.loadObject(properties.getString(PropertyKeys.STRATEGY)); else if (properties.containsKey(PropertyKeys.STRATEGYCLASS)) try { strategyClass = (Class<? extends ClusteringStrategy>) MG4JClassParser.getParser() .parse(properties.getString(PropertyKeys.STRATEGYCLASS)); } catch (ParseException e) { throw new RuntimeException(e); } else throw new IllegalArgumentException( "Cluster properties must contain either a strategy or a strategy class property"); final Class<? extends IndexCluster> indexClass = (Class<? extends IndexCluster>) Class .forName(properties.getString(Index.PropertyKeys.INDEXCLASS, "(missing index class)")); String[] localBasename = properties.getStringArray(PropertyKeys.LOCALINDEX); Index[] localIndex = new Index[localBasename.length]; for (int i = 0; i < localIndex.length; i++) localIndex[i] = Index.getInstance(localBasename[i], randomAccess, documentSizes); final int numberOfDocuments = properties.getInt(Index.PropertyKeys.DOCUMENTS); final IntBigList sizes = queryProperties != null && queryProperties.containsKey(Index.UriKeys.SIZES) ? 
DiskBasedIndex.readSizes(queryProperties.get(Index.UriKeys.SIZES), numberOfDocuments) : null; if (sizes != null && documentSizes) LOGGER.warn( "You are loading both local sizes and a global size file specified by the \"size\" properties, which is usually nonsensical"); boolean hasCounts = true; boolean hasPositions = true; Payload payload = null; for (int i = 0; i < localIndex.length; i++) { hasCounts = hasCounts && localIndex[i].hasCounts; hasPositions = hasPositions && localIndex[i].hasPositions; if (i == 0) payload = localIndex[i].payload; if ((payload == null) != (localIndex[i].payload == null) || payload != null && !payload.compatibleWith(localIndex[i].payload)) throw new IllegalStateException("The payload specification of index " + localIndex[0] + " is not compatible with that of index " + localIndex[i]); } // We stem the names of Bloom filters from the index basename. BloomFilter<Void>[] termFilter = null; if (properties.getBoolean(DocumentalCluster.PropertyKeys.BLOOM)) { LOGGER.debug("Loading Bloom filters..."); termFilter = new BloomFilter[localIndex.length]; for (int i = 0; i < localIndex.length; i++) termFilter[i] = (BloomFilter<Void>) BinIO.loadObject(basename + "-" + i + BLOOM_EXTENSION); LOGGER.debug("Completed."); } // Let us rebuild the strategy in case it's a chained strategy if (strategyClass != null) { strategy = strategyClass.getConstructor(Index[].class, BloomFilter[].class).newInstance(localIndex, termFilter); } else { if (strategy instanceof ChainedLexicalClusteringStrategy) strategy = new ChainedLexicalClusteringStrategy(localIndex, termFilter); else if (strategy.numberOfLocalIndices() != localBasename.length) throw new IllegalArgumentException("The number of local indices of the strategy (" + localIndex.length + ") and the number of local indices specified by the property file (" + localBasename.length + ") differ"); } if (LexicalCluster.class.isAssignableFrom(indexClass)) return new LexicalCluster(localIndex, (LexicalClusteringStrategy) 
strategy, termFilter, numberOfDocuments, properties.getInt(Index.PropertyKeys.TERMS), properties.getLong(Index.PropertyKeys.POSTINGS), properties.getLong(Index.PropertyKeys.OCCURRENCES), properties.getInt(Index.PropertyKeys.MAXCOUNT), payload, hasCounts, hasPositions, Index.getTermProcessor(properties), properties.getString(Index.PropertyKeys.FIELD), sizes, properties); else if (DocumentalCluster.class.isAssignableFrom(indexClass)) { if (DocumentalConcatenatedCluster.class.isAssignableFrom(indexClass)) return new DocumentalConcatenatedCluster(localIndex, (DocumentalClusteringStrategy) strategy, properties.getBoolean(IndexCluster.PropertyKeys.FLAT), termFilter, numberOfDocuments, properties.getInt(Index.PropertyKeys.TERMS), properties.getLong(Index.PropertyKeys.POSTINGS), properties.getLong(Index.PropertyKeys.OCCURRENCES), properties.getInt(Index.PropertyKeys.MAXCOUNT), payload, hasCounts, hasPositions, Index.getTermProcessor(properties), properties.getString(Index.PropertyKeys.FIELD), sizes, properties); return new DocumentalMergedCluster(localIndex, (DocumentalClusteringStrategy) strategy, properties.getBoolean(IndexCluster.PropertyKeys.FLAT), termFilter, numberOfDocuments, properties.getInt(Index.PropertyKeys.TERMS), properties.getLong(Index.PropertyKeys.POSTINGS), properties.getLong(Index.PropertyKeys.OCCURRENCES), properties.getInt(Index.PropertyKeys.MAXCOUNT), payload, hasCounts, hasPositions, Index.getTermProcessor(properties), properties.getString(Index.PropertyKeys.FIELD), sizes, properties); } else throw new IllegalArgumentException("Unknown IndexCluster implementation: " + indexClass.getName()); } @Override public void keyIndex(final Index newKeyIndex) { super.keyIndex(newKeyIndex); for (int i = 0; i < localIndex.length; i++) localIndex[i].keyIndex(this); } }