Java tutorial
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package cz.muni.fi.xfabian7.bp.mgrid.dindex;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.logging.Level;
import java.util.logging.Logger;

import messif.algorithms.Algorithm;
import messif.buckets.BucketErrorCode;
import messif.buckets.BucketStorageException;
import messif.buckets.LocalBucket;
import messif.objects.LocalAbstractObject;
import messif.objects.util.AbstractObjectIterator;
import messif.objects.util.AbstractObjectList;
import messif.operations.data.BulkInsertOperation;
import messif.operations.data.InsertOperation;
import messif.operations.query.ApproxRangeQueryOperation;
import messif.operations.query.RangeQueryOperation;
import messif.pivotselection.AbstractPivotChooser;
import messif.pivotselection.IncrementalPivotChooser;
import messif.statistics.OperationStatistics;
import messif.statistics.StatisticCounter;
import messif.statistics.StatisticRefCounter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import cz.muni.fi.xfabian7.bp.mgrid.HdfsStorageBucket;
import cz.muni.fi.xfabian7.bp.mgrid.HdfsStorageBucketDispatcher;
import cz.muni.fi.xfabian7.bp.mgrid.MGrid;
import static cz.muni.fi.xfabian7.bp.mgrid.MGrid.HADOOP_CORESITE_FILE_PATH;

/**
 * @author Vlastislav Dohnal, dohnal@fi.muni.cz, Faculty of Informatics, Masaryk University, Brno, Czech Republic
 * @author Norbert Fabian, 396035@mail.muni.cz, Faculty of Informatics, Masaryk University, Brno, Czech Republic
 * @since 2008-05-21
 */
public class DIndex extends Algorithm implements Serializable {

    /**
     * Class serial id for serialization
     */
    private static final long serialVersionUID = 2L;

    /**
     * An instance of BucketDispatcher shared within this instance of aD-index
     */
    final HdfsStorageBucketDispatcher disp;

    /**
     * List of aD-index levels (levels of hashing)
     */
    List<Hashing> levels;

    /*
     * Bucket path counter
     */
    static int pathCounter;

    /**
     * Separation value rho
     */
    final float rho;

    /**
     * A list of fixed pivots. All objects have distances to these pivots and
     * these pivots are used on the first level.
     */
    List<LocalAbstractObject> pivotsFixed;

    private static final IncrementalPivotChooser incrementalPivotChooser = new IncrementalPivotChooser();

    /**
     * Global counter for distance computations made during splits
     */
    protected static final StatisticCounter counterSplitDistanceComputations = StatisticCounter
            .getStatistics("DistanceComputations.Split");

    /**
     * Global counter for distance computations
     */
    protected static final StatisticCounter counterDistanceComputations = StatisticCounter
            .getStatistics("DistanceComputations");

    /**
     * Global counter for the number of distance computations per bucket
     */
    protected static final StatisticRefCounter counterBucketDistanceComputations = StatisticRefCounter
            .getStatistics("BucketDistanceComputations");

    /**
     * Creates a new aD-index instance.
     *
     * @param rho separation value rho
     * @param bucketCapacity capacity of a single bucket
     * @param pivots stream with the fixed pivots (may be <code>null</code>)
     */
    @Algorithm.AlgorithmConstructor(description = "aD-index with rho, bucket capacity and pivots stream", arguments = {
            "rho", "bucket capacity in bytes", "stream with pivots" })
    public DIndex(float rho, long bucketCapacity, AbstractObjectIterator<LocalAbstractObject> pivots) {
        super("D-index");
        disp = new HdfsStorageBucketDispatcher(Integer.MAX_VALUE, Integer.MAX_VALUE /* bucketCapacity*2 */,
                bucketCapacity, 0, false, HdfsStorageBucket.class);
        this.rho = rho;

        if (pivots != null) {
            pivotsFixed = new ArrayList<>();
            while (pivots.hasNext()) {
                pivotsFixed.add(pivots.next());
            }
        } else {
            pivotsFixed = null;
        }

        FileSystem fs = getFileSystem();
        try {
            if (fs.exists(new Path(MGrid.D_INDEX_PATH))) {
                loadDIndex();
            } else {
                // Create the first level
                pathCounter = 0;
                levels = new ArrayList<>();
                try {
                    // Set global fixed pivots to the first level
                    AbstractPivotChooser chooser = getPivotChooser();
                    if (pivotsFixed != null) {
                        for (LocalAbstractObject pvt : pivotsFixed) {
                            chooser.addPivot(pvt);
                        }
                    }
                    levels.add(new HashingTree(disp, rho, chooser));
                } catch (BucketStorageException | InstantiationException ex) {
                    Logger.getLogger(DIndex.class.getName()).log(Level.SEVERE, null, ex);
                }
            }
        } catch (IOException ex) {
            Logger.getLogger(DIndex.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Returns the pivot chooser used for selecting pivots.
     *
     * @return the shared incremental pivot chooser instance
     */
    protected static AbstractPivotChooser getPivotChooser() {
        return incrementalPivotChooser;
    }

    //**************************************************************/
    //******* SEARCH OPERATION *************************************/
    //**************************************************************/

    /**
     * Choices for algorithms evaluating queries
     */
    public enum enQueryEvalAlgoType {
        /**
         * Evaluation limited to the separation value rho
         */
        EVALUATION_LIMIT_TO_RHO,
        /**
         * Complete evaluation with no limit on the query radius
         */
        EVALUATION_COMPLETE;
    }

    /**
     * Choice of the evaluation algorithm for queries
     */
    protected enQueryEvalAlgoType queryEvaluationAlgorithmType = enQueryEvalAlgoType.EVALUATION_COMPLETE;

    /**
     * Sets the type of algorithms used to evaluate queries. This changes
     * behavior from now on until this method is called with a different argument.
     *
     * @param type specific type of evaluation algorithms
     */
    public void setQueryEvaluationAlgorithmType(enQueryEvalAlgoType type) {
        queryEvaluationAlgorithmType = type;
    }

    /**
     * Performs the range search operation. The answer to the query is returned
     * in the passed operation object.
     *
     * The evaluation algorithm is chosen according to the query evaluation
     * algorithm type currently set.
     *
     * @param q specifies the range query parameters
     * @return the list of addresses of the buckets to be accessed
     */
    public List<Integer> rangeSearch(RangeQueryOperation q) {
        // Local operation statistics
        OperationStatistics opStat = OperationStatistics.getLocalThreadStatistics();
        opStat.registerBoundAllStats("DistanceComputations.*|BucketRead");

        List<Integer> addrList = new ArrayList<>();
        switch (queryEvaluationAlgorithmType) {
            case EVALUATION_COMPLETE:
                if (q.getRadius() <= rho) {
                    addrList.addAll(rangeSearchUptoRho(q));
                } else {
                    addrList.addAll(rangeSearchUnlimited(q));
                }
                break;
            case EVALUATION_LIMIT_TO_RHO:
                addrList.addAll(rangeSearchUptoRho(q));
                break;
            default:
                throw new UnsupportedOperationException("Unknown query evaluation algorithm type!");
        }

        // Local operation statistics
        opStat.unbindAllStats();
        return addrList;
    }

    /**
     * Performs the range search operation but the evaluation is limited to rho.
     * The correctness of the answer is ensured for objects closer to the query
     * object than rho.
     *
     * So this implementation accesses at most one bucket per aD-index level.
     * Both early termination (if the query region lies exclusively in a separable
     * bucket) and level skipping (if the query region lies exclusively in the
     * exclusion zone) are implemented.
     *
     * @param q specifies the range query parameters
     * @return the list of addresses of the buckets to be accessed
     */
    protected List<Integer> rangeSearchUptoRho(RangeQueryOperation q) {
        // Local operation statistics should be bound...
        float r = (q.getRadius() > rho) ? rho : q.getRadius();

        List<Integer> addrList = new ArrayList<>();
        for (int l = 0; l < levels.size(); l++) {
            Hashing h = levels.get(l);
            int addr;

            // Check exclusive containment in a separable bucket
            addr = h.computeAddress(q.getQueryObject(), rho + r, true, l);
            if (addr != ExcludedMiddlePartitioning.ZONE_EXCLUSION) {
                // Return addr
                addrList.add(addr);
                break;
            }

            // Check exclusive containment in the exclusion zone
            addr = h.computeAddress(q.getQueryObject(), rho - r, true, l);
            if (addr == ExcludedMiddlePartitioning.ZONE_EXCLUSION) {
                continue; // Check the next level directly
            }

            // Get the address of the bucket identified.
            addrList.add(addr);
        }
        return addrList;
    }

    /**
     * Performs the range search operation. The answer is correct since no
     * restriction is imposed on the query radius.
     *
     * Both early termination (if the query region lies exclusively in a separable
     * bucket) and level skipping (if the query region lies exclusively in the
     * exclusion zone) are implemented.
     *
     * @param q specifies the range query parameters
     * @return the list of addresses of the buckets to be accessed
     */
    protected List<Integer> rangeSearchUnlimited(RangeQueryOperation q) {
        List<Integer> addrList = new ArrayList<>();
        for (int l = 0; l < levels.size(); l++) {
            Hashing h = levels.get(l);

            // Check exclusive containment in a separable bucket
            {
                int addr = h.computeAddress(q.getQueryObject(), rho + q.getRadius(), true, l);
                if (addr != ExcludedMiddlePartitioning.ZONE_EXCLUSION) {
                    // Search this bucket and terminate.
                    addrList.add(addr);
                    break;
                }
            }

            // Checking exclusive containment in the exclusion zone is not
            // necessary because the radius is larger than rho.
            // (Refer to the rangeSearch() method.)

            // Access all intersected buckets
            for (int addr : h.computeAddressList(q.getQueryObject(), rho + q.getRadius(), l)) {
                // Evaluate the query on the bucket identified.
                addrList.add(addr);
            }
        }
        return addrList;
    }

    /**
     * Performs the approximate range search operation. The answer to the query
     * is returned in the passed operation object.
     *
     * This algorithm is complete and correct for any type of range query, i.e.
     * no limits are applied on the query radius.
     *
     * @param q specifies the range query parameters
     * @return the list of addresses of the buckets to be accessed
     */
    public List<Integer> rangeSearch(ApproxRangeQueryOperation q) {
        // Local operation statistics
        OperationStatistics opStat = OperationStatistics.getLocalThreadStatistics();
        opStat.registerBoundAllStats("DistanceComputations.*|BucketRead");

        List<Integer> addrList = new ArrayList<>();
        // Search the levels one by one, starting from the first one.
        for (int l = 0; l < levels.size(); l++) {
            Hashing h = levels.get(l);
            int addr = h.computeAddress(q.getQueryObject(), false, l);

            // Exclusion zone has been hit, so search in the next level
            if (addr == ExcludedMiddlePartitioning.ZONE_EXCLUSION) {
                continue;
            }

            // Add the address to the list
            addrList.add(addr);
        }

        // Set the guaranteed radius of the approximation
        q.setRadiusGuaranteed(rho);

        // Local operation statistics
        opStat.unbindAllStats();
        return addrList;
    }

    //**************************************************************/
    //******* INSERT OPERATION *************************************/
    //**************************************************************/

    /**
     * Increments the bucket path counter and returns its new value.
     *
     * @return the incremented bucket path counter
     */
    public int getPathCounter() {
        pathCounter++;
        return pathCounter;
    }

    /**
     * Insert operation of a local object. Inserts a new object into the aD-Index
     * structure.
     *
     * @param oper insert operation which carries the object to be inserted
     *
     * @return Returns true on success, otherwise false (when the hard capacity
     * of the bucket is reached).
     */
    public boolean insert(InsertOperation oper) {
        // Local operation statistics
        OperationStatistics opStat = OperationStatistics.getLocalThreadStatistics();
        opStat.registerBoundAllStats("DistanceComputations.*|BucketRead");

        boolean ret = insertFromLevel(0, oper.getInsertedObject(), false);

        // Local operation statistics
        opStat.unbindAllStats();
        return ret;
    }

    /**
     * Populates an existing instance (possibly empty) with the passed list of
     * objects. All split operations are done after all objects are accommodated
     * in a bucket.
     *
     * @param ins list of objects to insert
     * @return <code>true</code>
     */
    public boolean bulkInsert(BulkInsertOperation ins) {
        /*
         * IDEA: insert all objects into the current structure without any
         * modification to it. If any bucket overflows, a new bucket with
         * updated capacity is created and the content copied.
         *
         * After the insertion, walk through all overflowing buckets (using the
         * original capacity) and split them.
         *
         * OPTIMIZATION: the passed number of objects can be used to create
         * buckets having the corresponding capacity (even disk buckets would be
         * ok).
         */

        // Local operation statistics
        OperationStatistics opStat = OperationStatistics.getLocalThreadStatistics();
        opStat.registerBoundAllStats("DistanceComputations.*|BucketRead");

        // Insert all passed objects without splitting
        insertFromLevel(0, ins.getInsertedObjects().iterator(), true);

        // Fix overflowing buckets
        for (int l = 0; l < levels.size(); l++) {
            Hashing h = levels.get(l);

            // Split buckets at this level until none is overflowing
            boolean bucketSplit = true;
            while (bucketSplit) {
                bucketSplit = false;

                Iterator<Integer> addr = h.getAllAddresses();
                while (addr.hasNext()) {
                    int ad = addr.next();
                    // Check the bucket's occupation
                    if (!h.getBucket(ad).isSoftCapacityExceeded()) {
                        continue;
                    }
                    // Split the overflowing bucket
                    System.out.println("DIndex, SplitBucket :" + ad + h.getBucket(ad).toString());
                    splitBucket(l, ad, true);
                    bucketSplit = true;
                }
            }
        }

        // Local operation statistics
        opStat.unbindAllStats();
        return true;
    }

    //**************************************************************/
    //******* INTERNAL DATA MANAGEMENT *****************************/
    //**************************************************************/

    /**
     * Inserts the objects provided by the iterator to any level starting from
     * the given one.
     *
     * @param level index of the level to start inserting the objects to
     * @param objects iterator over the objects to insert
     * @param bulkLoading flag specifying whether bulk-loading takes place
     * (bucket splits are forbidden) or not
     * @return <code>true</code> upon success, otherwise <code>false</code>.
     */
    protected boolean insertFromLevel(int level, Iterator<? extends LocalAbstractObject> objects, boolean bulkLoading) {
        if (objects == null) {
            return false;
        }
        while (objects.hasNext()) {
            insertFromLevel(level, objects.next(), bulkLoading);
        }
        return true;
    }

    /**
     * Inserts an object to any level starting from the given one.
     *
     * @param level index of the level to start inserting the object to
     * @param obj the object to insert
     * @param bulkLoading flag specifying whether bulk-loading takes place
     * (bucket splits are forbidden) or not
     * @return <code>true</code> upon success, otherwise <code>false</code>.
     */
    protected boolean insertFromLevel(int level, LocalAbstractObject obj, boolean bulkLoading) {
        // Test the number of levels
        if (level >= levels.size()) {
            // Allocate a new level
            try {
                levels.add(new HashingTree(disp, rho, getPivotChooser()));
            } catch (BucketStorageException | InstantiationException ex) {
                Logger.getLogger(DIndex.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

        // If inserting from the first level, compute and store the distances to the fixed pivots
        if (level == 0 && pivotsFixed != null) {
            // Get the precomputed distances filter
            PrecomputedDistancesByLevelFilter flt = HashingTree.getFilter(obj);
            for (int i = 0; i < pivotsFixed.size(); i++) {
                float dist = flt.getPrecompDist(level, i);
                if (dist == LocalAbstractObject.UNKNOWN_DISTANCE) {
                    dist = obj.getDistance(pivotsFixed.get(i));
                    // Store the distance among the precomputed distances
                    flt.addPrecompDist(level, dist);
                }
            }
        }

        // Try to accommodate the new object in a level, starting from the given one.
        for (int l = level; l < levels.size(); l++) {
            Hashing h = levels.get(l);
            int addr = h.computeAddress(obj, false, l);

            // Exclusion zone has been hit, so insert the object in the next level
            if (addr == ExcludedMiddlePartitioning.ZONE_EXCLUSION) {
                continue;
            }

            // Insert the new object
            BucketErrorCode err = h.getBucket(addr).addObjectErrCode(obj);
            if (err == BucketErrorCode.SOFTCAPACITY_EXCEEDED && !bulkLoading) {
                splitBucket(l, addr, bulkLoading);
            } else {
                // Update min/max radii
            }

            // The object has been inserted successfully
            break;
        }

        try {
            // Save changes
            saveDIndex();
        } catch (IOException ex) {
            Logger.getLogger(DIndex.class.getName()).log(Level.SEVERE, null, ex);
        }
        return true;
    }

    /**
     * Splits a bucket of the given <code>level</code>. The bucket is
     * specified by the passed <code>addr</code>.
     *
     * @param level identifies the hashing level
     * @param addr address of the bucket to split
     * @param bulkLoading flag specifying whether bulk-loading takes place
     * (bucket splits are forbidden) or not
     */
    protected void splitBucket(int level, int addr, boolean bulkLoading) {
        Hashing h = levels.get(level);
        LocalBucket b = h.getBucket(addr);

        counterSplitDistanceComputations.bindTo(counterDistanceComputations);
        float allObjs = b.getObjectCount() / 100.0f;

        // Get the pivot to split the bucket. Select a new one if not available.
        LocalAbstractObject p = h.getSplitPivot(addr);
        if (p == null) {
            p = h.selectNewPivot();
        }

        // Create an instance of partitioning
        ExcludedMiddlePartitioning partitioning = new ExcludedMiddlePartitioning(b, p, rho);

        // Compute the value of dm
        float dm = partitioning.computeDm();

        // Split the bucket (a new empty bucket is created)
        int newAddr = -1;
        try {
            newAddr = h.split(addr, p, dm);
        } catch (NoSuchElementException | BucketStorageException | InstantiationException ex) {
            Logger.getLogger(DIndex.class.getName()).log(Level.SEVERE, null, ex);
        }
        LocalBucket newB = h.getBucket(newAddr);

        // Redistribute the data in the bucket that has been split.
        AbstractObjectList<LocalAbstractObject> exclusion = partitioning.partitionObjects(b, newB, level);

        // The objects within the exclusion zone are stored in the next level.
        if (exclusion.size() > 0) {
            insertFromLevel(level + 1, exclusion.iterator(), bulkLoading);
        }

        // DEBUG info
        if (Logger.getLogger(DIndex.class.getName()).isLoggable(Level.INFO)) {
            String msg = String.format("Split result of bucket %s: %.2f%% : %.2f%% : %.2f%%",
                    h.formatBucketAddress(addr),
                    (float) b.getObjectCount() / allObjs,
                    100.0f - (float) (b.getObjectCount() + newB.getObjectCount()) / allObjs,
                    (float) newB.getObjectCount() / allObjs);
            Logger.getLogger(DIndex.class.getName()).log(Level.INFO, msg);
        }

        counterSplitDistanceComputations.unbind();
    }

    /**
     * Returns the current value of the bucket path counter without incrementing it.
     *
     * @return the current bucket path counter
     */
    public int getIncrementedPathCounter() {
        return pathCounter;
    }

    @Override
    public String toString() {
        StringBuilder rtv = new StringBuilder();
        int objs = 0, bkts = 0;

        rtv.append("D-index structure:\n");
        rtv.append("rho: ").append(rho).append("\n");
        rtv.append("fixed pivots: ").append((pivotsFixed != null) ?
                pivotsFixed.size() : 0).append("\n");
        rtv.append("hash levels: ").append(levels.size()).append("\n");
        rtv.append("bucket capacity: ").append(disp.getBucketSoftCapacity()).append(" objs\n");

        for (Hashing h : levels) {
            objs += h.getObjectCount();
            bkts += h.getBucketCount();
        }
        rtv.append("buckets: ").append(bkts).append("\n");
        rtv.append("objects: ").append(objs).append("\n");
        rtv.append("\n");

        for (int l = 0; l < levels.size(); l++) {
            rtv.append("level info: ").append(l).append("\n");
            rtv.append(levels.get(l)).append("\n");
        }
        return rtv.toString();
    }

    //***********************************************************
    //************** SAVE AND LOAD DINDEX ***********************
    //***********************************************************

    /**
     * Returns an HDFS file system configured from the Hadoop core-site file.
     *
     * @return the configured file system, or <code>null</code> if it cannot be obtained
     */
    public FileSystem getFileSystem() {
        FileSystem fs = null;
        try {
            final Configuration confFileSystem = new Configuration();
            confFileSystem.addResource(new Path(HADOOP_CORESITE_FILE_PATH));
            fs = FileSystem.get(confFileSystem);
        } catch (IOException ex) {
            Logger.getLogger(DIndex.class.getName()).log(Level.SEVERE, null, ex);
        }
        return fs;
    }

    /**
     * Serializes the aD-index levels and the bucket path counter to HDFS.
     */
    public void saveDIndex() throws IOException {
        FileSystem fs = getFileSystem();
        try (ObjectOutputStream oos = new ObjectOutputStream(fs.create(new Path(MGrid.D_INDEX_PATH)))) {
            oos.writeObject(levels);
            oos.writeObject(pathCounter);
        } catch (IOException ex) {
            Logger.getLogger(DIndex.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Loads the aD-index levels and the bucket path counter from HDFS.
     */
    @SuppressWarnings("unchecked")
    public void loadDIndex() {
        FileSystem fs = getFileSystem();
        // Try-with-resources closes the stream even when reading fails,
        // avoiding the null-check and double-close problems of a manual finally block.
        try (ObjectInputStream ois = new ObjectInputStream(fs.open(new Path(MGrid.D_INDEX_PATH)))) {
            levels = (List<Hashing>) ois.readObject();
            pathCounter = (Integer) ois.readObject();
        } catch (IOException | ClassNotFoundException ex) {
            Logger.getLogger(DIndex.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
}
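
A minimal usage sketch follows, only to illustrate how this class might be driven. It is not part of the listing above: the pivot file name, the vector dimensionality, the rho and capacity values, and the use of MESSIF's ObjectFloatVectorL2 and StreamGenericAbstractObjectIterator classes are all assumptions, and a reachable Hadoop configuration at MGrid.HADOOP_CORESITE_FILE_PATH is required because the DIndex constructor touches HDFS.

// Usage sketch only; class names, file names, and parameter values below are
// illustrative assumptions, not part of the DIndex listing above.
package cz.muni.fi.xfabian7.bp.mgrid.dindex;

import messif.objects.LocalAbstractObject;
import messif.objects.impl.ObjectFloatVectorL2;
import messif.objects.util.StreamGenericAbstractObjectIterator;
import messif.operations.data.InsertOperation;
import messif.operations.query.RangeQueryOperation;

public class DIndexExample {

    public static void main(String[] args) throws Exception {
        // Pivots are read from a text stream of MESSIF objects
        // ("pivots.data" is a placeholder path).
        StreamGenericAbstractObjectIterator<LocalAbstractObject> pivots =
                new StreamGenericAbstractObjectIterator<>(ObjectFloatVectorL2.class, "pivots.data");

        // rho = 0.1, bucket capacity = 1000 (illustrative values).
        DIndex index = new DIndex(0.1f, 1000, pivots);

        // Insert one object.
        LocalAbstractObject obj = new ObjectFloatVectorL2(new float[] { 0.2f, 0.7f, 0.1f });
        index.insert(new InsertOperation(obj));

        // Range query: the returned list contains the addresses of the buckets
        // that have to be searched for the answer.
        RangeQueryOperation query = new RangeQueryOperation(obj, 0.05f);
        System.out.println("Buckets to access: " + index.rangeSearch(query));
    }
}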