/*
 * This file is part of the LIRE project: http://www.semanticmetadata.net/lire
 * LIRE is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * LIRE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with LIRE; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * We kindly ask you to refer to any one of the following publications in
 * any publication mentioning or employing Lire:
 *
 * Lux Mathias, Savvas A. Chatzichristofis. Lire: Lucene Image Retrieval -
 * An Extensible Java CBIR Library. In proceedings of the 16th ACM International
 * Conference on Multimedia, pp. 1085-1088, Vancouver, Canada, 2008
 * URL: http://doi.acm.org/10.1145/1459359.1459577
 *
 * Lux Mathias. Content Based Image Retrieval with LIRE. In proceedings of the
 * 19th ACM International Conference on Multimedia, pp. 735-738, Scottsdale,
 * Arizona, USA, 2011
 * URL: http://dl.acm.org/citation.cfm?id=2072432
 *
 * Mathias Lux, Oge Marques. Visual Information Retrieval using Java and LIRE
 * Morgan & Claypool, 2013
 * URL: http://www.morganclaypool.com/doi/abs/10.2200/S00468ED1V01Y201301ICR025
 *
 * Copyright statement:
 * ====================
 * (c) 2002-2013 by Mathias Lux (mathias@juggle.at)
 * http://www.semanticmetadata.net/lire, http://www.lire-project.net
 *
 * Updated: 27.06.14 13:06
 */
package net.semanticmetadata.lire.imageanalysis.bovw;

import net.semanticmetadata.lire.DocumentBuilder;
import net.semanticmetadata.lire.imageanalysis.Histogram;
import net.semanticmetadata.lire.imageanalysis.LireFeature;
import net.semanticmetadata.lire.utils.LuceneUtils;
import net.semanticmetadata.lire.utils.MetricsUtils;
import net.semanticmetadata.lire.utils.SerializationUtils;
import org.apache.commons.math3.ml.clustering.CentroidCluster;
import org.apache.commons.math3.ml.clustering.DoublePoint;
import org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.util.Bits;

import javax.swing.*;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
 * General class for creating bag-of-visual-words vocabularies based on k-means clustering.
 * Works with local features such as SIFT, SURF and MSER. In this variant the vocabulary
 * (codebook) is read from a pre-computed file instead of being clustered at indexing time.
 * A usage sketch is given at the end of this file.
 * Date: 24.09.2008
 * Time: 09:38:53
 *
 * @author Mathias Lux, mathias@juggle.at
 */
public abstract class LocalFeatureHistogramBuilderFromCodeBook {
    IndexReader reader;
    // number of documents used to build the vocabulary / clusters.
    private int numDocsForVocabulary = 100;
    private int numClusters = 512;
    private List<double[]> clusters = null;
    DecimalFormat df = (DecimalFormat) NumberFormat.getNumberInstance();
    private ProgressMonitor pm = null;

    protected String localFeatureFieldName = DocumentBuilder.FIELD_NAME_SURF;
    protected String visualWordsFieldName = DocumentBuilder.FIELD_NAME_SURF_VISUAL_WORDS;
    protected String localFeatureHistFieldName = DocumentBuilder.FIELD_NAME_SURF_LOCAL_FEATURE_HISTOGRAM;
    protected String clusterFile = "./clusters.dat";
    public static boolean DELETE_LOCAL_FEATURES = true;

    public LocalFeatureHistogramBuilderFromCodeBook(IndexReader reader) {
        this.reader = reader;
    }

    /**
     * Creates a new instance of the LocalFeatureHistogramBuilderFromCodeBook using the given reader. The
     * numDocsForVocabulary indicates how many documents of the index are used to build the vocabulary (clusters).
     *
     * @param reader               the reader used to open the Lucene index
     * @param numDocsForVocabulary the number of documents used for building the vocabulary (clusters)
     */
    public LocalFeatureHistogramBuilderFromCodeBook(IndexReader reader, int numDocsForVocabulary) {
        this.reader = reader;
        this.numDocsForVocabulary = numDocsForVocabulary;
    }

    /**
     * Creates a new instance of the LocalFeatureHistogramBuilderFromCodeBook using the given reader. The
     * numDocsForVocabulary indicates how many documents of the index are used to build the vocabulary (clusters).
     * The numClusters gives the number of clusters k-means should find. Note that this number should be lower
     * than the number of features, otherwise an exception will be thrown while indexing.
     *
     * @param reader               the index reader
     * @param numDocsForVocabulary the number of documents that should be sampled for building the visual vocabulary
     * @param numClusters          the size of the visual vocabulary
     */
    public LocalFeatureHistogramBuilderFromCodeBook(IndexReader reader, int numDocsForVocabulary, int numClusters) {
        this.numDocsForVocabulary = numDocsForVocabulary;
        this.numClusters = numClusters;
        this.reader = reader;
    }

    /**
     * Uses an existing index, where each and every document should have a set of local features. A number of
     * randomly selected images (numDocsForVocabulary) is normally clustered to obtain a vocabulary of visual words
     * (the cluster means). In this variant the clustering step is disabled and the vocabulary is read from a
     * pre-computed codebook file instead. For all images a histogram over the visual words is created and added
     * to the documents. Pre-existing histograms are deleted, so this method can be used for re-indexing.
     *
     * @throws java.io.IOException
     */
    public void index() throws IOException {
        df.setMaximumFractionDigits(3);
        // find the documents for building the vocabulary:
//        HashSet<Integer> docIDs = selectVocabularyDocs();
//        System.out.println("Using " + docIDs.size() + " documents to build the vocabulary.");
//        KMeansPlusPlusClusterer kpp = new KMeansPlusPlusClusterer(numClusters, 15);
//        // fill the KMeans object:
//        LinkedList<DoublePoint> features = new LinkedList<DoublePoint>();
        // Needed to check whether a document is deleted.
        Bits liveDocs = MultiFields.getLiveDocs(reader);
//        for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext(); ) {
//            int nextDoc = iterator.next();
//            if (reader.hasDeletions() && !liveDocs.get(nextDoc)) continue; // if it is deleted, just ignore it.
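            // In the disabled pipeline below, each sampled document contributes all of
            // its local features as data points, and k-means then finds numClusters
            // centroids over them (the visual words).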
//            Document d = reader.document(nextDoc);
////            features.clear();
//            IndexableField[] fields = d.getFields(localFeatureFieldName);
//            String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
//            for (int j = 0; j < fields.length; j++) {
//                LireFeature f = getFeatureInstance();
//                f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length);
//                // copy the data over to a new array ...
//                double[] feat = new double[f.getDoubleHistogram().length];
//                System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length);
//                features.add(new DoublePoint(feat));
//            }
//        }
//        if (features.size() < numClusters) {
//            // this cannot work. You need more data points than clusters.
//            throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster into " + numClusters + " clusters. Try to use fewer clusters or more images.");
//        }
//        // do the clustering:
//        System.out.println("Number of local features: " + df.format(features.size()));
//        System.out.println("Starting clustering ...");
//        List<CentroidCluster<DoublePoint>> clusterList = kpp.cluster(features);
        // TODO: Serializing clusters to a file on the disk ...
        // Instead of clustering, the codebook is read from disk:
        clusters = SerializationUtils.readCodeBook(new FileInputStream("codebookCEDD128.txt"));
        numClusters = clusters.size();
        System.out.println("Codebook read, " + clusters.size() + " clusters found.");
//        for (Iterator<CentroidCluster<DoublePoint>> iterator = clusterList.iterator(); iterator.hasNext(); ) {
//            CentroidCluster<DoublePoint> centroidCluster = iterator.next();
//            clusters.add(centroidCluster.getCenter().getPoint());
//        }
        System.out.println("Creating histograms ...");
        int[] tmpHist = new int[numClusters];
        IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);
        // careful: copy reader to RAM for faster access when reading ...
//        reader = IndexReader.open(new RAMDirectory(reader.directory()), true);
        LireFeature f = getFeatureInstance();
        for (int i = 0; i < reader.maxDoc(); i++) {
            try {
                if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
                // reset the histogram for the current document:
                for (int j = 0; j < tmpHist.length; j++) {
                    tmpHist[j] = 0;
                }
                Document d = reader.document(i);
                IndexableField[] fields = d.getFields(localFeatureFieldName);
                // remove the fields if they are already there ...
                d.removeField(visualWordsFieldName);
                d.removeField(localFeatureHistFieldName);
                // find the appropriate cluster for each feature:
                for (int j = 0; j < fields.length; j++) {
                    f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length);
                    tmpHist[clusterForFeature((Histogram) f, clusters)]++;
                }
//                System.out.println(Arrays.toString(tmpHist));
                d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(normalize(tmpHist))));
                quantize(tmpHist);
                d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
                // remove local features to save some space if requested:
                if (DELETE_LOCAL_FEATURES) {
                    d.removeFields(localFeatureFieldName);
                }
                // now write the new one. we use the identifier to update ;)
                iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER, d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        iw.commit(); // this one does the "old" commit(); it removes the deleted local features.
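        // forceMerge(1) merges the index down to a single segment, physically expunging
        // the deleted documents and the removed local feature fields; note that this can
        // be an expensive operation on large indexes.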
        iw.forceMerge(1);
        iw.close();
        System.out.println("Finished.");
    }

    /*
    public void indexMissing() throws IOException {
        // Reading clusters from disk:
        clusters = Cluster.readClusters(clusterFile);
        // create & store histograms:
        System.out.println("Creating histograms ...");
        int[] tmpHist = new int[numClusters];
        LireFeature f = getFeatureInstance();
        // Needed to check whether a document is deleted.
        Bits liveDocs = MultiFields.getLiveDocs(reader);
        // based on bug report from Einav Itamar <einavitamar@gmail.com>
        IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), false, LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
        for (int i = 0; i < reader.maxDoc(); i++) {
            if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
            for (int j = 0; j < tmpHist.length; j++) {
                tmpHist[j] = 0;
            }
            Document d = reader.document(i);
            // Only if there are no values yet:
            if (d.getValues(visualWordsFieldName) == null || d.getValues(visualWordsFieldName).length == 0) {
                IndexableField[] fields = d.getFields(localFeatureFieldName);
                // find the appropriate cluster for each feature:
                for (int j = 0; j < fields.length; j++) {
                    f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length);
                    tmpHist[clusterForFeature((Histogram) f, clusters)]++;
                }
                normalize(tmpHist);
                d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
                d.add(new StringField(localFeatureHistFieldName, SerializationUtils.arrayToString(tmpHist), Field.Store.YES));
                // now write the new one. we use the identifier to update ;)
                iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER, d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
            }
        }
        iw.commit(); // added to permanently remove the deleted docs.
        iw.forceMerge(1);
        iw.close();
        System.out.println("Finished.");
    }
    */

    /*
     * Takes one single document, creates the visual words and adds them to the document. The same document is returned.
     *
     * @param d the document to use for adding the visual words
     * @return the document with the visual words added
     * @throws java.io.IOException
     */
    /*
    public Document getVisualWords(Document d) throws IOException {
        clusters = Cluster.readClusters(clusterFile);
        int[] tmpHist = new int[clusters.size()];
        LireFeature f = getFeatureInstance();
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        // find the appropriate cluster for each feature:
        for (int j = 0; j < fields.length; j++) {
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length);
            tmpHist[clusterForFeature((Histogram) f, clusters)]++;
        }
        quantize(tmpHist);
        byte[] data = new byte[tmpHist.length];
        for (int i = 0; i < data.length; i++) {
            data[i] = (byte) tmpHist[i];
        }
        d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(tmpHist)));
        d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));
        d.removeFields(localFeatureFieldName);
        return d;
    }
    */

    /**
     * Normalizes the histogram to [0, 1] by dividing each bin by the maximum bin value
     * (assumes at least one non-zero bin).
     */
    private double[] normalize(int[] histogram) {
        double[] result = new double[histogram.length];
        double max = 0;
        for (int i = 0; i < histogram.length; i++) {
            max = Math.max(max, histogram[i]);
        }
        for (int i = 0; i < histogram.length; i++) {
            result[i] = ((double) histogram[i]) / max;
        }
        return result;
    }

    /**
     * Quantizes the histogram in place to integer values in [0, 128], relative to the maximum bin value.
     */
    private void quantize(int[] histogram) {
        double max = 0;
        for (int i = 0; i < histogram.length; i++) {
            max = Math.max(max, histogram[i]);
        }
        for (int i = 0; i < histogram.length; i++) {
            histogram[i] = (int) Math.floor((histogram[i] * 128d) / max);
        }
    }

    /**
     * Finds the appropriate cluster for a given feature by a linear search over all cluster centers (L2 distance).
     *
     * @param f           the local feature to assign
     * @param clusterList the list of cluster centers (the visual vocabulary)
     * @return the index of the nearest cluster
     */
    private int clusterForFeature(Histogram f, List<double[]> clusterList) {
        double distance = MetricsUtils.distL2(clusterList.get(0), f.getDoubleHistogram());
        double tmp;
        int result = 0;
        int i = 0;
        for (double[] c : clusterList) {
            tmp = MetricsUtils.distL2(c, f.getDoubleHistogram());
            if (tmp < distance) {
                distance = tmp;
                result = i;
            }
            i++;
        }
        return result;
    }

    /**
     * Converts a histogram to a text of visual word terms: each term ("v" + cluster index) is repeated as often
     * as its (quantized) frequency indicates, so Lucene's term frequencies reflect the histogram. For example,
     * {2, 0, 1} becomes "v0 v0 v2 ".
     */
    private String arrayToVisualWordString(int[] hist) {
        StringBuilder sb = new StringBuilder(1024);
        for (int i = 0; i < hist.length; i++) {
            int visualWordIndex = hist[i];
            for (int j = 0; j < visualWordIndex; j++) {
                sb.append('v');
                sb.append(i);
                sb.append(' ');
            }
        }
        return sb.toString();
    }

    /**
     * Selects a random subset of numDocsForVocabulary documents from the index for building the vocabulary.
     */
    private HashSet<Integer> selectVocabularyDocs() throws IOException {
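        // Sampling strategy: if the requested sample covers (almost) the whole index,
        // all documents are taken (possibly dropping a few at random); otherwise
        // documents are drawn uniformly at random without replacement.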
        // need to make sure that this is not running forever ...
        int loopCount = 0;
        float maxDocs = reader.maxDoc();
        int capacity = (int) Math.min(numDocsForVocabulary, maxDocs);
        if (capacity < 0) capacity = (int) (maxDocs / 2);
        HashSet<Integer> result = new HashSet<Integer>(capacity);
        int tmpDocNumber, tmpIndex;
        LinkedList<Integer> docCandidates = new LinkedList<Integer>();
        // three cases:
        if (numDocsForVocabulary >= maxDocs) {
            // (1) we need at least as many documents as the index holds: take all of them.
            for (int i = 0; i < maxDocs; i++) {
                result.add(i);
            }
            return result;
        } else if (numDocsForVocabulary >= maxDocs - 100) {
            // (2) we need almost all documents: take all, then remove random ones until the size fits.
            for (int i = 0; i < maxDocs; i++) {
                result.add(i);
            }
            while (result.size() > numDocsForVocabulary) {
                result.remove((int) Math.floor(Math.random() * result.size()));
            }
            return result;
        } else {
            // (3) we need a small subset: sample documents at random without replacement.
            for (int i = 0; i < maxDocs; i++) {
                docCandidates.add(i);
            }
            for (int r = 0; r < capacity; r++) {
                boolean worksFine = false;
                do {
                    tmpIndex = (int) Math.floor(Math.random() * (double) docCandidates.size());
                    tmpDocNumber = docCandidates.get(tmpIndex);
                    docCandidates.remove(tmpIndex);
                    // check if the selected doc number is valid: readable and not already chosen.
                    worksFine = (reader.document(tmpDocNumber) != null) && !result.contains(tmpDocNumber);
                } while (!worksFine);
                result.add(tmpDocNumber);
                // need to make sure that this is not running forever ...
                if (loopCount++ > capacity * 100)
                    throw new UnsupportedOperationException("Could not get the documents, maybe there are not enough documents in the index?");
            }
            return result;
        }
    }

    protected abstract LireFeature getFeatureInstance();
}
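
/*
 * Usage sketch (illustrative only, not part of the original file): a concrete subclass
 * has to supply the feature type via getFeatureInstance(). The subclass name and the
 * choice of CEDD below are assumptions, picked to match the hard-coded codebook file
 * "codebookCEDD128.txt" read in index(); any LireFeature that extends Histogram works.
 *
 * public class CEDDLocalFeatureHistogramBuilder extends LocalFeatureHistogramBuilderFromCodeBook {
 *     public CEDDLocalFeatureHistogramBuilder(IndexReader reader) {
 *         super(reader);
 *     }
 *
 *     @Override
 *     protected LireFeature getFeatureInstance() {
 *         return new net.semanticmetadata.lire.imageanalysis.CEDD();
 *     }
 * }
 *
 * // ... and then, assuming an existing index with local features at "./index":
 * IndexReader reader = DirectoryReader.open(FSDirectory.open(new java.io.File("./index")));
 * new CEDDLocalFeatureHistogramBuilder(reader).index();
 * reader.close();
 */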