Java tutorial
package gdsc.core.clustering.optics; /*----------------------------------------------------------------------------- * GDSC ImageJ Software * * Copyright (C) 2017 Alex Herbert * Genome Damage and Stability Centre * University of Sussex, UK * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. *---------------------------------------------------------------------------*/ import java.util.Arrays; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.math3.random.RandomGenerator; import org.apache.commons.math3.random.RandomVectorGenerator; import org.apache.commons.math3.random.UnitSphereRandomVectorGenerator; import gdsc.core.ij.Utils; import gdsc.core.logging.TrackProgress; import gdsc.core.utils.NotImplementedException; import gdsc.core.utils.PseudoRandomGenerator; import gdsc.core.utils.Sort; import gdsc.core.utils.TurboList; import gdsc.core.utils.TurboRandomGenerator; import gnu.trove.set.hash.TIntHashSet; /** * Store molecules and allows generation of random projections * <p> * This class is an adaption of de.lmu.ifi.dbs.elki.index.preprocessed.fastoptics.RandomProjectedNeighborsAndDensities. * Copyright (C) 2015. * Johannes Schneider, ABB Research, Switzerland, johannes.schneider@alumni.ethz.ch. * Released under the GPL v3 licence. * <p> * Modifications have been made for multi-threading and different neighbour sampling modes. The partitioning of the sets * is * essentially unchanged. * * @author Alex Herbert */ class ProjectedMoleculeSpace extends MoleculeSpace { /** Used for access to the raw coordinates. */ protected final OPTICSManager opticsManager; /** The tracker. */ private TrackProgress tracker; /** * Default constant used to compute number of projections as well as number of * splits of point set, ie. constant *log N*d */ // constant in O(log N*d) used to compute number of projections as well as // number of splits of point set private static final int logOProjectionConst = 20; /** * Sets used for neighborhood computation should be about minSplitSize Sets are still used if they deviate by less * (1+/- sizeTolerance). */ private static final float sizeTolerance = 2f / 3; /** * Store the results of a split of the dataset. */ class Split { /** The number. */ final int number; /** The sets. */ final TurboList<int[]> sets; /** * Instantiates a new split. * * @param number * the number * @param sets * the sets */ Split(int number, TurboList<int[]> sets) { this.number = number; this.sets = sets; } /** * Instantiates a new split. * * @param number * the number * @param sets * the sets */ Split(int number, int[]... sets) { this.number = number; this.sets = new TurboList<int[]>(Arrays.asList(sets)); } } /** Sets that resulted from recursive split of entire point set. */ TurboList<Split> splitSets; /** * Random factory. */ RandomGenerator rand; /** The pseudo random. */ private TurboRandomGenerator pseudoRandom = null; /** The neighbours of each point. */ int[][] allNeighbours; /** The number of splits to compute (if below 1 it will be auto-computed using the size of the data). */ public int nSplits = 0; /** The number of projections to compute (if below 1 it will be auto-computed using the size of the data). */ public int nProjections = 0; /** * Set to true to save all sets that are approximately min split size. The default is to only save sets smaller than * min split size. */ public boolean saveApproximateSets = false; /** The sample mode. */ private SampleMode sampleMode; /** * Set to true to use random vectors for the projections. The default is to uniformly create vectors on the * semi-circle interval. */ public boolean useRandomVectors = false; /** The number of threads to use. */ public int nThreads = 1; /** The number of distance computations. */ public AtomicInteger distanceComputations = new AtomicInteger(); /** * Instantiates a new projected molecule space. * * @param opticsManager * the optics manager * @param generatingDistanceE * the generating distance E * @param rand * the rand */ ProjectedMoleculeSpace(OPTICSManager opticsManager, float generatingDistanceE, RandomGenerator rand) { super(opticsManager.getSize(), generatingDistanceE); this.opticsManager = opticsManager; this.rand = rand; } /* * (non-Javadoc) * * @see gdsc.core.clustering.optics.MoleculeSpace#toString() */ @Override public String toString() { return String.format("%s", this.getClass().getSimpleName()); } /* * (non-Javadoc) * * @see gdsc.core.clustering.optics.MoleculeSpace#generate() */ Molecule[] generate() { final float[] xcoord = opticsManager.getXData(); final float[] ycoord = opticsManager.getYData(); setOfObjects = new Molecule[xcoord.length]; for (int i = 0; i < xcoord.length; i++) { final float x = xcoord[i]; final float y = ycoord[i]; setOfObjects[i] = new Molecule(i, x, y); } return setOfObjects; } /* * (non-Javadoc) * * @see gdsc.core.clustering.optics.OPTICSManager.MoleculeSpace#findNeighbours(int, * gdsc.core.clustering.optics.OPTICSManager.Molecule, float) */ void findNeighbours(int minPts, Molecule object, float e) { // Return the neighbours found in {@link #computeAverageDistInSetAndNeighbours()}. // Assume allNeighbours has been computed. neighbours.clear(); int[] list = allNeighbours[object.id]; for (int i = list.length; i-- > 0;) neighbours.add(setOfObjects[list[i]]); } /* * (non-Javadoc) * * @see gdsc.core.clustering.optics.OPTICSManager.MoleculeSpace#findNeighboursAndDistances(int, * gdsc.core.clustering.optics.OPTICSManager.Molecule, float) */ void findNeighboursAndDistances(int minPts, Molecule object, float e) { // Return the neighbours found in {@link #computeAverageDistInSetAndNeighbours()}. // Assume allNeighbours has been computed. neighbours.clear(); int[] list = allNeighbours[object.id]; for (int i = list.length; i-- > 0;) { Molecule otherObject = setOfObjects[list[i]]; otherObject.setD(object.distance2(otherObject)); neighbours.add(otherObject); } } /** * Sets the tracker. * * @param tracker * the new tracker */ public void setTracker(TrackProgress tracker) { this.tracker = tracker; } /** The total progress. */ int progress, stepProgress, totalProgress; /** * Sets the up progress. * * @param total * the new up progress */ private void setUpProgress(int total) { totalProgress = total; stepProgress = Utils.getProgressInterval(totalProgress); progress = 0; } /** * Show progress. */ private synchronized void showProgress() { if (progress % stepProgress == 0) { if (tracker != null) tracker.progress(progress, totalProgress); } progress++; } /** * The Class Job. */ private abstract class Job { /** The index. */ final int index; /** * Instantiates a new job. * * @param index * the index */ Job(int index) { this.index = index; } } /** * The Class ProjectionJob. */ private class ProjectionJob extends Job { /** The v. */ final double[] v; /** * Instantiates a new projection job. * * @param index * the index * @param v * the v */ ProjectionJob(int index, double[] v) { super(index); this.v = v; } } /** * The Class SplitJob. */ private class SplitJob extends Job { /** The projected points. */ final float[][] projectedPoints; /** The rand. */ final TurboRandomGenerator rand; /** * Instantiates a new split job. * * @param index * the index * @param projectedPoints * the projected points * @param rand * the rand */ SplitJob(int index, float[][] projectedPoints, TurboRandomGenerator rand) { super(index); this.projectedPoints = projectedPoints; this.rand = rand; } } /** * The Class ProjectionWorker. */ private class ProjectionWorker implements Runnable { /** The finished. */ volatile boolean finished = false; /** The jobs. */ final BlockingQueue<ProjectionJob> jobs; /** The projected points. */ final float[][] projectedPoints; /** * Instantiates a new projection worker. * * @param jobs * the jobs * @param projectedPoints * the projected points */ public ProjectionWorker(BlockingQueue<ProjectionJob> jobs, float[][] projectedPoints) { this.jobs = jobs; this.projectedPoints = projectedPoints; } /* * (non-Javadoc) * * @see java.lang.Runnable#run() */ public void run() { try { while (true) { ProjectionJob job = jobs.take(); if (job.index < 0) break; if (!finished) // Only run jobs when not finished. This allows the queue to be emptied. run(job); } } catch (InterruptedException e) { System.out.println(e.toString()); throw new RuntimeException(e); } finally { finished = true; } } /** * Run. * * @param job * the job */ private void run(ProjectionJob job) { //if (Utils.isInterrupted()) //{ // finished = true; // return; //} showProgress(); final double[] v = job.v; // Project points to the vector and compute the distance along the vector from the origin float[] currPro = new float[size]; for (int it = size; it-- > 0;) { Molecule m = setOfObjects[it]; // Dot product: currPro[it] = (float) (v[0] * m.x + v[1] * m.y); } projectedPoints[job.index] = currPro; } } /** * The Class SplitWorker. */ private class SplitWorker implements Runnable { /** The finished. */ volatile boolean finished = false; /** The jobs. */ final BlockingQueue<SplitJob> jobs; /** The min split size. */ final int minSplitSize; /** The split sets. */ final TurboList<Split> splitSets = new TurboList<Split>(); /** * Instantiates a new split worker. * * @param jobs * the jobs * @param minSplitSize * the min split size */ public SplitWorker(BlockingQueue<SplitJob> jobs, int minSplitSize) { this.jobs = jobs; this.minSplitSize = minSplitSize; } /* * (non-Javadoc) * * @see java.lang.Runnable#run() */ public void run() { try { while (true) { SplitJob job = jobs.take(); if (job.index < 0) break; if (!finished) // Only run jobs when not finished. This allows the queue to be emptied. run(job); } } catch (InterruptedException e) { System.out.println(e.toString()); throw new RuntimeException(e); } finally { finished = true; } } /** * Run. * * @param job * the job */ private void run(SplitJob job) { //if (Utils.isInterrupted()) //{ // finished = true; // return; //} showProgress(); final TurboList<int[]> sets = new TurboList<int[]>(); splitupNoSort(sets, job.projectedPoints, Utils.newArray(size, 0, 1), 0, size, 0, job.rand, minSplitSize); splitSets.add(new Split(job.index, sets)); } } /** * The Class SetWorker. */ private class SetWorker implements Runnable { /** The sum distances. */ final double[] sumDistances; /** The n distances. */ final int[] nDistances; /** The neighbours. */ final TIntHashSet[] neighbours; /** The sets. */ final TurboList<int[]> sets; /** The from. */ final int from; /** The to. */ final int to; /** * Instantiates a new sets the worker. * * @param sumDistances * the sum distances * @param nDistances * the n distances * @param neighbours * the neighbours * @param sets * the sets * @param from * the from * @param to * the to */ public SetWorker(double[] sumDistances, int[] nDistances, TIntHashSet[] neighbours, TurboList<int[]> sets, int from, int to) { this.sumDistances = sumDistances; this.nDistances = nDistances; this.neighbours = neighbours; this.sets = sets; this.from = from; this.to = to; } /* * (non-Javadoc) * * @see java.lang.Runnable#run() */ public void run() { sampleNeighbours(sumDistances, nDistances, neighbours, sets, from, to); } } /** * Create random projections, project points and put points into sets of size * about minSplitSize/2. * * @param minSplitSize * minimum size for which a point set is further * partitioned (roughly corresponds to minPts in OPTICS) */ public void computeSets(int minSplitSize) { splitSets = new TurboList<Split>(); // Edge cases if (minSplitSize < 2 || size <= 1) return; if (size == 2) { // No point performing projections and splits splitSets.add(new Split(0, new int[] { 0, 1 })); return; } final int dim = 2; // FastOPTICS paper states you can use c0*log(N) sets and c1*log(N) projections. // The ELKI framework increase this for the number of dimensions. However I have stuck // with the original (as it is less so will be faster). // Note: In most computer science contexts log is in base 2. int nPointSetSplits, nProject1d; nPointSetSplits = getNumberOfSplitSets(nSplits, size); nProject1d = getNumberOfProjections(nProjections, size); // perform O(log N+log dim) splits of the entire point sets projections //nPointSetSplits = (int) (logOProjectionConst * log2(size * dim + 1)); // perform O(log N+log dim) projections of the point set onto a random line //nProject1d = (int) (logOProjectionConst * log2(size * dim + 1)); if (nPointSetSplits < 1 || nProject1d < 1) return; // Nothing to do // perform projections of points float[][] projectedPoints = new float[nProject1d][]; long time = System.currentTimeMillis(); setUpProgress(nProject1d); if (tracker != null) { tracker.log("Computing projections ..."); } // Multi-thread this for speed int nThreads = Math.min(this.nThreads, nPointSetSplits); final TurboList<Thread> threads = new TurboList<Thread>(nThreads); final BlockingQueue<ProjectionJob> projectionJobs = new ArrayBlockingQueue<ProjectionJob>(nThreads * 2); final TurboList<ProjectionWorker> projectionWorkers = new TurboList<ProjectionWorker>(nThreads); for (int i = 0; i < nThreads; i++) { final ProjectionWorker worker = new ProjectionWorker(projectionJobs, projectedPoints); final Thread t = new Thread(worker); projectionWorkers.addf(worker); threads.addf(t); t.start(); } // Create random vectors or uniform distribution RandomVectorGenerator vectorGen = (useRandomVectors) ? new UnitSphereRandomVectorGenerator(2, rand) : null; final double increment = Math.PI / nProject1d; for (int i = 0; i < nProject1d; i++) { // Create a random unit vector double[] currRp; if (useRandomVectors) { currRp = vectorGen.nextVector(); } else { // For a 2D vector we can just uniformly distribute them around a semi-circle currRp = new double[dim]; double a = i * increment; currRp[0] = Math.sin(a); currRp[1] = Math.cos(a); } put(projectionJobs, new ProjectionJob(i, currRp)); } // Finish all the worker threads by passing in a null job for (int i = 0; i < nThreads; i++) { put(projectionJobs, new ProjectionJob(-1, null)); } // Wait for all to finish for (int i = 0; i < nThreads; i++) { try { threads.get(i).join(); } catch (InterruptedException e) { e.printStackTrace(); } } threads.clear(); if (tracker != null) { tracker.progress(1); long time2 = System.currentTimeMillis(); tracker.log("Computed projections ... " + Utils.timeToString(time2 - time)); time = time2; tracker.log("Splitting data ..."); } // split entire point set, reuse projections by shuffling them int[] proind = Utils.newArray(nProject1d, 0, 1); setUpProgress(nPointSetSplits); // The splits do not have to be that random so we can use a pseudo random sequence. // The sets will be randomly sized between 1 and minSplitSize. Ensure we have enough // numbers for all the splits. double expectedSetSize = (1 + minSplitSize) * 0.5; int expectedSets = (int) Math.round(size / expectedSetSize); pseudoRandom = new TurboRandomGenerator(Math.max(200, minSplitSize + 2 * expectedSets), rand); // Multi-thread this for speed final BlockingQueue<SplitJob> splitJobs = new ArrayBlockingQueue<SplitJob>(nThreads * 2); final TurboList<SplitWorker> splitWorkers = new TurboList<SplitWorker>(nThreads); for (int i = 0; i < nThreads; i++) { final SplitWorker worker = new SplitWorker(splitJobs, minSplitSize); final Thread t = new Thread(worker); splitWorkers.addf(worker); threads.addf(t); t.start(); } for (int i = 0; i < nPointSetSplits; i++) { // shuffle projections float[][] shuffledProjectedPoints = new float[nProject1d][]; pseudoRandom.shuffle(proind); for (int j = 0; j < nProject1d; j++) { shuffledProjectedPoints[j] = projectedPoints[proind[j]]; } // New random generator TurboRandomGenerator rand = (TurboRandomGenerator) pseudoRandom.clone(); rand.setSeed(i); put(splitJobs, new SplitJob(i, shuffledProjectedPoints, rand)); } // Finish all the worker threads by passing in a null job for (int i = 0; i < nThreads; i++) { put(splitJobs, new SplitJob(-1, null, null)); } // Wait for all to finish int total = 0; for (int i = 0; i < nThreads; i++) { try { threads.get(i).join(); total += splitWorkers.get(i).splitSets.size(); } catch (InterruptedException e) { e.printStackTrace(); } } threads.clear(); // Merge the split-sets splitSets = splitWorkers.get(0).splitSets; splitSets.ensureCapacity(total); for (int i = 1; i < nThreads; i++) splitSets.addAll(splitWorkers.get(i).splitSets); if (tracker != null) { time = System.currentTimeMillis() - time; tracker.log("Split data ... " + Utils.timeToString(time)); tracker.progress(1); } } /** * Put. * * @param <T> * the generic type * @param jobs * the jobs * @param job * the job */ private <T> void put(BlockingQueue<T> jobs, T job) { try { jobs.put(job); } catch (InterruptedException e) { throw new RuntimeException("Unexpected interruption", e); } } /** * Gets the number of split sets. * * @param nSplits * The number of splits to compute (if below 1 it will be auto-computed using the size of the data) * @param size * the size * @return the number of split sets */ public static int getNumberOfSplitSets(int nSplits, int size) { if (size < 2) return 0; return (nSplits > 0) ? nSplits : (int) (logOProjectionConst * log2(size)); } /** * Gets the number of projections. * * @param nProjections * The number of projections to compute (if below 1 it will be auto-computed using the size of the data) * @param size * the size * @return the number of projections */ public static int getNumberOfProjections(int nProjections, int size) { return getNumberOfSplitSets(nProjections, size); } /** * 1. / log(2) */ public static final double ONE_BY_LOG2 = 1. / Math.log(2.); /** * Compute the base 2 logarithm. * * @param x * X * @return Logarithm base 2. */ public static double log2(double x) { return Math.log(x) * ONE_BY_LOG2; } /** * Recursively splits entire point set until the set is below a threshold. * * @param splitSets * the split sets * @param projectedPoints * the projected points * @param ind * points that are in the current set * @param begin * Interval begin in the ind array * @param end * Interval end in the ind array * @param dim * depth of projection (how many times point set has been split * already) * @param rand * Random generator * @param minSplitSize * minimum size for which a point set is further * partitioned (roughly corresponds to minPts in OPTICS) */ private void splitupNoSort(TurboList<int[]> splitSets, float[][] projectedPoints, int[] ind, int begin, int end, int dim, PseudoRandomGenerator rand, int minSplitSize) { final int nele = end - begin; if (nele < 2) { // Nothing to split. Also ensures we only add to the sets if neighbours can be sampled. return; } dim = dim % projectedPoints.length;// choose a projection of points float[] tpro = projectedPoints[dim]; if (saveApproximateSets) { // save set such that used for density or neighborhood computation // sets should be roughly minSplitSize // -=-=- // Note: This is the method used in ELKI which uses the distance to the median of the set // (thus no distances are computed that are between points very far apart, e.g. each end // of the set). if (nele > minSplitSize * (1 - sizeTolerance) && nele < minSplitSize * (1 + sizeTolerance)) { saveSet(splitSets, ind, begin, end, rand, tpro); } } // compute splitting element // do not store set or even sort set, since it is too large if (nele > minSplitSize) { // splits can be performed either by distance (between min,maxCoord) or by // picking a point randomly(picking index of point) // outcome is similar //int minInd = splitByDistance(ind, begin, end, tpro, rand); int minInd = splitRandomly(ind, begin, end, tpro, rand); // split set recursively // position used for splitting the projected points into two // sets used for recursive splitting int splitpos = minInd + 1; splitupNoSort(splitSets, projectedPoints, ind, begin, splitpos, dim + 1, rand, minSplitSize); splitupNoSort(splitSets, projectedPoints, ind, splitpos, end, dim + 1, rand, minSplitSize); } else if (!saveApproximateSets) { // It it wasn't saved as an approximate set then make sure it is saved as it is less than minSplitSize saveSet(splitSets, ind, begin, end, rand, tpro); } } private void saveSet(TurboList<int[]> splitSets, int[] ind, int begin, int end, PseudoRandomGenerator rand, float[] tpro) { int[] indices = Arrays.copyOfRange(ind, begin, end); if (sampleMode == SampleMode.RANDOM) { // Ensure the indices are random rand.shuffle(indices); } else if (sampleMode == SampleMode.MEDIAN) { // sort set, since need median element later // (when computing distance to the middle of the set) Sort.sort(indices, tpro); } splitSets.add(indices); } /** * Split the data set randomly. * * @param ind * Object index * @param begin * Interval begin * @param end * Interval end * @param tpro * Projection * @param rand * Random generator * @return Splitting point */ public static int splitRandomly(int[] ind, int begin, int end, float[] tpro, RandomGenerator rand) { final int nele = end - begin; // pick random splitting element based on position float rs = tpro[ind[begin + rand.nextInt(nele)]]; int minInd = begin, maxInd = end - 1; // permute elements such that all points smaller than the splitting // element are on the right and the others on the left in the array while (minInd < maxInd) { float currEle = tpro[ind[minInd]]; if (currEle > rs) { while (minInd < maxInd && tpro[ind[maxInd]] > rs) { maxInd--; } if (minInd == maxInd) { break; } swap(ind, minInd, maxInd); maxInd--; } minInd++; } // if all elements are the same split in the middle if (minInd == end - 1) { minInd = (begin + end) >>> 1; } return minInd; } /** * Swap. * * @param data * the data * @param i * the i * @param j * the j */ private static void swap(int[] data, int i, int j) { int tmp = data[i]; data[i] = data[j]; data[j] = tmp; } /** * Split the data set by distances. * * @param ind * Object index * @param begin * Interval begin * @param end * Interval end * @param tpro * Projection * @param rand * Random generator * @return Splitting point */ public static int splitByDistance(int[] ind, int begin, int end, float[] tpro, RandomGenerator rand) { // pick random splitting point based on distance float rmin = tpro[ind[begin]], rmax = rmin; for (int it = begin + 1; it < end; it++) { float currEle = tpro[ind[it]]; if (currEle < rmin) rmin = currEle; else if (currEle > rmax) rmax = currEle; } if (rmin != rmax) { // if not all elements are the same float rs = (float) (rmin + rand.nextDouble() * (rmax - rmin)); int minInd = begin, maxInd = end - 1; // permute elements such that all points smaller than the splitting // element are on the right and the others on the left in the array while (minInd < maxInd) { float currEle = tpro[ind[minInd]]; if (currEle > rs) { while (minInd < maxInd && tpro[ind[maxInd]] > rs) { maxInd--; } if (minInd == maxInd) { break; } swap(ind, minInd, maxInd); maxInd--; } minInd++; } return minInd; } else { // if all elements are the same split in the middle return (begin + end) >>> 1; } } /** * Gets the core distance. We actually return the squared distance. * * @param sum * the sum of distances * @param count * the count of distances * @return the squared average core distance */ private float getCoreDistance(double sum, int count) { // it might be that a point does not occur for a certain size of a // projection (likely if too few projections, in this case there is no avg // distance) if (count == 0) return OPTICSManager.UNDEFINED; double d = sum / count; // We actually want the squared distance return (float) (d * d); } /** * Compute for each point the average distance to a point in a projected set and list of neighbors for each point * from sets resulting from projection. * * @return list of neighbours for each point */ public int[][] computeAverageDistInSetAndNeighbours() { distanceComputations.set(0); // Q. Are the neighbours worked out using the core distance? // The FastOPTICS paper discusses a merging distance as the min of the core distance for A and B. // Only those below the merge distance are candidates for a merge. // However in a later discussion of FastOPTICS they state that reachability is only computed for // the sampled neighbours (and these may be above the merge distance). // A. Here we assume that any point-pair in the split set can be neighbours but we do not // compute all pairs but only a sub-sample of them. // Note: The ELKI implementation computes the neighbours using all items in a set to // the middle of the set, and each item in the set to the middle of the set. The FastOPTICS // paper states that any neighbour is valid but further neighbours can be excluded using an // f-factor (with f 0:1). If f=1 then all neighbours are included. Below this then only some // of the neighbours are included using the projected distance values. Neighbours to be // included are picked at random. final int n = splitSets.size(); long time = System.currentTimeMillis(); if (tracker != null) { tracker.log("Computing density and neighbourhoods ..."); } double[] sumDistances = new double[size]; int[] nDistances = new int[size]; TIntHashSet[] neighbours = new TIntHashSet[size]; for (int it = size; it-- > 0;) { neighbours[it] = new TIntHashSet(); } // Multi-thread the hash set operations for speed. // We can do this if each split uses each index only once. int nThreads = Math.min(this.nThreads, n); boolean multiThread = (n > 1 && !saveApproximateSets); // Use an executor service so that we know the entire split has been processed before // doing the next split. ExecutorService executor = null; TurboList<Future<?>> futures = null; if (multiThread) { executor = Executors.newFixedThreadPool(nThreads); futures = new TurboList<Future<?>>(nThreads); } final int interval = Utils.getProgressInterval(n); for (int i = 0; i < n; i++) { if (tracker != null) { if (i % interval == 0) tracker.progress(i, n); } Split split = splitSets.get(i); if (multiThread) { // If the indices are unique within each split set then we can multi-thread the // sampling of neighbours (since each index in the cumulative arrays will only // be accessed concurrently by a single thread). int nPerThread = (int) Math.ceil((double) split.sets.size() / nThreads); for (int from = 0; from < split.sets.size();) { int to = Math.min(from + nPerThread, split.sets.size()); futures.add(executor .submit(new SetWorker(sumDistances, nDistances, neighbours, split.sets, from, to))); from = to; } // Wait for all to finish for (int t = futures.size(); t-- > 0;) { try { // The future .get() method will block until completed futures.get(t).get(); } catch (Exception e) { // This should not happen. // Ignore it and allow processing to continue (the number of neighbour samples will just be smaller). e.printStackTrace(); } } futures.clear(); } else { sampleNeighbours(sumDistances, nDistances, neighbours, split.sets, 0, split.sets.size()); } } if (multiThread) executor.shutdown(); // Finalise averages // Convert to simple arrays allNeighbours = new int[size][]; for (int it = size; it-- > 0;) { setOfObjects[it].coreDistance = getCoreDistance(sumDistances[it], nDistances[it]); allNeighbours[it] = neighbours[it].toArray(); neighbours[it] = null; // Allow garbage collection } if (tracker != null) { time = System.currentTimeMillis() - time; tracker.log("Computed density and neighbourhoods (%d distances) ... %s", distanceComputations.get(), Utils.timeToString(time)); tracker.progress(1); } return allNeighbours; } /** * Sample neighbours for each set in the split sets between the from index (inclusive) and to index (exclusive). * * @param sumDistances * the neighbour sum of distances * @param nDistances * the neighbour count of distances * @param neighbours * the neighbour hash sets * @param sets * the split sets * @param from * the from index * @param to * the to index */ private void sampleNeighbours(double[] sumDistances, int[] nDistances, TIntHashSet[] neighbours, TurboList<int[]> sets, int from, int to) { switch (sampleMode) { case RANDOM: for (int i = from; i < to; i++) sampleNeighboursRandom(sumDistances, nDistances, neighbours, sets.get(i)); break; case MEDIAN: for (int i = from; i < to; i++) sampleNeighboursUsingMedian(sumDistances, nDistances, neighbours, sets.get(i)); break; case ALL: for (int i = from; i < to; i++) sampleNeighboursAll(sumDistances, nDistances, neighbours, sets.get(i)); break; default: throw new NotImplementedException("Unsupported sample mode: " + sampleMode); } } /** * Sample neighbours using median. The distance of each point is computed to the median which is added as a * neighbour. The median point has all the other points added as a neighbours. * * @param sumDistances * the neighbour sum of distances * @param nDistances * the neighbour count of distances * @param neighbours * the neighbour hash sets * @param indices * the indices of objects in the set */ private void sampleNeighboursUsingMedian(double[] sumDistances, int[] nDistances, TIntHashSet[] neighbours, int[] indices) { final int len = indices.length; final int indoff = len >> 1; int v = indices[indoff]; int delta = len - 1; distanceComputations.addAndGet(delta); nDistances[v] += delta; Molecule midpoint = setOfObjects[v]; for (int j = len; j-- > 0;) { int it = indices[j]; if (it == v) { continue; } double dist = midpoint.distance(setOfObjects[it]); sumDistances[v] += dist; sumDistances[it] += dist; nDistances[it]++; neighbours[it].add(v); neighbours[v].add(it); } } /** * Sample neighbours randomly. For each point A choose a neighbour from the set B. This is mirrored this to get * another neighbour without extra distance computations. The distance between A and B is used to increment the * input distance arrays and each is added to the set of the other. * <p> * This method works for sets of size 2 and above. * * @param sumDistances * the neighbour sum of distances * @param nDistances * the neighbour count of distances * @param neighbours * the neighbour hash sets * @param indices * the indices of objects in the set */ private void sampleNeighboursRandom(double[] sumDistances, int[] nDistances, TIntHashSet[] neighbours, int[] indices) { if (indices.length == 2) { distanceComputations.incrementAndGet(); // Only one set of neighbours int a = indices[0]; int b = indices[1]; double dist = setOfObjects[a].distance(setOfObjects[b]); sumDistances[a] += dist; sumDistances[b] += dist; nDistances[a]++; nDistances[b]++; neighbours[a].add(b); neighbours[b].add(a); } else { distanceComputations.addAndGet(indices.length); // For a fast implementation we just pick consecutive // points as neighbours since the order is random. // Note: This only works if the set has size 3 or more. for (int j = indices.length, k = 0; j-- > 0;) { int a = indices[j]; int b = indices[k]; k = j; double dist = setOfObjects[a].distance(setOfObjects[b]); sumDistances[a] += dist; sumDistances[b] += dist; nDistances[a] += 2; // Each object will have 2 due to mirroring. neighbours[a].add(b); neighbours[b].add(a); } } } /** * Sample neighbours all-vs-all. * * @param sumDistances * the neighbour sum of distances * @param nDistances * the neighbour count of distances * @param neighbours * the neighbour hash sets * @param indices * the indices of objects in the set */ private void sampleNeighboursAll(double[] sumDistances, int[] nDistances, TIntHashSet[] neighbours, int[] indices) { int n = indices.length; int n1 = n - 1; // for all-vs-all = n(n-1)/2 distanceComputations.addAndGet((n * n1) >>> 1); for (int i = 0; i < n1; i++) { int a = indices[i]; nDistances[a] += n1; double d = 0; Molecule ma = setOfObjects[a]; TIntHashSet na = neighbours[a]; for (int j = i + 1; j < n; j++) { int b = indices[j]; double dist = ma.distance(setOfObjects[b]); d += dist; sumDistances[b] += dist; na.add(b); neighbours[b].add(a); } sumDistances[a] += d; } // For the last index that was skipped in the outer loop. // The set will always be a positive size so do not worry about index bounds. nDistances[indices[n1]] += n1; } /** * Gets the sample mode. * * @return the sample mode */ public SampleMode getSampleMode() { return sampleMode; } /** * Sets the sample mode. * * @param sampleMode * the new sample mode */ public void setSampleMode(SampleMode sampleMode) { if (sampleMode == null) sampleMode = SampleMode.RANDOM; this.sampleMode = sampleMode; } }