Example usage for com.google.common.collect Iterables skip


Introduction

This page lists usage examples for com.google.common.collect.Iterables.skip.

Prototype

public static <T> Iterable<T> skip(final Iterable<T> iterable, final int numberToSkip) 

Document

Returns a view of iterable that skips its first numberToSkip elements.
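For orientation, here is a minimal, self-contained sketch of the call itself (plain Guava; the sample data is made up for illustration):

import com.google.common.collect.Iterables;

import java.util.Arrays;
import java.util.List;

public class SkipDemo {
    public static void main(String[] args) {
        List<String> letters = Arrays.asList("a", "b", "c", "d");
        // skip() returns a lazy view; nothing is copied, the first two
        // elements are simply bypassed when iterating.
        for (String s : Iterables.skip(letters, 2)) {
            System.out.println(s); // prints "c" then "d"
        }
    }
}

Because the result is a view, changes made to the backing collection before iterator() is called are reflected in the returned iterable.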

Usage

From source file:org.calrissian.mango.collect.CloseableIterables.java

/**
 * Returns a view of {@code iterable} that skips its first
 * {@code numberToSkip} elements. If {@code iterable} contains fewer than
 * {@code numberToSkip} elements, the returned {@code CloseableIterable} skips all of its
 * elements.
 *
 * <p>Modifications to the underlying {@link CloseableIterable} before a call to
 * {@code iterator()} are reflected in the returned iterator. That is, the
 * iterator skips the first {@code numberToSkip} elements that exist when the
 * {@code Iterator} is created, not when {@code skip()} is called.
 *
 * <p>The returned {@code CloseableIterable}'s iterator supports {@code remove()} if the
 * iterator of the underlying iterable supports it. Note that it is
 * <i>not</i> possible to delete the last skipped element by immediately
 * calling {@code remove()} on that iterator, as the {@code Iterator}
 * contract states that a call to {@code remove()} before a call to
 * {@code next()} will throw an {@link IllegalStateException}.
 */
public static <T> CloseableIterable<T> skip(final CloseableIterable<T> iterable, final int numberToSkip) {
    return wrap(Iterables.skip(iterable, numberToSkip), iterable);
}
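A hedged sketch of how the wrapper might be used (openLines() and process() are hypothetical, and this assumes the mango CloseableIterable is an Iterable that also exposes close(), possibly throwing IOException):

void printWithoutHeader() throws IOException {
    CloseableIterable<String> lines = openLines(); // hypothetical resource-backed iterable
    try {
        // Skip the first element (e.g. a header row); the underlying resource
        // can still be closed through the original reference afterwards.
        for (String line : CloseableIterables.skip(lines, 1)) {
            process(line); // hypothetical consumer
        }
    } finally {
        lines.close();
    }
}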

From source file:com.wrmsr.wava.yen.parser.ModuleFactory.java

public YModule create() {
    functionCounter = 0;
    for (Element e : Iterables.skip(root, 1)) {
        ListElement le = (ListElement) e;
        preParseFunctionType(le);
        preParseImports(le);
    }
    functionCounter = 0;
    for (Element e : Iterables.skip(root, 1)) {
        ListElement le = (ListElement) e;
        parseModuleElement(le);
    }
    return builder.build();
}

From source file:com.google.enterprise.connector.util.diffing.SnapshotStore.java

@VisibleForTesting
public void deleteOldSnapshots() {
    // Leave at least two snapshot files, even if oldestSnapshotToKeep
    // is too high.
    for (long k : Iterables.skip(getExistingSnapshots(), 2)) {
        if (k < oldestSnapshotToKeep) {
            File x = getSnapshotFile(snapshotDir, k);
            if (x.delete()) {
                LOG.fine("deleting snapshot file " + x.getAbsolutePath());
            } else {
                LOG.warning("failed to delete snapshot file " + x.getAbsolutePath());
            }
        }
    }
}

From source file:msi.gaml.factories.ModelAssembler.java

public ModelDescription assemble(final String projectPath, final String modelPath,
        final Iterable<ISyntacticElement> allModels, final ValidationContext collector, final boolean document,
        final Map<String, ModelDescription> mm) {
    final ImmutableList<ISyntacticElement> models = ImmutableList.copyOf(allModels);
    final TOrderedHashMap<String, ISyntacticElement> speciesNodes = new TOrderedHashMap();
    final TOrderedHashMap<String, TOrderedHashMap<String, ISyntacticElement>>[] experimentNodes = new TOrderedHashMap[1];
    final ISyntacticElement globalNodes = SyntacticFactory.create(GLOBAL, (EObject) null, true);
    final ISyntacticElement source = models.get(0);
    Facets globalFacets = null;
    if (source.hasFacet(IKeyword.PRAGMA)) {
        final Facets facets = source.copyFacets(null);
        final List<String> pragmas = (List<String>) facets.get(IKeyword.PRAGMA).getExpression().getConstValue();
        collector.resetInfoAndWarning();
        if (pragmas != null) {
            if (pragmas.contains(IKeyword.NO_INFO)) {
                collector.setNoInfo();
            }
            if (pragmas.contains(IKeyword.NO_WARNING)) {
                collector.setNoWarning();
            }
            if (pragmas.contains(IKeyword.NO_EXPERIMENT)) {
                collector.setNoExperiment();
            }
        }

    }
    final Map<String, SpeciesDescription> tempSpeciesCache = new THashMap<>();

    for (final ISyntacticElement cm : models.reverse()) {
        final SyntacticModelElement currentModel = (SyntacticModelElement) cm;
        if (currentModel != null) {
            if (currentModel.hasFacets()) {
                if (globalFacets == null) {
                    globalFacets = new Facets(currentModel.copyFacets(null));
                } else {
                    globalFacets.putAll(currentModel.copyFacets(null));
                }
            }
            currentModel.visitChildren(element -> globalNodes.addChild(element));
            SyntacticVisitor visitor = element -> addSpeciesNode(element, speciesNodes, collector);
            currentModel.visitSpecies(visitor);

            // We input the species so that grids are always the last ones
            // (see DiffusionStatement)
            currentModel.visitGrids(visitor);
            visitor = element -> {
                if (experimentNodes[0] == null) {
                    experimentNodes[0] = new TOrderedHashMap();
                }
                addExperimentNode(element, currentModel.getName(), experimentNodes[0], collector);

            };
            currentModel.visitExperiments(visitor);

        }
    }

    final String modelName = buildModelName(source.getName());

    // We build a list of working paths from which the composite model will
    // be able to look for resources. These working paths come from the
    // imported models

    Set<String> absoluteAlternatePathAsStrings = models.isEmpty() ? null
            : ImmutableSet.copyOf(
                    Iterables.transform(models.reverse(), each -> ((SyntacticModelElement) each).getPath()));

    if (mm != null) {
        for (final ModelDescription m1 : mm.values()) {
            for (final String im : m1.getAlternatePaths()) {
                absoluteAlternatePathAsStrings = Sets.union(absoluteAlternatePathAsStrings,
                        Collections.singleton(im));
            }
        }
    }

    final ModelDescription model = new ModelDescription(modelName, null, projectPath, modelPath,
            source.getElement(), null, ModelDescription.ROOT, null, globalFacets, collector,
            absoluteAlternatePathAsStrings);

    final Collection<String> allModelNames = models.size() == 1 ? null
            : ImmutableSet.copyOf(
                    Iterables.transform(Iterables.skip(models, 1), each -> buildModelName(each.getName())));
    model.setImportedModelNames(allModelNames);
    model.isDocumenting(document);

    // hqnghi add micro-models
    if (mm != null) {
        // model.setMicroModels(mm);
        model.addChildren(mm.values());
    }
    // end-hqnghi
    // recursively add user-defined species to world and down on to the
    // hierarchy
    speciesNodes.forEachValue(speciesNode -> {
        addMicroSpecies(model, speciesNode, tempSpeciesCache);
        return true;
    });
    if (experimentNodes[0] != null) {
        experimentNodes[0].forEachEntry((s, b) -> {
            b.forEachValue(experimentNode -> {
                addExperiment(s, model, experimentNode, tempSpeciesCache);
                return true;
            });
            return true;
        });
    }

    // Parent the species and the experiments of the model (all are now
    // known).
    speciesNodes.forEachValue(speciesNode -> {
        parentSpecies(model, speciesNode, model, tempSpeciesCache);
        return true;
    });

    if (experimentNodes[0] != null) {
        experimentNodes[0].forEachEntry((s, b) -> {
            b.forEachValue(experimentNode -> {
                parentExperiment(model, experimentNode);
                return true;
            });
            return true;
        });
    }

    // Initialize the hierarchy of types
    model.buildTypes();
    // hqnghi build micro-models as types
    if (mm != null) {
        for (final Entry<String, ModelDescription> entry : mm.entrySet()) {
            model.getTypesManager().alias(entry.getValue().getName(), entry.getKey());
        }
        // end-hqnghi
    }

    // Make species and experiments recursively create their attributes,
    // actions....
    complementSpecies(model, globalNodes);

    speciesNodes.forEachValue(speciesNode -> {
        complementSpecies(model.getMicroSpecies(speciesNode.getName()), speciesNode);
        return true;
    });

    if (experimentNodes[0] != null) {
        experimentNodes[0].forEachEntry((s, b) -> {
            b.forEachValue(experimentNode -> {
                complementSpecies(model.getExperiment(experimentNode.getName()), experimentNode);
                return true;
            });
            return true;
        });
    }

    // Complement recursively the different species (incl. the world). The
    // recursion is hierarchical

    model.inheritFromParent();

    for (final SpeciesDescription sd : getSpeciesInHierarchicalOrder(model)) {
        sd.inheritFromParent();
        if (sd.isExperiment()) {
            if (!sd.finalizeDescription()) {
                return null;
            }
        }
    }

    // Issue #1708 (put before the finalization)
    if (model.hasFacet(SCHEDULES) || model.hasFacet(FREQUENCY)) {
        createSchedulerSpecies(model);
    }

    if (!model.finalizeDescription()) {
        return null;
    }

    if (document) {
        collector.document(model);
    }
    return model;

}

From source file:org.apache.mahout.knn.cluster.BallKMeans.java

/**
 * Selects some of the original points according to the k-means++ algorithm.  The basic idea is that
 * points are selected with probability proportional to their distance from any selected point.  In
 * this version, points have weights which multiply their likelihood of being selected.  This is the
 * same as if there were as many copies of the same point as indicated by the weight.
 * <p/>
 * This is pretty expensive, but it vastly improves the quality and convergence of the k-means algorithm.
 * The basic idea can be made much faster by only processing a random subset of the original points.
 * In the context of streaming k-means, the total number of possible seeds will be about k log n so this
 * selection will cost O(k^2 (log n)^2) which isn't much worse than the random sampling idea.  At
 * n = 10^9, the cost of this initialization will be about 10x worse than a reasonable random sampling
 * implementation.
 * <p/>
 * The side effect of this method is to fill the centroids structure.
 *
 * @param datapoints The datapoints to select from.  These datapoints should be WeightedVectors of some kind.
 */
private void initializeSeeds(List<? extends WeightedVector> datapoints) {
    Preconditions.checkArgument(datapoints.size() > 1,
            "Must have at least two datapoints points to cluster " + "sensibly");
    // Compute the centroid of all of the datapoints.  This is then used to compute the squared radius of the datapoints.
    Centroid center = new Centroid(datapoints.iterator().next());
    for (WeightedVector row : Iterables.skip(datapoints, 1)) {
        center.update(row);
    }
    // Given the centroid, we can compute \Delta_1^2(X), the total squared distance for the datapoints;
    // this accelerates seed selection.
    double radius = 0;
    DistanceMeasure l2 = new SquaredEuclideanDistanceMeasure();
    for (WeightedVector row : datapoints) {
        radius += l2.distance(row, center);
    }

    // Find the first seed c_1 (and conceptually the second, c_2) as might be done in the 2-means clustering so that
    // the probability of selecting c_1 and c_2 is proportional to || c_1 - c_2 ||^2.  This is done
    // by first selecting c_1 with probability:
    //
    // p(c_1) = sum_{c_1} || c_1 - c_2 ||^2 \over sum_{c_1, c_2} || c_1 - c_2 ||^2
    //
    // This can be simplified to:
    //
    // p(c_1) = \Delta_1^2(X) + n || c_1 - c ||^2 / (2 n \Delta_1^2(X))
    //
    // where c = \sum x / n and \Delta_1^2(X) = sum || x - c ||^2
    //
    // All subsequent seeds c_i (including c_2) can then be selected from the remaining points with probability
    // proportional to Pr(c_i == x_j) = min_{m < i} || c_m - x_j ||^2.

    // Multinomial distribution of vector indices for the selection seeds. These correspond to
    // the indices of the vectors in the original datapoints list.
    Multinomial<Integer> seedSelector = new Multinomial<Integer>();
    for (int i = 0; i < datapoints.size(); ++i) {
        double selectionProbability = radius + datapoints.size() * l2.distance(datapoints.get(i), center);
        seedSelector.add(i, selectionProbability);
    }

    Centroid c_1 = new Centroid(datapoints.get(seedSelector.sample()).clone());
    c_1.setIndex(0);
    // Construct a set of weighted things which can be used for random selection.  Initial weights are
    // set to the squared distance from c_1
    for (int i = 0; i < datapoints.size(); ++i) {
        WeightedVector row = datapoints.get(i);
        final double w = l2.distance(c_1, row) * row.getWeight();
        seedSelector.set(i, w);
    }

    // From here, seeds are selected with probability proportional to:
    //
    // r_i = min_{c_j} || x_i - c_j ||^2
    //
    // when we only have c_1, we have already set these distances and as we select each new
    // seed, we update the minimum distances.
    centroids.add(c_1);
    int clusterIndex = 1;
    while (centroids.size() < numClusters) {
        // Select according to weights.
        int seedIndex = seedSelector.sample();
        Centroid nextSeed = new Centroid(datapoints.get(seedIndex));
        // (WeightedVector)datapoints.get(seedIndex).clone());
        nextSeed.setIndex(clusterIndex++);
        centroids.add(nextSeed);
        // Don't select this one again.
        seedSelector.delete(seedIndex);
        // Re-weight everything according to the minimum distance to a seed.
        for (int currSeedIndex : seedSelector) {
            WeightedVector curr = datapoints.get(currSeedIndex);
            double newWeight = nextSeed.getWeight() * l2.distance(nextSeed, curr);
            if (newWeight < seedSelector.getWeight(currSeedIndex)) {
                seedSelector.set(currSeedIndex, newWeight);
            }
        }
    }
}

From source file:co.cask.cdap.logging.read.AvroFileReader.java

public Collection<LogEvent> readLogPrev(Location file, Filter logFilter, long fromTimeMs, final int maxEvents) {
    try {
        DataFileReader<GenericRecord> dataFileReader = createReader(file);

        try {
            if (!dataFileReader.hasNext()) {
                return ImmutableList.of();
            }

            GenericRecord datum;
            List<List<LogEvent>> logSegments = Lists.newArrayList();
            int count = 0;

            // Calculate skipLen based on fileLength
            long skipLen = file.length() / 10;
            if (skipLen > DEFAULT_SKIP_LEN) {
                skipLen = DEFAULT_SKIP_LEN;
            } else if (skipLen <= 0) {
                skipLen = DEFAULT_SKIP_LEN;
            }

            List<LogEvent> logSegment = Lists.newArrayList();

            long lastSeekPos;
            long seekPos = file.length();
            while (seekPos > 0) {
                lastSeekPos = seekPos;
                seekPos = seekPos < skipLen ? 0 : seekPos - skipLen;
                dataFileReader.sync(seekPos);

                logSegment = logSegment.isEmpty() ? logSegment : Lists.<LogEvent>newArrayList();
                // read all the elements in the current segment (seekPos up to lastSeekPos)
                while (dataFileReader.hasNext() && !dataFileReader.pastSync(lastSeekPos)) {
                    datum = dataFileReader.next();

                    ILoggingEvent loggingEvent = LoggingEvent.decode(datum);

                    // Stop when reached fromTimeMs
                    if (loggingEvent.getTimeStamp() > fromTimeMs) {
                        break;
                    }

                    if (logFilter.match(loggingEvent)) {
                        ++count;
                        logSegment.add(new LogEvent(loggingEvent,
                                new LogOffset(LogOffset.INVALID_KAFKA_OFFSET, loggingEvent.getTimeStamp())));
                    }
                }

                if (!logSegment.isEmpty()) {
                    logSegments.add(logSegment);
                }

                if (count > maxEvents) {
                    break;
                }
            }

            int skip = count >= maxEvents ? count - maxEvents : 0;
            return Lists.newArrayList(Iterables.skip(Iterables.concat(Lists.reverse(logSegments)), skip));
        } finally {
            try {
                dataFileReader.close();
            } catch (IOException e) {
                LOG.error(String.format("Got exception while closing log file %s", file.toURI()), e);
            }
        }
    } catch (Exception e) {
        LOG.error(String.format("Got exception while reading log file %s", file.toURI()), e);
        throw Throwables.propagate(e);
    }
}

From source file:co.cask.cdap.logging.read.AvroFileLogReader.java

public Collection<LogEvent> readLogPrev(Location file, Filter logFilter, long fromTimeMs, final int maxEvents) {
    try {
        DataFileReader<GenericRecord> dataFileReader = createReader(file);

        try {
            if (!dataFileReader.hasNext()) {
                return ImmutableList.of();
            }

            GenericRecord datum;
            List<List<LogEvent>> logSegments = Lists.newArrayList();
            int count = 0;

            // Calculate skipLen based on fileLength
            long skipLen = file.length() / 10;
            if (skipLen > DEFAULT_SKIP_LEN) {
                skipLen = DEFAULT_SKIP_LEN;
            } else if (skipLen <= 0) {
                skipLen = DEFAULT_SKIP_LEN;
            }

            List<LogEvent> logSegment = Lists.newArrayList();
            long boundaryTimeMs = Long.MAX_VALUE;

            long seekPos = file.length();
            while (seekPos > 0) {
                seekPos = seekPos < skipLen ? 0 : seekPos - skipLen;
                dataFileReader.sync(seekPos);

                logSegment = logSegment.isEmpty() ? logSegment : Lists.<LogEvent>newArrayList();
                long segmentStartTimeMs = Long.MAX_VALUE;
                while (dataFileReader.hasNext()) {
                    datum = dataFileReader.next();

                    ILoggingEvent loggingEvent = LoggingEvent.decode(datum);

                    if (segmentStartTimeMs == Long.MAX_VALUE) {
                        segmentStartTimeMs = loggingEvent.getTimeStamp();
                    }

                    // Stop when reached fromTimeMs, or at the end of current segment.
                    if (loggingEvent.getTimeStamp() > fromTimeMs
                            || loggingEvent.getTimeStamp() >= boundaryTimeMs) {
                        break;
                    }

                    if (logFilter.match(loggingEvent)) {
                        ++count;
                        logSegment.add(new LogEvent(loggingEvent, loggingEvent.getTimeStamp()));
                    }
                }

                boundaryTimeMs = segmentStartTimeMs;

                if (!logSegment.isEmpty()) {
                    logSegments.add(logSegment);
                }

                if (count > maxEvents) {
                    break;
                }
            }

            int skip = count >= maxEvents ? count - maxEvents : 0;
            return Lists.newArrayList(Iterables.skip(Iterables.concat(Lists.reverse(logSegments)), skip));
        } finally {
            try {
                dataFileReader.close();
            } catch (IOException e) {
                LOG.error(String.format("Got exception while closing log file %s", file.toURI()), e);
            }
        }
    } catch (Exception e) {
        LOG.error(String.format("Got exception while reading log file %s", file.toURI()), e);
        throw Throwables.propagate(e);
    }
}

From source file:com.blackducksoftware.bdio.model.ExternalIdentifier.java

@Nullable
public String getSuiteReleaseTag() {
    return Iterables.getFirst(Iterables.skip(getBdSuiteId(), 1), null);
}
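The getFirst/skip combination above is a common Guava idiom for "second element, or a default if there is none"; a small stand-alone illustration with made-up data:

List<String> parts = Arrays.asList("suite-id", "release-tag");
// Skip the first element, then take the first of what remains.
String tag = Iterables.getFirst(Iterables.skip(parts, 1), null);
// tag == "release-tag"; with zero or one element, tag would be null.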

From source file:co.cask.cdap.logging.write.LogLocation.java

/**
 * Return closeable iterator of {@link LogEvent}
 * @param logFilter filter for filtering log events
 * @param fromTimeMs start timestamp in millis
 * @param maxEvents max events to return
 * @return closeable iterator of previous log events
 */
@SuppressWarnings("WeakerAccess")
public Collection<LogEvent> readLogPrev(Filter logFilter, long fromTimeMs, final int maxEvents)
        throws IOException {
    DataFileReader<GenericRecord> dataFileReader = createReader();

    try {
        if (!dataFileReader.hasNext()) {
            return ImmutableList.of();
        }

        List<List<LogEvent>> logSegments = Lists.newArrayList();
        List<LogEvent> logSegment;
        int count = 0;

        // Calculate skipLen based on fileLength
        long length = location.length();
        LOG.trace("Got file length {}", length);
        long skipLen = length / 10;
        if (skipLen > DEFAULT_SKIP_LEN || skipLen <= 0) {
            skipLen = DEFAULT_SKIP_LEN;
        }

        // For open file, endPosition sync marker is unknown so start from file length and read up to the actual EOF
        dataFileReader.sync(length);
        long finalSync = dataFileReader.previousSync();
        logSegment = readToEndSyncPosition(dataFileReader, logFilter, fromTimeMs, -1);

        if (!logSegment.isEmpty()) {
            logSegments.add(logSegment);
            count = count + logSegment.size();
        }

        LOG.trace("Read logevents {} from position {}", count, finalSync);

        long startPosition = finalSync;
        long endPosition = startPosition;
        long currentSync;

        while (startPosition > 0 && count < maxEvents) {
            // Skip to sync position less than current sync position
            startPosition = skipToPosition(dataFileReader, startPosition, endPosition, skipLen);
            currentSync = dataFileReader.previousSync();
            logSegment = readToEndSyncPosition(dataFileReader, logFilter, fromTimeMs, endPosition);

            if (!logSegment.isEmpty()) {
                logSegments.add(logSegment);
                count = count + logSegment.size();
            }
            LOG.trace("Read logevents {} from position {} to endPosition {}", count, currentSync, endPosition);

            endPosition = currentSync;
        }

        int skip = count >= maxEvents ? count - maxEvents : 0;
        return Lists.newArrayList(Iterables.skip(Iterables.concat(Lists.reverse(logSegments)), skip));
    } finally {
        try {
            dataFileReader.close();
        } catch (IOException e) {
            LOG.error("Got exception while closing log file {}", location, e);
        }
    }
}

From source file:org.apache.mahout.knn.cluster.StreamingKMeans.java

private UpdatableSearcher clusterInternal(Iterable<Centroid> datapoints, boolean collapseClusters) {
    int oldNumProcessedDataPoints = numProcessedDatapoints;
    // We clear the centroids we have in case of cluster collapse, the old clusters are the
    // datapoints but we need to re-cluster them.
    if (collapseClusters) {
        centroids.clear();
        numProcessedDatapoints = 0;
    }

    int numCentroidsToSkip = 0;
    if (centroids.size() == 0) {
        // Assign the first datapoint to the first cluster.
        // Adding a vector to a searcher would normally just reference the copy,
        // but we could potentially mutate it and so we need to make a clone.
        centroids.add(Iterables.get(datapoints, 0).clone());
        numCentroidsToSkip = 1;
        ++numProcessedDatapoints;
    }

    Random rand = RandomUtils.getRandom();
    // To cluster, we scan the data and either add each point to the nearest group or create a new group.
    // when we get too many groups, we need to increase the threshold and rescan our current groups
    for (WeightedVector row : Iterables.skip(datapoints, numCentroidsToSkip)) {
        // Get the closest vector and its weight as a WeightedThing<Vector>.
        // The weight of the WeightedThing is the distance to the query and the value is a
        // reference to one of the vectors we added to the searcher previously.
        WeightedThing<Vector> closestPair = centroids.search(row, 1).get(0);

        // We get a uniformly distributed random number between 0 and 1 and compare it with the
        // distance to the closest cluster divided by the distanceCutoff.
        // This is so that if the closest cluster is further than distanceCutoff,
        // closestPair.getWeight() / distanceCutoff > 1 which will trigger the creation of a new
        // cluster anyway.
        // However, if the ratio is less than 1, we want to create a new cluster with probability
        // proportional to the distance to the closest cluster.
        if (rand.nextDouble() < closestPair.getWeight() / distanceCutoff) {
            // Add new centroid, note that the vector is copied because we may mutate it later.
            centroids.add(row.clone());
        } else {
            // Merge the new point with the existing centroid. This will update the centroid's actual
            // position.
            // We know that all the points we inserted in the centroids searcher are (or extend)
            // WeightedVector, so the cast will always succeed.
            Centroid centroid = (Centroid) closestPair.getValue();
            // We will update the centroid by removing it from the searcher and reinserting it to
            // ensure consistency.
            if (!centroids.remove(centroid, 1e-7)) {
                throw new RuntimeException("Unable to remove centroid");
            }
            centroid.update(row);
            centroids.add(centroid);
        }

        progressLogger.debug(
                "numProcessedDataPoints: {}, estimatedNumClusters: {}, "
                        + "distanceCutoff: {}, numCentroids: {}",
                numProcessedDatapoints, estimatedNumClusters, distanceCutoff, centroids.size());

        if (!collapseClusters && centroids.size() > estimatedNumClusters) {
            estimatedNumClusters = (int) Math.max(estimatedNumClusters,
                    clusterLogFactor * Math.log(numProcessedDatapoints));

            // TODO does shuffling help?
            List<Centroid> shuffled = Lists.newArrayList();
            for (Vector v : centroids) {
                shuffled.add((Centroid) v);
            }
            Collections.shuffle(shuffled);
            // Re-cluster using the shuffled centroids as data points. The centroids member variable
            // is modified directly.
            clusterInternal(shuffled, true);

            // In the original algorithm, with distributions with sharp scale effects, the
            // distanceCutoff can grow to excessive size leading sub-clustering to collapse
            // the centroids set too much. This test prevents increase in distanceCutoff if
            // the current value is doing well at collapsing the clusters.
            if (centroids.size() > clusterOvershoot * estimatedNumClusters) {
                distanceCutoff *= beta;
            }
        }
        ++numProcessedDatapoints;
    }

    if (collapseClusters) {
        numProcessedDatapoints = oldNumProcessedDataPoints;
    }

    // Normally, iterating through the searcher produces Vectors,
    // but since we always used Centroids, we adapt the return type.
    return centroids;
}