List of usage examples for java.util Random nextDouble
public double nextDouble()
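nextDouble() returns the next pseudorandom, uniformly distributed double value in [0.0, 1.0) from this random number generator's sequence. Before the real-world usages below, here is a minimal, self-contained sketch (class and variable names are illustrative) of the two idioms that recur throughout them: scaling a draw into a range, and using a draw as a probability gate.

import java.util.Random;

public class NextDoubleBasics {
    public static void main(String[] args) {
        Random random = new Random(42L); // fixed seed for a reproducible run
        double u = random.nextDouble();                // uniform in [0.0, 1.0)
        double scaled = 5.0 + u * 10.0;                // uniform in [5.0, 15.0)
        boolean rareEvent = random.nextDouble() < 0.1; // true ~10% of the time
        System.out.println(u + " " + scaled + " " + rareEvent);
    }
}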
From source file:org.apache.hadoop.hive.ql.exec.Utilities.java
/**
 * Introduce a random factor into the wait time before another retry.
 * The wait time depends on the number of failures so far and a random factor.
 * When the first exception occurs, the wait time is a random number between
 * 0..baseWindow msec. If the first retry still fails, we wait a baseWindow msec
 * grace period before the 2nd retry. At the second retry, the waiting window is
 * expanded to 2*baseWindow msec to relieve the request rate on the server.
 * Similarly, the 3rd retry waits a 2*baseWindow msec grace period before the
 * retry, and the waiting window is expanded to 3*baseWindow msec, and so on.
 *
 * @param baseWindow the base waiting window.
 * @param failures number of failures so far.
 * @param r a random generator.
 * @return number of milliseconds for the next wait time.
 */
public static long getRandomWaitTime(long baseWindow, int failures, Random r) {
    return (long) (baseWindow * failures + // grace period for the last round of attempt
            baseWindow * (failures + 1) * r.nextDouble()); // expanding time window for each failure
}
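A minimal sketch of how this helper might be exercised, assuming getRandomWaitTime above is declared in the same class. Per the formula, for n failures the result falls in [baseWindow * n, baseWindow * (2n + 1)).

import java.util.Random;

public class BackoffDemo {
    public static void main(String[] args) {
        Random r = new Random();
        // For failures = n with baseWindow = 1000, the wait lies in
        // [1000 * n, 1000 * n + 1000 * (n + 1)) msec.
        for (int failures = 0; failures < 4; failures++) {
            long wait = getRandomWaitTime(1000L, failures, r);
            System.out.println("failures=" + failures + " -> wait=" + wait + "ms");
        }
    }
}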
From source file:ml.shifu.shifu.core.dtrain.dt.DTWorker.java
@Override
public DTWorkerParams doCompute(WorkerContext<DTMasterParams, DTWorkerParams> context) {
    if (context.isFirstIteration()) {
        return new DTWorkerParams();
    }

    DTMasterParams lastMasterResult = context.getLastMasterResult();
    final List<TreeNode> trees = lastMasterResult.getTrees();
    final Map<Integer, TreeNode> todoNodes = lastMasterResult.getTodoNodes();
    if (todoNodes == null) {
        return new DTWorkerParams();
    }
    LOG.info("Start to work: todoNodes size is {}", todoNodes.size());

    Map<Integer, NodeStats> statistics = initTodoNodeStats(todoNodes);

    double trainError = 0d, validationError = 0d;
    double weightedTrainCount = 0d, weightedValidationCount = 0d;

    // renew random seed
    if (this.isGBDT && !this.gbdtSampleWithReplacement && lastMasterResult.isSwitchToNextTree()) {
        this.baggingRandomMap = new HashMap<Integer, Random>();
    }

    long start = System.nanoTime();
    for (Data data : this.trainingData) {
        if (this.isRF) {
            for (TreeNode treeNode : trees) {
                if (treeNode.getNode().getId() == Node.INVALID_INDEX) {
                    continue;
                }
                Node predictNode = predictNodeIndex(treeNode.getNode(), data, true);
                if (predictNode.getPredict() != null) {
                    // only update when not in first node, for treeNode, no predict statistics at that time
                    float weight = data.subsampleWeights[treeNode.getTreeId()];
                    if (Float.compare(weight, 0f) == 0) {
                        // oob data, no need to do weighting
                        validationError += data.significance
                                * loss.computeError((float) (predictNode.getPredict().getPredict()), data.label);
                        weightedValidationCount += data.significance;
                    } else {
                        trainError += weight * data.significance
                                * loss.computeError((float) (predictNode.getPredict().getPredict()), data.label);
                        weightedTrainCount += weight * data.significance;
                    }
                }
            }
        }

        if (this.isGBDT) {
            if (this.isContinuousEnabled && lastMasterResult.isContinuousRunningStart()) {
                recoverGBTData(context, data.output, data.predict, data, false);
                trainError += data.significance * loss.computeError(data.predict, data.label);
                weightedTrainCount += data.significance;
            } else {
                if (isNeedRecoverGBDTPredict) {
                    if (this.recoverTrees == null) {
                        this.recoverTrees = recoverCurrentTrees();
                    }
                    // recover gbdt data for fail over
                    recoverGBTData(context, data.output, data.predict, data, true);
                }
                int currTreeIndex = trees.size() - 1;
                if (lastMasterResult.isSwitchToNextTree()) {
                    if (currTreeIndex >= 1) {
                        Node node = trees.get(currTreeIndex - 1).getNode();
                        Node predictNode = predictNodeIndex(node, data, false);
                        if (predictNode.getPredict() != null) {
                            double predict = predictNode.getPredict().getPredict();
                            // first tree logic, master must set it to first tree even second tree with ROOT is
                            // sending
                            if (context.getLastMasterResult().isFirstTree()) {
                                data.predict = (float) predict;
                            } else {
                                // random drop
                                boolean drop = (this.dropOutRate > 0.0
                                        && dropOutRandom.nextDouble() < this.dropOutRate);
                                if (!drop) {
                                    data.predict += (float) (this.learningRate * predict);
                                }
                            }
                            data.output = -1f * loss.computeGradient(data.predict, data.label);
                        }
                        // if not sampling with replacement in gbdt, renew bagging sample rate in next tree
                        if (!this.gbdtSampleWithReplacement) {
                            Random random = null;
                            int classValue = (int) (data.label + 0.01f);
                            if (this.isStratifiedSampling) {
                                random = baggingRandomMap.get(classValue);
                                if (random == null) {
                                    random = DTrainUtils.generateRandomBySampleSeed(
                                            modelConfig.getTrain().getBaggingSampleSeed(),
                                            CommonConstants.NOT_CONFIGURED_BAGGING_SEED);
                                    baggingRandomMap.put(classValue, random);
                                }
                            } else {
                                random = baggingRandomMap.get(0);
                                if (random == null) {
                                    random = DTrainUtils.generateRandomBySampleSeed(
                                            modelConfig.getTrain().getBaggingSampleSeed(),
                                            CommonConstants.NOT_CONFIGURED_BAGGING_SEED);
                                    baggingRandomMap.put(0, random);
                                }
                            }
                            if (random.nextDouble() <= modelConfig.getTrain().getBaggingSampleRate()) {
                                data.subsampleWeights[currTreeIndex % data.subsampleWeights.length] = 1f;
                            } else {
                                data.subsampleWeights[currTreeIndex % data.subsampleWeights.length] = 0f;
                            }
                        }
                    }
                }
                if (context.getLastMasterResult().isFirstTree() && !lastMasterResult.isSwitchToNextTree()) {
                    Node currTree = trees.get(currTreeIndex).getNode();
                    Node predictNode = predictNodeIndex(currTree, data, true);
                    if (predictNode.getPredict() != null) {
                        trainError += data.significance
                                * loss.computeError((float) (predictNode.getPredict().getPredict()), data.label);
                        weightedTrainCount += data.significance;
                    }
                } else {
                    trainError += data.significance * loss.computeError(data.predict, data.label);
                    weightedTrainCount += data.significance;
                }
            }
        }
    }
    LOG.debug("Compute train error time is {}ms", TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));

    if (validationData != null) {
        start = System.nanoTime();
        for (Data data : this.validationData) {
            if (this.isRF) {
                for (TreeNode treeNode : trees) {
                    if (treeNode.getNode().getId() == Node.INVALID_INDEX) {
                        continue;
                    }
                    Node predictNode = predictNodeIndex(treeNode.getNode(), data, true);
                    if (predictNode.getPredict() != null) {
                        // only update when not in first node, for treeNode, no predict statistics at that time
                        validationError += data.significance
                                * loss.computeError((float) (predictNode.getPredict().getPredict()), data.label);
                        weightedValidationCount += data.significance;
                    }
                }
            }

            if (this.isGBDT) {
                if (this.isContinuousEnabled && lastMasterResult.isContinuousRunningStart()) {
                    recoverGBTData(context, data.output, data.predict, data, false);
                    validationError += data.significance * loss.computeError(data.predict, data.label);
                    weightedValidationCount += data.significance;
                } else {
                    if (isNeedRecoverGBDTPredict) {
                        if (this.recoverTrees == null) {
                            this.recoverTrees = recoverCurrentTrees();
                        }
                        // recover gbdt data for fail over
                        recoverGBTData(context, data.output, data.predict, data, true);
                    }
                    int currTreeIndex = trees.size() - 1;
                    if (lastMasterResult.isSwitchToNextTree()) {
                        if (currTreeIndex >= 1) {
                            Node node = trees.get(currTreeIndex - 1).getNode();
                            Node predictNode = predictNodeIndex(node, data, false);
                            if (predictNode.getPredict() != null) {
                                double predict = predictNode.getPredict().getPredict();
                                if (context.getLastMasterResult().isFirstTree()) {
                                    data.predict = (float) predict;
                                } else {
                                    data.predict += (float) (this.learningRate * predict);
                                }
                                data.output = -1f * loss.computeGradient(data.predict, data.label);
                            }
                        }
                    }
                    if (context.getLastMasterResult().isFirstTree() && !lastMasterResult.isSwitchToNextTree()) {
                        Node predictNode = predictNodeIndex(trees.get(currTreeIndex).getNode(), data, true);
                        if (predictNode.getPredict() != null) {
                            validationError += data.significance
                                    * loss.computeError((float) (predictNode.getPredict().getPredict()),
                                            data.label);
                            weightedValidationCount += data.significance;
                        }
                    } else {
                        validationError += data.significance * loss.computeError(data.predict, data.label);
                        weightedValidationCount += data.significance;
                    }
                }
            }
        }
        LOG.debug("Compute val error time is {}ms", TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
    }

    if (this.isGBDT) {
        // reset trees to null to save memory
        this.recoverTrees = null;
        if (this.isNeedRecoverGBDTPredict) {
            // no need recover again
            this.isNeedRecoverGBDTPredict = false;
        }
    }

    start = System.nanoTime();
    CompletionService<Map<Integer, NodeStats>> completionService =
            new ExecutorCompletionService<Map<Integer, NodeStats>>(this.threadPool);

    int realThreadCount = 0;
    LOG.debug("while todo size {}", todoNodes.size());

    int realRecords = this.trainingData.size();
    int realThreads = this.workerThreadCount > realRecords ? realRecords : this.workerThreadCount;

    int[] trainLows = new int[realThreads];
    int[] trainHighs = new int[realThreads];

    int stepCount = realRecords / realThreads;
    if (realRecords % realThreads != 0) {
        // move step count to append last gap to avoid last thread worse 2*stepCount-1
        stepCount += (realRecords % realThreads) / stepCount;
    }
    for (int i = 0; i < realThreads; i++) {
        trainLows[i] = i * stepCount;
        if (i != realThreads - 1) {
            trainHighs[i] = trainLows[i] + stepCount - 1;
        } else {
            trainHighs[i] = realRecords - 1;
        }
    }

    for (int i = 0; i < realThreads; i++) {
        final Map<Integer, TreeNode> localTodoNodes = new HashMap<Integer, TreeNode>(todoNodes);
        final Map<Integer, NodeStats> localStatistics = initTodoNodeStats(todoNodes);

        final int startIndex = trainLows[i];
        final int endIndex = trainHighs[i];
        LOG.info("Thread {} todo size {} stats size {} start index {} end index {}", i, localTodoNodes.size(),
                localStatistics.size(), startIndex, endIndex);

        if (localTodoNodes.size() == 0) {
            continue;
        }
        realThreadCount += 1;
        completionService.submit(new Callable<Map<Integer, NodeStats>>() {
            @Override
            public Map<Integer, NodeStats> call() throws Exception {
                long start = System.nanoTime();
                List<Integer> nodeIndexes = new ArrayList<Integer>(trees.size());
                for (int j = startIndex; j <= endIndex; j++) {
                    Data data = DTWorker.this.trainingData.get(j);
                    nodeIndexes.clear();
                    if (DTWorker.this.isRF) {
                        for (TreeNode treeNode : trees) {
                            if (treeNode.getNode().getId() == Node.INVALID_INDEX) {
                                nodeIndexes.add(Node.INVALID_INDEX);
                            } else {
                                Node predictNode = predictNodeIndex(treeNode.getNode(), data, false);
                                nodeIndexes.add(predictNode.getId());
                            }
                        }
                    }

                    if (DTWorker.this.isGBDT) {
                        int currTreeIndex = trees.size() - 1;
                        Node predictNode = predictNodeIndex(trees.get(currTreeIndex).getNode(), data, false);
                        // update node index
                        nodeIndexes.add(predictNode.getId());
                    }

                    for (Map.Entry<Integer, TreeNode> entry : localTodoNodes.entrySet()) {
                        // only do statistics on effective data
                        Node todoNode = entry.getValue().getNode();
                        int treeId = entry.getValue().getTreeId();
                        int currPredictIndex = 0;
                        if (DTWorker.this.isRF) {
                            currPredictIndex = nodeIndexes.get(entry.getValue().getTreeId());
                        }
                        if (DTWorker.this.isGBDT) {
                            currPredictIndex = nodeIndexes.get(0);
                        }

                        if (todoNode.getId() == currPredictIndex) {
                            List<Integer> features = entry.getValue().getFeatures();
                            if (features.isEmpty()) {
                                features = getAllValidFeatures();
                            }
                            for (Integer columnNum : features) {
                                double[] featuerStatistic = localStatistics.get(entry.getKey())
                                        .getFeatureStatistics().get(columnNum);
                                float weight = data.subsampleWeights[treeId % data.subsampleWeights.length];
                                if (Float.compare(weight, 0f) != 0) {
                                    // only compute weight is not 0
                                    short binIndex = data.inputs[DTWorker.this.inputIndexMap.get(columnNum)];
                                    DTWorker.this.impurity.featureUpdate(featuerStatistic, binIndex, data.output,
                                            data.significance, weight);
                                }
                            }
                        }
                    }
                }
                LOG.debug("Thread computing stats time is {}ms in thread {}",
                        TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start),
                        Thread.currentThread().getName());
                return localStatistics;
            }
        });
    }

    int rCnt = 0;
    while (rCnt < realThreadCount) {
        try {
            Map<Integer, NodeStats> currNodeStatsmap = completionService.take().get();
            if (rCnt == 0) {
                statistics = currNodeStatsmap;
            } else {
                for (Entry<Integer, NodeStats> entry : statistics.entrySet()) {
                    NodeStats resultNodeStats = entry.getValue();
                    mergeNodeStats(resultNodeStats, currNodeStatsmap.get(entry.getKey()));
                }
            }
        } catch (ExecutionException e) {
            throw new RuntimeException(e);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
        rCnt += 1;
    }
    LOG.debug("Compute stats time is {}ms", TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));

    LOG.info(
            "worker count is {}, error is {}, and stats size is {}. weightedTrainCount {}, weightedValidationCount {}, trainError {}, validationError {}",
            count, trainError, statistics.size(), weightedTrainCount, weightedValidationCount, trainError,
            validationError);

    return new DTWorkerParams(weightedTrainCount, weightedValidationCount, trainError, validationError,
            statistics);
}
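Two nextDouble() idioms appear in doCompute() above: a dropout gate that skips a prior tree's prediction with probability dropOutRate, and a bagging gate that sets a record's subsample weight to 1 or 0 with probability baggingSampleRate. A minimal, self-contained sketch of the dropout gate follows; all identifiers here are illustrative, not from Shifu.

import java.util.Random;

public class DropoutGateSketch {
    public static void main(String[] args) {
        double dropOutRate = 0.1;   // skip an earlier tree ~10% of the time
        double learningRate = 0.05;
        Random dropOutRandom = new Random();
        float prediction = 0f;
        double treePredict = 0.8;   // score from one earlier tree
        boolean drop = dropOutRate > 0.0 && dropOutRandom.nextDouble() < dropOutRate;
        if (!drop) {
            prediction += (float) (learningRate * treePredict);
        }
        System.out.println("prediction=" + prediction + " dropped=" + drop);
    }
}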
From source file:ml.shifu.shifu.core.dtrain.wdl.WDLWorker.java
/**
 * Add to training set or validation set according to validation rate.
 *
 * @param hashcode
 *            the hash code of the data
 * @param data
 *            data instance
 * @param attachment
 *            whether this record is flagged as validation data
 * @return true if the data is added to the training set, false otherwise.
 */
protected boolean addDataPairToDataSet(long hashcode, Data data, Object attachment) {
    // if validation data from configured validation data set
    boolean isValidation = (attachment != null && attachment instanceof Boolean) ? (Boolean) attachment : false;

    if (this.isKFoldCV) {
        int k = this.modelConfig.getTrain().getNumKFold();
        if (hashcode % k == this.trainerId) {
            this.validationData.append(data);
            if (isPositive(data.label)) {
                this.positiveValidationCount += 1L;
            } else {
                this.negativeValidationCount += 1L;
            }
            return false;
        } else {
            this.trainingData.append(data);
            if (isPositive(data.label)) {
                this.positiveTrainCount += 1L;
            } else {
                this.negativeTrainCount += 1L;
            }
            return true;
        }
    }

    if (this.isManualValidation) {
        if (isValidation) {
            this.validationData.append(data);
            if (isPositive(data.label)) {
                this.positiveValidationCount += 1L;
            } else {
                this.negativeValidationCount += 1L;
            }
            return false;
        } else {
            this.trainingData.append(data);
            if (isPositive(data.label)) {
                this.positiveTrainCount += 1L;
            } else {
                this.negativeTrainCount += 1L;
            }
            return true;
        }
    } else {
        if (Double.compare(this.modelConfig.getValidSetRate(), 0d) != 0) {
            int classValue = (int) (data.label + 0.01f);
            Random random = null;
            if (this.isStratifiedSampling) {
                // each class use one random instance
                random = validationRandomMap.get(classValue);
                if (random == null) {
                    random = new Random();
                    this.validationRandomMap.put(classValue, random);
                }
            } else {
                // all data use one random instance
                random = validationRandomMap.get(0);
                if (random == null) {
                    random = new Random();
                    this.validationRandomMap.put(0, random);
                }
            }

            if (this.modelConfig.isFixInitialInput()) {
                // for fix initial input, if hashcode%100 is in [start-hashcode, end-hashcode), validation,
                // otherwise training. start hashcode in different job is different to make sure bagging jobs have
                // different data. if end-hashcode is over 100, then check if hashcode is in [start-hashcode, 100]
                // or [0, end-hashcode]
                int startHashCode = (100 / this.modelConfig.getBaggingNum()) * this.trainerId;
                int endHashCode = startHashCode
                        + Double.valueOf(this.modelConfig.getValidSetRate() * 100).intValue();
                if (isInRange(hashcode, startHashCode, endHashCode)) {
                    this.validationData.append(data);
                    if (isPositive(data.label)) {
                        this.positiveValidationCount += 1L;
                    } else {
                        this.negativeValidationCount += 1L;
                    }
                    return false;
                } else {
                    this.trainingData.append(data);
                    if (isPositive(data.label)) {
                        this.positiveTrainCount += 1L;
                    } else {
                        this.negativeTrainCount += 1L;
                    }
                    return true;
                }
            } else {
                // not fixed initial input, if random value >= validRate, training, otherwise validation.
                if (random.nextDouble() >= this.modelConfig.getValidSetRate()) {
                    this.trainingData.append(data);
                    if (isPositive(data.label)) {
                        this.positiveTrainCount += 1L;
                    } else {
                        this.negativeTrainCount += 1L;
                    }
                    return true;
                } else {
                    this.validationData.append(data);
                    if (isPositive(data.label)) {
                        this.positiveValidationCount += 1L;
                    } else {
                        this.negativeValidationCount += 1L;
                    }
                    return false;
                }
            }
        } else {
            this.trainingData.append(data);
            if (isPositive(data.label)) {
                this.positiveTrainCount += 1L;
            } else {
                this.negativeTrainCount += 1L;
            }
            return true;
        }
    }
}
From source file:ml.shifu.shifu.core.dtrain.lr.LogisticRegressionWorker.java
/**
 * Add to training set or validation set according to validation rate.
 *
 * @param hashcode
 *            the hash code of the data
 * @param data
 *            data instance
 * @param isValidation
 *            whether this record is flagged as validation data
 * @return true if the data is added to the training set, false otherwise.
 */
protected boolean addDataPairToDataSet(long hashcode, Data data, boolean isValidation) {
    if (this.isKFoldCV) {
        int k = this.modelConfig.getTrain().getNumKFold();
        if (hashcode % k == this.trainerId) {
            this.validationData.append(data);
            if (isPositive(data.outputs[0])) {
                this.positiveValidationCount += 1L;
            } else {
                this.negativeValidationCount += 1L;
            }
            return false;
        } else {
            this.trainingData.append(data);
            if (isPositive(data.outputs[0])) {
                this.positiveTrainCount += 1L;
            } else {
                this.negativeTrainCount += 1L;
            }
            return true;
        }
    }

    if (this.isSpecificValidation) {
        if (isValidation) {
            this.validationData.append(data);
            if (isPositive(data.outputs[0])) {
                this.positiveValidationCount += 1L;
            } else {
                this.negativeValidationCount += 1L;
            }
            return false;
        } else {
            this.trainingData.append(data);
            if (isPositive(data.outputs[0])) {
                this.positiveTrainCount += 1L;
            } else {
                this.negativeTrainCount += 1L;
            }
            return true;
        }
    } else {
        if (Double.compare(this.modelConfig.getValidSetRate(), 0d) != 0) {
            int classValue = (int) (data.outputs[0] + 0.01f);
            Random random = null;
            if (this.isStratifiedSampling) {
                // each class use one random instance
                random = validationRandomMap.get(classValue);
                if (random == null) {
                    random = new Random();
                    this.validationRandomMap.put(classValue, random);
                }
            } else {
                // all data use one random instance
                random = validationRandomMap.get(0);
                if (random == null) {
                    random = new Random();
                    this.validationRandomMap.put(0, random);
                }
            }

            if (this.modelConfig.isFixInitialInput()) {
                // for fix initial input, if hashcode%100 is in [start-hashcode, end-hashcode), validation,
                // otherwise training. start hashcode in different job is different to make sure bagging jobs have
                // different data. if end-hashcode is over 100, then check if hashcode is in [start-hashcode, 100]
                // or [0, end-hashcode]
                int startHashCode = (100 / this.modelConfig.getBaggingNum()) * this.trainerId;
                int endHashCode = startHashCode
                        + Double.valueOf(this.modelConfig.getValidSetRate() * 100).intValue();
                if (isInRange(hashcode, startHashCode, endHashCode)) {
                    this.validationData.append(data);
                    if (isPositive(data.outputs[0])) {
                        this.positiveValidationCount += 1L;
                    } else {
                        this.negativeValidationCount += 1L;
                    }
                    return false;
                } else {
                    this.trainingData.append(data);
                    if (isPositive(data.outputs[0])) {
                        this.positiveTrainCount += 1L;
                    } else {
                        this.negativeTrainCount += 1L;
                    }
                    return true;
                }
            } else {
                // not fixed initial input, if random value >= validRate, training, otherwise validation.
                if (random.nextDouble() >= this.modelConfig.getValidSetRate()) {
                    this.trainingData.append(data);
                    if (isPositive(data.outputs[0])) {
                        this.positiveTrainCount += 1L;
                    } else {
                        this.negativeTrainCount += 1L;
                    }
                    return true;
                } else {
                    this.validationData.append(data);
                    if (isPositive(data.outputs[0])) {
                        this.positiveValidationCount += 1L;
                    } else {
                        this.negativeValidationCount += 1L;
                    }
                    return false;
                }
            }
        } else {
            this.trainingData.append(data);
            if (isPositive(data.outputs[0])) {
                this.positiveTrainCount += 1L;
            } else {
                this.negativeTrainCount += 1L;
            }
            return true;
        }
    }
}
From source file:ubic.gemma.analysis.expression.coexpression.links.LinkAnalysisServiceImpl.java
/**
 * Write links as text. If "known genes only", only known genes will be displayed, even if the probe in question
 * targets other "types" of genes.
 *
 * @param la
 * @param wr
 */
private void writeLinks(final LinkAnalysis la, FilterConfig filterConfig, Writer wr) throws IOException {
    Map<CompositeSequence, Collection<Collection<Gene>>> probeToGeneMap = la.getProbeToGeneMap();
    ObjectArrayList links = la.getKeep();
    double subsetSize = la.getConfig().getSubsetSize();
    List<String> buf = new ArrayList<String>();
    if (la.getConfig().isSubset() && links.size() > subsetSize) {
        la.getConfig().setSubsetUsed(true);
    }
    wr.write(la.getConfig().toString());
    wr.write(filterConfig.toString());

    NumberFormat nf = NumberFormat.getInstance();
    nf.setMaximumFractionDigits(4);

    Integer probeDegreeThreshold = la.getConfig().getProbeDegreeThreshold();

    Transformer officialSymbolExtractor = new Transformer() {
        @Override
        public Object transform(Object input) {
            Gene g = (Gene) input;
            return g.getOfficialSymbol();
        }
    };

    int i = 0;
    int keptLinksCount = 0;
    Random generator = new Random();
    double rand = 0.0;
    double fraction = subsetSize / links.size();
    int skippedDueToDegree = 0;

    for (int n = links.size(); i < n; i++) {
        Object val = links.getQuick(i);
        if (val == null)
            continue;
        Link m = (Link) val;
        Double w = m.getWeight();
        assert w != null;
        int x = m.getx();
        int y = m.gety();

        if (probeDegreeThreshold > 0
                && (la.getProbeDegree(x) > probeDegreeThreshold || la.getProbeDegree(y) > probeDegreeThreshold)) {
            skippedDueToDegree++;
            continue;
        }

        CompositeSequence p1 = la.getProbe(x);
        CompositeSequence p2 = la.getProbe(y);

        Collection<Collection<Gene>> g1 = probeToGeneMap.get(p1);
        Collection<Collection<Gene>> g2 = probeToGeneMap.get(p2);

        List<String> genes1 = new ArrayList<String>();
        for (Collection<Gene> cluster : g1) {
            if (cluster.isEmpty())
                continue;
            String t = StringUtils.join(new TransformIterator(cluster.iterator(), officialSymbolExtractor), ",");
            genes1.add(t);
        }

        List<String> genes2 = new ArrayList<String>();
        for (Collection<Gene> cluster : g2) {
            if (cluster.isEmpty())
                continue;
            String t = StringUtils.join(new TransformIterator(cluster.iterator(), officialSymbolExtractor), ",");
            genes2.add(t);
        }

        if (genes2.size() == 0 || genes1.size() == 0) {
            continue;
        }

        String gene1String = StringUtils.join(genes1.iterator(), "|");
        String gene2String = StringUtils.join(genes2.iterator(), "|");

        if (gene1String.equals(gene2String)) {
            continue;
        }

        if (++keptLinksCount % 50000 == 0) {
            log.info(keptLinksCount + " links retained");
        }

        if (la.getConfig().isSubsetUsed()) {
            rand = generator.nextDouble();
            if (rand > fraction)
                continue;
        }

        buf.add(p1.getId() + "\t" + p2.getId() + "\t" + gene1String + "\t" + gene2String + "\t" + nf.format(w)
                + "\n"); // save links
    }

    wr.write("# totalLinks:" + keptLinksCount + "\n");
    wr.write("# printedLinks:" + buf.size() + "\n");
    wr.write("# skippedDueToHighNodeDegree:" + skippedDueToDegree + "\n");

    for (String line : buf) { // write links to file
        wr.write(line);
    }

    if (la.getConfig().isSubsetUsed()) { // subset option activated
        log.info("Done, " + keptLinksCount + "/" + links.size() + " links kept, " + buf.size()
                + " links printed");
    } else {
        log.info("Done, " + keptLinksCount + "/" + links.size()
                + " links printed (some may have been filtered)");
    }
    wr.flush();
}
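The subsetting step above keeps each link with probability fraction = subsetSize / links.size(), so roughly subsetSize links survive in expectation. A minimal, self-contained sketch of that idiom (all identifiers illustrative):

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

public class SubsetSketch {
    public static void main(String[] args) {
        int totalSize = 1_000_000;
        double subsetSize = 50_000;
        double fraction = subsetSize / totalSize; // per-item keep probability
        Random generator = new Random();
        List<Integer> printed = new ArrayList<>();
        for (int i = 0; i < totalSize; i++) {
            if (generator.nextDouble() > fraction) {
                continue; // skip this item
            }
            printed.add(i);
        }
        System.out.println("printed " + printed.size() + " of " + totalSize);
    }
}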
From source file:com.ebay.erl.mobius.core.mapred.MobiusInputSampler.java
@Override
public Object[] getSample(InputFormat inf, JobConf job) throws IOException {
    // the following codes are copied from {@link InputSampler#RandomSampler},
    // but require some modifications.
    InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
    ArrayList<DataJoinKey> samples = new ArrayList<DataJoinKey>(this.numSamples);
    int splitsToSample = Math.min(this.maxSplitsSampled, splits.length);

    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    // get Sorters
    Sorter[] sorters = null;
    if (job.get(ConfigureConstants.SORTERS, null) != null) {
        // total sort job
        sorters = (Sorter[]) SerializableUtil.deserializeFromBase64(job.get(ConfigureConstants.SORTERS), job);
    } else {
        // there is no sorter, should be reducer/join job
        Column[] keys = (Column[]) SerializableUtil
                .deserializeFromBase64(job.get(ConfigureConstants.ALL_GROUP_KEY_COLUMNS), job);
        sorters = new Sorter[keys.length];
        for (int i = 0; i < keys.length; i++) {
            sorters[i] = new Sorter(keys[i].getInputColumnName(), Ordering.ASC);
        }
    }

    long proportion = 10L;
    while ((int) (this.freq * proportion) == 0) {
        proportion = proportion * 10;
    }
    proportion = 5L * proportion;

    // shuffle splits
    for (int i = 0; i < splits.length; ++i) {
        InputSplit tmp = splits[i];
        int j = r.nextInt(splits.length);
        splits[i] = splits[j];
        splits[j] = tmp;
    }

    SamplingOutputCollector collector = new SamplingOutputCollector();
    for (int i = 0; i < splitsToSample || (i < splits.length && samples.size() < numSamples); i++) {
        LOGGER.info("Sampling from split #" + (i + 1) + ", collected samples:" + samples.size());
        RecordReader<WritableComparable, WritableComparable> reader = inf.getRecordReader(splits[i], job,
                Reporter.NULL);
        WritableComparable key = reader.createKey();
        WritableComparable value = reader.createValue();

        if (!(inf instanceof MobiusDelegatingInputFormat)) {
            // not mobius delegating input format, so the CURRENT_DATASET_ID
            // will not be set by inf#getRecordReader, we set them here.
            //
            // set the current dataset id, as the AbstractMobiusMapper#configure
            // method needs this property.
            job.set(ConfigureConstants.CURRENT_DATASET_ID, job.get(ConfigureConstants.ALL_DATASET_IDS));
        }

        Byte datasetID = Byte.valueOf(job.get(ConfigureConstants.CURRENT_DATASET_ID));
        LOGGER.info("Samples coming from dataset: " + datasetID.toString());

        AbstractMobiusMapper mapper = this.getMapper(inf, splits[i], job);
        mapper.configure(job);

        // reading elements from one split
        long readElement = 0;
        while (reader.next(key, value)) {
            collector.clear();
            Tuple tuple = mapper.parse(key, value);

            readElement++;
            if (readElement > (((long) numSamples) * ((long) proportion))) {
                // a split might be very big (ex: a large gz file),
                // so we just need to read the first numSamples * proportion records
                break;
            }

            if (r.nextDouble() <= freq) {
                if (samples.size() < numSamples) {
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    // joinmap function might generate more than one output key
                    // per <code>key</code> input.
                    for (Tuple t : collector.getOutKey()) {
                        Tuple mt = Tuple.merge(tuple, t);
                        DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                        samples.add(nkey);
                    }
                } else {
                    // When exceeding the maximum number of samples, replace
                    // a random element with this one, then adjust the
                    // frequency to reflect the possibility of existing
                    // elements being pushed out
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    for (Tuple t : collector.getOutKey()) {
                        int ind = r.nextInt(numSamples);
                        if (ind != numSamples) {
                            Tuple mt = Tuple.merge(tuple, t);
                            DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                            samples.set(ind, nkey);
                        }
                    }
                    freq *= (numSamples - collector.getOutKey().size()) / (double) numSamples;
                }
                key = reader.createKey();
                value = reader.createValue();
            }
        }
        reader.close();
    }
    LOGGER.info("Samples have been collected, return.");
    return samples.toArray();
}
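The sampler above follows Hadoop's RandomSampler pattern: accept each record with probability freq, and once the sample buffer is full, overwrite a random slot and lower freq so earlier picks can still be displaced. A minimal, self-contained sketch of that pattern (identifiers illustrative, one element per record rather than a joinmap expansion):

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

public class FrequencySamplerSketch {
    public static void main(String[] args) {
        final int numSamples = 10;
        double freq = 0.01; // initial per-record acceptance probability
        Random r = new Random();
        List<Integer> samples = new ArrayList<>(numSamples);
        for (int record = 0; record < 1_000_000; record++) {
            if (r.nextDouble() <= freq) {
                if (samples.size() < numSamples) {
                    samples.add(record);
                } else {
                    // buffer full: replace a random slot, then shrink freq to
                    // keep each record's overall chance of surviving roughly equal
                    samples.set(r.nextInt(numSamples), record);
                    freq *= (numSamples - 1) / (double) numSamples;
                }
            }
        }
        System.out.println(samples);
    }
}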
From source file:ml.shifu.shifu.core.dtrain.nn.AbstractNNWorker.java
/**
 * Add to training set or validation set according to validation rate.
 *
 * @param hashcode
 *            the hash code of the data
 * @param pair
 *            data instance
 * @param isValidation
 *            whether this record is flagged as validation data
 * @return true if the data is added to the training set, false otherwise.
 */
protected boolean addDataPairToDataSet(long hashcode, FloatMLDataPair pair, boolean isValidation) {
    if (this.isKFoldCV) {
        int k = this.modelConfig.getTrain().getNumKFold();
        if (hashcode % k == this.trainerId) {
            this.validationData.add(pair);
            if (isPositive(pair.getIdealArray()[0])) {
                this.positiveValidationCount += 1L;
            } else {
                this.negativeValidationCount += 1L;
            }
            return false;
        } else {
            this.trainingData.add(pair);
            if (isPositive(pair.getIdealArray()[0])) {
                this.positiveTrainCount += 1L;
            } else {
                this.negativeTrainCount += 1L;
            }
            return true;
        }
    }

    if (this.isSpecificValidation) {
        if (isValidation) {
            this.validationData.add(pair);
            if (isPositive(pair.getIdealArray()[0])) {
                this.positiveValidationCount += 1L;
            } else {
                this.negativeValidationCount += 1L;
            }
            return false;
        } else {
            this.trainingData.add(pair);
            if (isPositive(pair.getIdealArray()[0])) {
                this.positiveTrainCount += 1L;
            } else {
                this.negativeTrainCount += 1L;
            }
            return true;
        }
    } else {
        if (Double.compare(this.modelConfig.getValidSetRate(), 0d) != 0) {
            int classValue = (int) (pair.getIdealArray()[0] + 0.01f);
            Random random = null;
            if (this.isStratifiedSampling) {
                // each class use one random instance
                random = validationRandomMap.get(classValue);
                if (random == null) {
                    random = new Random();
                    this.validationRandomMap.put(classValue, random);
                }
            } else {
                // all data use one random instance
                random = validationRandomMap.get(0);
                if (random == null) {
                    random = new Random();
                    this.validationRandomMap.put(0, random);
                }
            }

            if (this.modelConfig.isFixInitialInput()) {
                // for fix initial input, if hashcode%100 is in [start-hashcode, end-hashcode), validation,
                // otherwise training. start hashcode in different job is different to make sure bagging jobs have
                // different data. if end-hashcode is over 100, then check if hashcode is in [start-hashcode, 100]
                // or [0, end-hashcode]
                int startHashCode = (100 / this.modelConfig.getBaggingNum()) * this.trainerId;
                int endHashCode = startHashCode
                        + Double.valueOf(this.modelConfig.getValidSetRate() * 100).intValue();
                if (isInRange(hashcode, startHashCode, endHashCode)) {
                    this.validationData.add(pair);
                    if (isPositive(pair.getIdealArray()[0])) {
                        this.positiveValidationCount += 1L;
                    } else {
                        this.negativeValidationCount += 1L;
                    }
                    return false;
                } else {
                    this.trainingData.add(pair);
                    if (isPositive(pair.getIdealArray()[0])) {
                        this.positiveTrainCount += 1L;
                    } else {
                        this.negativeTrainCount += 1L;
                    }
                    return true;
                }
            } else {
                // not fixed initial input, if random value >= validRate, training, otherwise validation.
                if (random.nextDouble() >= this.modelConfig.getValidSetRate()) {
                    this.trainingData.add(pair);
                    if (isPositive(pair.getIdealArray()[0])) {
                        this.positiveTrainCount += 1L;
                    } else {
                        this.negativeTrainCount += 1L;
                    }
                    return true;
                } else {
                    this.validationData.add(pair);
                    if (isPositive(pair.getIdealArray()[0])) {
                        this.positiveValidationCount += 1L;
                    } else {
                        this.negativeValidationCount += 1L;
                    }
                    return false;
                }
            }
        } else {
            this.trainingData.add(pair);
            if (isPositive(pair.getIdealArray()[0])) {
                this.positiveTrainCount += 1L;
            } else {
                this.negativeTrainCount += 1L;
            }
            return true;
        }
    }
}
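The WDLWorker, LogisticRegressionWorker, and AbstractNNWorker methods above all reduce to the same nextDouble() idiom: draw one uniform value per record and route it to training when the draw is at or above the validation rate, to validation otherwise. A minimal, self-contained sketch of just that split (class and constant names are illustrative, not from Shifu):

import java.util.Random;

public class ValidationSplitSketch {
    private static final double VALID_SET_RATE = 0.2; // ~20% held out

    public static void main(String[] args) {
        Random random = new Random(1L); // one shared generator, as in the workers
        int train = 0, validation = 0;
        for (int i = 0; i < 100_000; i++) {
            if (random.nextDouble() >= VALID_SET_RATE) {
                train++;        // kept for training
            } else {
                validation++;   // held out for validation
            }
        }
        System.out.println("train=" + train + " validation=" + validation);
    }
}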
From source file:org.roaringbitmap.TestRoaringBitmap.java
@Test
public void flipTestBig() {
    final int numCases = 1000;
    System.out.println("flipTestBig for " + numCases + " tests");
    final RoaringBitmap rb = new RoaringBitmap();
    final BitSet bs = new BitSet();
    final Random r = new Random(3333);
    int checkTime = 2;

    for (int i = 0; i < numCases; ++i) {
        final int start = r.nextInt(65536 * 20);
        int end = r.nextInt(65536 * 20);
        if (r.nextDouble() < 0.1)
            end = start + r.nextInt(100);
        rb.flip(start, end);
        if (start < end)
            bs.flip(start, end); // throws exception otherwise

        // insert some more ANDs to keep things sparser
        if (r.nextDouble() < 0.2) {
            final RoaringBitmap mask = new RoaringBitmap();
            final BitSet mask1 = new BitSet();
            final int startM = r.nextInt(65536 * 20);
            final int endM = startM + 100000;
            mask.flip(startM, endM);
            mask1.flip(startM, endM);
            mask.flip(0, 65536 * 20 + 100000);
            mask1.flip(0, 65536 * 20 + 100000);
            rb.and(mask);
            bs.and(mask1);
        }

        // see if we can detect incorrectly shared containers
        if (r.nextDouble() < 0.1) {
            final RoaringBitmap irrelevant = RoaringBitmap.flip(rb, 10, 100000);
            irrelevant.flip(5, 200000);
            irrelevant.flip(190000, 260000);
        }

        if (i > checkTime) {
            Assert.assertTrue(equals(bs, rb));
            checkTime *= 1.5;
        }
    }
}
From source file:org.dllearner.algorithms.qtl.experiments.QTLEvaluation.java
private Pair<List<String>, List<String>> generateNoise(List<String> examples, String sparqlQuery, double noise,
        Random randomGen) {
    // generate noise example candidates
    List<String> noiseCandidateExamples = null;
    switch (noiseMethod) {
    case RANDOM:
        noiseCandidateExamples = generateNoiseCandidatesRandom(examples, 20);
        break;
    case SIMILAR:
        noiseCandidateExamples = generateNoiseCandidatesSimilar(examples, sparqlQuery, 20);
        break;
    case SIMILARITY_PARAMETERIZED: // TODO implement configurable noise method
        break;
    default:
        noiseCandidateExamples = generateNoiseCandidatesRandom(examples, 20);
        break;
    }
    Collections.shuffle(noiseCandidateExamples, randomGen);

    // add some noise by using instances close to the positive examples
    // we have two ways of adding noise t_n
    // 1: iterate over pos. examples and if random number is below t_n, replace the example
    // 2: replace the (#posExamples * t_n) randomly chosen pos. examples by randomly chosen negative examples
    boolean probabilityBased = false;

    if (probabilityBased) {
        // 1. way
        List<String> newExamples = new ArrayList<>();
        for (Iterator<String> iterator = examples.iterator(); iterator.hasNext();) {
            String posExample = iterator.next();
            double rnd = randomGen.nextDouble();
            if (rnd <= noise) {
                // remove the positive example
                iterator.remove();
                // add one of the negative examples
                String negExample = noiseCandidateExamples.remove(0);
                newExamples.add(negExample);
                logger.info("Replacing " + posExample + " by " + negExample);
            }
        }
        examples.addAll(newExamples);
        return null;
    } else {
        // 2. way
        // replace at least 1 but not more than half of the examples
        int upperBound = examples.size() / 2;
        int nrOfPosExamples2Replace = (int) Math.ceil(noise * examples.size());
        nrOfPosExamples2Replace = Math.min(nrOfPosExamples2Replace, upperBound);

        logger.info("replacing " + nrOfPosExamples2Replace + "/" + examples.size()
                + " examples to introduce noise");
        List<String> posExamples2Replace = new ArrayList<>(examples.subList(0, nrOfPosExamples2Replace));
        examples.removeAll(posExamples2Replace);
        List<String> negExamples4Replacement = noiseCandidateExamples.subList(0, nrOfPosExamples2Replace);
        List<String> noiseExamples = new ArrayList<>(negExamples4Replacement);
        List<String> correctExamples = new ArrayList<>(examples);
        examples.addAll(negExamples4Replacement);
        logger.info("replaced " + posExamples2Replace + " by " + negExamples4Replacement);
        return new Pair<>(correctExamples, noiseExamples);
    }
}
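The probability-based branch above replaces each positive example independently with probability `noise`, drawing nextDouble() once per element. A minimal, self-contained sketch of just that branch (identifiers illustrative, not from the QTL code base; requires Java 9+ for List.of):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

public class NoiseInjectionSketch {
    public static void main(String[] args) {
        double noise = 0.3; // each positive example is replaced with probability 0.3
        Random randomGen = new Random(7L);
        List<String> examples = new ArrayList<>(List.of("a", "b", "c", "d", "e"));
        List<String> candidates = new ArrayList<>(List.of("n1", "n2", "n3", "n4", "n5"));
        List<String> replacements = new ArrayList<>();
        for (Iterator<String> it = examples.iterator(); it.hasNext();) {
            String posExample = it.next();
            if (randomGen.nextDouble() <= noise) {
                it.remove();                            // drop the positive example
                replacements.add(candidates.remove(0)); // swap in a noise candidate
            }
        }
        examples.addAll(replacements);
        System.out.println(examples);
    }
}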
From source file:org.roaringbitmap.TestRoaringBitmap.java
@Test
public void flipTestBigA() {
    final int numCases = 1000;
    final BitSet bs = new BitSet();
    final Random r = new Random(3333);
    int checkTime = 2;
    RoaringBitmap rb1 = new RoaringBitmap(), rb2 = null; // alternate between them

    for (int i = 0; i < numCases; ++i) {
        final int start = r.nextInt(65536 * 20);
        int end = r.nextInt(65536 * 20);
        if (r.nextDouble() < 0.1)
            end = start + r.nextInt(100);

        if ((i & 1) == 0) {
            rb2 = RoaringBitmap.flip(rb1, start, end);
            // tweak the other, catch bad sharing
            int r1 = r.nextInt(65536 * 20);
            int r2 = r.nextInt(65536 * 20);
            rb1.flip(r1, r2);
        } else {
            rb1 = RoaringBitmap.flip(rb2, start, end);
            int r1 = r.nextInt(65536 * 20);
            int r2 = r.nextInt(65536 * 20);
            rb2.flip(r1, r2);
        }

        if (start < end) {
            bs.flip(start, end); // throws exception otherwise
        }

        // insert some more ANDs to keep things sparser
        if (r.nextDouble() < 0.2 && (i & 1) == 0) {
            final RoaringBitmap mask = new RoaringBitmap();
            final BitSet mask1 = new BitSet();
            final int startM = r.nextInt(65536 * 20);
            final int endM = startM + 100000;
            mask.flip(startM, endM);
            mask1.flip(startM, endM);
            mask.flip(0, 65536 * 20 + 100000);
            mask1.flip(0, 65536 * 20 + 100000);
            rb2.and(mask);
            bs.and(mask1);
        }

        if (i > checkTime) {
            System.out.println("check after " + i + ", card = " + rb2.getCardinality());
            final RoaringBitmap rb = (i & 1) == 0 ? rb2 : rb1;
            final boolean status = equals(bs, rb);
            Assert.assertTrue(status);
            checkTime *= 1.5;
        }
    }
}
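Both RoaringBitmap tests combine a fixed seed (new Random(3333)) with nextDouble() thresholds, so rare branches fire at a controlled rate while every run stays reproducible. A minimal, self-contained sketch of that testing pattern (identifiers illustrative):

import java.util.Random;

public class SeededGateSketch {
    public static void main(String[] args) {
        Random r = new Random(3333); // fixed seed: same branch pattern every run
        int shortRanges = 0, maskRounds = 0;
        for (int i = 0; i < 1000; i++) {
            if (r.nextDouble() < 0.1) shortRanges++; // rare branch, ~10% of iterations
            if (r.nextDouble() < 0.2) maskRounds++;  // sparsifying branch, ~20%
        }
        System.out.println(shortRanges + " short ranges, " + maskRounds + " mask rounds");
    }
}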