List of usage examples for java.util.BitSet.get
public boolean get(int bitIndex)
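Before the longer examples, here is a minimal, self-contained sketch (hypothetical class name BitSetGetDemo) showing what get(int bitIndex) returns for set, clear, and out-of-range indices:

import java.util.BitSet;

public class BitSetGetDemo {
  public static void main(String[] args) {
    BitSet bits = new BitSet(8);
    bits.set(1);
    bits.set(5);

    // get(int) returns true only for indices that have been set
    System.out.println(bits.get(1));   // true
    System.out.println(bits.get(2));   // false
    // indices beyond the logical size are simply false; only negative indices throw
    System.out.println(bits.get(100)); // false
  }
}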
From source file: org.apache.carbondata.hadoop.api.CarbonInputFormat.java
/**
 * get data blocks of given segment
 */
protected List<CarbonInputSplit> getDataBlocksOfSegment(JobContext job, CarbonTable carbonTable,
    Expression expression, BitSet matchedPartitions, List<Segment> segmentIds,
    PartitionInfo partitionInfo, List<Integer> oldPartitionIdList) throws IOException {
  QueryStatisticsRecorder recorder = CarbonTimeStatisticsFactory.createDriverRecorder();
  QueryStatistic statistic = new QueryStatistic();

  // get tokens for all the required FileSystem for table path
  TokenCache.obtainTokensForNamenodes(job.getCredentials(),
      new Path[] { new Path(carbonTable.getTablePath()) }, job.getConfiguration());
  List<ExtendedBlocklet> prunedBlocklets = getPrunedBlocklets(job, carbonTable, expression, segmentIds);

  List<CarbonInputSplit> resultFilteredBlocks = new ArrayList<>();
  int partitionIndex = 0;
  List<Integer> partitionIdList = new ArrayList<>();
  if (partitionInfo != null && partitionInfo.getPartitionType() != PartitionType.NATIVE_HIVE) {
    partitionIdList = partitionInfo.getPartitionIds();
  }
  for (ExtendedBlocklet blocklet : prunedBlocklets) {
    // OldPartitionIdList is only used in alter table partition command because it change
    // partition info first and then read data.
    // For other normal query should use newest partitionIdList
    if (partitionInfo != null && partitionInfo.getPartitionType() != PartitionType.NATIVE_HIVE) {
      long partitionId = CarbonTablePath.DataFileUtil
          .getTaskIdFromTaskNo(CarbonTablePath.DataFileUtil.getTaskNo(blocklet.getPath()));
      if (oldPartitionIdList != null) {
        partitionIndex = oldPartitionIdList.indexOf((int) partitionId);
      } else {
        partitionIndex = partitionIdList.indexOf((int) partitionId);
      }
    }
    if (partitionIndex != -1) {
      // matchedPartitions variable will be null in two cases as follows
      // 1. the table is not a partition table
      // 2. the table is a partition table, and all partitions are matched by query
      // for partition table, the task id of carbaondata file name is the partition id.
      // if this partition is not required, here will skip it.
      if (matchedPartitions == null || matchedPartitions.get(partitionIndex)) {
        resultFilteredBlocks.add(blocklet.getInputSplit());
      }
    }
  }
  statistic.addStatistics(QueryStatisticsConstants.LOAD_BLOCKS_DRIVER, System.currentTimeMillis());
  recorder.recordStatisticsForDriver(statistic, job.getConfiguration().get("query.id"));
  return resultFilteredBlocks;
}
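The pruning loop above relies on a simple convention: a null matchedPartitions BitSet means every partition is accepted, while otherwise get(partitionIndex) acts as the filter. A minimal sketch of that idiom (hypothetical names, not the CarbonData API):

import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;

class PartitionFilterSketch {
  // Keep block i when there is no filter (null BitSet) or its bit is set.
  static List<String> prune(List<String> blocks, BitSet matchedPartitions) {
    List<String> kept = new ArrayList<>();
    for (int i = 0; i < blocks.size(); i++) {
      if (matchedPartitions == null || matchedPartitions.get(i)) {
        kept.add(blocks.get(i));
      }
    }
    return kept;
  }
}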
From source file: org.apache.hadoop.hdfs.TestReconstructStripedFile.java
/**
 * Test the file blocks reconstruction.
 * 1. Check the replica is reconstructed in the target datanode,
 *    and verify the block replica length, generationStamp and content.
 * 2. Read the file and verify content.
 */
private void assertFileBlocksReconstruction(String fileName, int fileLen, ReconstructionType type,
    int toRecoverBlockNum) throws Exception {
  if (toRecoverBlockNum < 1 || toRecoverBlockNum > parityBlkNum) {
    Assert.fail("toRecoverBlockNum should be between 1 ~ " + parityBlkNum);
  }
  assertTrue("File length must be positive.", fileLen > 0);

  Path file = new Path(fileName);
  final byte[] data = new byte[fileLen];
  Arrays.fill(data, (byte) 1);
  DFSTestUtil.writeFile(fs, file, data);
  StripedFileTestUtil.waitBlockGroupsReported(fs, fileName);

  LocatedBlocks locatedBlocks = StripedFileTestUtil.getLocatedBlocks(file, fs);
  assertEquals(locatedBlocks.getFileLength(), fileLen);

  LocatedStripedBlock lastBlock = (LocatedStripedBlock) locatedBlocks.getLastLocatedBlock();
  DatanodeInfo[] storageInfos = lastBlock.getLocations();
  byte[] indices = lastBlock.getBlockIndices();

  BitSet bitset = new BitSet(dnNum);
  for (DatanodeInfo storageInfo : storageInfos) {
    bitset.set(dnMap.get(storageInfo));
  }

  int[] dead = generateDeadDnIndices(type, toRecoverBlockNum, indices);
  LOG.info("Note: indices == " + Arrays.toString(indices)
      + ". Generate errors on datanodes: " + Arrays.toString(dead));
  DatanodeInfo[] dataDNs = new DatanodeInfo[toRecoverBlockNum];
  int[] deadDnIndices = new int[toRecoverBlockNum];
  ExtendedBlock[] blocks = new ExtendedBlock[toRecoverBlockNum];
  File[] replicas = new File[toRecoverBlockNum];
  long[] replicaLengths = new long[toRecoverBlockNum];
  File[] metadatas = new File[toRecoverBlockNum];
  byte[][] replicaContents = new byte[toRecoverBlockNum][];
  Map<ExtendedBlock, DataNode> errorMap = new HashMap<>(dead.length);
  for (int i = 0; i < toRecoverBlockNum; i++) {
    dataDNs[i] = storageInfos[dead[i]];
    deadDnIndices[i] = dnMap.get(dataDNs[i]);

    // Check the block replica file on deadDn before it dead.
    blocks[i] = StripedBlockUtil.constructInternalBlock(lastBlock.getBlock(), cellSize, dataBlkNum,
        indices[dead[i]]);
    errorMap.put(blocks[i], cluster.getDataNodes().get(deadDnIndices[i]));
    replicas[i] = cluster.getBlockFile(deadDnIndices[i], blocks[i]);
    replicaLengths[i] = replicas[i].length();
    metadatas[i] = cluster.getBlockMetadataFile(deadDnIndices[i], blocks[i]);
    // the block replica on the datanode should be the same as expected
    assertEquals(replicaLengths[i], StripedBlockUtil.getInternalBlockLength(
        lastBlock.getBlockSize(), cellSize, dataBlkNum, indices[dead[i]]));
    assertTrue(metadatas[i].getName().endsWith(blocks[i].getGenerationStamp() + ".meta"));
    LOG.info("replica " + i + " locates in file: " + replicas[i]);
    replicaContents[i] = DFSTestUtil.readFileAsBytes(replicas[i]);
  }

  int lastGroupDataLen = fileLen % (dataBlkNum * blockSize);
  int lastGroupNumBlk = lastGroupDataLen == 0 ? dataBlkNum
      : Math.min(dataBlkNum, ((lastGroupDataLen - 1) / cellSize + 1));
  int groupSize = lastGroupNumBlk + parityBlkNum;

  // shutdown datanodes or generate corruption
  int stoppedDN = generateErrors(errorMap, type);

  // Check the locatedBlocks of the file again
  locatedBlocks = StripedFileTestUtil.getLocatedBlocks(file, fs);
  lastBlock = (LocatedStripedBlock) locatedBlocks.getLastLocatedBlock();
  storageInfos = lastBlock.getLocations();
  assertEquals(storageInfos.length, groupSize - stoppedDN);

  int[] targetDNs = new int[dnNum - groupSize];
  int n = 0;
  for (int i = 0; i < dnNum; i++) {
    if (!bitset.get(i)) { // not contain replica of the block.
      targetDNs[n++] = i;
    }
  }

  StripedFileTestUtil.waitForReconstructionFinished(file, fs, groupSize);

  targetDNs = sortTargetsByReplicas(blocks, targetDNs);

  // Check the replica on the new target node.
  for (int i = 0; i < toRecoverBlockNum; i++) {
    File replicaAfterReconstruction = cluster.getBlockFile(targetDNs[i], blocks[i]);
    LOG.info("replica after reconstruction " + replicaAfterReconstruction);
    File metadataAfterReconstruction = cluster.getBlockMetadataFile(targetDNs[i], blocks[i]);
    assertEquals(replicaLengths[i], replicaAfterReconstruction.length());
    LOG.info("replica before " + replicas[i]);
    assertTrue(metadataAfterReconstruction.getName().endsWith(blocks[i].getGenerationStamp() + ".meta"));
    byte[] replicaContentAfterReconstruction = DFSTestUtil.readFileAsBytes(replicaAfterReconstruction);
    Assert.assertArrayEquals(replicaContents[i], replicaContentAfterReconstruction);
  }
}
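In the test above, the BitSet records which datanodes already hold a replica of the block group, and !bitset.get(i) then selects reconstruction targets from the remaining nodes. A minimal sketch of that selection step, with hypothetical names and no HDFS dependencies:

import java.util.BitSet;

class TargetDnSketch {
  // Return the indices of datanodes whose bit is NOT set, i.e. nodes that
  // do not yet hold a replica and can serve as reconstruction targets.
  static int[] freeDatanodes(BitSet holdsReplica, int dnNum) {
    int[] targets = new int[dnNum - holdsReplica.cardinality()];
    int n = 0;
    for (int i = 0; i < dnNum; i++) {
      if (!holdsReplica.get(i)) {
        targets[n++] = i;
      }
    }
    return targets;
  }
}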
From source file: android.support.v7.widget.StaggeredGridLayoutManager2.java
/**
 * Checks for gaps if we've reached to the top of the list.
 * <p>
 * Intermediate gaps created by full span items are tracked via mLaidOutInvalidFullSpan field.
 */
View hasGapsToFix() {
  int startChildIndex = 0;
  int endChildIndex = getChildCount() - 1;
  BitSet mSpansToCheck = new BitSet(mSpanCount);
  mSpansToCheck.set(0, mSpanCount, true);

  final int firstChildIndex, childLimit;
  final int preferredSpanDir = mOrientation == VERTICAL && isLayoutRTL() ? 1 : -1;
  if (mShouldReverseLayout) {
    firstChildIndex = endChildIndex - 1;
    childLimit = startChildIndex - 1;
  } else {
    firstChildIndex = startChildIndex;
    childLimit = endChildIndex;
  }
  final int nextChildDiff = firstChildIndex < childLimit ? 1 : -1;
  for (int i = firstChildIndex; i != childLimit; i += nextChildDiff) {
    View child = getChildAt(i);
    LayoutParams lp = (LayoutParams) child.getLayoutParams();
    if (mSpansToCheck.get(lp.mSpan.mIndex)) {
      if (checkSpanForGap(lp.mSpan)) {
        return child;
      }
      mSpansToCheck.clear(lp.mSpan.mIndex);
    }
    if (lp.mFullSpan) {
      continue; // quick reject
    }
    if (i + nextChildDiff != childLimit) {
      View nextChild = getChildAt(i + nextChildDiff);
      boolean compareSpans = false;
      if (mShouldReverseLayout) {
        // ensure child's end is below nextChild's end
        int myEnd = mPrimaryOrientation.getDecoratedEnd(child);
        int nextEnd = mPrimaryOrientation.getDecoratedEnd(nextChild);
        if (myEnd < nextEnd) {
          return child; // i should have a better position
        } else if (myEnd == nextEnd) {
          compareSpans = true;
        }
      } else {
        int myStart = mPrimaryOrientation.getDecoratedStart(child);
        int nextStart = mPrimaryOrientation.getDecoratedStart(nextChild);
        if (myStart > nextStart) {
          return child; // i should have a better position
        } else if (myStart == nextStart) {
          compareSpans = true;
        }
      }
      if (compareSpans) { // equal, check span indices.
        LayoutParams nextLp = (LayoutParams) nextChild.getLayoutParams();
        if (lp.mSpan.mIndex - nextLp.mSpan.mIndex < 0 != preferredSpanDir < 0) {
          return child;
        }
      }
    }
  }
  // everything looks good
  return null;
}
From source file: org.apache.openjpa.kernel.StateManagerImpl.java
/**
 * Rollback state of the managed instance to the given savepoint.
 */
void rollbackToSavepoint(SavepointFieldManager savepoint) {
  _state = savepoint.getPCState();
  BitSet loaded = savepoint.getLoaded();
  for (int i = 0, len = loaded.length(); i < len; i++) {
    if (loaded.get(i) && savepoint.restoreField(i)) {
      provideField(savepoint.getCopy(), savepoint, i);
      replaceField(_pc, savepoint, i);
    }
  }
  _loaded = loaded;
  _dirty = savepoint.getDirty();
  _flush = savepoint.getFlushed();
  _version = savepoint.getVersion();
  _loadVersion = savepoint.getLoadVersion();
}
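The restore loop above scans every index up to loaded.length() and tests each with get(i). A small sketch of that pattern, plus the equivalent nextSetBit(...) traversal that visits only the set bits (hypothetical names, not the OpenJPA API):

import java.util.BitSet;

class RestoreLoopSketch {
  static void restore(BitSet loaded) {
    // Scan every index; length() is one past the highest set bit.
    for (int i = 0, len = loaded.length(); i < len; i++) {
      if (loaded.get(i)) {
        System.out.println("restore field " + i);
      }
    }
    // Same visit order, skipping clear bits directly.
    for (int i = loaded.nextSetBit(0); i >= 0; i = loaded.nextSetBit(i + 1)) {
      System.out.println("restore field " + i);
    }
  }
}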
From source file: android.support.v7.widget.StaggeredGridLayoutManager.java
/**
 * Checks for gaps if we've reached to the top of the list.
 * <p>
 * Intermediate gaps created by full span items are tracked via mLaidOutInvalidFullSpan field.
 */
View hasGapsToFix() {
  int startChildIndex = 0;
  int endChildIndex = getChildCount() - 1;
  BitSet mSpansToCheck = new BitSet(mSpanCount);
  mSpansToCheck.set(0, mSpanCount, true);

  final int firstChildIndex, childLimit;
  final int preferredSpanDir = mOrientation == VERTICAL && isLayoutRTL() ? 1 : -1;
  if (mShouldReverseLayout) {
    firstChildIndex = endChildIndex;
    childLimit = startChildIndex - 1;
  } else {
    firstChildIndex = startChildIndex;
    childLimit = endChildIndex + 1;
  }
  final int nextChildDiff = firstChildIndex < childLimit ? 1 : -1;
  for (int i = firstChildIndex; i != childLimit; i += nextChildDiff) {
    View child = getChildAt(i);
    LayoutParams lp = (LayoutParams) child.getLayoutParams();
    if (mSpansToCheck.get(lp.mSpan.mIndex)) {
      if (checkSpanForGap(lp.mSpan)) {
        return child;
      }
      mSpansToCheck.clear(lp.mSpan.mIndex);
    }
    if (lp.mFullSpan) {
      continue; // quick reject
    }
    if (i + nextChildDiff != childLimit) {
      View nextChild = getChildAt(i + nextChildDiff);
      boolean compareSpans = false;
      if (mShouldReverseLayout) {
        // ensure child's end is below nextChild's end
        int myEnd = mPrimaryOrientation.getDecoratedEnd(child);
        int nextEnd = mPrimaryOrientation.getDecoratedEnd(nextChild);
        if (myEnd < nextEnd) {
          return child; // i should have a better position
        } else if (myEnd == nextEnd) {
          compareSpans = true;
        }
      } else {
        int myStart = mPrimaryOrientation.getDecoratedStart(child);
        int nextStart = mPrimaryOrientation.getDecoratedStart(nextChild);
        if (myStart > nextStart) {
          return child; // i should have a better position
        } else if (myStart == nextStart) {
          compareSpans = true;
        }
      }
      if (compareSpans) { // equal, check span indices.
        LayoutParams nextLp = (LayoutParams) nextChild.getLayoutParams();
        if (lp.mSpan.mIndex - nextLp.mSpan.mIndex < 0 != preferredSpanDir < 0) {
          return child;
        }
      }
    }
  }
  // everything looks good
  return null;
}
From source file: com.jefftharris.passwdsafe.PasswdSafe.java
@Override
public boolean onPrepareOptionsMenu(Menu menu) {
  final BitSet options = new BitSet();
  options.set(MENU_BIT_HAS_CLOSE);

  itsFileDataFrag.useFileData(new PasswdFileDataUser() {
    @Override
    public void useFileData(@NonNull PasswdFileData fileData) {
      boolean fileEditable = fileData.canEdit();
      switch (itsCurrViewMode) {
      case VIEW_LIST: {
        options.set(MENU_BIT_CAN_ADD, fileEditable);
        options.set(MENU_BIT_HAS_SEARCH, true);
        if (fileEditable) {
          options.set(MENU_BIT_HAS_FILE_OPS, true);
          options.set(MENU_BIT_HAS_FILE_CHANGE_PASSWORD, fileData.isNotYubikey());
          options.set(MENU_BIT_HAS_FILE_PROTECT, true);
          options.set(MENU_BIT_PROTECT_ALL, itsLocation.getGroups().isEmpty());
        }
        if (fileData.canDelete()) {
          options.set(MENU_BIT_HAS_FILE_OPS, true);
          options.set(MENU_BIT_HAS_FILE_DELETE, true);
        }
        break;
      }
      case VIEW_RECORD: {
        options.set(MENU_BIT_CAN_ADD, fileEditable);
        break;
      }
      case INIT:
      case FILE_OPEN:
      case FILE_NEW:
      case VIEW_ABOUT:
      case VIEW_EXPIRATION:
      case VIEW_POLICY_LIST:
      case VIEW_PREFERENCES: {
        break;
      }
      case EDIT_RECORD:
      case CHANGING_PASSWORD: {
        options.set(MENU_BIT_HAS_CLOSE, false);
        break;
      }
      }
    }
  });

  MenuItem item = menu.findItem(R.id.menu_add);
  if (item != null) {
    item.setVisible(options.get(MENU_BIT_CAN_ADD));
  }

  item = menu.findItem(R.id.menu_close);
  if (item != null) {
    item.setVisible(options.get(MENU_BIT_HAS_CLOSE));
  }

  item = menu.findItem(R.id.menu_file_ops);
  if (item != null) {
    item.setVisible(options.get(MENU_BIT_HAS_FILE_OPS));
  }

  item = menu.findItem(R.id.menu_file_change_password);
  if (item != null) {
    item.setEnabled(options.get(MENU_BIT_HAS_FILE_CHANGE_PASSWORD));
  }

  if (options.get(MENU_BIT_HAS_FILE_OPS)) {
    boolean hasProtect = options.get(MENU_BIT_HAS_FILE_PROTECT);
    boolean viewProtectAll = options.get(MENU_BIT_PROTECT_ALL);
    item = menu.findItem(R.id.menu_file_protect_records);
    if (item != null) {
      item.setEnabled(hasProtect);
      item.setTitle(viewProtectAll ? R.string.protect_all : R.string.protect_group);
    }
    item = menu.findItem(R.id.menu_file_unprotect_records);
    if (item != null) {
      item.setEnabled(hasProtect);
      item.setTitle(viewProtectAll ? R.string.unprotect_all : R.string.unprotect_group);
    }

    item = menu.findItem(R.id.menu_file_delete);
    if (item != null) {
      item.setEnabled(options.get(MENU_BIT_HAS_FILE_DELETE));
    }
  }

  item = menu.findItem(R.id.menu_search);
  if (item != null) {
    item.setVisible(options.get(MENU_BIT_HAS_SEARCH));
  }

  return super.onPrepareOptionsMenu(menu);
}
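The menu example above treats a BitSet as a compact set of named boolean flags: integer constants name the bit positions, set(...) records capabilities, and get(...) later drives visibility. A stripped-down sketch of that idiom with hypothetical constants and no Android dependencies:

import java.util.BitSet;

class MenuOptionsSketch {
  static final int BIT_CAN_ADD = 0;
  static final int BIT_HAS_CLOSE = 1;
  static final int BIT_HAS_SEARCH = 2;

  public static void main(String[] args) {
    BitSet options = new BitSet();
    options.set(BIT_HAS_CLOSE);
    options.set(BIT_CAN_ADD, /* fileEditable = */ true); // set(int, boolean) variant

    // Unset bits simply read back as false.
    System.out.println("show add:    " + options.get(BIT_CAN_ADD));
    System.out.println("show close:  " + options.get(BIT_HAS_CLOSE));
    System.out.println("show search: " + options.get(BIT_HAS_SEARCH));
  }
}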
From source file: org.apache.hadoop.hdfs.TestRecoverStripedFile.java
/**
 * Test the file blocks recovery.
 * 1. Check the replica is recovered in the target datanode,
 *    and verify the block replica length, generationStamp and content.
 * 2. Read the file and verify content.
 */
private void assertFileBlocksRecovery(String fileName, int fileLen, int recovery, int toRecoverBlockNum)
    throws Exception {
  if (recovery != 0 && recovery != 1 && recovery != 2) {
    Assert.fail("Invalid recovery: 0 is to recovery parity blocks,"
        + "1 is to recovery data blocks, 2 is any.");
  }
  if (toRecoverBlockNum < 1 || toRecoverBlockNum > parityBlkNum) {
    Assert.fail("toRecoverBlockNum should be between 1 ~ " + parityBlkNum);
  }

  Path file = new Path(fileName);
  final byte[] data = new byte[fileLen];
  ThreadLocalRandom.current().nextBytes(data);
  DFSTestUtil.writeFile(fs, file, data);
  StripedFileTestUtil.waitBlockGroupsReported(fs, fileName);

  LocatedBlocks locatedBlocks = getLocatedBlocks(file);
  assertEquals(locatedBlocks.getFileLength(), fileLen);

  LocatedStripedBlock lastBlock = (LocatedStripedBlock) locatedBlocks.getLastLocatedBlock();
  DatanodeInfo[] storageInfos = lastBlock.getLocations();
  byte[] indices = lastBlock.getBlockIndices();

  BitSet bitset = new BitSet(dnNum);
  for (DatanodeInfo storageInfo : storageInfos) {
    bitset.set(dnMap.get(storageInfo));
  }

  int[] toDead = new int[toRecoverBlockNum];
  int n = 0;
  for (int i = 0; i < indices.length; i++) {
    if (n < toRecoverBlockNum) {
      if (recovery == 0) {
        if (indices[i] >= dataBlkNum) {
          toDead[n++] = i;
        }
      } else if (recovery == 1) {
        if (indices[i] < dataBlkNum) {
          toDead[n++] = i;
        }
      } else {
        toDead[n++] = i;
      }
    } else {
      break;
    }
  }

  DatanodeInfo[] dataDNs = new DatanodeInfo[toRecoverBlockNum];
  int[] deadDnIndices = new int[toRecoverBlockNum];
  ExtendedBlock[] blocks = new ExtendedBlock[toRecoverBlockNum];
  File[] replicas = new File[toRecoverBlockNum];
  File[] metadatas = new File[toRecoverBlockNum];
  byte[][] replicaContents = new byte[toRecoverBlockNum][];
  for (int i = 0; i < toRecoverBlockNum; i++) {
    dataDNs[i] = storageInfos[toDead[i]];
    deadDnIndices[i] = dnMap.get(dataDNs[i]);

    // Check the block replica file on deadDn before it dead.
    blocks[i] = StripedBlockUtil.constructInternalBlock(lastBlock.getBlock(), cellSize, dataBlkNum,
        indices[toDead[i]]);
    replicas[i] = cluster.getBlockFile(deadDnIndices[i], blocks[i]);
    metadatas[i] = cluster.getBlockMetadataFile(deadDnIndices[i], blocks[i]);
    // the block replica on the datanode should be the same as expected
    assertEquals(replicas[i].length(), StripedBlockUtil.getInternalBlockLength(
        lastBlock.getBlockSize(), cellSize, dataBlkNum, indices[toDead[i]]));
    assertTrue(metadatas[i].getName().endsWith(blocks[i].getGenerationStamp() + ".meta"));
    replicaContents[i] = DFSTestUtil.readFileAsBytes(replicas[i]);
  }

  int cellsNum = (fileLen - 1) / cellSize + 1;
  int groupSize = Math.min(cellsNum, dataBlkNum) + parityBlkNum;

  for (int i = 0; i < toRecoverBlockNum; i++) {
    /*
     * Kill the datanode which contains one replica
     * We need to make sure it dead in namenode: clear its update time and
     * trigger NN to check heartbeat.
     */
    DataNode dn = cluster.getDataNodes().get(deadDnIndices[i]);
    dn.shutdown();
    cluster.setDataNodeDead(dn.getDatanodeId());
  }

  // Check the locatedBlocks of the file again
  locatedBlocks = getLocatedBlocks(file);
  lastBlock = (LocatedStripedBlock) locatedBlocks.getLastLocatedBlock();
  storageInfos = lastBlock.getLocations();
  assertEquals(storageInfos.length, groupSize - toRecoverBlockNum);

  int[] targetDNs = new int[dnNum - groupSize];
  n = 0;
  for (int i = 0; i < dnNum; i++) {
    if (!bitset.get(i)) { // not contain replica of the block.
      targetDNs[n++] = i;
    }
  }

  waitForRecoveryFinished(file, groupSize);

  targetDNs = sortTargetsByReplicas(blocks, targetDNs);

  // Check the replica on the new target node.
  for (int i = 0; i < toRecoverBlockNum; i++) {
    File replicaAfterRecovery = cluster.getBlockFile(targetDNs[i], blocks[i]);
    File metadataAfterRecovery = cluster.getBlockMetadataFile(targetDNs[i], blocks[i]);
    assertEquals(replicaAfterRecovery.length(), replicas[i].length());
    assertTrue(metadataAfterRecovery.getName().endsWith(blocks[i].getGenerationStamp() + ".meta"));
    byte[] replicaContentAfterRecovery = DFSTestUtil.readFileAsBytes(replicaAfterRecovery);
    Assert.assertArrayEquals(replicaContents[i], replicaContentAfterRecovery);
  }
}
From source file: org.apache.openjpa.kernel.StateManagerImpl.java
/**
 * Internal version of {@link OpenJPAStateManager#getUnloaded} that avoids
 * creating an empty bit set by returning null when there are no unloaded
 * fields.
 */
private BitSet getUnloadedInternal(FetchConfiguration fetch, int mode, BitSet exclude) {
  if (exclude == StoreContext.EXCLUDE_ALL)
    return null;

  BitSet fields = null;
  FieldMetaData[] fmds = _meta.getFields();
  boolean load;
  for (int i = 0; i < fmds.length; i++) {
    if (_loaded.get(i) || (exclude != null && exclude.get(i)))
      continue;

    switch (mode) {
    case LOAD_SERIALIZE:
      load = !fmds[i].isTransient();
      break;
    case LOAD_FGS:
      load = fetch == null || fetch.requiresFetch(fmds[i]) != FetchConfiguration.FETCH_NONE;
      break;
    default: // LOAD_ALL
      load = true;
    }

    if (load) {
      if (fields == null)
        fields = new BitSet(fmds.length);
      fields.set(i);
    }
  }
  return fields;
}
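Besides get(i) as a per-field check, the method above shows a lazy-allocation idiom: the result BitSet is only created once the first unloaded field is found, and null stands for "nothing to load". A minimal sketch of that pattern under hypothetical names:

import java.util.BitSet;

class LazyBitSetSketch {
  // Returns a BitSet of the fields that still need loading, or null if none do.
  static BitSet unloaded(BitSet loaded, int fieldCount) {
    BitSet fields = null;
    for (int i = 0; i < fieldCount; i++) {
      if (loaded.get(i)) {
        continue; // already loaded
      }
      if (fields == null) {
        fields = new BitSet(fieldCount); // allocate only when actually needed
      }
      fields.set(i);
    }
    return fields;
  }
}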
From source file: com.tamingtext.util.SplitInput.java
/**
 * Perform a split on the specified input file. Results will be written to files of the same name in the specified
 * training and test output directories. The {@link #validate()} method is called prior to executing the split.
 */
public void splitFile(Path inputFile) throws IOException {
  if (fs.getFileStatus(inputFile) == null) {
    throw new IOException(inputFile + " does not exist");
  } else if (fs.getFileStatus(inputFile).isDir()) {
    throw new IOException(inputFile + " is a directory");
  }

  validate();

  Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
  Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

  int lineCount = countLines(fs, inputFile, charset);
  log.info("{} has {} lines", inputFile.getName(), lineCount);

  int testSplitStart = 0;
  int testSplitSize = this.testSplitSize; // don't modify state
  BitSet randomSel = null;

  if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
    testSplitSize = this.testRandomSelectionSize;
    if (testRandomSelectionPct > 0) {
      testSplitSize = Math.round(lineCount * (testRandomSelectionPct / 100.0f));
    }
    log.info("{} test split size is {} based on random selection percentage {}",
        new Object[] { inputFile.getName(), testSplitSize, testRandomSelectionPct });
    long[] ridx = new long[testSplitSize];
    RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
    randomSel = new BitSet(lineCount);
    for (long idx : ridx) {
      randomSel.set((int) idx + 1);
    }
  } else {
    if (testSplitPct > 0) { // calculate split size based on percentage
      testSplitSize = Math.round(lineCount * (testSplitPct / 100.0f));
      log.info("{} test split size is {} based on percentage {}",
          new Object[] { inputFile.getName(), testSplitSize, testSplitPct });
    } else {
      log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
    }

    if (splitLocation > 0) { // calculate start of split based on percentage
      testSplitStart = Math.round(lineCount * (splitLocation / 100.0f));
      if (lineCount - testSplitStart < testSplitSize) {
        // adjust split start downwards based on split size.
        testSplitStart = lineCount - testSplitSize;
      }
      log.info("{} test split start is {} based on split location {}",
          new Object[] { inputFile.getName(), testSplitStart, splitLocation });
    }

    if (testSplitStart < 0) {
      throw new IllegalArgumentException(
          "test split size for " + inputFile + " is too large, it would produce an "
              + "empty training set from the initial set of " + lineCount + " examples");
    } else if ((lineCount - testSplitSize) < testSplitSize) {
      log.warn("Test set size for {} may be too large, {} is larger than the number of "
          + "lines remaining in the training set: {}",
          new Object[] { inputFile, testSplitSize, lineCount - testSplitSize });
    }
  }

  BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
  Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
  Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

  int pos = 0;
  int trainCount = 0;
  int testCount = 0;

  String line;
  while ((line = reader.readLine()) != null) {
    pos++;

    Writer writer;
    if (testRandomSelectionPct > 0) { // Randomly choose
      writer = randomSel.get(pos) ? testWriter : trainingWriter;
    } else { // Choose based on location
      writer = pos > testSplitStart ? testWriter : trainingWriter;
    }

    if (writer == testWriter) {
      if (testCount >= testSplitSize) {
        writer = trainingWriter;
      } else {
        testCount++;
      }
    }
    if (writer == trainingWriter) {
      trainCount++;
    }
    writer.write(line);
    writer.write('\n');
  }

  IOUtils.close(Collections.singleton(trainingWriter));
  IOUtils.close(Collections.singleton(testWriter));

  log.info("file: {}, input: {} train: {}, test: {} starting at {}",
      new Object[] { inputFile.getName(), lineCount, trainCount, testCount, testSplitStart });

  // testing;
  if (callback != null) {
    callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
  }
}
From source file: org.apache.mahout.classifier.bayes.SplitBayesInput.java
/**
 * Perform a split on the specified input file. Results will be written to files of the same name in the specified
 * training and test output directories. The {@link #validate()} method is called prior to executing the split.
 */
public void splitFile(Path inputFile) throws IOException {
  if (fs.getFileStatus(inputFile) == null) {
    throw new IOException(inputFile + " does not exist");
  } else if (fs.getFileStatus(inputFile).isDir()) {
    throw new IOException(inputFile + " is a directory");
  }

  validate();

  Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
  Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

  int lineCount = countLines(fs, inputFile, charset);
  log.info("{} has {} lines", inputFile.getName(), lineCount);

  int testSplitStart = 0;
  int testSplitSize = this.testSplitSize; // don't modify state
  BitSet randomSel = null;

  if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
    testSplitSize = this.testRandomSelectionSize;
    if (testRandomSelectionPct > 0) {
      testSplitSize = Math.round(lineCount * testRandomSelectionPct / 100.0f);
    }
    log.info("{} test split size is {} based on random selection percentage {}",
        new Object[] { inputFile.getName(), testSplitSize, testRandomSelectionPct });
    long[] ridx = new long[testSplitSize];
    RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
    randomSel = new BitSet(lineCount);
    for (long idx : ridx) {
      randomSel.set((int) idx + 1);
    }
  } else {
    if (testSplitPct > 0) { // calculate split size based on percentage
      testSplitSize = Math.round(lineCount * testSplitPct / 100.0f);
      log.info("{} test split size is {} based on percentage {}",
          new Object[] { inputFile.getName(), testSplitSize, testSplitPct });
    } else {
      log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
    }

    if (splitLocation > 0) { // calculate start of split based on percentage
      testSplitStart = Math.round(lineCount * splitLocation / 100.0f);
      if (lineCount - testSplitStart < testSplitSize) {
        // adjust split start downwards based on split size.
        testSplitStart = lineCount - testSplitSize;
      }
      log.info("{} test split start is {} based on split location {}",
          new Object[] { inputFile.getName(), testSplitStart, splitLocation });
    }

    if (testSplitStart < 0) {
      throw new IllegalArgumentException(
          "test split size for " + inputFile + " is too large, it would produce an "
              + "empty training set from the initial set of " + lineCount + " examples");
    } else if (lineCount - testSplitSize < testSplitSize) {
      log.warn("Test set size for {} may be too large, {} is larger than the number of "
          + "lines remaining in the training set: {}",
          new Object[] { inputFile, testSplitSize, lineCount - testSplitSize });
    }
  }

  BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
  Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
  Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

  int trainCount = 0;
  int testCount = 0;
  try {
    String line;
    int pos = 0;
    while ((line = reader.readLine()) != null) {
      pos++;

      Writer writer;
      if (testRandomSelectionPct > 0) { // Randomly choose
        writer = randomSel.get(pos) ? testWriter : trainingWriter;
      } else { // Choose based on location
        writer = pos > testSplitStart ? testWriter : trainingWriter;
      }

      if (writer == testWriter) {
        if (testCount >= testSplitSize) {
          writer = trainingWriter;
        } else {
          testCount++;
        }
      }
      if (writer == trainingWriter) {
        trainCount++;
      }
      writer.write(line);
      writer.write('\n');
    }
  } finally {
    IOUtils.quietClose(reader);
    IOUtils.quietClose(trainingWriter);
    IOUtils.quietClose(testWriter);
  }

  log.info("file: {}, input: {} train: {}, test: {} starting at {}",
      new Object[] { inputFile.getName(), lineCount, trainCount, testCount, testSplitStart });

  // testing;
  if (callback != null) {
    callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
  }
}
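Both split utilities above use the same random-selection idiom: the sampled line numbers are marked in a BitSet up front, and randomSel.get(pos) then decides, while streaming the file, whether a line goes to the test set. A simplified sketch of that idiom with hypothetical names, using java.util.Random in place of Mahout's RandomSampler:

import java.util.BitSet;
import java.util.Random;

class RandomSplitSketch {
  // Mark sampleSize distinct 1-based line numbers as "test" lines.
  static BitSet sampleLines(int lineCount, int sampleSize, long seed) {
    BitSet selected = new BitSet(lineCount + 1);
    Random rnd = new Random(seed);
    while (selected.cardinality() < Math.min(sampleSize, lineCount)) {
      selected.set(rnd.nextInt(lineCount) + 1);
    }
    return selected;
  }

  // While reading the file, consult get(lineNumber) to route each line.
  static boolean goesToTestSet(BitSet selected, int lineNumber) {
    return selected.get(lineNumber);
  }
}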