List of usage examples for java.io DataOutput writeInt
void writeInt(int v) throws IOException;
Writes an int value, which is comprised of four bytes, to the output stream.
From source file: org.commoncrawl.service.listcrawler.HDFSFileIndex.java
public static void writeIndex(Vector<FingerprintAndOffsetTuple> offsetInfo, DataOutput indexFileOut) throws IOException { long firstFingerprint = offsetInfo.get(0)._fingerprint; BloomFilter bloomFilter = new BloomFilter(offsetInfo.size(), 0.001201); // sort the offset list by fingerprint Collections.sort(offsetInfo, new Comparator<FingerprintAndOffsetTuple>() { @Override/*from w w w. j a v a2s. c om*/ public int compare(FingerprintAndOffsetTuple o1, FingerprintAndOffsetTuple o2) { return (o1._fingerprint < o2._fingerprint) ? -1 : o1._fingerprint > o2._fingerprint ? 1 : 0; } }); // now we need to write the index out // allocate working set buffers ... ByteBuffer indexDataBuffer = ByteBuffer.allocate(offsetInfo.size() * 16); ByteBuffer indexHintsBuffer = ByteBuffer .allocate(((((offsetInfo.size() + INDEX_HINT_RECORD_INTERVAL) / INDEX_HINT_RECORD_INTERVAL) + 1) * INDEX_HINT_SIZE) + 4); // build index hints placeholder Vector<HDFSFileIndex.IndexItem> hints = new Vector<HDFSFileIndex.IndexItem>(); // 0 100 200 300 400 500 for (int i = 0; i < offsetInfo.size(); ++i) { if (i % INDEX_HINT_RECORD_INTERVAL == 0 || (i == (offsetInfo.size() - 1))) { HDFSFileIndex.IndexItem hint = new IndexItem(offsetInfo.get(i)._fingerprint, (int) offsetInfo.get(i)._offset); hints.add(hint); // add fingerprint to bloom filter bloomFilter.add(hint.fingerprint); } } // start off the index hints buffer with a hint of the index hint buffer size indexHintsBuffer.putInt(hints.size()); // track total bits used ... int bitsUsedForHints = 0; int bitsUsedForFingerprints = 0; int bitsUsedForOffsets = 0; // now start populating index data ... 
for (int hintIdx = 0; hintIdx < hints.size(); ++hintIdx) { HDFSFileIndex.IndexItem hint = hints.get(hintIdx); LOG.info("IndexWriter FP:" + hint.fingerprint); indexHintsBuffer.putLong(hint.fingerprint); indexHintsBuffer.putInt(hint.dataOffset); indexHintsBuffer.putInt(indexDataBuffer.position()); // update stats bitsUsedForHints += INDEX_HINT_SIZE * 8; if (hintIdx < hints.size() - 1) { // track cumilative delta and offset values (for average calc later) double cumilativeDelta = 0; long cumilativeOffset = 0; int subIndexItemCount = 0; int nonZeroDeltaCount = 0; Vector<HDFSFileIndex.IndexItem> subHints = new Vector<HDFSFileIndex.IndexItem>(); // initialize last fingerprint to indexed value ... long lastFingerprint = hint.fingerprint; // first collect values in between index hints for (int nonIndexItem = (hintIdx * INDEX_HINT_RECORD_INTERVAL) + 1; nonIndexItem < ((hintIdx + 1) * INDEX_HINT_RECORD_INTERVAL); ++nonIndexItem) { if (nonIndexItem >= offsetInfo.size()) break; // calculdate fingerprint delta ... long fingerprintDelta = offsetInfo.get(nonIndexItem)._fingerprint - lastFingerprint; LOG.info("IndexWriter FP:" + offsetInfo.get(nonIndexItem)._fingerprint + " Delta:" + fingerprintDelta); // offset delta if (fingerprintDelta != 0) { cumilativeDelta += (double) fingerprintDelta; LOG.info("Cumilative Delta is:" + cumilativeDelta); nonZeroDeltaCount++; } cumilativeOffset += offsetInfo.get(nonIndexItem)._offset; ++subIndexItemCount; // add to collection vector subHints.add(new IndexItem(fingerprintDelta, (int) offsetInfo.get(nonIndexItem)._offset)); // remember the last fingerpint ... 
lastFingerprint = offsetInfo.get(nonIndexItem)._fingerprint; // add item to bloom filter bloomFilter.add(lastFingerprint); } // calculate average delta value double averageDeltaValue = (double) cumilativeDelta / (double) nonZeroDeltaCount; // calculate m for fingerprint deltas int mForFingerprints = (int) Math.floor(lg(averageDeltaValue)); LOG.info("Average Delta Value is:" + averageDeltaValue + " m is:" + mForFingerprints); // cacluldate average offset value double averageOffsetValue = (double) cumilativeOffset / (double) subIndexItemCount; // calculate m for offsets int mForOffsets = (int) Math.floor(lg(averageOffsetValue)); // calculate rice codes RiceCoding riceCodeFP = new RiceCoding(mForFingerprints); RiceCoding riceCodeOffsets = new RiceCoding(mForOffsets); // populate bits for (HDFSFileIndex.IndexItem subItemHint : subHints) { if (subItemHint.fingerprint == 0) { LOG.warn("Zero Delta for Fingerprint Detected.There are two duplicate entires in log!"); } riceCodeFP.addItem(subItemHint.fingerprint + 1); riceCodeOffsets.addItem(subItemHint.dataOffset + 1); } // now track bits used ... 
bitsUsedForFingerprints += riceCodeFP.getNumBits(); bitsUsedForOffsets += riceCodeOffsets.getNumBits(); // write out metadata // save the current position int currentPosition = indexDataBuffer.position(); // fingerprint data indexDataBuffer.put((byte) mForFingerprints); CacheManager.writeVLongToByteBuffer(indexDataBuffer, riceCodeFP.getNumBits()); indexDataBuffer.put(riceCodeFP.getBits(), 0, (riceCodeFP.getNumBits() + 7) / 8); // offset data indexDataBuffer.put((byte) mForOffsets); CacheManager.writeVLongToByteBuffer(indexDataBuffer, riceCodeOffsets.getNumBits()); indexDataBuffer.put(riceCodeOffsets.getBits(), 0, (riceCodeOffsets.getNumBits() + 7) / 8); System.out.println("Item Count:" + subIndexItemCount + "FP Bits:" + subIndexItemCount * 64 + " Compressed:" + riceCodeFP.getNumBits() + " Offset Bits:" + subIndexItemCount * 32 + " Compressed:" + riceCodeOffsets.getNumBits()); LOG.info("Item Count:" + subIndexItemCount + "FP Bits:" + subIndexItemCount * 64 + " Compressed:" + riceCodeFP.getNumBits() + " Offset Bits:" + subIndexItemCount * 32 + " Compressed:" + riceCodeOffsets.getNumBits()); if ((subIndexItemCount * 64) < riceCodeFP.getNumBits()) { throw new RuntimeException("Compressed Size > UnCompressed Size!!!!"); } validateIndexData(indexDataBuffer.array(), currentPosition, hint.fingerprint, subHints, bloomFilter); } } if (!bloomFilter.isPresent(firstFingerprint)) { throw new RuntimeException("Test Failed!"); } // serialize bloomfilter ByteStream baos = new ByteStream(1 << 12); BloomFilter.serializer().serialize(bloomFilter, new DataOutputStream(baos)); // spit out final stats System.out.println(" Bloomfilter Size:" + baos.size() + " IndexHintBuffer Size:" + indexHintsBuffer.position() + " IndexDataBuffer Size:" + indexDataBuffer.position()); // now write out the final index file ... // bloom filter data ... 
indexFileOut.write(baos.getBuffer(), 0, baos.size()); // write hint data indexFileOut.write(indexHintsBuffer.array(), 0, indexHintsBuffer.position()); // write out rice code data size indexFileOut.writeInt(indexDataBuffer.position()); // finally rice coded sub-index data indexFileOut.write(indexDataBuffer.array(), 0, indexDataBuffer.position()); }