org.commoncrawl.service.listcrawler.HDFSFileIndex.java Source code

Introduction

Here is the source code for org.commoncrawl.service.listcrawler.HDFSFileIndex.java, CommonCrawl's on-disk index over the records of an HDFS SequenceFile. It combines a Bloom filter, a fixed-interval table of index hints, and Rice-coded delta blocks to map a 64-bit URL fingerprint to a record offset in the data file.
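
Before the listing, here is a minimal usage sketch. The HDFS and local paths are hypothetical placeholders, and `fingerprint` stands for a previously computed 64-bit URL fingerprint; the constructor and findItem signatures come from the source below:

    FileSystem fs = FileSystem.get(CrawlEnvironment.getHadoopConfig());
    // hypothetical locations -- substitute real index/data paths
    Path remoteIndex = new Path("/crawl/cache/index-1234567890");
    Path remoteData = new Path("/crawl/cache/data-1234567890");
    File localDir = new File("/tmp/index-cache");

    // copies the remote index file to localDir, loads it, and resolves a
    // 64-bit URL fingerprint to the CacheItem stored in the SequenceFile
    HDFSFileIndex index = new HDFSFileIndex(fs, remoteIndex, remoteData, localDir);
    CacheItem item = index.findItem(fingerprint, false /* checkOnly */);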

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.listcrawler;

import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.rmi.server.UID;
import java.security.MessageDigest;
import java.util.Collections;
import java.util.Comparator;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.CacheItem;
import org.commoncrawl.util.RiceCoding;
import org.commoncrawl.util.URLFingerprint;
import org.commoncrawl.util.BloomFilter;
import org.commoncrawl.util.ByteStream;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.IntrusiveList.IntrusiveListElement;

/** 
 * An index of the items contained in an HDFS SequenceFile, mapping 64-bit
 * URL fingerprints to record offsets in the data file.
 * 
 * @author rana
 *
 */
public class HDFSFileIndex {

    public static final Log LOG = LogFactory.getLog(HDFSFileIndex.class);

    public static final int INDEX_HINT_RECORD_INTERVAL = 100; // emit one index hint per 100 records
    public static final int INDEX_HINT_SIZE = 8 + 4 + 4; // long fingerprint + int dataOffset + int indexDataOffset

    private File _localIndexFilePath = null;
    private FileSystem _remoteFileSystem = null;
    private Path _remoteDataPath = null;
    private BloomFilter _bloomFilter = null;
    private ByteBuffer _indexHints = null;
    private int _indexHintCount = -1;
    private int _indexDataOffset = -1;
    private int _indexDataSize = -1;

    public HDFSFileIndex(FileSystem remoteFileSystem, Path remoteIndexFileLocation, Path remoteDataFileLocation,
            File localIndexDataDirectory) throws IOException {
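        // Pull the index file down to local disk so hint and block reads stay
        // local; the SequenceFile data itself remains on the remote filesystem.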
        _remoteFileSystem = remoteFileSystem;
        _remoteDataPath = remoteDataFileLocation;
        // create a local index file for the index
        _localIndexFilePath = new File(localIndexDataDirectory, remoteIndexFileLocation.getName());
        _localIndexFilePath.delete();

        LOG.info("Copying Remote Index Location:" + remoteIndexFileLocation + " to Local File Location:"
                + _localIndexFilePath);
        // copy over the index file 
        remoteFileSystem.copyToLocalFile(remoteIndexFileLocation, new Path(_localIndexFilePath.getAbsolutePath()));
        LOG.info("Done Copying Remote File. Loading Index");
        // load the index 
        loadIndexFromLocalFile();
    }

    public HDFSFileIndex(FileSystem remoteFileSystem, File localIndexFileLocation, Path remoteDataFileLocation)
            throws IOException {
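        // Variant for an index file that is already present on local disk.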
        _remoteFileSystem = remoteFileSystem;
        _remoteDataPath = remoteDataFileLocation;
        _localIndexFilePath = localIndexFileLocation;
        loadIndexFromLocalFile();
    }

    public long getIndexTimestamp() {
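        // The data file name is expected to end in "-<timestamp>"; returns 0
        // when the name carries no parsable timestamp.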
        try {
            Matcher m = Pattern.compile(".*-([0-9]*)").matcher(_remoteDataPath.getName());
            if (m.matches()) {
                return Long.parseLong(m.group(1));
            }
        } catch (Exception e) {
            LOG.error(CCStringUtils.stringifyException(e));
        }
        return 0L;
    }

    public Path getIndexDataPath() {
        return _remoteDataPath;
    }

    private void loadIndexFromLocalFile() throws IOException {
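        // Index file layout: [serialized bloom filter][int hintCount]
        // [hintCount * INDEX_HINT_SIZE bytes of hints][int indexDataSize]
        // [rice-coded sub-index blocks]. The blocks are left on disk and
        // demand-loaded later; only their starting offset is recorded here.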
        LOG.info("Loading Index from Local File:" + _localIndexFilePath);
        // now open an input stream to the local file ...
        FileInputStream fileInputStream = new FileInputStream(_localIndexFilePath);
        DataInputStream dataStream = new DataInputStream(fileInputStream);

        try {
            // deserialize bloom filter 
            _bloomFilter = BloomFilter.serializer().deserialize(dataStream);
            _indexHintCount = dataStream.readInt();

            int indexHintDataSize = _indexHintCount * INDEX_HINT_SIZE;
            // and deserialize index hints 
            _indexHints = ByteBuffer.allocate(indexHintDataSize);

            dataStream.readFully(_indexHints.array());

            // load index data buffer size 
            _indexDataSize = dataStream.readInt();
            // and capture offset information 
            _indexDataOffset = (int) fileInputStream.getChannel().position();
        } finally {
            if (fileInputStream != null) {
                fileInputStream.close();
            }
        }
        LOG.info("Successfully loaded Index");
    }

    public CacheItem findItem(long targetFingerprint, boolean checkOnly) throws IOException {
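        // Lookup runs in stages: (1) the bloom filter rejects most misses,
        // (2) a binary search over the in-memory hints finds an exact hit or
        // the covering rice-coded block, (3) the block is demand-loaded and
        // decoded to locate the record, (4) the record itself is read from
        // the remote SequenceFile (skipped when checkOnly is true).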
        // check bloom filter first ... 
        if (_bloomFilter.isPresent(targetFingerprint)) {
            //      synchronized (this) { 
            // find best hint ... 
            HDFSFileIndex.IndexItem itemOut = _findBestIndexHintForFingerprint(targetFingerprint);

            // if non null result returned   
            if (itemOut != null) {

                // if no match, then this is the next lowest matching hint item ... 
                if (itemOut.fingerprint != targetFingerprint) {
                    // demand load item data 
                    HDFSFileIndex.IndexDataBlock dataBlock = demandLoadIndexDataBlock(itemOut.fingerprint,
                            itemOut.indexDataOffset, itemOut.indexDataSize);
                    // and search within it ...
                    itemOut = dataBlock.searchBlockFor(targetFingerprint);
                }

                if (itemOut != null && checkOnly) {
                    CacheItem item = new CacheItem();
                    item.setUrlFingerprint(targetFingerprint);
                    return item;
                }

                if (itemOut != null) {
                    LOG.info("Found Match in Index:" + _localIndexFilePath + " For FP:" + targetFingerprint
                            + " Loading File:" + _remoteDataPath + " at Offset:" + _indexDataOffset);
                    // open sequence file ... 
                    SequenceFile.Reader reader = new SequenceFile.Reader(_remoteFileSystem, _remoteDataPath,
                            CrawlEnvironment.getHadoopConfig());

                    try {
                        reader.seek(itemOut.dataOffset);

                        Text url = new Text();
                        CacheItem item = new CacheItem();

                        LOG.info("Reading Item and Data");
                        reader.next(url, item);
                        String strURL = url.toString();

                        LOG.info("Read returned url:" + strURL);
                        item.setUrl(strURL);

                        return item;
                    } finally {
                        if (reader != null)
                            reader.close();
                    }
                }
            }
            //      }
        }
        return null;
    }

    private HDFSFileIndex.IndexDataBlock demandLoadIndexDataBlock(long fingerprint, int itemDataOffset,
            int itemDataSize) throws IOException {
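        // Reads itemDataSize bytes of rice-coded block data from the local
        // index file, starting at the block's offset within the index data
        // region.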
        // ok time to load this block ...
        RandomAccessFile file = new RandomAccessFile(_localIndexFilePath, "r");

        try {

            // ByteBuffer.allocate never returns null (it throws on failure),
            // so read the block's bytes directly ...
            ByteBuffer bufferOut = ByteBuffer.allocate(itemDataSize);
            file.seek(_indexDataOffset + itemDataOffset);
            file.readFully(bufferOut.array());
            return new IndexDataBlock(fingerprint, 0, bufferOut);
        } finally {
            if (file != null) {
                file.close();
            }
        }
    }

    private HDFSFileIndex.IndexItem _findBestIndexHintForFingerprint(long targetFP) throws IOException {
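        // Binary search over the fixed-size (INDEX_HINT_SIZE) hint records.
        // An exact fingerprint match returns an item carrying the record's
        // data offset; otherwise the nearest lower hint is returned, along
        // with the offset and size of the rice-coded block it anchors.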
        int low = 0;
        int high = _indexHintCount - 1;

        while (low <= high) {
            int mid = low + ((high - low) / 2);
            _indexHints.position(mid * (INDEX_HINT_SIZE));
            long hintFP = _indexHints.getLong();
            // compare to target 
            long comparisonResult = (hintFP > targetFP) ? 1 : (hintFP < targetFP) ? -1 : 0;

            if (comparisonResult > 0)
                high = mid - 1;
            else if (comparisonResult < 0)
                low = mid + 1;
            else {
                return new IndexItem(targetFP, _indexHints.getInt());
            }
        }

        if (high >= 0 && low < _indexHintCount) {

            _indexHints.position(high * INDEX_HINT_SIZE);
            // create nearest match ... 
            HDFSFileIndex.IndexItem itemOut = new IndexItem(_indexHints.getLong(), _indexHints.getInt(),
                    _indexHints.getInt(), -1);
            // figure out this item's data block size ... 
            if (high < (_indexHintCount - 1)) {
                // skip the next hint's fingerprint (8 bytes) and data offset
                // (4 bytes) to read its index-data offset
                _indexHints.position(((high + 1) * INDEX_HINT_SIZE) + 12);
                itemOut.indexDataSize = _indexHints.getInt() - itemOut.indexDataOffset;
            } else {
                itemOut.indexDataSize = _indexDataSize - itemOut.indexDataOffset;
            }
            return itemOut;
        }
        return null;
    }

    static class IndexItem {

        public IndexItem(long fingerprint, int dataOffset) {
            this.fingerprint = fingerprint;
            this.dataOffset = dataOffset;
            this.indexDataOffset = -1;
        }

        public IndexItem(long fingerprint, int dataOffset, int indexDataOffset, int indexDataSize) {
            this.fingerprint = fingerprint;
            this.dataOffset = dataOffset;
            this.indexDataOffset = indexDataOffset;
            this.indexDataSize = indexDataSize;
        }

        public long fingerprint;
        public int dataOffset;
        public int indexDataOffset = -1;
        public int indexDataSize = -1;
    }

    public static class IndexDataBlock extends IntrusiveListElement<HDFSFileIndex.IndexDataBlock> {
        public IndexDataBlock(long baseFingerprint, int dataOffset, ByteBuffer data) {
            _dataOffset = dataOffset;
            _buffer = data;
            _lastUseTime = System.currentTimeMillis();
            _baseFingerprint = baseFingerprint;
        }

        HDFSFileIndex.IndexItem searchBlockFor(long targetFingerprint) {
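            // The block holds two rice-coded streams -- fingerprint deltas,
            // then record offsets -- each prefixed by its rice parameter m
            // (one byte) and its bit length (vlong). All values were stored
            // +1 because rice coding cannot encode zero.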
            // reset cursor ... 
            _buffer.position(_dataOffset);

            int fingerprintMValue = _buffer.get();
            int fingerprintBits = (int) CacheManager.readVLongFromByteBuffer(_buffer);

            RiceCoding.RiceCodeReader fingerprintReader = new RiceCoding.RiceCodeReader(fingerprintMValue,
                    fingerprintBits, _buffer.array(), _buffer.position());

            // advance past fingerprint data to offset data 
            _buffer.position(_buffer.position() + ((fingerprintBits + 7) / 8));

            // and create offset data reader 
            RiceCoding.RiceCodeReader offsetReader = new RiceCoding.RiceCodeReader((int) _buffer.get(),
                    (int) CacheManager.readVLongFromByteBuffer(_buffer), _buffer.array(), _buffer.position());

            long fingerprintValue = _baseFingerprint;

            while (fingerprintReader.hasNext()) {
                fingerprintValue += fingerprintReader.nextValue();
                // rice coded values are offset by one since rice coding cannot support zero values .... 
                fingerprintValue -= 1;
                int offsetValue = (int) offsetReader.nextValue();
                // now compare to target 
                if (fingerprintValue == targetFingerprint) {
                    return new IndexItem(fingerprintValue, offsetValue
                            - 1 /*rice coder doesn't like zeros and Offset COULD be zero, so we have to offset by one to be safe*/);
                }
            }
            return null;
        }

        public int _dataOffset = -1;
        public ByteBuffer _buffer = null;
        public long _lastUseTime = -1;
        public long _baseFingerprint;
    }

    private static double lg(double value) {
        return Math.log(value) / Math.log(2.0);
    }

    public static void writeIndex(Vector<FingerprintAndOffsetTuple> offsetInfo, DataOutput indexFileOut)
            throws IOException {
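        // Sorts the tuples by fingerprint, emits one hint per
        // INDEX_HINT_RECORD_INTERVAL records (plus the final record),
        // rice-codes the fingerprint deltas and offsets of the records
        // between hints, and validates each block before writing the bloom
        // filter, hint table, and block data to the output.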

        long firstFingerprint = offsetInfo.get(0)._fingerprint;

        // expected element count and (presumably) target false-positive rate
        BloomFilter bloomFilter = new BloomFilter(offsetInfo.size(), 0.001201);

        // sort the offset list by fingerprint 
        Collections.sort(offsetInfo, new Comparator<FingerprintAndOffsetTuple>() {

            @Override
            public int compare(FingerprintAndOffsetTuple o1, FingerprintAndOffsetTuple o2) {
                return (o1._fingerprint < o2._fingerprint) ? -1 : o1._fingerprint > o2._fingerprint ? 1 : 0;
            }

        });
        // now we need to write the index out

        // allocate working set buffers ...
        ByteBuffer indexDataBuffer = ByteBuffer.allocate(offsetInfo.size() * 16);
        ByteBuffer indexHintsBuffer = ByteBuffer
                .allocate(((((offsetInfo.size() + INDEX_HINT_RECORD_INTERVAL) / INDEX_HINT_RECORD_INTERVAL) + 1)
                        * INDEX_HINT_SIZE) + 4);

        // build index hints placeholder 
        Vector<HDFSFileIndex.IndexItem> hints = new Vector<HDFSFileIndex.IndexItem>();
        // hints land at records 0, 100, 200, ... and always at the final record
        for (int i = 0; i < offsetInfo.size(); ++i) {

            if (i % INDEX_HINT_RECORD_INTERVAL == 0 || (i == (offsetInfo.size() - 1))) {
                HDFSFileIndex.IndexItem hint = new IndexItem(offsetInfo.get(i)._fingerprint,
                        (int) offsetInfo.get(i)._offset);
                hints.add(hint);
                // add fingerprint to bloom filter 
                bloomFilter.add(hint.fingerprint);
            }
        }
        // the hints buffer leads with the hint record count 
        indexHintsBuffer.putInt(hints.size());

        // track total bits used ... 
        int bitsUsedForHints = 0;
        int bitsUsedForFingerprints = 0;
        int bitsUsedForOffsets = 0;

        // now start populating index data ... 
        for (int hintIdx = 0; hintIdx < hints.size(); ++hintIdx) {

            HDFSFileIndex.IndexItem hint = hints.get(hintIdx);

            LOG.info("IndexWriter FP:" + hint.fingerprint);
            indexHintsBuffer.putLong(hint.fingerprint);
            indexHintsBuffer.putInt(hint.dataOffset);
            indexHintsBuffer.putInt(indexDataBuffer.position());

            // update stats 
            bitsUsedForHints += INDEX_HINT_SIZE * 8;

            if (hintIdx < hints.size() - 1) {
                // track cumulative delta and offset values (for average calc later)
                double cumulativeDelta = 0;
                long cumulativeOffset = 0;

                int subIndexItemCount = 0;
                int nonZeroDeltaCount = 0;

                Vector<HDFSFileIndex.IndexItem> subHints = new Vector<HDFSFileIndex.IndexItem>();

                // initialize last fingerprint to indexed value ... 
                long lastFingerprint = hint.fingerprint;

                // first collect values in between index hints
                for (int nonIndexItem = (hintIdx * INDEX_HINT_RECORD_INTERVAL) + 1; nonIndexItem < ((hintIdx + 1)
                        * INDEX_HINT_RECORD_INTERVAL); ++nonIndexItem) {
                    if (nonIndexItem >= offsetInfo.size())
                        break;

                    // calculate fingerprint delta ... 
                    long fingerprintDelta = offsetInfo.get(nonIndexItem)._fingerprint - lastFingerprint;
                    LOG.info("IndexWriter FP:" + offsetInfo.get(nonIndexItem)._fingerprint + " Delta:"
                            + fingerprintDelta);
                    // offset delta

                    if (fingerprintDelta != 0) {

                        cumulativeDelta += (double) fingerprintDelta;
                        LOG.info("Cumulative Delta is:" + cumulativeDelta);
                        nonZeroDeltaCount++;
                    }

                    cumulativeOffset += offsetInfo.get(nonIndexItem)._offset;

                    ++subIndexItemCount;

                    // add to collection vector 
                    subHints.add(new IndexItem(fingerprintDelta, (int) offsetInfo.get(nonIndexItem)._offset));

                    // remember the last fingerprint ... 
                    lastFingerprint = offsetInfo.get(nonIndexItem)._fingerprint;

                    // add item to bloom filter
                    bloomFilter.add(lastFingerprint);
                }

                // calculate average delta value 
                double averageDeltaValue = cumulativeDelta / (double) nonZeroDeltaCount;
                // calculate m for fingerprint deltas 
                int mForFingerprints = (int) Math.floor(lg(averageDeltaValue));
                LOG.info("Average Delta Value is:" + averageDeltaValue + " m is:" + mForFingerprints);
                // calculate average offset value 
                double averageOffsetValue = (double) cumulativeOffset / (double) subIndexItemCount;
                // calculate m for offsets 
                int mForOffsets = (int) Math.floor(lg(averageOffsetValue));
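                // Worked example (illustrative): with an average delta of
                // ~5,000, m = floor(lg(5000)) = 12, so each rice-coded value
                // is stored as a unary quotient (value >> 12) followed by a
                // 12-bit remainder.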

                // calculate rice codes
                RiceCoding riceCodeFP = new RiceCoding(mForFingerprints);
                RiceCoding riceCodeOffsets = new RiceCoding(mForOffsets);

                // populate bits 
                for (HDFSFileIndex.IndexItem subItemHint : subHints) {
                    if (subItemHint.fingerprint == 0) {
                        LOG.warn("Zero Delta for Fingerprint Detected.There are two duplicate entires in log!");
                    }
                    riceCodeFP.addItem(subItemHint.fingerprint + 1);
                    riceCodeOffsets.addItem(subItemHint.dataOffset + 1);
                }
                // now track bits used ... 
                bitsUsedForFingerprints += riceCodeFP.getNumBits();
                bitsUsedForOffsets += riceCodeOffsets.getNumBits();

                // write out metadata 

                // save the current position 
                int currentPosition = indexDataBuffer.position();

                // fingerprint data 
                indexDataBuffer.put((byte) mForFingerprints);
                CacheManager.writeVLongToByteBuffer(indexDataBuffer, riceCodeFP.getNumBits());
                indexDataBuffer.put(riceCodeFP.getBits(), 0, (riceCodeFP.getNumBits() + 7) / 8);

                // offset data 
                indexDataBuffer.put((byte) mForOffsets);
                CacheManager.writeVLongToByteBuffer(indexDataBuffer, riceCodeOffsets.getNumBits());
                indexDataBuffer.put(riceCodeOffsets.getBits(), 0, (riceCodeOffsets.getNumBits() + 7) / 8);

                System.out.println("Item Count:" + subIndexItemCount + "FP Bits:" + subIndexItemCount * 64
                        + " Compressed:" + riceCodeFP.getNumBits() + " Offset Bits:" + subIndexItemCount * 32
                        + " Compressed:" + riceCodeOffsets.getNumBits());

                LOG.info("Item Count:" + subIndexItemCount + "FP Bits:" + subIndexItemCount * 64 + " Compressed:"
                        + riceCodeFP.getNumBits() + " Offset Bits:" + subIndexItemCount * 32 + " Compressed:"
                        + riceCodeOffsets.getNumBits());

                if ((subIndexItemCount * 64) < riceCodeFP.getNumBits()) {
                    throw new RuntimeException("Compressed Size > UnCompressed Size!!!!");
                }

                validateIndexData(indexDataBuffer.array(), currentPosition, hint.fingerprint, subHints,
                        bloomFilter);
            }

        }

        if (!bloomFilter.isPresent(firstFingerprint)) {
            throw new RuntimeException("Test Failed!");
        }

        // serialize bloomfilter
        ByteStream baos = new ByteStream(1 << 12);
        BloomFilter.serializer().serialize(bloomFilter, new DataOutputStream(baos));

        // spit out final stats 
        System.out.println(" Bloomfilter Size:" + baos.size() + " IndexHintBuffer Size:"
                + indexHintsBuffer.position() + " IndexDataBuffer Size:" + indexDataBuffer.position());

        // now write out the final index file ... 

        // bloom filter data ... 
        indexFileOut.write(baos.getBuffer(), 0, baos.size());
        // write hint data  
        indexFileOut.write(indexHintsBuffer.array(), 0, indexHintsBuffer.position());
        // write out rice code data size 
        indexFileOut.writeInt(indexDataBuffer.position());
        // finally rice coded sub-index data
        indexFileOut.write(indexDataBuffer.array(), 0, indexDataBuffer.position());
    }

    public static void validateIndexData(byte[] data, int offset, long baseFingerprint,
            Vector<HDFSFileIndex.IndexItem> subItems, BloomFilter filter) {
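        // Sanity check: every sub-item's reconstructed fingerprint must be
        // findable both in the freshly written rice-coded block and in the
        // bloom filter.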
        HDFSFileIndex.IndexDataBlock dataBlock = new IndexDataBlock(baseFingerprint, offset, ByteBuffer.wrap(data));

        long fingerprintValue = baseFingerprint;
        int itemIndex = 0;
        for (HDFSFileIndex.IndexItem item : subItems) {
            fingerprintValue += item.fingerprint;
            long timeStart = System.currentTimeMillis();
            if (dataBlock.searchBlockFor(fingerprintValue) == null) {
                throw new RuntimeException("Unable to Find fingerprint in data block ! - Test Failed!");
            }
            if (!filter.isPresent(fingerprintValue)) {
                throw new RuntimeException("Unable to Find fingerprint in bloom filter ! - Test Failed!");
            }

            CacheManager.LOG
                    .info("Search for Item@" + itemIndex++ + " Took:" + (System.currentTimeMillis() - timeStart));
        }
    }

    public static void main(String[] args) {
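        // Self-test: builds an index over 10,000 synthetic fingerprints
        // entirely in memory and reports the resulting index size.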

        try {
            ByteStream outputStream = new ByteStream(8192);
            Vector<FingerprintAndOffsetTuple> fpInfo = new Vector<FingerprintAndOffsetTuple>();

            // construct 10,000 entries with random fingerprints 
            for (int i = 0; i < 10000; ++i) {
                MessageDigest digester = MessageDigest.getInstance("MD5");
                long time = System.currentTimeMillis();
                digester.update((new UID() + "@" + time + ":" + i).getBytes());
                FingerprintAndOffsetTuple offsetInfo = new FingerprintAndOffsetTuple(
                        URLFingerprint.generate64BitURLFPrint(StringUtils.byteToHexString(digester.digest())),
                        i * 10000);
                fpInfo.add(offsetInfo);
            }
            // clone the vector 
            Vector<FingerprintAndOffsetTuple> fpInfoCloned = new Vector<FingerprintAndOffsetTuple>();
            fpInfoCloned.addAll(fpInfo);
            // now write out the index ... 
            writeIndex(fpInfoCloned, new DataOutputStream(outputStream));
            // spit out some basic stats 
            System.out.println("output buffer size is:" + outputStream.size());

        } catch (Exception e) {
            CacheManager.LOG.error(CCStringUtils.stringifyException(e));
        }
    }

}
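
For reference, the index file written by writeIndex and read back by loadIndexFromLocalFile is laid out as follows (reconstructed from the listing above):

    [serialized BloomFilter]
    [int hintCount]
    [hintCount x 16-byte hint records: long fingerprint, int dataOffset, int indexDataOffset]
    [int indexDataSize]
    [indexDataSize bytes of rice-coded sub-index blocks]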