org.commoncrawl.service.listcrawler.HDFSFlusherThread.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.service.listcrawler.HDFSFlusherThread.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.listcrawler;

import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.record.Buffer;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.CacheItem;
import org.commoncrawl.util.CCStringUtils;

/** 
 * Thread that flushes crawled content to HDFS 
 * 
 * @author rana
 *
 */
final class HDFSFlusherThread implements Runnable {

    public static final Log LOG = LogFactory.getLog(HDFSFlusherThread.class);

    CacheManager _manager;

    public HDFSFlusherThread(CacheManager manager) {
        _manager = manager;
    }

    private long generateSequenceFileAndIndex(int itemFlushLimit, RandomAccessFile sourceLogFile, long startPos,
            long endPos, byte[] syncBytes, SequenceFile.Writer writer, DataOutput indexStreamOut,
            ArrayList<FingerprintAndOffsetTuple> tupleListOut) throws IOException {

        byte[] syncCheck = new byte[syncBytes.length];

        // and create a list to hold fingerprint / offset information
        Vector<FingerprintAndOffsetTuple> fpOffsetList = new Vector<FingerprintAndOffsetTuple>();

        long currentPos = startPos;

        LOG.info("Flushing Entries Starting up to offset:" + endPos);
        CacheItemHeader itemHeader = new CacheItemHeader();

        int itemsProcessed = 0;

        boolean ignoreFlushLimit = false;

        // start read 
        while (currentPos < endPos) {

            if ((endPos - currentPos) < LocalLogFileHeader.SYNC_BYTES_SIZE)
                break;

            // seek to current position ... 
            sourceLogFile.seek(currentPos);

            boolean headerLoadFailed = false;

            try {
                // read the item header ... assuming things are good so far ... 
                itemHeader.readHeader(sourceLogFile);
            } catch (IOException e) {
                CacheManager.LOG.error("### Item Header Load At Position:" + currentPos + " Failed With Exception:"
                        + CCStringUtils.stringifyException(e));
                headerLoadFailed = true;
            }

            if (headerLoadFailed) {
                CacheManager.LOG
                        .error("### Item File Corrupt at position:" + currentPos + " Seeking Next Sync Point");
                currentPos += LocalLogFileHeader.SYNC_BYTES_SIZE;
            }

            // if header sync bytes don't match .. then seek to next sync position ... 
            if (headerLoadFailed || !Arrays.equals(itemHeader._sync, syncBytes)) {

                CacheManager.LOG
                        .error("### Item File Corrupt at position:" + currentPos + " Seeking Next Sync Point");

                // reseek to current pos 
                sourceLogFile.seek(currentPos);
                // read in a sync.length buffer amount 
                sourceLogFile.readFully(syncCheck);

                int syncLen = syncBytes.length;

                // start scan for next sync position ...
                for (int i = 0; sourceLogFile.getFilePointer() < endPos; i++) {
                    int j = 0;
                    for (; j < syncLen; j++) {
                        if (syncBytes[j] != syncCheck[(i + j) % syncLen])
                            break;
                    }
                    if (j == syncLen) {
                        sourceLogFile.seek(sourceLogFile.getFilePointer() - LocalLogFileHeader.SYNC_BYTES_SIZE); // position before sync
                        break;
                    }
                    syncCheck[i % syncLen] = sourceLogFile.readByte();
                }
                // whatever, happened file pointer is at current pos 
                currentPos = sourceLogFile.getFilePointer();

                if (currentPos < endPos) {
                    CacheManager.LOG.info("### Item Loader Found another sync point at:" + currentPos);
                } else {
                    CacheManager.LOG.error("### No more sync points found!");
                }
            } else {
                CacheManager.LOG
                        .info("WritingItem with FP:" + itemHeader._fingerprint + " Pos Is:" + writer.getLength());
                // track offset information for index building purposes   
                fpOffsetList.add(new FingerprintAndOffsetTuple(itemHeader._fingerprint, writer.getLength()));
                // read item data ...
                CacheItem cacheItem = new CacheItem();
                cacheItem.readFields(sourceLogFile);
                // now read content length 
                int contentLength = sourceLogFile.readInt();
                // and if content present... allocate buffer 
                if (contentLength != 0) {
                    // allocate content buffer 
                    byte[] contentBuffer = new byte[contentLength];
                    // read it from disk 
                    sourceLogFile.readFully(contentBuffer);
                    // and set content into cache item 
                    cacheItem.setContent(new Buffer(contentBuffer));
                }
                CacheManager.LOG.info("Adding to Sequence File Item with URL:" + cacheItem.getUrl());
                // write to sequence file ... 
                writer.append(new Text(cacheItem.getUrl()), cacheItem);
                // now seek past data
                currentPos += CacheItemHeader.SIZE + itemHeader._dataLength
                        + CacheManager.ITEM_RECORD_TRAILING_BYTES;
                // increment item count 
                itemsProcessed++;

            }

            if (!ignoreFlushLimit && itemsProcessed >= itemFlushLimit) {
                // ok this gets tricky now ...
                // figure out how many bytes of data were required to get to flush limit 
                long approxCheckpointSize = currentPos - startPos;
                // compute a  threshold number 
                long bytesThreshold = (long) (approxCheckpointSize * .70);
                // compute bytes remaining in checkpoint file ... 
                long bytesRemaining = endPos - currentPos;

                // ok if bytes remaining are less than threshold number then go ahead and gobble
                // everything up in a single pass (to prevent smaller subsequent index 
                if (bytesRemaining <= bytesThreshold) {
                    // ignore the flush limit and keep on rolling to the end ...  
                    ignoreFlushLimit = true;
                    LOG.warn("*****Bytes Remaining:" + bytesRemaining + " less than % of last whole chkpt size:"
                            + approxCheckpointSize + ". Bypassing Flush Limit");
                } else {
                    LOG.info("Reached Flush Item Limit:" + itemsProcessed + " Breaking Out");
                    break;
                }

            }
        }

        LOG.info("Writing Index");
        // ok now build the index file ... 
        HDFSFileIndex.writeIndex(fpOffsetList, indexStreamOut);
        LOG.info("Done Writing Index. Total Items Written:" + fpOffsetList.size());
        // copy offset list into tuple list
        tupleListOut.addAll(fpOffsetList);

        return currentPos;
    }

    static class IndexDataFileTriple {
        public Path _dataFilePath = null;
        public Path _indexFilePath = null;
        public File _localIndexFilePath = null;
    }

    @Override
    public void run() {

        boolean shutdown = false;

        while (!shutdown) {

            try {

                final CacheFlushRequest request = _manager.getHDFSFlushRequestQueue().take();

                switch (request._requestType) {

                case ExitThreadRequest: {
                    // shutdown condition ... 
                    CacheManager.LOG.info("Cache Flusher Thread Received Shutdown. Exiting!");
                    shutdown = true;
                }
                    break;

                case FlushRequest: {

                    LOG.info("Received Flush Request");

                    ArrayList<IndexDataFileTriple> tempFiles = new ArrayList<IndexDataFileTriple>();
                    ArrayList<FingerprintAndOffsetTuple> tuplesOut = new ArrayList<FingerprintAndOffsetTuple>();

                    // flag to track request status at end .. 
                    boolean requestFailed = false;

                    long logStart = LocalLogFileHeader.SIZE;
                    long logEnd = logStart + request._bytesToFlush;

                    // create a hdfs temp file for data (and index)
                    long generateTime = System.currentTimeMillis();
                    Path tempDir = new Path(CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".")
                            + "/flusher-temp-" + generateTime);

                    // mkdir ... 
                    try {
                        _manager.getRemoteFileSystem().mkdirs(tempDir);
                    } catch (IOException e1) {
                        LOG.error(CCStringUtils.stringifyException(e1));
                        requestFailed = true;
                    }

                    int iterationNumber = 0;

                    while (logStart != logEnd && !requestFailed) {

                        Path tempDataFile = new Path(tempDir,
                                CacheManager.PROXY_CACHE_FILE_DATA_PREFIX + "-" + iterationNumber);
                        Path tempIndexFile = new Path(tempDir,
                                CacheManager.PROXY_CACHE_FILE_INDEX_PREFIX + "-" + iterationNumber);

                        LOG.info("FlushRequest Pass#:" + iterationNumber + " DataPath:" + tempDataFile
                                + " IndexPath:" + tempIndexFile);

                        SequenceFile.Writer writer = null;
                        FSDataOutputStream indexOutputStream = null;
                        RandomAccessFile localLogFile = null;

                        try {

                            LOG.info("Pass#:" + iterationNumber + " Opening SequenceFile for Output");
                            // open a temporary hdfs streams ...
                            writer = SequenceFile.createWriter(_manager.getRemoteFileSystem(),
                                    CrawlEnvironment.getHadoopConfig(), tempDataFile, Text.class, CacheItem.class,
                                    CompressionType.NONE);

                            // opening index output stream ... 
                            LOG.info("Pass#:" + iterationNumber + " Opening Index Output Stream");
                            indexOutputStream = _manager.getRemoteFileSystem().create(tempIndexFile);

                            LOG.info("Pass#:" + iterationNumber + " Opening Local Log");
                            localLogFile = new RandomAccessFile(_manager.getActiveLogFilePath(), "rw");

                            // transfer log entries and generate index
                            logStart = generateSequenceFileAndIndex(_manager.getCacheFlushThreshold(), localLogFile,
                                    logStart, logEnd, _manager.getLocalLogSyncBytes(), writer, indexOutputStream,
                                    tuplesOut);
                        } catch (IOException e) {
                            CacheManager.LOG.error(CCStringUtils.stringifyException(e));
                            requestFailed = true;
                        } finally {
                            if (writer != null) {
                                try {
                                    writer.close();
                                } catch (IOException e) {
                                    LOG.error(CCStringUtils.stringifyException(e));
                                }
                            }
                            if (indexOutputStream != null) {
                                try {
                                    indexOutputStream.close();
                                } catch (IOException e) {
                                    LOG.error(CCStringUtils.stringifyException(e));
                                }
                            }
                            if (localLogFile != null) {
                                try {
                                    localLogFile.close();
                                } catch (IOException e) {
                                    LOG.error(CCStringUtils.stringifyException(e));
                                }
                            }
                        }

                        if (requestFailed) {
                            try {
                                LOG.info("Pass#:" + iterationNumber + " Failed. Deleting temp files");
                                _manager.getRemoteFileSystem().delete(tempDataFile, false);
                                _manager.getRemoteFileSystem().delete(tempIndexFile, false);
                            } catch (IOException e) {
                                LOG.error("Delete Failed During Failure! Potenital Orphan Files! : "
                                        + CCStringUtils.stringifyException(e));
                            }
                            break;
                        } else {
                            LOG.info("Pass#:" + iterationNumber + " Finished. Adding files to tuple list");
                            // add temp file tuple
                            IndexDataFileTriple indexDataPair = new IndexDataFileTriple();

                            indexDataPair._dataFilePath = tempDataFile;
                            indexDataPair._indexFilePath = tempIndexFile;

                            tempFiles.add(indexDataPair);
                        }
                        iterationNumber++;
                    }

                    LOG.info("All Passes Complete. Finalizing Commit");

                    // ok if request failed ... 
                    if (!requestFailed) {

                        int itemIndex = 0;
                        for (IndexDataFileTriple indexDataPair : tempFiles) {
                            // generate final paths ... 
                            Path finalOutputDir = _manager.getRemoteDataDirectory();

                            Path finalDataFilePath = new Path(finalOutputDir,
                                    CacheManager.PROXY_CACHE_FILE_DATA_PREFIX + "-" + (generateTime + itemIndex));
                            Path finalIndexFilePath = new Path(finalOutputDir,
                                    CacheManager.PROXY_CACHE_FILE_INDEX_PREFIX + "-" + (generateTime + itemIndex));

                            try {
                                LOG.info("Pass#:" + itemIndex + " Renaming Temp Files");
                                LOG.info("Pass#:" + itemIndex + " Final Data File Name is:" + finalDataFilePath);
                                LOG.info("Pass#:" + itemIndex + " Final Index File Name is:" + finalIndexFilePath);

                                // rename files ... 
                                _manager.getRemoteFileSystem().rename(indexDataPair._dataFilePath,
                                        finalDataFilePath);
                                indexDataPair._dataFilePath = finalDataFilePath;
                                _manager.getRemoteFileSystem().rename(indexDataPair._indexFilePath,
                                        finalIndexFilePath);
                                indexDataPair._indexFilePath = finalIndexFilePath;
                            } catch (IOException e) {
                                LOG.info("Pass#:" + itemIndex + " Rename Failed");
                                LOG.error(CCStringUtils.stringifyException(e));
                                requestFailed = true;
                                break;
                            }

                            try {
                                // copy to local ...
                                indexDataPair._localIndexFilePath = new File(_manager.getLocalDataDirectory(),
                                        finalIndexFilePath.getName());

                                LOG.info("Pass#:" + itemIndex + " Copying Remote Index File at:"
                                        + finalIndexFilePath + " to Local Directory:"
                                        + indexDataPair._localIndexFilePath.getAbsolutePath());
                                _manager.getRemoteFileSystem().copyToLocalFile(finalIndexFilePath,
                                        new Path(indexDataPair._localIndexFilePath.getAbsolutePath()));
                                LOG.info("Pass#:" + itemIndex + " Done Copying Remote Index File to Local");
                            } catch (IOException e) {
                                LOG.info("Pass#:" + itemIndex + " Local File Copy Failed with Exception:"
                                        + CCStringUtils.stringifyException(e));
                                requestFailed = true;
                                indexDataPair._localIndexFilePath = null;
                                break;
                            }
                            // inrement item index 
                            itemIndex++;
                        }
                        // ok callback to manager if request succeeded 
                        if (!requestFailed) {
                            try {
                                LOG.info("Flush Complete. Calling hdfsFlushComplete");
                                _manager.hdfsCacheFlushRequestComplete(request, tuplesOut, tempFiles);
                                LOG.info("Flush Complete. hdfsFlushComplete succeeded");
                            } catch (IOException e) {

                                LOG.error("hdfsFlushComplete returned Exception:"
                                        + CCStringUtils.stringifyException(e));
                                requestFailed = true;
                            }
                        }

                    }

                    if (requestFailed) {
                        LOG.info("Cache Manager Log Flush Failed. Deleteing files");
                        try {
                            // delete temp file directory recursively 
                            _manager.getRemoteFileSystem().delete(tempDir, true);
                        } catch (IOException e) {
                            LOG.error(CCStringUtils.stringifyException(e));
                        }
                        // iterate temp file list 
                        for (IndexDataFileTriple triple : tempFiles) {
                            try {
                                LOG.info("Deleteing:" + triple._dataFilePath);
                                _manager.getRemoteFileSystem().delete(triple._dataFilePath, false);
                                LOG.info("Deleteing:" + triple._indexFilePath);
                                _manager.getRemoteFileSystem().delete(triple._indexFilePath, false);
                                if (triple._localIndexFilePath != null) {
                                    LOG.info("Deleteing LOCAL:" + triple._localIndexFilePath);
                                    triple._localIndexFilePath.delete();
                                }
                            } catch (IOException e) {
                                LOG.error(CCStringUtils.stringifyException(e));
                            }
                        }
                        // callback to manager with the bad news ... 
                        _manager.hdfsCacheFlushRequestFailed(request);
                    }
                }
                    break;
                }
            } catch (InterruptedException e) {
                LOG.error("Unexpected Exception in HDFSFlusher Thread:" + CCStringUtils.stringifyException(e));
            }
        }
    }
}