/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.pagerank.slave; import java.io.DataInput; import java.io.DataInputStream; import java.io.DataOutput; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.RandomAccessFile; import java.net.InetSocketAddress; import java.nio.ByteBuffer; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.LinkedList; import java.util.Map; import java.util.TreeMap; import java.util.Vector; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.Semaphore; import java.util.zip.CRC32; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.hadoop.io.SequenceFile.Metadata; import org.apache.hadoop.io.compress.CompressionOutputStream; import org.apache.hadoop.io.compress.DefaultCodec; import org.commoncrawl.async.CallbackWithResult; import org.commoncrawl.async.EventLoop; import org.commoncrawl.async.Timer; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.protocol.CompressedOutlinkList; import org.commoncrawl.protocol.URLFPV2; import org.commoncrawl.rpc.base.internal.AsyncClientChannel; import org.commoncrawl.rpc.base.internal.AsyncContext; import org.commoncrawl.rpc.base.internal.AsyncRequest; import org.commoncrawl.rpc.base.internal.AsyncServerChannel; import org.commoncrawl.rpc.base.internal.NullMessage; import org.commoncrawl.rpc.base.internal.Server; import org.commoncrawl.rpc.base.internal.AsyncRequest.Callback; import org.commoncrawl.rpc.base.internal.AsyncRequest.Status; import org.commoncrawl.rpc.base.shared.RPCException; import org.commoncrawl.rpc.base.shared.RPCStruct; import org.commoncrawl.service.crawler.filters.SuperDomainFilter; import org.commoncrawl.service.crawler.filters.Filter.FilterResult; import org.commoncrawl.service.pagerank.BaseConfig; import org.commoncrawl.service.pagerank.BeginPageRankInfo; import org.commoncrawl.service.pagerank.BlockTransfer; import 
org.commoncrawl.service.pagerank.BlockTransferAck; import org.commoncrawl.service.pagerank.CheckpointInfo; import org.commoncrawl.service.pagerank.Constants; import org.commoncrawl.service.pagerank.FileInfo; import org.commoncrawl.service.pagerank.IterationInfo; import org.commoncrawl.service.pagerank.PRRangeItem; import org.commoncrawl.service.pagerank.PageRankSlave; import org.commoncrawl.service.pagerank.SlaveStatus; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.FileUtils; import org.commoncrawl.util.FlexBuffer; import org.commoncrawl.util.JVMStats; import org.commoncrawl.util.URLUtils; import org.junit.Test; import com.google.common.collect.ImmutableList; import com.hadoop.compression.lzo.LzoCodec; public class PageRankUtils { // TODO:HACK public static final int VALUES_PER_RANGE = 10; public static final Log LOG = LogFactory.getLog(PageRankUtils.class); public static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance(); static { NUMBER_FORMAT.setMinimumIntegerDigits(5); NUMBER_FORMAT.setGroupingUsed(false); } private static String outlinkValuesFilePrefix = "OutlinkPR"; public static Path getCheckpointFilePath(Path jobPath, int iterationPhase, int iterationNumnber, int nodeIndex) { String fileName = IterationInfo.Phase.toString(iterationPhase) + "-CheckpointComplete-" + NUMBER_FORMAT.format(iterationNumnber) + "-" + NUMBER_FORMAT.format(nodeIndex); return new Path(jobPath, fileName); } public static String makeUniqueFileName(String fileNamePrefix, int iterationNumber, int nodeIndex) { if (iterationNumber == 0) { return fileNamePrefix + NUMBER_FORMAT.format(nodeIndex); } else { return fileNamePrefix + NUMBER_FORMAT.format(iterationNumber) + "-" + NUMBER_FORMAT.format(nodeIndex); } } public static File makeIdsFilePath(File basePath, int nodeIndex) { return new File(basePath, PageRankUtils.makeUniqueFileName(Constants.PR_IDS_FILE_PREFIX, 0, nodeIndex)); } public static Path makeRangeFilePath(File basePath, int nodeIndex) { return new Path(basePath.getAbsolutePath(), PageRankUtils.makeUniqueFileName(Constants.PR_RANGE_FILE_PREFIX, 0, nodeIndex)); } public static String getOutlinksBaseName(int myNodeIdx, int iterationNumber) { return outlinkValuesFilePrefix + "-" + NUMBER_FORMAT.format(iterationNumber) + "-" + NUMBER_FORMAT.format(myNodeIdx); } private static int readVIntFromByteBuffer(ByteBuffer source) { return (int) readVLongFromByteBuffer(source); } private static long readVLongFromByteBuffer(ByteBuffer source) { byte firstByte = source.get(); int len = WritableUtils.decodeVIntSize(firstByte); if (len == 1) { return firstByte; } long i = 0; for (int idx = 0; idx < len - 1; idx++) { byte b = source.get(); i = i << 8; i = i | (b & 0xFF); } return (WritableUtils.isNegativeVInt(firstByte) ? 
(i ^ -1L) : i); } public static final class PRValueMap { private static final int RANGE_ITEM_SIZE = 20; private static final int RANGE_FP_OFFSET = 8; private static final int RANGE_POS_OFFSET = 16; private File rangeFilePath = null; private ByteBuffer valueFileBuffer = null; private ByteBuffer rangeFileBuffer = null; private int rangeItemCount = 0; public PRValueMap() { } public void open(FileSystem fs, Path valueFilePath, Path rangeFilePath) throws IOException { LOG.info("OPENING PRValueMap - Available Memory:" + Runtime.getRuntime().freeMemory() + " TotalMemory:" + Runtime.getRuntime().totalMemory()); FileStatus valueFileStatus = fs.getFileStatus(valueFilePath); FileStatus rangeFileStatus = fs.getFileStatus(rangeFilePath); if (valueFileStatus == null) { LOG.error("Value File at Path:" + valueFilePath + " not Found!"); throw new FileNotFoundException(); } if (rangeFileStatus == null) { LOG.error("Range File at Path:" + rangeFilePath + " not Found!"); throw new FileNotFoundException(); } FSDataInputStream valueFile = null; FSDataInputStream rangeFile = null; try { LOG.info("Create R/W Random Access File for values Path:" + valueFilePath); valueFile = fs.open(valueFilePath); LOG.info("Create R-ONLY Random Access File for range Path:" + rangeFilePath); rangeFile = fs.open(rangeFilePath); LOG.info("Allocating R/W Buffer of Size:" + valueFileStatus.getLen() + " for Value File" + " Available Memory:" + Runtime.getRuntime().freeMemory()); JVMStats.dumpMemoryStats(); byte[] valueMapData = new byte[(int) valueFileStatus.getLen()]; //this.valueFileBuffer = ByteBuffer.allocate((int) valueFileStatus.getLen() ); this.valueFileBuffer = ByteBuffer.wrap(valueMapData); LOG.info("Loading R/W Buffer From Value File"); long loadStart = System.currentTimeMillis(); for (int offset = 0, totalRead = 0; offset < valueFileBuffer.capacity();) { int bytesToRead = Math.min(16384, valueFileBuffer.capacity() - totalRead); valueFile.read(valueFileBuffer.array(), offset, bytesToRead); offset += bytesToRead; totalRead += bytesToRead; } LOG.info("Load of Value File Buffer Took:" + (System.currentTimeMillis() - loadStart) + " MS"); LOG.info("Mapping R-ONLY Buffer of Size:" + rangeFileStatus.getLen() + " for Range File"); this.rangeFileBuffer = ByteBuffer.allocate((int) rangeFileStatus.getLen()); LOG.info("Loading RangeFile Buffer From Range File"); loadStart = System.currentTimeMillis(); for (int offset = 0, totalRead = 0; offset < rangeFileBuffer.capacity();) { int bytesToRead = Math.min(16384, rangeFileBuffer.capacity() - totalRead); rangeFile.read(rangeFileBuffer.array(), offset, bytesToRead); offset += bytesToRead; totalRead += bytesToRead; } LOG.info("Load of Range File Buffer Took:" + (System.currentTimeMillis() - loadStart) + " MS"); // calculate range item count rangeItemCount = (int) rangeFileStatus.getLen() / RANGE_ITEM_SIZE; } finally { if (valueFile != null) valueFile.close(); if (rangeFile != null) rangeFile.close(); } } void flush(OutputStream stream) throws IOException { if (valueFileBuffer != null) { LOG.info("Flushing valueBuffer"); LOG.info("Accessing underlying ByteArray"); valueFileBuffer.position(0); byte array[] = valueFileBuffer.array(); long timeStart = System.currentTimeMillis(); stream.write(array); long timeEnd = System.currentTimeMillis(); LOG.info("ValueBuffer Flush took:" + (timeEnd - timeStart) + " Milliseconds - valueBufferSize:" + valueFileBuffer.limit()); } } void close() throws IOException { LOG.info("CLOSING PRValueMap"); valueFileBuffer = null; rangeFileBuffer = null; } enum GetSetOPType 
{ GET, SET, ADD } public final float getPRValue(URLFPV2 urlItem) throws IOException { return getSetPRValue(urlItem, GetSetOPType.GET, 0.0f); } public final void setPRValue(URLFPV2 urlItem, float value) throws IOException { getSetPRValue(urlItem, GetSetOPType.SET, value); } public final void addPRValue(URLFPV2 urlItem, float value) throws IOException { getSetPRValue(urlItem, GetSetOPType.ADD, value); } public void zeroValues() throws IOException { valueFileBuffer.position(0); while (valueFileBuffer.position() < valueFileBuffer.limit()) { valueFileBuffer.getLong(); //TODO: SWITCH TO INT FOR TEST // valueFileBuffer.putShort((short)0); valueFileBuffer.putFloat(0.0f); } } // TODO: SWITCH TO INT FOR TEST // static Map<Long,Short> debugMap = new TreeMap<Long,Short>(); static Map<Long, Float> debugMap = new TreeMap<Long, Float>(); public void finalizePageRank() throws IOException { valueFileBuffer.position(0); int itemCount = 0; while (valueFileBuffer.position() < valueFileBuffer.limit()) { long fingerprint = valueFileBuffer.getLong(); valueFileBuffer.mark(); //TODO: SWITCH TO INT FOR TEST // int accumulatedRank = valueFileBuffer.getShort(); float accumulatedRank = valueFileBuffer.getFloat(); // TODO: hack use default pr formula for now ... float finalRank = (.150f + (.85f * (float) accumulatedRank)); valueFileBuffer.reset(); valueFileBuffer.putFloat(finalRank); } } final float getSetPRValue(URLFPV2 urlItem, GetSetOPType opType, float valueIn) throws IOException { //long timeStart = System.currentTimeMillis(); int rangeIdx = findRangePosition(urlItem); //long timeEnd = System.currentTimeMillis(); if (rangeIdx == -1) { throw new IOException("Unable to locate PR Value for domain:" + urlItem.getDomainHash() + " fingerprint:" + urlItem.getUrlHash()); } //DBG if (1 == 0) { URLFPV2 rangeFP = new URLFPV2(); populateFPForRange(rangeFileBuffer, rangeFP, rangeIdx); //LOG.info("Range for Domain:" + urlItem.getDomainHash() + " FP:" + urlItem.getUrlHash() + " is Domain:" + rangeFP.getDomainHash() + " FP:" + rangeFP.getUrlHash() ); } //get the search start positon via the range int rangeOffset = rangeFileBuffer.getInt(rangeIdx * RANGE_ITEM_SIZE + RANGE_POS_OFFSET); // now start walking items in range ... //LOG.info("RangeOffset for domain:" + urlItem.getDomainHash() + " fingerprint:" + urlItem.getUrlHash() + " is:" + rangeOffset); // seek to range offset ... valueFileBuffer.position(rangeOffset); //timeStart = System.currentTimeMillis(); // walk up to max number of items in range ... for (int itemIdx = 0; itemIdx < VALUES_PER_RANGE; ++itemIdx) { // read the urlf fp ... long urlFPValue = valueFileBuffer.getLong(); if (urlItem.getUrlHash() == urlFPValue) { //timeEnd = System.currentTimeMillis(); ///LOG.info("Scan took:" + (timeEnd-timeStart)); if (opType == GetSetOPType.SET) { valueFileBuffer.putFloat(valueIn); return 0; } else if (opType == GetSetOPType.GET) { return valueFileBuffer.getFloat(); } else { // ADD valueFileBuffer.mark(); float value = valueFileBuffer.getFloat(); valueFileBuffer.reset(); valueFileBuffer.putFloat((Math.min(value + valueIn, Float.MAX_VALUE))); return 0; } } // otherwise skip the value ... else { valueFileBuffer.getFloat(); } // if we reached trailing end of buffer ... we are done if (valueFileBuffer.remaining() == 0) { throw new IOException("Reached end of Value Buffer Looking for Value"); } } //this is bad news... 
dump context info for debug purposes before throwing exception LOG.error("Reached End of Range looking for PRValue for FP:" + urlItem.getUrlHash()); URLFPV2 rangeFPDBG = new URLFPV2(); populateFPForRange(rangeFileBuffer, rangeFPDBG, rangeIdx); LOG.error("Closest Range Was Index:" + rangeIdx + " DomainHash:" + rangeFPDBG.getDomainHash() + " URLHash:" + rangeFPDBG.getUrlHash()); if (rangeIdx + 1 < this.rangeItemCount) { populateFPForRange(rangeFileBuffer, rangeFPDBG, rangeIdx + 1); LOG.error("Range At Index:" + (rangeIdx + 1) + " DomainHash:" + rangeFPDBG.getDomainHash() + " URLHash:" + rangeFPDBG.getUrlHash()); } LOG.error("Dumping Next 600 bytes at offset:" + rangeOffset); /* // re-seek to range offset ... valueFileBuffer.position(rangeOffset); LOG.error("\n" + dumpAsHex(valueFileBuffer, Math.min(600,valueFileBuffer.remaining()))); */ LOG.error("Dumping Values:"); // re-seek to range offset ... valueFileBuffer.position(rangeOffset); // walk up to max number of items in range ... for (int itemIdx = 0; itemIdx < VALUES_PER_RANGE && valueFileBuffer.remaining() != 0; ++itemIdx) { // read the urlf fp ... long urlFPValue = valueFileBuffer.getLong(); // and the value float value = valueFileBuffer.getFloat(); LOG.error("Item:" + itemIdx + " FP:" + urlFPValue + " Value:" + value); } LOG.error("Dump Complete"); throw new IOException("Reached the End of Range looking for designated PRValue"); } private static final int HEX_CHARS_PER_LINE = 32; public String dumpAsHex(ByteBuffer data, int amount) { StringBuffer buf = new StringBuffer(amount << 1); int k = 0; int flen = amount; char hexBuffer[] = new char[HEX_CHARS_PER_LINE * 2 + (HEX_CHARS_PER_LINE - 1) + 2]; char asciiBuffer[] = new char[HEX_CHARS_PER_LINE + 1]; hexBuffer[hexBuffer.length - 1] = 0; asciiBuffer[asciiBuffer.length - 1] = 0; for (int i = 0; i < flen; i++) { int j = data.get() & 0xFF; hexBuffer[k * 3] = Character.forDigit((j >>> 4), 16); hexBuffer[k * 3 + 1] = Character.forDigit((j & 0x0F), 16); hexBuffer[k * 3 + 2] = ' '; if (j < 0x20) asciiBuffer[k] = '.'; else if (k < 0x78) asciiBuffer[k] = (char) j; else asciiBuffer[k] = '?'; k++; if (k % HEX_CHARS_PER_LINE == 0) { hexBuffer[hexBuffer.length - 2] = 0; buf.append(hexBuffer); buf.append(" "); buf.append(asciiBuffer); buf.append('\n'); k = 0; } } if (k != 0) { hexBuffer[k * 3 + 1] = 0; asciiBuffer[k] = 0; buf.append(hexBuffer); buf.append(" "); buf.append(asciiBuffer); buf.append('\n'); } return buf.toString(); } int getRangeOffsetFromRangeIndex(int rangeIndex) { return rangeFileBuffer.getInt(rangeIndex * RANGE_ITEM_SIZE + RANGE_POS_OFFSET); } static final void populateFPForRange(ByteBuffer sourceBuffer, URLFPV2 placeHolder, int rangeIndex) { placeHolder.setDomainHash(sourceBuffer.getLong(rangeIndex * RANGE_ITEM_SIZE)); placeHolder.setUrlHash(sourceBuffer.getLong(rangeIndex * RANGE_ITEM_SIZE + RANGE_FP_OFFSET)); } final int findRangePosition(URLFPV2 searchTerm) { long searchDomainHash = searchTerm.getDomainHash(); long searchURLHash = searchTerm.getUrlHash(); int low = 0; int high = rangeItemCount - 1; while (low <= high) { int mid = low + ((high - low) / 2); long currentDomainHash = rangeFileBuffer.getLong(mid * RANGE_ITEM_SIZE); int result = (currentDomainHash < searchDomainHash ? -1 : (currentDomainHash == searchDomainHash ? 0 : 1)); if (result == 0) { long currentURLHash = rangeFileBuffer.getLong(mid * RANGE_ITEM_SIZE + RANGE_FP_OFFSET); result = (currentURLHash < searchURLHash ? -1 : (currentURLHash == searchURLHash ? 
0 : 1)); } //comparisonFP.setDomainHash(rangeFileBuffer.getLong(mid*RANGE_ITEM_SIZE)); //comparisonFP.setUrlHash(rangeFileBuffer.getLong(mid*RANGE_ITEM_SIZE + RANGE_FP_OFFSET)); // populateFPForRange(rangeFileBuffer, comparisonFP, mid); if (result > 0) high = mid - 1; else if (result < 0) low = mid + 1; else return mid; // found } if (high < rangeItemCount) return high; return -1; // not found } void dumpRangeItems() { RandomAccessFile rangeFileObj = null; PRRangeItem item = new PRRangeItem(); try { rangeFileObj = new RandomAccessFile(rangeFilePath, "r"); for (int i = 0; i < rangeItemCount; ++i) { item.clear(); item.readFields(rangeFileObj); LOG.info("Range Item:" + i + " Domain:" + item.getDomainStart() + " FPStart:" + item.getUrlFPStart() + " Offset:" + item.getStartPos()); } rangeFileBuffer.position(0); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } finally { if (rangeFileObj != null) { try { rangeFileObj.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } } static int findPos(int[] array, int searchTerm) { int low = 0; int high = array.length - 1; while (low <= high) { int mid = low + ((high - low) / 2); // Note: not (low + high) / 2 !! if (array[mid] > searchTerm) high = mid - 1; else if (array[mid] < searchTerm) low = mid + 1; else return mid; // found } if (high < array.length) return high; return -1; // not found } private static interface PRValueOutputStream { void writePRValue(URLFPV2 targetFP, URLFPV2 sourceFP, float prValue) throws IOException; void close(boolean deleteUnderlyingFile) throws IOException; } private static class PRSequenceFileOutputStream implements PRValueOutputStream { FileSystem _fileSystem; Path _path; SequenceFile.Writer _writer = null; DataOutputBuffer _outputWriter = new DataOutputBuffer(); FlexBuffer _buffer = new FlexBuffer(); public PRSequenceFileOutputStream(Configuration conf, FileSystem fs, Path path) throws IOException { _fileSystem = fs; _path = path; _writer = SequenceFile.createWriter(_fileSystem, conf, path, FlexBuffer.class, NullWritable.class, fs.getConf().getInt("io.file.buffer.size", 4096 * 12), (short) 1, fs.getDefaultBlockSize(), CompressionType.BLOCK, new DefaultCodec(), null, new Metadata()); } @Override public void close(boolean deleteUnderlyingFile) throws IOException { _writer.close(); if (deleteUnderlyingFile) { _fileSystem.delete(_path, false); } } @Override public void writePRValue(URLFPV2 target, URLFPV2 source, float prValue) throws IOException { _outputWriter.reset(); _outputWriter.writeLong(target.getDomainHash()); _outputWriter.writeLong(target.getUrlHash()); _outputWriter.writeLong(source.getRootDomainHash()); _outputWriter.writeLong(source.getDomainHash()); _outputWriter.writeLong(source.getUrlHash()); _outputWriter.writeFloat(prValue); _buffer.set(_outputWriter.getData(), 0, _outputWriter.getLength()); _writer.append(_buffer, NullWritable.get()); } } private static class PROldValueOutputStream implements PRValueOutputStream { PROldValueOutputStream(FileSystem fs, Path path) throws IOException { _targetFS = fs; _path = path; _stream = fs.create(path); } public FileSystem _targetFS; public Path _path; // optional path if this is a remote file public FSDataOutputStream _stream; @Override public void close(boolean deleteUnderlyingFile) throws IOException { if (_stream != null) { _stream.flush(); _stream.close(); _stream = null; } if (deleteUnderlyingFile) { _targetFS.delete(_path, false); } } @Override public void writePRValue(URLFPV2 target, URLFPV2 
source, float prValue) throws IOException { _stream.writeLong(target.getDomainHash()); _stream.writeLong(target.getUrlHash()); _stream.writeLong(source.getRootDomainHash()); _stream.writeLong(source.getDomainHash()); _stream.writeLong(source.getUrlHash()); _stream.writeFloat(prValue); } } public static void purgeNodeDistributionFilesForIteration(FileSystem remoteFS, String remoteOutputPath, int nodeIndex, int nodeCount, int iterationNumber) throws IOException { String fileNamePrefix = getOutlinksBaseName(nodeIndex, iterationNumber); for (int i = 0; i < nodeCount; ++i) { // create output filename String fileName = fileNamePrefix + "-" + NUMBER_FORMAT.format(i); Path remotePath = new Path(remoteOutputPath, fileName); LOG.info("Deleting:" + remotePath); remoteFS.delete(remotePath, true); } } /** * PRValueMultiplexer * multiplexes page rank value distribution * across a set of pre-defined nodes * * @author rana * */ public static class PRValueMultiplexer { EventLoop _eventLoop = null; Vector<InetSocketAddress> _slaveAddressList; LinkedList<PRValueBlockWriter> _activeWriters = new LinkedList<PRValueBlockWriter>(); PRValueBlockWriter _failedWriter = null; int _myNodeIndex; Configuration _conf; long _jobId; int _iterationNumber; boolean _failed = false; int _completionCount = 0; int _nodeCount = 0; /** * construct a PRValueMultiplexer * * @param conf * @param jobId * @param iterationNumber * @param slaveAddressList * @param myNodeIndex * @throws IOException */ public PRValueMultiplexer(Configuration conf, long jobId, int iterationNumber, Vector<InetSocketAddress> slaveAddressList, int myNodeIndex) throws IOException { LOG.info("PRValueMultiplexer initialized. SlaveAddress List Size:" + slaveAddressList.size() + " JobID:" + jobId + " myNodeId:" + myNodeIndex); _slaveAddressList = slaveAddressList; _myNodeIndex = myNodeIndex; _conf = conf; _jobId = jobId; _iterationNumber = iterationNumber; _nodeCount = slaveAddressList.size(); // start event loop ... _eventLoop = new EventLoop(); _eventLoop.start(); try { createWriters(); } catch (IOException e) { _failed = true; LOG.error("Got Exception opening BlockWriters"); closeAllWriters(); throw e; } } /** * close the multiplexer, and optionally wait and flush all streams * @param forced - if false, block for all streams to complete * @return true if failure condition */ public boolean close(boolean forced) { // if not a forced close ... and we are not in a failure condition ... if (!forced && !_failed && _completionCount != _slaveAddressList.size()) { LOG.info("Setting up Poll Loop to monitor for clean shutdown"); // create a semaphore to block on final Semaphore blockingSemaphore = new Semaphore(0); // set up a poll loop to monitor writers ... _eventLoop.setTimer(new Timer(10, true, new Timer.Callback() { @Override public void timerFired(Timer timer) { if (_failed || _completionCount == _slaveAddressList.size()) { if (_failed) { LOG.error("Poll loop detected Failure. Shutting Down"); } else { LOG.info("Poll loop detected completion. Shutting Down"); } // release semaphore ... blockingSemaphore.release(); // cancel timer ... _eventLoop.cancelTimer(timer); } } })); // ok now wait for completion ... blockingSemaphore.acquireUninterruptibly(); } // a forced close is explicit, meaning just teardown everything ... if (forced) { _failed = true; } // ok close all writes closeAllWriters(); // ok finally shutdown the event loop _eventLoop.stop(); return _failed; } /** * write a page rank value to the appropriate stream ... * this method could block ... 
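* (The destination writer is chosen by the caller-supplied targetNode index; the commented-out hash partitioning in the method body shows the originally intended target.hashCode() % nodeCount scheme.)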
* @param target * @param source * @param prValue * @throws IOException */ public void writePRValue(int targetNode, URLFPV2 targetFP, URLFPV2 sourceFP, float prValue) throws IOException { if (_failed) { throw new IOException("Multiplexer in Failed State!"); } // figure out which stream this entry belongs to ... //int nodeIndex = (target.hashCode() & Integer.MAX_VALUE) % _nodeCount; // write directly to the proper block writer PRValueBlockWriter writer = null; synchronized (_activeWriters) { if (_activeWriters.size() != 0) writer = _activeWriters.get(targetNode); } if (writer != null) { writer.writePRValue(targetFP, sourceFP, prValue); } else { LOG.error("No Writer Found for nodexIndex:" + targetNode); } } /** * * create the block writers * @throws IOException */ void createWriters() throws IOException { int targetSlaveIndex = 0; for (InetSocketAddress targetSlaveAddress : _slaveAddressList) { LOG.info("Creating Writer for:" + targetSlaveAddress); PRValueBlockWriter prValueWriter = new PRValueBlockWriter(this, _conf, _jobId, targetSlaveAddress, targetSlaveIndex++, _myNodeIndex, _iterationNumber); synchronized (_activeWriters) { _activeWriters.add(prValueWriter); } } } void writerFailed(final PRValueBlockWriter writer, final IOException reason) { LOG.info("Writer Failed Callback for writer:" + writer._targetSlaveAddress); _failed = true; // fail this in the context of the async thread _eventLoop.setTimer(new Timer(0, false, new Timer.Callback() { @Override public void timerFired(Timer timer) { LOG.error("Writer for Slave:" + writer._targetSlaveAddress + " failed with exception:" + CCStringUtils.stringifyException(reason)); _failedWriter = writer; closeAllWriters(); } })); } void writerDone(final PRValueBlockWriter writer) { LOG.info("Writer:" + writer._targetSlaveAddress + " done"); synchronized (this) { _completionCount++; } writer.close(); } void closeAllWriters() { LOG.info("Multiplexer: Closing all Writers"); ImmutableList<PRValueBlockWriter> writers = null; synchronized (_activeWriters) { writers = new ImmutableList.Builder().addAll(_activeWriters).build(); } for (PRValueBlockWriter writer : writers) { writer.close(); } // clear list synchronized (_activeWriters) { _activeWriters.clear(); } } } /** * Individual Node PageRank Value Stream Writer * * @author rana * */ static class PRValueBlockWriter implements AsyncClientChannel.ConnectionCallback { PRValueMultiplexer _multiplexer; ByteBuffer _outputBuffer = null; byte[] _outputArray = null; LinkedBlockingQueue<ByteBuffer> _packetQueue = new LinkedBlockingQueue<ByteBuffer>(MAX_PACKETS_ENQUEUED); CRC32 _crc32 = new CRC32(); int _itemCount = 0; LzoCodec _codec = new LzoCodec(); InetSocketAddress _targetSlaveAddress; int _targetSlaveIndex; int _sourceSlaveIndex; int _iterationNumber; String _targetFileName; FileInfo _fileInfo = new FileInfo(); long _lastBlockId = 0; // set when no more data is expected .. boolean _done = false; // slave communication related code ... 
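/*
 * Writer lifecycle (a summary of the code below, for reference):
 *  1. The constructor opens an AsyncClientChannel to the target slave, blocks on
 *     _blockingCallSemaphore until the connect callback fires, then issues the
 *     createJobFile RPC to obtain the fileId recorded in _fileInfo.
 *  2. writePRValue() packs fixed-size records (target domain/url hash, source root
 *     domain/domain/url hash, float rank) into _outputBuffer and calls flush() once
 *     RECORDS_PER_BLOCK records have accumulated.
 *  3. flush() LZO-compresses the buffer, prepends a block header (sync bytes, CRC32,
 *     compressed/uncompressed lengths) and enqueues the packet on the bounded
 *     _packetQueue, which applies back pressure to the producer.
 *  4. queuePollEvent() runs on the multiplexer's event loop, draining the queue via
 *     transferBlock RPCs; once done() has been called and the queue is empty it
 *     issues commitFile and notifies the multiplexer via writerDone().
 */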
AsyncClientChannel _channel; PageRankSlaveServer.AsyncStub _asyncStub; Semaphore _blockingCallSemaphore = null; IOException _lastIOException = null; String _logLinePrefix; private void log(boolean isError, String message) { if (isError) LOG.error(_logLinePrefix + message); else LOG.info(_logLinePrefix + message); } public PRValueBlockWriter(PRValueMultiplexer multiplexer, Configuration conf, long jobId, InetSocketAddress targetSlaveAddress, int targetSlaveIndex, int sourceSlaveIndex, int iterationNumber) throws IOException { _multiplexer = multiplexer; _outputBuffer = allocateNewBuffer(); _codec.setConf(conf); _targetSlaveAddress = targetSlaveAddress; _targetSlaveIndex = targetSlaveIndex; _sourceSlaveIndex = sourceSlaveIndex; _iterationNumber = iterationNumber; _logLinePrefix = "[TGT:" + targetSlaveIndex + " Addr:" + _targetSlaveAddress + "]"; _blockingCallSemaphore = new Semaphore(0); log(false, "Connecting to slave at index:" + _targetSlaveIndex + " endPoint:" + _targetSlaveAddress); _channel = new AsyncClientChannel(_multiplexer._eventLoop, null, _targetSlaveAddress, this); _channel.open(); _asyncStub = new PageRankSlaveServer.AsyncStub(_channel); log(false, "Waiting on Connect... "); _blockingCallSemaphore.acquireUninterruptibly(); log(false, "Connect Semaphore Released... "); if (!_channel.isOpen()) { log(true, "Connection Failed!"); throw new IOException("Connection Failed!"); } _targetFileName = getOutlinksBaseName(_sourceSlaveIndex, _iterationNumber) + "-" + NUMBER_FORMAT.format(_targetSlaveIndex); _fileInfo.setFileName(_targetFileName); _fileInfo.setJobId(jobId); log(false, "Sending Open File Command For Target:" + _targetSlaveAddress); sendOpenFileCommand(); } /** * Enqueue a page-rank value into this stream * * @param target * @param source * @param prValue * @throws IOException */ public void writePRValue(URLFPV2 target, URLFPV2 source, float prValue) throws IOException { _outputBuffer.putLong(target.getDomainHash()); _outputBuffer.putLong(target.getUrlHash()); _outputBuffer.putLong(source.getRootDomainHash()); _outputBuffer.putLong(source.getDomainHash()); _outputBuffer.putLong(source.getUrlHash()); _outputBuffer.putFloat(prValue); if (++_itemCount == RECORDS_PER_BLOCK) { // flush flush(); } } /** * mark this stream as compelte */ public void done() { // mark the stream as complete ... _done = true; // the poll thread will } void queuePollEvent() { // start the poll timer ... _multiplexer._eventLoop.setTimer(new Timer(10, false, new Timer.Callback() { @Override public void timerFired(Timer timer) { if (_lastIOException == null && _channel.isOpen()) { ByteBuffer nextPacket = _packetQueue.poll(); if (nextPacket != null) { // ok we got a packet ... send it log(false, "got packet via poll"); BlockTransfer tranfserRequest = new BlockTransfer(); tranfserRequest.setBlockData(new FlexBuffer(nextPacket.array(), 0, nextPacket.limit())); tranfserRequest.setBlockId(_lastBlockId++); tranfserRequest.setFileId(_fileInfo.getFileId()); try { log(false, "Calling transferBlock RPC"); _asyncStub.transferBlock(tranfserRequest, new Callback<BlockTransfer, BlockTransferAck>() { @Override public void requestComplete( AsyncRequest<BlockTransfer, BlockTransferAck> request) { log(false, "transferBlock RPC Returned with Status:" + request.getStatus()); if (request.getStatus() == Status.Success) { // queue next poll event ... 
queuePollEvent(); } else { log(true, "transferBlock Failed!"); failed(new IOException("Transfer Block Failed!")); } } }); } catch (IOException e) { log(true, CCStringUtils.stringifyException(e)); // mark this stream as done ... failed(e); } } else { // check to see if we are done if (_done) { try { log(false, "Sending commitFile RPC"); _asyncStub.commitFile(_fileInfo, new Callback<FileInfo, NullMessage>() { @Override public void requestComplete(AsyncRequest<FileInfo, NullMessage> request) { log(false, "commitFile RPC returned with Status:" + request.getStatus()); if (request.getStatus() == Status.Success) { _multiplexer.writerDone(PRValueBlockWriter.this); } } }); } catch (IOException e) { log(true, CCStringUtils.stringifyException(e)); } } else { queuePollEvent(); } } } else { failed(null); } } })); } void sendOpenFileCommand() throws IOException { log(false, "sending createJobFile RPC"); _asyncStub.createJobFile(_fileInfo, new Callback<FileInfo, FileInfo>() { @Override public void requestComplete(AsyncRequest<FileInfo, FileInfo> request) { log(false, "createJobFile RPC returned with Status:" + request.getStatus()); if (request.getStatus() == Status.Success) { log(false, "Create File Successfull!!"); _fileInfo.setFileId(request.getOutput().getFileId()); // start polling log(false, "Polling for Data Packets"); queuePollEvent(); } else { // indicate a failure condition ... failed(new IOException("File Open Failed for Slave:" + _targetSlaveAddress)); } } }); } /** * indicate a failure condition * @param e */ private void failed(IOException e) { log(true, "failed called with Exception:" + CCStringUtils.stringifyException(e)); if (e != null) { _lastIOException = e; } // inform the multiplexer of the error ... _multiplexer.writerFailed(this, _lastIOException); } public void close() { log(false, "close called channel is:" + _channel + " packetQueue size is:" + _packetQueue.size()); if (_channel != null) { try { _channel.close(); } catch (IOException e) { e.printStackTrace(); } _channel = null; } // dump packets on the floor _packetQueue.clear(); } private ByteBuffer allocateNewBuffer() { return ByteBuffer.allocate(BLOCK_HEADER_SIZE + RECORD_BYTE_SIZE * RECORDS_PER_BLOCK + PADDING); } private static final int MAX_PACKETS_ENQUEUED = 5; private static final int RECORD_BYTE_SIZE = 48; // EACH RECORD IS 48 bytes long ... private static final int RECORDS_PER_BLOCK = (2 ^ 12); // 4096 records per block... private static final int SYNC_ESCAPE = -1; // "length" of sync entries private static final int SYNC_ESCAPE_SIZE = 4; // "length" of sync entries private static final byte SYNC_BYTES[] = { 'S', 'Y', 'N', 'C', 'B', 'Y', 'T', 'E' }; // sync bytes size ... private static final int BLOCK_SYNC_BYTE_SIZE = SYNC_ESCAPE_SIZE + SYNC_BYTES.length; // escape + hash; // block CRC LENGTH private static final int BLOCK_CRC_FIELD_SIZE = 8; // block LENGTH private static final int BLOCK_COMPRESSED_LENGTH_FIELD_SIZE = 4; // block LENGTH private static final int BLOCK_UNCOMPRESSED_LENGTH_FIELD_SIZE = 4; // PADDING FOR COMPRESSOR private static final int PADDING = 2 ^ 8; // block header size ... private static final int BLOCK_HEADER_SIZE = BLOCK_SYNC_BYTE_SIZE + BLOCK_CRC_FIELD_SIZE + BLOCK_COMPRESSED_LENGTH_FIELD_SIZE + BLOCK_UNCOMPRESSED_LENGTH_FIELD_SIZE; void flush() throws IOException { log(false, "flush called"); if (!_channel.isOpen() || _lastIOException != null) { log(true, "Invalid State. Connection Already Closed!"); throw new IOException("Connection Already Closed!"); } // queue packet for send ... 
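/*
 * Packet layout produced below (field sizes match the BLOCK_* constants above):
 *   int     SYNC_ESCAPE (-1)
 *   byte[8] SYNC_BYTES ("SYNCBYTE")
 *   long    CRC32 computed over the uncompressed record data
 *   int     compressed payload length
 *   int     uncompressed payload length
 *   byte[]  LZO-compressed payload of 48-byte records
 * The receiver in this file appends each packet to the job file unchanged.
 * Note: RECORDS_PER_BLOCK and PADDING above are written as (2 ^ 12) and (2 ^ 8);
 * '^' is XOR in Java, so they evaluate to 14 and 10 rather than the 4096 and 256
 * their comments suggest.
 */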
if (_outputBuffer.position() != 0) { // create compressed buffer object .. ByteBuffer compressedBuffer = allocateNewBuffer(); // skip header ... compressedBuffer.position(BLOCK_HEADER_SIZE); // create output stream based on bytebuffer OutputStream compressedDataOutputStream = newOutputStream(compressedBuffer); // ok ... now compress the block CompressionOutputStream codecStream = _codec.createOutputStream(compressedDataOutputStream); // compress data .. codecStream.write(_outputBuffer.array(), 0, _outputBuffer.position()); // flush it ... codecStream.close(); // compute crc ... _crc32.reset(); _crc32.update(_outputBuffer.array(), 0, _outputBuffer.position()); // remember compressed buffer size .. int compressedBufferSize = compressedBuffer.position() - BLOCK_HEADER_SIZE; // ok write out header ... compressedBuffer.position(0); // write sync bytes into header ... compressedBuffer.putInt(SYNC_ESCAPE); // and write sync bytes compressedBuffer.put(SYNC_BYTES, 0, SYNC_BYTES.length); // write crc ... compressedBuffer.putLong(_crc32.getValue()); // write compressed length and uncompressed length... compressedBuffer.putInt(compressedBufferSize); compressedBuffer.putInt(_outputBuffer.position()); // and put it in queue ... compressedBuffer.position(compressedBufferSize + BLOCK_HEADER_SIZE); // flip it .. compressedBuffer.flip(); log(false, "queueing packet. Item Count:" + _itemCount + " UncompressedSize:" + _outputBuffer.position() + " CompressedSize:" + compressedBuffer.limit()); // add it to queue try { _packetQueue.put(compressedBuffer); } catch (InterruptedException e) { } // get new output buffer ... _outputBuffer.position(0); // reset item count _itemCount = 0; } } private static OutputStream newOutputStream(final ByteBuffer buf) { return new OutputStream() { @Override public void write(int b) throws IOException { buf.put((byte) (b & 0xff)); } public void write(byte src[], int off, int len) throws IOException { buf.put(src, off, len); } }; } @Override public void OutgoingChannelConnected(AsyncClientChannel channel) { LOG.info("OutgoingChannelConnected... "); if (_blockingCallSemaphore != null) { _blockingCallSemaphore.release(); } } @Override public boolean OutgoingChannelDisconnected(AsyncClientChannel channel) { LOG.info("OutgoingChannelDisconnected... "); try { // explicitly close the channel! _channel.close(); } catch (IOException e) { } _lastIOException = new IOException("Disconnected from slave"); if (_blockingCallSemaphore != null) { _blockingCallSemaphore.release(); } else { failed(_lastIOException); } return false; } } /** * Helper Class that encapsulates Block Receiving Logic for Slave Servers * * @author rana * */ static class PRValueBlockFileReceiver { // the active job id private long _jobId; // the fully qualified job storage path ... private File _jobFileLocalPath; // immediate shutdown flag ... private boolean _immediateShutdown = false; /** * */ public PRValueBlockFileReceiver(long jobId, File jobFileLocalPath) { _jobId = jobId; _jobFileLocalPath = jobFileLocalPath; startBlockWriter(); } /** * shutdown the block receiver * either in an orderly manner or immediately * @param orderly in an orderly manner (complete queued requests) or immediately */ public void shutdown(boolean orderly) throws IOException { if (_blockWriter != null) { // create a shutdown request ... BlockRequest request = BlockRequest.shutdownRequest(); // put in queue _blockRequestQueue.add(request); // if immediate, indicate so _immediateShutdown = !orderly; // ok wait for thread to exit ... 
LOG.info("Waiting for BlockWriter Thread Shutdown"); try { _blockWriter.join(); } catch (InterruptedException e) { } LOG.info("BlockWriter Thread Exited"); // ok reset state ... _immediateShutdown = false; _blockWriter = null; } } File getActiveJobLocalPath() { return _jobFileLocalPath; } long getJobId() { return _jobId; } private static class BlockRequest<DataType extends RPCStruct, ResultType> { enum BlockRequestType { FILE_CREATE, BLOCK_WRITE, FILE_COMMIT, PURGE, SHUTDOWN } AsyncContext _context; CallbackWithResult<BlockRequest<DataType, ResultType>> _callback; DataType _data; BlockRequestType _type; ResultType _result; public static BlockRequest<FileInfo, Long> createFileRequest(AsyncContext context, FileInfo fileInfo, CallbackWithResult<BlockRequest<FileInfo, Long>> callback) throws IOException { return new BlockRequest<FileInfo, Long>(context, BlockRequestType.FILE_CREATE, fileInfo, callback, 0L); } public static BlockRequest<FileInfo, Boolean> commitFileRequest(AsyncContext context, FileInfo fileInfo, CallbackWithResult<BlockRequest<FileInfo, Boolean>> callback) throws IOException { return new BlockRequest<FileInfo, Boolean>(context, BlockRequestType.FILE_COMMIT, fileInfo, callback, false); } public static BlockRequest<BlockTransfer, Boolean> blockTransferRequest(AsyncContext context, BlockTransfer blockInfo, CallbackWithResult<BlockRequest<BlockTransfer, Boolean>> callback) throws IOException { return new BlockRequest<BlockTransfer, Boolean>(context, BlockRequestType.BLOCK_WRITE, blockInfo, callback, false); } public static BlockRequest<NullMessage, Boolean> purgeRequest(AsyncContext context, NullMessage nullMessage, CallbackWithResult<BlockRequest<NullMessage, Boolean>> callback) throws IOException { return new BlockRequest<NullMessage, Boolean>(context, BlockRequestType.PURGE, null, callback, false); } public static BlockRequest<NullMessage, Boolean> shutdownRequest() throws IOException { return new BlockRequest<NullMessage, Boolean>(null, BlockRequestType.SHUTDOWN, null, null, false); } public BlockRequest(AsyncContext context, BlockRequestType type, DataType data, CallbackWithResult<BlockRequest<DataType, ResultType>> callback, ResultType defaultResultValue) throws IOException { _context = context; _type = type; _data = data; _callback = callback; _result = defaultResultValue; } } Thread _blockWriter = null; LinkedBlockingQueue<BlockRequest> _blockRequestQueue = new LinkedBlockingQueue<BlockRequest>(); long _lastFileId = 0; static class ActiveFile { ActiveFile(File file, RandomAccessFile stream, long fileId) { _file = file; _stream = stream; _fileId = fileId; } File _file; RandomAccessFile _stream; long _fileId; } TreeMap<Long, ActiveFile> _activeFilesMap = new TreeMap<Long, ActiveFile>(); void startBlockWriter() { _blockWriter = new Thread(new Runnable() { @SuppressWarnings("unchecked") @Override public void run() { LOG.info("BlockWriter Thread Running... "); try { while (true) { try { BlockRequest request = _blockRequestQueue.take(); if (_immediateShutdown || request._type == BlockRequest.BlockRequestType.PURGE || request._type == BlockRequest.BlockRequestType.SHUTDOWN) { LOG.info("Got Shutdown Or Purge Request... Closing existing connections"); purgeOpenFiles(); if (_immediateShutdown || request._type == BlockRequest.BlockRequestType.SHUTDOWN) { LOG.info("Received Shutdown Request. 
Existing Thread"); break; } } else { if (request._type == BlockRequest.BlockRequestType.FILE_CREATE) { BlockRequest<FileInfo, Long> typedRequest = (BlockRequest<FileInfo, Long>) request; LOG.info("Got Block File Create Request for Path:" + typedRequest._data.getFileName()); // create the actual file ... File basePath = getActiveJobLocalPath(); File path = new File(basePath, typedRequest._data.getFileName()); // try to create a file from scratch ... try { RandomAccessFile stream = new RandomAccessFile(path, "rw"); ActiveFile activeFile = new ActiveFile(path, stream, ++_lastFileId); _activeFilesMap.put(activeFile._fileId, activeFile); typedRequest._result = activeFile._fileId; LOG.info("Created Block File at Path:" + path + " FileId:" + activeFile._fileId); // ok return to caller } catch (IOException e) { typedRequest._result = 0L; LOG.error("Error Creating Block File:" + path + ":" + CCStringUtils.stringifyException(e)); } finally { // initiate callback typedRequest._callback.execute(typedRequest); } } else if (request._type == BlockRequest.BlockRequestType.FILE_COMMIT) { BlockRequest<FileInfo, Boolean> typedRequest = (BlockRequest<FileInfo, Boolean>) request; LOG.info( "Got Commit Request for FileId::" + typedRequest._data.getFileId()); // expect failure typedRequest._result = false; // try to access the file try { ActiveFile activeFile = _activeFilesMap .get(typedRequest._data.getFileId()); if (activeFile != null) { LOG.info("Committing File: " + activeFile._file + " Id:" + activeFile._fileId); if (activeFile._stream != null) { try { activeFile._stream.close(); typedRequest._result = true; } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } _activeFilesMap.remove(activeFile._fileId); } else { LOG.error("No Active File Found for Id:" + typedRequest._data.getFileId()); } } finally { // initiate callback typedRequest._callback.execute(typedRequest); } } else if (request._type == BlockRequest.BlockRequestType.BLOCK_WRITE) { BlockRequest<BlockTransfer, Boolean> typedRequest = (BlockRequest<BlockTransfer, Boolean>) request; LOG.info("Got Block Transfer Request for FileId:" + typedRequest._data.getFileId() + " ByteCount:" + typedRequest._data.getBlockData().getCount()); // expect failure typedRequest._result = false; // try to access the file try { ActiveFile activeFile = _activeFilesMap .get(typedRequest._data.getFileId()); if (activeFile != null) { LOG.info("Writing: " + typedRequest._data.getBlockData().getCount() + " Bytes to File: " + activeFile._file + " Id:" + activeFile._fileId); if (activeFile._stream != null) { try { activeFile._stream.write( typedRequest._data.getBlockData() .getReadOnlyBytes(), 0, typedRequest._data.getBlockData().getCount()); typedRequest._result = true; } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } _activeFilesMap.remove(activeFile._fileId); } else { LOG.error("No Active File Found for Id:" + typedRequest._data.getFileId()); } } finally { // initiate callback typedRequest._callback.execute(typedRequest); } } } } catch (InterruptedException e) { } } } finally { LOG.info("Block Writer Thread Exiting"); } } }); _blockWriter.start(); } void purgeOpenFiles() { for (ActiveFile file : _activeFilesMap.values()) { if (file._stream != null) { try { file._stream.close(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } file._file.delete(); } } _activeFilesMap.clear(); } public void createJobFile(final AsyncContext<FileInfo, FileInfo> rpcContext) throws RPCException { try { if (getJobId() != 
rpcContext.getInput().getJobId()) { throw new IOException("Invalid Job Config or Invalid Job Id!"); } LOG.info("Got createJobFile RPC. Path:" + rpcContext.getInput().getFileName()); // default to failure status ... rpcContext.setStatus(Status.Error_RequestFailed); try { BlockRequest request = BlockRequest.createFileRequest(rpcContext, rpcContext.getInput(), new CallbackWithResult<BlockRequest<FileInfo, Long>>() { @Override public void execute(BlockRequest<FileInfo, Long> requestObject) { try { LOG.info("Received callback for createFile:" + requestObject._data.getFileName() + " Result:" + requestObject._result); // ok request was successfull ... if (requestObject._result != 0L) { // write was successfull ... rpcContext.getOutput().setFileId(rpcContext.getInput().getFileId()); rpcContext.getOutput().setFileId(requestObject._result); rpcContext.setStatus(Status.Success); } } finally { try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } } }); _blockRequestQueue.put(request); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.completeRequest(); } catch (InterruptedException e) { } } catch (IOException e) { rpcContext.setErrorDesc(CCStringUtils.stringifyException(e)); LOG.error(rpcContext.getErrorDesc()); rpcContext.setStatus(Status.Error_RequestFailed); } finally { rpcContext.completeRequest(); } } public void transferBlock(final AsyncContext<BlockTransfer, BlockTransferAck> rpcContext) throws RPCException { LOG.info("Got trasferBlock RPC. FileId:" + rpcContext.getInput().getFileId() + " BufferSize:" + rpcContext.getInput().getBlockData().getCount()); try { BlockRequest request = BlockRequest.blockTransferRequest(rpcContext, rpcContext.getInput(), new CallbackWithResult<BlockRequest<BlockTransfer, Boolean>>() { @Override public void execute(BlockRequest<BlockTransfer, Boolean> requestObject) { try { // ok request was successfull ... if (requestObject._result == true) { // write was successfull ... rpcContext.getOutput().setFileId(rpcContext.getInput().getFileId()); rpcContext.getOutput().setBlockId(rpcContext.getInput().getBlockId()); rpcContext.setStatus(Status.Success); } else { rpcContext.setStatus(Status.Error_RequestFailed); } } finally { try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } } }); _blockRequestQueue.put(request); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.completeRequest(); } catch (InterruptedException e) { } } public void commitFile(final AsyncContext<FileInfo, NullMessage> rpcContext) throws RPCException { LOG.info("Got commitFile RPC. FileId:" + rpcContext.getInput().getFileId()); try { BlockRequest request = BlockRequest.commitFileRequest(rpcContext, rpcContext.getInput(), new CallbackWithResult<BlockRequest<FileInfo, Boolean>>() { @Override public void execute(BlockRequest<FileInfo, Boolean> requestObject) { try { // ok request was successfull ... if (requestObject._result == true) { // write was successfull ... 
rpcContext.setStatus(Status.Success); } else { rpcContext.setStatus(Status.Error_RequestFailed); } } finally { try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } } }); _blockRequestQueue.put(request); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.completeRequest(); } catch (InterruptedException e) { } } } public static class PRValueBlockWriterAndReceiverTester extends Server implements PageRankSlave, AsyncServerChannel.ConnectionCallback { AsyncServerChannel _channel; EventLoop _eventLoop; PRValueBlockFileReceiver _receiver; File _jobLocalPath; PRValueBlockWriterAndReceiverTester(EventLoop eventLoop, int instanceId, int portToUse) throws IOException { _eventLoop = eventLoop; _jobLocalPath = new File("/tmp/prvalue_receiver_test/" + instanceId); InetSocketAddress localAddress = new InetSocketAddress("localhost", 0); InetSocketAddress address = new InetSocketAddress("localhost", portToUse); _channel = new AsyncServerChannel(this, _eventLoop, address, this); registerService(_channel, PageRankSlave.spec); FileUtils.recursivelyDeleteFile(_jobLocalPath); _jobLocalPath.mkdirs(); start(); // start the block receiver.... _receiver = new PRValueBlockFileReceiver(1, _jobLocalPath); } void shutdown() { LOG.info("Doing orderly shutdown on receiver"); try { _receiver.shutdown(true); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } LOG.info("Closing Channel"); stop(); } public static void runTest() { EventLoop eventLoop = new EventLoop(); eventLoop.start(); try { // instantiate tester ... LOG.info("Starting Servers"); PRValueBlockWriterAndReceiverTester tester1 = new PRValueBlockWriterAndReceiverTester(eventLoop, 0, 9000); PRValueBlockWriterAndReceiverTester tester2 = new PRValueBlockWriterAndReceiverTester(eventLoop, 1, 9001); PRValueBlockWriterAndReceiverTester tester3 = new PRValueBlockWriterAndReceiverTester(eventLoop, 2, 9002); Vector<InetSocketAddress> addressList = new Vector<InetSocketAddress>(); addressList.add(new InetSocketAddress("127.0.0.1", 9000)); addressList.add(new InetSocketAddress("127.0.0.1", 9001)); addressList.add(new InetSocketAddress("127.0.0.1", 9002)); Configuration conf = new Configuration(); conf.addResource("core-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("mapred-site.xml"); LOG.info("Creating Multiplexer"); // instantiate block writer ... 
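// (constructor args: conf, jobId = 1, iterationNumber = 0, slave address list, myNodeIndex = 0)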
PRValueMultiplexer multiplexer = new PRValueMultiplexer(conf, 1, 0, addressList, 0); URLFPV2 source = URLUtils.getURLFPV2FromURL("http://source.com/"); URLFPV2 dest = URLUtils.getURLFPV2FromURL("http://dest.com/"); LOG.info("Writing Values"); for (int i = 0; i < 10000; ++i) { multiplexer.writePRValue(i % 3, source, dest, 1.0f); } LOG.info("Waiting on Close"); multiplexer.close(false); // shutdown writers LOG.info("Shutting Down Receiver 1"); tester1.shutdown(); LOG.info("Shutting Down Receiver 2"); tester2.shutdown(); LOG.info("Shutting Down Receiver 3"); tester3.shutdown(); eventLoop.stop(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } @Override public void beginPageRank(AsyncContext<BeginPageRankInfo, SlaveStatus> rpcContext) throws RPCException { // TODO Auto-generated method stub } @Override public void checkpoint(AsyncContext<CheckpointInfo, SlaveStatus> rpcContext) throws RPCException { // TODO Auto-generated method stub } @Override public void commitFile(AsyncContext<FileInfo, NullMessage> rpcContext) throws RPCException { LOG.info("TestServer: Recevied commitFile Cmd"); _receiver.commitFile(rpcContext); } @Override public void createJobFile(AsyncContext<FileInfo, FileInfo> rpcContext) throws RPCException { LOG.info("TestServer: Recevied createJobFile Cmd"); _receiver.createJobFile(rpcContext); } @Override public void deleteFile(AsyncContext<FileInfo, NullMessage> rpcContext) throws RPCException { } @Override public void doIteration(AsyncContext<IterationInfo, SlaveStatus> rpcContext) throws RPCException { // TODO Auto-generated method stub } @Override public void endPageRank(AsyncContext<NullMessage, SlaveStatus> rpcContext) throws RPCException { // TODO Auto-generated method stub } @Override public void heartbeat(AsyncContext<NullMessage, SlaveStatus> rpcContext) throws RPCException { // TODO Auto-generated method stub } @Override public void initialize(AsyncContext<BaseConfig, SlaveStatus> rpcContext) throws RPCException { // TODO Auto-generated method stub } @Override public void transferBlock(AsyncContext<BlockTransfer, BlockTransferAck> rpcContext) throws RPCException { LOG.info("TestServer: Recevied transferBlock Cmd"); _receiver.transferBlock(rpcContext); } @Override public void IncomingClientConnected(AsyncClientChannel channel) { LOG.info("TestServer IncomingClient Connected"); } @Override public void IncomingClientDisconnected(AsyncClientChannel channel) { LOG.info("TestServer IncomingClient Disconnected"); } } private static FileSystem buildDistributionOutputStreamVector(boolean useSequenceFile, String fileNamePrefix, File localOutputPath, String remoteOutputPath, int myNodeIndex, int nodeCount, Vector<PRValueOutputStream> outputStreamVector) { Configuration conf = new Configuration(CrawlEnvironment.getHadoopConfig()); conf.setInt("dfs.socket.timeout", 240000); conf.setInt("io.file.buffer.size", 4096 * 20); DistributedFileSystem hdfs = new DistributedFileSystem(); try { hdfs.initialize(FileSystem.getDefaultUri(conf), conf); for (int i = 0; i < nodeCount; ++i) { // create output filename String fileName = fileNamePrefix + "-" + NUMBER_FORMAT.format(i); // create stream (local or remote stream, depending on i) // remote path Path remotePath = new Path(remoteOutputPath, fileName); // remove file CrawlEnvironment.getDefaultFileSystem().delete(remotePath, false); if (useSequenceFile) { // recreate it ... 
outputStreamVector.add(new PRSequenceFileOutputStream(conf, CrawlEnvironment.getDefaultFileSystem(), remotePath)); } else { // recreate it ... outputStreamVector .add(new PROldValueOutputStream(CrawlEnvironment.getDefaultFileSystem(), remotePath)); } } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); for (PRValueOutputStream streamInfo : outputStreamVector) { try { if (streamInfo != null) { streamInfo.close(true); } } catch (IOException e2) { LOG.error(CCStringUtils.stringifyException(e2)); } outputStreamVector.clear(); } } return hdfs; } public static Vector<Path> buildCalculationInputStreamVector(File localOutputPath, String remoteOutputPath, int myNodeIndex, int nodeCount, int iterationNumber) { Vector<Path> vector = new Vector<Path>(); for (int i = 0; i < nodeCount; ++i) { // create output filename String fileName = getOutlinksBaseName(i, iterationNumber) + "-" + NUMBER_FORMAT.format(myNodeIndex); // create stream (local or remote stream, depending on i) // remote path Path remotePath = new Path(remoteOutputPath, fileName); LOG.info("Adding Path:" + remotePath + " For Index:" + i); // vector.add(remotePath); } return vector; } public static class SourceAndRank implements Comparable<SourceAndRank> { SourceAndRank(URLFPV2 fingerprint, float prValue) { source.setDomainHash(fingerprint.getDomainHash()); source.setRootDomainHash(fingerprint.getRootDomainHash()); source.setUrlHash(fingerprint.getUrlHash()); rank = prValue; } URLFPV2 source = new URLFPV2(); float rank; @Override public int compareTo(SourceAndRank o) { return source.compareTo(o.source); } } public static class DomainHashAndPRValue implements Comparable<DomainHashAndPRValue> { public DomainHashAndPRValue(long domainHash, float prValue) { _domainHash = domainHash; _accumulator = prValue; _inputs = 1; } public void updatePRValue(float newPRValue) { _accumulator += newPRValue; _inputs++; } public float averageValue() { return _accumulator / (float) _inputs; } public long _domainHash; public float _accumulator; public int _inputs; @Override public int compareTo(DomainHashAndPRValue o) { return ((Long) _domainHash).compareTo(o._domainHash); } } public static class RootDomain { public RootDomain() { } public HashMap<Long, DomainHashAndPRValue> subDomains = new HashMap<Long, DomainHashAndPRValue>(); } public static class TargetAndSources { URLFPV2 target = new URLFPV2(); HashMap<Long, RootDomain> sources = new HashMap<Long, RootDomain>(); } public static class TargetSourceAndRank { public boolean readFromStream(DataInputStream inputStream) throws IOException { if (inputStream.available() != 0) { target.setDomainHash(inputStream.readLong()); target.setUrlHash(inputStream.readLong()); source.setRootDomainHash(inputStream.readLong()); source.setDomainHash(inputStream.readLong()); source.setUrlHash(inputStream.readLong()); prValue = inputStream.readFloat(); isValid = true; } else { isValid = false; } return isValid; } @Override public String toString() { return "Target DomainHash:" + target.getDomainHash() + " FP:" + target.getUrlHash() + " Source DomainHash:" + source.getDomainHash() + " FP:" + source.getUrlHash(); } boolean isValid = false; URLFPV2 target = new URLFPV2(); URLFPV2 source = new URLFPV2(); float prValue; } static interface PRInputSource { public TargetSourceAndRank next() throws IOException; public TargetSourceAndRank last(); public void close() throws IOException; public long getSize() throws IOException; } static class PRSequenceFileInputSource implements PRInputSource { SequenceFile.Reader 
_reader; public Path _path; public TargetSourceAndRank _currentValue = new TargetSourceAndRank(); DataInputBuffer _inputStream = new DataInputBuffer(); FlexBuffer _buffer = new FlexBuffer(); long _totalLength = 0; public PRSequenceFileInputSource(Configuration conf, FileSystem fs, Path path, SortedPRInputReader reader) throws IOException { _path = path; _reader = new SequenceFile.Reader(fs, path, conf); FileStatus fileStatus = fs.getFileStatus(_path); _totalLength = 0L; if (fileStatus != null) { _totalLength = fileStatus.getLen(); } } @Override public void close() throws IOException { if (_reader != null) { _reader.close(); _reader = null; } } @Override public TargetSourceAndRank last() { return _currentValue; } @Override public TargetSourceAndRank next() throws IOException { _currentValue = null; if (_reader.next(_buffer, NullWritable.get())) { _inputStream.reset(_buffer.get(), _buffer.getCount()); _currentValue = new TargetSourceAndRank(); _currentValue.readFromStream(_inputStream); } return _currentValue; } @Override public long getSize() throws IOException { return _totalLength; } } static class PROldInputSource implements PRInputSource { SortedPRInputReader _reader = null; long _bytesTotal; public PROldInputSource(Path path, SortedPRInputReader reader) throws IOException { _path = path; _istream = CrawlEnvironment.getDefaultFileSystem().open(_path); _bytesTotal = CrawlEnvironment.getDefaultFileSystem().getFileStatus(_path).getLen(); // wrap the stream so that we can monitor progress ... _istream = new FilterInputStream(_istream) { @Override public int read() throws IOException { _reader._totalBytesRead += 1; return this.in.read(); } @Override public int read(byte[] b, int off, int len) throws IOException { int bytesRead = this.in.read(b, off, len); _reader._totalBytesRead += bytesRead; return bytesRead; } @Override public long skip(long n) throws IOException { long bytesSkipped = this.in.skip(n); _reader._totalBytesRead += bytesSkipped; return bytesSkipped; } }; _stream = new DataInputStream(_istream); _reader = reader; } @Override public TargetSourceAndRank next() throws IOException { _currentValue = null; if (_stream != null && _stream.available() != 0) { _currentValue = new TargetSourceAndRank(); // reset bytes read counter _currentValue.readFromStream(_stream); } return _currentValue; } @Override public TargetSourceAndRank last() { return _currentValue; } @Override public void close() throws IOException { if (_istream != null) { _istream.close(); _istream = null; _stream = null; } } public Path _path; public InputStream _istream; public DataInputStream _stream; public TargetSourceAndRank _currentValue = new TargetSourceAndRank(); @Override public long getSize() throws IOException { return _bytesTotal; } } public static class SortedPRInputReader { PRInputSource _inputs[] = null; int _validStreams = 0; long _totalBytesToRead = 0; long _totalBytesRead = 0; public SortedPRInputReader(Configuration conf, FileSystem fs, Vector<Path> streams, boolean useSequenceFile) throws IOException { try { LOG.info("PRInputReader: Allocating Stream Array of Size:" + streams.size()); //ok allocate an array up to stream vector size ... 
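/*
 * Merge strategy (summarizing sortStreams()/readNextTarget() below): each stream is
 * primed with its first record, the stream array is kept sorted by the current
 * target fingerprint, and readNextTarget() takes the smallest target from
 * _inputs[0], drains matching records from every stream into a TargetAndSources
 * aggregate, closes exhausted streams, and re-sorts before the next call.
 */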
_inputs = new PRInputSource[streams.size()]; // now, open streams for (Path streamInfo : streams) { if (!useSequenceFile) { _inputs[_validStreams] = new PROldInputSource(streamInfo, this); } else { _inputs[_validStreams] = new PRSequenceFileInputSource(conf, fs, streamInfo, this); } // advance to first item if (_inputs[_validStreams].next() == null) { LOG.error("PRInputReader: Stream At Index:" + _validStreams + " contains zero entries!"); _inputs[_validStreams].close(); } else { LOG.info("PRInputReader: Stream :" + _validStreams + " First Item:" + _inputs[_validStreams].last().toString()); _totalBytesToRead += _inputs[_validStreams].getSize(); _validStreams++; } } // lastly sort streams sortStreams(); LOG.info("Sorted First Item:" + _inputs[0].last().toString()); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); close(); throw e; } } void close() { for (int i = 0; i < _validStreams; ++i) { try { _inputs[i].close(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } _inputs[i] = null; } _validStreams = 0; } static final int MAX_ROOT_DOMAIN_SOURCES_PER_TARGET = 100000; static final int MAX_SUBDOMAIN_SOURCES_PER_ROOTDOMAIN = 500; static DomainHashAndPRValue addSourceToTarget(TargetAndSources tgtAndSources, TargetSourceAndRank source) { RootDomain rootDomain = tgtAndSources.sources.get(source.source.getRootDomainHash()); if (rootDomain == null) { if (tgtAndSources.sources.size() < MAX_ROOT_DOMAIN_SOURCES_PER_TARGET) { rootDomain = new RootDomain(); tgtAndSources.sources.put(source.source.getRootDomainHash(), rootDomain); } } DomainHashAndPRValue hashAndPRValue = (rootDomain != null) ? rootDomain.subDomains.get(source.source.getDomainHash()) : null; if (hashAndPRValue == null) { hashAndPRValue = new DomainHashAndPRValue(source.source.getDomainHash(), source.prValue); if (rootDomain != null && rootDomain.subDomains.size() < MAX_SUBDOMAIN_SOURCES_PER_ROOTDOMAIN) { rootDomain.subDomains.put(source.source.getDomainHash(), hashAndPRValue); } } else { hashAndPRValue.updatePRValue(source.prValue); } return hashAndPRValue; } void sortStreams() { Arrays.sort(_inputs, 0, _validStreams, new Comparator<PRInputSource>() { @Override public int compare(PRInputSource o1, PRInputSource o2) { return o1.last().target.compareTo(o2.last().target); } }); } // collect next valid target and all related sources TargetAndSources readNextTarget() throws IOException { if (_validStreams != 0) { TargetAndSources target = new TargetAndSources(); target.target.setDomainHash(_inputs[0].last().target.getDomainHash()); target.target.setUrlHash(_inputs[0].last().target.getUrlHash()); //LOG.info("readNextTarget - target is:" + target.target.getDomainHash() + ":" + target.target.getUrlHash()); //LOG.info("readNextTarget - source is:" + _inputs[0].last().source.getDomainHash() + ":" + _inputs[0].last().source.getUrlHash()); DomainHashAndPRValue lastValue = addSourceToTarget(target, _inputs[0].last()); // advance input zero _inputs[0].next(); // ok enter a loop and collect all sources for current target ... for (int streamIdx = 0; streamIdx < _validStreams;) { if (_inputs[streamIdx].last() == null || _inputs[streamIdx].last().target.compareTo(target.target) != 0) { streamIdx++; } else { if (lastValue != null && lastValue._domainHash == _inputs[streamIdx].last().source.getDomainHash()) { lastValue.updatePRValue(_inputs[streamIdx].last().prValue); } else { lastValue = addSourceToTarget(target, _inputs[streamIdx].last()); } // advance current stream ... 
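// streamIdx is only bumped once this stream's current record stops matching the
// target; while it still matches, the record is folded into the accumulator above
// and the same stream is advanced again, so every contribution for the current
// target is drained from all streams before the loop exits.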
_inputs[streamIdx].next(); } } // ok now collect remaining valid streams int newValidStreamCount = 0; for (int currStreamIdx = 0; currStreamIdx < _validStreams; ++currStreamIdx) { if (_inputs[currStreamIdx].last() != null) { _inputs[newValidStreamCount++] = _inputs[currStreamIdx]; } else { // close the stream ... _inputs[currStreamIdx].close(); // null it out ... _inputs[currStreamIdx] = null; } } // resset valid stream count _validStreams = newValidStreamCount; // ok now sort streams ... if (_validStreams != 0) { sortStreams(); } return target; } else { return null; } } } public static class CalculateRankQueueItem { public CalculateRankQueueItem(TargetAndSources next) { _e = null; _next = next; } public CalculateRankQueueItem(IOException e) { _e = e; _next = null; } public CalculateRankQueueItem() { _e = null; _next = null; } public TargetAndSources _next; public IOException _e; } public static void calculateRank(final Configuration conf, final FileSystem fs, final PRValueMap valueMap, final File jobLocalDir, final String jobWorkPath, final int nodeIndex, final int slaveCount, final int iterationNumber, final SuperDomainFilter superDomainFilter, final ProgressAndCancelCheckCallback progressAndCancelCallback) throws IOException { final LinkedBlockingQueue<CalculateRankQueueItem> readAheadQueue = new LinkedBlockingQueue<CalculateRankQueueItem>( 20); // build stream vector ... Vector<Path> streamVector = buildCalculationInputStreamVector(jobLocalDir, jobWorkPath, nodeIndex, slaveCount, iterationNumber); // construct a reader ... final SortedPRInputReader reader = new SortedPRInputReader(conf, fs, streamVector, true); Thread readerThread = new Thread(new Runnable() { @Override public void run() { IOException exceptionOut = null; try { TargetAndSources target = null; while ((target = reader.readNextTarget()) != null) { try { readAheadQueue.put(new CalculateRankQueueItem(target)); } catch (InterruptedException e) { } } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); exceptionOut = e; } finally { if (reader != null) { reader.close(); } } try { readAheadQueue.put(new CalculateRankQueueItem(exceptionOut)); } catch (InterruptedException e1) { } } }); readerThread.start(); int failedUpdates = 0; int totalUpdates = 0; long iterationStart = System.currentTimeMillis(); boolean cancelled = false; while (!cancelled) { CalculateRankQueueItem queueItem = null; try { queueItem = readAheadQueue.take(); } catch (InterruptedException e) { } if (queueItem._next != null) { totalUpdates++; //LOG.info("Target: DomainHash:" + target.target.getDomainHash() + " URLHash:" + target.target.getUrlHash() + " ShardIdx:" + ((target.target.hashCode() & Integer.MAX_VALUE) % CrawlEnvironment.PR_NUMSLAVES)); // now accumulate rank from stream into value map if (!accumulateRank(valueMap, queueItem._next, superDomainFilter)) { failedUpdates++; LOG.error("**TotalUpdates:" + totalUpdates + " Failed Updates:" + failedUpdates); } if ((totalUpdates + failedUpdates) % 10000 == 0) { float percentComplete = (float) reader._totalBytesRead / (float) reader._totalBytesToRead; if (progressAndCancelCallback != null) { cancelled = progressAndCancelCallback.updateProgress(percentComplete); if (cancelled) { LOG.info("Cancel check callback returned true"); } } long timeEnd = System.currentTimeMillis(); int milliseconds = (int) (timeEnd - iterationStart); //LOG.info("Accumulate PR for 10000 Items Took:" + milliseconds + " Milliseconds QueueSize:" + readAheadQueue.size()); iterationStart = System.currentTimeMillis(); } } 
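// A queue item with no payload is the reader thread's termination signal: it either
// carries the IOException that stopped the reader (rethrown below) or marks normal
// end-of-input, in which case the accumulated page rank values are finalized.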
else { if (queueItem._e != null) { LOG.error(CCStringUtils.stringifyException(queueItem._e)); throw queueItem._e; } else { // now finally pagerank value in value map ... valueMap.finalizePageRank(); } break; } } try { readerThread.join(); } catch (InterruptedException e) { } } private static boolean accumulateRank(PRValueMap valueMap, TargetAndSources target, SuperDomainFilter superDomainFilter) throws IOException { float rank = 0.0f; //LOG.info("Accumulating Rank for DomainHash:" + target.target.getDomainHash() + " URLFP:" + target.target.getUrlHash()); for (Map.Entry<Long, RootDomain> entry : target.sources.entrySet()) { // ok first figure out if this is a super domain boolean rootIsSuperDomain = (superDomainFilter != null && superDomainFilter.filterItemByHashIdV2(entry.getKey()) == FilterResult.Filter_Accept); RootDomain rootDomain = entry.getValue(); if (!rootIsSuperDomain) { float accumulator = 0.0f; /* if (rootDomain.subDomains.size() > 1) { LOG.info("Non-Super-Domain:" + entry.getKey() + " has " + rootDomain.subDomains.size() + " subdomains"); } */ int subDomainsIterated = 0; for (DomainHashAndPRValue source : rootDomain.subDomains.values()) { ++subDomainsIterated; /* if (rootDomain.subDomains.size() > 1) { LOG.info("Taking Max Between CurrentValue:" + maxSourceValue + " and current SubDomain:" + source._prValue); } */ accumulator += source.averageValue(); if (subDomainsIterated > 100) break; } if (subDomainsIterated != 0) { rank += accumulator / (float) subDomainsIterated; } } else { /* if (rootDomain.subDomains.size() > 1) { LOG.info("Super-Domain:" + entry.getKey() + " has " + rootDomain.subDomains.size() + " subdomains"); } */ // ok walk items in collection (which are sorted by domain id) for (DomainHashAndPRValue source : rootDomain.subDomains.values()) { /* if (rootDomain.subDomains.size() > 1) { LOG.info("Adding SubDomain:" + source._domainHash + " value:" + source._prValue + " to existing value:" + rank); } */ rank += source.averageValue(); } } } try { // update page rank for item in map valueMap.addPRValue(target.target, rank); return true; } catch (IOException e) { return false; } } private static class OutlinkItem { public OutlinkItem() { targetFingerprint = new URLFPV2(); sourceFingerprint = new URLFPV2(); } public OutlinkItem(IOException e) { error = e; } public URLFPV2 targetFingerprint = null; public URLFPV2 sourceFingerprint = null; public int urlCount = 0; public IOException error = null; } public interface ProgressAndCancelCheckCallback { boolean updateProgress(float percentComplete); } public static void distributeRank(final PRValueMap valueMap, final Path outlinksFile, final boolean outlinksIsRemote, File localOutputDir, String remoteOutputDir, int thisNodeIdx, int nodeCount, int iterationNumber, final ProgressAndCancelCheckCallback progressCallback) throws IOException { final Configuration conf = CrawlEnvironment.getHadoopConfig(); Vector<PRValueOutputStream> outputStreamVector = new Vector<PRValueOutputStream>(); // allocate a queue ... final LinkedBlockingQueue<OutlinkItem> queue = new LinkedBlockingQueue<OutlinkItem>(20000); try { // start the loader thread ... 
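// distributeRank hands work between two threads through the bounded queue above:
// the loader thread below walks the outlinks SequenceFile, decoding each record
// into an OutlinkItem (target fingerprint, source fingerprint, outlink count); an
// item with null fingerprints signals end-of-file, and an item carrying an
// IOException propagates a read failure to the consumer.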
Thread loaderThread = new Thread(new Runnable() { final BytesWritable key = new BytesWritable(); final BytesWritable value = new BytesWritable(); final DataInputBuffer keyStream = new DataInputBuffer(); final DataInputBuffer valueStream = new DataInputBuffer(); @Override public void run() { LOG.info("Opening Outlinks File at:" + outlinksFile); SequenceFile.Reader reader = null; try { FileSystem fsForOutlinksFile = null; if (outlinksIsRemote) { fsForOutlinksFile = CrawlEnvironment.getDefaultFileSystem(); } else { fsForOutlinksFile = FileSystem.getLocal(conf); } FileStatus outlinksFileStatus = fsForOutlinksFile.getFileStatus(outlinksFile); long bytesToReadTotal = (outlinksFileStatus != null) ? outlinksFileStatus.getLen() : 0; reader = new SequenceFile.Reader(fsForOutlinksFile, outlinksFile, conf); OutlinkItem item = new OutlinkItem(); int itemCount = 0; boolean isCancelled = false; while (!isCancelled && reader.next(key, value)) { keyStream.reset(key.getBytes(), 0, key.getLength()); valueStream.reset(value.getBytes(), 0, value.getLength()); //populate item from data readURLFPFromStream(keyStream, item.targetFingerprint); item.urlCount = readURLFPAndCountFromStream(valueStream, item.sourceFingerprint); try { long blockTimeStart = System.currentTimeMillis(); queue.put(item); long blockTimeEnd = System.currentTimeMillis(); } catch (InterruptedException e) { } item = new OutlinkItem(); if (itemCount++ % 10000 == 0 && progressCallback != null) { float percentComplete = (float) reader.getPosition() / (float) bytesToReadTotal; if (progressCallback.updateProgress(percentComplete)) { LOG.info("Cancel check callback returned true.Cancelling outlink item load"); isCancelled = true; } } } item.sourceFingerprint = null; item.targetFingerprint = null; // add empty item try { if (!isCancelled) { queue.put(item); } else { queue.put(new OutlinkItem(new IOException("Operation Cancelled"))); } } catch (InterruptedException e) { } } catch (IOException e) { // add error item to queue. try { queue.put(new OutlinkItem(e)); } catch (InterruptedException e1) { } } finally { if (reader != null) try { reader.close(); } catch (IOException e) { } } } }); loaderThread.start(); // first things first ... initialize output stream vector FileSystem fileSystem = buildDistributionOutputStreamVector(true, getOutlinksBaseName(thisNodeIdx, iterationNumber), localOutputDir, remoteOutputDir, thisNodeIdx, nodeCount, outputStreamVector); try { // open outlinks file . 
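// The consumer side below drains queued items in batches, looks up the source's
// current rank in the value map, splits it across the source's outlinks as
// rank / max(urlCount, 1), and routes each contribution to the output stream for
// the target's node, chosen as (targetFingerprint.hashCode() & Integer.MAX_VALUE) % nodeCount.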
LOG.info("Iterating Items in Outlinks File and Writing Test Value"); int itemCount = 0; int totalOutlinkCount = 0; int iterationOutlinkCount = 0; long iterationStart = System.currentTimeMillis(); long timeStart = iterationStart; boolean done = false; ArrayList<OutlinkItem> items = new ArrayList<OutlinkItem>(); // start iterating outlinks while (!done) { //OutlinkItem item = null; //try { long waitTimeStart = System.currentTimeMillis(); queue.drainTo(items); long waitTimeEnd = System.currentTimeMillis(); //} catch (InterruptedException e) { //} for (OutlinkItem item : items) { if (item.error != null) { LOG.info( "Loader Thread Returned Error:" + CCStringUtils.stringifyException(item.error)); throw item.error; } else if (item.sourceFingerprint == null) { LOG.info("Loader Thread Indicated EOF via emtpy item"); done = true; } else { ++itemCount; /* LOG.info("SourceFP-DomainHash:" + item.sourceFingerprint.getDomainHash() + " URLHash:" + item.sourceFingerprint.getUrlHash() + " PartitionIdx:" + ((item.sourceFingerprint.hashCode() & Integer.MAX_VALUE) % CrawlEnvironment.PR_NUMSLAVES) ); */ // now get pr value for fingerprint (random seek in memory here!!!) float prValue = valueMap.getPRValue(item.sourceFingerprint) / (float) Math.max(item.urlCount, 1); // write value out int nodeIndex = (item.targetFingerprint.hashCode() & Integer.MAX_VALUE) % nodeCount; outputStreamVector.get(nodeIndex).writePRValue(item.targetFingerprint, item.sourceFingerprint, prValue); if (itemCount % 10000 == 0) { long timeEnd = System.currentTimeMillis(); int milliseconds = (int) (timeEnd - iterationStart); LOG.info("Distribute PR for 10000 Items with:" + iterationOutlinkCount + " Outlinks Took:" + milliseconds + " Milliseconds" + " QueueCount:" + queue.size()); iterationStart = System.currentTimeMillis(); totalOutlinkCount += iterationOutlinkCount; iterationOutlinkCount = 0; } } } items.clear(); } totalOutlinkCount += iterationOutlinkCount; LOG.info("Distribute Finished for a total of:" + itemCount + " Items with:" + totalOutlinkCount + " Outlinks Took:" + (System.currentTimeMillis() - timeStart) + " Milliseconds"); LOG.info("Waiting for Loader Thread to Die"); try { loaderThread.join(); } catch (InterruptedException e) { } LOG.info("Loader Thread Died - Moving on..."); } finally { for (PRValueOutputStream info : outputStreamVector) { if (info != null) { info.close(false); } } if (fileSystem != null) { fileSystem.close(); } } } catch (IOException e) { LOG.error("Exception caught while distributing outlinks:" + CCStringUtils.stringifyException(e)); throw e; } } @Test public void testname() throws Exception { int array[] = { 2, 3, 5, 7, 10 }; System.out.println("searching for 1 returned:" + findPos(array, 1)); System.out.println("searching for 2 returned:" + findPos(array, 2)); System.out.println("searching for 11 returned:" + findPos(array, 11)); System.out.println("searching for 8 returned:" + findPos(array, 8)); } public static void main(String[] args) { LOG.info("Initializing Hadoop Config"); Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("core-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("mapred-site.xml"); CrawlEnvironment.setHadoopConfig(conf); CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn02:9000/"); if (args[0].equals("PRValueRW")) { runPRValueReadWriteTest(args); } else if (args[0].equals("IDRead")) { runIDReadBenchmark(args); } else if (args[0].equals("DRank")) { runDistributeRankBenchmark(args); } else if 
(args[0].equals("ARank")) { runAccumulateRankBechmark(args); } else if (args[0].equals("BlockFileRcv")) { LOG.info("Running BlockFileReceiver test"); runBlockFileReceiverTest(); } } private static void runBlockFileReceiverTest() { PRValueBlockWriterAndReceiverTester.runTest(); } private static void runIDReadBenchmark(String[] args) { File idsFile = new File(args[1]); URLFPV2 fingerPrint = new URLFPV2(); LOG.info("Opening ID File at path:" + idsFile.getAbsolutePath()); RandomAccessFile stream = null; try { stream = new RandomAccessFile(idsFile, "r"); long length = stream.length(); int idCount = 0; long totalStartTime = System.currentTimeMillis(); long snapshotTime = System.currentTimeMillis(); boolean error = false; while (!error) { fingerPrint.readFields(stream); ++idCount; if (idCount % 10000 == 0) { LOG.info("Read 10000 ids in:" + (System.currentTimeMillis() - snapshotTime) + " MS"); snapshotTime = System.currentTimeMillis(); } } LOG.info("Completed Reading a Total of:" + idCount + " IDs in:" + (System.currentTimeMillis() - totalStartTime) + " MS"); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } finally { if (stream != null) { try { stream.close(); } catch (IOException e) { } } } } private static void runPRValueReadWriteTest(String[] args) { Configuration conf = CrawlEnvironment.getHadoopConfig(); File valueFile = new File(args[1]); File rangeFile = new File(args[2]); File outlinksFile = new File(args[3]); PRValueMap valueMap = new PRValueMap(); try { valueMap.open(FileSystem.getLocal(conf), new Path(valueFile.getAbsolutePath()), new Path(rangeFile.getAbsolutePath())); // valueMap.dumpRangeItems(); LOG.info("Opening Outlinks File at:" + outlinksFile); SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(conf), new Path(outlinksFile.getPath()), conf); LOG.info("Iterating Items in Outlinks File and Writing Test Value"); URLFPV2 fingerprint = new URLFPV2(); CompressedOutlinkList outlinkList = new CompressedOutlinkList(); int itemCount = 0; long valueWriteStart = System.currentTimeMillis(); long timeStart = valueWriteStart; while (reader.next(fingerprint, outlinkList)) { ++itemCount; // LOG.info("Got Item with Domain Hash:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash()); // get pr value for item ... //TODO: SWITCH TO INT FOR TEST // valueMap.setPRValue(fingerprint,itemCount % Short.MAX_VALUE); valueMap.setPRValue(fingerprint, itemCount % Integer.MAX_VALUE); // LOG.info("Get PRValue returned:" + prValue); fingerprint.clear(); outlinkList.clear(); if (itemCount % 10000 == 0) { LOG.info("Wrote 10000 Items in:" + (System.currentTimeMillis() - timeStart) + " Milliseconds"); timeStart = System.currentTimeMillis(); } } LOG.info("Done Writing Values. Took:" + (System.currentTimeMillis() - valueWriteStart) + " Milliseconds"); valueFile.delete(); OutputStream stream = null; try { stream = new FileOutputStream(valueFile); // flush stuff to disk valueMap.flush(stream); } finally { if (stream != null) stream.close(); } LOG.info("Opening Outlinks File at:" + args[2]); reader = new SequenceFile.Reader(FileSystem.getLocal(conf), new Path(args[2]), conf); LOG.info("Iterating Items in Outlinks File and Reading Test Value"); itemCount = 0; long valueReadStart = System.currentTimeMillis(); timeStart = valueWriteStart; while (reader.next(fingerprint, outlinkList)) { ++itemCount; // LOG.info("Got Item with Domain Hash:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash()); // get pr value for item ... 
float prValue = valueMap.getPRValue(fingerprint); // LOG.info("Get PRValue returned:" + prValue); //TODO: SWITCH TO INT FOR TEST //if (prValue != (itemCount % Short.MAX_VALUE)) { if (prValue != (itemCount % Integer.MAX_VALUE)) { //TODO: SWITCH TO INT FOR TEST // throw new IOException("PRValue did not match for item:" + itemCount + " Expected:" + (itemCount % Short.MAX_VALUE) + " Got:" + prValue); throw new IOException("PRValue did not match for item:" + itemCount + " Expected:" + (itemCount % Integer.MAX_VALUE) + " Got:" + prValue); } fingerprint.clear(); outlinkList.clear(); if (itemCount % 10000 == 0) { LOG.info("Read 10000 Items in:" + (System.currentTimeMillis() - timeStart) + " Milliseconds"); timeStart = System.currentTimeMillis(); } } LOG.info( "Done Reading Values. Took:" + (System.currentTimeMillis() - valueReadStart) + " Milliseconds"); valueMap.close(); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } private static void runAccumulateRankBechmark(String args[]) { LOG.info("Initializing Hadoop Config"); Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("mapred-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("commoncrawl-default.xml"); conf.addResource("commoncrawl-site.xml"); CrawlEnvironment.setHadoopConfig(conf); CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/"); try { Path valueFile = new Path(args[1]); Path rangeFile = new Path(args[2]); Path outlinksFile = new Path(args[3]); //Path outputDir = new Path(args[4]); String remoteOutputDir = args[4]; LOG.info("ValuesFile:" + valueFile); LOG.info("RangeFile:" + rangeFile); LOG.info("OutlinksFile:" + outlinksFile); LOG.info("RemoteOutputDir:" + remoteOutputDir); LOG.info("Initializing SuperDomain Filter"); SuperDomainFilter superDomainFilter = new SuperDomainFilter(); superDomainFilter.loadFromPath( new InetSocketAddress("10.0.20.21", CrawlEnvironment.DIRECTORY_SERVICE_RPC_PORT).getAddress(), CrawlEnvironment.ROOT_SUPER_DOMAIN_PATH, false); LOG.info("Loaded SuperDomain Filter"); int thisNodeIdx = 0; int totalNodeCount = CrawlEnvironment.PR_NUMSLAVES; FileSystem fs = FileSystem.get(conf); PRValueMap valueMap = new PRValueMap(); LOG.info("Initializing Value Map"); valueMap.open(FileSystem.get(conf), valueFile, rangeFile); LOG.info("Initialized Value Map"); LOG.info("Calculating Rank"); long timeStart = System.currentTimeMillis(); calculateRank(conf, fs, valueMap, null, remoteOutputDir, 0, totalNodeCount, 0, superDomainFilter, null); long timeEnd = System.currentTimeMillis(); LOG.info("Done Calculating Rank. 
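// Read-back pass: each fingerprint should come back with exactly the value written
// in the first pass (itemCount % Integer.MAX_VALUE); a mismatch means the
// flush/reload round trip corrupted the value map and triggers the IOException below.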
Took:" + (timeEnd - timeStart)); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } public static final int readURLFPAndCountFromStream(DataInput input, URLFPV2 fpOut) throws IOException { fpOut.setDomainHash(input.readLong()); fpOut.setRootDomainHash(input.readLong()); fpOut.setUrlHash(input.readLong()); return WritableUtils.readVInt(input); } public static final void writeURLFPAndCountToStream(DataOutput stream, URLFPV2 key, int urlCount) throws IOException { stream.writeLong(key.getDomainHash()); stream.writeLong(key.getRootDomainHash()); stream.writeLong(key.getUrlHash()); WritableUtils.writeVInt(stream, urlCount); } public static final void readURLFPFromStream(DataInput input, URLFPV2 fpOut) throws IOException { fpOut.setDomainHash(input.readLong()); fpOut.setRootDomainHash(input.readLong()); fpOut.setUrlHash(input.readLong()); } public static final void writeURLFPToStream(DataOutput stream, URLFPV2 key) throws IOException { stream.writeLong(key.getDomainHash()); stream.writeLong(key.getRootDomainHash()); stream.writeLong(key.getUrlHash()); } private static void runDistributeRankBenchmark(String args[]) { LOG.info("Initializing Hadoop Config"); Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("mapred-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("commoncrawl-default.xml"); conf.addResource("commoncrawl-site.xml"); CrawlEnvironment.setHadoopConfig(conf); CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/"); try { Path valueFile = new Path(args[1]); Path rangeFile = new Path(args[2]); Path outlinksFile = new Path(args[3]); //Path outputDir = new Path(args[4]); String remoteOutputDir = args[4]; LOG.info("ValuesFile:" + valueFile); LOG.info("RangeFile:" + rangeFile); LOG.info("OutlinksFile:" + outlinksFile); LOG.info("RemoteOutputDir:" + remoteOutputDir); int thisNodeIdx = 0; int totalNodeCount = CrawlEnvironment.PR_NUMSLAVES; FileSystem fs = FileSystem.get(conf); PRValueMap valueMap = new PRValueMap(); valueMap.open(FileSystem.get(conf), valueFile, rangeFile); fs.mkdirs(new Path(remoteOutputDir)); fs.delete(new Path(remoteOutputDir, "*"), false); //File localOutputFile = new File(localOutputDir,getOutlinksBaseName(0,0) + "-" + NUMBER_FORMAT.format(0)); //localOutputFile.delete(); distributeRank(valueMap, outlinksFile, true, null, remoteOutputDir, thisNodeIdx, totalNodeCount, 0, null); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } }