org.commoncrawl.util.TaskDataUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.util.TaskDataUtils.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.util.concurrent.Semaphore;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.protocol.CrawlDBService;
import org.commoncrawl.protocol.MapReduceTaskIdAndData;
import org.commoncrawl.protocol.SimpleByteResult;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.rpc.base.internal.AsyncClientChannel;
import org.commoncrawl.rpc.base.internal.AsyncRequest;
import org.commoncrawl.rpc.base.internal.NullMessage;

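/**
 * Helpers for the task data service: storing the service's connection info in
 * a job config before the job is submitted, and constructing a blocking client
 * for that service from within a Map/Reduce task.
 *
 * @author rana
 *
 */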
public class TaskDataUtils {

    private static final Log LOG = LogFactory.getLog(TaskDataUtils.class);

    /** job config key holding the task data server endpoint as "host:port" **/
    private static final String CONNECTION_INFO_KEY = "taskdata.service.connectionInfo";
    /** job config key holding the task data job id **/
    private static final String JOB_ID_KEY = "taskdata.service.jobid";
    /** job config key holding the task attempt id of the running task **/
    private static final String TASK_ATTEMPT_ID_KEY = "mapred.task.id";

    /**
     * Used to set task data connection info into the job config BEFORE submitting the job to the
     * JobTracker.
     *
     * @param jobconf        job config that receives the connection info and job id
     * @param jobId          job id under which task data is stored
     * @param connectionInfo address of the task data server; written as "host:port", using the
     *                       literal IP when the address is resolved and the host name otherwise
     */
    public static final void initializeTaskDataJobConfig(JobConf jobconf, long jobId,
            InetSocketAddress connectionInfo) {
        // getAddress() is null for an unresolved socket address; fall back to the host name
        // instead of failing with a NullPointerException
        InetAddress address = connectionInfo.getAddress();
        String host = (address != null) ? address.getHostAddress() : connectionInfo.getHostName();

        jobconf.set(CONNECTION_INFO_KEY, host + ":" + connectionInfo.getPort());
        jobconf.set(JOB_ID_KEY, Long.toString(jobId));
    }

    /**
     * Reads back the job id stored by
     * {@link #initializeTaskDataJobConfig(JobConf, long, InetSocketAddress)}.
     *
     * @param jobconf job config to read from
     * @return the stored job id, or 0 if none was set
     */
    public static final long getTaskDataJobIdFromJobConfig(JobConf jobconf) {
        return Long.parseLong(jobconf.get(JOB_ID_KEY, "0"));
    }

    /**
     * Used by the Map/Reduce Task to construct a task data client.
     *
     * @param jobconf job config of the running task
     * @return a client connected to the task data server named in the job config
     * @throws IOException if the job config lacks the required settings or the connection fails
     */
    public static final TaskDataClient getTaskDataClientForTask(JobConf jobconf) throws IOException {
        return new TaskDataClient(jobconf);
    }

    /**
     * Blocking client for the task data server. Each public call issues one async request and
     * blocks the calling thread until the request completes or the channel disconnects.
     *
     * Only one blocking call can be in flight at a time (a single pending-call slot is tracked),
     * so an instance must not be used by several threads concurrently.
     */
    public static class TaskDataClient implements AsyncClientChannel.ConnectionCallback {

        EventLoop _eventLoop = null;
        AsyncClientChannel _channel;
        CrawlDBService.AsyncStub _asyncStub;
        /** semaphore of the blocking call currently in flight, or null when no call is pending **/
        volatile Semaphore _blockingCallSemaphore = null;
        /** failure recorded for the most recent blocking call, or null if none **/
        volatile IOException _lastIOException = null;

        /** released by the connection callbacks while the constructor waits for the connect **/
        private volatile Semaphore _connectSemaphore = null;

        private String _connectionData;
        private String _jobId;
        private String _taskId;

        /**
         * Internal constructor used to create a new task data client and establish a connection
         * to the task data server.
         *
         * @param jobConf job config carrying the connection info, job id and task attempt id
         * @throws IOException if required settings are missing or malformed, the host cannot be
         *                     resolved, or the connection cannot be established
         */
        TaskDataClient(JobConf jobConf) throws IOException {
            _connectionData = jobConf.get(CONNECTION_INFO_KEY);
            _jobId = jobConf.get(JOB_ID_KEY);

            if (_connectionData == null || _jobId == null) {
                throw new IOException("No TaskData Server Information in JobConfig!");
            }

            // extract task id ... forName returns null when the attempt id is not set
            TaskAttemptID attemptId = TaskAttemptID.forName(jobConf.get(TASK_ATTEMPT_ID_KEY));
            if (attemptId == null) {
                throw new IOException("No Task Attempt Id in JobConfig!");
            }
            _taskId = Integer.toString(attemptId.getTaskID().getId());

            // parse the connection string BEFORE starting the event loop, so that a bad
            // string or an unknown host cannot leave a running event loop behind
            InetSocketAddress endPoint = parseEndPoint(_connectionData);

            // start event loop ...
            _eventLoop = new EventLoop();
            _eventLoop.start();

            boolean connected = false;
            try {
                LOG.info("Connecting to server at:" + endPoint);
                // publish the connect semaphore before opening the channel so that a
                // callback fired during open() is not missed
                Semaphore connectSignal = new Semaphore(0);
                _connectSemaphore = connectSignal;

                _channel = new AsyncClientChannel(_eventLoop, null, endPoint, this);
                _channel.open();
                _asyncStub = new CrawlDBService.AsyncStub(_channel);
                LOG.info("Waiting on Connect... ");
                connectSignal.acquireUninterruptibly();
                LOG.info("Connect Semaphore Released... ");

                if (!_channel.isOpen()) {
                    throw new IOException("Connection Failed!");
                }
                connected = true;
            } finally {
                _connectSemaphore = null;
                if (!connected) {
                    // do not leak the channel or the event loop on a failed connect
                    releaseResources();
                }
            }
        }

        /**
         * Parses a "host:port" connection string. The split happens at the LAST colon so that
         * IPv6 literals (which contain colons themselves) are accepted as the host part.
         */
        private static InetSocketAddress parseEndPoint(String connectionData) throws IOException {
            int separator = connectionData.lastIndexOf(':');
            if (separator <= 0 || separator == connectionData.length() - 1) {
                throw new IOException("Invalid Connection String!");
            }

            int port;
            try {
                port = Integer.parseInt(connectionData.substring(separator + 1));
            } catch (NumberFormatException e) {
                throw new IOException("Invalid Connection String!", e);
            }
            if (port < 0 || port > 0xFFFF) {
                throw new IOException("Invalid Connection String!");
            }

            // construct socket address
            return new InetSocketAddress(InetAddress.getByName(connectionData.substring(0, separator)),
                    port);
        }

        /** Arms a new blocking call and returns the semaphore its callback must release. */
        private Semaphore beginBlockingCall() throws IOException {
            if (_channel == null) {
                throw new IOException("TaskData Client is Shut Down!");
            }
            Semaphore completion = new Semaphore(0);
            _lastIOException = null;
            _blockingCallSemaphore = completion;
            return completion;
        }

        /**
         * Completes the blocking call that owns the given semaphore. The failure is recorded
         * only while that call is still the pending one, so a late callback from an earlier
         * call can neither fail nor wake up a subsequent call.
         */
        private void completeBlockingCall(Semaphore completion, IOException failure) {
            if (failure != null && completion == _blockingCallSemaphore) {
                _lastIOException = failure;
            }
            //release blocking semaphore
            completion.release();
        }

        /** Blocks until the given call completes, then rethrows any failure recorded for it. */
        private void awaitBlockingCall(Semaphore completion) throws IOException {
            //wait for async request to complete ...
            completion.acquireUninterruptibly();
            _blockingCallSemaphore = null;

            // if async call failed and generated an exception on the remote end ... throw the exception now
            IOException failure = _lastIOException;
            if (failure != null) {
                throw failure;
            }
        }

        /** Closes the channel (if any) and stops the event loop (if any); safe to call twice. */
        private void releaseResources() {
            AsyncClientChannel channel = _channel;
            _channel = null;
            if (channel != null) {
                try {
                    channel.close();
                } catch (IOException e) {
                    LOG.error("Failed to close TaskData Server channel", e);
                }
            }

            EventLoop eventLoop = _eventLoop;
            _eventLoop = null;
            if (eventLoop != null) {
                eventLoop.stop();
            }
        }

        /**
         * Used by the map/reduce task to set task data by key.
         *
         * @param key   data key
         * @param value data value to store under the key
         * @throws IOException if the request fails or the channel disconnects while waiting
         */
        public void updateTaskData(String key, String value) throws IOException {

            final Semaphore completion = beginBlockingCall();

            MapReduceTaskIdAndData taskIdAndData = new MapReduceTaskIdAndData();

            taskIdAndData.setJobId(_jobId);
            taskIdAndData.setTaskId(_taskId);
            taskIdAndData.setDataKey(key);
            taskIdAndData.setDataValue(value);

            _asyncStub.updateMapReduceTaskValue(taskIdAndData,
                    new AsyncRequest.Callback<MapReduceTaskIdAndData, NullMessage>() {

                        @Override
                        public void requestComplete(AsyncRequest<MapReduceTaskIdAndData, NullMessage> request) {
                            IOException failure = null;
                            if (request.getStatus() != AsyncRequest.Status.Success) {
                                LOG.error("updateMapReduceTaskValue Request Failed");
                                failure = new IOException("updateMapReduceTaskValue Request Failed");
                            }
                            completeBlockingCall(completion, failure);
                        }
                    });

            awaitBlockingCall(completion);
        }

        /**
         * Used by the map/reduce task to query for task data by key.
         *
         * @param key data key
         * @return the value reported by the server, or null if the response carried no value
         * @throws IOException if the request fails or the channel disconnects while waiting
         */
        public String queryTaskData(String key) throws IOException {

            final Semaphore completion = beginBlockingCall();

            // serves both as the request and as the holder for the returned value
            final MapReduceTaskIdAndData taskIdAndDataInOut = new MapReduceTaskIdAndData();

            taskIdAndDataInOut.setJobId(_jobId);
            taskIdAndDataInOut.setTaskId(_taskId);
            taskIdAndDataInOut.setDataKey(key);

            _asyncStub.queryMapReduceTaskValue(taskIdAndDataInOut,
                    new AsyncRequest.Callback<MapReduceTaskIdAndData, MapReduceTaskIdAndData>() {

                        @Override
                        public void requestComplete(
                                AsyncRequest<MapReduceTaskIdAndData, MapReduceTaskIdAndData> request) {
                            IOException failure = null;
                            if (request.getStatus() != AsyncRequest.Status.Success) {
                                LOG.error("queryTaskData Request Failed");
                                failure = new IOException("queryTaskData Request Failed");
                            } else if (request.getOutput().isFieldDirty(MapReduceTaskIdAndData.Field_DATAVALUE)) {
                                taskIdAndDataInOut.setDataValue(request.getOutput().getDataValue());
                            }
                            completeBlockingCall(completion, failure);
                        }
                    });

            awaitBlockingCall(completion);

            if (taskIdAndDataInOut.isFieldDirty(MapReduceTaskIdAndData.Field_DATAVALUE)) {
                return taskIdAndDataInOut.getDataValue();
            }
            return null;
        }

        /**
         * Used by the map/reduce task to query for duplicate exclusion status.
         *
         * @param key fingerprint to look up
         * @return the byte result reported by the server
         * @throws IOException if the request fails or the channel disconnects while waiting
         */
        public int queryDuplicateStatus(final URLFPV2 key) throws IOException {

            final Semaphore completion = beginBlockingCall();
            final SimpleByteResult result = new SimpleByteResult();

            _asyncStub.queryDuplicateStatus(key, new AsyncRequest.Callback<URLFPV2, SimpleByteResult>() {

                @Override
                public void requestComplete(AsyncRequest<URLFPV2, SimpleByteResult> request) {
                    IOException failure = null;
                    if (request.getStatus() != AsyncRequest.Status.Success) {
                        LOG.error("queryDuplicateStatus Request Failed");
                        failure = new IOException("queryDuplicateStatus Request Failed");
                    } else {
                        result.setByteResult(request.getOutput().getByteResult());
                    }
                    completeBlockingCall(completion, failure);
                }
            });

            awaitBlockingCall(completion);
            return result.getByteResult();
        }

        /**
         * Used by the map/reduce task to query for fingerprint status.
         *
         * @param key fingerprint to look up
         * @return the byte result reported by the server
         * @throws IOException if the request fails or the channel disconnects while waiting
         */
        public int queryFingerprintStatus(final URLFPV2 key) throws IOException {

            final Semaphore completion = beginBlockingCall();
            final SimpleByteResult result = new SimpleByteResult();

            _asyncStub.queryFingerprintStatus(key, new AsyncRequest.Callback<URLFPV2, SimpleByteResult>() {

                @Override
                public void requestComplete(AsyncRequest<URLFPV2, SimpleByteResult> request) {
                    IOException failure = null;
                    if (request.getStatus() != AsyncRequest.Status.Success) {
                        LOG.error("queryFingerprintStatus Request Failed");
                        failure = new IOException("queryFingerprintStatus Request Failed");
                    } else {
                        result.setByteResult(request.getOutput().getByteResult());
                    }
                    completeBlockingCall(completion, failure);
                }
            });

            awaitBlockingCall(completion);
            return result.getByteResult();
        }

        /**
         * Used by the map/reduce task to purge all data related to the task.
         *
         * @throws IOException if the request fails or the channel disconnects while waiting
         */
        public void purgeTaskData() throws IOException {

            final Semaphore completion = beginBlockingCall();

            MapReduceTaskIdAndData taskIdAndData = new MapReduceTaskIdAndData();

            taskIdAndData.setJobId(_jobId);
            taskIdAndData.setTaskId(_taskId);

            _asyncStub.purgeMapReduceTaskValue(taskIdAndData,
                    new AsyncRequest.Callback<MapReduceTaskIdAndData, NullMessage>() {

                        @Override
                        public void requestComplete(AsyncRequest<MapReduceTaskIdAndData, NullMessage> request) {
                            IOException failure = null;
                            if (request.getStatus() != AsyncRequest.Status.Success) {
                                LOG.error("purgeMapReduceTaskValue Request Failed");
                                failure = new IOException("purgeMapReduceTaskValue Request Failed");
                            }
                            completeBlockingCall(completion, failure);
                        }
                    });

            awaitBlockingCall(completion);
        }

        /** Shutdown: closes the channel and stops the event loop. Calling it again is a no-op. **/
        public void shutdown() {
            releaseResources();
        }

        /** Connection callback: wakes up the constructor if it is waiting for the connect. */
        @Override
        public void OutgoingChannelConnected(AsyncClientChannel channel) {
            LOG.info("OutgoingChannelConnected... ");
            // read the field once; the constructor may clear it concurrently
            Semaphore connectSignal = _connectSemaphore;
            if (connectSignal != null) {
                connectSignal.release();
            }
        }

        /**
         * Connection callback: wakes up the constructor if it is waiting for the connect, and
         * fails the blocking call in flight (if any) so that its caller sees an IOException
         * instead of returning as if the request had succeeded.
         *
         * NOTE(review): the meaning of the returned flag is defined by
         * AsyncClientChannel.ConnectionCallback, which is not visible here; the original
         * value (true) is preserved.
         */
        @Override
        public boolean OutgoingChannelDisconnected(AsyncClientChannel channel) {
            LOG.info("OutgoingChannelDisconnected... ");
            Semaphore connectSignal = _connectSemaphore;
            if (connectSignal != null) {
                connectSignal.release();
            }

            Semaphore pendingCall = _blockingCallSemaphore;
            if (pendingCall != null) {
                completeBlockingCall(pendingCall, new IOException("Connection to TaskData Server Lost!"));
            }
            return true;
        }
    }
}