org.commoncrawl.service.crawler.CrawlTarget.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.service.crawler.CrawlTarget.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.commoncrawl.service.crawler;

import java.io.IOException;
import java.net.InetAddress;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.record.Buffer;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.common.Environment;
import org.commoncrawl.io.NIOBufferList;
import org.commoncrawl.io.NIOHttpConnection;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.CrawlSegmentHost;
import org.commoncrawl.protocol.CrawlSegmentURL;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.server.CommonCrawlServer;
import org.commoncrawl.service.crawler.CrawlTargetHTTPData;
import org.commoncrawl.service.crawler.PersistentCrawlTarget;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.IntrusiveList;
import org.commoncrawl.util.HttpCookieUtils.CookieStore;

/**
 * Encapsulates a single target url together with the state tracked for it
 * while it is being crawled: url / host fingerprints, redirect information,
 * resolved host ip data, retry and redirect counters, and the headers and
 * result code of the most recent http response.
 * 
 * @author rana
 *
 */
public final class CrawlTarget extends IntrusiveList.IntrusiveListElement<CrawlTarget> {

    /** logging **/
    private static final Log LOG = LogFactory.getLog(CrawlTarget.class);

    /** id of the crawl segment this target belongs to **/
    private int _segmentId;
    /** list that currently owns / manages this target **/
    private CrawlList _sourceList;
    /** index of the crawl interface used for this target, or -1 if not set **/
    private byte _crawlInterface = -1;
    /** fingerprint of the original url **/
    private long _urlFP;
    /** the original url **/
    private String _url;
    /** the redirect url (only meaningful when the IsRedirected flag is set) **/
    private String _redirectURL = null;
    /** host fingerprint, or -1 if not set **/
    private long _hostFP = -1;
    /** request start time, or -1 if not set **/
    private long _requestStartTime = -1;
    /** server ip address in integer form (0 if not set) **/
    private int _hostIPAddress = 0;
    /** ttl value associated with the server ip address **/
    private long _hostIPTTL;
    private byte _retryCount = 0;
    private byte _redirectCount = 0;
    /** CrawlURL.Flags bit mask **/
    private byte _flags = 0;
    /** last modified time for this url, or -1 if not set **/
    private long _lastModifiedTime = -1;
    /** etag for this url, or null if not set **/
    private String _etag = null;
    /** crawl directive (json), or null if not set **/
    private String _crawlDirectiveJSON = null;

    // optional crawl completion callback
    private CrawlItemStatusCallback _callback;

    /**
     * simple mutable holder for the headers, result code and server ip details
     * of an http response
     */
    public static class HTTPData {

        /** construct an empty instance (all fields at their defaults) **/
        public HTTPData() {

        }

        /**
         * construct a fully populated instance
         * 
         * @param headers
         *          response headers in string form
         * @param resultCode
         *          http result code
         * @param serverIPAddress
         *          server ip address in integer form
         * @param serverIPTTL
         *          ttl associated with the server ip address
         */
        public HTTPData(String headers, short resultCode, int serverIPAddress, long serverIPTTL) {
            _headers = headers;
            _resultCode = resultCode;
            _serverIP = serverIPAddress;
            _serverIPTTL = serverIPTTL;
        }

        public String _headers;
        public short _resultCode = 0;
        public int _serverIP;
        public long _serverIPTTL;
    }

    /** response data captured via cacheOriginalRequestData (null if none) **/
    private HTTPData _originalRequestData = null;
    /** headers of the most recent successful fetch (null if none) **/
    private String _activeRequestHeaders = null;
    /** result code of the most recent successful fetch (0 if none) **/
    private short _activeRequestResultCode = 0;

    /**
     * construct an empty target bound to a segment and source list
     * 
     * @param segmentId
     *          id of the owning segment
     * @param sourceList
     *          list that manages this target
     */
    public CrawlTarget(int segmentId, CrawlList sourceList) {
        _sourceList = sourceList;
        _segmentId = segmentId;
    }

    /**
     * construct a target from segment host / segment url records
     * 
     * optional values (last modified time, etag, crawl directive) are only
     * copied when the corresponding field is marked dirty in the segment url
     * 
     * @param segmentId
     *          id of the owning segment
     * @param sourceList
     *          list that manages this target
     * @param segmentHost
     *          host record (source of the host fingerprint)
     * @param segmentURL
     *          url record (source of url, fingerprint and optional values)
     */
    public CrawlTarget(int segmentId, CrawlList sourceList, CrawlSegmentHost segmentHost,
            CrawlSegmentURL segmentURL) {
        _sourceList = sourceList;
        _segmentId = segmentId;
        _urlFP = segmentURL.getUrlFP();
        _url = segmentURL.getUrl();
        _hostFP = segmentHost.getHostFP();

        if (segmentURL.isFieldDirty(CrawlSegmentURL.Field_LASTMODIFIEDTIME)) {
            _lastModifiedTime = segmentURL.getLastModifiedTime();
        } else {
            _lastModifiedTime = -1;
        }
        if (segmentURL.isFieldDirty(CrawlSegmentURL.Field_ETAG)) {
            _etag = segmentURL.getEtag();
        } else {
            _etag = null;
        }
        if (segmentURL.isFieldDirty(CrawlSegmentURL.Field_CRAWLDIRECTIVEJSON)) {
            _crawlDirectiveJSON = segmentURL.getCrawlDirectiveJSON();
        }
    }

    /**
     * construct a target from a raw url and fingerprint
     * 
     * @param segmentId
     *          id of the owning segment
     * @param sourceList
     *          list that manages this target
     * @param url
     *          the url to crawl
     * @param fingerprint
     *          the url fingerprint
     * @param callback
     *          optional completion callback
     */
    public CrawlTarget(int segmentId, CrawlList sourceList, String url, long fingerprint,
            CrawlItemStatusCallback callback) {
        _sourceList = sourceList;
        _segmentId = segmentId;
        _url = url;
        _urlFP = fingerprint;
        _callback = callback;
    }

    /**
     * reconstruct a target from its persistent representation
     * 
     * @param sourceList
     *          list that manages this target
     * @param target
     *          the persistent record to restore state from
     */
    public CrawlTarget(CrawlList sourceList, PersistentCrawlTarget target) {
        _sourceList = sourceList;

        _segmentId = target.getSegmentId();
        _urlFP = target.getUrlFP();
        _url = target.getUrl();
        _redirectURL = target.getRedirectURL();
        _hostFP = target.getHostFP();
        _hostIPAddress = target.getHostIPAddress();
        _hostIPTTL = target.getHostIPTTL();
        _retryCount = target.getRetryCount();
        _redirectCount = target.getRedirectCount();
        _flags = target.getFlags();

        // active request details are only restored if they were actually set
        if (target.getActiveRequestData().isFieldDirty(CrawlTargetHTTPData.Field_HEADERS)) {
            _activeRequestHeaders = target.getActiveRequestData().getHeaders();
        }
        if (target.getActiveRequestData().isFieldDirty(CrawlTargetHTTPData.Field_RESULTCODE)) {
            _activeRequestResultCode = (short) target.getActiveRequestData().getResultCode();
        }

        if (target.isFieldDirty(PersistentCrawlTarget.Field_ORIGINALREQUESTDATA)) {
            // restore from the ORIGINAL request record (the counterpart of what
            // createPersistentTarget writes), not from the active request record
            _originalRequestData = new HTTPData();
            _originalRequestData._headers = target.getOriginalRequestData().getHeaders();
            _originalRequestData._resultCode = (short) target.getOriginalRequestData().getResultCode();
            _originalRequestData._serverIP = target.getOriginalRequestData().getServerIP();
            _originalRequestData._serverIPTTL = target.getOriginalRequestData().getServerIPTTL();
        }

        if (target.isFieldDirty(PersistentCrawlTarget.Field_LASTMODIFIEDTIME)) {
            _lastModifiedTime = target.getLastModifiedTime();
        } else {
            _lastModifiedTime = -1;
        }
        if (target.isFieldDirty(PersistentCrawlTarget.Field_ETAG)) {
            _etag = target.getEtag();
        } else {
            _etag = null;
        }
        if (target.isFieldDirty(PersistentCrawlTarget.Field_CRAWLDIRECTIVEJSON)) {
            _crawlDirectiveJSON = target.getCrawlDirectiveJSON();
        } else {
            _crawlDirectiveJSON = null;
        }
    }

    /** internal constructor used by createTestCrawlTarget **/
    private CrawlTarget(CrawlList sourceList) {
        _sourceList = sourceList;
    }

    /**
     * create a target suitable for tests (segment id is fixed at 1)
     * 
     * @param domain
     *          list that manages this target
     * @param url
     *          the url to assign to the target
     * @return the newly created target
     */
    public static CrawlTarget createTestCrawlTarget(CrawlList domain, String url) {
        CrawlTarget target = new CrawlTarget(domain);

        target._segmentId = 1;
        target._url = url;

        return target;
    }

    /**
     * build the persistent representation of this target
     * 
     * optional values are only written when they are set, so the matching
     * constructor can tell them apart via the record's dirty flags
     * 
     * @return a newly populated PersistentCrawlTarget
     */
    public PersistentCrawlTarget createPersistentTarget() {

        PersistentCrawlTarget targetOut = new PersistentCrawlTarget();

        targetOut.setSegmentId(_segmentId);
        targetOut.setUrlFP(_urlFP);
        targetOut.setUrl(_url);
        targetOut.setHostFP(_hostFP);
        targetOut.setHostIPAddress(_hostIPAddress);
        targetOut.setHostIPTTL(_hostIPTTL);
        // a missing redirect url is written out as the empty string
        targetOut.setRedirectURL((_redirectURL == null) ? "" : _redirectURL);
        targetOut.setRetryCount(_retryCount);
        targetOut.setRedirectCount(_redirectCount);
        targetOut.setFlags(_flags);

        if (_activeRequestHeaders != null) {
            targetOut.getActiveRequestData().setHeaders(_activeRequestHeaders);
        }
        if (_activeRequestResultCode != 0) {
            targetOut.getActiveRequestData().setResultCode(_activeRequestResultCode);
        }

        if (_originalRequestData != null) {
            targetOut.getOriginalRequestData().setHeaders(_originalRequestData._headers);
            targetOut.getOriginalRequestData().setResultCode(_originalRequestData._resultCode);
            targetOut.getOriginalRequestData().setServerIP(_originalRequestData._serverIP);
            targetOut.getOriginalRequestData().setServerIPTTL(_originalRequestData._serverIPTTL);
        }

        if (_lastModifiedTime != -1) {
            targetOut.setLastModifiedTime(_lastModifiedTime);
        }
        if (_etag != null) {
            targetOut.setEtag(_etag);
        }
        if (_crawlDirectiveJSON != null) {
            targetOut.setCrawlDirectiveJSON(_crawlDirectiveJSON);
        }

        return targetOut;
    }

    /**
     * set the crawl completion callback
     * 
     * @param callback
     *          the callback object to associate with this target
     */
    public void setCompletionCallback(CrawlItemStatusCallback callback) {
        _callback = callback;
    }

    /**
     * get the completion callback (if specified)
     * 
     * @return callback object
     */
    public CrawlItemStatusCallback getCompletionCallback() {
        return _callback;
    }

    /**
     * get the source list which is managing this crawl target
     * 
     * @return CrawlList object
     */
    public CrawlList getSourceList() {
        return _sourceList;
    }

    /**
     * get the crawl host of the source list
     * 
     * @return the value of getHost() on the source list
     */
    public CrawlListHost getCrawlHost() {
        return _sourceList.getHost();
    }

    /**
     * get cookie store associated with this target
     * 
     * @return the crawl host's cookie store, or null if there is no crawl host
     */
    public CookieStore getCookieStore() {
        CrawlListHost host = getCrawlHost();
        return (host != null) ? host.getCookieStore() : null;
    }

    /**
     * set the source list that owns this target object
     * 
     * @param listObject
     *          the new owning list
     */
    public void setSourceList(CrawlList listObject) {
        _sourceList = listObject;
    }

    /**
     * get the last modified time for this url (if previously set)
     * 
     * @return last modified time if set or -1 if not
     */
    public long getLastModifiedTime() {
        return _lastModifiedTime;
    }

    /**
     * 
     * @return crawl interface associated with this target or -1
     */
    public int getCrawlInterface() {
        return _crawlInterface;
    }

    /**
     * set the crawl interface associated with this target
     * 
     * @param crawlInterface
     *          - the index of the crawl interface to use with this target
     *          (stored as a byte)
     */
    public void setCrawlInterface(int crawlInterface) {
        _crawlInterface = (byte) crawlInterface;
    }

    /**
     * get the etag value for this url (if previously set)
     * 
     * @return etag for given target or null if not set
     */
    public String getETag() {
        return _etag;
    }

    /**
     * get the url fingerprint for this crawl target
     * 
     * @return url fingerprint
     */
    public long getFingerprint() {
        return _urlFP;
    }

    /**
     * get the host fingerprint for this crawl target
     * 
     * @return host fingerprint id
     */
    public long getHostFP() {
        return _hostFP;
    }

    /**
     * set the host fingerprint for this crawl target
     * 
     * @param hostFingerprint
     *          the new host fingerprint
     */
    public void setHostFP(long hostFingerprint) {
        _hostFP = hostFingerprint;
    }

    /**
     * get the result code of the active request
     * 
     * @return result code recorded by the last successful fetch (0 if none)
     */
    public int getResultCode() {
        return _activeRequestResultCode;
    }

    /**
     * retrieve the original request data
     * 
     * @return the cached original request data or null if none was cached
     */
    public HTTPData getOriginalRequestData() {
        return _originalRequestData;
    }

    /**
     * @return true if the IsRedirected flag is set on this target
     */
    public boolean isRedirected() {
        return (_flags & CrawlURL.Flags.IsRedirected) != 0;
    }

    /**
     * get the url currently in effect for this target
     * 
     * @return the redirect url if this target is redirected, the original url
     *         otherwise
     */
    public String getActiveURL() {
        return isRedirected() ? _redirectURL : _url;
    }

    /** @return the original (primary) url **/
    public String getOriginalURL() {
        return _url;
    }

    /**
     * @param url
     *          the new original (primary) url
     */
    public void setOriginalURL(String url) {
        _url = url;
    }

    /** @return the redirect url (may be null) **/
    public String getRedirectURL() {
        return _redirectURL;
    }

    /**
     * @param url
     *          the new redirect url
     */
    public void setRedirectURL(String url) {
        _redirectURL = url;
    }

    /** @return id of the segment this target belongs to **/
    public int getSegmentId() {
        return _segmentId;
    }

    /** @return current retry count **/
    public int getRetryCount() {
        return _retryCount;
    }

    /** @return current redirect count **/
    public int getRedirectCount() {
        return _redirectCount;
    }

    /** increment the redirect counter by one **/
    public void incRedirectCount() {
        _redirectCount++;
    }

    /** @return the CrawlURL.Flags bit mask for this target **/
    public int getFlags() {
        return _flags;
    }

    /**
     * @param flags
     *          the new flags bit mask (stored as a byte)
     */
    public void setFlags(int flags) {
        _flags = (byte) flags;
    }

    /** @return ttl associated with the server ip address **/
    public long getServerIPTTL() {
        return _hostIPTTL;
    }

    /**
     * @param ttl
     *          ttl to associate with the server ip address
     */
    public void setServerIPTTL(long ttl) {
        _hostIPTTL = ttl;
    }

    /** @return server ip address in integer form **/
    public int getServerIP() {
        return _hostIPAddress;
    }

    /**
     * @param ipAddress
     *          server ip address in integer form
     */
    public void setServerIP(int ipAddress) {
        _hostIPAddress = ipAddress;
    }

    /**
     * set request start time
     * 
     * @param time
     *          the request start time
     */
    public void setRequestStartTime(long time) {
        _requestStartTime = time;
    }

    /**
     * get request start time
     * 
     * @return request start time or -1 if not set
     */
    public long getRequestStartTime() {
        return _requestStartTime;
    }

    /** increment the retry counter by one **/
    public void incrementRetryCounter() {
        _retryCount++;
    }

    /**
     * snapshot the connection's response headers, response code and resolved
     * address details as this target's original request data
     * 
     * if the resolved address is unavailable an error is logged and the ip
     * address is recorded as 0
     * 
     * @param connection
     *          the connection to capture response details from
     */
    public void cacheOriginalRequestData(NIOHttpConnection connection) {
        InetAddress address = connection.getResolvedAddress();
        int ipAddress = 0;

        if (address == null) {
            LOG.error("### BUG resolved Adddress is null in cacheOriginalRequest! for Target:"
                    + getOriginalURL());
        } else if (address.getAddress() == null) {
            LOG.error("### BUG resolved Adddress.getAddress returned null in cacheOriginalRequest! for Target:"
                    + getOriginalURL());
        } else {
            ipAddress = IPAddressUtils.IPV4AddressToInteger(address.getAddress());
        }

        _originalRequestData = new CrawlTarget.HTTPData(connection.getResponseHeaders().toString(),
                (short) connection.getHttpResponseCode(), ipAddress, connection.getResolvedAddressTTL());
    }

    /** map a failure reason code to its string form **/
    private static String failureDescFromReason(int failureReason) {
        return CrawlURL.FailureReason.toString(failureReason);
    }

    /**
     * write a failure line to the engine's failure log, or to stdout when no
     * engine is available
     */
    private static void emitFailureLine(final CrawlerEngine engine, String line) {
        if (engine != null) {
            engine.getFailureLog().error(line);
        } else {
            System.out.println(line);
        }
    }

    /**
     * log a fetch failure for this target
     * 
     * the line consists of a timestamp, the crawl interface, the failure
     * reason, the error description, the redirect url (if redirected) and the
     * original url
     * 
     * @param engine
     *          crawler engine (may be null, in which case the line goes to
     *          stdout)
     * @param failureReason
     *          a CrawlURL.FailureReason code
     * @param errorDescription
     *          optional description (null is treated as empty)
     */
    public void logFailure(final CrawlerEngine engine, int failureReason, String errorDescription) {
        StringBuilder sb = new StringBuilder();

        if (errorDescription == null)
            errorDescription = "";

        sb.append(String.format("%1$20.20s ", CCStringUtils.dateStringFromTimeValue(System.currentTimeMillis())));
        // engine is optional (see the stdout fallback), so never dereference it
        // without a null check
        sb.append(String.format("%1$15.15s ",
                (engine != null) ? engine.getCrawlInterfaceGivenIndex(getCrawlInterface()) : null));
        sb.append(String.format("%1$15.15s ", failureDescFromReason(failureReason)));
        sb.append(String.format("%1s ", errorDescription));
        if (isRedirected()) {
            sb.append(getRedirectURL());
            sb.append(" ");
        }
        // always finish with the original url (same layout as logFailureDetail);
        // the active url would merely repeat the redirect url here
        sb.append(getOriginalURL());

        emitFailureLine(engine, sb.toString());
    }

    /**
     * log a fetch failure described by a CrawlURL object
     * 
     * @param engine
     *          crawler engine (may be null, in which case the line goes to
     *          stdout)
     * @param url
     *          url data describing the failed item
     * @param optionalTarget
     *          the related target (may be null)
     * @param failureReason
     *          a CrawlURL.FailureReason code
     * @param errorDescription
     *          optional description (null is treated as empty)
     */
    public static void logFailureDetail(final CrawlerEngine engine, CrawlURL url, CrawlTarget optionalTarget,
            int failureReason, String errorDescription) {

        StringBuilder sb = new StringBuilder();

        if (errorDescription == null)
            errorDescription = "";

        sb.append(String.format("%1$20.20s ", CCStringUtils.dateStringFromTimeValue(System.currentTimeMillis())));
        // both the target and the engine are optional here
        sb.append(String.format("%1$15.15s ",
                (optionalTarget != null && engine != null)
                        ? engine.getCrawlInterfaceGivenIndex(optionalTarget.getCrawlInterface())
                        : null));
        sb.append(String.format("%1$15.15s ", failureDescFromReason(failureReason)));
        sb.append(String.format("%1s ", errorDescription));
        if ((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
            sb.append(url.getRedirectURL());
            sb.append(" ");
        }
        sb.append(url.getUrl());

        emitFailureLine(engine, sb.toString());
    }

    /**
     * record a failed fetch for the given url
     * 
     * the failure is always written to the failure log; for non-robots urls
     * the failure info is also stored in urlData and the engine (if present)
     * is notified via crawlComplete
     * 
     * @param urlData
     *          url data describing the failed item
     * @param optionalTarget
     *          the related target (may be null)
     * @param failureReason
     *          a CrawlURL.FailureReason code
     * @param errorDescription
     *          optional failure detail (may be null)
     */
    public static void failURL(CrawlURL urlData, CrawlTarget optionalTarget, int failureReason,
            String errorDescription) {

        if (Environment.detailLogEnabled())
            LOG.info("Fetch Failed URL:" + urlData.getUrl() + " reason:" + failureReason);

        CrawlerEngine engine = CrawlerServer.getEngine();

        // and log this event to the custom failure log ...
        logFailureDetail(engine, urlData, optionalTarget, failureReason, errorDescription);

        // robots requests are logged only
        if ((urlData.getFlags() & CrawlURL.Flags.IsRobotsURL) != 0) {
            return;
        }

        // add in failure info ...
        urlData.setLastAttemptFailureReason((byte) failureReason);
        if (errorDescription != null) {
            urlData.setLastAttemptFailureDetail(errorDescription);
        }

        // and update segment progress logs ...
        if (engine != null) {
            engine.crawlComplete(null, urlData, optionalTarget, false);
        }
    }

    /**
     * forward a fetch failure for this target to the source list
     * 
     * @param failureReason
     *          a CrawlURL.FailureReason code
     * @param description
     *          failure description
     */
    public void fetchFailed(int failureReason, String description) {
        _sourceList.fetchFailed(this, failureReason, description);
    }

    /**
     * notify the engine and then the source list that a fetch is starting
     * 
     * @param connection
     *          the connection that will service the fetch
     */
    public void fetchStarting(NIOHttpConnection connection) {
        CrawlerServer.getEngine().fetchStarting(this, connection);
        // inform source list of the change ...
        _sourceList.fetchStarting(this, connection);
    }

    /** notify the source list that the fetch has started **/
    public void fetchStarted() {
        _sourceList.fetchStarted(this);
    }

    private final EventLoop getEventLoop() {
        return getServer().getEventLoop();
    }

    private final CommonCrawlServer getServer() {
        return getEngine().getServer();
    }

    private final CrawlerEngine getEngine() {
        return CrawlerServer.getEngine();
    }

    /**
     * extract the redirect location from the response headers
     * 
     * only response codes 300, 301, 302, 303, 305 and 307 are considered; the
     * header is looked up as "Location" first and then as "location"
     * 
     * @return the redirect location or null if none could be extracted
     */
    private String getRedirectLocation(int responseCode, NIOHttpHeaders httpHeaders,
            NIOBufferList nioContentBuffer) {

        String redirectLocation = null;

        switch (responseCode) {

        case 300: // multiple choices
        case 301: // permanent
        case 302: // temporary
        case 303: // redirect after post
        case 305: // use proxy
        case 307: { // temporary redirect

            // attempt to extract location from headers ...
            int key = httpHeaders.getKey("Location");
            if (key == -1) {
                // attempt lowercase version ...
                key = httpHeaders.getKey("location");
            }
            if (key != -1) {
                redirectLocation = httpHeaders.getValue(key);
                if (Environment.detailLogEnabled())
                    LOG.info("Redirect detected for target:" + getOriginalURL() + " .New Location:"
                            + redirectLocation);
            }
        }
            break;

        default:
            break;
        }
        return redirectLocation;
    }

    /**
     * check final http response code against list of acceptable response codes
     * for a successful fetch (2xx, 304 and 4xx)
     * 
     */
    private static boolean isAcceptableSuccessResponseCode(int responseCode) {
        return (responseCode >= 200 && responseCode < 300) || responseCode == 304
                || (responseCode >= 400 && responseCode < 500);
    }

    /**
     * handle a completed fetch
     * 
     * validates the response code, copies the content out of the nio buffer
     * and then either reports the failure via fetchFailed or notifies the
     * source list and the engine of the successful fetch
     * 
     * @param connection
     *          the connection that serviced the fetch
     * @param httpHeaders
     *          the response headers
     * @param nioContentBuffer
     *          the response content (drained and reset on success)
     */
    public void fetchSucceeded(NIOHttpConnection connection, NIOHttpHeaders httpHeaders,
            NIOBufferList nioContentBuffer) {

        boolean failure = false;
        int failureReason = CrawlURL.FailureReason.UNKNOWN;
        Exception failureException = null;
        String failureDescription = "";

        // revalidate ip address here (only for a request that was not redirected) ...
        if (getRedirectCount() == 0) {
            InetAddress address = connection.getResolvedAddress();
            // check to see if ip address got re-resolved ...
            if (address != null) {
                int ipAddress = 0;

                if (address.getAddress() != null) {
                    // if so, update url data information ...
                    ipAddress = IPAddressUtils.IPV4AddressToInteger(address.getAddress());
                } else {
                    LOG.error("### BUG int Address getAddress returned Null for target:" + getActiveURL());
                }

                setServerIP(ipAddress);
                setServerIPTTL(connection.getResolvedAddressTTL());
            }
        }

        Buffer contentBuffer = new Buffer();
        int responseCode = -1;

        try {
            responseCode = NIOHttpConnection.getHttpResponseCode(httpHeaders);

            if (!isAcceptableSuccessResponseCode(responseCode)) {
                failure = true;
                failureReason = CrawlURL.FailureReason.InvalidResponseCode;
                failureDescription = "URL:" + getOriginalURL() + " returned invalid responseCode:" + responseCode;
            }
        } catch (Exception e) {
            failure = true;
            failureReason = CrawlURL.FailureReason.RuntimeError;
            failureException = e;
            failureDescription = "getHTTPResponse Threw:" + StringUtils.stringifyException(e) + " for URL:"
                    + getOriginalURL();
        }

        if (!failure) {
            // populate a conventional buffer object with content data ...
            try {
                // the array is only allocated once the response code has been
                // accepted, so rejected responses do not cost a content-sized
                // allocation
                byte data[] = new byte[nioContentBuffer.available()];
                // read data from nio buffer into byte array
                nioContentBuffer.read(data);
                // and reset source buffer .... (releasing memory )...
                nioContentBuffer.reset();
                // set byte buffer into buffer object ...
                contentBuffer.set(data);

            } catch (IOException e) {

                failure = true;
                failureReason = CrawlURL.FailureReason.IOException;
                failureException = e;
                failureDescription = "Unable to read Content Buffer from successfull Fetch for URL:"
                        + getOriginalURL();
            }
        }

        if (failure) {
            if (failureException != null && Environment.detailLogEnabled()) {
                LOG.error(StringUtils.stringifyException(failureException));
            }
            fetchFailed(failureReason, failureDescription);
            return;
        }

        // populate crawl url data, reusing the response code parsed above
        // rather than parsing the headers a second time outside the try block
        _activeRequestHeaders = httpHeaders.toString();
        _activeRequestResultCode = (short) responseCode;

        // call host ...
        _sourceList.fetchSucceeded(this, connection.getDownloadTime(), httpHeaders, contentBuffer);

        // Add to CrawlLog for both content gets and robots gets
        // create a crawl url object
        CrawlURL urlData = createCrawlURLObject(CrawlURL.CrawlResult.SUCCESS, contentBuffer);
        // set truncation flag if content truncation during download
        if (connection.isContentTruncated()) {
            urlData.setFlags(urlData.getFlags() | CrawlURL.Flags.TruncatedDuringDownload);
        }
        // and update segment progress logs ...
        getEngine().crawlComplete(connection, urlData, this, true);
    }

    /**
     * create a CrawlURL object describing a failed crawl of this target
     * 
     * @param failureReason
     *          a CrawlURL.FailureReason code
     * @param errorDescription
     *          optional failure detail (may be null)
     * @return the populated CrawlURL object
     */
    public CrawlURL createFailureCrawlURLObject(int failureReason, String errorDescription) {
        CrawlURL urlData = createCrawlURLObject(CrawlURL.CrawlResult.FAILURE, null);
        urlData.setLastAttemptFailureReason((byte) failureReason);
        // carry the description along as well (as failURL and
        // allocateCrawlURLForFailure do) instead of silently dropping it
        if (errorDescription != null) {
            urlData.setLastAttemptFailureDetail(errorDescription);
        }
        return urlData;
    }

    /**
     * create a CrawlURL object reflecting the current state of this target
     * 
     * @param result
     *          a CrawlURL.CrawlResult code recorded as the last attempt result
     * @param contentBuffer
     *          content to attach (may be null)
     * @return the populated CrawlURL object
     */
    public CrawlURL createCrawlURLObject(int result, Buffer contentBuffer) {

        // build a crawl url object ...
        CrawlURL crawlURL = new CrawlURL();

        long currentTime = System.currentTimeMillis();

        // original request fingerprint ...
        crawlURL.setFingerprint(getFingerprint());
        // original request url ...
        crawlURL.setUrl(getOriginalURL());
        // original list id
        crawlURL.setListId(_sourceList.getListId());
        // original segment id
        crawlURL.setCrawlSegmentId(getSegmentId());
        // original host fingerprint ...
        crawlURL.setHostFP(getHostFP());

        // latest server ip information
        crawlURL.setServerIP(getServerIP());
        crawlURL.setServerIPTTL(getServerIPTTL());

        if (_originalRequestData != null) {
            // original request data if present ...
            crawlURL.setOriginalResultCode(_originalRequestData._resultCode);
            crawlURL.setOriginalHeaders(_originalRequestData._headers);
            crawlURL.setOriginalServerIP(_originalRequestData._serverIP);
        }

        // set last crawl info ...
        crawlURL.setLastAttemptTime(currentTime);
        // final disposition
        crawlURL.setLastAttemptResult((byte) result);

        // current result details ...
        if (_activeRequestHeaders != null) {
            crawlURL.setHeaders(_activeRequestHeaders);
        }
        if (_activeRequestResultCode != 0) {
            crawlURL.setResultCode(_activeRequestResultCode);
        }

        // current result content ...
        if (contentBuffer != null) {
            crawlURL.setFieldDirty(CrawlURL.Field_CONTENTRAW);
            crawlURL.setContentRaw(contentBuffer);
        }

        // finally, most importantly ... if redirected and the urls differ,
        // carry the redirect over
        if (isRedirected() && !getOriginalURL().equals(getActiveURL())) {
            crawlURL.setFlags(crawlURL.getFlags() | CrawlURL.Flags.IsRedirected);
            crawlURL.setRedirectURL(getActiveURL());
        }

        // if robots, mark it so in the crawlURL object
        if ((getFlags() & CrawlURL.Flags.IsRobotsURL) != 0) {
            crawlURL.setFlags(crawlURL.getFlags() | CrawlURL.Flags.IsRobotsURL);
        }
        if (_crawlDirectiveJSON != null) {
            crawlURL.setCrawlDirectiveJSON(_crawlDirectiveJSON);
        }
        return crawlURL;
    }

    /**
     * create a CrawlURL object from segment host / segment url records
     * 
     * @param segmentId
     *          id of the owning segment
     * @param host
     *          host record (source of list id, host fingerprint and ip info)
     * @param segmentURL
     *          url record (source of url and fingerprint)
     * @param populateIPInfo
     *          if true, the host's ip address and ttl are copied as well
     * @return the populated CrawlURL object
     */
    public static CrawlURL allocateCrawlURLFromSegmentURL(int segmentId, CrawlSegmentHost host,
            CrawlSegmentURL segmentURL, boolean populateIPInfo) {

        // build a crawl url object ...
        CrawlURL crawlURL = new CrawlURL();

        crawlURL.setFingerprint(segmentURL.getUrlFP());
        crawlURL.setUrl(segmentURL.getUrl());
        crawlURL.setCrawlSegmentId(segmentId);
        crawlURL.setListId(host.getListId());
        crawlURL.setHostFP(host.getHostFP());

        if (populateIPInfo) {
            // set the host ip in the crawl target ...
            crawlURL.setServerIP(host.getIpAddress());
            crawlURL.setServerIPTTL(host.getTtl());
        }

        return crawlURL;
    }

    /**
     * create a CrawlURL object describing a failure for the given url
     * 
     * @param url
     *          the url that failed
     * @param fingerprint
     *          the url fingerprint
     * @param failureCode
     *          a CrawlURL.FailureReason code
     * @param failureDetail
     *          failure detail text
     * @return the populated CrawlURL object
     */
    public static CrawlURL allocateCrawlURLForFailure(String url, long fingerprint, int failureCode,
            String failureDetail) {
        // build a crawl url object ...
        CrawlURL crawlURL = new CrawlURL();

        crawlURL.setFingerprint(fingerprint);
        crawlURL.setUrl(url);
        crawlURL.setLastAttemptResult((byte) CrawlURL.CrawlResult.FAILURE);
        crawlURL.setLastAttemptFailureReason((byte) failureCode);
        crawlURL.setLastAttemptFailureDetail(failureDetail);

        return crawlURL;
    }

}