org.apache.nutch.crawl.CrawlDatum.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.crawl.CrawlDatum.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.crawl;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.jexl2.JexlContext;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.MapContext;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VersionMismatchException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.util.StringUtil;

/* The crawl state of a url. */
public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {

    public static final String GENERATE_DIR_NAME = "crawl_generate";
    public static final String FETCH_DIR_NAME = "crawl_fetch";
    public static final String PARSE_DIR_NAME = "crawl_parse";

    private static final byte CUR_VERSION = 7;

    /** Compatibility values for on-the-fly conversion from versions < 5. */
    private static final byte OLD_STATUS_SIGNATURE = 0;
    private static final byte OLD_STATUS_DB_UNFETCHED = 1;
    private static final byte OLD_STATUS_DB_FETCHED = 2;
    private static final byte OLD_STATUS_DB_GONE = 3;
    private static final byte OLD_STATUS_LINKED = 4;
    private static final byte OLD_STATUS_FETCH_SUCCESS = 5;
    private static final byte OLD_STATUS_FETCH_RETRY = 6;
    private static final byte OLD_STATUS_FETCH_GONE = 7;

    private static HashMap<Byte, Byte> oldToNew = new HashMap<>();

    /** Page was not fetched yet. */
    public static final byte STATUS_DB_UNFETCHED = 0x01;
    /** Page was successfully fetched. */
    public static final byte STATUS_DB_FETCHED = 0x02;
    /** Page no longer exists. */
    public static final byte STATUS_DB_GONE = 0x03;
    /** Page temporarily redirects to other page. */
    public static final byte STATUS_DB_REDIR_TEMP = 0x04;
    /** Page permanently redirects to other page. */
    public static final byte STATUS_DB_REDIR_PERM = 0x05;
    /** Page was successfully fetched and found not modified. */
    public static final byte STATUS_DB_NOTMODIFIED = 0x06;
    /** Page was marked as being a duplicate of another page */
    public static final byte STATUS_DB_DUPLICATE = 0x07;
    /** Page was marked as orphan, e.g. has no inlinks anymore */
    public static final byte STATUS_DB_ORPHAN = 0x08;

    /** Maximum value of DB-related status. */
    public static final byte STATUS_DB_MAX = 0x1f;

    /** Fetching was successful. */
    public static final byte STATUS_FETCH_SUCCESS = 0x21;
    /** Fetching unsuccessful, needs to be retried (transient errors). */
    public static final byte STATUS_FETCH_RETRY = 0x22;
    /** Fetching temporarily redirected to other page. */
    public static final byte STATUS_FETCH_REDIR_TEMP = 0x23;
    /** Fetching permanently redirected to other page. */
    public static final byte STATUS_FETCH_REDIR_PERM = 0x24;
    /** Fetching unsuccessful - page is gone. */
    public static final byte STATUS_FETCH_GONE = 0x25;
    /** Fetching successful - page is not modified. */
    public static final byte STATUS_FETCH_NOTMODIFIED = 0x26;

    /** Maximum value of fetch-related status. */
    public static final byte STATUS_FETCH_MAX = 0x3f;

    /** Page signature. */
    public static final byte STATUS_SIGNATURE = 0x41;
    /** Page was newly injected. */
    public static final byte STATUS_INJECTED = 0x42;
    /** Page discovered through a link. */
    public static final byte STATUS_LINKED = 0x43;
    /** Page got metadata from a parser */
    public static final byte STATUS_PARSE_META = 0x44;

    public static final HashMap<Byte, String> statNames = new HashMap<>();
    static {
        statNames.put(STATUS_DB_UNFETCHED, "db_unfetched");
        statNames.put(STATUS_DB_FETCHED, "db_fetched");
        statNames.put(STATUS_DB_GONE, "db_gone");
        statNames.put(STATUS_DB_REDIR_TEMP, "db_redir_temp");
        statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm");
        statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
        statNames.put(STATUS_DB_DUPLICATE, "db_duplicate");
        statNames.put(STATUS_DB_ORPHAN, "db_orphan");
        statNames.put(STATUS_SIGNATURE, "signature");
        statNames.put(STATUS_INJECTED, "injected");
        statNames.put(STATUS_LINKED, "linked");
        statNames.put(STATUS_FETCH_SUCCESS, "fetch_success");
        statNames.put(STATUS_FETCH_RETRY, "fetch_retry");
        statNames.put(STATUS_FETCH_REDIR_TEMP, "fetch_redir_temp");
        statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm");
        statNames.put(STATUS_FETCH_GONE, "fetch_gone");
        statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
        statNames.put(STATUS_PARSE_META, "parse_metadata");

        oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
        oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
        oldToNew.put(OLD_STATUS_DB_GONE, STATUS_DB_GONE);
        oldToNew.put(OLD_STATUS_FETCH_GONE, STATUS_FETCH_GONE);
        oldToNew.put(OLD_STATUS_FETCH_SUCCESS, STATUS_FETCH_SUCCESS);
        oldToNew.put(OLD_STATUS_FETCH_RETRY, STATUS_FETCH_RETRY);
        oldToNew.put(OLD_STATUS_LINKED, STATUS_LINKED);
        oldToNew.put(OLD_STATUS_SIGNATURE, STATUS_SIGNATURE);
    }

    private byte status;
    private long fetchTime = System.currentTimeMillis();
    private byte retries;
    private int fetchInterval;
    private float score = 0.0f;
    private byte[] signature = null;
    private long modifiedTime;
    private org.apache.hadoop.io.MapWritable metaData;

    public static boolean hasDbStatus(CrawlDatum datum) {
        if (datum.status <= STATUS_DB_MAX)
            return true;
        return false;
    }

    public static boolean hasFetchStatus(CrawlDatum datum) {
        if (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX)
            return true;
        return false;
    }

    public CrawlDatum() {
    }

    public CrawlDatum(int status, int fetchInterval) {
        this();
        this.status = (byte) status;
        this.fetchInterval = fetchInterval;
    }

    public CrawlDatum(int status, int fetchInterval, float score) {
        this(status, fetchInterval);
        this.score = score;
    }

    //
    // accessor methods
    //

    public byte getStatus() {
        return status;
    }

    public static String getStatusName(byte value) {
        String res = statNames.get(value);
        if (res == null)
            res = "unknown";
        return res;
    }

    public void setStatus(int status) {
        this.status = (byte) status;
    }

    /**
     * Returns either the time of the last fetch, or the next fetch time,
     * depending on whether Fetcher or CrawlDbReducer set the time.
     */
    public long getFetchTime() {
        return fetchTime;
    }

    /**
     * Sets either the time of the last fetch or the next fetch time, depending on
     * whether Fetcher or CrawlDbReducer set the time.
     */
    public void setFetchTime(long fetchTime) {
        this.fetchTime = fetchTime;
    }

    public long getModifiedTime() {
        return modifiedTime;
    }

    public void setModifiedTime(long modifiedTime) {
        this.modifiedTime = modifiedTime;
    }

    public byte getRetriesSinceFetch() {
        return retries;
    }

    public void setRetriesSinceFetch(int retries) {
        this.retries = (byte) retries;
    }

    public int getFetchInterval() {
        return fetchInterval;
    }

    public void setFetchInterval(int fetchInterval) {
        this.fetchInterval = fetchInterval;
    }

    public void setFetchInterval(float fetchInterval) {
        this.fetchInterval = Math.round(fetchInterval);
    }

    public float getScore() {
        return score;
    }

    public void setScore(float score) {
        this.score = score;
    }

    public byte[] getSignature() {
        return signature;
    }

    public void setSignature(byte[] signature) {
        if (signature != null && signature.length > 256)
            throw new RuntimeException("Max signature length (256) exceeded: " + signature.length);
        this.signature = signature;
    }

    public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
        this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable);
    }

    /**
     * Add all metadata from other CrawlDatum to this CrawlDatum.
     * 
     * @param other
     *          CrawlDatum
     */
    public void putAllMetaData(CrawlDatum other) {
        for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) {
            getMetaData().put(e.getKey(), e.getValue());
        }
    }

    /**
     * returns a MapWritable if it was set or read in @see readFields(DataInput),
     * returns empty map in case CrawlDatum was freshly created (lazily
     * instantiated).
     */
    public org.apache.hadoop.io.MapWritable getMetaData() {
        if (this.metaData == null)
            this.metaData = new org.apache.hadoop.io.MapWritable();
        return this.metaData;
    }

    //
    // writable methods
    //

    public static CrawlDatum read(DataInput in) throws IOException {
        CrawlDatum result = new CrawlDatum();
        result.readFields(in);
        return result;
    }

    public void readFields(DataInput in) throws IOException {
        byte version = in.readByte(); // read version
        if (version > CUR_VERSION) // check version
            throw new VersionMismatchException(CUR_VERSION, version);

        status = in.readByte();
        fetchTime = in.readLong();
        retries = in.readByte();
        if (version > 5) {
            fetchInterval = in.readInt();
        } else
            fetchInterval = Math.round(in.readFloat());
        score = in.readFloat();
        if (version > 2) {
            modifiedTime = in.readLong();
            int cnt = in.readByte();
            if (cnt > 0) {
                signature = new byte[cnt];
                in.readFully(signature);
            } else
                signature = null;
        }

        if (version > 3) {
            boolean hasMetadata = false;
            if (version < 7) {
                org.apache.hadoop.io.MapWritable oldMetaData = new org.apache.hadoop.io.MapWritable();
                if (in.readBoolean()) {
                    hasMetadata = true;
                    metaData = new org.apache.hadoop.io.MapWritable();
                    oldMetaData.readFields(in);
                }
                for (Writable key : oldMetaData.keySet()) {
                    metaData.put(key, oldMetaData.get(key));
                }
            } else {
                if (in.readBoolean()) {
                    hasMetadata = true;
                    metaData = new org.apache.hadoop.io.MapWritable();
                    metaData.readFields(in);
                }
            }
            if (hasMetadata == false)
                metaData = null;
        }
        // translate status codes
        if (version < 5) {
            if (oldToNew.containsKey(status))
                status = oldToNew.get(status);
            else
                status = STATUS_DB_UNFETCHED;

        }
    }

    /** The number of bytes into a CrawlDatum that the score is stored. */
    private static final int SCORE_OFFSET = 15;
    private static final int SIG_OFFSET = SCORE_OFFSET + 12;

    public void write(DataOutput out) throws IOException {
        out.writeByte(CUR_VERSION); // store current version
        out.writeByte(status);
        out.writeLong(fetchTime);
        out.writeByte(retries);
        out.writeInt(fetchInterval);
        out.writeFloat(score);
        out.writeLong(modifiedTime);
        if (signature == null) {
            out.writeByte(0);
        } else {
            out.writeByte(signature.length);
            out.write(signature);
        }
        if (metaData != null && metaData.size() > 0) {
            out.writeBoolean(true);
            metaData.write(out);
        } else {
            out.writeBoolean(false);
        }
    }

    /** Copy the contents of another instance into this instance. */
    public void set(CrawlDatum that) {
        this.status = that.status;
        this.fetchTime = that.fetchTime;
        this.retries = that.retries;
        this.fetchInterval = that.fetchInterval;
        this.score = that.score;
        this.modifiedTime = that.modifiedTime;
        this.signature = that.signature;
        if (that.metaData != null) {
            // make a deep copy
            this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData);
        } else {
            this.metaData = null;
        }
    }

    //
    // compare methods
    //

    /** Sort by decreasing score. */
    public int compareTo(CrawlDatum that) {
        if (that.score != this.score)
            return (that.score - this.score) > 0 ? 1 : -1;
        if (that.status != this.status)
            return this.status - that.status;
        if (that.fetchTime != this.fetchTime)
            return (that.fetchTime - this.fetchTime) > 0 ? 1 : -1;
        if (that.retries != this.retries)
            return that.retries - this.retries;
        if (that.fetchInterval != this.fetchInterval)
            return (that.fetchInterval - this.fetchInterval) > 0 ? 1 : -1;
        if (that.modifiedTime != this.modifiedTime)
            return (that.modifiedTime - this.modifiedTime) > 0 ? 1 : -1;
        return SignatureComparator._compare(this, that);
    }

    /** A Comparator optimized for CrawlDatum. */
    public static class Comparator extends WritableComparator {
        public Comparator() {
            super(CrawlDatum.class);
        }

        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            float score1 = readFloat(b1, s1 + SCORE_OFFSET);
            float score2 = readFloat(b2, s2 + SCORE_OFFSET);
            if (score2 != score1) {
                return (score2 - score1) > 0 ? 1 : -1;
            }
            int status1 = b1[s1 + 1];
            int status2 = b2[s2 + 1];
            if (status2 != status1)
                return status1 - status2;
            long fetchTime1 = readLong(b1, s1 + 2);
            long fetchTime2 = readLong(b2, s2 + 2);
            if (fetchTime2 != fetchTime1)
                return (fetchTime2 - fetchTime1) > 0 ? 1 : -1;
            int retries1 = b1[s1 + 10];
            int retries2 = b2[s2 + 10];
            if (retries2 != retries1)
                return retries2 - retries1;
            int fetchInterval1 = readInt(b1, s1 + 11);
            int fetchInterval2 = readInt(b2, s2 + 11);
            if (fetchInterval2 != fetchInterval1)
                return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1;
            long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4);
            long modifiedTime2 = readLong(b2, s2 + SCORE_OFFSET + 4);
            if (modifiedTime2 != modifiedTime1)
                return (modifiedTime2 - modifiedTime1) > 0 ? 1 : -1;
            int sigl1 = b1[s1 + SIG_OFFSET];
            int sigl2 = b2[s2 + SIG_OFFSET];
            return SignatureComparator._compare(b1, SIG_OFFSET, sigl1, b2, SIG_OFFSET, sigl2);
        }
    }

    static { // register this comparator
        WritableComparator.define(CrawlDatum.class, new Comparator());
    }

    //
    // basic methods
    //

    public String toString() {
        StringBuilder buf = new StringBuilder();
        buf.append("Version: " + CUR_VERSION + "\n");
        buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) + ")\n");
        buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
        buf.append("Modified time: " + new Date(getModifiedTime()) + "\n");
        buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
        buf.append("Retry interval: " + getFetchInterval() + " seconds ("
                + (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n");
        buf.append("Score: " + getScore() + "\n");
        buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
        buf.append("Metadata: \n ");
        if (metaData != null) {
            for (Entry<Writable, Writable> e : metaData.entrySet()) {
                buf.append("\t");
                buf.append(e.getKey());
                buf.append("=");
                buf.append(e.getValue());
                buf.append("\n");
            }
        }
        return buf.toString();
    }

    private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
        if (metaData == null || metaData.size() == 0) {
            return otherMetaData == null || otherMetaData.size() == 0;
        }
        if (otherMetaData == null) {
            // we already know that the current object is not null or empty
            return false;
        }
        HashSet<Entry<Writable, Writable>> set1 = new HashSet<>(metaData.entrySet());
        HashSet<Entry<Writable, Writable>> set2 = new HashSet<>(otherMetaData.entrySet());
        return set1.equals(set2);
    }

    public boolean equals(Object o) {
        if (!(o instanceof CrawlDatum))
            return false;
        CrawlDatum other = (CrawlDatum) o;
        boolean res = (this.status == other.status) && (this.fetchTime == other.fetchTime)
                && (this.modifiedTime == other.modifiedTime) && (this.retries == other.retries)
                && (this.fetchInterval == other.fetchInterval)
                && (SignatureComparator._compare(this.signature, other.signature) == 0)
                && (this.score == other.score);
        if (!res)
            return res;
        return metadataEquals(other.metaData);
    }

    public int hashCode() {
        int res = 0;
        if (signature != null) {
            for (int i = 0; i < signature.length / 4; i += 4) {
                res ^= (signature[i] << 24 + signature[i + 1] << 16 + signature[i + 2] << 8 + signature[i + 3]);
            }
        }
        if (metaData != null) {
            res ^= metaData.entrySet().hashCode();
        }
        return res ^ status ^ ((int) fetchTime) ^ ((int) modifiedTime) ^ retries ^ fetchInterval
                ^ Float.floatToIntBits(score);
    }

    public Object clone() {
        try {
            return super.clone();
        } catch (CloneNotSupportedException e) {
            throw new RuntimeException(e);
        }
    }

    public boolean evaluate(Expression expr, String url) {
        if (expr != null && url != null) {
            // Create a context and add data
            JexlContext jcontext = new MapContext();

            // https://issues.apache.org/jira/browse/NUTCH-2229
            jcontext.set("url", url);
            jcontext.set("status", getStatusName(getStatus()));
            jcontext.set("fetchTime", (long) (getFetchTime()));
            jcontext.set("modifiedTime", (long) (getModifiedTime()));
            jcontext.set("retries", getRetriesSinceFetch());
            jcontext.set("interval", new Integer(getFetchInterval()));
            jcontext.set("score", getScore());
            jcontext.set("signature", StringUtil.toHexString(getSignature()));

            // Set metadata variables
            for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
                Object value = entry.getValue();
                Text tkey = (Text) entry.getKey();

                if (value instanceof FloatWritable) {
                    FloatWritable fvalue = (FloatWritable) value;
                    jcontext.set(tkey.toString(), fvalue.get());
                }

                if (value instanceof IntWritable) {
                    IntWritable ivalue = (IntWritable) value;
                    jcontext.set(tkey.toString(), ivalue.get());
                }

                if (value instanceof Text) {
                    Text tvalue = (Text) value;
                    jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString());
                }

                if (value instanceof ProtocolStatus) {
                    ProtocolStatus pvalue = (ProtocolStatus) value;
                    jcontext.set(tkey.toString().replace("-", "_"), pvalue.toString());
                }

            }

            try {
                if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
                    return true;
                }
            } catch (Exception e) {
                //
            }
        }

        return false;
    }
}