com.cyberway.issue.crawler.fetcher.FetchDNS.java Source code

Java tutorial

Introduction

Here is the source code for com.cyberway.issue.crawler.fetcher.FetchDNS.java

Source

/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * FetchDNS
 * Created on Jun 5, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/fetcher/FetchDNS.java,v 1.29.4.1 2007/01/13 01:31:17 stack-sf Exp $
 */
package com.cyberway.issue.crawler.fetcher;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.security.MessageDigest;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import com.cyberway.issue.crawler.datamodel.CoreAttributeConstants;
import com.cyberway.issue.crawler.datamodel.CrawlHost;
import com.cyberway.issue.crawler.datamodel.CrawlURI;
import com.cyberway.issue.crawler.datamodel.FetchStatusCodes;
import com.cyberway.issue.crawler.framework.Processor;
import com.cyberway.issue.crawler.settings.SimpleType;
import com.cyberway.issue.util.ArchiveUtils;
import com.cyberway.issue.util.HttpRecorder;
import com.cyberway.issue.util.InetAddressUtil;
import org.xbill.DNS.ARecord;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
import org.xbill.DNS.Record;
import org.xbill.DNS.ResolverConfig;
import org.xbill.DNS.TextParseException;
import org.xbill.DNS.Type;

/**
 * Processor to resolve 'dns:' URIs.
 * 
 * TODO: Refactor to use com.cyberway.issue.util.DNSJavaUtils.
 *
 * @author multiple
 */
public class FetchDNS extends Processor implements CoreAttributeConstants, FetchStatusCodes {
    private static final long serialVersionUID = 4686199203459704426L;

    private Logger logger = Logger.getLogger(this.getClass().getName());

    // Defaults.
    private short ClassType = DClass.IN;
    private short TypeType = Type.A;
    protected InetAddress serverInetAddr = null;

    private static final String ATTR_ACCEPT_NON_DNS_RESOLVES = "accept-non-dns-resolves";
    private static final Boolean DEFAULT_ACCEPT_NON_DNS_RESOLVES = Boolean.FALSE;
    private static final long DEFAULT_TTL_FOR_NON_DNS_RESOLVES = 6 * 60 * 60; // 6 hrs

    private byte[] reusableBuffer = new byte[1024];

    /** 
     * Create a new instance of FetchDNS.
     *
     * @param name the name of this attribute.
     */
    public FetchDNS(String name) {
        super(name, "DNS Fetcher. Handles DNS lookups.");
        com.cyberway.issue.crawler.settings.Type e = addElementToDefinition(new SimpleType(
                ATTR_ACCEPT_NON_DNS_RESOLVES,
                "If a DNS lookup fails, whether or not to fallback to "
                        + "InetAddress resolution, which may use local 'hosts' files " + "or other mechanisms.",
                DEFAULT_ACCEPT_NON_DNS_RESOLVES));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(FetchHTTP.ATTR_DIGEST_CONTENT,
                "Whether or not to perform an on-the-fly digest hash of" + " retrieved content-bodies.",
                FetchHTTP.DEFAULT_DIGEST_CONTENT));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(FetchHTTP.ATTR_DIGEST_ALGORITHM,
                "Which algorithm (for example " + "MD5 or SHA-1) to use to perform an on-the-fly digest"
                        + " hash of retrieved content-bodies.",
                FetchHTTP.DEFAULT_DIGEST_ALGORITHM, FetchHTTP.DIGEST_ALGORITHMS));
        e.setExpertSetting(true);
    }

    protected void innerProcess(CrawlURI curi) {
        if (!curi.getUURI().getScheme().equals("dns")) {
            // Only handles dns
            return;
        }
        Record[] rrecordSet = null; // Retrieved dns records
        String dnsName = null;
        try {
            dnsName = curi.getUURI().getReferencedHost();
        } catch (URIException e) {
            logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
        }

        if (dnsName == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            return;
        }

        // Make sure we're in "normal operating mode", e.g. a cache +
        // controller exist to assist us.
        CrawlHost targetHost = null;
        if (getController() != null && getController().getServerCache() != null) {
            targetHost = getController().getServerCache().getHostFor(dnsName);
        } else {
            // Standalone operation (mostly for test cases/potential other uses)
            targetHost = new CrawlHost(dnsName);
        }
        if (isQuadAddress(curi, dnsName, targetHost)) {
            // We're done processing.
            return;
        }

        // Do actual DNS lookup.
        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());

        // Try to get the records for this host (assume domain name)
        // TODO: Bug #935119 concerns potential hang here
        try {
            rrecordSet = (new Lookup(dnsName, TypeType, ClassType)).run();
        } catch (TextParseException e) {
            rrecordSet = null;
        }
        curi.setContentType("text/dns");
        if (rrecordSet != null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Found recordset for " + dnsName);
            }
            storeDNSRecord(curi, dnsName, targetHost, rrecordSet);
        } else {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Failed find of recordset for " + dnsName);
            }
            if (((Boolean) getUncheckedAttribute(null, ATTR_ACCEPT_NON_DNS_RESOLVES)).booleanValue()) {
                // Do lookup that bypasses javadns.
                InetAddress address = null;
                try {
                    address = InetAddress.getByName(dnsName);
                } catch (UnknownHostException e1) {
                    address = null;
                }
                if (address != null) {
                    targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
                    curi.setFetchStatus(S_GETBYNAME_SUCCESS);
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Found address for " + dnsName + " using native dns.");
                    }
                } else {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Failed find of address for " + dnsName + " using native dns.");
                    }
                    setUnresolvable(curi, targetHost);
                }
            } else {
                setUnresolvable(curi, targetHost);
            }
        }
        curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
    }

    protected void storeDNSRecord(final CrawlURI curi, final String dnsName, final CrawlHost targetHost,
            final Record[] rrecordSet) {
        // Get TTL and IP info from the first A record (there may be
        // multiple, e.g. www.washington.edu) then update the CrawlServer
        ARecord arecord = getFirstARecord(rrecordSet);
        if (arecord == null) {
            throw new NullPointerException("Got null arecord for " + dnsName);
        }
        targetHost.setIP(arecord.getAddress(), arecord.getTTL());
        try {
            recordDNS(curi, rrecordSet);
            curi.setFetchStatus(S_DNS_SUCCESS);
            curi.putString(A_DNS_SERVER_IP_LABEL, ResolverConfig.getCurrentConfig().server());
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Failed store of DNS Record for " + curi.toString(), e);
            setUnresolvable(curi, targetHost);
        }
    }

    protected boolean isQuadAddress(final CrawlURI curi, final String dnsName, final CrawlHost targetHost) {
        boolean result = false;
        Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
        // If it's an ip no need to do a lookup
        if (matcher == null || !matcher.matches()) {
            return result;
        }

        result = true;
        // Ideally this branch would never be reached: no CrawlURI
        // would be created for numerical IPs
        if (logger.isLoggable(Level.WARNING)) {
            logger.warning("Unnecessary DNS CrawlURI created: " + curi);
        }
        try {
            targetHost.setIP(
                    InetAddress.getByAddress(dnsName,
                            new byte[] { (byte) (new Integer(matcher.group(1)).intValue()),
                                    (byte) (new Integer(matcher.group(2)).intValue()),
                                    (byte) (new Integer(matcher.group(3)).intValue()),
                                    (byte) (new Integer(matcher.group(4)).intValue()) }),
                    CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs
            curi.setFetchStatus(S_DNS_SUCCESS);
        } catch (UnknownHostException e) {
            logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
            setUnresolvable(curi, targetHost);
        }
        return result;
    }

    protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet) throws IOException {
        final byte[] dnsRecord = getDNSRecord(curi.getLong(A_FETCH_BEGAN_TIME), rrecordSet);
        HttpRecorder rec = HttpRecorder.getHttpRecorder();

        // Shall we get a digest on the content downloaded?
        boolean digestContent = ((Boolean) getUncheckedAttribute(curi, FetchHTTP.ATTR_DIGEST_CONTENT))
                .booleanValue();
        String algorithm = null;
        if (digestContent) {
            algorithm = ((String) getUncheckedAttribute(curi, FetchHTTP.ATTR_DIGEST_ALGORITHM));
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            // clear
            rec.getRecordedInput().setDigest((MessageDigest) null);
        }

        curi.setHttpRecorder(rec);
        InputStream is = curi.getHttpRecorder().inputWrap(new ByteArrayInputStream(dnsRecord));
        if (digestContent) {
            rec.getRecordedInput().startDigest();
        }
        // Reading from the wrapped stream, behind the scenes, will write
        // files into scratch space
        try {
            while (is.read(this.reusableBuffer) != -1) {
                continue;
            }
        } finally {
            is.close();
            rec.closeRecorders();
        }
        curi.setContentSize(dnsRecord.length);
        if (digestContent) {
            curi.setContentDigest(algorithm, rec.getRecordedInput().getDigestValue());
        }
    }

    protected byte[] getDNSRecord(final long fetchStart, final Record[] rrecordSet) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Start the record with a 14-digit date per RFC 2540
        byte[] fetchDate = ArchiveUtils.get14DigitDate(fetchStart).getBytes();
        baos.write(fetchDate);
        // Don't forget the newline
        baos.write("\n".getBytes());
        int recordLength = fetchDate.length + 1;
        if (rrecordSet != null) {
            for (int i = 0; i < rrecordSet.length; i++) {
                byte[] record = rrecordSet[i].toString().getBytes();
                recordLength += record.length;
                baos.write(record);
                // Add the newline between records back in
                baos.write("\n".getBytes());
                recordLength += 1;
            }
        }
        return baos.toByteArray();
    }

    protected void setUnresolvable(CrawlURI curi, CrawlHost host) {
        host.setIP(null, 0);
        curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE);
    }

    protected ARecord getFirstARecord(Record[] rrecordSet) {
        ARecord arecord = null;
        if (rrecordSet == null || rrecordSet.length == 0) {
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("rrecordSet is null or zero length: " + rrecordSet);
            }
            return arecord;
        }
        for (int i = 0; i < rrecordSet.length; i++) {
            if (rrecordSet[i].getType() != Type.A) {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.finest(
                            "Record " + Integer.toString(i) + " is not A type but " + rrecordSet[i].getType());
                }
                continue;
            }
            arecord = (ARecord) rrecordSet[i];
            break;
        }
        return arecord;
    }
}