bixo.fetcher.LoggingFetcher.java Source code

Java tutorial

Introduction

Here is the source code for bixo.fetcher.LoggingFetcher.java

Source

/*
 * Copyright 2009-2013 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package bixo.fetcher;

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;

import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.scaleunlimited.cascading.Payload;

import bixo.config.FetcherPolicy;
import bixo.config.UserAgent;
import bixo.datum.ContentBytes;
import bixo.datum.FetchedDatum;
import bixo.datum.HttpHeaders;
import bixo.datum.ScoredUrlDatum;
import bixo.exceptions.BaseFetchException;
import bixo.exceptions.HttpFetchException;
import bixo.exceptions.UrlFetchException;

/**
 * A {@link BaseFetcher} that performs no network I/O: each request is logged (URL plus payload
 * entries) and answered with a small, synthetic HTML page that embeds the requested URL.
 *
 * <p>Requests whose URL path is {@code /robots.txt} are never answered with content; they fail
 * with an {@link HttpFetchException} carrying {@link HttpStatus#SC_NOT_FOUND}.
 */
@SuppressWarnings("serial")
public class LoggingFetcher extends BaseFetcher {
    private static final Logger LOGGER = LoggerFactory.getLogger(LoggingFetcher.class);

    /** Value of the Content-Location header set on every response produced by this fetcher. */
    public static final String FAKE_CONTENT_LOCATION = "Fake-LoggingFetcher";

    // Generic HTML page we send back for every request - only customization is the URL
    private static final String HTML_TEMPLATE = "<!DOCTYPE HTML PUBLIC \"-//BBSW//DTD Compact HTML 2.0//EN\">\n"
            + "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n"
            + "<title>LoggingFetcher</title>\n" + "</head><body>URL = %s</body></html>\n";

    // URL path for which we always report "not found" instead of returning the fake page.
    private static final String ROBOTS_TXT_PATH = "/robots.txt";

    // Content type reported both in the response headers and on the FetchedDatum.
    private static final String HTML_CONTENT_TYPE = "text/html";

    /**
     * Creates a fetcher with a default {@link FetcherPolicy} and a placeholder {@link UserAgent}.
     *
     * @param maxThreads passed through to the {@link BaseFetcher} constructor
     */
    public LoggingFetcher(int maxThreads) {
        super(maxThreads, new FetcherPolicy(),
                new UserAgent("agentName", "agentName@domain.com", "http://agentName.domain.com"));
    }

    /**
     * Logs the request and returns a fake HTML page for it.
     *
     * @param datum the URL (and payload) being "fetched"
     * @return a datum whose content is the HTML template filled in with the URL, and whose
     *         payload is the one carried by {@code datum}
     * @throws BaseFetchException an {@link UrlFetchException} if the URL cannot be parsed, or an
     *         {@link HttpFetchException} if the URL's path is {@code /robots.txt}
     */
    @Override
    public FetchedDatum get(ScoredUrlDatum datum) throws BaseFetchException {
        String url = datum.getUrl();
        Payload payload = datum.getPayload();
        logPayload(url, payload);

        // Create a simple HTML page here, where we fill in the URL as
        // the only variable field, and return its UTF-8 bytes as the
        // content. We could add more of the datum values to the template
        // if we cared.
        try {
            return makeFetchedDatum(url, String.format(HTML_TEMPLATE, url), payload);
        } catch (UnsupportedEncodingException e) {
            // UTF-8 is a charset every Java platform is required to support.
            throw new RuntimeException("Should never happen", e);
        } catch (MalformedURLException e) {
            // NOTE(review): only the message is propagated; no UrlFetchException constructor
            // accepting a cause is visible from this file.
            throw new UrlFetchException(url, e.getMessage());
        }
    }

    /**
     * Builds the fake fetch result for {@code url}.
     *
     * @param url the requested URL; also used as the fetched URL of the result
     * @param htmlContent the page body, encoded as UTF-8 in the result
     * @param payload payload to attach to the result
     * @return the populated {@link FetchedDatum}
     * @throws MalformedURLException if {@code url} cannot be parsed
     * @throws HttpFetchException (404) if the URL's path is {@code /robots.txt}
     * @throws UnsupportedEncodingException if UTF-8 were unavailable
     */
    private FetchedDatum makeFetchedDatum(String url, String htmlContent, Payload payload)
            throws MalformedURLException, HttpFetchException, UnsupportedEncodingException {
        URL theUrl = new URL(url);

        // Compare the path only: URL.getFile() also includes any query string, so a request
        // such as "/robots.txt?x=1" would otherwise slip through and get the fake HTML page.
        if (ROBOTS_TXT_PATH.equals(theUrl.getPath())) {
            throw new HttpFetchException(url, "Never return robots.txt from LoggingFetcher",
                    HttpStatus.SC_NOT_FOUND, null);
        }

        byte[] content = htmlContent.getBytes("UTF-8");
        HttpHeaders headers = new HttpHeaders();
        headers.add(HttpHeaderNames.CONTENT_LENGTH, String.valueOf(content.length));
        headers.add(HttpHeaderNames.CONTENT_TYPE, HTML_CONTENT_TYPE);

        // Set the location to a fixed value, so that when we're processing entries from
        // the URL DB that might have been set using fake content, we know to ignore the
        // refetch time if we're doing a real fetch.
        headers.add(HttpHeaderNames.CONTENT_LOCATION, FAKE_CONTENT_LOCATION);

        // NOTE(review): the trailing 100000 is presumably a nominal response rate - confirm
        // against the FetchedDatum constructor.
        FetchedDatum result = new FetchedDatum(url, url, System.currentTimeMillis(), headers,
                new ContentBytes(content), HTML_CONTENT_TYPE, 100000);
        result.setPayload(payload);
        return result;
    }

    /**
     * Logs, at INFO level, the URL followed by every {@code key:value} pair in the payload,
     * formatted as {@code url ( k1:v1 k2:v2 )}.
     *
     * @param url the URL being requested
     * @param payload the payload whose entries are logged
     */
    private void logPayload(String url, Payload payload) {
        // Skip building the message entirely when it would be discarded.
        if (!LOGGER.isInfoEnabled()) {
            return;
        }

        StringBuilder msg = new StringBuilder(url);
        msg.append(" ( ");
        for (String key : payload.keySet()) {
            // Appending a null Object yields the text "null".
            Object value = payload.get(key);
            msg.append(key).append(':').append(value).append(' ');
        }
        msg.append(")");

        LOGGER.info(msg.toString());
    }

    /** Nothing to abort. */
    @Override
    public void abort() {
        // Do nothing
    }

}