bixo.operations.UrlLengthener.java Source code

Introduction

Here is the source code for bixo.operations.UrlLengthener.java
Source

/*
 * Copyright 2009-2013 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package bixo.operations;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.RejectedExecutionException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import bixo.config.BixoPlatform;
import bixo.config.FetcherPolicy;
import bixo.config.FetcherPolicy.RedirectMode;
import bixo.config.UserAgent;
import bixo.fetcher.BaseFetcher;
import bixo.fetcher.SimpleHttpFetcher;
import bixo.hadoop.FetchCounters;
import bixo.utils.ThreadedExecutor;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryCollector;

import com.scaleunlimited.cascading.LoggingFlowProcess;
import com.scaleunlimited.cascading.LoggingFlowReporter;
import com.scaleunlimited.cascading.NullContext;

/**
 * Cascading function that tries to expand ("lengthen") URLs that come from a
 * known link-shortening service.
 *
 * <p>For each incoming tuple the first argument is read as a URL string. If the
 * URL doesn't match {@link #HOSTNAME_PATTERN}, or its hostname isn't in the set
 * loaded by {@link #loadUrlShorteners()}, the URL is emitted unchanged. Otherwise
 * a ResolveRedirectsTask is queued on a thread pool; that task is handed the
 * output collector, so presumably it emits the resolved URL itself (the task
 * class isn't defined in this file - verify there).</p>
 */
@SuppressWarnings({ "serial", "rawtypes" })
public class UrlLengthener extends BaseOperation<NullContext> implements Function<NullContext> {
    private static final Logger LOGGER = LoggerFactory.getLogger(UrlLengthener.class);

    /** Name of the single output field used when the caller doesn't supply one. */
    public static final String URL_FN = "url";

    // Classpath resource listing the hostnames of known URL shortening services.
    private static final String URL_SHORTENERS_RESOURCE = "/url-shorteners.txt";

    // Seems like things work better when we fetch the content versus aborting the connection
    private static final int MAX_CONTENT_SIZE = 2 * 1024;

    // Lots of sites fail when they get a HEAD request, so avoid many of those by only
    // processing a single redirect (typically from the link shortening service)
    private static final int MAX_REDIRECTS = 1;

    // Timeouts are in milliseconds.
    private static final int REDIRECT_CONNECTION_TIMEOUT = 20 * 1000;
    private static final int REDIRECT_SOCKET_TIMEOUT = 10 * 1000;
    private static final int REDIRECT_RETRY_COUNT = 1;

    // Upper bound on how long a single queued task should take: one connect + one
    // socket timeout for every retry.
    private static final long COMMAND_TIMEOUT = (REDIRECT_CONNECTION_TIMEOUT + REDIRECT_SOCKET_TIMEOUT)
            * REDIRECT_RETRY_COUNT;
    private static final long TERMINATE_TIMEOUT = COMMAND_TIMEOUT * 2;

    // Captures the hostname (group 1) of a plain "http://" URL. The hostname must be
    // at least three characters and ends at the first '/', ':' or '?'.
    private static final Pattern HOSTNAME_PATTERN = Pattern.compile("^http://([^/:?]{3,})");

    private static final Fields DEFAULT_FIELD = new Fields(URL_FN);

    // Serialized state, set once by the constructor.
    private BaseFetcher _fetcher;
    private int _maxThreads;
    private Set<String> _urlShorteners;

    // Runtime-only state: _flowProcess and _executor are created in prepare(),
    // _collector is refreshed on every operate() call.
    private transient LoggingFlowProcess _flowProcess;
    private transient TupleEntryCollector _collector;
    private transient ThreadedExecutor _executor;

    /**
     * Return a SimpleHttpFetcher that's appropriate for lengthening URLs.
     * 
     * @param maxThreads - number of requests to make in parallel. Should be 1 to 100?
     *            Also used as the max connections per host.
     * @param userAgent - what to use when making requests.
     * @return BaseFetcher that can be passed to the UrlLengthener constructor.
     */
    public static BaseFetcher makeFetcher(int maxThreads, UserAgent userAgent) {
        FetcherPolicy policy = new FetcherPolicy();
        policy.setRedirectMode(RedirectMode.FOLLOW_NONE);
        policy.setMaxRedirects(MAX_REDIRECTS);
        policy.setMaxConnectionsPerHost(maxThreads);

        SimpleHttpFetcher result = new SimpleHttpFetcher(maxThreads, policy, userAgent);
        result.setDefaultMaxContentSize(MAX_CONTENT_SIZE);

        // We don't want any encoding (compression) of the data.
        result.setAcceptEncoding("");
        return result;
    }

    /**
     * Create a lengthener that emits its result in the default {@link #URL_FN} field.
     *
     * @param fetcher fetcher to use for resolving redirects (see {@link #makeFetcher}).
     * @throws IOException if the list of URL shorteners can't be loaded.
     */
    public UrlLengthener(BaseFetcher fetcher) throws IOException {
        this(fetcher, DEFAULT_FIELD);
    }

    /**
     * Create a lengthener that emits its result in the given field.
     *
     * @param fetcher fetcher to use for resolving redirects; its max thread count
     *            also sizes the thread pool created in prepare().
     * @param resultField declaration of the output; must contain exactly one field.
     * @throws IllegalArgumentException if resultField doesn't contain exactly one field.
     * @throws IOException if the list of URL shorteners can't be loaded.
     */
    public UrlLengthener(BaseFetcher fetcher, Fields resultField) throws IOException {
        super(resultField);

        if (resultField.size() != 1) {
            throw new IllegalArgumentException("resultField must contain a single field");
        }

        _fetcher = fetcher;
        _maxThreads = fetcher.getMaxThreads();

        _urlShorteners = loadUrlShorteners();
    }

    /**
     * Set up the per-task runtime state: a logging wrapper around the flow process
     * and the thread pool used to run redirect resolution tasks.
     */
    @SuppressWarnings("unchecked")
    @Override
    public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        super.prepare(flowProcess, operationCall);

        _flowProcess = new LoggingFlowProcess(flowProcess);
        _flowProcess.addReporter(new LoggingFlowReporter());

        _executor = new ThreadedExecutor(_maxThreads, COMMAND_TIMEOUT);
    }

    /**
     * Wait up to TERMINATE_TIMEOUT for the thread pool to terminate before
     * delegating to the superclass flush.
     */
    @Override
    public void flush(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        try {
            if (!_executor.terminate(TERMINATE_TIMEOUT)) {
                LOGGER.warn("Had to do a hard shutdown of URL lengthening");
            }
        } catch (InterruptedException e) {
            // FUTURE What's the right thing to do here? E.g. do I need to worry about
            // losing URLs still to be processed?
            LOGGER.warn("Interrupted while waiting for termination");

            // Restore the interrupt flag so code further up the stack can see it.
            Thread.currentThread().interrupt();
        }

        super.flush(flowProcess, operationCall);
    }

    /**
     * Dump the counters collected by the logging flow process, then let the
     * superclass clean up.
     */
    @Override
    public void cleanup(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        _flowProcess.dumpCounters();
        super.cleanup(flowProcess, operationCall);
    }

    /**
     * Process one URL (the first value of the argument tuple). URLs that aren't from
     * a known shortener are emitted as-is; the rest are queued for redirect resolution.
     * If queueing fails the original URL is emitted and the rejected counter is bumped.
     */
    @Override
    public void operate(FlowProcess flowProcess, FunctionCall<NullContext> functionCall) {
        _collector = functionCall.getOutputCollector();

        String url = functionCall.getArguments().getTuple().getString(0);

        // Figure out if this is a URL from a shortener service.
        // If so, then we want to try to lengthen it.
        // If not, see if it looks like shortened URL, and try anyway.

        Matcher m = HOSTNAME_PATTERN.matcher(url);
        if (!m.find()) {
            emitTuple(url);
            return;
        }

        String hostname = m.group(1);
        if (!_urlShorteners.contains(hostname)) {
            // FUTURE - see if this looks like a shortened URL
            emitTuple(url);
            return;
        }

        try {
            ResolveRedirectsTask task = new ResolveRedirectsTask(url, _fetcher, _collector, _flowProcess);
            _executor.execute(task);
        } catch (RejectedExecutionException e) {
            // should never happen.
            LOGGER.error("Redirection handling pool rejected our request for " + url);
            emitRejected(url);
        } catch (Throwable t) {
            LOGGER.error("Caught an unexpected throwable - redirection code rejected our request for " + url, t);
            emitRejected(url);
        }
    }

    /**
     * Load the set of URL shortener hostnames from the /url-shorteners.txt classpath
     * resource (read as UTF-8). Blank lines and lines starting with '#' are skipped,
     * and anything from a '#' to the end of a line is treated as a comment.
     *
     * @return set of hostnames, one per non-comment line of the resource.
     * @throws IOException if the resource is missing or can't be read.
     */
    public static Set<String> loadUrlShorteners() throws IOException {
        // getResourceAsStream() returns null (rather than throwing) when the resource
        // is missing, so turn that into an explicit, descriptive failure.
        InputStream stream = UrlLengthener.class.getResourceAsStream(URL_SHORTENERS_RESOURCE);
        if (stream == null) {
            throw new IOException("Can't find resource on classpath: " + URL_SHORTENERS_RESOURCE);
        }

        List<String> lines;
        try {
            lines = IOUtils.readLines(stream, "UTF-8");
        } finally {
            // readLines() doesn't close the stream for us.
            IOUtils.closeQuietly(stream);
        }

        Set<String> result = new HashSet<String>();
        for (String line : lines) {
            line = line.trim();
            if ((line.length() == 0) || (line.startsWith("#"))) {
                continue;
            }

            // Strip any trailing comment.
            int commentIndex = line.indexOf('#');
            if (commentIndex != -1) {
                line = line.substring(0, commentIndex).trim();
            }

            result.add(line);
        }

        return result;
    }

    // Count the URL as rejected and pass it through unchanged, so that it isn't lost
    // when we can't queue it for redirect resolution.
    private void emitRejected(String url) {
        _flowProcess.increment(FetchCounters.URLS_REJECTED, 1);
        emitTuple(url);
    }

    // Add the URL to the output. The collector is also handed to the background tasks
    // created in operate(), so access to it is synchronized on the collector itself
    // (NOTE(review): presumably ResolveRedirectsTask locks the same way - confirm).
    private void emitTuple(String url) {
        synchronized (_collector) {
            _collector.add(BixoPlatform.clone(new Tuple(url), _flowProcess));
        }
    }

}