com.salas.bb.whatshot.LinkResolver.java Source code

Introduction

Here is the source code for com.salas.bb.whatshot.LinkResolver.java
Source

// BlogBridge -- RSS feed reader, manager, and web based service
// Copyright (C) 2002-2007 by R. Pito Salas
//
// This program is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free Software Foundation;
// either version 2 of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with this program;
// if not, write to the Free Software Foundation, Inc., 59 Temple Place,
// Suite 330, Boston, MA 02111-1307 USA
//
// Contact: R. Pito Salas
// mailto:pitosalas@users.sourceforge.net
// More information: about BlogBridge
// http://www.blogbridge.com
// http://sourceforge.net/projects/blogbridge
//
// $Id: LinkResolver.java,v 1.13 2007/11/02 12:32:20 spyromus Exp $
//

package com.salas.bb.whatshot;

import EDU.oswego.cs.dl.util.concurrent.Executor;
import com.salas.bb.utils.StringUtils;
import com.salas.bb.utils.concurrency.ExecutorFactory;
import com.salas.bb.utils.concurrency.NamingThreadFactory;
import org.apache.commons.collections.ReferenceMap;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Link resolver contacts servers to learn titles of the links it's given.
 * 
 */
public class LinkResolver {
    /** Default scan limit -- number of bytes to read from the stream looking for the title tag before giving up. */
    private static final int DEFAULT_SCAN_LIMIT = 2000;

    /** The map of patterns to replacement strings for the post-processing of titles. */
    private static Map<Pattern, String> postProcessingInstructions = new LinkedHashMap<Pattern, String>();

    /**
     * The map of patterns to the scan limits. The default limit is 2K which means that 2Kb of a resource
     * will be fetched (max) to find the TITLE tag. You can adjust it with this map.
     */
    private static Map<Pattern, Integer> customScanLimits = new LinkedHashMap<Pattern, Integer>();

    /** The list of custom special link resolvers. */
    private static List<ICustomLinkResolver> customLinkResolvers = new LinkedList<ICustomLinkResolver>();

    /** Cache of resolutions. It's memory-sensitive. */
    private static final ReferenceMap CACHE = new ReferenceMap();

    /**
     * Resolution executor.
     */
    private Executor executor;

    /**
     * Listener for the resolution events.
     */
    private final ILinkResolverListener listener;

    static {
        addPostProcessingInstruction(
                Pattern.compile("^([^:]+):\\s+(.+)\\s+on\\s+technorati", Pattern.CASE_INSENSITIVE),
                "Technorati tag: $1 ($2)");

        addCustomScanLimits(Pattern.compile("^http://(www\\.)?amazon\\.com", Pattern.CASE_INSENSITIVE), 20000);
    }

    /**
     * Creates a link resolver for a given listener.
     *
     * @param listener listner.
     *
     * @throws IllegalArgumentException if listener is <code>NULL</code>.
     */
    public LinkResolver(ILinkResolverListener listener) {
        if (listener == null)
            throw new IllegalArgumentException("Listener can't be NULL");

        this.listener = listener;

        executor = ExecutorFactory
                .createPooledExecutor(new NamingThreadFactory("Link Resolver", Thread.MIN_PRIORITY), 5, 1000);
    }

    /**
     * Stops link resolution immediately.
     */
    public void stop() {
        // Shutdown immediately and don't care about the unprocessed results
        executor = null;
    }

    /**
     * Returns the title of the link in the group or, schedules the
     * resolution and returns the link text.
     *
     * @param group group to resolveURI link for.
     *
     * @return resolved text or link itself.
     */
    public synchronized String resolve(HotResultGroup group) {
        // Check local cache
        String title = getFromCache(group);

        // Schedule the task if not in the cache
        if (title == null) {
            title = group.getName();
            try {
                executor.execute(new ResolutionTask(group));
            } catch (InterruptedException e) {
                // Failed to schedule
                e.printStackTrace();
            }
        }

        return title;
    }

    /**
     * Checks if the title for this group is in the cache.
     *
     * @param group group.
     *
     * @return title.
     */
    private String getFromCache(HotResultGroup group) {
        return (String) CACHE.get(group.getLink().toString());
    }

    /**
     * Performs the post-processing of the title resolved.
     *
     * @param title title.
     *
     * @return processed title.
     */
    static String postprocessTitle(String title) {
        if (title == null || StringUtils.isEmpty(title))
            return title;

        for (Map.Entry<Pattern, String> entry : postProcessingInstructions.entrySet()) {
            title = entry.getKey().matcher(title).replaceAll(entry.getValue());
        }

        return title;
    }

    /**
     * Removes all instructions.
     */
    static void clearPostProcessingInstructions() {
        postProcessingInstructions.clear();
    }

    /**
     * Adds a post-processing instruction to the tail of the instructions list.
     *
     * @param matchPattern  pattern to match in the title.
     * @param replacement   replacement to make.
     */
    public static void addPostProcessingInstruction(Pattern matchPattern, String replacement) {
        postProcessingInstructions.put(matchPattern, replacement);
    }

    /**
     * Checks if a given URL requires some special treatment.
     *
     * @param url link to check.
     *
     * @return title or <code>NULL</code> if to follow usual procedures.
     */
    static String customLinkResolution(URL url) {
        if (url == null)
            return null;

        String title = null;

        for (ICustomLinkResolver resolver : customLinkResolvers) {
            title = resolver.resolve(url);
            if (title != null)
                break;
        }

        return title;
    }

    /**
     * Clears the list of custom link resolvers.
     */
    static void clearCustomLinkResolvers() {
        customLinkResolvers.clear();
    }

    /**
     * Adds a custom link resolver to the end of the list.
     *
     * @param resolver resolver.
     */
    public static void addCustomLinkResolver(ICustomLinkResolver resolver) {
        customLinkResolvers.add(resolver);
    }

    /**
     * Adds new pattern for the URL recognition and the limit for the TITLE tag scanning procedure.
     *
     * @param pattern   parrent.
     * @param limit     scan limit in bytes.
     */
    public static void addCustomScanLimits(Pattern pattern, int limit) {
        customScanLimits.put(pattern, limit);
    }

    /**
     * Returns a scan limit for a link.
     *
     * @param link link.
     *
     * @return limit.
     */
    private static int getScanLimit(URL link) {
        int limit = DEFAULT_SCAN_LIMIT;

        if (link != null) {
            String ls = link.toString();
            Set<Map.Entry<Pattern, Integer>> entries = customScanLimits.entrySet();
            for (Map.Entry<Pattern, Integer> entry : entries) {
                if (entry.getKey().matcher(ls).find())
                    return entry.getValue();
            }
        }

        return limit;
    }

    /**
     * Fetches the title from the stream until finds the '&lt;' or
     * the end.
     *
     * @param is    input stream.
     *
     * @return title.
     *
     * @throws IOException in case of I/O error.
     */
    String fetchTitle(InputStream is) throws IOException {
        int ch;

        // Found the title tag and the text
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        while ((ch = is.read()) != -1 && ch != '<') {
            buf.write(ch);
            if (isTerminated())
                return null;
        }

        return buf.toString().trim();
    }

    /**
     * Returns <code>TRUE</code> if executor is no longer workin.
     *
     * @return <code>TRUE</code> if terminated.
     */
    private boolean isTerminated() {
        return executor == null;
    }

    /**
     * Resolves a single link into the title.
     */
    private class ResolutionTask implements Runnable {
        private final HotResultGroup group;
        private String tag = "<title>";
        private int pos = 0;

        /**
         * Creates a resolver task.
         *
         * @param group group to resolveURI the title for.
         */
        public ResolutionTask(HotResultGroup group) {
            this.group = group;
        }

        /**
         * Main task method.
         */
        public void run() {
            if (isTerminated())
                return;

            try {
                if (resolve(group))
                    listener.onGroupResolved(group);
            } catch (IOException e) {
                // Fall through
            }
        }

        /**
         * Invoked to resolveURI the hotlink into the title for this group item.
         *
         * @param group group to resolveURI.
         *
         * @return <code>TRUE</code> if the title was resolved and changed.
         *
         * @throws IOException in case of any I/O errors.
         */
        private boolean resolve(HotResultGroup group) throws IOException {
            String title;

            // Don't resolveURI invisible groups
            // When they become visible, they will be resolved
            if (!group.isVisible())
                return false;

            // Check if the link needs some special treatment.
            URL url = group.getLink();
            title = customLinkResolution(url);

            if (title == null) {
                URLConnection con = url.openConnection();
                String contentType = con.getContentType();
                InputStream is = null;

                int max = getScanLimit(url);

                try {
                    // Content type
                    if (contentType != null && contentType.startsWith("text/html")) {
                        is = new BufferedInputStream(con.getInputStream());

                        title = resolveFromStream(is, max);
                    }

                    // Sets the title of the page
                    if (title != null)
                        title = StringUtils.unescape(title);
                } finally {
                    if (is != null)
                        is.close();
                }
            }

            // Process the title to replace some parts or do any other post-processing
            if (title != null)
                title = postprocessTitle(title);
            if (StringUtils.isEmpty(title))
                title = "[Unresolved] " + url.toString();

            // Remember the resolution in the cache
            CACHE.put(url.toString(), title);
            group.setResolvedTitle(title);

            return true;
        }

        /**
         * Resolves a title from stream.
         *
         * @param is    stream.
         * @param max   maximum characters to load.
         *
         * @return title or <code>NULL</code> if not found.
         *
         * @throws IOException if I/O error happens.
         */
        String resolveFromStream(InputStream is, int max) throws IOException {
            String title = null;
            int i = 0;
            int b;
            while (title == null && !isTerminated() && i++ < max && (b = is.read()) != -1)
                title = resolveChar(b, is);

            return title;
        }

        /**
         * Resolves a character and moves on. Returns a title if recognized.
         *
         * @param b     byte from the stream.
         * @param is    input stream.
         *
         * @return title.
         *
         * @throws IOException if I/O exception happens.
         */
        String resolveChar(int b, InputStream is) throws IOException {
            if (pos < tag.length()) {
                // Skip whitespace
                if (b == ' ' || b == '\n' || b == '\r' || b == '\t')
                    return null;
                // Lowercase (but not < or >)
                if (b != '<' && b != '>' && b < 'a')
                    b += ' ';

                // Check against the pattern
                char ch = tag.charAt(pos);
                if (ch != b)
                    pos = 0;
                if (ch == b)
                    pos++;
            }

            return (pos == tag.length()) ? fetchTitle(is) : null;
        }
    }
}