org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer.java Source code

Introduction

Here is the source code for org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.net.urlnormalizer.regex;

import java.net.URL;
import java.net.MalformedURLException;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;

import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;

import javax.xml.parsers.*;
import org.w3c.dom.*;
import org.apache.oro.text.regex.*;

/**
 * Allows users to do regex substitutions on all/any URLs that are encountered,
 * which is useful for stripping session IDs from URLs.
 * 
 * <p>This class uses the <tt>urlnormalizer.regex.file</tt> property.
 * It should be set to the file name of an xml file which should contain the
 * patterns and substitutions to be done on encountered URLs.
 * </p>
 * <p>This class also supports different rules depending on the scope. Please see
 * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details.</p>
 * 
 * @author Luke Baker
 * @author Andrzej Bialecki
 */
public class RegexURLNormalizer extends Configured implements URLNormalizer {

    private static final Log LOG = LogFactory.getLog(RegexURLNormalizer.class);

    /**
     * Class which holds a compiled pattern and its corresponding substition
     * string.
     */
    private static class Rule {
        public Perl5Pattern pattern;

        public String substitution;
    }

    private HashMap scopedRules;

    private static final List EMPTY_RULES = Collections.EMPTY_LIST;

    private PatternMatcher matcher = new Perl5Matcher();

    /**
     * The default constructor which is called from UrlNormalizerFactory
     * (normalizerClass.newInstance()) in method: getNormalizer()*
     */
    public RegexURLNormalizer() {
        super(null);
    }

    public RegexURLNormalizer(Configuration conf) {
        super(conf);
    }

    /**
     * Constructor which can be passed the file name, so it doesn't look in the
     * configuration files for it.
     */
    public RegexURLNormalizer(Configuration conf, String filename) throws IOException, MalformedPatternException {
        super(conf);
        List rules = readConfigurationFile(filename);
        if (rules != null)
            scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules);
    }

    public void setConf(Configuration conf) {
        super.setConf(conf);
        if (conf == null)
            return;
        // the default constructor was called
        if (this.scopedRules == null) {
            String filename = getConf().get("urlnormalizer.regex.file");
            scopedRules = new HashMap();
            URL url = getConf().getResource(filename);
            List rules = null;
            if (url == null) {
                LOG.warn("Can't load the default config file! " + filename);
                rules = EMPTY_RULES;
            } else {
                try {
                    rules = readConfiguration(url.openStream());
                } catch (Exception e) {
                    LOG.warn("Couldn't read default config from '" + url + "': " + e);
                    rules = EMPTY_RULES;
                }
            }
            scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules);
        }
    }

    // used in JUnit test.
    void setConfiguration(InputStream is, String scope) {
        List rules = readConfiguration(is);
        scopedRules.put(scope, rules);
        LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules.");
    }

    /**
     * This function does the replacements by iterating through all the regex
     * patterns. It accepts a string url as input and returns the altered string.
     */
    public synchronized String regexNormalize(String urlString, String scope) {
        List curRules = (List) scopedRules.get(scope);
        if (curRules == null) {
            // try to populate
            String configFile = getConf().get("urlnormalizer.regex.file." + scope);
            if (configFile != null) {
                URL resource = getConf().getResource(configFile);
                LOG.debug("resource for scope '" + scope + "': " + resource);
                if (resource == null) {
                    LOG.warn("Can't load resource for config file: " + configFile);
                } else {
                    try {
                        InputStream is = resource.openStream();
                        curRules = readConfiguration(resource.openStream());
                        scopedRules.put(scope, curRules);
                    } catch (Exception e) {
                        LOG.warn("Couldn't load resource '" + resource + "': " + e);
                    }
                }
            }
            if (curRules == EMPTY_RULES || curRules == null) {
                LOG.warn("can't find rules for scope '" + scope + "', using default");
                scopedRules.put(scope, EMPTY_RULES);
            }
        }
        if (curRules == EMPTY_RULES || curRules == null) {
            // use global rules
            curRules = (List) scopedRules.get(URLNormalizers.SCOPE_DEFAULT);
        }
        Iterator i = curRules.iterator();
        while (i.hasNext()) {
            Rule r = (Rule) i.next();
            urlString = Util.substitute(matcher, r.pattern, new Perl5Substitution(r.substitution), urlString,
                    Util.SUBSTITUTE_ALL); // actual
                                                                                                                                            // substitution
        }
        return urlString;
    }

    public synchronized String normalize(String urlString, String scope) throws MalformedURLException {
        return regexNormalize(urlString, scope);
    }

    /** Reads the configuration file and populates a List of Rules. */
    private List readConfigurationFile(String filename) {
        if (LOG.isInfoEnabled()) {
            LOG.info("loading " + filename);
        }
        try {
            FileInputStream fis = new FileInputStream(filename);
            return readConfiguration(fis);
        } catch (Exception e) {
            LOG.fatal("Error loading rules from '" + filename + "': " + e);
            return EMPTY_RULES;
        }
    }

    private List readConfiguration(InputStream is) {
        Perl5Compiler compiler = new Perl5Compiler();
        List rules = new ArrayList();
        try {

            // borrowed heavily from code in Configuration.java
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(is);
            Element root = doc.getDocumentElement();
            if ((!"regex-normalize".equals(root.getTagName())) && (LOG.isFatalEnabled())) {
                LOG.fatal("bad conf file: top-level element not <regex-normalize>");
            }
            NodeList regexes = root.getChildNodes();
            for (int i = 0; i < regexes.getLength(); i++) {
                Node regexNode = regexes.item(i);
                if (!(regexNode instanceof Element))
                    continue;
                Element regex = (Element) regexNode;
                if ((!"regex".equals(regex.getTagName())) && (LOG.isWarnEnabled())) {
                    LOG.warn("bad conf file: element not <regex>");
                }
                NodeList fields = regex.getChildNodes();
                String patternValue = null;
                String subValue = null;
                for (int j = 0; j < fields.getLength(); j++) {
                    Node fieldNode = fields.item(j);
                    if (!(fieldNode instanceof Element))
                        continue;
                    Element field = (Element) fieldNode;
                    if ("pattern".equals(field.getTagName()) && field.hasChildNodes())
                        patternValue = ((Text) field.getFirstChild()).getData();
                    if ("substitution".equals(field.getTagName()) && field.hasChildNodes())
                        subValue = ((Text) field.getFirstChild()).getData();
                    if (!field.hasChildNodes())
                        subValue = "";
                }
                if (patternValue != null && subValue != null) {
                    Rule rule = new Rule();
                    rule.pattern = (Perl5Pattern) compiler.compile(patternValue);
                    rule.substitution = subValue;
                    rules.add(rule);
                }
            }
        } catch (Exception e) {
            if (LOG.isFatalEnabled()) {
                LOG.fatal("error parsing conf file: " + e);
            }
            return EMPTY_RULES;
        }
        if (rules.size() == 0)
            return EMPTY_RULES;
        return rules;
    }

    /** Spits out patterns and substitutions that are in the configuration file. */
    public static void main(String args[]) throws MalformedPatternException, IOException {
        RegexURLNormalizer normalizer = new RegexURLNormalizer();
        normalizer.setConf(NutchConfiguration.create());
        Iterator i = ((List) normalizer.scopedRules.get(URLNormalizers.SCOPE_DEFAULT)).iterator();
        System.out.println("* Rules for 'DEFAULT' scope:");
        while (i.hasNext()) {
            Rule r = (Rule) i.next();
            System.out.print("  " + r.pattern.getPattern() + " -> ");
            System.out.println(r.substitution);
        }
        // load the scope
        if (args.length > 1) {
            normalizer.normalize("http://test.com", args[1]);
        }
        if (normalizer.scopedRules.size() > 1) {
            Iterator it = normalizer.scopedRules.keySet().iterator();
            while (it.hasNext()) {
                String scope = (String) it.next();
                if (URLNormalizers.SCOPE_DEFAULT.equals(scope))
                    continue;
                System.out.println("* Rules for '" + scope + "' scope:");
                i = ((List) normalizer.scopedRules.get(scope)).iterator();
                while (i.hasNext()) {
                    Rule r = (Rule) i.next();
                    System.out.print("  " + r.pattern.getPattern() + " -> ");
                    System.out.println(r.substitution);
                }
            }
        }
        if (args.length > 0) {
            System.out.println("\n---------- Normalizer test -----------");
            String scope = URLNormalizers.SCOPE_DEFAULT;
            if (args.length > 1)
                scope = args[1];
            System.out.println("Scope: " + scope);
            System.out.println("Input url:  '" + args[0] + "'");
            System.out.println("Output url: '" + normalizer.normalize(args[0], scope) + "'");
        }
        System.exit(0);
    }

}