org.apache.nutch.analysis.lang.HTMLLanguageParser.java Source code

Introduction

Here is the source code for org.apache.nutch.analysis.lang.HTMLLanguageParser.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.analysis.lang;

// JDK imports

import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseFilter;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.NodeWalker;
import org.apache.tika.language.LanguageIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

import java.io.IOException;
import java.io.InputStream;
import java.lang.CharSequence;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Parse filter that works out the language of a document and, when one is
 * found, stores it in the page metadata under {@link Metadata#LANGUAGE}.
 * <p>
 * Two strategies are available and are ordered by the
 * <code>lang.extraction.policy</code> configuration property:
 * <ul>
 * <li><code>detect</code> - look at already extracted metadata, the HTML
 * markup and the HTTP <code>Content-Language</code> header;</li>
 * <li><code>identify</code> - run statistical identification on the parsed
 * title and text.</li>
 * </ul>
 * When both are configured, the one listed first is tried first and the other
 * is used as a fallback.
 */
public class HTMLLanguageParser implements ParseFilter {

    public static final Logger LOG = LoggerFactory.getLogger(HTMLLanguageParser.class);

    private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

    /** Classpath resource (relative to this class) holding the language code mappings. */
    private static final String MAPPINGS_RESOURCE = "langmappings.properties";

    /**
     * Charset used when storing the language in the page metadata, so the
     * stored bytes do not depend on the platform default charset.
     */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /**
     * Lookup table from a lower-cased language code or alias to the language
     * key declared in {@link #MAPPINGS_RESOURCE}. Must be declared after
     * {@link #LOG} because the loader logs failures.
     */
    private static final Map<String, String> LANGUAGES_MAP = loadLanguageMappings();

    /** Position of "detect" in the configured policy, or -1 when absent. */
    private int detect = -1;

    /** Position of "identify" in the configured policy, or -1 when absent. */
    private int identify = -1;

    private Configuration conf;

    private boolean onlyCertain;

    /**
     * Reads {@link #MAPPINGS_RESOURCE} and builds the code/alias lookup table.
     * Each property key maps to itself and every comma separated alias in its
     * value maps to the key. Loading is best effort: on any failure the error
     * is logged and whatever was read so far (possibly nothing) is returned,
     * so class initialization never fails.
     */
    private static Map<String, String> loadLanguageMappings() {
        Map<String, String> mappings = new HashMap<String, String>();
        InputStream in = HTMLLanguageParser.class.getResourceAsStream(MAPPINGS_RESOURCE);
        if (in == null) {
            LOG.error("Language mappings resource " + MAPPINGS_RESOURCE + " was not found");
            return mappings;
        }
        try {
            Properties props = new Properties();
            props.load(in);
            for (String key : props.stringPropertyNames()) {
                // Lookups are done on lower-cased codes, so index the key the same way.
                mappings.put(key.toLowerCase(Locale.ROOT), key);
                String[] aliases = props.getProperty(key).split(",", -1);
                for (int i = 0; i < aliases.length; i++) {
                    String alias = aliases[i].trim().toLowerCase(Locale.ROOT);
                    // An empty alias (empty value or trailing comma) must not be
                    // registered: it would make every missing attribute match.
                    if (alias.length() > 0) {
                        mappings.put(alias, key);
                    }
                }
            }
        } catch (Exception e) {
            // Deliberately broad: a broken mappings file must not prevent the
            // class from loading.
            LOG.error("Unable to load language mappings from " + MAPPINGS_RESOURCE, e);
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a classpath stream fails.
            }
        }
        return mappings;
    }

    /**
     * Determines the document language according to the configured policy and
     * records it in the page metadata.
     * <p>
     * The "detect" strategy scans for these indications of content language:
     * <ol>
     * <li>html lang attribute
     * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)</li>
     * <li>meta dc.language
     * (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)</li>
     * <li>meta http-equiv (content-language)
     * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)</li>
     * </ol>
     *
     * @param url      URL of the page (not used by this filter)
     * @param page     page whose metadata and headers are read, and whose
     *                 metadata receives the language
     * @param parse    parse result supplying title and text for identification
     * @param metaTags HTML meta tags (not used by this filter)
     * @param doc      DOM of the document, scanned by the "detect" strategy
     * @return the <code>parse</code> argument, always unchanged
     */
    public Parse filter(String url, WebPage page, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
        if (detect < 0 && identify < 0) {
            LOG.warn("No configuration for language extraction policy is provided");
            return parse;
        }

        // "detect" goes first when it is the only strategy or is listed before "identify".
        boolean detectFirst = detect >= 0 && (identify < 0 || detect < identify);

        String lang;
        if (detectFirst) {
            lang = detectLanguage(page, doc);
            if (lang == null && identify >= 0) {
                lang = identifyLanguage(parse);
            }
        } else {
            lang = identifyLanguage(parse);
            if (lang == null && detect >= 0) {
                lang = detectLanguage(page, doc);
            }
        }

        if (lang != null) {
            page.getMetadata().put(new Utf8(Metadata.LANGUAGE), ByteBuffer.wrap(lang.getBytes(UTF_8)));
        }
        return parse;
    }

    /**
     * Tries to find the document's language from the page metadata, then the
     * DOM, then the HTTP Content-Language header.
     *
     * @return the language found, or <code>null</code> if none of the sources
     *         provides one
     */
    private String detectLanguage(WebPage page, DocumentFragment doc) {
        ByteBuffer stored = getLanguageFromMetadata(page.getMetadata());
        String lang = (stored != null) ? Bytes.toString(stored) : new LanguageParser(doc).getLanguage();
        if (lang != null) {
            return lang;
        }

        // Mirror the null tolerance applied to the metadata map.
        if (page.getHeaders() == null) {
            return null;
        }
        CharSequence header = page.getHeaders().get(new Utf8(Response.CONTENT_LANGUAGE));
        return (header != null) ? header.toString() : null;
    }

    /**
     * Uses statistical language identification on the parsed title and text.
     *
     * @return the identified language, or <code>null</code> when there is no
     *         parse, or when only certain results are accepted and the
     *         identifier is not reasonably certain
     */
    private String identifyLanguage(Parse parse) {
        if (parse == null) {
            return null;
        }

        StringBuilder text = new StringBuilder();
        String title = parse.getTitle();
        if (title != null) {
            text.append(title);
        }
        String content = parse.getText();
        if (content != null) {
            text.append(" ").append(content);
        }

        LanguageIdentifier identifier = new LanguageIdentifier(text.toString());
        if (onlyCertain && !identifier.isReasonablyCertain()) {
            return null;
        }
        return identifier.getLanguage();
    }

    /**
     * Checks whether a language has already been stored in the metadata
     * (presumably by Tika - verify against the producer), looking at
     * <code>dc.language</code>, then <code>content-language</code>, then
     * <code>lang</code>.
     *
     * @return the first value found, or <code>null</code> if the map is
     *         <code>null</code> or holds none of the keys
     */
    private static ByteBuffer getLanguageFromMetadata(Map<CharSequence, ByteBuffer> metadata) {
        if (metadata == null) {
            return null;
        }

        // dublin core
        ByteBuffer lang = metadata.get(new Utf8("dc.language"));
        if (lang != null) {
            return lang;
        }
        // meta content-language
        lang = metadata.get(new Utf8("content-language"));
        if (lang != null) {
            return lang;
        }
        // lang attribute
        return metadata.get(new Utf8("lang"));
    }

    /**
     * Walks a DOM tree collecting language hints. The resulting language is
     * the first available of: a <code>lang</code> attribute on any element, a
     * <code>dc.language</code> meta tag, a <code>content-language</code>
     * http-equiv meta tag.
     */
    static class LanguageParser {

        /** Characters separating the individual values of a multi-valued language string. */
        private static final Pattern VALUE_SEPARATORS = Pattern.compile("[, ;.()=]");

        private String dublinCore = null;
        private String htmlAttribute = null;
        private String httpEquiv = null;
        private String language = null;

        LanguageParser(Node node) {
            parse(node);
            if (htmlAttribute != null) {
                language = htmlAttribute;
            } else if (dublinCore != null) {
                language = dublinCore;
            } else {
                language = httpEquiv;
            }
        }

        /** @return the language found in the DOM, or <code>null</code> if none was found */
        String getLanguage() {
            return language;
        }

        /**
         * Scans the tree rooted at <code>node</code>, filling in each hint the
         * first time it is seen, and stops early once all three are known. A
         * <code>null</code> node leaves all hints unset.
         */
        void parse(Node node) {
            if (node == null) {
                return;
            }

            NodeWalker walker = new NodeWalker(node);
            while (walker.hasNext()) {
                Node currentNode = walker.nextNode();

                if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
                    // Check for the lang HTML attribute
                    if (htmlAttribute == null) {
                        htmlAttribute = parseLanguage(((Element) currentNode).getAttribute("lang"));
                    }

                    // Check for Meta
                    if ("meta".equalsIgnoreCase(currentNode.getNodeName())) {
                        NamedNodeMap attrs = currentNode.getAttributes();
                        if (dublinCore == null) {
                            dublinCore = metaLanguage(attrs, "name", "dc.language");
                        }
                        if (httpEquiv == null) {
                            httpEquiv = metaLanguage(attrs, "http-equiv", "content-language");
                        }
                    }
                }

                if ((dublinCore != null) && (htmlAttribute != null) && (httpEquiv != null)) {
                    return;
                }
            }
        }

        /**
         * Returns the language declared by a meta element's
         * <code>content</code> attribute when the element carries an attribute
         * named <code>attrName</code> whose value is
         * <code>expectedValue</code> (both compared ignoring case).
         *
         * @return the parsed language, or <code>null</code> when the meta
         *         element does not match, has no content, or the content holds
         *         no known language
         */
        private static String metaLanguage(NamedNodeMap attrs, String attrName, String expectedValue) {
            for (int i = 0; i < attrs.getLength(); i++) {
                Node attr = attrs.item(i);
                if (attrName.equalsIgnoreCase(attr.getNodeName())
                        && expectedValue.equalsIgnoreCase(attr.getNodeValue())) {
                    Node content = attrs.getNamedItem("content");
                    return (content == null) ? null : parseLanguage(content.getNodeValue());
                }
            }
            return null;
        }

        /**
         * Parse a language string and return an ISO 639 primary code, or
         * <code>null</code> if something wrong occurs, or if no language is found.
         * <p>
         * The string is split into values on <code>, ; . ( ) =</code> and
         * space; for each value the part before the first <code>-</code> or
         * <code>_</code> is looked up, and the first known one wins.
         */
        final static String parseLanguage(String lang) {
            if (lang == null) {
                return null;
            }

            // First, split multi-valued values
            String[] candidates = VALUE_SEPARATORS.split(lang, -1);
            for (int i = 0; i < candidates.length; i++) {
                // Then, get the primary code
                String code = primarySubtag(candidates[i]);
                if (code.length() == 0) {
                    continue;
                }
                // Find the ISO 639 code; Locale.ROOT keeps "IT"/"ID" working
                // under locales with special casing rules for 'I'.
                String language = LANGUAGES_MAP.get(code.toLowerCase(Locale.ROOT));
                if (language != null) {
                    return language;
                }
            }
            return null;
        }

        /**
         * Returns the part of <code>tag</code> before its first <code>-</code>
         * or <code>_</code>, or the whole tag if it contains neither. Done by
         * scanning instead of <code>split(...)[0]</code> because splitting a
         * value made only of separators (e.g. "-") yields an empty array.
         */
        private static String primarySubtag(String tag) {
            for (int i = 0; i < tag.length(); i++) {
                char c = tag.charAt(i);
                if (c == '-' || c == '_') {
                    return tag.substring(0, i);
                }
            }
            return tag;
        }

    }

    /**
     * Stores the configuration and reads the filter settings from it:
     * <code>lang.identification.only.certain</code> (default false) and
     * <code>lang.extraction.policy</code>, an ordered list that may contain
     * "detect" and "identify". A missing policy leaves both strategies
     * disabled, in which case {@link #filter} only logs a warning.
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
        // Start from a clean state so that re-configuring does not keep stale positions.
        detect = -1;
        identify = -1;
        if (conf == null) {
            return;
        }

        onlyCertain = conf.getBoolean("lang.identification.only.certain", false);
        String[] policy = conf.getStrings("lang.extraction.policy");
        if (policy == null) {
            // Property not set.
            return;
        }
        for (int i = 0; i < policy.length; i++) {
            String step = policy[i].trim();
            if (step.equals("detect")) {
                detect = i;
            } else if (step.equals("identify")) {
                identify = i;
            }
        }
    }

    /** @return the configuration passed to {@link #setConf(Configuration)} */
    public Configuration getConf() {
        return this.conf;
    }

    /** @return the collection of page fields declared by this filter (nothing is added to it in this class) */
    public Collection<Field> getFields() {
        return FIELDS;
    }
}