org.apache.nutch.analysis.lang.LanguageIdentifier.java Source code

Introduction

Here is the source code for org.apache.nutch.analysis.lang.LanguageIdentifier.java.
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.analysis.lang;

// JDK imports
import java.io.File;
import java.io.InputStream;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.List;
import java.util.Vector;
import java.util.HashMap;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.Properties;
import java.util.Enumeration;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;

// Nutch imports
import org.apache.nutch.analysis.lang.NGramProfile.NGramEntry;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParserNotFound;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolNotFound;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.util.NutchConfiguration;

/**
 * Identify the language of a content, based on statistical analysis.
 *
 * <p>At construction time the ngram profiles of all the languages listed in
 * <code>langmappings.properties</code> are loaded and indexed by ngram
 * sequence. Identification then builds an ngram profile of the content to
 * analyze (the "suspect") and scores it against the indexed profiles.</p>
 *
 * <p>NOTE(review): every call to <code>identify</code> re-populates the single
 * shared suspect profile held by the instance, so one instance should
 * presumably not be used by several threads at once without external
 * synchronization &mdash; confirm against <code>NGramProfile</code>.</p>
 *
 * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
 *      Language Codes</a>
 * 
 * @author Sami Siren
 * @author J&eacute;r&ocirc;me Charron
 */
public class LanguageIdentifier {

    private final static int DEFAULT_ANALYSIS_LENGTH = 0; // 0 means full content

    private final static Log LOG = LogFactory.getLog(LanguageIdentifier.class);

    /** Command line usage message. */
    private final static String USAGE = "Usage: LanguageIdentifier " + "[-identifyrows filename maxlines] "
            + "[-identifyfile charset filename] " + "[-identifyfileset charset files] "
            + "[-identifytext text] " + "[-identifyurl url]";

    // Command identifiers used by the command line front-end
    private final static int IDFILE = 1;
    private final static int IDTEXT = 2;
    private final static int IDURL = 3;
    private final static int IDFILESET = 4;
    private final static int IDROWS = 5;

    private ArrayList<NGramProfile> languages = new ArrayList<NGramProfile>();

    private ArrayList<String> supportedLanguages = new ArrayList<String>();

    /** Minimum size of NGrams */
    private int minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;

    /** Maximum size of NGrams */
    private int maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;

    /** The maximum amount of data to analyze (a value &lt;= 0 means no limit) */
    private int analyzeLength = DEFAULT_ANALYSIS_LENGTH;

    /** A global index of ngrams of all supported languages */
    private HashMap<CharSequence, NGramEntry[]> ngramsIdx = new HashMap<CharSequence, NGramEntry[]>();

    /** The NGramProfile used for identification */
    private NGramProfile suspect = null;

    /**
     * Constructs a new Language Identifier.
     *
     * <p>Reads <code>lang.ngram.min.length</code>,
     * <code>lang.ngram.max.length</code> and
     * <code>lang.analyze.max.length</code> from the configuration, then loads
     * the ngram profile of every language listed in
     * <code>langmappings.properties</code>. Loading failures are logged and do
     * not propagate: the identifier is then simply left with fewer (possibly
     * zero) supported languages.</p>
     *
     * @param conf is the configuration to read the settings from.
     */
    public LanguageIdentifier(Configuration conf) {

        // Gets ngram sizes to take into account from the Nutch Config
        minLength = conf.getInt("lang.ngram.min.length", NGramProfile.DEFAULT_MIN_NGRAM_LENGTH);
        maxLength = conf.getInt("lang.ngram.max.length", NGramProfile.DEFAULT_MAX_NGRAM_LENGTH);
        // Ensure the min and max values are in an acceptable range
        // (ie ABSOLUTE_MIN_NGRAM_LENGTH <= min <= max <= ABSOLUTE_MAX_NGRAM_LENGTH)
        maxLength = Math.min(maxLength, NGramProfile.ABSOLUTE_MAX_NGRAM_LENGTH);
        maxLength = Math.max(maxLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
        minLength = Math.max(minLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
        minLength = Math.min(minLength, maxLength);

        // Gets the value of the maximum size of data to analyze
        analyzeLength = conf.getInt("lang.analyze.max.length", DEFAULT_ANALYSIS_LENGTH);

        try {
            // Create the suspect profile first, so that identify() does not
            // fail on a null profile if loading the language profiles fails.
            suspect = new NGramProfile("suspect", minLength, maxLength);

            Properties p = loadLanguageMappings();

            if (LOG.isInfoEnabled()) {
                LOG.info(new StringBuffer().append("Language identifier configuration [").append(minLength)
                        .append("-").append(maxLength).append("/").append(analyzeLength).append("]").toString());
            }

            StringBuffer list = new StringBuffer("Language identifier plugin supports:");
            // Groups the entries of all the profiles.
            // NOTE(review): grouping entries of different profiles under one key
            // relies on NGramEntry equals/hashCode being based on the ngram
            // sequence -- confirm against NGramProfile.NGramEntry.
            HashMap<NGramEntry, List<NGramEntry>> tmpIdx = new HashMap<NGramEntry, List<NGramEntry>>();
            Enumeration<?> allLanguages = p.keys();
            while (allLanguages.hasMoreElements()) {
                String lang = (String) (allLanguages.nextElement());

                InputStream is = this.getClass().getClassLoader().getResourceAsStream(
                        "org/apache/nutch/analysis/lang/" + lang + "." + NGramProfile.FILE_EXTENSION);
                if (is == null) {
                    // No profile available for this language: skip it
                    continue;
                }
                try {
                    int count = registerProfile(lang, is, tmpIdx);
                    list.append(" " + lang + "(" + count + ")");
                } catch (IOException e1) {
                    if (LOG.isFatalEnabled()) {
                        LOG.fatal("Unable to load the ngram profile of language " + lang, e1);
                    }
                } finally {
                    closeQuietly(is);
                }
            }
            // transform all ngrams lists to arrays for performances
            for (NGramEntry entry : tmpIdx.keySet()) {
                List<NGramEntry> l = tmpIdx.get(entry);
                if (l != null) {
                    ngramsIdx.put(entry.getSeq(), l.toArray(new NGramEntry[l.size()]));
                }
            }
            if (LOG.isInfoEnabled()) {
                LOG.info(list.toString());
            }
        } catch (Exception e) {
            if (LOG.isFatalEnabled()) {
                LOG.fatal("Unable to initialize the language identifier", e);
            }
        }
    }

    /**
     * Loads the <code>langmappings.properties</code> resource, whose keys are
     * the codes of the languages to load a profile for.
     *
     * @return the loaded properties.
     * @throws IOException if the resource is missing or cannot be read.
     */
    private Properties loadLanguageMappings() throws IOException {
        Properties mappings = new Properties();
        InputStream in = this.getClass().getResourceAsStream("langmappings.properties");
        if (in == null) {
            throw new IOException("Resource langmappings.properties not found");
        }
        try {
            mappings.load(in);
        } finally {
            closeQuietly(in);
        }
        return mappings;
    }

    /**
     * Loads the ngram profile of a language and adds its entries to the
     * specified temporary index. The language is recorded as supported only
     * once its profile has been loaded.
     *
     * @param lang is the code of the language.
     * @param is is the stream to read the profile from (not closed here).
     * @param index is the temporary index to add the profile entries to.
     * @return the number of ngrams of the loaded profile.
     * @throws IOException if the profile cannot be read.
     */
    private int registerProfile(String lang, InputStream is, HashMap<NGramEntry, List<NGramEntry>> index)
            throws IOException {
        NGramProfile profile = new NGramProfile(lang, minLength, maxLength);
        profile.load(is);
        languages.add(profile);
        supportedLanguages.add(lang);
        List<NGramEntry> ngrams = profile.getSorted();
        for (NGramEntry entry : ngrams) {
            List<NGramEntry> registered = index.get(entry);
            if (registered == null) {
                registered = new ArrayList<NGramEntry>();
                index.put(entry, registered);
            }
            registered.add(entry);
            entry.setProfile(profile);
        }
        return ngrams.size();
    }

    /**
     * Closes a stream, logging (instead of propagating) any failure so that
     * it cannot hide an exception raised while the stream was in use.
     *
     * @param is is the stream to close.
     */
    private static void closeQuietly(InputStream is) {
        try {
            is.close();
        } catch (IOException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Unable to close stream", e);
            }
        }
    }

    /**
     * Main method used for command line process.
     * <br/>Usage is:
     * <pre>
     * LanguageIdentifier [-identifyrows filename maxlines]
     *                    [-identifyfile charset filename]
     *                    [-identifyfileset charset files]
     *                    [-identifytext text]
     *                    [-identifyurl url]
     * </pre>
     * If several commands are specified, the last one wins.
     * <code>-identifytext</code> and <code>-identifyfileset</code> consume
     * all the remaining arguments.
     *
     * @param args arguments.
     */
    public static void main(String args[]) {

        if (args.length == 0) {
            System.err.println(USAGE);
            System.exit(-1);
        }

        int command = 0;
        List<String> fileset = new ArrayList<String>();
        String filename = "";
        String charset = "";
        String text = "";
        int max = 0;

        // parse command line: an else-if chain is used so that the value of
        // an option is never mistaken for another option
        for (int i = 0; i < args.length; i++) {
            if (args[i].equals("-identifyfile")) {
                requireArgs(args, i, 2);
                command = IDFILE;
                charset = args[++i];
                filename = args[++i];

            } else if (args[i].equals("-identifyurl")) {
                requireArgs(args, i, 1);
                command = IDURL;
                filename = args[++i];

            } else if (args[i].equals("-identifyrows")) {
                requireArgs(args, i, 2);
                command = IDROWS;
                filename = args[++i];
                try {
                    max = Integer.parseInt(args[++i]);
                } catch (NumberFormatException e) {
                    System.err.println("Invalid maxlines value: " + args[i]);
                    System.err.println(USAGE);
                    System.exit(-1);
                }

            } else if (args[i].equals("-identifytext")) {
                command = IDTEXT;
                StringBuilder words = new StringBuilder();
                for (int j = i + 1; j < args.length; j++) {
                    words.append(args[j]).append(" ");
                }
                text = words.toString();
                // all the remaining arguments are part of the text
                i = args.length;

            } else if (args[i].equals("-identifyfileset")) {
                requireArgs(args, i, 1);
                command = IDFILESET;
                charset = args[++i];
                for (i++; i < args.length; i++) {
                    File f = new File(args[i]);
                    // listFiles() may return null if the directory cannot be read
                    File[] files = f.isDirectory() ? f.listFiles() : new File[] { f };
                    if (files == null) {
                        System.err.println("Unable to list the content of " + f);
                        continue;
                    }
                    for (int j = 0; j < files.length; j++) {
                        fileset.add(files[j].getAbsolutePath());
                    }
                }
            }
        }

        if (command == 0) {
            // None of the arguments was a known command
            System.err.println(USAGE);
            System.exit(-1);
        }

        Configuration conf = NutchConfiguration.create();
        String lang = null;
        LanguageIdentifier idfr = new LanguageIdentifier(conf);
        try {
            switch (command) {

            case IDTEXT:
                lang = idfr.identify(text);
                break;

            case IDFILE:
                lang = identifyFile(idfr, filename, charset);
                break;

            case IDURL:
                text = getUrlContent(filename, conf);
                if (text == null) {
                    System.err.println("Unable to retrieve the content of " + filename);
                } else {
                    lang = idfr.identify(text);
                }
                break;

            case IDROWS:
                identifyRows(idfr, filename, max);
                System.exit(0);
                break;

            case IDFILESET:
                System.out.println("FILESET");
                for (String name : fileset) {
                    // reset for each file, so that a failure is not reported
                    // with the language of the previous file
                    lang = null;
                    try {
                        lang = identifyFile(idfr, name, charset);
                    } catch (Exception e) {
                        System.out.println(e);
                    }
                    System.out.println(name + " was identified as " + lang);
                }
                System.exit(0);
                break;
            }
        } catch (Exception e) {
            System.out.println(e);
        }
        System.out.println("text was identified as " + lang);
    }

    /**
     * Checks that an option is followed by enough values; prints the usage
     * and exits otherwise.
     *
     * @param args is the command line.
     * @param index is the index of the option in the command line.
     * @param count is the number of values the option requires.
     */
    private static void requireArgs(String[] args, int index, int count) {
        if (index + count >= args.length) {
            System.err.println("Missing argument(s) for " + args[index]);
            System.err.println(USAGE);
            System.exit(-1);
        }
    }

    /**
     * Identifies the language of a file, making sure the file is closed
     * whatever happens.
     *
     * @param idfr is the identifier to use.
     * @param filename is the path of the file to analyze.
     * @param charset is the charset to use to read the file.
     * @return the code of the identified language.
     * @throws IOException if the file cannot be read.
     */
    private static String identifyFile(LanguageIdentifier idfr, String filename, String charset)
            throws IOException {
        FileInputStream fis = new FileInputStream(new File(filename));
        try {
            return idfr.identify(fis, charset);
        } finally {
            fis.close();
        }
    }

    /**
     * Identifies the language of each line of a file (read with the platform
     * default encoding) and prints the result on the standard output. Lines
     * of 2 characters or fewer (once trimmed) are skipped and not counted.
     *
     * @param idfr is the identifier to use.
     * @param filename is the path of the file to read the lines from.
     * @param max is the maximum number of lines to identify.
     * @throws IOException if the file cannot be read.
     */
    private static void identifyRows(LanguageIdentifier idfr, String filename, int max) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filename))));
        try {
            String line;
            while (max > 0 && (line = br.readLine()) != null) {
                line = line.trim();
                if (line.length() > 2) {
                    max--;
                    System.out.println("R=" + idfr.identify(line) + ":" + line);
                }
            }
        } finally {
            br.close();
        }
    }

    /**
     * Fetches and parses the content of a url, and prints the extracted text
     * on the standard output.
     *
     * @param url is the url to fetch.
     * @param conf is the configuration used to get the protocol and parsers.
     * @return the text extracted from the content of the url, or
     *         <code>null</code> if it cannot be fetched or parsed.
     */
    private static String getUrlContent(String url, Configuration conf) {
        try {
            Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
            Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
            if (content == null) {
                System.err.println("No content fetched from " + url);
                return null;
            }
            Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
            if (parse == null) {
                System.err.println("No parse available for " + url);
                return null;
            }
            String text = parse.getText();
            System.out.println("text:" + text);
            return text;

        } catch (ProtocolNotFound e) {
            reportFailure(url, e);
        } catch (ParserNotFound e) {
            reportFailure(url, e);
        } catch (ParseException e) {
            reportFailure(url, e);
        }
        return null;
    }

    /**
     * Reports a failure to retrieve the content of a url, both on the
     * standard error (for the command line user) and in the log.
     *
     * @param url is the url that failed.
     * @param e is the cause of the failure.
     */
    private static void reportFailure(String url, Exception e) {
        System.err.println("Unable to get the content of " + url + ": " + e);
        if (LOG.isErrorEnabled()) {
            LOG.error("Unable to get the content of " + url, e);
        }
    }

    /**
     * Identify language of a content.
     * 
     * @param content is the content to analyze (<code>null</code> is handled
     *        like an empty content).
     * @return The 2 letter
     *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
     *         language code</a> (en, fi, sv, ...) of the language that best
     *         matches the specified content, or an empty string if no
     *         language matches.
     */
    public String identify(String content) {
        return identify(new StringBuilder((content == null) ? "" : content));
    }

    /**
     * Identify language of a content.
     *
     * <p>Only the first <code>lang.analyze.max.length</code> characters of the
     * content are analyzed when this limit is greater than 0. The specified
     * content itself is never modified.</p>
     * 
     * @param content is the content to analyze (<code>null</code> is handled
     *        like an empty content).
     * @return The 2 letter
     *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
     *         language code</a> (en, fi, sv, ...) of the language that best
     *         matches the specified content, or an empty string if no
     *         language matches.
     */
    public String identify(StringBuilder content) {

        if (content == null) {
            return "";
        }

        StringBuilder text = content;
        if ((analyzeLength > 0) && (content.length() > analyzeLength)) {
            // Copy only the part to analyze instead of the whole content
            text = new StringBuilder(analyzeLength).append(content, 0, analyzeLength);
        }

        suspect.analyze(text);

        // NOTE(review): Float.MIN_VALUE is the smallest POSITIVE float, not the
        // most negative one, so a language is selected only once its score is
        // strictly greater than this tiny positive value. Kept as is since the
        // intended threshold cannot be determined from this file.
        float topscore = Float.MIN_VALUE;
        String lang = "";
        HashMap<NGramProfile, Float> scores = new HashMap<NGramProfile, Float>();

        for (NGramEntry searched : suspect.getSorted()) {
            NGramEntry[] ngrams = ngramsIdx.get(searched.getSeq());
            if (ngrams == null) {
                // This ngram is not known by any of the supported languages
                continue;
            }
            for (int j = 0; j < ngrams.length; j++) {
                NGramProfile profile = ngrams[j].getProfile();
                Float previous = scores.get(profile);
                float score = (previous == null) ? 0 : previous.floatValue();
                // The score of a language is the sum, over the ngrams it shares
                // with the content, of both frequencies
                score += ngrams[j].getFrequency() + searched.getFrequency();
                scores.put(profile, Float.valueOf(score));
                if (score > topscore) {
                    topscore = score;
                    lang = profile.getName();
                    // "zh" is returned as soon as it takes the lead, without
                    // scoring the remaining ngrams.
                    // NOTE(review): presumably a deliberate shortcut for
                    // Chinese content -- confirm with the author.
                    if (lang.equals("zh")) {
                        return lang;
                    }
                }
            }
        }
        return lang;
    }

    /**
     * Identify language from input stream.
     * This method uses the platform default encoding to read the input stream.
     * For using a specific encoding, use the
     * {@link #identify(InputStream, String)} method.
     *
     * @param is is the input stream to analyze.
     * @return The 2 letter
     *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
     *         language code</a> (en, fi, sv, ...) of the language that best
     *         matches the content of the specified input stream.
     * @throws IOException if something wrong occurs on the input stream.
     */
    public String identify(InputStream is) throws IOException {
        return identify(is, null);
    }

    /**
     * Identify language from input stream.
     *
     * <p>When <code>lang.analyze.max.length</code> is greater than 0, at most
     * that many <em>bytes</em> are kept from the stream before decoding. The
     * stream is not closed by this method.</p>
     * 
     * @param is is the input stream to analyze.
     * @param charset is the charset to use to read the input stream
     *        (<code>null</code> means the platform default encoding).
     * @return The 2 letter
     *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
     *         language code</a> (en, fi, sv, ...) of the language that best
     *         matches the content of the specified input stream.
     * @throws IOException if something wrong occurs on the input stream.
     */
    public String identify(InputStream is, String charset) throws IOException {

        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] buffer = new byte[2048];
        // Same convention as identify(StringBuilder): no limit unless > 0
        boolean limited = (analyzeLength > 0);
        int len = 0;

        // The size is checked before reading, so that the stream is not read
        // once the limit has been reached
        while ((!limited || (out.size() < analyzeLength)) && ((len = is.read(buffer)) != -1)) {
            if (limited) {
                len = Math.min(len, analyzeLength - out.size());
            }
            out.write(buffer, 0, len);
        }
        return identify((charset == null) ? out.toString() : out.toString(charset));
    }

}