ivory.core.tokenize.Tokenizer.java Source code

Introduction

Here is the source code for ivory.core.tokenize.Tokenizer.java
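
Tokenizer is the abstract base class for Ivory's language-specific tokenizers. Concrete subclasses implement configure() and processContent(); this class supplies the shared machinery for stopword removal, stemming, vocabulary filtering, and pre/post-normalization of apostrophes, quotes, and dashes, along with a command-line main() driver for tokenizing a file.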

Source

/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.core.tokenize;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import java.util.Set;
import java.util.Map;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import edu.umd.hooka.VocabularyWritable;

public abstract class Tokenizer {
    private static final Logger LOG = Logger.getLogger(Tokenizer.class);
    static {
        LOG.setLevel(Level.INFO);
    }

    public abstract void configure(Configuration conf);

    public abstract void configure(Configuration conf, FileSystem fs);

    public abstract String[] processContent(String text);

    /**
     * Creates a mapping from the stemmed version of each token to its non-stemmed version.
     * Useful in IR tasks where we want to recover the non-stemmed form. Implemented in some
     * subclasses.
     *
     * @param text
     *    text to process
     * @return
     *    mapping from stemmed token to non-stemmed token
     */
    public Map<String, String> getStem2NonStemMapping(String text) {
        throw new UnsupportedOperationException();
    }

    // Single-character delimiters; isStopWord() treats each of these as a stopword.
    protected static String delims = "`~!@#^&*()-_=+]}[{\\|'\";:/?.>,<";
    // Tokens shorter than MIN_LENGTH or longer than MAX_LENGTH are discarded by isDiscard().
    protected static int MIN_LENGTH = 2, MAX_LENGTH = 50;
    protected VocabularyWritable vocab;
    protected boolean isStopwordRemoval = false, isStemming = false;
    protected Set<String> stopwords;
    protected Set<String> stemmedStopwords;

    public boolean isStemming() {
        return isStemming;
    }

    public boolean isStopwordRemoval() {
        return isStopwordRemoval;
    }

    /**
     * Discard tokens not in the provided vocabulary.
     * 
     * @param v
     *    vocabulary for tokenizer
     */
    public void setVocab(VocabularyWritable v) {
        vocab = v;
    }

    public VocabularyWritable getVocab() {
        return vocab;
    }

    protected Set<String> readInput(FileSystem fs, String file) {
        Set<String> lines = new HashSet<String>();
        try {
            if (file == null) {
                return lines;
            }
            LOG.info("File " + file + " exists? " + fs.exists(new Path(file)) + ", fs: " + fs);
            FSDataInputStream fis = fs.open(new Path(file));
            InputStreamReader isr = new InputStreamReader(fis, "UTF8");
            BufferedReader in = new BufferedReader(isr);
            String line;

            while ((line = in.readLine()) != null) {
                lines.add(line);
            }
            in.close();
            return lines;
        } catch (Exception e) {
            LOG.warn("Problem reading stopwords from " + file);
            // Preserve the original exception as the cause instead of discarding it.
            throw new RuntimeException("Problem reading stopwords from " + file, e);
        }
    }

    /**
     * Method to return number of tokens in text. Subclasses may override for more efficient implementations.
     * 
     * @param text
     *    text to be processed.
     * @return
     *    number of tokens in text.
     */
    public int getNumberTokens(String text) {
        return processContent(text).length;
    }

    public float getOOVRate(String text, VocabularyWritable vocab) {
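        // Note: this returns NaN when processContent() yields no tokens (0/0 in float
        // arithmetic), and 0 when vocab is null.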
        int countOOV = 0, countAll = 0;
        for (String token : processContent(text)) {
            countAll++;
            if (vocab != null && vocab.get(token) <= 0) {
                countOOV++;
            }
        }
        return (countOOV / (float) countAll);
    }

    /**
     * Removes non-Unicode characters from a token, to prevent errors later in the
     * preprocessing pipeline. Such cases exist in German Wikipedia.
     *
     * @param token
     *    token to check for non-Unicode characters
     * @return
     *    token without the non-Unicode characters
     */
    public static String removeNonUnicodeChars(String token) {
        StringBuilder fixedToken = new StringBuilder();
        for (int i = 0; i < token.length(); i++) {
            char c = token.charAt(i);
            if (Character.getNumericValue(c) >= -1) {
                fixedToken.append(c);
            }
        }
        return fixedToken.toString();
    }

    /**
     * Check for the right single quotation mark U+2019 (which looks like a reversed `) and
     * normalize it to a standard apostrophe followed by a space.
     *
     * @param text French text
     * @return fixed version of the text
     */
    public static String normalizeFrench(String text) {
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) == '\u2019') { // right single quotation mark
                out.append("' ");
            } else {
                out.append(text.charAt(i));
            }
        }
        return out.toString();
    }

    /**
     * Normalize apostrophe variations for better tokenization.
     *  
     * @param text
     *    text, before any tokenization
     * @return
     *    normalized text, ready to be run through tokenizer   
     */
    protected static String preNormalize(String text) {
        return text.replaceAll("\u2018", "'").replaceAll("\u2060", "'").replaceAll("\u201C", "\"")
                .replaceAll("\u201D", "\"").replaceAll("\u201B", "'").replaceAll("\u201F", "\"")
                .replaceAll("\u201E", "\"").replaceAll("\u00B4", "'").replaceAll("\u301F", "\"")
                .replaceAll("\u2019", "'").replaceAll("\u0060", "'");
    }
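    // For example, preNormalize turns curly quotes into ASCII: “l’homme” becomes "l'homme".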

    /**
     * Fix several common tokenization errors.
     *  
     * @param text
     *    text, after tokenization
     * @return
     *    text, after fixing possible errors
     */
    protected static String postNormalize(String text) {
        // The characters stripped from this listing were most likely the en dash (U+2013),
        // the em dash (U+2014), and the soft hyphen (U+00AD); they are reconstructed here
        // as Unicode escapes.
        return text.replaceAll("\\((\\S)", "( $1").replaceAll("(\\S)\\)", "$1 )").replaceAll("''(\\S)", "'' $1")
                .replaceAll("\u2013", "-").replaceAll("\u2014", "-").replaceAll("(\\S)-(\\S)", "$1 - $2")
                .replaceAll("\u00AD", "").replaceAll(" ' s ", " 's ").replaceAll(" l ' ", " l' ")
                .replaceAll("\"(\\S)", "\" $1").replaceAll("(\\S)\"", "$1 \"");
    }
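    // For example, postNormalize("(foo)-bar") yields "( foo ) - bar".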

    /**
     * Convert tokenStream object into a string.
     * 
     * @param tokenStream
     *    object returned by Lucene tokenizer
     * @return
     *    String corresponding to the tokens output by tokenStream
     */
    protected static String streamToString(TokenStream tokenStream) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.clearAttributes();
        StringBuilder tokenized = new StringBuilder();
        try {
            while (tokenStream.incrementToken()) {
                tokenized.append(termAtt.toString() + " ");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return tokenized.toString().trim();
    }

    /**
     * Overridden by applicable implementing classes.
     *
     * @param token
     *    token to check
     * @return
     *    true if the token is a stopword, false otherwise
     */
    public boolean isStopWord(String token) {
        return delims.contains(token) || (isStemming() && stemmedStopwords.contains(token))
                || (!isStemming() && stopwords.contains(token));
    }

    /**
     * Overridden by applicable implementing classes.
     *
     * @param isStemmed
     *    true if the token has been stemmed, false otherwise
     * @param token
     *    token to check
     * @return
     *    true if the token is a stopword, false otherwise
     */
    public boolean isStopWord(boolean isStemmed, String token) {
        return delims.contains(token) || (isStemmed && stemmedStopwords.contains(token))
                || (!isStemmed && stopwords.contains(token));
    }

    public boolean isDiscard(String token) {
        return (token.length() < MIN_LENGTH || token.length() > MAX_LENGTH || isStopWord(token));
    }

    public boolean isDiscard(boolean isStemmed, String token) {
        return (token.length() < MIN_LENGTH || token.length() > MAX_LENGTH || isStopWord(isStemmed, token));
    }

    /**
     * Remove leading and trailing stopwords from text that has already been tokenized.
     * Useful when postprocessing the output of an MT system, which is tokenized but has not
     * had stopwords removed.
     *
     * @param tokenizedText
     *    input text, assumed to be tokenized.
     * @return
     *    same text without the leading and trailing stopwords.
     */
    @Deprecated
    public String removeBorderStopWords(String tokenizedText) {
        String[] tokens = tokenizedText.split(" ");
        int start = 0, end = tokens.length - 1;
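        // If every token is a stopword, neither loop below breaks; start and end keep
        // their initial values and the text is returned unchanged.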

        for (int i = 0; i < tokens.length; i++) {
            if (!isStopWord(tokens[i])) {
                start = i;
                break;
            }
        }
        for (int i = tokens.length - 1; i >= 0; i--) {
            if (!isStopWord(tokens[i])) {
                end = i;
                break;
            }
        }

        // Rebuild the string from the first through the last non-stopword token.
        StringBuilder output = new StringBuilder();
        for (int i = start; i <= end; i++) {
            output.append(tokens[i]).append(' ');
        }
        return output.toString().trim();
    }

    public String stem(String token) {
        return token;
    }

    // Returns the hex value of each UTF-16 code unit in the token, space-separated.
    public String getUTF8(String token) {
        StringBuilder utf8 = new StringBuilder();
        for (int i = 0; i < token.length(); i++) {
            utf8.append(String.format("%04x", (int) token.charAt(i))).append(' ');
        }
        return utf8.toString().trim();
    }

    @SuppressWarnings("static-access")
    public static void main(String[] args) {
        Options options = new Options();
        options.addOption(OptionBuilder.withArgName("full path to model file or directory").hasArg()
                .withDescription("model file").create("model"));
        options.addOption(OptionBuilder.withArgName("full path to input file").hasArg()
                .withDescription("input file").isRequired().create("input"));
        options.addOption(OptionBuilder.withArgName("full path to output file").hasArg()
                .withDescription("output file").isRequired().create("output"));
        options.addOption(OptionBuilder.withArgName("en | zh | de | fr | ar | tr | es").hasArg()
                .withDescription("2-character language code").isRequired().create("lang"));
        options.addOption(OptionBuilder.withArgName("path to stopwords list").hasArg()
                .withDescription("one stopword per line").create("stopword"));
        options.addOption(OptionBuilder.withArgName("path to stemmed stopwords list").hasArg()
                .withDescription("one stemmed stopword per line").create("stemmed_stopword"));
        options.addOption(OptionBuilder.withArgName("true|false").hasArg().withDescription("turn on/off stemming")
                .create("stem"));
        options.addOption(OptionBuilder.withDescription("Hadoop option to load external jars")
                .withArgName("jar packages").hasArg().create("libjars"));

        CommandLine cmdline;
        CommandLineParser parser = new GnuParser();
        try {
            String stopwordList = null, stemmedStopwordList = null, modelFile = null;
            boolean isStem = true;
            cmdline = parser.parse(options, args);
            if (cmdline.hasOption("stopword")) {
                stopwordList = cmdline.getOptionValue("stopword");
            }
            if (cmdline.hasOption("stemmed_stopword")) {
                stemmedStopwordList = cmdline.getOptionValue("stemmed_stopword");
            }
            if (cmdline.hasOption("stem")) {
                isStem = Boolean.parseBoolean(cmdline.getOptionValue("stem"));
            }
            if (cmdline.hasOption("model")) {
                modelFile = cmdline.getOptionValue("model");
            }

            ivory.core.tokenize.Tokenizer tokenizer = TokenizerFactory.createTokenizer(
                    cmdline.getOptionValue("lang"), modelFile, isStem, stopwordList, stemmedStopwordList, null);
            BufferedWriter out = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(cmdline.getOptionValue("output")), "UTF8"));
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(new FileInputStream(cmdline.getOptionValue("input")), "UTF8"));

            String line = null;
            while ((line = in.readLine()) != null) {
                String[] tokens = tokenizer.processContent(line);
                StringBuilder s = new StringBuilder();
                for (String token : tokens) {
                    s.append(token).append(' ');
                }
                out.write(s.toString().trim() + "\n");
            }
            in.close();
            out.close();

        } catch (Exception exp) {
            System.err.println(exp);
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("Tokenizer", options);
            System.exit(-1);
        }
    }
}
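
Example usage

The source above defines its own command line in main(), so a quick sketch of both invocation styles may help. The file paths below are hypothetical placeholders, and the createTokenizer() call simply mirrors the one in main() above; consult TokenizerFactory for the languages each argument combination supports.

// Command line, per the options declared in main():
//   java ivory.core.tokenize.Tokenizer -lang en -input raw.txt -output tokens.txt \
//       -stopword stopwords.en -stem true

import ivory.core.tokenize.Tokenizer;
import ivory.core.tokenize.TokenizerFactory;

public class TokenizeExample {
    public static void main(String[] args) {
        // Hypothetical paths; substitute a real model file and stopword list.
        Tokenizer tokenizer = TokenizerFactory.createTokenizer(
                "en", "/path/to/model", true, "/path/to/stopwords.en", null, null);
        for (String token : tokenizer.processContent("The quick brown fox's jump.")) {
            System.out.println(token);
        }
    }
}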