/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.core.tokenize;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import edu.umd.hooka.VocabularyWritable;

public abstract class Tokenizer {
  private static final Logger LOG = Logger.getLogger(Tokenizer.class);
  static { LOG.setLevel(Level.INFO); }

  public abstract void configure(Configuration conf);
  public abstract void configure(Configuration conf, FileSystem fs);
  public abstract String[] processContent(String text);

  /*
   * A method to create a mapping from the stemmed version of each token to the
   * non-stemmed version. Useful in IR tasks where we want to recover the
   * non-stemmed version. Implemented in some subclasses.
   */
  public Map<String, String> getStem2NonStemMapping(String text) {
    throw new UnsupportedOperationException();
  }

  protected static String delims = "`~!@#^&*()-_=+]}[{\\|'\";:/?.>,<";
  protected static int MIN_LENGTH = 2, MAX_LENGTH = 50;
  protected VocabularyWritable vocab;
  protected boolean isStopwordRemoval = false, isStemming = false;
  protected Set<String> stopwords;
  protected Set<String> stemmedStopwords;

  public boolean isStemming() {
    return isStemming;
  }

  public boolean isStopwordRemoval() {
    return isStopwordRemoval;
  }

  /**
   * Discard tokens not in the provided vocabulary.
   *
   * @param v vocabulary for tokenizer
   */
  public void setVocab(VocabularyWritable v) {
    vocab = v;
  }

  public VocabularyWritable getVocab() {
    return vocab;
  }

  protected Set<String> readInput(FileSystem fs, String file) {
    Set<String> lines = new HashSet<String>();
    try {
      if (file == null) {
        return lines;
      }
      LOG.info("File " + file + " exists? " + fs.exists(new Path(file)) + ", fs: " + fs);
      FSDataInputStream fis = fs.open(new Path(file));
      InputStreamReader isr = new InputStreamReader(fis, "UTF8");
      BufferedReader in = new BufferedReader(isr);
      String line;
      while ((line = in.readLine()) != null) {
        lines.add(line);
      }
      in.close();
      return lines;
    } catch (Exception e) {
      LOG.warn("Problem reading stopwords from " + file);
      throw new RuntimeException("Problem reading stopwords from " + file);
    }
  }
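  // Illustrative sketch (not part of the original class): a minimal concrete
  // subclass only needs to implement the two configure() methods and
  // processContent(). Something like:
  //
  //   public class WhitespaceTokenizer extends Tokenizer {
  //     @Override public void configure(Configuration conf) {}
  //     @Override public void configure(Configuration conf, FileSystem fs) {}
  //     @Override public String[] processContent(String text) {
  //       return preNormalize(text).split("\\s+");
  //     }
  //   }
  //
  // Real implementations (obtained via TokenizerFactory in main() below)
  // typically also load stopword lists through readInput() and apply stemming.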
  /**
   * Method to return the number of tokens in text. Subclasses may override
   * for more efficient implementations.
   *
   * @param text text to be processed
   * @return number of tokens in text
   */
  public int getNumberTokens(String text) {
    return processContent(text).length;
  }

  public float getOOVRate(String text, VocabularyWritable vocab) {
    int countOOV = 0, countAll = 0;
    for (String token : processContent(text)) {
      countAll++;
      if (vocab != null && vocab.get(token) <= 0) {
        countOOV++;
      }
    }
    return (countOOV / (float) countAll);
  }

  /**
   * Method to remove non-unicode characters from a token, to prevent errors in
   * the preprocessing pipeline. Such cases exist in German Wikipedia.
   *
   * @param token token to check for non-unicode characters
   * @return token without the non-unicode characters
   */
  public static String removeNonUnicodeChars(String token) {
    StringBuilder fixedToken = new StringBuilder();
    for (int i = 0; i < token.length(); i++) {
      char c = token.charAt(i);
      if (Character.getNumericValue(c) >= -1) {
        fixedToken.append(c);
      }
    }
    return fixedToken.toString();
  }

  /**
   * Check for the right single quotation mark, U+2019 (looks like a reversed `),
   * and normalize it to the standard apostrophe.
   *
   * @param text French text
   * @return fixed version of the text
   */
  public static String normalizeFrench(String text) {
    StringBuilder out = new StringBuilder();
    for (int i = 0; i < text.length(); i++) {
      if (String.format("%04x", (int) text.charAt(i)).equals("2019")) {
        out.append("' ");
      } else {
        out.append(text.charAt(i));
      }
    }
    return out.toString();
  }

  /**
   * Normalize apostrophe variations for better tokenization.
   *
   * @param text text, before any tokenization
   * @return normalized text, ready to be run through tokenizer
   */
  protected static String preNormalize(String text) {
    return text.replaceAll("\u2018", "'").replaceAll("\u2060", "'").replaceAll("\u201C", "\"")
        .replaceAll("\u201D", "\"").replaceAll("\u201B", "'").replaceAll("\u201F", "\"")
        .replaceAll("\u201E", "\"").replaceAll("\u00B4", "'").replaceAll("\u301F", "\"")
        .replaceAll("\u2019", "'").replaceAll("\u0060", "'");
  }

  /**
   * Fix several common tokenization errors.
   *
   * @param text text, after tokenization
   * @return text, after fixing possible errors
   */
  protected static String postNormalize(String text) {
    // NOTE: the dash and soft-hyphen literals in this method were lost in the
    // source listing; U+2014 (em dash), U+2013 (en dash), and U+00AD (soft
    // hyphen) are assumed reconstructions.
    return text.replaceAll("\\((\\S)", "( $1").replaceAll("(\\S)\\)", "$1 )").replaceAll("''(\\S)", "'' $1")
        .replaceAll("\u2014", "-").replaceAll("\u2013", "-").replaceAll("(\\S)-(\\S)", "$1 - $2")
        .replaceAll("\u00AD", "").replaceAll(" ' s ", " 's ").replaceAll(" l ' ", " l' ")
        .replaceAll("\"(\\S)", "\" $1").replaceAll("(\\S)\"", "$1 \"");
  }
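  // Example of the normalization above (illustrative, not from the original
  // source): preNormalize maps curly quotes to ASCII before tokenization, e.g.
  //
  //   preNormalize("\u201Chello\u201D")   returns "\"hello\""
  //
  // while postNormalize re-spaces punctuation after tokenization, e.g.
  //
  //   postNormalize("(foo) bar-baz")  returns "( foo ) bar - baz"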
  /**
   * Convert a TokenStream object into a string.
   *
   * @param tokenStream object returned by Lucene tokenizer
   * @return String corresponding to the tokens output by tokenStream
   */
  protected static String streamToString(TokenStream tokenStream) {
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.clearAttributes();
    StringBuilder tokenized = new StringBuilder();
    try {
      while (tokenStream.incrementToken()) {
        tokenized.append(termAtt.toString()).append(' ');
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
    return tokenized.toString().trim();
  }

  /**
   * Overridden by applicable implementing classes.
   *
   * @param token token to check
   * @return true if the token is a stopword, false otherwise
   */
  public boolean isStopWord(String token) {
    return delims.contains(token)
        || (isStemming() && stemmedStopwords.contains(token))
        || (!isStemming() && stopwords.contains(token));
  }

  /**
   * Overridden by applicable implementing classes.
   *
   * @param isStemmed true if the token has been stemmed, false otherwise
   * @param token token to check
   * @return true if the token is a stopword, false otherwise
   */
  public boolean isStopWord(boolean isStemmed, String token) {
    return delims.contains(token)
        || (isStemmed && stemmedStopwords.contains(token))
        || (!isStemmed && stopwords.contains(token));
  }

  public boolean isDiscard(String token) {
    return (token.length() < MIN_LENGTH || token.length() > MAX_LENGTH || isStopWord(token));
  }

  public boolean isDiscard(boolean isStemmed, String token) {
    return (token.length() < MIN_LENGTH || token.length() > MAX_LENGTH || isStopWord(isStemmed, token));
  }

  /**
   * Remove stopwords from the beginning and end of text that has already been
   * tokenized. Useful when postprocessing the output of an MT system, which is
   * tokenized but not stopword-filtered.
   *
   * @param tokenizedText input text, assumed to be tokenized
   * @return same text without the leading and trailing stopwords
   */
  @Deprecated
  public String removeBorderStopWords(String tokenizedText) {
    String[] tokens = tokenizedText.split(" ");
    int start = 0, end = tokens.length - 1;
    for (int i = 0; i < tokens.length; i++) {
      if (!isStopWord(tokens[i])) {
        start = i;
        break;
      }
    }
    for (int i = tokens.length - 1; i >= 0; i--) {
      if (!isStopWord(tokens[i])) {
        end = i;
        break;
      }
    }
    StringBuilder output = new StringBuilder();
    for (int i = start; i <= end; i++) {
      output.append(tokens[i]).append(' ');
    }
    return output.toString().trim();
  }

  // Default implementation is a no-op; stemming subclasses override this.
  public String stem(String token) {
    return token;
  }

  // Returns the token's characters as space-delimited 4-digit hex values
  // (UTF-16 code units).
  public String getUTF8(String token) {
    StringBuilder utf8 = new StringBuilder();
    for (int i = 0; i < token.length(); i++) {
      utf8.append(String.format("%04x", (int) token.charAt(i))).append(' ');
    }
    return utf8.toString().trim();
  }
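  // Typical filtering loop (illustrative, not from the original source):
  // downstream code can combine processContent() with isDiscard() to drop
  // stopwords and tokens outside the [MIN_LENGTH, MAX_LENGTH] range:
  //
  //   for (String token : tokenizer.processContent(text)) {
  //     if (!tokenizer.isDiscard(token)) {
  //       index(token);  // hypothetical consumer
  //     }
  //   }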
cmdline.getOptionValue("lang"), modelFile, isStem, stopwordList, stemmedStopwordList, null); BufferedWriter out = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(cmdline.getOptionValue("output")), "UTF8")); BufferedReader in = new BufferedReader( new InputStreamReader(new FileInputStream(cmdline.getOptionValue("input")), "UTF8")); String line = null; while ((line = in.readLine()) != null) { String[] tokens = tokenizer.processContent(line); String s = ""; for (String token : tokens) { s += token + " "; } out.write(s.trim() + "\n"); } in.close(); out.close(); } catch (Exception exp) { System.out.println(exp); HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("Tokenizer", options); System.exit(-1); } } }