edu.illinois.cs.cogcomp.bigdata.lucene.WikiURLAnalyzer.java Source code

Introduction

Here is the source code for edu.illinois.cs.cogcomp.bigdata.lucene.WikiURLAnalyzer.java
Source

/**
 * This software is released under the University of Illinois/Research and Academic Use License. See
 * the LICENSE file in the root folder for details. Copyright (c) 2016
 *
 * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign
 * http://cogcomp.cs.illinois.edu/
 */
package edu.illinois.cs.cogcomp.bigdata.lucene;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.util.Version;

/**
 * Only use this class for short strings
 *
 * @author cheng88
 *
 */
public class WikiURLAnalyzer extends Analyzer {

    private static Version matchVersion = Version.LUCENE_43;
    public static final String normalizeChars = "- ";
    public static final String replacement = "_";

    // Only lower case and remove hyphens
    public static class URLField extends Field {
        /** Indexed, not tokenized, omits norms, indexes
         *  DOCS_ONLY, stored */
        public static final FieldType TYPE_STORED = new FieldType();

        static {
            TYPE_STORED.setIndexed(true);
            TYPE_STORED.setOmitNorms(true);
            TYPE_STORED.setIndexOptions(IndexOptions.DOCS_ONLY);
            TYPE_STORED.setStored(true);
            TYPE_STORED.setTokenized(true);
            TYPE_STORED.freeze();
        }

        /** Creates a new URLField.
         *  @param name field name
         *  @param value String value
         *  @throws IllegalArgumentException if the field name or value is null.
         */
        public URLField(String name, String value) {
            super(name, value, TYPE_STORED);
        }
    }

    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {

        final Tokenizer source = new KeywordTokenizer(reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new CharacterFilter(result);
        result = new ASCIIFoldingFilter(result);
        result = new LowerCaseFilter(matchVersion, result);

        //        result = new WordDelimiterFilter(result, WordDelimiterFilter.DIGIT, null);

        return new TokenStreamComponents(source, result);
    }

    public static List<String> pruningTokenization(String s) {
        List<String> tokens = new ArrayList<String>();
        String[] parts = StringUtils.split(s, replacement.charAt(0));

        for (String part : parts) {
            if (!part.endsWith("."))
                tokens.add(part);
        }
        return tokens;
    }

    /**
     * Filters and replaces certain characters from the token stream
     * @author cheng88
     *
     */
    private static class CharacterFilter extends TokenFilter {

        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

        private static final int UNDERSCORE = replacement.codePointAt(0);

        /**
         * Filters certain characters
         *
         * @param matchVersion
         *            See <a href="#version">above</a>
         * @param in
         *            TokenStream to filter
         */
        public CharacterFilter(TokenStream in) {
            super(in);
        }

        @Override
        public final boolean incrementToken() throws IOException {
            if (input.incrementToken()) {
                filterChar(termAtt.buffer(), 0, termAtt.length());
                return true;
            } else
                return false;
        }

        public static void filterChar(final char[] buffer, final int offset, final int limit) {
            assert buffer.length >= limit;
            assert offset <= 0 && offset <= buffer.length;
            for (int i = offset; i < limit;) {
                int unicodeChar = Character.codePointAt(buffer, i);

                char[] currentChars = Character.toChars(unicodeChar);

                if (currentChars.length == 1 && normalizeChars.indexOf(currentChars[0]) >= 0)
                    i += Character.toChars(UNDERSCORE, buffer, i);
                else
                    i += Character.toChars(unicodeChar, buffer, i);
            }
        }
    }

}