etc.aloe.filters.AbstractRegexFilter.java Source code

Java tutorial

Introduction

Here is the source code for etc.aloe.filters.AbstractRegexFilter.java

Source

/*
 * This file is part of ALOE.
 *
 * ALOE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
    
 * ALOE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
    
 * You should have received a copy of the GNU General Public License
 * along with ALOE.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Copyright (c) 2012 SCCL, University of Washington (http://depts.washington.edu/sccl)
 */
package etc.aloe.filters;

import java.io.Serializable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import weka.core.Capabilities.Capability;
import weka.core.*;
import weka.filters.SimpleStreamFilter;
import weka.filters.UnsupervisedFilter;

/**
 * Abstract class representing a Weka filter that detects occurrences of regular
 * expressions in a specific string field.
 *
 * <p>For every {@link NamedRegex} returned by {@link #getRegexFeatures()} the
 * filter appends one attribute holding the number of matches found in the
 * configured string attribute. When length counting is enabled (see
 * {@link #setCountRegexLengths(boolean)}), a second attribute, named with an
 * {@code _L} suffix and holding the length of the longest match, is appended
 * directly after it.
 *
 * @author Michael Brooks <mjbrooks@uw.edu>
 */
public abstract class AbstractRegexFilter extends SimpleStreamFilter implements UnsupervisedFilter {

    private boolean countRegexLengths = false;

    /**
     * Set to true to add features for the length of the regex match. Defaults
     * to false.
     *
     * @param countRegexLengths true to emit an additional longest-match-length
     * attribute for every regex feature
     */
    public void setCountRegexLengths(boolean countRegexLengths) {
        this.countRegexLengths = countRegexLengths;
    }

    /**
     * A mapping entity between names (feature names) and regular expressions.
     * The expression is compiled once, at construction time.
     */
    protected static class NamedRegex implements Serializable {

        private final String name;
        private final String regex;
        private final Pattern pattern;

        /**
         * Creates a named regex compiled without any match flags.
         *
         * @param name the feature name
         * @param regex the regular expression source
         */
        public NamedRegex(String name, String regex) {
            this(name, regex, 0);
        }

        /**
         * Creates a named regex compiled with the given match flags.
         *
         * @param name the feature name
         * @param regex the regular expression source
         * @param flags match flags, passed to
         * {@link Pattern#compile(String, int)}
         */
        public NamedRegex(String name, String regex, int flags) {
            this.name = name;
            this.regex = regex;
            this.pattern = Pattern.compile(regex, flags);
        }

        /**
         * @return the feature name
         */
        public String getName() {
            return name;
        }

        /**
         * @return the regular expression source this feature was built from
         */
        public String getRegex() {
            return regex;
        }

        /**
         * @return the compiled pattern
         */
        public Pattern getPattern() {
            return pattern;
        }
    }

    private String stringAttributeName;
    private int stringAttributeIndex = -1;

    /**
     * Supplies the regex features this filter generates. The output layout is
     * built from this array in {@link #determineOutputFormat(Instances)} and
     * the values are filled in positionally in {@link #process(Instance)}, so
     * implementations must return the same features in the same order on every
     * call.
     *
     * @return the regex features, one per generated attribute (or pair of
     * attributes when length counting is enabled)
     */
    protected abstract NamedRegex[] getRegexFeatures();

    /**
     * @return the name of the attribute whose text is scanned, or null if it
     * has not been set
     */
    public String getStringAttributeName() {
        return stringAttributeName;
    }

    /**
     * Sets the name of the attribute whose text is scanned. The name is
     * resolved to an index when the output format is determined.
     *
     * @param stringAttributeName the attribute name
     */
    public void setStringAttributeName(String stringAttributeName) {
        this.stringAttributeName = stringAttributeName;
    }

    /**
     * Enables all attribute and class types on top of the inherited
     * capabilities.
     *
     * @return the capabilities of this filter
     */
    @Override
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.enableAllAttributes();
        result.enableAllClasses();
        result.enable(Capability.NO_CLASS); // filter doesn't need class to be set
        return result;
    }

    /**
     * Builds the output header: a copy of the input header followed by the
     * generated regex attributes. Also resolves and caches the index of the
     * scanned string attribute.
     *
     * @param inputFormat the header of the incoming data
     * @return the header of the filtered data
     * @throws IllegalStateException if no string attribute name has been set
     * @throws IllegalArgumentException if the input format has no attribute
     * with the configured name
     */
    @Override
    protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
        if (stringAttributeName == null) {
            throw new IllegalStateException("String attribute name not set");
        }

        Attribute stringAttr = inputFormat.attribute(stringAttributeName);
        if (stringAttr == null) {
            // Do not keep an index that was resolved against an earlier format.
            stringAttributeIndex = -1;
            throw new IllegalArgumentException(
                    "Input format has no attribute named '" + stringAttributeName + "'");
        }
        stringAttributeIndex = stringAttr.index();

        Instances outputFormat = new Instances(inputFormat, 0);

        //Add the new columns. There is one for each regex feature, plus an
        //optional "_L" column right after it for the longest match length.
        NamedRegex[] regexFeatures = getRegexFeatures();
        for (int i = 0; i < regexFeatures.length; i++) {
            String name = regexFeatures[i].getName();
            outputFormat.insertAttributeAt(new Attribute(name), outputFormat.numAttributes());

            if (countRegexLengths) {
                outputFormat.insertAttributeAt(new Attribute(name + "_L"), outputFormat.numAttributes());
            }
        }

        return outputFormat;
    }

    /**
     * Converts one input instance into an output instance: the original values
     * are carried over and, for each regex feature, the match count (and
     * optionally the longest match length) is appended.
     *
     * @param instance the instance to convert
     * @return a new sparse instance with the original weight and the extended
     * set of values
     * @throws IllegalStateException if the string attribute has not been
     * resolved yet
     */
    @Override
    protected Instance process(Instance instance) throws Exception {
        if (stringAttributeIndex < 0) {
            throw new IllegalStateException("String attribute not set");
        }

        // A missing text value has no characters to match. Scanning whatever
        // placeholder text stringValue() yields for it could produce spurious
        // counts, so treat it as empty text instead.
        String stringValue = instance.isMissing(stringAttributeIndex)
                ? ""
                : instance.stringValue(stringAttributeIndex);
        NamedRegex[] regexFeatures = getRegexFeatures();

        int numOldValues = instance.numAttributes();
        // Each feature occupies one slot (count) or two (count, max length).
        int valuesPerFeature = countRegexLengths ? 2 : 1;
        double[] newValues = new double[numOldValues + regexFeatures.length * valuesPerFeature];

        copyInputValues(instance, newValues);

        for (int i = 0; i < regexFeatures.length; i++) {
            Matcher matcher = regexFeatures[i].getPattern().matcher(stringValue);
            int count = 0;
            int maxLength = 0;
            while (matcher.find()) {
                count++;
                maxLength = Math.max(maxLength, matcher.end() - matcher.start());
            }

            int index = numOldValues + valuesPerFeature * i;
            newValues[index] = count;
            if (countRegexLengths) {
                newValues[index + 1] = maxLength;
            }
        }

        return new SparseInstance(instance.weight(), newValues);
    }

    /**
     * Copies the values of all input attributes into the leading slots of the
     * output value array. String values are registered with the matching
     * output attribute and stored as the index returned by that registration.
     *
     * @param instance the instance being converted
     * @param newValues the output value array, written at indices
     * {@code 0 .. numInputAttributes - 1}
     */
    private void copyInputValues(Instance instance, double[] newValues) {
        Instances inputFormat = getInputFormat();
        Instances outputFormat = outputFormatPeek();

        for (int i = 0; i < inputFormat.numAttributes(); i++) {
            if (inputFormat.attribute(i).type() != Attribute.STRING) {
                // Add simple nominal and numeric attributes directly
                double value = instance.value(i);
                if (value != 0.0) {
                    newValues[i] = value;
                }
            } else if (instance.isMissing(i)) {
                newValues[i] = Utils.missingValue();
            } else {
                // If this is a string attribute, we have to first add
                // this value to the range of possible values, then add
                // its new internal index.
                Attribute outputAttr = outputFormat.attribute(i);
                if (outputAttr.numValues() == 0) {
                    // Note that the first string value in a
                    // SparseInstance doesn't get printed.
                    outputAttr.addStringValue("Hack to defeat SparseInstance bug");
                }
                newValues[i] = outputAttr.addStringValue(instance.stringValue(i));
            }
        }
    }

    /**
     * @return a short description of this filter
     */
    @Override
    public String globalInfo() {
        return "Generates a set of attributes from a string attribute. Each new attribute is defined by a regular expression.";
    }

    /**
     * Combines an array of string fragments into a regex-compatible string
     * using the alternative symbol: "|" All fragments are escaped.
     *
     * @param fragments the literal alternatives
     * @return a regex matching any one of the fragments literally
     */
    protected String toRegex(String[] fragments) {
        return toRegex(fragments, true);
    }

    /**
     * Combines an array of string fragments into a regex-compatible string
     * using the alternative symbol: "|" If escape is true, escapes all special
     * characters in the fragments.
     *
     * @param fragments the alternatives, in the order they should appear
     * @param escape true to quote each fragment with
     * {@link Pattern#quote(String)}, false to insert it verbatim
     * @return the fragments joined with "|"; an empty string if there are no
     * fragments
     */
    protected String toRegex(String[] fragments, boolean escape) {
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < fragments.length; i++) {
            // Separate by position, not by builder length: an unescaped empty
            // first fragment leaves the builder empty but is still an
            // alternative that needs its "|".
            if (i > 0) {
                builder.append("|");
            }
            builder.append(escape ? Pattern.quote(fragments[i]) : fragments[i]);
        }
        return builder.toString();
    }
}