com.civprod.writerstoolbox.NaturalLanguage.util.TextAndTokenHandling.RegexTokenStripper.java Source code

Java tutorial

Introduction

Here is the source code for com.civprod.writerstoolbox.NaturalLanguage.util.TextAndTokenHandling.RegexTokenStripper.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.civprod.writerstoolbox.NaturalLanguage.util.TextAndTokenHandling;

import com.civprod.writerstoolbox.NaturalLanguage.util.ThreadSafe;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Token stripper driven by a set of regular expressions.
 *
 * <p>{@link #strip(String)} deletes every match of every configured expression from a text, and
 * {@link #wouldBeStripped(String)} reports whether a token is matched in full by at least one of
 * them. Duplicate expressions are collapsed and each distinct expression is compiled exactly once,
 * at construction time. All fields are final and hold unmodifiable collections, so an instance has
 * no mutable state after construction.
 *
 * @author Steven Owens
 */
public class RegexTokenStripper extends AbstractTokenStripper implements TokenStripper, ThreadSafe {
    /** Distinct regular expressions as supplied; the sole basis for equals/hashCode. */
    private final Set<String> removeRegEx;
    /** Compiled form of every entry of {@link #removeRegEx}, in that set's iteration order. */
    private final List<Pattern> removePattern;

    /**
     * Creates a stripper from a collection of regular expressions.
     *
     * @param inRegex the regular expressions to strip; copied, so later changes to the collection
     *                do not affect this instance
     * @throws NullPointerException if {@code inRegex} or one of its elements is {@code null}
     * @throws java.util.regex.PatternSyntaxException if an expression is not a valid regex
     */
    public RegexTokenStripper(Collection<String> inRegex) {
        Objects.requireNonNull(inRegex, "inRegex");
        removeRegEx = org.apache.commons.collections4.SetUtils.unmodifiableSet(new HashSet<>(inRegex));
        // Compile sequentially, walking removeRegEx itself, so the compiled list follows exactly
        // the order a loop over removeRegEx would visit. A parallel stream buys nothing for a
        // handful of expressions.
        final List<Pattern> compiled = new java.util.ArrayList<>(removeRegEx.size());
        for (String curRegEx : removeRegEx) {
            compiled.add(Pattern.compile(curRegEx));
        }
        removePattern = java.util.Collections.unmodifiableList(compiled);
    }

    /**
     * Creates a stripper from the given regular expressions.
     *
     * @param inRegexes the regular expressions to strip
     * @throws NullPointerException if the array or one of its elements is {@code null}
     * @throws java.util.regex.PatternSyntaxException if an expression is not a valid regex
     */
    public RegexTokenStripper(String... inRegexes) {
        this(java.util.Arrays.asList(inRegexes));
    }

    /**
     * Two strippers are equal when they were configured with the same set of expression strings.
     *
     * @param o the object to compare with
     * @return {@code true} if {@code o} is a {@code RegexTokenStripper} with an equal expression set
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        return o instanceof RegexTokenStripper
                && this.removeRegEx.equals(((RegexTokenStripper) o).removeRegEx);
    }

    /**
     * Hash code derived from the expression set only, consistent with {@link #equals(Object)}.
     *
     * @return the hash code
     */
    @Override
    public int hashCode() {
        int hash = 5;
        hash = 97 * hash + Objects.hashCode(this.removeRegEx);
        return hash;
    }

    /**
     * Removes every match of every configured expression from {@code text}.
     *
     * <p>Expressions are applied one after another, each to the output of the previous one. The
     * order is the iteration order of the underlying hash set, which is unspecified; if the
     * expressions overlap, the result may depend on that order.
     *
     * @param text the text to clean
     * @return {@code text} with all matches replaced by the empty string; {@code text} itself if
     *         no expressions are configured
     * @throws NullPointerException if {@code text} is {@code null} and at least one expression
     *                              is configured
     */
    @Override
    public String strip(String text) {
        String rString = text;
        // Reuse the patterns compiled in the constructor; String.replaceAll would recompile the
        // regex on every call.
        for (Pattern curPattern : removePattern) {
            rString = curPattern.matcher(rString).replaceAll("");
        }
        return rString;
    }

    /**
     * Tells whether {@code token} is matched in its entirety by at least one configured expression.
     *
     * @param token the token to test
     * @return {@code true} if some expression matches the whole token, {@code false} otherwise
     *         (including when no expressions are configured)
     * @throws NullPointerException if {@code token} is {@code null} and at least one expression
     *                              is configured
     */
    @Override
    public boolean wouldBeStripped(String token) {
        // A plain loop with early exit; handing a few regex matches to the common fork-join pool
        // costs more than it saves.
        for (Pattern curPattern : removePattern) {
            if (curPattern.matcher(token).matches()) {
                return true;
            }
        }
        return false;
    }
}