Example usage for java.util.regex Pattern UNICODE_CASE

List of usage examples for java.util.regex Pattern UNICODE_CASE


On this page you can find example usages of java.util.regex Pattern UNICODE_CASE.



To view the source code for java.util.regex Pattern UNICODE_CASE, click the Source Link.


Enables Unicode-aware case folding.


From source file:org.etudes.component.app.jforum.util.html.SafeHtml.java

 * removes existing target attribute in the anchor tag and adds target="_blank"
 * //from w ww.  j  ava  2  s .  c  o m
 * @param contents
 *        Post contest
 * @return Modified content with target="_blank" in anchor tags
public static String addAnchorTarget(String contents) {
    if (contents == null) {
        return null;

    StringBuffer sb = new StringBuffer();

    Pattern p = Pattern.compile("<(a)([^>]+)>", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    Matcher m = p.matcher(contents);

    while (m.find()) {
        if (m.groupCount() == 2) {
            String group1 = m.group(1);
            String group2 = m.group(2);
            String modGroup2 = group2.replaceAll("(target\\s*=\\s*[\"\'][^\"\']*[\"\']\\s*)?", "");

            String modString = "<" + group1 + " target=\"_blank\" " + modGroup2 + ">";

            m.appendReplacement(sb, Matcher.quoteReplacement(modString));


    return sb.toString();

From source file:org.yes.cart.bulkexport.csv.impl.CsvExportColumnImpl.java

private Pattern getPattern() {
    if (pattern == null && StringUtils.isNotBlank(valueRegEx)) {
        pattern = Pattern.compile(valueRegEx, Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    }/*w  w  w .  j ava  2 s  .  c om*/
    return pattern;

From source file:com.norconex.importer.handler.tagger.impl.TextBetweenTagger.java

protected void tagStringContent(String reference, StringBuilder content, ImporterMetadata metadata,
        boolean parsed, boolean partialContent) {
    int flags = Pattern.DOTALL | Pattern.UNICODE_CASE;
    if (!caseSensitive) {
        flags = flags | Pattern.CASE_INSENSITIVE;
    }//from ww  w. j av a2 s.c om
    for (TextBetween between : betweens) {
        List<Pair<Integer, Integer>> matches = new ArrayList<Pair<Integer, Integer>>();
        Pattern leftPattern = Pattern.compile(between.start, flags);
        Matcher leftMatch = leftPattern.matcher(content);
        while (leftMatch.find()) {
            Pattern rightPattern = Pattern.compile(between.end, flags);
            Matcher rightMatch = rightPattern.matcher(content);
            if (rightMatch.find(leftMatch.end())) {
                if (inclusive) {
                    matches.add(new ImmutablePair<Integer, Integer>(leftMatch.start(), rightMatch.end()));
                } else {
                    matches.add(new ImmutablePair<Integer, Integer>(leftMatch.end(), rightMatch.start()));
            } else {
        for (int i = matches.size() - 1; i >= 0; i--) {
            Pair<Integer, Integer> matchPair = matches.get(i);
            String value = content.substring(matchPair.getLeft(), matchPair.getRight());
            if (value != null) {
                metadata.addString(between.name, value);

From source file:Normalization.TextNormalization.java

public String removeUrlsFromString(String content) {

    String utf8tweet = "";
    try {/*  w w  w . j a va2 s .c o  m*/
        byte[] utf8Bytes = content.getBytes("UTF-8");

        utf8tweet = new String(utf8Bytes, "UTF-8");
    } catch (UnsupportedEncodingException e) {

    final String regex = "(https?|ftp|file|pic|www)[:|.][-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%=~_|]";
    final Pattern unicodeOutliers = Pattern.compile(regex,
            Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet);
    utf8tweet = unicodeOutlierMatcher.replaceAll("");
    return utf8tweet;

From source file:com.norconex.importer.handler.transformer.impl.StripBetweenTransformer.java

protected void transformStringContent(String reference, StringBuilder content, ImporterMetadata metadata,
        boolean parsed, boolean partialContent) {
    int flags = Pattern.DOTALL | Pattern.UNICODE_CASE;
    if (!caseSensitive) {
        flags = flags | Pattern.CASE_INSENSITIVE;
    }/*  w w w. j a  v a  2 s .  c o m*/
    for (Pair<String, String> pair : stripPairs) {
        List<Pair<Integer, Integer>> matches = new ArrayList<Pair<Integer, Integer>>();
        Pattern leftPattern = Pattern.compile(pair.getLeft(), flags);
        Matcher leftMatch = leftPattern.matcher(content);
        while (leftMatch.find()) {
            Pattern rightPattern = Pattern.compile(pair.getRight(), flags);
            Matcher rightMatch = rightPattern.matcher(content);
            if (rightMatch.find(leftMatch.end())) {
                if (inclusive) {
                    matches.add(new ImmutablePair<Integer, Integer>(leftMatch.start(), rightMatch.end()));
                } else {
                    matches.add(new ImmutablePair<Integer, Integer>(leftMatch.end(), rightMatch.start()));
            } else {
        for (int i = matches.size() - 1; i >= 0; i--) {
            Pair<Integer, Integer> matchPair = matches.get(i);
            content.delete(matchPair.getLeft(), matchPair.getRight());

From source file:com.microsoft.tfs.core.clients.versioncontrol.internal.fileattributes.FileAttributesEntry.java

 * Compiles a filename expression into a {@link Pattern} using the regex
 * flags appropriate for a file attributes entry.
 * @param filenameExpression//w ww. j a va2  s.  co  m
 *        the filename expression to compile.
 * @return the compiled pattern.
 * @throws PatternSyntaxException
 *         if the regular expression could not be compiled.
private Pattern compilePattern(final String filenameExpression) {
    return Pattern.compile(filenameExpression, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

From source file:com.norconex.collector.core.filter.impl.RegexMetadataFilter.java

public final void setRegex(String regex) {
    this.regex = regex;
    if (regex != null) {
        int flags = Pattern.DOTALL;
        if (!caseSensitive) {
            flags = flags | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
        }//from  w ww  .j a v a  2 s.c  o m
        this.pattern = Pattern.compile(regex, flags);
    } else {
        this.pattern = Pattern.compile(".*");

From source file:org.tightblog.rendering.comment.BlacklistCommentValidator.java

 * Create a list of regex Pattern elements from a line-delimited list
 * @param blacklist String of regex rules, one per line delimited by \n
 **//* w ww.  j av  a2s. co  m*/
public static List<Pattern> populateSpamRules(String blacklist) {
    List<Pattern> regexRules = new ArrayList<>();

    if (blacklist != null) {
        StringTokenizer tokenizer = new StringTokenizer(blacklist, "\n");

        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken().trim();
            if (token.startsWith("#")) {
            regexRules.add(Pattern.compile(token, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE));

    return regexRules;

From source file:com.google.code.configprocessor.processing.ModifyAction.java

protected int parseFlags() {
    int flagsToUse = 0;
    String flagsToTest = getFlags() == null ? DEFAULT_PATTERN_FLAGS : getFlags();
    String[] flagArray = StringUtils.split(flagsToTest, PATTERN_FLAG_SEPARATOR);
    for (String flag : flagArray) {
        if ("UNIX_LINES".equals(flag)) {
            flagsToUse |= Pattern.UNIX_LINES;
        } else if ("CASE_INSENSITIVE".equals(flag)) {
            flagsToUse |= Pattern.CASE_INSENSITIVE;
        } else if ("COMMENTS".equals(flag)) {
            flagsToUse |= Pattern.COMMENTS;
        } else if ("MULTILINE".equals(flag)) {
            flagsToUse |= Pattern.MULTILINE;
        } else if ("LITERAL".equals(flag)) {
            flagsToUse |= Pattern.LITERAL;
        } else if ("DOTALL".equals(flag)) {
            flagsToUse |= Pattern.DOTALL;
        } else if ("UNICODE_CASE".equals(flag)) {
            flagsToUse |= Pattern.UNICODE_CASE;
        } else if ("CANON_EQ".equals(flag)) {
            flagsToUse |= Pattern.CANON_EQ;
        } else {//from   w w w.j a  va2 s. c om
            throw new IllegalArgumentException("Unknown flag: " + flag);

    return flagsToUse;

From source file:org.apache.manifoldcf.crawler.connectors.webcrawler.CredentialsDescription.java

/** Constructor.  Build the description from the ConfigParams. */
public CredentialsDescription(ConfigParams configData) throws ManifoldCFException {
    // Scan, looking for bin description nodes
    int i = 0;//  w w  w .  j  av  a2  s.co  m
    while (i < configData.getChildCount()) {
        ConfigNode node = configData.getChild(i++);
        if (node.getType().equals(WebcrawlerConfig.NODE_ACCESSCREDENTIAL)) {
            // Get the url regexp
            String urlDescription = node.getAttributeValue(WebcrawlerConfig.ATTR_URLREGEXP);
            try {
                Pattern p;
                try {
                    p = Pattern.compile(urlDescription, Pattern.UNICODE_CASE);
                } catch (java.util.regex.PatternSyntaxException e) {
                    throw new ManifoldCFException("Access credential regular expression '" + urlDescription
                            + "' is illegal: " + e.getMessage(), e);
                CredentialsItem ti = new CredentialsItem(p);

                String type = node.getAttributeValue(WebcrawlerConfig.ATTR_TYPE);

                // These get used in two of the three types; no harm in fetching them up front.
                String userName = node.getAttributeValue(WebcrawlerConfig.ATTR_USERNAME);
                String password = node.getAttributeValue(WebcrawlerConfig.ATTR_PASSWORD);
                if (password != null)
                    password = ManifoldCF.deobfuscate(password);

                if (type.equals(WebcrawlerConfig.ATTRVALUE_BASIC))
                    ti.setCredential(new BasicCredential(userName, password));
                else if (type.equals(WebcrawlerConfig.ATTRVALUE_NTLM)) {
                    String domain = node.getAttributeValue(WebcrawlerConfig.ATTR_DOMAIN);
                    ti.setCredential(new NTLMCredential(domain, userName, password));
                } else if (type.equals(WebcrawlerConfig.ATTRVALUE_SESSION)) {
                    // This is a complex credential type that cannot be easily set up with just a constructor.
                    // Use the url regexp as the sequence key; this works as well as anything, although I haven't thought through all the implications if it gets changed.
                    SessionCredential sc = new SessionCredential(urlDescription);
                    // Loop through child nodes; they describe the pages that belong to the login sequence.
                    int j = 0;
                    while (j < node.getChildCount()) {
                        ConfigNode child = node.getChild(j++);
                        if (child.getType().equals(WebcrawlerConfig.NODE_AUTHPAGE)) {
                            String authPageRegexp = child.getAttributeValue(WebcrawlerConfig.ATTR_URLREGEXP);
                            String pageType = child.getAttributeValue(WebcrawlerConfig.ATTR_TYPE);
                            String matchRegexp = child.getAttributeValue(WebcrawlerConfig.ATTR_MATCHREGEXP);
                            String overrideTargetURL = child
                            if (overrideTargetURL != null && overrideTargetURL.length() == 0)
                                overrideTargetURL = null;
                            Pattern authPattern;
                            try {
                                authPattern = Pattern.compile(authPageRegexp, Pattern.UNICODE_CASE);
                            } catch (java.util.regex.PatternSyntaxException e) {
                                throw new ManifoldCFException("Authentication page regular expression '"
                                        + authPageRegexp + "' is illegal: " + e.getMessage(), e);
                            Pattern matchPattern;
                            try {
                                matchPattern = Pattern.compile(matchRegexp, Pattern.UNICODE_CASE);
                            } catch (java.util.regex.PatternSyntaxException e) {
                                throw new ManifoldCFException("Match regular expression '" + matchRegexp
                                        + "' is illegal: " + e.getMessage(), e);
                            if (pageType.equals(WebcrawlerConfig.ATTRVALUE_FORM)) {
                                sc.addAuthPage(authPageRegexp, authPattern, overrideTargetURL, null, null,
                                        matchRegexp, matchPattern, null, null, null, null);
                            } else if (pageType.equals(WebcrawlerConfig.ATTRVALUE_LINK)) {
                                sc.addAuthPage(authPageRegexp, authPattern, overrideTargetURL, matchRegexp,
                                        matchPattern, null, null, null, null, null, null);
                            } else if (pageType.equals(WebcrawlerConfig.ATTRVALUE_REDIRECTION)) {
                                sc.addAuthPage(authPageRegexp, authPattern, overrideTargetURL, null, null, null,
                                        null, matchRegexp, matchPattern, null, null);
                            } else if (pageType.equals(WebcrawlerConfig.ATTRVALUE_CONTENT)) {
                                sc.addAuthPage(authPageRegexp, authPattern, overrideTargetURL, null, null, null,
                                        null, null, null, matchRegexp, matchPattern);
                            } else
                                throw new ManifoldCFException("Invalid page type: " + pageType);

                            // Finally, walk through any specified parameters
                            int k = 0;
                            while (k < child.getChildCount()) {
                                ConfigNode paramNode = child.getChild(k++);
                                if (paramNode.getType().equals(WebcrawlerConfig.NODE_AUTHPARAMETER)) {
                                    String paramName = paramNode
                                    Pattern paramNamePattern;
                                    try {
                                        paramNamePattern = Pattern.compile(paramName, Pattern.UNICODE_CASE);
                                    } catch (java.util.regex.PatternSyntaxException e) {
                                        throw new ManifoldCFException("Parameter name regular expression '"
                                                + paramName + "' is illegal: " + e.getMessage(), e);
                                    String passwordValue = paramNode
                                    String paramValue = paramNode
                                    if (passwordValue != null)
                                        paramValue = ManifoldCF.deobfuscate(passwordValue);
                                    sc.addPageParameter(authPageRegexp, paramName, paramNamePattern,
                } else
                    throw new ManifoldCFException("Illegal credential type: " + type);
                patternHash.put(urlDescription, ti);
            } catch (PatternSyntaxException e) {
                throw new ManifoldCFException("Bad pattern syntax in '" + urlDescription + "'", e);