Example usage for java.util.regex Pattern UNICODE_CASE

Introduction

On this page you can find example usages of java.util.regex.Pattern.UNICODE_CASE.

Prototype

int UNICODE_CASE

To view the source code for java.util.regex.Pattern.UNICODE_CASE, click the Source Link.

Document

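Enables Unicode-aware case folding. The flag only changes matching when case-insensitive matching is also enabled (Pattern.CASE_INSENSITIVE or the embedded flag (?i)); without it, case-insensitive matching assumes only US-ASCII characters are being matched.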

Usage

From source file:BGrep.java

/**
 * Command-line entry point: searches one or more files for a regular
 * expression and prints every match as {@code filename:position: text}.
 *
 * Arguments: {@code [-e encoding] [-i] [-s] pattern file...}
 * <ul>
 * <li>{@code -e name} - charset used to decode the files (default UTF-8)</li>
 * <li>{@code -i} - case-insensitive matching</li>
 * <li>{@code -s} - adds UNICODE_CASE and CANON_EQ; UNICODE_CASE only changes
 * matching when case-insensitive matching is also on ({@code -i} or an
 * embedded {@code (?i)})</li>
 * </ul>
 * Problems with an individual file are reported on stderr and the file is
 * skipped; a bad encoding name, a bad pattern or missing arguments end the run.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
    String encodingName = "UTF-8"; // Default to UTF-8 encoding
    int flags = Pattern.MULTILINE; // Default regexp flags

    try { // Fatal exceptions are handled after this try block
        // First, process any options. Running off the end of args here is
        // deliberate: the ArrayIndexOutOfBoundsException handler below
        // reports wrong usage.
        int nextarg = 0;
        // startsWith() rather than charAt(0): an empty-string argument must
        // not blow up with an uncaught StringIndexOutOfBoundsException.
        while (args[nextarg].startsWith("-")) {
            String option = args[nextarg++];
            if (option.equals("-e")) {
                encodingName = args[nextarg++];
            } else if (option.equals("-i")) { // case-insensitive matching
                flags |= Pattern.CASE_INSENSITIVE;
            } else if (option.equals("-s")) { // Strict Unicode processing
                flags |= Pattern.UNICODE_CASE; // case-insensitive Unicode
                flags |= Pattern.CANON_EQ; // canonicalize Unicode
            } else {
                System.err.println("Unknown option: " + option);
                // usage() is defined elsewhere; presumably it prints help and
                // exits - TODO confirm.
                usage();
            }
        }

        // Get the Charset for converting bytes to chars
        Charset charset = Charset.forName(encodingName);

        // Next argument must be a regexp. Compile it to a Pattern object
        Pattern pattern = Pattern.compile(args[nextarg++], flags);

        // Require that at least one file is specified
        if (nextarg == args.length)
            usage();

        // Loop through each of the specified filenames
        while (nextarg < args.length) {
            String filename = args[nextarg++];
            CharBuffer chars; // This will hold complete text of the file
            try { // Handle per-file errors locally
                // Open a FileChannel to the named file
                FileInputStream stream = new FileInputStream(filename);
                try {
                    FileChannel f = stream.getChannel();

                    // Memory-map the file into one big ByteBuffer. This is
                    // easy but may be somewhat inefficient for short files.
                    ByteBuffer bytes = f.map(FileChannel.MapMode.READ_ONLY, 0, f.size());

                    // Decode the entire ByteBuffer into one big CharBuffer
                    chars = charset.decode(bytes);
                } finally {
                    // Always release the file handle, even when mapping
                    // fails. Closing the stream closes the channel, too.
                    stream.close();
                }
            } catch (IOException e) { // File not found or other problem
                System.err.println(e); // Print error message
                continue; // and move on to the next file
            }

            // This is the basic regexp loop for finding all matches in a
            // CharSequence. Note that CharBuffer implements CharSequence.
            // A Matcher holds state for a given Pattern and text.
            Matcher matcher = pattern.matcher(chars);
            while (matcher.find()) { // While there are more matches
                // Print out details of the match
                System.out.println(filename + ":" + // file name
                        matcher.start() + ": " + // character pos
                        matcher.group()); // matching text
            }
        }
    }
    // These are the things that can go wrong in the code above
    catch (UnsupportedCharsetException e) { // Unknown encoding name
        System.err.println("Unknown encoding: " + encodingName);
    } catch (java.nio.charset.IllegalCharsetNameException e) { // Malformed encoding name
        System.err.println("Unknown encoding: " + encodingName);
    } catch (PatternSyntaxException e) { // Bad pattern
        System.err.println("Syntax error in search pattern:\n" + e.getMessage());
    } catch (ArrayIndexOutOfBoundsException e) { // Wrong number of arguments
        usage();
    }
}

From source file:org.medici.bia.common.util.RegExUtils.java

/**
 * Small demo: checks a sample surname against a pattern whose two lookahead
 * alternatives look for a letter or an apostrophe, and prints the verdict.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    final String candidate = "pe'r";
    final String regex = "(?=.*[\\p{L}])|(?=.*['])";

    final boolean accepted = Pattern.compile(regex, Pattern.UNICODE_CASE)
            .matcher(candidate)
            .find();

    System.out.println(accepted ? "OK" : "Contiene un carattere non valido");
}

From source file:Main.java

/**
 * Translates a shell-style "wildcard" expression into a compiled regular
 * expression: {@code *} becomes {@code .*}, {@code ?} becomes {@code .},
 * and regex metacharacters are backslash-escaped so they match literally.
 *
 * @param wildcard The wildcard pattern.
 * @param matchCase Whether the pattern should be case sensitive. When
 *        <code>false</code>, Unicode-aware case-insensitive matching is used.
 * @param escapeStartChar Whether to escape a starting <code>'^'</code>
 *        character. A <code>'^'</code> anywhere else is always escaped.
 * @return The pattern. If the generated expression fails to compile, the
 *         stack trace is printed and a pattern matching any non-empty input
 *         (<code>.+</code>) is returned instead.
 */
public static Pattern wildcardToPattern(String wildcard, boolean matchCase, boolean escapeStartChar) {

    final int flags = matchCase ? 0 : (Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    final StringBuilder regex = new StringBuilder();
    final int length = wildcard.length();
    for (int pos = 0; pos < length; pos++) {
        final char c = wildcard.charAt(pos);
        if (c == '*') {
            regex.append(".*");
        } else if (c == '?') {
            regex.append('.');
        } else if (c == '^') {
            // A leading caret stays an anchor unless the caller asks otherwise.
            regex.append((pos > 0 || escapeStartChar) ? "\\^" : "^");
        } else if ("\\.|+-$[]{}()".indexOf(c) >= 0) {
            // Regex metacharacter: match it literally.
            regex.append('\\').append(c);
        } else {
            regex.append(c);
        }
    }

    try {
        return Pattern.compile(regex.toString(), flags);
    } catch (PatternSyntaxException pse) {
        pse.printStackTrace();
        return Pattern.compile(".+");
    }

}

From source file:Main.java

/**
 * Replaces every match of {@code from} in {@code fileContent} with {@code to}.
 *
 * When {@code isRegex} is false, both {@code from} and {@code to} are first
 * passed through {@code SPECIAL_CHAR_PATTERNSTR} (defined elsewhere;
 * presumably it captures regex/replacement metacharacters in group 1 -
 * TODO confirm) so that each captured character is prefixed with a backslash.
 *
 * Case-insensitive matching is Unicode-aware. Previously the
 * case-insensitive branch used CASE_INSENSITIVE alone, which folds only
 * US-ASCII letters, while UNICODE_CASE was set on the case-sensitive branch.
 * UNICODE_CASE is still set in both modes, so an embedded {@code (?i)} in a
 * caller-supplied regex keeps behaving as before.
 *
 * @param fileContent text to search
 * @param from regular expression, or literal text when {@code isRegex} is false
 * @param to replacement (regex replacement syntax when {@code isRegex} is true)
 * @param isRegex whether {@code from}/{@code to} are already regex syntax
 * @param caseSensitive whether matching distinguishes letter case
 * @return the text with all matches replaced
 */
public static final String replaceRegexAll(String fileContent, String from, String to, boolean isRegex,
        boolean caseSensitive) {
    if (!isRegex) {
        Log.d(from, to);
        from = from.replaceAll(SPECIAL_CHAR_PATTERNSTR, "\\\\$1");
        to = to.replaceAll(SPECIAL_CHAR_PATTERNSTR, "\\\\$1");
        Log.d(from, to);
    }

    int flags = Pattern.UNICODE_CASE;
    if (!caseSensitive) {
        flags |= Pattern.CASE_INSENSITIVE;
    }
    return Pattern.compile(from, flags).matcher(fileContent).replaceAll(to);
}

From source file:Main.java

/**
 * Pattern.pattern and Pattern.toString ignore any flags supplied to
 * Pattern.compile, so the regular expression you get out doesn't
 * correspond to what the Pattern was actually matching. This fixes that
 * by prefixing the expression with an embedded flag group such as
 * {@code (?im)}.
 *
 * Note that there are some flags that can't be represented (for example
 * CANON_EQ and LITERAL have no embedded form). If those are the only flags
 * set, the expression is returned without a prefix rather than with an
 * empty {@code (?)} group, which would not be a valid regular expression.
 *
 * FIXME: why don't we use Pattern.LITERAL instead of home-grown escaping
 * code? Is it because you can't do the reverse transformation? Should we
 * integrate that code with this?
 *
 * @param pattern the compiled pattern to describe
 * @return the pattern's expression, prefixed with its representable flags
 */
public static String toString(Pattern pattern) {
    String regex = pattern.pattern();
    final int flags = pattern.flags();
    if (flags != 0) {
        final String groupStart = "(?";
        StringBuilder builder = new StringBuilder(groupStart);
        // toStringHelper is defined elsewhere; assumed to append the given
        // letter to the builder when the flag bit is set - TODO confirm.
        toStringHelper(builder, flags, Pattern.UNIX_LINES, 'd');
        toStringHelper(builder, flags, Pattern.CASE_INSENSITIVE, 'i');
        toStringHelper(builder, flags, Pattern.COMMENTS, 'x');
        toStringHelper(builder, flags, Pattern.MULTILINE, 'm');
        toStringHelper(builder, flags, Pattern.DOTALL, 's');
        toStringHelper(builder, flags, Pattern.UNICODE_CASE, 'u');
        // Only emit the group when at least one flag letter was written.
        if (builder.length() > groupStart.length()) {
            builder.append(")");
            regex = builder.toString() + regex;
        }
    }
    return regex;
}

From source file:Utils.java

/**
 * Collects every match of {@code regex} in {@code contents}.
 *
 * If the expression declares at least one capturing group, the text of
 * group 1 is collected for each match (which may be {@code null} when that
 * group did not participate); otherwise the whole match is collected.
 *
 * @param contents text to search
 * @param regex regular expression, compiled with UNICODE_CASE
 * @return the collected strings in match order, or {@code null} when either
 *         argument is rejected by {@code isEmpty}
 */
public static List<String> getFound(String contents, String regex) {
    if (isEmpty(regex) || isEmpty(contents)) {
        return null;
    }

    final Matcher m = Pattern.compile(regex, Pattern.UNICODE_CASE).matcher(contents);
    // The group count is a property of the pattern, so decide once up front.
    final boolean useFirstGroup = m.groupCount() > 0;

    final List<String> found = new ArrayList<String>();
    while (m.find()) {
        found.add(useFirstGroup ? m.group(1) : m.group());
    }
    return found;
}

From source file:com.github.javarch.support.SlugGenerator.java

/** Runs of combining diacritical marks, as left behind by NFD decomposition. */
private static final Pattern SLUG_DIACRITICS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

/** Runs of punctuation characters. */
private static final Pattern SLUG_PUNCTUATION = Pattern.compile("\\p{Punct}+");

/** Runs of whitespace. */
private static final Pattern SLUG_WHITESPACE = Pattern.compile("\\s+");

/**
 * Turns arbitrary text into a lower-case, dash-separated slug.
 *
 * The patterns are compiled once and without flags: the original code
 * compiled them with UNICODE_CASE but then passed only their source text to
 * String.replaceAll, so the flag was never applied.
 *
 * @param str text to convert; must not be {@code null}
 * @return the slug
 */
public String encode(String str) {
    // Decompose accented characters so the accents become separate marks...
    String decomposed = Normalizer.normalize(str, Normalizer.Form.NFD);
    // ...then drop those marks.
    String unaccented = SLUG_DIACRITICS.matcher(decomposed).replaceAll("");
    // Punctuation becomes spaces first so leading/trailing punctuation can be trimmed.
    String spaced = SLUG_PUNCTUATION.matcher(unaccented).replaceAll(" ").trim();
    // Every remaining run of whitespace becomes a single dash.
    String dashed = SLUG_WHITESPACE.matcher(spaced).replaceAll("-");
    // Locale.ROOT keeps the result independent of the default locale
    // (e.g. under a Turkish locale "I" would otherwise become a dotless i).
    return dashed.toLowerCase(java.util.Locale.ROOT);
}

From source file:edu.temple.cis3238.wiki.utils.FileUtils.java

/**
 * Verifies that a file path ends in one of the accepted extensions.
 *
 * The comparison is case-insensitive. Extra extensions are matched
 * literally: regex metacharacters in them carry no special meaning, and
 * empty entries (for example from "a,,b") are ignored. A path with no
 * '.' has no extension and is rejected.
 *
 * @param filePath full path of file
 * @param extraValidExtensions CSV list of additional valid file extensions
 *        (without the leading dot); may be {@code null} or empty
 * @return {@code true} if the extension is accepted, {@code false}
 *         otherwise (including when {@code filePath} is {@code null})
 * @see edu.temple.cis3238.wiki.ui.servlets.UploadServlet
 */
public static boolean checkFileExtension(String filePath, String extraValidExtensions) {
    if (filePath == null) {
        return false;
    }
    int lastDot = filePath.lastIndexOf('.');
    if (lastDot < 0) {
        // No dot at all: do not mistake the whole path for an extension.
        return false;
    }
    String type = filePath.substring(lastDot + 1);

    StringBuilder validExts = new StringBuilder(
            "pdf|mp4|m4v|wmv|flv|swf|avi|mov|mpeg|mpg|doc|docx|xls|xlsx|ppt|pptx|txt|jpg|jpeg|png");
    if (extraValidExtensions != null) {
        for (String extra : extraValidExtensions.replace(" ", "").split(",")) {
            if (!extra.isEmpty()) {
                // Quote so caller-supplied text cannot alter the expression.
                validExts.append('|').append(Pattern.quote(extra));
            }
        }
    }

    Pattern regex = Pattern.compile("(" + validExts + ")", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    Matcher regexMatcher = regex.matcher(type);
    return regexMatcher.matches();
}

From source file:com.microsoft.tfs.client.eclipse.tpignore.TPIgnoreFileParser.java

/**
 * Returns the regex compile flags matching the case-sensitivity reported
 * by {@code FileHelpers.doesFileSystemIgnoreCase()}: Unicode-aware
 * case-insensitive flags when it reports true, no flags otherwise.
 *
 * The value is computed on the first call and stored in
 * {@code compileFlags}; the value -1000 in that field is treated as
 * "not computed yet".
 */
private synchronized static int getCompileFlags() {
    if (compileFlags != -1000) {
        return compileFlags;
    }

    compileFlags = FileHelpers.doesFileSystemIgnoreCase()
            ? (Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE)
            : 0;
    return compileFlags;
}

From source file:Normalization.TextNormalization.java

/**
 * Matches code points in U+1F000-U+1F3FF, U+1F400-U+1F7FF and U+2600-U+27FF.
 * Compiled once; the expression and flags are unchanged from the original
 * per-call compilation.
 */
private static final Pattern EMOJI_AND_SYMBOLS = Pattern.compile(
        "[\ud83c\udc00-\ud83c\udfff]|[\ud83d\udc00-\ud83d\udfff]|[\u2600-\u27ff]",
        Pattern.UNICODE_CASE | Pattern.CANON_EQ | Pattern.CASE_INSENSITIVE);

/**
 * Removes emoji and miscellaneous symbol characters from a string.
 *
 * @param content text to clean; must not be {@code null}
 * @return the text with every matching character removed
 */
public String removeEmojiFromString(String content) {
    // The UTF-8 round trip is kept from the original (presumably to
    // sanitize malformed text such as unpaired surrogates - TODO confirm).
    // Using the Charset constant means no checked exception can occur, so
    // there is no longer a swallowed failure that would return "".
    byte[] utf8Bytes = content.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    String utf8tweet = new String(utf8Bytes, java.nio.charset.StandardCharsets.UTF_8);

    return EMOJI_AND_SYMBOLS.matcher(utf8tweet).replaceAll("");
}

Example usage for java.util.regex Pattern UNICODE_CASE

Introduction

Prototype

Document

Usage