List of usage examples for java.util.regex Pattern UNICODE_CASE
int UNICODE_CASE
To view the source code for java.util.regex Pattern UNICODE_CASE, click the Source Link.
From source file:BGrep.java
public static void main(String[] args) { String encodingName = "UTF-8"; // Default to UTF-8 encoding int flags = Pattern.MULTILINE; // Default regexp flags try { // Fatal exceptions are handled after this try block // First, process any options int nextarg = 0; while (args[nextarg].charAt(0) == '-') { String option = args[nextarg++]; if (option.equals("-e")) { encodingName = args[nextarg++]; } else if (option.equals("-i")) { // case-insensitive matching flags |= Pattern.CASE_INSENSITIVE; } else if (option.equals("-s")) { // Strict Unicode processing flags |= Pattern.UNICODE_CASE; // case-insensitive Unicode flags |= Pattern.CANON_EQ; // canonicalize Unicode } else { System.err.println("Unknown option: " + option); usage();/*from w w w . j a va2 s. c o m*/ } } // Get the Charset for converting bytes to chars Charset charset = Charset.forName(encodingName); // Next argument must be a regexp. Compile it to a Pattern object Pattern pattern = Pattern.compile(args[nextarg++], flags); // Require that at least one file is specified if (nextarg == args.length) usage(); // Loop through each of the specified filenames while (nextarg < args.length) { String filename = args[nextarg++]; CharBuffer chars; // This will hold complete text of the file try { // Handle per-file errors locally // Open a FileChannel to the named file FileInputStream stream = new FileInputStream(filename); FileChannel f = stream.getChannel(); // Memory-map the file into one big ByteBuffer. This is // easy but may be somewhat inefficient for short files. ByteBuffer bytes = f.map(FileChannel.MapMode.READ_ONLY, 0, f.size()); // We can close the file once it is is mapped into memory. // Closing the stream closes the channel, too. 
stream.close(); // Decode the entire ByteBuffer into one big CharBuffer chars = charset.decode(bytes); } catch (IOException e) { // File not found or other problem System.err.println(e); // Print error message continue; // and move on to the next file } // This is the basic regexp loop for finding all matches in a // CharSequence. Note that CharBuffer implements CharSequence. // A Matcher holds state for a given Pattern and text. Matcher matcher = pattern.matcher(chars); while (matcher.find()) { // While there are more matches // Print out details of the match System.out.println(filename + ":" + // file name matcher.start() + ": " + // character pos matcher.group()); // matching text } } } // These are the things that can go wrong in the code above catch (UnsupportedCharsetException e) { // Bad encoding name System.err.println("Unknown encoding: " + encodingName); } catch (PatternSyntaxException e) { // Bad pattern System.err.println("Syntax error in search pattern:\n" + e.getMessage()); } catch (ArrayIndexOutOfBoundsException e) { // Wrong number of arguments usage(); } }
From source file:org.medici.bia.common.util.RegExUtils.java
public static void main(String[] args) { String lastName = "pe'r"; String paramString = "(?=.*[\\p{L}])|(?=.*['])"; //String paramString = "(\\d)|([\\p{Punct}])"; Pattern localPattern = Pattern.compile(paramString, Pattern.UNICODE_CASE); Matcher localMatcher = localPattern.matcher(lastName); if (localMatcher.find()) { System.out.println("OK"); } else {// w ww . ja v a2 s .c om System.out.println("Contiene un carattere non valido"); } }
From source file:Main.java
/**
 * Translates a shell-style "wildcard" expression into a compiled regular
 * expression.
 *
 * <p>{@code *} becomes {@code .*}, {@code ?} becomes {@code .}, and the regex
 * metacharacters {@code \ . | + - $ [ ] { } ( )} are backslash-escaped. A
 * {@code ^} is escaped unless it is the very first character and
 * {@code escapeStartChar} is {@code false}.
 *
 * @param wildcard The wildcard pattern.
 * @param matchCase Whether the pattern should be case sensitive; when
 *        {@code false} the result is compiled with
 *        {@code CASE_INSENSITIVE | UNICODE_CASE}.
 * @param escapeStartChar Whether to escape a starting <code>'^'</code>
 *        character.
 * @return The pattern. If compilation fails, the stack trace is printed and a
 *         pattern for {@code ".+"} is returned instead.
 */
public static Pattern wildcardToPattern(String wildcard, boolean matchCase, boolean escapeStartChar) {
    final int compileFlags = matchCase ? 0 : (Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    // Characters that are copied through with a leading backslash.
    final String escapedLiterally = "\\.|+-$[]{}()";

    final StringBuilder regex = new StringBuilder();
    final int length = wildcard.length();
    for (int pos = 0; pos < length; pos++) {
        final char current = wildcard.charAt(pos);
        if (current == '*') {
            regex.append(".*");
        } else if (current == '?') {
            regex.append('.');
        } else if (current == '^') {
            // Only a leading '^' may survive unescaped, and only on request.
            if (pos > 0 || escapeStartChar) {
                regex.append('\\');
            }
            regex.append('^');
        } else if (escapedLiterally.indexOf(current) >= 0) {
            regex.append('\\').append(current);
        } else {
            regex.append(current);
        }
    }

    try {
        return Pattern.compile(regex.toString(), compileFlags);
    } catch (PatternSyntaxException pse) {
        pse.printStackTrace();
        return Pattern.compile(".+");
    }
}
From source file:Main.java
public static final String replaceRegexAll(String fileContent, String from, String to, boolean isRegex, boolean caseSensitive) { if (!isRegex) { Log.d(from, to);/*from ww w . j a v a 2 s. com*/ from = from.replaceAll(SPECIAL_CHAR_PATTERNSTR, "\\\\$1"); to = to.replaceAll(SPECIAL_CHAR_PATTERNSTR, "\\\\$1"); Log.d(from, to); } //System.out.println(fileContent); Pattern p = null; if (!caseSensitive) { p = Pattern.compile(from, Pattern.CASE_INSENSITIVE); //fileContent = fileContent.replaceAll("(?i)"+from, to); } else { p = Pattern.compile(from, Pattern.UNICODE_CASE); //fileContent = fileContent.replaceAll(from, to); } fileContent = p.matcher(fileContent).replaceAll(to); //System.out.println(fileContent); return fileContent; }
From source file:Main.java
/**
 * Renders a {@link Pattern} as a regular expression string that carries its
 * compile flags as a leading embedded-flag group, because
 * {@code Pattern.pattern()} and {@code Pattern.toString()} ignore the flags
 * that were passed to {@code Pattern.compile}.
 *
 * <p>Only the flags listed in the table below are considered; some flags
 * cannot be represented this way.
 *
 * FIXME: why don't we use Pattern.LITERAL instead of home-grown escaping
 * code? Is it because you can't do the reverse transformation? Should we
 * integrate that code with this?
 *
 * @param pattern the compiled pattern to render
 * @return the pattern text, prefixed with {@code "(?" ... ")"} whenever the
 *         pattern has a non-zero flag value
 */
public static String toString(Pattern pattern) {
    final int flags = pattern.flags();
    if (flags == 0) {
        return pattern.pattern();
    }

    // Parallel tables: flag constant and the letter handed to toStringHelper
    // for it, in the order they are emitted.
    final int[] flagMasks = {
        Pattern.UNIX_LINES, Pattern.CASE_INSENSITIVE, Pattern.COMMENTS,
        Pattern.MULTILINE, Pattern.DOTALL, Pattern.UNICODE_CASE
    };
    final char[] flagLetters = { 'd', 'i', 'x', 'm', 's', 'u' };

    StringBuilder prefix = new StringBuilder("(?");
    for (int k = 0; k < flagMasks.length; k++) {
        // presumably appends the letter when the flag bit is set - confirm
        // against toStringHelper
        toStringHelper(prefix, flags, flagMasks[k], flagLetters[k]);
    }
    prefix.append(")");
    return prefix.toString() + pattern.pattern();
}
From source file:Utils.java
public static List<String> getFound(String contents, String regex) { if (isEmpty(regex) || isEmpty(contents)) { return null; }//from w w w .j av a 2s.co m List<String> results = new ArrayList<String>(); Pattern pattern = Pattern.compile(regex, Pattern.UNICODE_CASE); Matcher matcher = pattern.matcher(contents); while (matcher.find()) { if (matcher.groupCount() > 0) { results.add(matcher.group(1)); } else { results.add(matcher.group()); } } return results; }
From source file:com.github.javarch.support.SlugGenerator.java
public String encode(String str) { Pattern p = Pattern.compile("\\p{InCombiningDiacriticalMarks}+", Pattern.UNICODE_CASE); Pattern p2 = Pattern.compile("\\p{Punct}+", Pattern.UNICODE_CASE); Pattern p3 = Pattern.compile("\\s+", Pattern.UNICODE_CASE); // Decompose any funny characters. String link = Normalizer.normalize(str, Normalizer.Form.NFD).replaceAll(p.pattern(), "") // remove all the diacritic marks .replaceAll(p2.pattern(), " ").trim() // transform the punctuation into spaces first, so that we can trim some ending or beginning punctuation .replaceAll(p3.pattern(), "-") // and replace all the whitespace with a dash. .toLowerCase();//from ww w . ja v a 2s . co m return link; }
From source file:edu.temple.cis3238.wiki.utils.FileUtils.java
/** * Verifies valid file extension//from ww w. j a v a2 s .c o m * @param filePath full path of file * @param extraValidExtensions CSV list of additional valid file extensions. * @return valid * @see edu.temple.cis3238.wiki.ui.servlets.UploadServlet */ public static boolean checkFileExtension(String filePath, String extraValidExtensions) { String[] extensionArray = { "" }; String extensions = ""; if (extraValidExtensions != null && !extraValidExtensions.isEmpty()) { extensionArray = extraValidExtensions.replace(" ", "").toLowerCase().split(","); extensions = "|" + org.apache.commons.lang3.StringUtils.join(extensionArray, '|'); extensions = org.apache.commons.lang3.StringUtils.removeEnd(extensions.trim(), "|"); } File file = new File(filePath); String validExts = "(pdf|mp4|m4v|wmv|flv|swf|avi|mov|mpeg|mpg|mov|doc|docx|xls|xlsx|ppt|pptx|txt|jpg|jpeg|png" + extensions + ")"; // if (!file.isFile()) { // return false; // } // else { String type = filePath.substring(filePath.lastIndexOf(".") + 1, filePath.length()).toLowerCase(); try { Pattern regex = Pattern.compile(validExts, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); Matcher regexMatcher = regex.matcher(type); if (regexMatcher.matches()) { return true; } } catch (PatternSyntaxException ex) { LOG.log(Level.SEVERE, null, ex); return false; } // } return false; }
From source file:com.microsoft.tfs.client.eclipse.tpignore.TPIgnoreFileParser.java
/**
 * Returns the {@link Pattern} compile flags matching this platform's
 * filesystem case-sensitivity, computing and caching them in
 * {@code compileFlags} on first use.
 *
 * @return {@code CASE_INSENSITIVE | UNICODE_CASE} when
 *         {@code FileHelpers.doesFileSystemIgnoreCase()} is true, otherwise
 *         {@code 0}
 */
private synchronized static int getCompileFlags() {
    // -1000 appears to be the "not yet computed" marker - confirm against the
    // field's initializer.
    if (compileFlags != -1000) {
        return compileFlags;
    }

    compileFlags = FileHelpers.doesFileSystemIgnoreCase()
            ? (Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE)
            : 0;
    return compileFlags;
}
From source file:Normalization.TextNormalization.java
public String removeEmojiFromString(String content) { String utf8tweet = ""; try {//ww w . j a va2 s . c o m byte[] utf8Bytes = content.getBytes("UTF-8"); utf8tweet = new String(utf8Bytes, "UTF-8"); } catch (UnsupportedEncodingException e) { } Pattern unicodeOutliers = Pattern.compile( "[\ud83c\udc00-\ud83c\udfff]|[\ud83d\udc00-\ud83d\udfff]|[\u2600-\u27ff]", Pattern.UNICODE_CASE | Pattern.CANON_EQ | Pattern.CASE_INSENSITIVE); Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet); utf8tweet = unicodeOutlierMatcher.replaceAll(""); return utf8tweet; }