List of usage examples for java.util.regex Pattern UNICODE_CASE
int UNICODE_CASE
To view the source code for java.util.regex Pattern UNICODE_CASE, click the Source Link.
From source file:com.yahoo.flowetl.core.util.RegexUtil.java
/** * Gets the pattern for the given string by providing the rules to do * extraction./* w ww .j av a 2 s . c om*/ * * This is similar to how php does regex to match you provide in the format * /REGEX/options where options currently are "i" for case insensitive and * "u" for unicode and "m" for multiline and "s" for dotall and the value * inside the // is the regex to use * * @param str * the string to parsed the pattern out of * * @param cache * whether to cache the compiled pattern * * @return the pattern * * @throws PatternSyntaxException * * the pattern syntax exception if it has wrong syntax */ public static Pattern getPattern(String str, boolean cache) throws PatternSyntaxException { if (str == null) { return null; } // see if we made it before... Pattern p = compiledPats.get(str); if (p != null) { return p; } Matcher mat = patExtractor.matcher(str); if (mat.matches() == false) { throw new PatternSyntaxException("Invalid syntax provided", str, -1); } String regex = mat.group(1); String opts = mat.group(2); int optsVal = 0; if (StringUtils.contains(opts, "i")) { optsVal |= Pattern.CASE_INSENSITIVE; } if (StringUtils.contains(opts, "u")) { optsVal |= Pattern.UNICODE_CASE; } if (StringUtils.contains(opts, "m")) { optsVal |= Pattern.MULTILINE; } if (StringUtils.contains(opts, "s")) { optsVal |= Pattern.DOTALL; } // compile and store it p = Pattern.compile(regex, optsVal); if (cache) { compiledPats.put(str, p); } return p; }
From source file:org.lanes.utility.string.TextNormaliser.java
public static List<String> cleanLightHTML(String html) { html = html.replaceAll(" ", " "); html = html.replaceAll("[\\{\\}\\[\\]]", ""); html = html.replaceAll("&", "&"); html = html.replaceAll("(?i)<div.*?>(.*?)<\\/div>", "$1\n"); html = html.replaceAll("(?i)<strong.*?>(.*?)<\\/strong>", "[$1] "); html = html.replaceAll("(?i)<br\\/?>", "\n");//MUST COME BEFORE <b> html = html.replaceAll("(?i)<b.*?>(.*?)<\\/b>", "[$1] "); html = html.replaceAll("(?i)<em>(.*?)<\\/em>", "[$1] "); html = html.replaceAll("(?i)<i>(.*?)<\\/i>", "[$1] "); html = html.replaceAll("(?i)<u>(.*?)<\\/u>", "[$1] "); html = html.replaceAll("[\\s\\n]+\\]", "]"); html = html.replaceAll("\\[[\\s\\n]+", "["); html = html.replaceAll("[\\s]*:\\]", "]"); html = html.replaceAll("(?i)<[\\/]?[uo]l.*?>", ""); html = html.replaceAll("(?i)<li.*?>(.+?)(?=<li>)", "{$1}\n"); html = html.replaceAll("(?i)<li.*?>(.+?)\\n", "{$1}\n"); html = html.replaceAll("(?i)<\\/li>", " "); html = html.replaceAll("(?i)<[\\/]?div.*?>", " "); html = html.replaceAll("(?i)<\\/?center>", " "); html = html.replaceAll("(?i)<\\/?p.*?>", " "); html = html.replaceAll("(?i)<\\/?li>", " "); html = html.replaceAll("(?i)<\\/?font.*?>", " "); html = html.replaceAll("(?i)<\\/?hr.*?>", " "); html = html.replaceAll("\\[\\]", ""); Pattern pattern = Pattern.compile("[\u00B7\u2022]\\s*(.+?)\n", (Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE)); Matcher matcher = pattern.matcher(html); html = matcher.replaceAll("{$1}\n"); html = html.replaceAll("\\s\\}", "}"); html = html.replaceAll("(?i)(?:[\\w\\.]+)@(?:[\\w]+\\.)+(?:[\\w]+)", "<EMAIL>"); html = html.replaceAll("(?i)(?:http:\\/\\/)?(?:[\\w]+\\.)+(?:[\\w]+)", "<URL>"); html = html.replaceAll("\\s*\\/\\s*", ", "); //html = html.replaceAll("\\s+", " "); html = Normalizer.normalize(html, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", ""); List<String> lineobj = new ArrayList<String>(); String[] lines = html.split("\\n"); for (String line : lines) { line = line.trim();/* w ww . 
ja v a 2 s.c o m*/ if (!line.equals("")) { lineobj.add(line); } } return lineobj; }
From source file:org.kurento.room.demo.FixedNKmsManager.java
public synchronized void setAuthRegex(String regex) { this.authRegex = regex != null ? regex.trim() : null; if (authRegex != null && !authRegex.isEmpty()) { authPattern = Pattern.compile(authRegex, Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE); }// w ww .jav a 2 s .c o m }
From source file:com.romeikat.datamessie.core.processing.service.fulltext.query.QueryUtil.java
public FullTextQuery parseQuery(final String luceneQueryString, final Analyzer analyzer) { LOG.debug("Parsing query: {}", luceneQueryString); // Check if query is "n outof abc def ghi" final Pattern pattern = Pattern.compile("\\s*(\\d+)\\s+outof\\s+(.*)", Pattern.DOTALL | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); final Matcher matcher = pattern.matcher(luceneQueryString); // Match => OUTOF query if (matcher.matches()) { final int k = Integer.parseInt(matcher.group(1)); final String searchString = matcher.group(2); final List<String> queryTerms = parseUtil.parseTerms(searchString, analyzer, true); final OutOfQuery query = new OutOfQuery(k, queryTerms); LOG.debug("Detected {}", query); return query; }//from ww w .j av a 2 s . c om // No match => Lucene query else { final LuceneQuery query = new LuceneQuery(luceneQueryString); LOG.debug("Detected {}", query); return query; } }
From source file:GIST.IzbirkomExtractor.AbbrList.java
/** * Adds an abbreviation and a set of its expansions to abbreviation list. * /*from ww w . j av a 2s. c o m*/ * @param abbr_string * @param expansions */ protected void addAbbrev(String abbr_string, String[] expansions) { if (!abbrevs.containsKey(abbr_string)) { Abbreviation abbr = Abbreviation.createAbbreviation(abbr_string); Pattern pat = Pattern.compile("\\b" + abbr_string + "\\b", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); // FIXME: ignore case flag does not seem to work because abbreviations are retrieved by original case abbr.setPattern(pat); abbrevs.put(abbr_string, abbr); } abbrevs.get(abbr_string).addExpandions(expansions); expansionsPattern = abbreviationsPattern = null; /* reset the pattern to indicate modification of the abbreviation list */ }
From source file:Normalization.TextNormalization.java
/** Matches an '@' followed by one or more word characters, e.g. {@code @user_1}. */
private static final Pattern MENTION_PATTERN = Pattern.compile("[@]\\w+",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

/**
 * Removes {@code @mention} tokens and all {@code '#'} characters from the text.
 *
 * <p>The text is first round-tripped through UTF-8 bytes. Only the
 * {@code '#'} sign of a hashtag is removed; the tag word itself is kept.
 *
 * @param content the text to clean; must not be {@code null}
 * @return the text without mentions and without '#' characters
 */
public String removeMentionsFromString(String content) {
    // StandardCharsets avoids the checked UnsupportedEncodingException that was
    // previously swallowed (which would have silently produced an empty result).
    byte[] utf8Bytes = content.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    String utf8tweet = new String(utf8Bytes, java.nio.charset.StandardCharsets.UTF_8);

    utf8tweet = MENTION_PATTERN.matcher(utf8tweet).replaceAll("");
    return utf8tweet.replace("#", "");
}
From source file:tvbrowser.core.search.regexsearch.RegexSearcher.java
/** * Creates a pattern for a regular expression. * * @param regex The regular expression/*from www . j a v a 2 s . c o m*/ * @param caseSensitive Should the search be case sensitive? * @return The pattern * @throws TvBrowserException If there is a syntax error in the regular expression. */ public static Pattern createSearchPattern(String regex, boolean caseSensitive) throws TvBrowserException { // Get the flags for the regex int flags = Pattern.DOTALL; if (!caseSensitive) { flags |= Pattern.CASE_INSENSITIVE; flags |= Pattern.UNICODE_CASE; } // Compile the regular expression Pattern pattern; try { pattern = Pattern.compile(regex, flags); } catch (PatternSyntaxException exc) { throw new TvBrowserException(RegexSearcher.class, "error.1", "Syntax error in the regular expression of the search pattern!", exc); } return pattern; }
From source file:com.norconex.importer.handler.transformer.impl.StripAfterTransformer.java
@Override protected void transformStringContent(String reference, StringBuilder content, ImporterMetadata metadata, boolean parsed, boolean partialContent) { if (stripAfterRegex == null) { LOG.error("No regular expression provided."); return;//from w ww .java 2s. co m } int flags = Pattern.DOTALL | Pattern.UNICODE_CASE; if (!caseSensitive) { flags = flags | Pattern.CASE_INSENSITIVE; } Pattern pattern = Pattern.compile(stripAfterRegex, flags); Matcher match = pattern.matcher(content); if (match.find()) { if (inclusive) { content.delete(match.start(), content.length()); } else { content.delete(match.end(), content.length()); } } }
From source file:com.norconex.importer.handler.transformer.impl.StripBeforeTransformer.java
@Override protected void transformStringContent(String reference, StringBuilder content, ImporterMetadata metadata, boolean parsed, boolean partialContent) { if (stripBeforeRegex == null) { LOG.error("No regular expression provided."); return;//from www .j a v a2s .c o m } int flags = Pattern.DOTALL | Pattern.UNICODE_CASE; if (!caseSensitive) { flags = flags | Pattern.CASE_INSENSITIVE; } Pattern pattern = Pattern.compile(stripBeforeRegex, flags); Matcher match = pattern.matcher(content); if (match.find()) { if (inclusive) { content.delete(0, match.end()); } else { content.delete(0, match.start()); } } }
From source file:nz.net.orcon.kanban.automation.actions.RegexAction.java
/**
 * Extracts one capture group of one match of a regular expression from a text.
 *
 * @param text the text to search; {@code null} is treated as the empty string
 * @param expressionString the regular expression to apply
 * @param match 1-based index of the match to pick
 * @param group index of the capture group to return (0 is the whole match)
 * @param options names of {@link Pattern} flag constants separated by
 *        {@code |}, e.g. {@code "CASE_INSENSITIVE|DOTALL"}; names are
 *        case-insensitive, surrounding whitespace is ignored, unknown names
 *        are ignored; may be {@code null}
 * @return the requested group, or the empty string if there is no such match
 *         or the group index is out of range; may be {@code null} if the
 *         group exists but did not take part in the match
 * @throws IllegalArgumentException if {@code expressionString} is {@code null}
 * @throws IOException declared for interface compatibility; not thrown by this code
 */
public String extract(String text, String expressionString, int match, int group, String options)
        throws IOException {
    if (text == null) {
        text = "";
    }
    if (expressionString == null) {
        throw new IllegalArgumentException(
                "No Regular Expression has been provided to carry out this operation.");
    }

    int optionsInEffect = 0;
    if (options != null) {
        // Locale.ROOT keeps the upper-casing independent of the default locale
        // (under a Turkish locale 'i' would otherwise become a dotted capital I).
        for (String option : options.toUpperCase(java.util.Locale.ROOT).split("\\|")) {
            optionsInEffect |= regexFlagFor(option.trim());
        }
    }

    Matcher matches = Pattern.compile(expressionString, optionsInEffect).matcher(text);
    int matchIndex = 1;
    while (matches.find()) {
        if (matchIndex == match) {
            // Only this match can satisfy the request; an out-of-range group yields "".
            if (group >= 0 && group <= matches.groupCount()) {
                return matches.group(group);
            }
            return "";
        }
        matchIndex++;
    }
    return "";
}

/** Maps an upper-case {@link Pattern} flag name to its bit value; unknown names map to 0. */
private static int regexFlagFor(String option) {
    if (option.equals("CANON_EQ")) {
        return Pattern.CANON_EQ;
    } else if (option.equals("CASE_INSENSITIVE")) {
        return Pattern.CASE_INSENSITIVE;
    } else if (option.equals("COMMENTS")) {
        return Pattern.COMMENTS;
    } else if (option.equals("DOTALL")) {
        return Pattern.DOTALL;
    } else if (option.equals("LITERAL")) {
        return Pattern.LITERAL;
    } else if (option.equals("MULTILINE")) {
        return Pattern.MULTILINE;
    } else if (option.equals("UNICODE_CASE")) {
        return Pattern.UNICODE_CASE;
    } else if (option.equals("UNIX_LINES")) {
        return Pattern.UNIX_LINES;
    }
    return 0;
}