List of usage examples for java.util.regex Pattern UNICODE_CASE
int UNICODE_CASE
To view the source code for java.util.regex Pattern UNICODE_CASE, click the Source Link.
From source file:net.jazdw.rql.parser.listfilter.ListFilter.java
@SuppressWarnings("unchecked") @Override/*from ww w . j a v a 2 s . com*/ public List<T> visit(ASTNode node, List<T> list) { switch (node.getName()) { case "and": for (Object obj : node) { if (obj instanceof ASTNode) { list = ((ASTNode) obj).accept(this, list); } else { throw new UnsupportedOperationException("Encountered a non-ASTNode argument in AND statement"); } } return list; case "or": Set<T> set = new LinkedHashSet<T>(); for (Object obj : node) { if (obj instanceof ASTNode) { set.addAll(((ASTNode) obj).accept(this, list)); } else { throw new UnsupportedOperationException("Encountered a non-ASTNode argument in OR statement"); } } return new ArrayList<>(set); case "eq": case "gt": case "ge": case "lt": case "le": case "ne": String propName = (String) node.getArgument(0); Object test = node.getArgumentsSize() > 1 ? node.getArgument(1) : null; List<T> result = new ArrayList<>(); for (T item : list) { Object property = getProperty(item, propName); Comparable<Object> comparableProperty; if (property instanceof Comparable) { comparableProperty = (Comparable<Object>) property; } else { throw new UnsupportedOperationException( String.format("Property '%s' is not comparable", propName)); } int comparisonValue; try { comparisonValue = comparableProperty.compareTo(test); } catch (ClassCastException e) { throw new UnsupportedOperationException( String.format("Couldn't compare '%s' to '%s'", property.toString(), test.toString())); } if (checkComparisonValue(node.getName(), comparisonValue)) { result.add(item); } } return result; case "like": case "match": propName = (String) node.getArgument(0); String matchString = (String) node.getArgument(1); Pattern matchPattern = Pattern.compile(matchString.replace("*", ".*"), Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); result = new ArrayList<>(); for (T item : list) { Object property = getProperty(item, propName); String stringProperty; if (property instanceof String) { stringProperty = (String) property; } else { throw new 
UnsupportedOperationException( String.format("Property '%s' is not a string", propName)); } if (matchPattern.matcher(stringProperty).matches()) { result.add(item); } } return result; case "limit": int limit = (int) node.getArgument(0); int offset = node.getArgumentsSize() > 1 ? (int) node.getArgument(1) : 0; if (offset > list.size() - 1) { return Collections.emptyList(); } int toIndex = offset + limit; if (toIndex > list.size()) { toIndex = list.size(); } return list.subList(offset, toIndex); case "sort": ComparatorChain cc = new ComparatorChain(); for (Object obj : node) { String sortOption = (String) obj; boolean desc = sortOption.startsWith("-"); cc.addComparator(new BeanComparator<T>(sortOption.substring(1)), desc); } // copy the list as we are modifying it list = new ArrayList<>(list); Collections.sort(list, cc); return list; default: throw new UnsupportedOperationException( String.format("Encountered unknown operator '%s'", node.getName())); } }
From source file:Normalization.TextNormalization.java
public String removeSymbolsFromString(String content) { String utf8tweet = ""; try {// w w w . j a va 2s . c om byte[] utf8Bytes = content.getBytes("UTF-8"); utf8tweet = new String(utf8Bytes, "UTF-8"); } catch (UnsupportedEncodingException e) { } final String regex = "[\\./\\()\"':,.;<>~!$%^&*\\|+={}?\\-`1234567890_]"; final Pattern unicodeOutliers = Pattern.compile(regex, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet); utf8tweet = unicodeOutlierMatcher.replaceAll(" "); return utf8tweet; }
From source file:pl.otros.logview.gui.message.pattern.PropertyPatternMessageColorizer.java
public void init(InputStream in) throws ConfigurationException { propertiesConfiguration = new PropertiesConfiguration(); propertiesConfiguration.setDelimiterParsingDisabled(true); propertiesConfiguration.load(in, "UTF-8"); configuration = new DataConfiguration(propertiesConfiguration); configuration.setDelimiterParsingDisabled(true); String pa = configuration.getString(PROP_PATTERN); int flags = 0; flags = flags | (configuration.getBoolean(PROP_PATTERN_CANON_EQ, false) ? Pattern.CANON_EQ : 0); flags = flags//from w w w.jav a 2s . c o m | (configuration.getBoolean(PROP_PATTERN_CASE_INSENSITIVE, false) ? Pattern.CASE_INSENSITIVE : 0); flags = flags | (configuration.getBoolean(PROP_PATTERN_COMMENTS, false) ? Pattern.COMMENTS : 0); flags = flags | (configuration.getBoolean(PROP_PATTERN_DOTALL, false) ? Pattern.DOTALL : 0); flags = flags | (configuration.getBoolean(PROP_PATTERN_LITERAL, false) ? Pattern.LITERAL : 0); flags = flags | (configuration.getBoolean(PROP_PATTERN_MULTILINE, false) ? Pattern.MULTILINE : 0); flags = flags | (configuration.getBoolean(PROP_PATTERN_UNICODE_CASE, false) ? Pattern.UNICODE_CASE : 0); flags = flags | (configuration.getBoolean(PROP_PATTERN_UNIX_LINES, false) ? Pattern.UNIX_LINES : 0); pattern = Pattern.compile(pa, flags); groupCount = countGroups(pattern); name = configuration.getString(PROP_NAME, "NAME NOT SET!"); description = configuration.getString(PROP_DESCRIPTION, "DESCRIPTION NOT SET!"); testMessage = configuration.getString(PROP_TEST_MESSAGE, ""); version = configuration.getInt(PROP_VERSION, 1); }
From source file:de.dfki.km.perspecting.obie.model.Document.java
/*************************************************************************** * Gets the pure plain text out of a html text. All html tags are replaced * by spaces. To do so, the head is replaced, all remaining javascript tags * (including the content) and finally all remaining html tags. Thus, * absolute positioning is possible./*from w w w .j a v a2 s. c o m*/ * * @param text * content of the html document as text * @return text where all html was replaced by spaces */ private String extractPlainTextFromHtml(String text) { Collection<Pattern> patterns = new ArrayList<Pattern>(3); // Delete the head, then all remaining javascript items that might exist // in the body, then all remaining html tags. patterns.add( Pattern.compile("<head.*/head>", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL)); // .*? makes it non greedy -> take the shortes match // DOTALL does also include new lines patterns.add(Pattern.compile("<script.*?/script>", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL)); patterns.add(Pattern.compile("<.+?>", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE)); StringBuffer s = new StringBuffer(text); // Go for all patterns. for (Pattern p : patterns) { Matcher matcher = p.matcher(s); // As long as the matcher finds another occurance of the pattern we // replace it by the same number of spaces but keep new lines. while (matcher.find()) s.replace(matcher.start(), matcher.end(), matcher.group().replaceAll(".", " ")); } return s.toString(); }
From source file:org.eclipse.skalli.core.user.LocalUserComponent.java
@Override public List<User> findUser(String search) { List<User> result = new ArrayList<User>(); if (StringUtils.isNotBlank(search)) { String[] parts = StringUtils.split(NormalizeUtil.normalize(search), " ,"); //$NON-NLS-1$ Pattern[] patterns = new Pattern[parts.length]; for (int i = 0; i < parts.length; ++i) { patterns[i] = Pattern.compile(parts[i] + ".*", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);//$NON-NLS-1$ }/* w w w. j ava 2 s .co m*/ for (User user : getAll()) { if (parts.length == 1) { match(user, patterns[0], result); } else if (parts.length == 2) { // givenname surname ('Michael Ochmann') if (matches(patterns, user.getFirstname(), user.getLastname())) { result.add(user); } // surname givenname('Ochmann, Michael') else if (matches(patterns, user.getLastname(), user.getFirstname())) { result.add(user); } } else if (parts.length == 3) { // givenname initial surname, e.g. 'Michael R. Ochmann' // or title givenname surname or given name surname title if (matches(patterns, user.getFirstname(), null, user.getLastname())) { result.add(user); } else if (matches(patterns, user.getLastname(), null, user.getFirstname())) { result.add(user); } else if (matches(patterns, null, user.getFirstname(), user.getLastname())) { result.add(user); } else if (matches(patterns, user.getFirstname(), user.getLastname(), null)) { result.add(user); } } } if (result.isEmpty()) { for (User user : getAll()) { // try to match each part individually for (int i = 0; i < parts.length; ++i) { match(user, patterns[i], result); } } } } return result; }
From source file:Normalization.TextNormalization.java
public String removeSpacesFromString(String content) { String utf8tweet = ""; try {//from w w w . j a v a 2 s. c o m byte[] utf8Bytes = content.getBytes("UTF-8"); utf8tweet = new String(utf8Bytes, "UTF-8"); } catch (UnsupportedEncodingException e) { } final String regex = "\\s{2,}"; final Pattern unicodeOutliers = Pattern.compile(regex, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet); utf8tweet = unicodeOutlierMatcher.replaceAll(" "); return utf8tweet; }
From source file:org.carrot2.webapp.filter.QueryWordHighlighter.java
@Override public void process() throws ProcessingException { if (!enabled) { return;/*from w w w . j av a 2 s . co m*/ } if (query == null) { query = ""; } // Create regexp patterns for each query word final String[] queryTerms = querySanitizePatternCompiled.matcher(query).replaceAll("").split("\\s+"); Pattern queryPattern = null; List<String> patterns = Lists.newArrayList(); for (String queryTerm : queryTerms) { if (Strings.isNullOrEmpty(queryTerm)) { continue; } if (dontHighlightPatternCompiled != null && dontHighlightPatternCompiled.matcher(queryTerm).matches()) { continue; } patterns.add("(" + Pattern.quote(escapeLtGt(queryTerm)) + ")"); } if (patterns.size() > 0) { queryPattern = Pattern.compile(Joiner.on("|").join(patterns), Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); } // As we're going to modify documents, we need to copy them to // avoid ConcurrentModificationExceptions. final List<Document> inputDocuments = documents; final List<Document> outputDocuments = Lists.newArrayListWithCapacity(inputDocuments.size()); for (Document document : inputDocuments) { final Document clonedDocument = document.clone(); for (String fieldName : fields) { highlightQueryTerms(clonedDocument, fieldName, queryPattern); } outputDocuments.add(clonedDocument); } documents = outputDocuments; }
From source file:GIST.IzbirkomExtractor.Russian.OrdinalFactory.java
/** * Default constructor//from ww w. j a va2 s . com */ public OrdinalFactory() { /* creating varions maps and hash tables */ stems0_10_lookup = new HashMap<String, Integer>(stems0_10.length); for (int i = 0; i < stems0_10.length; i++) stems0_10_lookup.put(stems0_10[i], i); stems11_19_lookup = new HashMap<String, Integer>(stems11_19.length); for (int i = 0; i < stems11_19.length; i++) stems11_19_lookup.put(stems11_19[i], i); /* create matching patterns for parsing */ /* pattern for 1-, 2-, ... */ digits_numeral_pat = Pattern.compile("\\b(\\d{1,2})(?:(-)?([-?]{1,2}))?\\b", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); /* pattern for , , ... */ StringBuilder sb0_10_pat = new StringBuilder(); sb0_10_pat.append("\\b("); sb0_10_pat.append(StringUtils.join(stems0_10, '|')); sb0_10_pat.append(")([-?]{1,3})\\b"); stem0_10_numeral_pat = Pattern.compile(sb0_10_pat.toString(), Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); /* pattern for , , ... */ StringBuilder sb11_19_pat = new StringBuilder(); sb11_19_pat.append("\\b("); sb11_19_pat.append(StringUtils.join(stems11_19, '|')); sb11_19_pat.append(")([-?]{1,2})\\b"); stem11_19_numeral_pat = Pattern.compile(sb11_19_pat.toString(), Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); // TODO: patterns for 20+ }
From source file:org.alfresco.repo.security.authority.GetAuthoritiesCannedQuery.java
private Pattern getPattern(String searchValue) { if (searchValue == null) { return null; }/*from ww w.j ava 2s . c om*/ // Escape characters of regex expressions searchValue = "^" + searchValue.replaceAll("\\\\", "\\\\\\\\").replaceAll("\\.", "\\\\.").replaceAll("\\?", ".") .replaceAll("\\*", ".*").replaceAll("\\[", "\\\\[").replaceAll("\\]", "\\\\]") .replaceAll("\\(", "\\\\(").replaceAll("\\)", "\\\\)").replaceAll("\\{", "\\\\{") .replaceAll("\\}", "\\\\}").replaceAll("\\^", "\\\\^").replaceAll("\\$", "\\\\\\$") .replaceAll("\\:", "\\\\:").replaceAll("\\\"", "\\\\\"").replaceAll("\\<", "\\\\<") .replaceAll("\\>", "\\\\>").replaceAll("\\/", "\\\\/").replaceAll("\\|", "\\\\|"); return Pattern.compile(searchValue, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); }
From source file:Normalization.TextNormalization.java
public String removeTwoLetterWordsFromString(String content) { String utf8tweet = ""; try {//from ww w .j a va 2 s . c o m byte[] utf8Bytes = content.getBytes("UTF-8"); utf8tweet = new String(utf8Bytes, "UTF-8"); } catch (UnsupportedEncodingException e) { } final String regex = "((^|\\s)(\\w{1,2})(\\s|$))"; final Pattern unicodeOutliers = Pattern.compile(regex, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); Matcher unicodeOutlierMatcher = unicodeOutliers.matcher(utf8tweet); utf8tweet = unicodeOutlierMatcher.replaceAll(" "); return utf8tweet; }