List of usage examples for java.util.regex Pattern CASE_INSENSITIVE
int CASE_INSENSITIVE
To view the source code for java.util.regex Pattern CASE_INSENSITIVE, click the Source Link.
From source file:de.dfki.km.perspecting.obie.model.Document.java
/*************************************************************************** * Gets the pure plain text out of a html text. All html tags are replaced * by spaces. To do so, the head is replaced, all remaining javascript tags * (including the content) and finally all remaining html tags. Thus, * absolute positioning is possible.//from w w w. j a va 2 s. c o m * * @param text * content of the html document as text * @return text where all html was replaced by spaces */ private String extractPlainTextFromHtml(String text) { Collection<Pattern> patterns = new ArrayList<Pattern>(3); // Delete the head, then all remaining javascript items that might exist // in the body, then all remaining html tags. patterns.add( Pattern.compile("<head.*/head>", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL)); // .*? makes it non greedy -> take the shortes match // DOTALL does also include new lines patterns.add(Pattern.compile("<script.*?/script>", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.DOTALL)); patterns.add(Pattern.compile("<.+?>", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE)); StringBuffer s = new StringBuffer(text); // Go for all patterns. for (Pattern p : patterns) { Matcher matcher = p.matcher(s); // As long as the matcher finds another occurance of the pattern we // replace it by the same number of spaces but keep new lines. while (matcher.find()) s.replace(matcher.start(), matcher.end(), matcher.group().replaceAll(".", " ")); } return s.toString(); }
From source file:com.github.ibole.infrastructure.persistence.db.mybatis.pagination.SqlHelper.java
/**
 * Deletes every case-insensitive match of {@code XSQL_ORDER_BY_REGEX} from
 * the given text and hands the remainder to {@code removeOrders}.
 *
 * @param string the text to clean; must not be {@code null}
 * @return whatever {@code removeOrders} returns for the cleaned text
 */
public static String removeXsqlBuilderOrders(String string) {
    Preconditions.checkNotNull(string);
    final Pattern xsqlOrderBy = Pattern.compile(XSQL_ORDER_BY_REGEX, Pattern.CASE_INSENSITIVE);
    // Replacing each match with the empty string drops it and keeps the rest.
    final String withoutXsqlOrders = xsqlOrderBy.matcher(string).replaceAll("");
    return removeOrders(withoutXsqlOrders);
}
From source file:by.heap.remark.convert.TextCleaner.java
/** * Configures the basic replacements based on the configured options. * @param options Options that will affect what is replaced. *///from w ww. j a v a 2 s.c om @SuppressWarnings({ "OverlyLongMethod" }) private void setupReplacements(Options options) { this.replacements = new HashMap<String, String>(); // build replacement regex StringBuilder entities = new StringBuilder(replacements.size() * 5); // this is a special case for double-encoded HTML entities. entities.append("&(?>amp;([#a-z0-9]++;)|(?>"); addRepl(entities, "&", "&"); addRepl(entities, "<", "<"); addRepl(entities, ">", ">"); addRepl(entities, """, "\""); if (options.reverseHtmlSmartQuotes) { addRepl(entities, "“", "\""); addRepl(entities, "”", "\""); addRepl(entities, "‘", "\'"); addRepl(entities, "’", "\'"); addRepl(entities, "'", "\'"); addRepl(entities, "«", "<<"); addRepl(entities, "»", ">>"); } if (options.reverseHtmlSmartPunctuation) { addRepl(entities, "–", "--"); addRepl(entities, "—", "---"); addRepl(entities, "…", "..."); } entities.replace(entities.length() - 1, entities.length(), ");)"); entityReplacementsPattern = Pattern.compile(entities.toString(), Pattern.CASE_INSENSITIVE); if (options.reverseUnicodeSmartPunctuation || options.reverseUnicodeSmartQuotes) { StringBuilder unicode = new StringBuilder("[\\Q"); if (options.reverseUnicodeSmartQuotes) { addRepl(unicode, "\u201c", "\""); // left double quote: addRepl(unicode, "\u201d", "\""); // right double quote: ? addRepl(unicode, "\u2018", "\'"); // left single quote: addRepl(unicode, "\u2019", "\'"); // right single quote: addRepl(unicode, "\u00ab", "<<"); // left angle quote: addRepl(unicode, "\u00bb", ">>"); // right angle quote: } if (options.reverseUnicodeSmartPunctuation) { addRepl(unicode, "\u2013", "--"); // en-dash: addRepl(unicode, "\u2014", "---"); // em-dash: addRepl(unicode, "\u2026", "..."); // ellipsis: } unicode.append("\\E]"); unicodeReplacementsPattern = Pattern.compile(unicode.toString()); } }
From source file:com.ponysdk.impl.query.memory.FilteringTools.java
public static List<String> filter(final List<String> datas, final String patternMatching) { if (patternMatching == null || datas == null) { return datas; }//from w w w .ja v a2 s . c om final List<String> validData = new ArrayList<>(); try { for (final String data : datas) { if (data == null) continue; if (data.equalsIgnoreCase(patternMatching)) { validData.add(data); continue; } // Now we can filter our data against the pattern final String text = normalisePattern(patternMatching.trim()); final Pattern pattern = Pattern.compile(REGEX_BEGIN + text + REGEX_END, Pattern.CASE_INSENSITIVE); Matcher matcher = pattern.matcher(data); if (matcher.find()) { validData.add(data); } else { matcher = pattern.matcher(""); if (matcher.find()) { validData.add(data); } } } } catch (final PatternSyntaxException e) { if (log.isDebugEnabled()) { log.debug("bad pattern : " + patternMatching); } } catch (final Exception e) { log.error("Filter Error => pattern : " + patternMatching, e); } return validData; }
From source file:com.md87.charliebravo.commands.IssueCommand.java
/**
 * Downloads the bug tracker page for the given issue id, scrapes the
 * category/value table cells and reports the issue to the response.
 *
 * <p>Sends a "not found" message when the page contains the tracker's
 * error marker, a "private" message when access is denied, and otherwise a
 * summary built from the scraped "id", "summary", "status" and "resolution"
 * cells, followed by an {@code IssueFollowup} carrying all scraped cells.
 *
 * @param handler  not used by this method
 * @param response receiver of the messages and the follow-up
 * @param line     issue id as typed by the user; appended to the URL as is
 * @throws Exception if downloading or processing the page fails
 */
protected void executeOldIssue(InputHandler handler, Response response, String line) throws Exception {
    final List<String> pageLines = Downloader.getPage("http://bugs.dmdirc.com/view.php?id=" + line);

    // Glue the lines together without separators so the regex can span them.
    final StringBuilder joined = new StringBuilder();
    for (String pageLine : pageLines) {
        joined.append(pageLine);
    }
    final String page = joined.toString();

    if (page.contains("APPLICATION ERROR #1100")) {
        response.sendMessage("That issue was not found", true);
        return;
    }

    if (page.contains("<p>Access Denied.</p>")) {
        response.sendMessage("that issue is private. Please see " + "http://bugs.dmdirc.com/view/" + line);
        return;
    }

    // Group 1: text of a "category" cell, group 2: text of the cell after it
    // (an HTML comment may sit between the two cells).
    final Pattern cellPair = Pattern.compile(
            "<td class=\"category\".*?>\\s*(.*?)\\s*"
                    + "</td>\\s*(?:<!--.*?-->\\s*)?<td.*?>\\s*(.*?)\\s*</td>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    final Map<String, String> data = new HashMap<String, String>();
    for (Matcher cells = cellPair.matcher(page); cells.find();) {
        data.put(cells.group(1).toLowerCase(), cells.group(2));
    }

    final String id = data.get("id");
    // substring(9) skips the first nine characters of the summary cell.
    response.sendMessage("issue " + id + " is \"" + data.get("summary").substring(9) + "\". Current "
            + "status is " + data.get("status") + " (" + data.get("resolution")
            + "). See http://bugs.dmdirc.com/view/" + id);
    response.addFollowup(new IssueFollowup(data));
}
From source file:de.espend.idea.shopware.util.ShopwareUtil.java
/**
 * Visits every subclass of {@code \Enlight_Controller_Action} whose class
 * name contains {@code _<module>_<controller>} for one of the given modules
 * (matched case-insensitively).
 *
 * @param project                project whose PHP index is searched
 * @param controllerClassVisitor called with the class, the matched module
 *                               name and the matched controller name
 * @param modules                module names; joined with '|' into the regex
 *                               without any escaping
 */
public static void collectControllerClass(Project project, ControllerClassVisitor controllerClassVisitor,
        String... modules) {
    final Collection<PhpClass> controllerCandidates =
            PhpIndex.getInstance(project).getAllSubclasses("\\Enlight_Controller_Action");

    final String moduleAlternation = StringUtils.join(modules, "|");
    final Pattern controllerNamePattern =
            Pattern.compile(".*_(" + moduleAlternation + ")_(\\w+)", Pattern.CASE_INSENSITIVE);

    for (PhpClass candidate : controllerCandidates) {
        final Matcher nameMatch = controllerNamePattern.matcher(candidate.getName());
        if (!nameMatch.find()) {
            continue;
        }
        // group(1) is the module, group(2) the controller part of the name.
        controllerClassVisitor.visitClass(candidate, nameMatch.group(1), nameMatch.group(2));
    }
}
From source file:io.apiman.plugins.simpleheaderpolicy.beans.SimpleHeaderPolicyDefBean.java
/**
 * Combines the stripped pattern of every item into one case-insensitive
 * regex in which the individual patterns are alternatives separated by '|'.
 *
 * @param itemList items providing the individual patterns
 * @return the compiled alternation; for an empty list this is the empty pattern
 */
@SuppressWarnings("nls")
private Pattern buildRegex(List<StripHeaderBean> itemList) {
    final StringBuilder alternation = new StringBuilder();
    boolean first = true;
    for (StripHeaderBean item : itemList) {
        final String stripped = StringUtils.strip(item.getPattern());
        // Separator goes before every pattern except the first one.
        if (!first) {
            alternation.append("|");
        }
        first = false;
        alternation.append(stripped);
    }
    return Pattern.compile(alternation.toString(), Pattern.CASE_INSENSITIVE);
}
From source file:com.intuit.tank.script.replace.AbstractReplacement.java
/**
 * Tells whether the whole value matches the search query, ignoring case.
 * The query is first converted to a regular expression by
 * {@code RegexUtil.wildcardToRegexp}.
 *
 * @param searchQuery
 *            the search query
 * @param replaceString
 *            the replacement string; not used by this method
 * @param value
 *            the value to be searched in
 * @return {@code true} if the entire value matches the converted query
 */
private boolean isMatch(String searchQuery, String replaceString, String value) {
    final String regex = RegexUtil.wildcardToRegexp(searchQuery);
    return Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(value).matches();
}
From source file:com.ebay.nest.io.sede.RegexSerDe.java
@Override public void initialize(Configuration conf, Properties tbl) throws SerDeException { // We can get the table definition from tbl. // Read the configuration parameters inputRegex = tbl.getProperty("input.regex"); String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS); String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES); boolean inputRegexIgnoreCase = "true".equalsIgnoreCase(tbl.getProperty("input.regex.case.insensitive")); // output format string is not supported anymore, warn user of deprecation if (null != tbl.getProperty("output.format.string")) { LOG.warn("output.format.string has been deprecated"); }/*from ww w . ja va2 s .c o m*/ // Parse the configuration parameters if (inputRegex != null) { inputPattern = Pattern.compile(inputRegex, Pattern.DOTALL + (inputRegexIgnoreCase ? Pattern.CASE_INSENSITIVE : 0)); } else { inputPattern = null; throw new SerDeException("This table does not have serde property \"input.regex\"!"); } List<String> columnNames = Arrays.asList(columnNameProperty.split(",")); columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); assert columnNames.size() == columnTypes.size(); numColumns = columnNames.size(); /* Constructing the row ObjectInspector: * The row consists of some set of primitive columns, each column will * be a java object of primitive type. 
*/ List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(columnNames.size()); for (int c = 0; c < numColumns; c++) { TypeInfo typeInfo = columnTypes.get(c); String typeName = typeInfo.getTypeName(); if (typeName.equals(serdeConstants.STRING_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); } else if (typeName.equals(serdeConstants.TINYINT_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaByteObjectInspector); } else if (typeName.equals(serdeConstants.SMALLINT_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaShortObjectInspector); } else if (typeName.equals(serdeConstants.INT_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector); } else if (typeName.equals(serdeConstants.BIGINT_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector); } else if (typeName.equals(serdeConstants.FLOAT_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaFloatObjectInspector); } else if (typeName.equals(serdeConstants.DOUBLE_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaDoubleObjectInspector); } else if (typeName.equals(serdeConstants.BOOLEAN_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector); } else if (typeName.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaTimestampObjectInspector); } else if (typeName.equals(serdeConstants.DATE_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaDateObjectInspector); } else if (typeName.equals(serdeConstants.DECIMAL_TYPE_NAME)) { columnOIs.add(PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector); } else if (typeInfo instanceof PrimitiveTypeInfo && ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory() == PrimitiveCategory.VARCHAR) { VarcharTypeParams varcharParams = (VarcharTypeParams) ParameterizedPrimitiveTypeUtils .getTypeParamsFromTypeInfo(typeInfo); 
columnOIs.add(PrimitiveObjectInspectorFactory .getPrimitiveJavaObjectInspector((PrimitiveTypeInfo) typeInfo)); } else { throw new SerDeException(getClass().getName() + " doesn't allow column [" + c + "] named " + columnNames.get(c) + " with type " + columnTypes.get(c)); } } // StandardStruct uses ArrayList to store the row. rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs); row = new ArrayList<Object>(numColumns); // Constructing the row object, etc, which will be reused for all rows. for (int c = 0; c < numColumns; c++) { row.add(null); } outputFields = new Object[numColumns]; outputRowText = new Text(); }
From source file:uk.ac.kcl.at.ElasticGazetteerAcceptanceTest.java
@Test public void deidentificationPerformanceTest() { dbmsTestUtils.createBasicInputTable(); dbmsTestUtils.createBasicOutputTable(); dbmsTestUtils.createDeIdInputTable(); List<Mutant> mutants = testUtils.insertTestDataForDeidentification(env.getProperty("tblIdentifiers"), env.getProperty("tblInputDocs"), mutatortype, true); int totalTruePositives = 0; int totalFalsePositives = 0; int totalFalseNegatives = 0; for (Mutant mutant : mutants) { Set<Pattern> mutatedPatterns = new HashSet<>(); mutant.setDeidentifiedString(elasticGazetteerService.deIdentifyString(mutant.getFinalText(), String.valueOf(mutant.getDocumentid()))); Set<String> set = new HashSet<>(mutant.getOutputTokens()); mutatedPatterns.addAll(//from www.ja v a 2 s.c o m set.stream().map(string -> Pattern.compile(Pattern.quote(string), Pattern.CASE_INSENSITIVE)) .collect(Collectors.toSet())); List<MatchResult> results = new ArrayList<>(); for (Pattern pattern : mutatedPatterns) { Matcher matcher = pattern.matcher(mutant.getFinalText()); while (matcher.find()) { results.add(matcher.toMatchResult()); } } int truePositives = getTruePositiveTokenCount(mutant); int falsePositives = getFalsePositiveTokenCount(mutant); int falseNegatives = getFalseNegativeTokenCount(mutant); System.out.println("Doc ID " + mutant.getDocumentid() + " has " + falseNegatives + " unmasked identifiers from a total of " + (falseNegatives + truePositives)); System.out.println("Doc ID " + mutant.getDocumentid() + " has " + falsePositives + " inaccurately masked tokens from a total of " + (falsePositives + truePositives)); System.out.println("TP: " + truePositives + " FP: " + falsePositives + " FN: " + falseNegatives); System.out.println("Doc ID precision " + calcPrecision(falsePositives, truePositives)); System.out.println("Doc ID recall " + calcRecall(falseNegatives, truePositives)); System.out.println(mutant.getDeidentifiedString()); System.out.println(mutant.getFinalText()); System.out.println(mutant.getInputTokens()); 
System.out.println(mutant.getOutputTokens()); System.out.println(); if (env.getProperty("elasticgazetteerTestOutput") != null) { try { try (BufferedWriter bw = new BufferedWriter( new FileWriter(new File(env.getProperty("elasticgazetteerTestOutput") + File.separator + mutant.getDocumentid())))) { bw.write("Doc ID " + mutant.getDocumentid() + " has " + falseNegatives + " unmasked identifiers from a total of " + (falseNegatives + truePositives)); bw.newLine(); bw.write("Doc ID " + mutant.getDocumentid() + " has " + falsePositives + " inaccurately masked tokens from a total of " + (falsePositives + truePositives)); bw.newLine(); bw.write("TP: " + truePositives + " FP: " + falsePositives + " FN: " + falseNegatives); bw.newLine(); bw.write("Doc ID precision " + calcPrecision(falsePositives, truePositives)); bw.newLine(); bw.write("Doc ID recall " + calcRecall(falseNegatives, truePositives)); bw.newLine(); bw.write(mutant.getDeidentifiedString()); bw.newLine(); bw.write(mutant.getFinalText()); bw.newLine(); bw.write(mutant.getInputTokens().toString()); bw.newLine(); bw.write(mutant.getOutputTokens().toString()); } } catch (IOException e) { e.printStackTrace(); } } totalTruePositives += truePositives; totalFalsePositives += falsePositives; totalFalseNegatives += falseNegatives; } DecimalFormat df = new DecimalFormat("#.#"); df.setRoundingMode(RoundingMode.CEILING); System.out.println(); System.out.println(); System.out.println("THIS RUN TP: " + totalTruePositives + " FP: " + totalFalsePositives + " FN: " + totalFalseNegatives); System.out.println("Doc ID precision " + calcPrecision(totalFalsePositives, totalTruePositives)); System.out.println("Doc ID recall " + calcRecall(totalFalseNegatives, totalTruePositives)); System.out.println(totalTruePositives + " & " + totalFalsePositives + " & " + totalFalseNegatives + " & " + df.format(calcPrecision(totalFalsePositives, totalTruePositives)) + " & " + df.format(calcRecall(totalFalseNegatives, totalTruePositives)) + " \\\\"); if 
(env.getProperty("elasticgazetteerTestOutput") != null) { try { try (BufferedWriter bw = new BufferedWriter(new FileWriter( new File(env.getProperty("elasticgazetteerTestOutput") + File.separator + "summary")))) { bw.write("THIS RUN TP: " + totalTruePositives + " FP: " + totalFalsePositives + " FN: " + totalFalseNegatives); bw.newLine(); bw.write("Doc ID precision " + calcPrecision(totalFalsePositives, totalTruePositives)); bw.newLine(); bw.write("Doc ID recall " + calcRecall(totalFalseNegatives, totalTruePositives)); } } catch (IOException e) { e.printStackTrace(); } } }