List of usage examples for the java.util.regex.Matcher.end method
public int end(String name)
From source file:tr.edu.gsu.nerwip.recognition.internal.modelless.subee.Subee.java
/**
 * Takes advantage of hyperlinks in the text, in order
 * to detect entities. Most of the time, in a Wikipedia
 * article, the hyperlink is defined only for the very
 * first occurrence of the entity. For this reason,
 * an additional processing is required to find the possible
 * other occurrences (cf. {@link #processOccurrences(Article, List)}).
 * <p>
 * For every hyperlink whose text passes {@code StringTools.hasInitial}, the last
 * path segment of the link target is URL-decoded, escaped as an MQL key and used
 * to look up Freebase types. An entity is created only when a type could be
 * retrieved, the text contains at least one letter, and the text is not discarded
 * as a demonym. When acronyms are enabled, an extra entity may be created for a
 * parenthesised acronym located inside or right after the link text.
 *
 * @param article
 *       Processed article.
 * @return
 *       The list of entities detected by this method.
 *
 * @throws ParserException
 *       Problem while parsing the hyperlinks.
 * @throws ClientProtocolException
 *       Problem while accessing Freebase.
 * @throws ParseException
 *       Problem while accessing Freebase.
 * @throws IOException
 *       Problem while accessing Freebase.
 * @throws org.json.simple.parser.ParseException
 *       Problem while accessing Freebase.
 */
private List<AbstractEntity<?>> processHyperlinks(Article article) throws ParserException,
        ClientProtocolException, ParseException, IOException, org.json.simple.parser.ParseException {
    logger.increaseOffset();
    List<AbstractEntity<?>> result = new ArrayList<AbstractEntity<?>>();

    // parse linked text to automatically get hyperlink list
    // (the text is wrapped between TAG_PAR_START and TAG_PAR_END before being parsed)
    logger.log("Get hyperlink list");
    String linkedText = article.getLinkedText();
    Parser parser = new Parser(TAG_PAR_START + linkedText + TAG_PAR_END);
    NodeList linkList = parser.parse(new TagNameFilter(TAG_LINK));
    // number of characters to subtract from a parser position; starts with the
    // length of the wrapping start tag and grows with each processed hyperlink
    int offset = TAG_PAR_START.length();

    // process each hyperlink
    logger.log("Process each hyperlink");
    logger.increaseOffset();
    for (int i = 0; i < linkList.size(); i++) {
        LinkTag linkTag = (LinkTag) linkList.elementAt(i);
        String valueStr = linkTag.getLinkText();
        int length = valueStr.length();
        String test = linkTag.toHtml();
        logger.log("Hyperlink '" + test + "'");

        // get type from Freebase
        EntityType type = null;
        // only process strings with uppercase initial
        if (StringTools.hasInitial(valueStr)) {
            // the Wikipedia title is taken from the last '/'-separated part of the link target
            String hyperlink = linkTag.getLink();
            String[] linkParts = hyperlink.split("/");
            String lastPart = linkParts[linkParts.length - 1];
            String wikipediaTitle = URLDecoder.decode(lastPart, "UTF-8"); //TODO we may take advantage of this to automatically detect the type
            String wikipediaTitleEscaped = FbCommonTools.escapeMqlKey(wikipediaTitle); //TODO or this
            logger.log("Wikipedia title: " + wikipediaTitle);
            logger.log("Escaped Wikipedia title: " + wikipediaTitleEscaped);

            // use only the notable type
            if (notableType) {
                String possibleType = FbTypeTools.getNotableType(wikipediaTitleEscaped);
                if (possibleType == null)
                    logger.log("No notable Freebase type found for \"" + valueStr + "\"");
                else {
                    List<String> possibleTypes = new ArrayList<String>();
                    possibleTypes.add(possibleType);
                    type = retrieveEntityType(possibleTypes);
                }
            }

            // use all available types
            // (also reached when the notable type lookup above did not yield an entity type)
            if (type == null) {
                List<String> possibleTypes = FbTypeTools.getAllTypes(wikipediaTitleEscaped);
                logger.log("Possible types: " + possibleTypes.toString());
                if (possibleTypes.isEmpty())
                    logger.log("WARNING: no Freebase type found at all for \"" + valueStr + "\"");
                else
                    type = retrieveEntityType(possibleTypes);
            }
        }

        // set up the entity position
        // NOTE(review): positions are presumably relative to the article raw text
        // (see the commented-out debug code below) -- confirm
        int startPos = linkTag.getStartPosition() - offset;
        int endPos = startPos + length;
        // the markup of this link (its HTML length minus its text length) must be
        // subtracted from the positions of all the following links
        offset = offset + test.length() - length;
        //debug
        //String text = article.getRawText();
        //String valueStr2 = text.substring(startPos,endPos);
        //boolean test2 = valueStr.equals(valueStr2);
        //if(!test2)
        //	System.out.println("ERROR: entity and article do not match (position problem)");

        // no type: we can't create the entity
        if (type == null) {
            logger.log("WARNING: no entity was created, because no type could be identified for \"" + valueStr + "\"");
        }
        // otherwise, we try
        else {
            // ignore if purely numerical
            if (StringTools.hasNoLetter(valueStr))
                logger.log("The string is only numerical (no letters) so no entity is created for " + valueStr);
            // ignore if recognized as a location/organization but actually a demonym
            else if (discardDemonyms && (type == EntityType.LOCATION || type == EntityType.ORGANIZATION)
                    && DEMONYMS.contains(valueStr))
                logger.log("The string is in the demonym list, so no entity is created for " + valueStr);
            else {
                //debug
                //if(valueStr.equalsIgnoreCase("Irish"))
                //	System.out.print("");

                // possibly look for an acronym
                if (useAcronyms) {
                    // only organization and locations have relevant acronyms
                    // (for a person, acronyms usually correspond to titles or awards)
                    if (type == EntityType.ORGANIZATION || type == EntityType.LOCATION) {
                        // check if there's an acronym inside the entity name itself:
                        // a parenthesised group at the very end of the link text,
                        // containing neither '(' nor a lowercase a-z letter
                        Pattern r = Pattern.compile("\\([^\\(a-z]+?\\)$"); // must be in uppercase
                        Matcher m = r.matcher(valueStr);
                        if (m.find()) {
                            // create an additional entity (acronym) with the same type
                            // the pattern defines no capturing group, so groupCount() is 0
                            // and 'last' designates the whole match, parentheses included
                            int last = m.groupCount();
                            String acro = m.group(last);
                            int l = acro.length();
                            // strip the surrounding parentheses, and shift the positions accordingly
                            acro = acro.substring(1, l - 1);
                            int s = startPos + m.start(last) + 1;
                            int e = startPos + m.end(last) - 1;
                            if (!StringTools.hasNoLetter(acro)) {
                                //debug
                                //String valueStr3 = text.substring(s,e);
                                //boolean test3 = acro.equals(valueStr3);
                                //if(!test3)
                                //	System.out.println("ERROR: entity acronym and article do not match (position problem)");
                                AbstractEntity<?> entity = AbstractEntity.build(type, s, e, RecognizerName.SUBEE, acro);
                                result.add(entity);
                                logger.log("Creation of an extra entity (acronym) " + entity);
                            }
                            // remove the acronym from the original string
                            // (done even when no acronym entity was created above)
                            valueStr = valueStr.substring(0, valueStr.length() - l).trim();
                            endPos = startPos + valueStr.length();
                        }
                        // check if there's an acronym right after the entity
                        else {
                            // same pattern as above, but not anchored at the end of the input
                            r = Pattern.compile("\\([^\\(a-z]+?\\)"); // must be in uppercase
                            m = r.matcher(linkedText);
                            // the search starts from the end of the closing tag of the link,
                            // converted to a position in linkedText
                            if (m.find(linkTag.getEndTag().getEndPosition() - TAG_PAR_START.length())) {
                                // possibly create an additional entity (acronym) with the same type
                                // no capturing group here either: 'last' is 0, i.e. the whole match
                                int last = m.groupCount();
                                String acro = m.group(last);
                                acro = acro.substring(1, acro.length() - 1);
                                int s = m.start(last) - 1 - (offset - TAG_PAR_END.length()) + 1; // actually <a/> and not <p/>, but same length...
                                // the acronym must be right after the original entity
                                if (s == endPos + 2 && !StringTools.hasNoLetter(acro)) {
                                    int e = m.end(last) - 1 - (offset - TAG_PAR_END.length()) - 1;
                                    //debug
                                    //String valueStr3 = text.substring(s,e);
                                    //boolean test3 = acro.equals(valueStr3);
                                    //if(!test3)
                                    //	System.out.println("ERROR: entity acronym and article do not match (position problem)");
                                    AbstractEntity<?> entity = AbstractEntity.build(type, s, e, RecognizerName.SUBEE, acro);
                                    result.add(entity);
                                    logger.log("Creation of an extra entity (acronym) " + entity);
                                }
                            }
                        }
                    }
                }

                // create the entity
                // (valueStr and endPos may have been shortened by the acronym processing above)
                AbstractEntity<?> entity = AbstractEntity.build(type, startPos, endPos, RecognizerName.SUBEE, valueStr);
                result.add(entity);
                logger.log("Creation of the entity " + entity);
            }
        }
    }
    // one call for each increaseOffset() performed above
    logger.decreaseOffset();
    logger.decreaseOffset();
    return result;
}
From source file:com.akop.bach.parser.XboxLiveParser.java
/**
 * Returns the given icon location with the text captured by group 1 of
 * {@code PATTERN_LOADBAL_ICON} cut out of it.
 *
 * @param loadBalIcon icon location to convert; may be {@code null}
 * @return {@code null} when the argument is {@code null}; the argument itself
 *         when the pattern is not found in it; otherwise the argument without
 *         the characters matched by group 1 of the first match
 */
public static String getStandardIcon(String loadBalIcon) {
    if (loadBalIcon == null) {
        return null;
    }

    final Matcher iconMatcher = PATTERN_LOADBAL_ICON.matcher(loadBalIcon);
    if (iconMatcher.find()) {
        // keep what precedes and what follows group 1, drop the group itself
        return loadBalIcon.substring(0, iconMatcher.start(1))
                + loadBalIcon.substring(iconMatcher.end(1));
    }
    return loadBalIcon;
}
From source file:de.escalon.hypermedia.spring.PartialUriTemplate.java
/** * Creates a new {@link PartialUriTemplate} using the given template string. * * @param template must not be {@literal null} or empty. *///w ww .ja v a 2 s . c om public PartialUriTemplate(String template) { Assert.hasText(template, "Template must not be null or empty!"); Matcher matcher = VARIABLE_REGEX.matcher(template); // first group is the variable start without leading {: "", "/", "?", "#", // second group is the comma-separated name list without the trailing } of the variable int endOfPart = 0; while (matcher.find()) { // 0 is the current match, i.e. the entire variable expression int startOfPart = matcher.start(0); // add part before current match if (endOfPart < startOfPart) { final String partWithoutVariables = template.substring(endOfPart, startOfPart); final StringTokenizer stringTokenizer = new StringTokenizer(partWithoutVariables, "?", true); boolean inQuery = false; while (stringTokenizer.hasMoreTokens()) { final String token = stringTokenizer.nextToken(); if ("?".equals(token)) { inQuery = true; } else { if (!inQuery) { urlComponents.add(token); } else { urlComponents.add("?" 
+ token); } variableIndices.add(Collections.<Integer>emptyList()); } } } endOfPart = matcher.end(0); // add current match as part final String variablePart = template.substring(startOfPart, endOfPart); urlComponents.add(variablePart); // collect variablesInPart and track for each part which variables it contains // group(1) is the variable head without the leading { TemplateVariable.VariableType type = TemplateVariable.VariableType.from(matcher.group(1)); // group(2) is the String[] names = matcher.group(2).split(","); List<Integer> variablesInPart = new ArrayList<Integer>(); for (String name : names) { TemplateVariable variable = new TemplateVariable(name, type); variablesInPart.add(variables.size()); variables.add(variable); variableNames.add(name); } variableIndices.add(variablesInPart); } // finish off remaining part if (endOfPart < template.length()) { urlComponents.add(template.substring(endOfPart)); variableIndices.add(Collections.<Integer>emptyList()); } }
From source file:com.github.hateoas.forms.affordance.PartialUriTemplate.java
/** * Creates a new {@link PartialUriTemplate} using the given template string. * * @param template must not be {@literal null} or empty. *///from w w w. j a v a 2 s .c o m public PartialUriTemplate(String template) { Assert.hasText(template, "Template must not be null or empty!"); Matcher matcher = VARIABLE_REGEX.matcher(template); // first group is the variable start without leading {: "", "/", "?", "#", // second group is the comma-separated name list without the trailing } of the variable int endOfPart = 0; while (matcher.find()) { // 0 is the current match, i.e. the entire variable expression int startOfPart = matcher.start(0); // add part before current match if (endOfPart < startOfPart) { final String partWithoutVariables = template.substring(endOfPart, startOfPart); final StringTokenizer stringTokenizer = new StringTokenizer(partWithoutVariables, "?", true); boolean inQuery = false; while (stringTokenizer.hasMoreTokens()) { final String token = stringTokenizer.nextToken(); if ("?".equals(token)) { inQuery = true; } else { if (!inQuery) { urlComponents.add(token); } else { urlComponents.add("?" 
+ token); } variableIndices.add(Collections.<Integer>emptyList()); } } } endOfPart = matcher.end(0); // add current match as part final String variablePart = template.substring(startOfPart, endOfPart); urlComponents.add(variablePart); // collect variablesInPart and track for each part which variables it contains // group(1) is the variable head without the leading { TemplateVariable.VariableType type = TemplateVariable.VariableType.from(matcher.group(1)); // group(2) are the variable names String[] names = matcher.group(2).split(","); List<Integer> variablesInPart = new ArrayList<Integer>(); for (String name : names) { TemplateVariable variable = new TemplateVariable(name, type); variablesInPart.add(variables.size()); variables.add(variable); variableNames.add(name); } variableIndices.add(variablesInPart); } // finish off remaining part if (endOfPart < template.length()) { urlComponents.add(template.substring(endOfPart)); variableIndices.add(Collections.<Integer>emptyList()); } }
From source file:com.cyberway.issue.crawler.extractor.ExtractorHTML.java
/**
 * Scans the attributes of one tag and hands every URI-bearing value to the
 * matching handler ({@code processLink}, {@code processEmbed},
 * {@code processScriptCode} or {@code ExtractorCSS.processStyleCode}).
 * <p>
 * Attributes are recognized through the capturing groups of
 * {@code EACH_ATTRIBUTE_EXTRACTOR}: groups 2 to 13 tell which kind of attribute
 * matched, groups 14 to 16 hold the attribute value. CODEBASE/resource
 * attributes and FORM ACTION/METHOD attributes are only collected inside the
 * loop and are handled once all the attributes have been seen.
 *
 * @param curi    URI being processed; receives the extracted links and errors
 * @param element name of the tag, compared case-insensitively against LINK,
 *                BASE, FRAME, IFRAME and APPLET
 * @param cs      character sequence the attribute pattern is applied to
 *                (presumably the text of the tag -- confirm against the caller)
 */
protected void processGeneralTag(CrawlURI curi, CharSequence element, CharSequence cs) {
    Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);

    // Just in case it's an OBJECT or APPLET tag
    String codebase = null;
    ArrayList<String> resources = null;

    // Just in case it's a FORM
    CharSequence action = null;
    CharSequence actionContext = null;
    CharSequence method = null;

    // settings read once, before the attribute loop
    final boolean framesAsEmbeds = ((Boolean) getUncheckedAttribute(curi, ATTR_TREAT_FRAMES_AS_EMBED_LINKS))
            .booleanValue();
    final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(curi, ATTR_IGNORE_FORM_ACTION_URLS))
            .booleanValue();
    final boolean extractValueAttributes = ((Boolean) getUncheckedAttribute(curi, EXTRACT_VALUE_ATTRIBUTES))
            .booleanValue();

    final String elementStr = element.toString();

    while (attr.find()) {
        // the value is in the first of groups 14, 15, 16 that took part in the match
        // (start() returns -1 for a group that did not match anything)
        int valueGroup = (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
        int start = attr.start(valueGroup);
        int end = attr.end(valueGroup);
        assert start >= 0 : "Start is: " + start + ", " + curi;
        assert end >= 0 : "End is :" + end + ", " + curi;
        CharSequence value = cs.subSequence(start, end);
        value = TextUtils.unescapeHtml(value);
        if (attr.start(2) > -1) {
            // HREF
            CharSequence context = Link.elementContext(element, attr.group(2));
            if (elementStr.equalsIgnoreCase(LINK)) {
                // <LINK> elements treated as embeds (css, ico, etc)
                processEmbed(curi, value, context);
            } else {
                // other HREFs treated as links
                // NOTE(review): prints to standard output for any HREF value containing
                // "java"; looks like leftover debug output -- confirm before removing
                if (value.toString().indexOf("java") != -1)
                    System.out.println(value + "--------javascript--------");
                processLink(curi, value, context);
            }
            if (elementStr.equalsIgnoreCase(BASE)) {
                // a BASE element also changes the base URI of the document
                try {
                    curi.setBaseURI(value.toString());
                } catch (URIException e) {
                    if (getController() != null) {
                        // Controller can be null: e.g. when running
                        // ExtractorTool.
                        getController().logUriError(e, curi.getUURI(), value.toString());
                    } else {
                        logger.info("Failed set base uri: " + curi + ", " + value.toString() + ": "
                                + e.getMessage());
                    }
                }
            }
        } else if (attr.start(3) > -1) {
            // ACTION
            if (!ignoreFormActions) {
                action = value;
                actionContext = Link.elementContext(element, attr.group(3));
                // handling finished only at end (after METHOD also collected)
            }
        } else if (attr.start(4) > -1) {
            // ON____
            processScriptCode(curi, value); // TODO: context?
        } else if (attr.start(5) > -1) {
            // SRC etc.
            CharSequence context = Link.elementContext(element, attr.group(5));

            // true, if we expect another HTML page instead of an image etc.
            final char hopType;

            // FRAME/IFRAME sources are navigation links unless configured as embeds
            if (!framesAsEmbeds
                    && (elementStr.equalsIgnoreCase(FRAME) || elementStr.equalsIgnoreCase(IFRAME))) {
                hopType = Link.NAVLINK_HOP;
            } else {
                hopType = Link.EMBED_HOP;
            }
            processEmbed(curi, value, context, hopType);
        } else if (attr.start(6) > -1) {
            // CODEBASE
            // processed as an embed right away, and kept to resolve the resources afterwards
            codebase = (value instanceof String) ? (String) value : value.toString();
            CharSequence context = Link.elementContext(element, attr.group(6));
            processEmbed(curi, codebase, context);
        } else if (attr.start(7) > -1) {
            // CLASSID, DATA
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            resources.add(value.toString());
        } else if (attr.start(8) > -1) {
            // ARCHIVE
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            // the value may hold several entries, split with the WHITESPACE pattern
            String[] multi = TextUtils.split(WHITESPACE, value);
            for (int i = 0; i < multi.length; i++) {
                resources.add(multi[i]);
            }
        } else if (attr.start(9) > -1) {
            // CODE
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            // If element is applet and code value does not end with
            // '.class' then append '.class' to the code value.
            if (elementStr.equalsIgnoreCase(APPLET) && !value.toString().toLowerCase().endsWith(CLASSEXT)) {
                resources.add(value.toString() + CLASSEXT);
            } else {
                resources.add(value.toString());
            }
        } else if (attr.start(10) > -1) {
            // VALUE, with possibility of URI
            if (extractValueAttributes && TextUtils.matches(LIKELY_URI_PATH, value)) {
                CharSequence context = Link.elementContext(element, attr.group(10));
                processLink(curi, value, context);
            }
        } else if (attr.start(11) > -1) {
            // STYLE inline attribute
            // then, parse for URIs
            this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi, value, getController());
        } else if (attr.start(12) > -1) {
            // METHOD
            method = value;
            // form processing finished at end (after ACTION also collected)
        } else if (attr.start(13) > -1) {
            // any other attribute
            // ignore for now
            // could probe for path- or script-looking strings, but
            // those should be vanishingly rare in other attributes,
            // and/or symptomatic of page bugs
        }
    }
    // NOTE(review): not reached if one of the handlers above throws -- confirm
    // whether the matcher should be recycled in a finally block
    TextUtils.recycleMatcher(attr);

    // finish handling codebase/resources now that all available
    if (resources != null) {
        // NOTE(review): raw Iterator although 'resources' is an ArrayList<String>
        Iterator iter = resources.iterator();
        UURI codebaseURI = null;
        // declared outside the try block so that the catch clause below can log it
        String res = null;
        try {
            if (codebase != null) {
                // TODO: Pass in the charset.
                codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);
            }
            while (iter.hasNext()) {
                res = iter.next().toString();
                res = (String) TextUtils.unescapeHtml(res);
                if (codebaseURI != null) {
                    // resources are resolved against the codebase when there is one
                    res = codebaseURI.resolve(res).toString();
                }
                processEmbed(curi, res, element); // TODO: include attribute too
            }
        } catch (URIException e) {
            curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
        } catch (IllegalArgumentException e) {
            DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" + "codebase=" + codebase + " res=" + res
                    + "\n" + DevUtils.extraInfo(), e);
        }
    }

    // finish handling form action, now method is available
    if (action != null) {
        // the action is followed when no method was given, when the method is GET,
        // or when extraction is not restricted to GET forms
        if (method == null || "GET".equalsIgnoreCase(method.toString())
                || !((Boolean) getUncheckedAttribute(curi, ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue()) {
            processLink(curi, action, actionContext);
        }
    }
}
From source file:net.java.sip.communicator.impl.gui.main.chat.ChatConversationPanel.java
/** * Process provided replacers one by one sequentially. The output of the * first replacer is then fed as input into the second replacer, and so on. * <p>/*from w w w.j ava 2 s . c om*/ * {@link Replacer}s that expect HTML content ( * {@link Replacer#expectsPlainText()}) will typically receive the complete * message as an argument. {@linkplain Replacer}s that expect plain text * content will typically receive small pieces that are found in between * HTML tags. The pieces of plain text content cannot be predicted as * results change when they are processed by other replacers. * </p> * * @param content the original content to process * @param replacers the replacers to call * @return returns the final result message content after it has been * processed by all replacers */ private String processReplacers(final String content, final Replacer... replacers) { StringBuilder source = new StringBuilder(content); for (final Replacer replacer : replacers) { final StringBuilder target = new StringBuilder(); if (replacer.expectsPlainText()) { int startPos = 0; final Matcher plainTextInHtmlMatcher = TEXT_TO_REPLACE_PATTERN.matcher(source); while (plainTextInHtmlMatcher.find()) { final String plainTextAsHtml = plainTextInHtmlMatcher.group(1); final int startMatchPosition = plainTextInHtmlMatcher.start(1); final int endMatchPosition = plainTextInHtmlMatcher.end(1); target.append(source.substring(startPos, startMatchPosition)); final String plaintext = StringEscapeUtils.unescapeHtml4(plainTextAsHtml); // Invoke replacer. try { replacer.replace(target, plaintext); } catch (RuntimeException e) { logger.error("An error occurred in replacer: " + replacer.getClass().getName(), e); } startPos = endMatchPosition; } target.append(source.substring(startPos)); } else { // Invoke replacer. 
try { replacer.replace(target, source.toString()); } catch (RuntimeException e) { logger.error("An error occurred in replacer: " + replacer.getClass().getName(), e); } } source = target; } return source.toString(); }
From source file:com.github.gekoh.yagen.ddl.CreateDDL.java
/**
 * Replaces, in a sequence creation statement, the text captured by group 1 of
 * {@code SEQ_CREATE_PATTERN} with the name returned for it by the naming
 * strategy of the current profile.
 *
 * @param dialect not used by this method
 * @param sqlCreate statement to update
 * @param type not used by this method
 * @return the updated statement, or {@code sqlCreate} unchanged when the
 *         pattern is not found in it
 */
public String updateCreateSequence(Dialect dialect, String sqlCreate, Type type) {
    final Matcher sequenceMatcher = SEQ_CREATE_PATTERN.matcher(sqlCreate);
    if (!sequenceMatcher.find()) {
        return sqlCreate;
    }

    final String beforeName = sqlCreate.substring(0, sequenceMatcher.start(1));
    final String mappedName = getProfile().getNamingStrategy().sequenceName(sequenceMatcher.group(1));
    final String afterName = sqlCreate.substring(sequenceMatcher.end(1));
    return beforeName + mappedName + afterName;
}
From source file:com.dwdesign.tweetings.util.Utils.java
public static final int matcherEnd(final Matcher matcher, final int group) { try {/* w w w.j a v a 2 s. c o m*/ return matcher.end(group); } catch (final IllegalStateException e) { // Ignore. } return -1; }
From source file:com.github.gekoh.yagen.ddl.CreateDDL.java
public String updateCreateIndex(Dialect dialect, StringBuffer buf, String name, Table table, List<org.hibernate.mapping.Column> columns) { String newName = getProfile().getNamingStrategy().indexName(name); if (!name.equals(newName)) { Matcher matcher = IDX_CREATE_PATTERN.matcher(buf.toString()); if (matcher.find()) { StringBuilder builder = new StringBuilder(); builder.append(buf.substring(0, matcher.start(2))); builder.append(newName);//from w w w. j a v a 2s.c om builder.append(buf.substring(matcher.end(2))); buf = new StringBuffer(builder.toString()); } name = newName; } String tableNameLC = getProfile().getNamingStrategy().tableName(table.getName()).toLowerCase(); if (!renderTable(tableNameLC)) { return "-- skipped creation of index '" + name + "' for table '" + tableNameLC + "' as the mapped entity was not chosen to be processed"; } if (externalViews.contains(tableNameLC)) { return "-- skipped creation of index '" + name + "' on table '" + tableNameLC + "' since there is a view in place"; } TableConfig tableConfig = tblNameToConfig.get(tableNameLC); checkObjectName(dialect, name); IntervalPartitioning partitioning = tableConfig.getTableAnnotationOfType(IntervalPartitioning.class); if (partitioning != null && supportsPartitioning(dialect)) { Matcher matcher = IDX_CREATE_PATTERN.matcher(buf.toString()); // find create index and define local not for unique indexes if (matcher.find() && matcher.group(1) == null) { buf.append(" local"); } } String i18nFK = tableConfig.getI18nBaseEntityFkCol(); if (i18nFK != null) { StringBuilder sql = new StringBuilder(); String i18nTblName = getI18NDetailTableName(tableNameLC); if (columns.size() == 1) { if (hasIndex(table, i18nTblName, columns.get(0))) { return "-- table " + i18nTblName + " already has an index on column " + columns.get(0).getName(); } tblColNameHasSingleColIndex.add(i18nTblName + "." 
+ columns.get(0).getName().toLowerCase()); } Matcher matcher = IDX_CREATE_PATTERN.matcher(buf.toString()); if (matcher.find()) { sql.append(buf.substring(0, matcher.start(3))).append(i18nTblName) .append(buf.substring(matcher.end(3))); } getProfile().duplex(ObjectType.INDEX, name, sql.toString()); return sql.toString(); } if (columns.size() == 1) { if (hasIndex(table, tableNameLC, columns.get(0))) { return "-- table " + table.getName() + " already has an index on column " + columns.get(0).getName(); } tblColNameHasSingleColIndex.add(tableNameLC + "." + columns.get(0).getName().toLowerCase()); } getProfile().duplex(ObjectType.INDEX, name, buf.toString()); return buf.toString(); }