List of usage examples for java.util.regex.Matcher.start
public int start(String name)
From source file:com.joliciel.talismane.filters.SentenceHolderImpl.java
/**
 * Splits the text of this holder into sentences at the recorded sentence boundaries.
 * <p>
 * One {@code Sentence} is produced per entry in {@code sentenceBoundaries}. If text remains
 * after the last boundary (or there are no boundaries but the text is non-empty), one more
 * sentence is produced for it and flagged as incomplete via {@code setComplete(false)}.
 * <p>
 * While copying characters into each sentence, the method skips the span matched by
 * {@code openingWhiteSpacePattern} (group 1), the span starting at group 1 of
 * {@code closingWhiteSpacePattern} (complete sentences only), and every character but the
 * first of each {@code duplicateWhiteSpacePattern} match. Those patterns are declared outside
 * this block. For each retained character taken from this holder, the corresponding original
 * index is added to the sentence.
 *
 * @param leftover an incomplete sentence from a previous call, or {@code null}; when non-null
 *                 it is reused as the first returned sentence and its text is prepended to the
 *                 first segment
 * @return the detected sentences, in text order
 */
@Override
public List<Sentence> getDetectedSentences(Sentence leftover) {
    if (LOG.isTraceEnabled()) {
        LOG.trace("getDetectedSentences. leftover=" + leftover);
    }
    List<Sentence> sentences = new ArrayList<Sentence>();
    int currentIndex = 0;
    // With no boundaries at all, any non-empty text is a leftover.
    boolean haveLeftOvers = this.getText().length() > 0;
    if (this.sentenceBoundaries.size() > 0) {
        haveLeftOvers = false;
        // First element of the descending iterator, i.e. the last boundary in iteration order.
        int lastSentenceBoundary = this.sentenceBoundaries.descendingIterator().next();
        if (lastSentenceBoundary < this.getText().length() - 1) {
            haveLeftOvers = true;
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("haveLeftOvers? " + lastSentenceBoundary + " < " + (this.getText().length() - 1) + " = " + haveLeftOvers);
        }
    }
    // Treat the end of the text as an artificial boundary so the leftover goes through the same loop.
    List<Integer> allBoundaries = new ArrayList<Integer>(this.sentenceBoundaries);
    if (haveLeftOvers)
        allBoundaries.add(this.getText().length() - 1);
    for (int sentenceBoundary : allBoundaries) {
        boolean isLeftover = haveLeftOvers && sentenceBoundary == this.getText().length() - 1;
        Sentence sentence = filterService.getSentence();
        int leftOverTextLength = 0;
        String text = "";
        if (leftover != null) {
            // Continue the sentence handed in by the caller; it is consumed only once.
            sentence = leftover;
            leftOverTextLength = leftover.getText().length();
            text = leftover.getText() + this.getText().substring(currentIndex, sentenceBoundary + 1);
            leftover = null;
        } else {
            text = this.getText().substring(currentIndex, sentenceBoundary + 1);
        }
        // handle trim & duplicate white space here
        Matcher matcherOpeningWhiteSpace = openingWhiteSpacePattern.matcher(text);
        int openingWhiteSpaceEnd = 0;
        if (matcherOpeningWhiteSpace.find()) {
            openingWhiteSpaceEnd = matcherOpeningWhiteSpace.end(1);
        }
        int closingWhiteSpaceStart = text.length();
        // Closing white space is kept on a leftover.
        if (!isLeftover) {
            Matcher matcherClosingWhiteSpace = closingWhiteSpacePattern.matcher(text);
            if (matcherClosingWhiteSpace.find()) {
                closingWhiteSpaceStart = matcherClosingWhiteSpace.start(1);
            }
        }
        Matcher matcherDuplicateWhiteSpace = duplicateWhiteSpacePattern.matcher(text);
        // Positions in text that must be dropped because they repeat white space.
        Set<Integer> duplicateWhiteSpace = new HashSet<Integer>();
        while (matcherDuplicateWhiteSpace.find()) {
            // remove all white space barring the first
            for (int i = matcherDuplicateWhiteSpace.start() + 1; i < matcherDuplicateWhiteSpace.end(); i++) {
                duplicateWhiteSpace.add(i);
            }
        }
        StringBuilder sb = new StringBuilder();
        // i walks this holder's text; j walks text, which may start with the leftover's text.
        int i = currentIndex;
        for (int j = 0; j < text.length(); j++) {
            boolean appendLetter = false;
            if (j < openingWhiteSpaceEnd) {
                // do nothing
            } else if (j >= closingWhiteSpaceStart) {
                // do nothing
            } else if (duplicateWhiteSpace.contains(j)) {
                // do nothing
            } else {
                appendLetter = true;
            }
            if (j >= leftOverTextLength) {
                // if we're past the leftovers and onto the new stuff
                if (appendLetter)
                    sentence.addOriginalIndex(this.getOriginalIndexes().get(i));
                // Segments are re-keyed by the current length of the cleaned sentence text,
                // and are copied even when the character at i itself is dropped.
                if (this.getOriginalTextSegments().containsKey(i))
                    sentence.getOriginalTextSegments().put(sb.length(), this.getOriginalTextSegments().get(i));
                i++;
            }
            if (appendLetter)
                sb.append(text.charAt(j));
        }
        sentence.setText(sb.toString());
        if (LOG.isTraceEnabled()) {
            LOG.trace("sentence.setText |" + sentence.getText() + "|");
        }
        sentence.setComplete(!isLeftover);
        // Every sentence receives all newline entries of this holder.
        for (Entry<Integer, Integer> newlineLocation : this.newlines.entrySet()) {
            sentence.addNewline(newlineLocation.getKey(), newlineLocation.getValue());
        }
        sentence.setFileName(this.getFileName());
        sentences.add(sentence);
        currentIndex = sentenceBoundary + 1;
    }
    return sentences;
}
From source file:com.dreamlinx.automation.DINRelay.java
/**
 * Creates an HttpClient to communicate with the DIN relay and logs in to it.
 * <p>
 * The relay's start page is fetched and the value of its {@code Challenge} form field is
 * extracted. The MD5 of {@code challenge + username + password + challenge} is then posted
 * to {@code /login.tgi} as the password. If the page contains several challenge fields the
 * last one is used; if it contains none, an empty challenge is used.
 *
 * @throws MalformedURLException declared for callers; not thrown directly by this method
 * @throws HttpException if either request returns a status other than 200
 * @throws IOException if communication with the relay fails
 */
private void setupHttpClient() throws MalformedURLException, HttpException, IOException {
    httpClient = new HttpClient();
    httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

    String response;
    GetMethod getMethod = new GetMethod("http://" + ipAddress);
    try {
        int getStatus = httpClient.executeMethod(getMethod);
        if (getStatus != 200) {
            throw new HttpException(getStatus + " - " + getMethod.getStatusText());
        }
        response = getMethod.getResponseBodyAsString();
    } finally {
        // Release the connection on the error path too.
        getMethod.releaseConnection();
    }

    // [^"]* stops at the closing quote of the value. A greedy .* would run on to the last
    // quote on the line and swallow any attributes that follow the challenge.
    Pattern pattern = Pattern.compile("name=\"Challenge\" value=\"([^\"]*)\"", Pattern.CASE_INSENSITIVE);
    String challenge = "";
    if (response != null) {
        Matcher matcher = pattern.matcher(response);
        while (matcher.find()) {
            challenge = matcher.group(1);
        }
    }

    String md5Password = toMD5(challenge + username + password + challenge);

    PostMethod postMethod = new PostMethod("http://" + ipAddress + "/login.tgi");
    try {
        postMethod.addParameter("Username", username);
        postMethod.addParameter("Password", md5Password);
        int postStatus = httpClient.executeMethod(postMethod);
        if (postStatus != 200) {
            throw new HttpException(postStatus + " - " + postMethod.getStatusText());
        }
    } finally {
        postMethod.releaseConnection();
    }
}
From source file:net.healeys.lexic.online.OnlineGame.java
public boolean start() { Pattern pat = Pattern.compile("(\\w+):(.+)"); for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) { try {//from www.j av a 2s . c o m HttpClient httpClient = new DefaultHttpClient(); HttpGet get = new HttpGet(uri); addHeaders(get); HttpResponse resp = httpClient.execute(get); BufferedReader br = new BufferedReader(new InputStreamReader(resp.getEntity().getContent())); String line; while ((line = br.readLine()) != null) { // Log.d(TAG,"line:"+line); Matcher mat = pat.matcher(line); if (mat.find()) { String key = line.substring(mat.start(1), mat.end(1)); String value = line.substring(mat.start(2), mat.end(2)); // Log.d(TAG,"key:"+key); // Log.d(TAG,"value:"+value); if (key.equals("board")) { String[] letters = value.split(","); if (letters.length == 16) { setBoard(new FourByFourBoard(letters)); } else if (letters.length == 25) { setBoard(new FiveByFiveBoard(letters)); } } else if (key.equals("id")) { id = Integer.parseInt(value); } else { urls.put(key, value); } } } super.start(); return true; } catch (Exception e) { // Log.e(TAG,"Connection Error in constructor",e); } } super.start(); return false; }
From source file:com.nextep.designer.sqlgen.oracle.parser.OraclePackageParser.java
private String renameSqlEnd(String sql, String newName) { // Matching the END tag final Pattern pattern = Pattern.compile("end\\s+((\\w)+)(;|/|\\s)*$"); //$NON-NLS-1$ final Matcher m = pattern.matcher(sql.toLowerCase()); String newSql = sql;/*w w w. j a v a 2 s . co m*/ if (m.find()) { newSql = sql.substring(0, m.start(1)) + newName + sql.substring(m.end(1)); } return newSql; }
From source file:com.zextras.zimbradrive.soap.SearchRequestHdlr.java
private String getStandardQuery(String query) { StringBuilder parsedQueryBuilder = new StringBuilder(); Pattern nonQuotedTokenSValuePattern = Pattern.compile("([^ :]+:)([^\"]*?)( |$)"); //preTokenDelimiter tokenName : nonQuotedTokenValue postTokenDelimiter Matcher nonQuotedTokenSValueMatcher = nonQuotedTokenSValuePattern.matcher(query); int lastMatchEndIndex = 0; while (nonQuotedTokenSValueMatcher.find()) { String preMatchValueQuery = query.substring(lastMatchEndIndex, nonQuotedTokenSValueMatcher.end(1)); String matchValueQuery = query.substring(nonQuotedTokenSValueMatcher.start(2), nonQuotedTokenSValueMatcher.end(2)); parsedQueryBuilder.append(preMatchValueQuery).append("\"").append(matchValueQuery).append("\""); lastMatchEndIndex = nonQuotedTokenSValueMatcher.end(2); }/* ww w. j a v a 2 s .co m*/ parsedQueryBuilder.append(query.substring(lastMatchEndIndex)); return parsedQueryBuilder.toString(); }
From source file:com.google.testing.pogen.parser.template.RegexVariableExtractor.java
@Override public void startElement(QName element, XMLAttributes attrs, Augmentations augs) { processCharacters();// ww w . j a v a 2 s. c om // Ignore elements with prefix (:) to deal with not html elements such as "c:set" in JSP. if (element.prefix == null) { // Get offset information HTMLEventInfo info = (HTMLEventInfo) augs.getItem(AUGMENTATIONS); HtmlTagInfo tagInfo = new HtmlTagInfo(attrs.getValue(attributeName), info.getBeginCharacterOffset(), info.getEndCharacterOffset(), repeatedRanges); tagInfoStack.push(tagInfo); for (int i = 0; i < attrs.getLength(); i++) { // Ignore variables appearing two more than Matcher matcher = variablePattern.matcher(attrs.getValue(i)); while (matcher.find()) { int iGroup = getFirstAvailableGroupIndex(matcher); if (!excludedRanges.contains(matcher.start(iGroup))) { tagInfo.addVariableInfo(matcher.group(0), matcher.group(iGroup), matcher.start(iGroup), attrs.getQName(i)); } } if (attrs.getQName(i).equals("id")) { tagInfo.setIdValue(attrs.getValue(i)); } else if (attrs.getQName(i).equals("name")) { tagInfo.setNameValue(attrs.getValue(i)); } } } super.startElement(element, attrs, augs); }
From source file:net.healeys.lexic.online.OnlineGame.java
public boolean submitWords(WebView display) { Pattern contentPat = Pattern.compile("([^;]+); charset=(.+)"); for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) { String url = BASE_URL + urls.get("words"); Iterator<String> li = uniqueListIterator(); StringBuffer sb = new StringBuffer(4096); while (li.hasNext()) { sb.append(li.next());/*from ww w.j a v a2s . c om*/ if (li.hasNext()) sb.append(','); } String data = URLEncoder.encode(sb.toString()); try { HttpClient httpClient = new DefaultHttpClient(); HttpPost post = new HttpPost(url); addHeaders(post); post.setEntity(new StringEntity("words=" + data)); HttpResponse resp = httpClient.execute(post); BufferedReader br = new BufferedReader(new InputStreamReader(resp.getEntity().getContent())); sb = new StringBuffer(4096); String line; while ((line = br.readLine()) != null) { sb.append(line); sb.append('\n'); } String contentHeader = resp.getFirstHeader("Content-type").getValue(); String contentType; String contentEncoding; Matcher mat = contentPat.matcher(contentHeader); if (mat.find()) { contentType = contentHeader.substring(mat.start(1), mat.end(1)); contentEncoding = contentHeader.substring(mat.start(2), mat.end(2)); } else { contentType = contentHeader; contentEncoding = "utf-8"; } // Log.d(TAG,"url:"+url); // Log.d(TAG,"data:"+sb.toString()); // Log.d(TAG,"contentType:"+contentType); // Log.d(TAG,"contentEncoding:"+contentEncoding); display.loadDataWithBaseURL(url, sb.toString(), "text/html", "utf-8", null); return true; } catch (Exception e) { // Log.d(TAG,"error submitting words",e); } } return false; }
From source file:dk.netarkivet.harvester.harvesting.extractor.IcelandicExtractorJS.java
public long considerStrings(Extractor ext, CrawlURI curi, CharSequence cs, boolean handlingJSFile) { long foundLinks = 0; Matcher strings = TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs); int startIndex = 0; while (strings.find(startIndex)) { CharSequence subsequence = cs.subSequence(strings.start(2), strings.end(2)); Matcher uri = TextUtils.getMatcher(STRING_URI_DETECTOR, subsequence); if (uri.matches()) { String string = uri.group(); boolean falsePositive = false; try { string = StringEscapeUtils.unescapeJavaScript(string); } catch (NestableRuntimeException e) { LOGGER.log(Level.WARNING, "problem unescaping some javascript", e); }/* w ww. j a v a2s . c o m*/ string = UriUtils.speculativeFixup(string, curi.getUURI()); // Filter out some bad false positives (should really fix regexp for URI detection) if (string.contains("/.") || string.contains("@") || string.length() > 150) { // While legal in URIs, these are rare and usually an indication of a false positive // in the speculative extraction. falsePositive = true; } if (!falsePositive) { falsePositive = shouldIgnorePossibleRelativeLink(string); } if (falsePositive) { foundFalsePositives++; } else { foundLinks++; try { int max = ext.getExtractorParameters().getMaxOutlinks(); if (handlingJSFile) { addRelativeToVia(curi, max, string, JS_MISC, SPECULATIVE); } else { addRelativeToBase(curi, max, string, JS_MISC, SPECULATIVE); } } catch (URIException e) { ext.logUriError(e, curi.getUURI(), string); } } } else { foundLinks += considerStrings(ext, curi, subsequence, handlingJSFile); } // reconsider the last closing quote as possible opening quote startIndex = strings.end(2); } TextUtils.recycleMatcher(strings); return foundLinks; }
From source file:com.github.rwitzel.streamflyer.xml.XmlVersionModifier.java
/** * @see com.github.rwitzel.streamflyer.core.Modifier#modify(java.lang.StringBuilder, int, boolean) *//*from w w w .j av a 2s. com*/ @Override public AfterModification modify(StringBuilder characterBuffer, int firstModifiableCharacterInBuffer, boolean endOfStreamHit) { switch (state) { case NO_LONGER_MODIFYING: return factory.skipEntireBuffer(characterBuffer, firstModifiableCharacterInBuffer, endOfStreamHit); case INITIAL: state = XmlVersionModifierState.PROLOG_REQUEST; // you never know how many whitespace characters are in the prolog return factory.modifyAgainImmediately(INITIAL_NUMBER_OF_CHARACTERS, firstModifiableCharacterInBuffer); case PROLOG_REQUEST: // (Should we do aware of BOMs here? No. I consider it the // responsibility of the caller to provide characters without BOM.) Matcher matcher = Pattern.compile("<\\?xml[^>]*version\\s*=\\s*['\"]((1.0)|(1.1))['\"].*") .matcher(characterBuffer); if (matcher.matches()) { // replace version in prolog characterBuffer.replace(matcher.start(1), matcher.end(1), xmlVersion); } else { // is there a prolog that is too long? Matcher matcher2 = Pattern.compile("<\\?xml.*").matcher(characterBuffer); if (matcher2.matches()) { // this is not normal at all -> throw exception throw new XmlPrologRidiculouslyLongException(characterBuffer.toString()); } // insert prolog characterBuffer.insert(0, "<?xml version='" + xmlVersion + "'>"); } state = XmlVersionModifierState.NO_LONGER_MODIFYING; return factory.skipEntireBuffer(characterBuffer, firstModifiableCharacterInBuffer, endOfStreamHit); default: throw new IllegalStateException("state " + state + " not supported"); } }
From source file:gov.nyc.doitt.gis.geoclient.parser.test.ChunkSpecParser.java
protected List<MutableToken> parseBracketValues(String delimitedString) { List<MutableToken> result = new ArrayList<>(); Matcher matcher = TOKEN_VALUE_PATTERN.matcher(delimitedString); int previousBracketOffset = 0; while (matcher.find()) { String value = matcher.group(1); int start = matcher.start(1) - previousBracketOffset - 1; // -1 for opening "[" of this group int end = start + value.length(); result.add(new MutableToken(value, start, end)); previousBracketOffset = previousBracketOffset + 2; }//from w ww . j a va 2 s . c o m return result; }