List of usage examples for java.lang.String codePointAt
public int codePointAt(int index)
From source file:com.gargoylesoftware.htmlunit.HttpWebConnection.java
private Charset getCharset(final String charset, final List<NameValuePair> pairs) { for (final NameValuePair pair : pairs) { if (pair instanceof KeyDataPair) { final KeyDataPair pairWithFile = (KeyDataPair) pair; if (pairWithFile.getData() == null && pairWithFile.getFile() != null) { final String fileName = pairWithFile.getFile().getName(); for (int i = 0; i < fileName.length(); i++) { if (fileName.codePointAt(i) > 127) { return Charset.forName(charset); }//from w w w. ja v a2 s . c o m } } } } return null; }
From source file:org.apache.orc.impl.mask.RedactMaskFactory.java
/** * Mask the given stringified numeric value excluding the unmask range. * Non-digit characters are passed through on the assumption they are * markers (eg. one of ",.ef")./*from w ww. j a v a 2 s . c o m*/ * @param value the original value. */ String maskNumericString(final String value) { StringBuilder result = new StringBuilder(); final int length = value.codePointCount(0, value.length()); for (int c = 0; c < length; ++c) { int cp = value.codePointAt(c); if (isIndexInUnmaskRange(c, length) || Character.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) { result.appendCodePoint(cp); } else { result.appendCodePoint(DIGIT_CP_REPLACEMENT); } } return result.toString(); }
From source file:com.gargoylesoftware.htmlunit.javascript.host.xml.XMLHttpRequestTest.java
/** * @throws Exception if the test fails/* w w w . j a va 2s . co m*/ */ @Test public void java_encoding() throws Exception { // Chrome and FF return the last apostrophe, see overrideMimeType_charset_all() // but Java and other tools (e.g. Notpad++) return only 3 characters, not 4 // this method is not a test case, but rather to show the behavior of java final String string = "'\u9EC4'"; final ByteArrayInputStream bais = new ByteArrayInputStream(string.getBytes("UTF-8")); try (final BufferedReader reader = new BufferedReader(new InputStreamReader(bais, "GBK"))) { final String output = reader.readLine(); assertNotNull(output); assertEquals(39, output.codePointAt(0)); assertEquals(27035, output.codePointAt(1)); assertEquals(65533, output.codePointAt(2)); assertEquals(39, output.codePointAt(3)); } }
From source file:org.sleuthkit.autopsy.casemodule.Case.java
/** * Sanitize the case name for PostgreSQL database, Solr cores, and ActiveMQ * topics. Makes it plain-vanilla enough that each item should be able to * use it.//from ww w. jav a 2 s .c om * * Sanitize the PostgreSQL/Solr core, and ActiveMQ name by excluding: * Control characters Non-ASCII characters Various others shown below * * Solr: * http://stackoverflow.com/questions/29977519/what-makes-an-invalid-core-name * may not be / \ : * * ActiveMQ: * http://activemq.2283324.n4.nabble.com/What-are-limitations-restrictions-on-destination-name-td4664141.html * may not be ? * * PostgreSQL: * http://www.postgresql.org/docs/9.4/static/sql-syntax-lexical.html 63 * chars max, must start with a-z or _ following chars can be letters _ or * digits * * SQLite: Uses autopsy.db for the database name follows Windows naming * convention * * @param caseName The name of the case as typed in by the user * * @return the sanitized case name to use for Database, Solr, and ActiveMQ */ static String sanitizeCaseName(String caseName) { String result; // Remove all non-ASCII characters result = caseName.replaceAll("[^\\p{ASCII}]", "_"); // Remove all control characters result = result.replaceAll("[\\p{Cntrl}]", "_"); // Remove / \ : ? space ' " result = result.replaceAll("[ /?:'\"\\\\]", "_"); // Make it all lowercase result = result.toLowerCase(); // Must start with letter or underscore for PostgreSQL. If not, prepend an underscore. if (result.length() > 0 && !(Character.isLetter(result.codePointAt(0))) && !(result.codePointAt(0) == '_')) { result = "_" + result; } // Chop to 63-16=47 left (63 max for PostgreSQL, taking 16 for the date _20151225_123456) if (result.length() > MAX_SANITIZED_NAME_LENGTH) { result = result.substring(0, MAX_SANITIZED_NAME_LENGTH); } if (result.isEmpty()) { result = "case"; } return result; }
From source file:com.microsoft.windowsazure.mobileservices.MobileServiceTableBase.java
/** * Validates if a given string contains any of the following special characters: "(U+0022), +(U+002B), /(U+002F), ?(U+003F), \(U+005C), `(U+0060) * @param s/*from ww w. ja v a 2s. c o m*/ * @return */ protected boolean containsSpecialCharacter(String s) { boolean result = false; final int length = s.length(); final int cpQuotationMark = 0x0022; final int cpPlusSign = 0x002B; final int cpSolidus = 0x002F; final int cpQuestionMark = 0x003F; final int cpReverseSolidus = 0x005C; final int cpGraveAccent = 0x0060; for (int offset = 0; offset < length;) { final int codepoint = s.codePointAt(offset); if (codepoint == cpQuotationMark || codepoint == cpPlusSign || codepoint == cpSolidus || codepoint == cpQuestionMark || codepoint == cpReverseSolidus || codepoint == cpGraveAccent) { result = true; break; } offset += Character.charCount(codepoint); } return result; }
From source file:net.sf.jabref.importer.HTMLConverter.java
public String formatUnicode(String text) { if (text == null) { return null; }//from ww w . jav a 2 s.c o m Set<Character> chars = unicodeSymbols.keySet(); for (Character character : chars) { // System.err.println(new Integer((int) character).toString() + ": " + character.toString() + ": " + unicodeSymbols.get(character)); text = text.replaceAll(character.toString(), unicodeSymbols.get(character)); } Integer cp; for (int i = 0; i <= (text.length() - 1); i++) { cp = text.codePointAt(i); if (cp >= 129) { LOGGER.warn("Unicode character not converted: " + cp); } } return text; }
From source file:org.languagetool.rules.spelling.hunspell.HunspellRule.java
protected String getSentenceTextWithoutUrlsAndImmunizedTokens(AnalyzedSentence sentence) { StringBuilder sb = new StringBuilder(); AnalyzedTokenReadings[] sentenceTokens = getSentenceWithImmunization(sentence).getTokens(); for (int i = 1; i < sentenceTokens.length; i++) { String token = sentenceTokens[i].getToken(); if (sentenceTokens[i].isImmunized() || sentenceTokens[i].isIgnoredBySpeller() || isUrl(token) || isEMail(token) || isQuotedCompound(sentence, i, token)) { if (isQuotedCompound(sentence, i, token)) { sb.append(" ").append(token.substring(1)); }/*from w ww .ja v a2 s.c o m*/ // replace URLs and immunized tokens with whitespace to ignore them for spell checking: else if (token.length() < 20) { sb.append(WHITESPACE_ARRAY[token.length()]); } else { for (int j = 0; j < token.length(); j++) { sb.append(' '); } } } else if (token.length() > 1 && token.codePointCount(0, token.length()) != token.length()) { // some symbols such as emojis () have a string length that equals 2 for (int charIndex = 0; charIndex < token.length();) { int unicodeCodePoint = token.codePointAt(charIndex); int increment = Character.charCount(unicodeCodePoint); if (increment == 1) { sb.append(token.charAt(charIndex)); } else { sb.append(" "); } charIndex += increment; } } else { sb.append(token); } } return sb.toString(); }
From source file:gate.creole.tokeniser.SimpleTokeniser.java
/** * The method that does the actual tokenisation. *//*from w ww . j ava2s . c o m*/ @Override public void execute() throws ExecutionException { interrupted = false; AnnotationSet annotationSet; //check the input if (document == null) { throw new ExecutionException("No document to tokenise!"); } if (annotationSetName == null || annotationSetName.equals("")) annotationSet = document.getAnnotations(); else annotationSet = document.getAnnotations(annotationSetName); fireStatusChanged("Tokenising " + document.getName() + "..."); String content = document.getContent().toString(); int length = content.length(); int currentChar; int charsInCurrentCP = 1; DFSMState graphPosition = dInitialState; //the index of the first character of the token trying to be recognised int tokenStart = 0; DFSMState lastMatchingState = null; DFSMState nextState; String tokenString; int charIdx = 0; int oldCharIdx = 0; FeatureMap newTokenFm; while (charIdx < length) { currentChar = content.codePointAt(charIdx); // number of chars we have to advance after processing this code point. // 1 in the vast majority of cases, but 2 where the code point is a // supplementary character represented as a surrogate pair. charsInCurrentCP = Character.isSupplementaryCodePoint(currentChar) ? 2 : 1; // Out.println( // currentChar + typesMnemonics[Character.getType(currentChar)+128]); nextState = graphPosition.next(typeIds.get(new Integer(Character.getType(currentChar))).intValue()); if (null != nextState) { graphPosition = nextState; if (graphPosition.isFinal()) { lastMatchingState = graphPosition; } charIdx += charsInCurrentCP; } else {//we have a match! 
newTokenFm = Factory.newFeatureMap(); if (null == lastMatchingState) { // no rule matches this character, so create a single-char // DEFAULT_TOKEN annotation covering it and start again after it charIdx = tokenStart + charsInCurrentCP; tokenString = content.substring(tokenStart, charIdx); newTokenFm.put("type", "UNKNOWN"); newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length())); try { annotationSet.add(new Long(tokenStart), new Long(charIdx), "DEFAULT_TOKEN", newTokenFm); } catch (InvalidOffsetException ioe) { //This REALLY shouldn't happen! ioe.printStackTrace(Err.getPrintWriter()); } // Out.println("Default token: " + tokenStart + // "->" + tokenStart + " :" + tokenString + ";"); } else { // we've reached the end of a string that the FSM recognised tokenString = content.substring(tokenStart, charIdx); newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length())); for (int i = 1; i < lastMatchingState.getTokenDesc().length; i++) { newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], lastMatchingState.getTokenDesc()[i][1]); //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" + // lastMatchingState.getTokenDesc()[i][1]); } try { annotationSet.add(new Long(tokenStart), new Long(charIdx), lastMatchingState.getTokenDesc()[0][0], newTokenFm); } catch (InvalidOffsetException ioe) { //This REALLY shouldn't happen! 
throw new GateRuntimeException(ioe.toString()); } // Out.println(lastMatchingState.getTokenDesc()[0][0] + // ": " + tokenStart + "->" + lastMatch + // " :" + tokenString + ";"); //charIdx = lastMatch + 1; } // reset to initial state and start looking again from here lastMatchingState = null; graphPosition = dInitialState; tokenStart = charIdx; } if ((charIdx - oldCharIdx > 256)) { fireProgressChanged((100 * charIdx) / length); oldCharIdx = charIdx; if (isInterrupted()) throw new ExecutionInterruptedException(); } } // while(charIdx < length) if (null != lastMatchingState) { // we dropped off the end having found a match, annotate it tokenString = content.substring(tokenStart, charIdx); newTokenFm = Factory.newFeatureMap(); newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length())); for (int i = 1; i < lastMatchingState.getTokenDesc().length; i++) { newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], lastMatchingState.getTokenDesc()[i][1]); } try { annotationSet.add(new Long(tokenStart), new Long(charIdx), lastMatchingState.getTokenDesc()[0][0], newTokenFm); } catch (InvalidOffsetException ioe) { //This REALLY shouldn't happen! throw new GateRuntimeException(ioe.toString()); } } reset(); fireProcessFinished(); fireStatusChanged("Tokenisation complete!"); }
From source file:tufts.vue.ds.Field.java
/** @return double value if one found, Double.NaN otherwise */ private double getNumericValue(final String text, final boolean tryCurrency) { try {//w w w . jav a2 s . co m // Double.parseDouble handles most stuff, including "0x2F" style // hex values was well as scientific notation. return Double.parseDouble(text); } catch (Throwable t) { } Number value = null; try { // This handles values of the form "1,234,567". It will also extract any // number that can be found at the head of a string: e.g. "7foo" will return // 7, or "70%" will return 70 (*not* 0.70). The instance of LocalNumberFormat will // generally be a DecimalFormat value = LocalNumberFormat.parse(text); } catch (Throwable t) { } // Note that if we use a NumberFormat.getCurrencyInstance() here to handle // currency, it will only allow the local currency symbol. if (value == null && tryCurrency && text.length() > 1 && isCurrencySymbol(text.codePointAt(0))) { value = getNumericValue(text.substring(1), false); // NOTE RECURSION //Log.debug("HANDLED CURRENCY " + Util.tags(text) + " = " + Util.tags(value)); } // could allow for percent parsers that return value/100 if (DEBUG.SCHEMA || DEBUG.DATA) Log.debug(Util.tags(text) + " = " + Util.tags(value)); return value == null ? Double.NaN : value.doubleValue(); }
From source file:org.xwoot.wikiContentManager.XWikiSwizzleClient.XwikiSwizzleClient.java
/** * DOCUMENT ME!//w w w .j av a 2s . co m * * @param pageId DOCUMENT ME! * @param value DOCUMENT ME! * @param algo DOCUMENT ME! * @param rmd DOCUMENT ME! * @return DOCUMENT ME! * @throws NoSuchAlgorithmException * @throws XWikiSwizzleClientException */ synchronized public String setPageContent(String pageId, String value, String algo, byte[] rmd) throws NoSuchAlgorithmException, XWikiSwizzleClientException { String result = null; Page page = null; String pageContent = ""; // if user have not connected client, method do it for him // else it's to the user to do the connection gestion... boolean b = this.relogin(); page = this.getWikiPage(pageId); if (page != null) { pageContent = page.getContent(); } byte[] messageDigest = this.getDigest(pageContent, algo); if (MessageDigest.isEqual(messageDigest, rmd)) { if (page == null) { Map p = this.createPage(pageId, value); if (p == null) { throw new XWikiSwizzleClientException("Problem with setPageContent : can't create the page"); } } else { page.setContent(value); this.storeWikiPage(page); } } else { if ((pageContent == null) || ((pageContent.length() == 1) && (pageContent.codePointAt(0) == VOID_CHARACTER)) || (pageContent.length() < 1)) { result = ""; } else { result = pageContent; } } this.logout(b); return result; }