Description
Returns a substring of str that respects Unicode character boundaries.
License
Apache License
Parameter
Parameter | Description |
---|
str | the original String |
begin | the beginning index, inclusive |
end | the ending index, exclusive |
Exception
Parameter | Description |
---|
IndexOutOfBoundsException | if begin is negative, or end is larger than the length of str, or begin is larger than end |
Return
the specified substring, possibly adjusted in order to not split unicode surrogate pairs
Declaration
public static String unicodePreservingSubstring(String str, int begin,
int end)
Method Source Code
//package com.java2s;
/**
* Copyright (c) 2000, Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class Main {

    /**
     * Extracts the part of {@code str} between {@code begin} and {@code end}
     * without ever cutting a [high, low] surrogate pair in half.
     *
     * <p>Each bound is first passed through
     * {@link #unicodePreservingIndex(String, int)}: a bound that points at
     * the low surrogate of a well-formed pair is moved back by one so that it
     * points at the matching high surrogate instead. The adjusted bounds are
     * then handed to {@link String#substring(int, int)}.
     *
     * <p>Because both bounds are adjusted by the same rule, concatenating
     * {@code unicodePreservingSubstring(str, 0, n)} and
     * {@code unicodePreservingSubstring(str, n, str.length())} yields
     * {@code str} again for every valid {@code n}.
     *
     * <p>As a consequence, and unlike {@link String#substring(int, int)},
     * the result is not guaranteed to contain exactly {@code end - begin}
     * chars.
     *
     * @param str the original String
     * @param begin the beginning index, inclusive
     * @param end the ending index, exclusive
     * @return the requested substring, with its bounds possibly shifted so
     *     that no surrogate pair is split
     * @throws IndexOutOfBoundsException if {@code begin} is negative, or
     *     {@code end} is larger than the length of {@code str}, or
     *     {@code begin} is larger than {@code end}
     */
    public static String unicodePreservingSubstring(String str, int begin,
            int end) {
        final int safeBegin = unicodePreservingIndex(str, begin);
        final int safeEnd = unicodePreservingIndex(str, end);
        // Out-of-range bounds pass through the normalization untouched, so
        // substring() is what reports them.
        return str.substring(safeBegin, safeEnd);
    }

    /**
     * Convenience overload that runs to the end of the string; identical to
     * calling {@link #unicodePreservingSubstring(String, int, int)} with
     * {@code str.length()} as the end index.
     *
     * @param str the original String
     * @param begin the beginning index, inclusive
     * @return the substring from the (possibly adjusted) {@code begin} to the
     *     end of {@code str}
     */
    public static String unicodePreservingSubstring(String str, int begin) {
        final int length = str.length();
        return unicodePreservingSubstring(str, begin, length);
    }

    /**
     * Adjusts {@code index} so that it does not fall in the middle of a
     * surrogate pair of {@code str}.
     *
     * <p>When the char at {@code index} is a low surrogate and the char just
     * before it is a high surrogate, {@code index - 1} is returned. In every
     * other situation the index comes back unchanged. That includes an index
     * of zero or below, an index at or beyond {@code str.length()}, and
     * malformed sequences such as two consecutive low surrogates or two
     * consecutive high surrogates.
     *
     * @param str the String
     * @param index the index to be normalized
     * @return an index that does not split a Unicode character
     */
    public static int unicodePreservingIndex(String str, int index) {
        // Only interior positions have both a preceding and a current char.
        if (index <= 0 || index >= str.length()) {
            return index;
        }
        final char previous = str.charAt(index - 1);
        final char current = str.charAt(index);
        final boolean splitsPair = Character.isHighSurrogate(previous)
                && Character.isLowSurrogate(current);
        return splitsPair ? index - 1 : index;
    }
}
Related
- substringAfter(String str, String separator)
- substringAfterLast(String str, String separator)
- substringBefore(String str, String separator)
- substringBeforeLast(String str, String separator)
- unicodePreservingSubstring(String paramString, int paramInt1, int paramInt2)
- unicodePreservingSubstring(String str, int begin)
- substringAfter(String str, String separator)
- substringBetween(String str, String open, String close)
- subStringEndString(String sourceStr, String endString)