Description
Encodes a string in UTF-8 as a byte array.
License
Creative Commons License
Parameter
| Parameter | Description |
|---|---|
| str | A text string. |
| replace | If true, replaces unpaired surrogate code points with the replacement character (U+FFFD). If false, stops processing when an unpaired surrogate code point is seen. |
Exception
| Exception | Description |
|---|---|
| NullPointerException | The parameter str is null. |
| IllegalArgumentException | The string contains an unpaired surrogate code point and replace is false, or an internal error occurred. |
Return
The string encoded in UTF-8.
Declaration
public static byte[] GetUtf8Bytes(String str, boolean replace)
Method Source Code
//package com.java2s;
import java.io.*;
public class Main {
    /** Size, in bytes, of the scratch buffer that is filled before each write to the stream. */
    private static final int StreamedStringBufferLength = 4096;

    /**
     * Encodes a string in UTF-8 as a byte array. Line breaks are written as-is.
     * @param str A text string.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U+FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @return The string encoded in UTF-8.
     * @throws NullPointerException The parameter {@code str} is null.
     * @throws IllegalArgumentException The string contains an unpaired surrogate code
     * point and {@code replace} is false, or an internal error occurred.
     */
    public static byte[] GetUtf8Bytes(String str, boolean replace) {
        return GetUtf8Bytes(str, replace, false);
    }

    /**
     * Encodes a string in UTF-8 as a byte array, optionally normalizing line breaks.
     * @param str A text string.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U+FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @param lenientLineBreaks If true, replaces carriage return (CR) not followed
     * by line feed (LF) and LF not preceded by CR with CR-LF pairs.
     * @return The string encoded in UTF-8.
     * @throws NullPointerException The parameter {@code str} is null.
     * @throws IllegalArgumentException The string contains an unpaired surrogate code
     * point and {@code replace} is false, or an internal error occurred.
     */
    public static byte[] GetUtf8Bytes(String str, boolean replace, boolean lenientLineBreaks) {
        if (str == null) {
            throw new NullPointerException("str");
        }
        // An in-memory stream holds no external resource, so it is not closed explicitly.
        java.io.ByteArrayOutputStream ms = new java.io.ByteArrayOutputStream();
        try {
            // Forward lenientLineBreaks to the full overload; the shorter overloads
            // always pass false for it.
            if (WriteUtf8(str, 0, str.length(), ms, replace, lenientLineBreaks) != 0) {
                throw new IllegalArgumentException("Unpaired surrogate code point");
            }
        } catch (java.io.IOException ex) {
            throw new IllegalArgumentException("I/O error occurred", ex);
        }
        return ms.toByteArray();
    }

    /**
     * Writes a portion of a string in UTF-8 encoding to a data stream.
     * Line breaks are written as-is.
     * @param str A string to write.
     * @param offset The zero-based index where the string portion to write begins.
     * @param length The length of the string portion to write.
     * @param stream A writable data stream.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U+FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @return 0 if the entire string portion was written; or -1 if the string
     * portion contains an unpaired surrogate code point and {@code replace}
     * is false.
     * @throws NullPointerException The parameter {@code str} is null or {@code
     * stream} is null.
     * @throws IllegalArgumentException The parameter {@code offset} is less than 0,
     * {@code length} is less than 0, or {@code offset} plus {@code length}
     * is greater than the string's length.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(String str, int offset, int length, java.io.OutputStream stream,
            boolean replace) throws java.io.IOException {
        return WriteUtf8(str, offset, length, stream, replace, false);
    }

    /**
     * Writes a portion of a string in UTF-8 encoding to a data stream.
     * Output is staged in a fixed-size buffer that is written to the stream
     * whenever the next encoded sequence would not fit, and once more at the end.
     * @param str A string to write.
     * @param offset The zero-based index where the string portion to write begins.
     * @param length The length of the string portion to write.
     * @param stream A writable data stream.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U+FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen; the bytes encoded before that
     * point are still written.
     * @param lenientLineBreaks If true, replaces carriage return (CR) not followed
     * by line feed (LF) and LF not preceded by CR with CR-LF pairs.
     * @return 0 if the entire string portion was written; or -1 if the string
     * portion contains an unpaired surrogate code point and {@code replace}
     * is false.
     * @throws NullPointerException The parameter {@code str} is null or {@code
     * stream} is null.
     * @throws IllegalArgumentException The parameter {@code offset} is less than 0,
     * {@code length} is less than 0, or {@code offset} plus {@code length}
     * is greater than the string's length.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(String str, int offset, int length, java.io.OutputStream stream,
            boolean replace, boolean lenientLineBreaks) throws java.io.IOException {
        if (stream == null) {
            throw new NullPointerException("stream");
        }
        if (str == null) {
            throw new NullPointerException("str");
        }
        if (offset < 0) {
            throw new IllegalArgumentException("offset (" + offset + ") is less than " + "0");
        }
        if (offset > str.length()) {
            throw new IllegalArgumentException("offset (" + offset + ") is more than " + str.length());
        }
        if (length < 0) {
            throw new IllegalArgumentException("length (" + length + ") is less than " + "0");
        }
        if (length > str.length()) {
            throw new IllegalArgumentException("length (" + length + ") is more than " + str.length());
        }
        // Written as a subtraction so the check itself cannot overflow.
        if (str.length() - offset < length) {
            throw new IllegalArgumentException(
                    "str.length() minus offset (" + (str.length() - offset) + ") is less than " + length);
        }
        byte[] bytes = new byte[StreamedStringBufferLength];
        int retval = 0;
        int byteIndex = 0;
        int endIndex = offset + length;
        for (int index = offset; index < endIndex; ++index) {
            int c = str.charAt(index);
            if (c <= 0x7f) {
                if (lenientLineBreaks && (c == 0x0d || c == 0x0a)) {
                    // Bare CR, bare LF and an existing CR-LF pair all become one CR-LF.
                    if (byteIndex + 2 > StreamedStringBufferLength) {
                        stream.write(bytes, 0, byteIndex);
                        byteIndex = 0;
                    }
                    bytes[byteIndex++] = 0x0d;
                    bytes[byteIndex++] = 0x0a;
                    if (c == 0x0d && index + 1 < endIndex && str.charAt(index + 1) == 0x0a) {
                        // Skip the LF of an existing CR-LF pair so it is not doubled.
                        ++index;
                    }
                    continue;
                }
                // One-byte sequence (ASCII).
                if (byteIndex >= StreamedStringBufferLength) {
                    stream.write(bytes, 0, byteIndex);
                    byteIndex = 0;
                }
                bytes[byteIndex++] = (byte) c;
            } else if (c <= 0x7ff) {
                // Two-byte sequence.
                if (byteIndex + 2 > StreamedStringBufferLength) {
                    stream.write(bytes, 0, byteIndex);
                    byteIndex = 0;
                }
                bytes[byteIndex++] = (byte) (0xc0 | ((c >> 6) & 0x1f));
                bytes[byteIndex++] = (byte) (0x80 | (c & 0x3f));
            } else {
                if ((c & 0xfc00) == 0xd800 && index + 1 < endIndex && str.charAt(index + 1) >= 0xdc00
                        && str.charAt(index + 1) <= 0xdfff) {
                    // High surrogate followed by a low surrogate: combine into one code point.
                    c = 0x10000 + ((c - 0xd800) << 10) + (str.charAt(index + 1) - 0xdc00);
                    ++index;
                } else if ((c & 0xf800) == 0xd800) {
                    // Unpaired surrogate (either half).
                    if (!replace) {
                        retval = -1;
                        break; // the bytes encoded so far are still written below
                    }
                    c = 0xfffd;
                }
                if (c <= 0xffff) {
                    // Three-byte sequence.
                    if (byteIndex + 3 > StreamedStringBufferLength) {
                        stream.write(bytes, 0, byteIndex);
                        byteIndex = 0;
                    }
                    bytes[byteIndex++] = (byte) (0xe0 | ((c >> 12) & 0x0f));
                    bytes[byteIndex++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                    bytes[byteIndex++] = (byte) (0x80 | (c & 0x3f));
                } else {
                    // Four-byte sequence (supplementary code point).
                    if (byteIndex + 4 > StreamedStringBufferLength) {
                        stream.write(bytes, 0, byteIndex);
                        byteIndex = 0;
                    }
                    bytes[byteIndex++] = (byte) (0xf0 | ((c >> 18) & 0x07));
                    bytes[byteIndex++] = (byte) (0x80 | ((c >> 12) & 0x3f));
                    bytes[byteIndex++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                    bytes[byteIndex++] = (byte) (0x80 | (c & 0x3f));
                }
            }
        }
        // Write whatever remains in the buffer.
        stream.write(bytes, 0, byteIndex);
        return retval;
    }

    /**
     * Writes a string in UTF-8 encoding to a data stream.
     * Line breaks are written as-is.
     * @param str A string to write.
     * @param stream A writable data stream.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U+FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @return 0 if the entire string was written; or -1 if the string contains an
     * unpaired surrogate code point and {@code replace} is false.
     * @throws NullPointerException The parameter {@code str} is null or {@code
     * stream} is null.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(String str, java.io.OutputStream stream, boolean replace)
            throws java.io.IOException {
        if (str == null) {
            throw new NullPointerException("str");
        }
        return WriteUtf8(str, 0, str.length(), stream, replace);
    }
}
Related
- getUTF8()
- getUtf8()
- getUTF8(byte[] data, int offset, int length)
- getUTF8Bytes(String s)
- getUtf8Bytes(String s)
- getUTF8Bytes(String string)
- getUTF8BytesFromString(String str)
- getUtf8Decoder()
- getUtf8OrDefault()