Java examples for java.lang:String UTF
Encodes a string in UTF-8 as a byte array.
/*
Written in 2013 by Peter O.
Any copyright is dedicated to the Public Domain.
http://creativecommons.org/publicdomain/zero/1.0/
If you like this, you should donate to Peter O.
at: http://upokecenter.dreamhosters.com/articles/donate-now-2/
 */
//package com.java2s;
import java.io.*;

/**
 * Utility methods that encode Java strings (UTF-16) as UTF-8, either into a
 * byte array or directly onto an output stream.
 */
public class Main {
    /** Size, in bytes, of the scratch buffer filled before each write to the stream. */
    private static final int STREAMED_STRING_BUFFER_LENGTH = 4096;

    /**
     * Encodes a string in UTF-8 as a byte array.
     *
     * @param str A text string.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U+FFFD). If false, an unpaired surrogate code
     * point causes an exception.
     * @return The string encoded in UTF-8.
     * @throws NullPointerException The parameter {@code str} is null.
     * @throws IllegalArgumentException The string contains an unpaired
     * surrogate code point and {@code replace} is false, or an I/O error
     * occurred while writing to the in-memory buffer.
     */
    public static byte[] GetUtf8Bytes(String str, boolean replace) {
        if (str == null) {
            throw new NullPointerException("str");
        }
        // ByteArrayOutputStream holds no external resources and its close()
        // is documented to have no effect, so no cleanup is required here.
        ByteArrayOutputStream ms = new ByteArrayOutputStream();
        try {
            if (WriteUtf8(str, ms, replace) != 0) {
                throw new IllegalArgumentException(
                        "Unpaired surrogate code point");
            }
        } catch (IOException ex) {
            throw new IllegalArgumentException("I/O error occurred", ex);
        }
        return ms.toByteArray();
    }

    /**
     * Writes a portion of a string in UTF-8 encoding to a data stream,
     * without any line-break conversion.
     *
     * @param str A string to write.
     * @param offset The zero-based index where the string portion to write begins.
     * @param length The length of the string portion to write.
     * @param stream A writable data stream.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U+FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @return 0 if the entire string portion was written; or -1 if the string
     * portion contains an unpaired surrogate code point and {@code replace}
     * is false.
     * @throws NullPointerException The parameter {@code str} is null or
     * {@code stream} is null.
     * @throws IllegalArgumentException The parameter {@code offset} is less
     * than 0, {@code length} is less than 0, or {@code offset} plus
     * {@code length} is greater than the string's length.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(String str, int offset, int length,
            OutputStream stream, boolean replace)
            throws java.io.IOException {
        return WriteUtf8(str, offset, length, stream, replace, false);
    }

    /**
     * Writes a portion of a string in UTF-8 encoding to a data stream.
     *
     * @param str A string to write.
     * @param offset The zero-based index where the string portion to write begins.
     * @param length The length of the string portion to write.
     * @param stream A writable data stream.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U+FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen; the bytes encoded before that
     * point are still written to the stream.
     * @param lenientLineBreaks If true, replaces carriage return (CR) not
     * followed by line feed (LF) and LF not preceded by CR with CR-LF pairs.
     * Existing CR-LF pairs are written unchanged.
     * @return 0 if the entire string portion was written; or -1 if the string
     * portion contains an unpaired surrogate code point and {@code replace}
     * is false.
     * @throws NullPointerException The parameter {@code str} is null or
     * {@code stream} is null.
     * @throws IllegalArgumentException The parameter {@code offset} is less
     * than 0, {@code length} is less than 0, or {@code offset} plus
     * {@code length} is greater than the string's length.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(String str, int offset, int length,
            OutputStream stream, boolean replace, boolean lenientLineBreaks)
            throws java.io.IOException {
        if (stream == null) {
            throw new NullPointerException("stream");
        }
        if (str == null) {
            throw new NullPointerException("str");
        }
        if (offset < 0) {
            throw new IllegalArgumentException("offset (" + offset
                    + ") is less than " + "0");
        }
        if (offset > str.length()) {
            throw new IllegalArgumentException("offset (" + offset
                    + ") is more than " + str.length());
        }
        if (length < 0) {
            throw new IllegalArgumentException("length (" + length
                    + ") is less than " + "0");
        }
        if (length > str.length()) {
            throw new IllegalArgumentException("length (" + length
                    + ") is more than " + str.length());
        }
        if (str.length() - offset < length) {
            throw new IllegalArgumentException(
                    "str.length() minus offset (" + (str.length() - offset)
                            + ") is less than " + length);
        }
        byte[] bytes = new byte[STREAMED_STRING_BUFFER_LENGTH];
        int byteIndex = 0;
        int retval = 0;
        // Cannot overflow: length <= str.length() - offset was checked above.
        int endIndex = offset + length;
        for (int index = offset; index < endIndex; ++index) {
            int c = str.charAt(index);
            if (lenientLineBreaks && (c == 0x0d || c == 0x0a)) {
                // CR LF, bare CR and bare LF each produce exactly one CR LF.
                // The LF of an existing pair must be consumed here; otherwise
                // the CR would be copied as is and the LF expanded again,
                // turning CR LF into CR CR LF.
                if (c == 0x0d && index + 1 < endIndex
                        && str.charAt(index + 1) == 0x0a) {
                    ++index;
                }
                if (byteIndex + 2 > STREAMED_STRING_BUFFER_LENGTH) {
                    // Write bytes retrieved so far
                    stream.write(bytes, 0, byteIndex);
                    byteIndex = 0;
                }
                bytes[byteIndex++] = 0x0d;
                bytes[byteIndex++] = 0x0a;
                continue;
            }
            if ((c & 0xfc00) == 0xd800 && index + 1 < endIndex
                    && (str.charAt(index + 1) & 0xfc00) == 0xdc00) {
                // High surrogate followed by a low surrogate: combine them
                // into the supplementary code point they represent
                c = 0x10000 + ((c - 0xd800) << 10)
                        + (str.charAt(index + 1) - 0xdc00);
                ++index;
            } else if ((c & 0xf800) == 0xd800) {
                // Unpaired surrogate
                if (!replace) {
                    retval = -1;
                    break; // the bytes encoded so far are written below
                }
                c = 0xfffd;
            }
            // Number of UTF-8 bytes needed for this code point
            int needed = c <= 0x7f ? 1 : (c <= 0x7ff ? 2 : (c <= 0xffff ? 3 : 4));
            if (byteIndex + needed > STREAMED_STRING_BUFFER_LENGTH) {
                // Write bytes retrieved so far
                stream.write(bytes, 0, byteIndex);
                byteIndex = 0;
            }
            if (needed == 1) {
                bytes[byteIndex++] = (byte) c;
            } else if (needed == 2) {
                bytes[byteIndex++] = (byte) (0xc0 | ((c >> 6) & 0x1f));
                bytes[byteIndex++] = (byte) (0x80 | (c & 0x3f));
            } else if (needed == 3) {
                bytes[byteIndex++] = (byte) (0xe0 | ((c >> 12) & 0x0f));
                bytes[byteIndex++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                bytes[byteIndex++] = (byte) (0x80 | (c & 0x3f));
            } else {
                bytes[byteIndex++] = (byte) (0xf0 | ((c >> 18) & 0x07));
                bytes[byteIndex++] = (byte) (0x80 | ((c >> 12) & 0x3f));
                bytes[byteIndex++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                bytes[byteIndex++] = (byte) (0x80 | (c & 0x3f));
            }
        }
        // Flush whatever remains in the buffer (possibly nothing)
        stream.write(bytes, 0, byteIndex);
        return retval;
    }

    /**
     * Writes a string in UTF-8 encoding to a data stream.
     *
     * @param str A string to write.
     * @param stream A writable data stream.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U+FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @return 0 if the entire string was written; or -1 if the string
     * contains an unpaired surrogate code point and {@code replace} is false.
     * @throws NullPointerException The parameter {@code str} is null or
     * {@code stream} is null.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(String str, OutputStream stream,
            boolean replace) throws java.io.IOException {
        if (str == null) {
            throw new NullPointerException("str");
        }
        return WriteUtf8(str, 0, str.length(), stream, replace);
    }
}