Encodes a string in UTF-8 as a byte array. - Java java.lang

Java examples for java.lang:String UTF

Description

Encodes a string in UTF-8 as a byte array.

Demo Code

/*
 Written in 2013 by Peter O.
 Any copyright is dedicated to the Public Domain.
 http://creativecommons.org/publicdomain/zero/1.0/
 If you like this, you should donate to Peter O.
 at: http://upokecenter.dreamhosters.com/articles/donate-now-2/
 */
//package com.java2s;
import java.io.*;

/**
 * Static helpers for encoding Java strings (UTF-16) as UTF-8, either into a
 * byte array or onto an {@link OutputStream}.
 */
public class Main {
    /** Size, in bytes, of the scratch buffer used to batch writes to the stream. */
    private static final int StreamedStringBufferLength = 4096;

    /**
     * Encodes a string in UTF-8 as a byte array.
     * @param str A text string.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U + FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @return The string encoded in UTF-8.
     * @throws NullPointerException The parameter {@code str} is null.
     * @throws IllegalArgumentException The string contains an unpaired surrogate code
     * point and {@code replace} is false, or an internal error occurred.
     */
    public static byte[] GetUtf8Bytes(String str, boolean replace) {
        if (str == null) {
            throw new NullPointerException("str");
        }
        // A ByteArrayOutputStream holds no system resources and its close()
        // has no effect, so no try/finally is needed around it.
        ByteArrayOutputStream ms = new ByteArrayOutputStream();
        try {
            if (WriteUtf8(str, ms, replace) != 0) {
                throw new IllegalArgumentException(
                        "Unpaired surrogate code point");
            }
        } catch (IOException ex) {
            // WriteUtf8 declares IOException; surface it unchecked, keeping the cause.
            throw new IllegalArgumentException("I/O error occurred", ex);
        }
        return ms.toByteArray();
    }

    /**
     * Writes a portion of a string in UTF-8 encoding to a data stream.
     * Line breaks are written unchanged.
     * @param str A string to write.
     * @param offset The zero-based index where the string portion to write begins.
     * @param length The length of the string portion to write.
     * @param stream A writable data stream.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U + FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @return 0 if the entire string portion was written; or -1 if the string
     * portion contains an unpaired surrogate code point and {@code replace}
     * is false.
     * @throws NullPointerException The parameter {@code str} is null or {@code
     * stream} is null.
     * @throws IllegalArgumentException The parameter {@code offset} is less than 0,
     * {@code length} is less than 0, or {@code offset} plus {@code length}
     * is greater than the string's length.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(String str, int offset, int length,
            OutputStream stream, boolean replace)
            throws java.io.IOException {
        return WriteUtf8(str, offset, length, stream, replace, false);
    }

    /**
     * Writes a portion of a string in UTF-8 encoding to a data stream.
     * @param str A string to write.
     * @param offset The zero-based index where the string portion to write begins.
     * @param length The length of the string portion to write.
     * @param stream A writable data stream.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U + FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @param lenientLineBreaks If true, replaces carriage return (CR) not followed
     * by line feed (LF) and LF not preceded by CR with CR-LF pairs. Existing
     * CR-LF pairs are written unchanged. Only characters inside the string
     * portion are examined when deciding whether a CR or LF is part of a pair.
     * @return 0 if the entire string portion was written; or -1 if the string
     * portion contains an unpaired surrogate code point and {@code replace}
     * is false. In the latter case the bytes encoded before the unpaired
     * surrogate are still written to the stream.
     * @throws NullPointerException The parameter {@code str} is null or {@code
     * stream} is null.
     * @throws IllegalArgumentException The parameter {@code offset} is less than 0,
     * {@code length} is less than 0, or {@code offset} plus {@code length}
     * is greater than the string's length.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(String str, int offset, int length,
            OutputStream stream, boolean replace, boolean lenientLineBreaks)
            throws java.io.IOException {
        if (stream == null) {
            throw new NullPointerException("stream");
        }
        if (str == null) {
            throw new NullPointerException("str");
        }
        if (offset < 0) {
            throw new IllegalArgumentException("offset (" + offset
                    + ") is less than " + "0");
        }
        if (offset > str.length()) {
            throw new IllegalArgumentException("offset (" + offset
                    + ") is more than " + str.length());
        }
        if (length < 0) {
            throw new IllegalArgumentException("length (" + length
                    + ") is less than " + "0");
        }
        if (length > str.length()) {
            throw new IllegalArgumentException("length (" + length
                    + ") is more than " + str.length());
        }
        // Written as a subtraction so that offset + length cannot overflow.
        if (str.length() - offset < length) {
            throw new IllegalArgumentException(
                    "str.length() minus offset (" + (str.length() - offset)
                            + ") is less than " + length);
        }
        int retval = 0;
        byte[] bytes = new byte[StreamedStringBufferLength];
        int byteIndex = 0;
        int endIndex = offset + length;
        for (int index = offset; index < endIndex; ++index) {
            int c = str.charAt(index);
            if (c <= 0x7f) {
                if (lenientLineBreaks && (c == 0x0d || c == 0x0a)) {
                    // Every line break -- bare CR, bare LF, or an existing
                    // CR-LF pair -- is emitted as exactly one CR-LF.
                    if (c == 0x0d && index + 1 < endIndex
                            && str.charAt(index + 1) == 0x0a) {
                        // Existing CR-LF pair: consume the LF as well so it
                        // is not expanded into a second CR-LF on the next
                        // iteration.
                        ++index;
                    }
                    if (byteIndex + 2 > StreamedStringBufferLength) {
                        // Write bytes retrieved so far
                        stream.write(bytes, 0, byteIndex);
                        byteIndex = 0;
                    }
                    bytes[byteIndex++] = 0x0d;
                    bytes[byteIndex++] = 0x0a;
                    continue;
                }
                if (byteIndex >= StreamedStringBufferLength) {
                    // Write bytes retrieved so far
                    stream.write(bytes, 0, byteIndex);
                    byteIndex = 0;
                }
                bytes[byteIndex++] = (byte) c;
            } else if (c <= 0x7ff) {
                // Two-byte sequence: 110xxxxx 10xxxxxx
                if (byteIndex + 2 > StreamedStringBufferLength) {
                    // Write bytes retrieved so far
                    stream.write(bytes, 0, byteIndex);
                    byteIndex = 0;
                }
                bytes[byteIndex++] = (byte) (0xc0 | ((c >> 6) & 0x1f));
                bytes[byteIndex++] = (byte) (0x80 | (c & 0x3f));
            } else {
                if ((c & 0xfc00) == 0xd800 && index + 1 < endIndex
                        && str.charAt(index + 1) >= 0xdc00
                        && str.charAt(index + 1) <= 0xdfff) {
                    // Get the Unicode code point for the surrogate pair
                    c = 0x10000 + ((c - 0xd800) << 10)
                            + (str.charAt(index + 1) - 0xdc00);
                    ++index;
                } else if ((c & 0xf800) == 0xd800) {
                    // unpaired surrogate (high without low, or a lone low)
                    if (!replace) {
                        retval = -1;
                        break; // write bytes read so far
                    }
                    c = 0xfffd;
                }
                if (c <= 0xffff) {
                    // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
                    if (byteIndex + 3 > StreamedStringBufferLength) {
                        // Write bytes retrieved so far
                        stream.write(bytes, 0, byteIndex);
                        byteIndex = 0;
                    }
                    bytes[byteIndex++] = (byte) (0xe0 | ((c >> 12) & 0x0f));
                    bytes[byteIndex++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                    bytes[byteIndex++] = (byte) (0x80 | (c & 0x3f));
                } else {
                    // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    if (byteIndex + 4 > StreamedStringBufferLength) {
                        // Write bytes retrieved so far
                        stream.write(bytes, 0, byteIndex);
                        byteIndex = 0;
                    }
                    bytes[byteIndex++] = (byte) (0xf0 | ((c >> 18) & 0x07));
                    bytes[byteIndex++] = (byte) (0x80 | ((c >> 12) & 0x3f));
                    bytes[byteIndex++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                    bytes[byteIndex++] = (byte) (0x80 | (c & 0x3f));
                }
            }
        }
        // Flush whatever remains in the scratch buffer (possibly nothing).
        stream.write(bytes, 0, byteIndex);
        return retval;
    }

    /**
     * Writes a string in UTF-8 encoding to a data stream.
     * @param str A string to write.
     * @param stream A writable data stream.
     * @param replace If true, replaces unpaired surrogate code points with the
     * replacement character (U + FFFD). If false, stops processing when an
     * unpaired surrogate code point is seen.
     * @return 0 if the entire string was written; or -1 if the string contains an
     * unpaired surrogate code point and {@code replace} is false.
     * @throws NullPointerException The parameter {@code str} is null or {@code
     * stream} is null.
     * @throws java.io.IOException An I/O error occurred.
     */
    public static int WriteUtf8(String str, OutputStream stream,
            boolean replace) throws java.io.IOException {
        if (str == null) {
            throw new NullPointerException("str");
        }
        return WriteUtf8(str, 0, str.length(), stream, replace);
    }
}

Related Tutorials