Java tutorial
/** * Copyright 2012 Nikita Koksharov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.corundumstudio.socketio.parser; import io.netty.buffer.ByteBuf; public class UTF8CharsScanner { /** * Lookup table used for determining which input characters need special * handling when contained in text segment. */ final static int[] sInputCodes; static { /* * 96 would do for most cases (backslash is ascii 94) but if we want to * do lookups by raw bytes it's better to have full table */ int[] table = new int[256]; // Control chars and non-space white space are not allowed unquoted for (int i = 0; i < 32; ++i) { table[i] = -1; } // And then string end and quote markers are special too table['"'] = 1; table['\\'] = 1; sInputCodes = table; } /** * Additionally we can combine UTF-8 decoding info into similar data table. */ final static int[] sInputCodesUtf8; static { int[] table = new int[sInputCodes.length]; System.arraycopy(sInputCodes, 0, table, 0, sInputCodes.length); for (int c = 128; c < 256; ++c) { int code; // We'll add number of bytes needed for decoding if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) code = 2; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) code = 3; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... code = 4; } else { // And -1 seems like a good "universal" error marker... code = -1; } table[c] = code; } sInputCodesUtf8 = table; } private int getCharTailIndex(ByteBuf inputBuffer, int i) { int c = (int) inputBuffer.getByte(i) & 0xFF; switch (sInputCodesUtf8[c]) { case 2: // 2-byte UTF i += 2; break; case 3: // 3-byte UTF i += 3; break; case 4: // 4-byte UTF i += 4; break; default: i++; break; } return i; } public int getLength(ByteBuf inputBuffer, int start) { int len = 0; for (int i = start; i < inputBuffer.writerIndex();) { i = getCharTailIndex(inputBuffer, i); len++; } return len; } public int findTailIndex(ByteBuf inputBuffer, int start, int end, int charsToRead) { int len = 0; int i = start; while (i < end) { i = getCharTailIndex(inputBuffer, i); len++; if (charsToRead == len) { break; } } return i; } }