List of usage examples for the java.nio.charset.MalformedInputException constructor MalformedInputException
public MalformedInputException(int inputLength)
From source file:org.apache.arrow.vector.util.Text.java
/** * Check to see if a byte array is valid utf-8 * * @param utf8/*from www.j a v a 2s . c o m*/ * the array of bytes * @param start * the offset of the first byte in the array * @param len * the length of the byte sequence * @throws MalformedInputException * if the byte array contains invalid bytes */ public static void validateUTF8(byte[] utf8, int start, int len) throws MalformedInputException { int count = start; int leadByte = 0; int length = 0; int state = LEAD_BYTE; while (count < start + len) { int aByte = utf8[count] & 0xFF; switch (state) { case LEAD_BYTE: leadByte = aByte; length = bytesFromUTF8[aByte]; switch (length) { case 0: // check for ASCII if (leadByte > 0x7F) { throw new MalformedInputException(count); } break; case 1: if (leadByte < 0xC2 || leadByte > 0xDF) { throw new MalformedInputException(count); } state = TRAIL_BYTE_1; break; case 2: if (leadByte < 0xE0 || leadByte > 0xEF) { throw new MalformedInputException(count); } state = TRAIL_BYTE_1; break; case 3: if (leadByte < 0xF0 || leadByte > 0xF4) { throw new MalformedInputException(count); } state = TRAIL_BYTE_1; break; default: // too long! Longest valid UTF-8 is 4 bytes (lead + three) // or if < 0 we got a trail byte in the lead byte position throw new MalformedInputException(count); } // switch (length) break; case TRAIL_BYTE_1: if (leadByte == 0xF0 && aByte < 0x90) { throw new MalformedInputException(count); } if (leadByte == 0xF4 && aByte > 0x8F) { throw new MalformedInputException(count); } if (leadByte == 0xE0 && aByte < 0xA0) { throw new MalformedInputException(count); } if (leadByte == 0xED && aByte > 0x9F) { throw new MalformedInputException(count); } // falls through to regular trail-byte test!! case TRAIL_BYTE: if (aByte < 0x80 || aByte > 0xBF) { throw new MalformedInputException(count); } if (--length == 0) { state = LEAD_BYTE; } else { state = TRAIL_BYTE; } break; default: break; } // switch (state) count++; } }
From source file:io.Text.java
/** * Check to see if a byte array is valid utf-8 * @param utf8 the array of bytes//from w w w.j av a2 s . c om * @param start the offset of the first byte in the array * @param len the length of the byte sequence * @throws MalformedInputException if the byte array contains invalid bytes */ public static void validateUTF8(byte[] utf8, int start, int len) throws MalformedInputException { int count = start; int leadByte = 0; int length = 0; int state = LEAD_BYTE; while (count < start + len) { int aByte = ((int) utf8[count] & 0xFF); switch (state) { case LEAD_BYTE: leadByte = aByte; length = bytesFromUTF8[aByte]; switch (length) { case 0: // check for ASCII if (leadByte > 0x7F) throw new MalformedInputException(count); break; case 1: if (leadByte < 0xC2 || leadByte > 0xDF) throw new MalformedInputException(count); state = TRAIL_BYTE_1; break; case 2: if (leadByte < 0xE0 || leadByte > 0xEF) throw new MalformedInputException(count); state = TRAIL_BYTE_1; break; case 3: if (leadByte < 0xF0 || leadByte > 0xF4) throw new MalformedInputException(count); state = TRAIL_BYTE_1; break; default: // too long! Longest valid UTF-8 is 4 bytes (lead + three) // or if < 0 we got a trail byte in the lead byte position throw new MalformedInputException(count); } // switch (length) break; case TRAIL_BYTE_1: if (leadByte == 0xF0 && aByte < 0x90) throw new MalformedInputException(count); if (leadByte == 0xF4 && aByte > 0x8F) throw new MalformedInputException(count); if (leadByte == 0xE0 && aByte < 0xA0) throw new MalformedInputException(count); if (leadByte == 0xED && aByte > 0x9F) throw new MalformedInputException(count); // falls through to regular trail-byte test!! case TRAIL_BYTE: if (aByte < 0x80 || aByte > 0xBF) throw new MalformedInputException(count); if (--length == 0) { state = LEAD_BYTE; } else { state = TRAIL_BYTE; } break; } // switch (state) count++; } }