Java tutorial
/** * Licensed to Cloudera, Inc. under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. Cloudera, Inc. licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.cloudera.sqoop.lib; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.Text; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.util.ArrayList; import java.util.List; /** * Parses a record containing one or more fields. Fields are separated * by some FIELD_DELIMITER character, e.g. a comma or a ^A character. * Records are terminated by a RECORD_DELIMITER character, e.g., a newline. * * Fields may be (optionally or mandatorily) enclosed by a quoting char * e.g., '\"' * * Fields may contain escaped characters. An escape character may be, e.g., * the '\\' character. Any character following an escape character * is treated literally. e.g., '\n' is recorded as an 'n' character, not a * newline. * * Unexpected results may occur if the enclosing character escapes itself. * e.g., this cannot parse SQL SELECT statements where the single character * ['] escapes to ['']. * * This class is not synchronized. Multiple threads must use separate * instances of RecordParser. * * The fields parsed by RecordParser are backed by an internal buffer * which is cleared when the next call to parseRecord() is made. If * the buffer is required to be preserved, you must copy it yourself. */ public final class RecordParser { public static final Log LOG = LogFactory.getLog(RecordParser.class.getName()); private enum ParseState { FIELD_START, ENCLOSED_FIELD, UNENCLOSED_FIELD, ENCLOSED_ESCAPE, ENCLOSED_EXPECT_DELIMITER, UNENCLOSED_ESCAPE } /** * An error thrown when parsing fails. */ public static class ParseError extends Exception { public ParseError() { super("ParseError"); } public ParseError(final String msg) { super(msg); } public ParseError(final String msg, final Throwable cause) { super(msg, cause); } public ParseError(final Throwable cause) { super(cause); } } private DelimiterSet delimiters; private ArrayList<String> outputs; public RecordParser(final DelimiterSet delimitersIn) { this.delimiters = delimitersIn.copy(); this.outputs = new ArrayList<String>(); } /** * Return a list of strings representing the fields of the input line. * This list is backed by an internal buffer which is cleared by the * next call to parseRecord(). */ public List<String> parseRecord(CharSequence input) throws ParseError { if (null == input) { throw new ParseError("null input string"); } return parseRecord(CharBuffer.wrap(input)); } /** * Return a list of strings representing the fields of the input line. * This list is backed by an internal buffer which is cleared by the * next call to parseRecord(). */ public List<String> parseRecord(Text input) throws ParseError { if (null == input) { throw new ParseError("null input string"); } // TODO(aaron): The parser should be able to handle UTF-8 strings // as well, to avoid this transcode operation. return parseRecord(input.toString()); } /** * Return a list of strings representing the fields of the input line. * This list is backed by an internal buffer which is cleared by the * next call to parseRecord(). */ public List<String> parseRecord(byte[] input) throws ParseError { if (null == input) { throw new ParseError("null input string"); } return parseRecord(ByteBuffer.wrap(input).asCharBuffer()); } /** * Return a list of strings representing the fields of the input line. * This list is backed by an internal buffer which is cleared by the * next call to parseRecord(). */ public List<String> parseRecord(char[] input) throws ParseError { if (null == input) { throw new ParseError("null input string"); } return parseRecord(CharBuffer.wrap(input)); } public List<String> parseRecord(ByteBuffer input) throws ParseError { if (null == input) { throw new ParseError("null input string"); } return parseRecord(input.asCharBuffer()); } // TODO(aaron): Refactor this method to be much shorter. // CHECKSTYLE:OFF /** * Return a list of strings representing the fields of the input line. * This list is backed by an internal buffer which is cleared by the * next call to parseRecord(). */ public List<String> parseRecord(CharBuffer input) throws ParseError { if (null == input) { throw new ParseError("null input string"); } /* This method implements the following state machine to perform parsing. Note that there are no restrictions on whether particular characters (e.g., field-sep, record-sep, etc) are distinct or the same. The state transitions are processed in the order seen in this comment. Starting state is FIELD_START encloser -> ENCLOSED_FIELD escape char -> UNENCLOSED_ESCAPE field delim -> FIELD_START (for a new field) record delim -> stops processing all other letters get added to current field, -> UNENCLOSED FIELD ENCLOSED_FIELD state: escape char goes to ENCLOSED_ESCAPE encloser goes to ENCLOSED_EXPECT_DELIMITER field sep or record sep gets added to the current string normal letters get added to the current string ENCLOSED_ESCAPE state: any character seen here is added literally, back to ENCLOSED_FIELD ENCLOSED_EXPECT_DELIMITER state: field sep goes to FIELD_START record sep halts processing. all other characters are errors. UNENCLOSED_FIELD state: ESCAPE char goes to UNENCLOSED_ESCAPE FIELD_SEP char goes to FIELD_START RECORD_SEP char halts processing normal chars or the enclosing char get added to the current string UNENCLOSED_ESCAPE: add charater literal to current string, return to UNENCLOSED_FIELD */ char curChar = DelimiterSet.NULL_CHAR; ParseState state = ParseState.FIELD_START; int len = input.length(); StringBuilder sb = null; outputs.clear(); char enclosingChar = delimiters.getEnclosedBy(); char fieldDelim = delimiters.getFieldsTerminatedBy(); char recordDelim = delimiters.getLinesTerminatedBy(); char escapeChar = delimiters.getEscapedBy(); boolean enclosingRequired = delimiters.isEncloseRequired(); for (int pos = 0; pos < len; pos++) { curChar = input.get(); switch (state) { case FIELD_START: // ready to start processing a new field. if (null != sb) { // We finished processing a previous field. Add to the list. outputs.add(sb.toString()); } sb = new StringBuilder(); if (enclosingChar == curChar) { // got an opening encloser. state = ParseState.ENCLOSED_FIELD; } else if (escapeChar == curChar) { state = ParseState.UNENCLOSED_ESCAPE; } else if (fieldDelim == curChar) { // we have a zero-length field. This is a no-op. continue; } else if (recordDelim == curChar) { // we have a zero-length field, that ends processing. pos = len; } else { // current char is part of the field. state = ParseState.UNENCLOSED_FIELD; sb.append(curChar); if (enclosingRequired) { throw new ParseError("Opening field-encloser expected at position " + pos); } } break; case ENCLOSED_FIELD: if (escapeChar == curChar) { // the next character is escaped. Treat it literally. state = ParseState.ENCLOSED_ESCAPE; } else if (enclosingChar == curChar) { // we're at the end of the enclosing field. Expect an EOF or EOR char. state = ParseState.ENCLOSED_EXPECT_DELIMITER; } else { // this is a regular char, or an EOF / EOR inside an encloser. Add to // the current field string, and remain in this state. sb.append(curChar); } break; case UNENCLOSED_FIELD: if (escapeChar == curChar) { // the next character is escaped. Treat it literally. state = ParseState.UNENCLOSED_ESCAPE; } else if (fieldDelim == curChar) { // we're at the end of this field; may be the start of another one. state = ParseState.FIELD_START; } else if (recordDelim == curChar) { pos = len; // terminate processing immediately. } else { // this is a regular char. Add to the current field string, // and remain in this state. sb.append(curChar); } break; case ENCLOSED_ESCAPE: // Treat this character literally, whatever it is, and return to // enclosed field processing. sb.append(curChar); state = ParseState.ENCLOSED_FIELD; break; case ENCLOSED_EXPECT_DELIMITER: // We were in an enclosed field, but got the final encloser. Now we // expect either an end-of-field or an end-of-record. if (fieldDelim == curChar) { // end of one field is the beginning of the next. state = ParseState.FIELD_START; } else if (recordDelim == curChar) { // stop processing. pos = len; } else { // Don't know what to do with this character. throw new ParseError("Expected delimiter at position " + pos); } break; case UNENCLOSED_ESCAPE: // Treat this character literally, whatever it is, and return to // non-enclosed field processing. sb.append(curChar); state = ParseState.UNENCLOSED_FIELD; break; default: throw new ParseError("Unexpected parser state: " + state); } } if (state == ParseState.FIELD_START && curChar == fieldDelim) { // we hit an EOF/EOR as the last legal character and we need to mark // that string as recorded. This if block is outside the for-loop since // we don't have a physical 'epsilon' token in our string. if (null != sb) { outputs.add(sb.toString()); sb = new StringBuilder(); } } if (null != sb) { // There was a field that terminated by running out of chars or an EOR // character. Add to the list. outputs.add(sb.toString()); } return outputs; } // CHECKSTYLE:ON public boolean isEnclosingRequired() { return delimiters.isEncloseRequired(); } @Override public String toString() { return "RecordParser[" + delimiters.toString() + "]"; } @Override public int hashCode() { return this.delimiters.hashCode(); } }