Android Open Source - BestBoard Tokenizer

From Project

Back to project page BestBoard.
License

The source code is released under:
MIT License
If you think the Android project BestBoard listed in this page is inappropriate, such as containing malicious code/tools or violating the copyright, please email info at java2s dot com, thanks.
Java Source Code

package digitalgarden.magicmerlin.utils;
/* ww w .j  a  va 2 s .c  om*/
import java.io.IOException;
import java.io.Reader;

import digitalgarden.magicmerlin.scribe.Scribe;

/**
 * ReaderTokenizer identifies keywords and data from a reader stream.
 * Keywords can contain only ASCII (<128) characters (letters, digits and underscore), 
 * while full BMP Unicode set is allowed in string and character constans.
 * Tokens should be surrounded by white-spaces (Currently ascii chars below space, ',' and '='). 
 * Recognition will skip to the next white-space from the first not-known character.
 * <p> 
 * Tokens can be: 
 * <ul>
 * <li>keywords - letters, digits and '_' allowed.
 * Keywords are converted to lowercase, and cannot begin with number.</li>
 * <li>"string" - All BMP characters, escape sequences are allowed.</li>
 * <li>'c' - One character. All BMP characters, escape sequences are allowed.</li>
 * <li>-+integers - negative or positive sign (on the first position) and 0-9 are allowed.</li>
 * <li>-+0xhexadecimals - same as integers, but A-F letters are allowed.</li>
 * <li>0ccolors - same as integers, but in aarrggbb/rrggbb/argb/rgb format, 
 * with hexadecimal digits. Default 'aa/a' is 0xFF. One or two digits will be interpreted as grayscale.</li>
 * <li>-+decimal.fraction - same as integers (and one decimal point)</li>
 * <li>EOL, EOF - end of line, end of file are treated as tokens.</li>
 * <li># - toggles EOL recognition. Between two # all the lines will be treated as one line.</li>
 * <li>;notes - all characters (till EOL) are ignored after the note sign.</li>
 * </ul>
 * <p>  
 * Escape sequence can be used instead of any character. 
 * Not of the all escape sequences are implemented yet.
 * <ul>
 * <li> '\n' - New Line/Line Feed (0x0A) </li>
 * <li> '\r' - Carriage Return (0x0D) </li>
 * <li> '\t' - (Horizontal) Tab (0x09) </li>
 * <li> '\\' - Backslash (0x5C) </li>
 * <li> '\'' - Single quotation mark (0x27) </li>
 * <li> '\"' - Double quotation mark (0x22)</li>
 * <li> '\HHHH' - Four digit hexadecimal unicode</li>
 * </ul>
 * <p>
 * nextToken() will recognize the next token and return the token type.
 * getStringToken(), getIntegerToken, getDoubleToken can be used to retrieve the parameters:
 * <ul>
 * <li> TYPE_KEYWORD - string representation and generated token code </li>
 * <li> TYPE_STRING - string constant without double quation marks </li>
 * <li> TYPE_CHARACTER - character without single  quation marks as string and unicode value </li>
 * <li> TYPE_INTEGER - long precision integer and original number as string </li>
 * <li> TYPE_FRACTION - double precision fraction and original number as string </li>
 * <li> TYPE_EOL and TYPE_EOF - are standalone tokens, but they will return only type </li>
 * </ul>
 * <p>
 * Read errors will throw exception, but recognition mistakes will not stop the process.
 * A message log will be generated in case of any mistakes. 
 */
public class Tokenizer
  {
  /**
   ** TOKEN TYPES
   **/

  /** INTERNAL: Type not determined yet - this type will never returned. */
  private static final int TYPE_UNKNOWN = 0;

  /** INTERNAL: Hexadecimal number - TYPE_INTEGER will be returned. */
  private static final int TYPE_HEXADECIMAL = 6;

  /** INTERNAL: Color value - TYPE_INTEGER will be returned. */
  private static final int TYPE_COLOR = 7;

  /** End of reader stream token */
  public static final int TYPE_EOF = -1;

  /** End of line token */
  public static final int TYPE_EOL = -2;
  
  /** Keyword token, string representation and generated code are returned */
  public static final int TYPE_KEYWORD = 1;

  /** String constant token between double quation marks, only string representation is returned */
  public static final int TYPE_STRING = 2;

  /** Character constant token, character as string, and unicode value as integer will be returned */
  public static final int TYPE_CHARACTER = 3;

  /** Integer number. Long precision, and the original number as string will be returned */
  public static final int TYPE_INTEGER = 4;

  /** Decimal fraction. Double precision, and the original number as string will be returned */
  public static final int TYPE_FRACTION = 5;

  
  /**
   ** CONSTRUCTOR - SETTING UP READER SOURCE
   **/
  
  /** Reader stream source. Only read() method is used. */
  private Reader reader;

  /**
   * Constructs a new {@code ReaderTokenizer} with {@code reader} as source reader. 
   * @param reader
   * @throws IOException 
     * @throws NullPointerException if {@code reader} is {@code null}.
   */
  public Tokenizer( Reader reader ) throws IOException
    {
    if ( reader == null)
      throw new NullPointerException("Reader parameter is null!");
    this.reader = reader;
    
    // Check for BOM
    if ( read() == 0xFEFF )
      Scribe.note("BOM character detected, skipped.");
    else
      pushBackLastRead();
    }
  
  
  /**
   ** PARAMETERS OF CURRENT TOKEN
   **/
  
  /** Type of the token */
  private int tokenType;
  
  /** 
   * String value of the token
   * TYPE_OEF, TYPE_EOL - Empty string; 
   * TYPE_KEYWORD - lowercase token;
   * TYPE_STRING - string without quation marks;
   * TYPE_CHARACTER - one character long string;
   * TYPE_INTEGER, TYPE_FRACTON - original number as string;
   */
  private StringBuilder tokenStringBuilder = new StringBuilder();
  
  /** 
   * Integer value of the token
   * TYPE_INTEGER - absolute (always positive) value, tokenMinusSign marks negative values
   * TYPE_CHARACTER - unicode value (always between 0x0000-0xFFFF)
   * TYPE_KEYWORD - token code created from the first dozen characters
   * TYPE_DOUBLE - absolute integer created without the decimal sign
   * all other types - null value;
   */
  private long tokenInteger;
  
  /**
   * Divider part of the double token (fraction)
   * TYPE_FRACTION - Divider part for tokenFraction
   * all other types - null value;
   */
  private long tokenDivider;

  /** True if the numeric value (tokenInteger or tokenDouble) is negative. */
  private boolean tokenMinusSign;
  
  /** 
   * No more digits can be processed for numbers (also parts of fractions).
   * Result is unpredictable above it, log message will be sent.
   */
  public final static int MAX_DECIMAL_DIGITS = 18;  

  /** 
   * No more digits can be processed for hexadecimal numbers.
   * Result is unpredictable above it, log message will be sent.
   */
  public final static int MAX_HEXADECIMAL_DIGITS = 15;  

  /** 
   * No more characters can be processed for keywords.
   * Result is unpredictable above it, log message will be sent.
   */
  public final static int MAX_KEYWORD_CHARACTERS = 12;  

  
  /**
   ** GETTERS FOR TOKEN PARAMETERS
   **/
  
  /**
   * Get string representation of current token
   * @return string value as:
   * TYPE_OEF, TYPE_EOL - Empty string; 
   * TYPE_KEYWORD - lowercase token;
   * TYPE_STRING - string without quation marks;
   * TYPE_CHARACTER - one character long string without quation marks;
   * TYPE_INTEGER, TYPE_FRACTON - original number as string;
   */
  public String getStringToken()
    {
    return tokenStringBuilder.toString();
    }
  
  /**
   * Get integer representation of current token
   * @return integer value as:
   * TYPE_INTEGER - integer value (can be negative)
   * TYPE_FRACTON - integer value created without the decimal point (can be negative)
   * TYPE_CHARACTER - unicode value (always between 0x0000-0xFFFF)
   * TYPE_KEYWORD - token code created from the first dozen characters
   * all other types - null value;
   */
  public long getIntegerToken()
    {
    return tokenMinusSign ? -tokenInteger : tokenInteger;
    }
  
  /**
   * Get divider integer for calculating double values. 
   * Double value = getInteger()/getIntegerDivider().
   * This is experimental, later can be used for returning not only decimal fractions.
   * @return integer value as:
   * TYPE_FRACTON - integer divider
   * all other types - null value;
   */
  public long getIntegerDividerToken()
    {
    return tokenDivider;
    }

  /**
   * Get double precision floating point representation of current token (fractions)
   * !! This is just an experimental algorithm to get fractions. 
   * As a side effect all integer returning types will give the same integer here, as double.
   * Java's parseDouble may work better (and quicker)
   * @return double value as:
   * TYPE_INTEGER, TYPE_FRACTION - double value (can be negative)
   * TYPE_CHARACTER - unicode value (always between 0x0000-0xFFFF) as double
   * TYPE_KEYWORD - token code created from the first dozen characters as double
   * all other types - null value;
   */
  public double getDoubleToken()
    {
    double result = (double)tokenInteger / tokenDivider;
    if ( tokenMinusSign && tokenInteger != 0 )
      return -result;
    return result;
    }
  
  /**
   * Returns true if this is an EOL or EOF token.
   */
  public boolean isEndOfSectionToken()
    {
    return tokenType == TYPE_EOL || tokenType == TYPE_EOF;
    }

  
  /**
   ** LETTER AND DIGIT VALIDATIONS
   **/

  /** End-of-file character */
  public static final int EOF = -1;

  /** End-of-line character */
  public static final int EOL = '\n';

  /** End-of-line recognition toggle */
  public static final int EOL_TOGGLE = '#';

  /** Note character - line will be ignored after note */
  public static final int MARK_NOTE = ';';
  
  /** Decimal fraction character */
  public static final int MARK_FRACTION = '.';
  
  /** Character (single) quote mark character */
  public static final int MARK_STRING = '\"';

  /** String (double) quote mark character */
  public static final int MARK_CHARACTER = '\'';
  
  /** Minus sign character */
  public static final int MARK_MINUS = '-';
  
  /** Plus sign character */
  public static final int MARK_PLUS = '+';

  /** Escape sequence comming */
  public static final int MARK_ESCAPE = '\\';

  
  /**
   * True if character is a decimal digit
   */
  private boolean isValidDecimalDigit( int ch )
    {
    if ( ch >= '0' && ch <= '9' )
      return true;
    return false;
    }
  
  /**
   * True if character is a hexadecimal digit
   */
  private boolean isValidHexadecimalDigit( int ch )
    {
    if ( ch >= '0' && ch <= '9' )
      return true;
    if ( ch >= 'a' && ch <= 'f' )
      return true;
    if ( ch >= 'A' && ch <= 'F' )
      return true;
    return false;
    }
  
  /**
   * True if character is an ASCII non-accented letter
   */
  private boolean isValidAsciiLetter( int ch )
    {
    if ( ch >= 'a' && ch <= 'z' )
      return true;
    if ( ch >= 'A' && ch <= 'Z' )
      return true;
    return false;
    }
  
  /**
   * Converts non-accented ascii letters to lowercase 
   */
  private int toLowerCaseAsciiLetter( int ch )
    {
    if ( ch >= 'A' && ch <= 'Z' )
      return ch + 'a' - 'A';
    return ch;    
    }
  
  /**
   * True if character is a valid white-space
   * EOL and EOF are special white-spaces
   * !! Special mark characters cannot be used as white spaces !!
   */
  private boolean isValidWhiteSpace( int ch )
    {
    if ( ch <= ' ' || ch == ',' || ch == '=' )
      return true;
    
    // NOTE is not a real white space, but identifieing it as a white space stops
    // any skip of non-white-spaces
    // That means: no white-spaces are needed before the MARK_NOTE, 
    // Therefore MARK_NOTE should be evaluated BEFORE white-spaces
    if ( ch == MARK_NOTE )
      return true;
    return false;
    }
  
  /**
   * True if character is accepted in keywords
   */
  private boolean isValidKeyword( int ch )
    {
    if ( isValidAsciiLetter(ch) )
      return true;
    if ( isValidDecimalDigit(ch) )
      return true;
    if ( ch == '_' )
      return true;
    return false;
    }
  
  /** Number of characters used for keywords */
  private final static int TOKEN_CODE_RADIX = 'Z' - 'A' + 12;

  /**
   * Returns the numeric value of the (hexa)decimal character.
   * Result is inpredictable if character is not valid!
   * This method is used for creating special token codes for keywords. 
   * In this case -'z' and '_' are allowed.
   * @param ch (Hexa)decimal character '0'-'9' or 'a'-'f' or 'A'-'F'. 
   * Also 'G'-'Z' and '_' for keywords.
   * @return decimal value (0-15)
   */
  private int valueOf( int ch )
    {
    if ( ch <= '9' ) // && >='0'
      return ch-'0';

    if ( ch <= 'Z' ) // && ch >= 'A'
      return ch-'A'+10;

    if ( ch == '_' )
      return TOKEN_CODE_RADIX -1;
    
    // 'a' <= ch <= 'z'
    return ch-'a'+10;
    }
  
  /**
   * Character values as string. Used in error messages.
   * [char] (unicode) format. [char] is not displayed for characters below ASCII space.  
   * @param ch character
   * @return character printable form and unicode value
   */
  private String getCharacterDescription( int ch )
    {
    return (( ch >= ' ' ) ? "[" + (char)ch + "] " : "")  + "(" + (int)ch + ")";
    }
  
  
  /**
   ** READER STREAM READING FUNCTIONS
   **/
  
  /** Temporary storage for last read */
  private int lastRead = 0;
  
  /** read() will not read a new character, but will give back the last one */
  private boolean rewindLastRead = false;
  
  /**
   * Reads a new character from reader stream. 
   * If rewindLastRead == true, then the read will return last character instead of a new one
   * @return Character from the reader stream
   * @throws IOException if reading fails
   */
  private int read() throws IOException
    {
    if ( rewindLastRead )
      rewindLastRead = false;
    else
      lastRead = reader.read();
    return lastRead;
    }
  
  /**
   * Push back last character. Next read() will give the last character once more
   */
  private void pushBackLastRead()
    {
    rewindLastRead = true;
    }
  
  /**
   * Skips non-white-space characters in reader stream
   * @throws IOException if reading fails
   */
  private void findNextWhiteSpace() throws IOException
    {
    while ( !isValidWhiteSpace( read() ) );
    pushBackLastRead();
    }
  
  /**
   * Skips to the end of the textual line (or EOF) in reader stream. 
   * It will forward to next EOL/EOF character, so ignoreEOL is NOT respected.
   * The method will stop at the end of the textual line.
   * nextToken() (which respects ignoreEOL) will give back EOL or 
   * first token of the next line if depending from the state of ignoreEOL.
   * This method is only used for notes.
   * Notes will not terminate the line of tokens, only textual lines
   * therefore multiple lines (signed by #) can contain notes at the end of any line.
   * @throws IOException if reading fails
   */
  private void findNextEOL() throws IOException
    {
    int ch;
    while ( (ch = read()) != EOL && ch != EOF );
    pushBackLastRead();
    }
  
  /**
   * Skips the whole remaining line. It respects ignoreEOL, so next valid EOL also will be skipped.
   * Reads tokens, till the last read token will be an EOL/EOF. 
   * Next token will be the first token of the next line (or EOF once more).
   * @throws IOException if reading fails
   */
  public void skipThisLine() throws IOException
    {
    do   {
      nextToken();
      } while ( !isEndOfSectionToken() );  
    }

  
  /**
   ** ERROR HANDLING
   **/
  
  /** Line number. Needed for log messages */
  private int lineNumber = 1;
  
  /** Count format errors */
  private int errorCount = 0;

  /**
   * Internal error handling - all error messages come through this method.
   * It is easier to send error messages to a common output, and to count format mistakes.
   * @param message error message, will be completed with line number
   */
  private void error( String message )
    {
    errorCount++;
    Scribe.error( message + " (line: " + getLineNumber() + ") ");
    }
  
  /** Returns the number of the actual line. Helps to locate errors. */
  public int getLineNumber()
    {
    return lineNumber;
    }

  /** Returns the number of errors. 0 == no errors, correctly formatted input. */ 
  public int getErrorCount()
    {
    return errorCount;
    }
  
  
  /**
   ** HEART OF THE CLASS: TOKENIZER PART
   **/
  
  /**
   * Converts escape sequences into characters.
   * Sequences are read from reader stream, after the escape sign ('\').
   * \n, \r, \t and \\ (MARK_ESCAPE), \" (MARK_STRING), \'(MARK_CHARACTER) are identified.
   * Unknown sequnces's second caharcter will be treated as normal.
   * Or: FOUR hexadecimal digits will be converted to unicode. (\HHHH)
   * Not four digit long sequnces will be skipped.
   * @return converted character
   * @throws IOException if reading fails.
   */
  private int convertEscapeSequence() throws IOException
    {
    int ch;
    
    ch = read();
    // some of the conventional escape sequences for formatting 
    if ( ch == 'n' )  return '\n';
    if ( ch == 'r' )  return '\r';
    if ( ch == 't' )  return '\t';
    // These marks could not be used without escaped sequences
    if ( ch == MARK_ESCAPE )  return MARK_ESCAPE;
    if ( ch == MARK_STRING )  return MARK_STRING;
    if ( ch == MARK_CHARACTER )  return MARK_CHARACTER;

    // Four digit hexadecimal code identifies unicode character
    int value = 0;
    int len = 1;
    while (true)
      {
      if ( isValidHexadecimalDigit(ch) )
        {
        value*=0x10;
        value+= valueOf(ch);
        }
      else // Non-valid character was found
        {
        if ( len==1 )
          {
          // This is not a malformed hexadecimal value, rather an unknown escape sequence. Returned as normal (non-escaped) character.
          error( "Unknown escape sequence. " + getCharacterDescription(ch) + " part will be used as a normal charcter.");
          return ch;
          }
        else
          {
          // This is not a four-digit hexadecimal code.
          // Try to return shorter code.
          error( "Malformed escape sequence was found: " + getCharacterDescription( value ));
          pushBackLastRead();
          break;
          }
        }
      
      if ( len >= 4)
        break;
      
      ch = read();
      len++;
      }

    return value;
    }

  /** nextToken() will not read a new token, but will give back the last one if true */
  private boolean rewindLastToken = false;

  /**
   * Push back last token. Next call on nexToken() will give the last token once more
   */
  public void pushBackLastToken()
    {
    rewindLastToken = true;
    }

  /** EOL will be recognized as a whitespace instead of a standalone token (false default). EOL_TOGGLE will toggle this */
  private boolean ignoreEOL = false; 
  
  /**
   * This is the central part of the class: tokenize the next token from reader.
   * Token type is returned, token parameters can be read by getStringToken(), getIntegerToken, getDoubleToken().
   * Reading errors generate exception, but format mistakes will be only logged.  
   * @return token type
   * @throws IOException
   */
  public int nextToken() throws IOException
    {
    // Repeat the last token
    if (rewindLastToken)
      {
      rewindLastToken = false;
      return tokenType;
      }
    
    // Clear token parameters
    tokenType = TYPE_UNKNOWN;
    tokenStringBuilder.setLength( 0 );
    tokenInteger = 0;
    tokenDivider = 1;
    tokenMinusSign = false;
    
    // number of characters in character constans (if not 1 an error will be generated)
    // number of integer digits in numerical types (if exceeds MAX_DIGITS an error will be generated)
    // number of characters in keywords (longer than MAX_CHARACTERS cannot be converted to long token codes)
    // not used in string type (no length limit)
    int tokenLength = 0;
    
    // The digits of argb, rgb and single digit grayscale formats are counting two digits.
    // This is a helper variable to immediateli calculate doubled digits.
    // Whether Integer or IntegerDoubledDigits should be returned, will be decided after counting all digits. 
    long tokenIntegerDoubledDigits = 0;
    
    int ch;
    while (true)
      {
      ch = read();
      
      // TYPE_UNKNOWN: 
      // First character determines tokenType
      // ************************************
      // ?? what is the best order ??
      // Digits should be before keywords (keywords cannot start with digits)
      // NOTE_MARKS should be before white-spaces (it is special "white-space")
      if ( tokenType == TYPE_UNKNOWN )
        {

        // Unsigned integer (or fraction) number is starting, this character will be evaluated as well
        // This is the entry point for hexadecimal and color values, too
        if ( isValidDecimalDigit(ch) )
          {
          tokenType = TYPE_INTEGER;
          }
        
        // Signed number is starting. Sign can be only the first charter! 
        else if ( ch == MARK_MINUS || ch == MARK_PLUS )
          {
          tokenType = TYPE_INTEGER;
          tokenStringBuilder.append( (char)ch );
          tokenMinusSign = ch == MARK_MINUS ? true : false;
          // next characters are processed as numbers
          continue;
          }

        // Fraction is started without any leading integer digits 
        else if ( ch == MARK_FRACTION )
          {
          tokenType = TYPE_FRACTION;
          tokenStringBuilder.append( (char)ch );
          // next characters are processed as numbers
          continue;
          }

        // string token starts
        else if ( ch == MARK_STRING )
          {
          tokenType = TYPE_STRING;
          // String delimiter itself is not needed
          continue; 
          }

        // ' character constant token starts
        else if ( ch == MARK_CHARACTER )  
          {
          tokenType = TYPE_CHARACTER;
          // Character delimiter is not needed
          continue; 
          }

        // Note - skip this line
        else if ( ch == MARK_NOTE )
          {
          // Type remains unknown, but in next round an EOL or EOF should come 
          findNextEOL();
          continue;
          }
        
        // Keyword is starting. Because numbers proceed keywords, keywords cannot start with numbers
        else if ( isValidKeyword(ch) )
          {
          tokenType = TYPE_KEYWORD;
          }

        // EOF - this is an other special white-space
        else if ( ch == EOF )
          {
          tokenType = TYPE_EOF;
          // This token is always one character long, we can finish now
          return TYPE_EOF;
          }
        
        // EOL - this is a special white-space
        else if ( ch == EOL )
          {
          // Next read comes from the next line
          lineNumber++;
          
          if ( ignoreEOL )
            {
            // treated as valid white-space
            continue;
            }
          
          // This token is always one character long, we can finish now
          tokenType = TYPE_EOL;
          return TYPE_EOL;
          }

        // EOL_TOGGLE - toggles EOL recognition
        else if ( ch == EOL_TOGGLE )
          {
          ignoreEOL = !ignoreEOL;
          continue;
          }

        // Other, non special white spaces are skipped
        else if ( isValidWhiteSpace(ch) )
          {
          continue;
          }
        
        // All other characters are not allowed! The whole token will be skipped
        else
          {
          error("Character " + getCharacterDescription(ch) + " is not allowed in tokens. Token skipped.");
          findNextWhiteSpace();
          continue;
          }
        }
      
      // All consecutive characters are processed here
      // Token types are the branches of this process
      // *********************************************
      switch ( tokenType )
        {
        case TYPE_INTEGER:
          // All numeric tokens are start as integers.
          // Minus sign was already set.
          // 0x converts integer to hexadecimal integer
          // . converts integer to fractal (floating point) !!
          // Non-numeric ending is truncated till the next white-space
          if ( isValidDecimalDigit(ch) )
            {
            tokenStringBuilder.append( (char)ch );
            if ( tokenLength == MAX_DECIMAL_DIGITS )
              error("Integer (" + getStringToken() + ") exceeds the max. " + MAX_DECIMAL_DIGITS + " digits! Numeric result is unpredictable!");
            tokenInteger *= 10L;
            tokenInteger += valueOf( ch );
            tokenLength++;
            }
          else if ( (ch=='x' || ch=='X') && tokenInteger == 0 )
            {
            tokenStringBuilder.append( (char)ch );
            tokenLength = 0;
            tokenType = TYPE_HEXADECIMAL;
            }
          else if ( (ch=='c' || ch=='C') && tokenInteger == 0 )
            {
            tokenStringBuilder.append( (char)ch );
            tokenLength = 0;
            tokenMinusSign = false; // minus sign will be omitted
            tokenType = TYPE_COLOR;
            }
          else if ( (ch == MARK_FRACTION) )
            {
            tokenStringBuilder.append( (char)ch );
            tokenType = TYPE_FRACTION;
            }
          else 
            {
            // Can be MARK_PLUS, _MINUS, _FRACTION - but without any valid digits
            if ( tokenLength < 1)
              error("Malformed integer: (" + getStringToken() + ") No numeric characters were found.");
            if ( !isValidWhiteSpace(ch) )
              {
              error("Malformed integer: (" + getStringToken() + ") Endig was truncated.");
              findNextWhiteSpace();
              }
            else
              {
              // Correctly terminated integer returned - no messages.
              // Last white-space should be evaluated.
              pushBackLastRead();
              }
            return TYPE_INTEGER;
            }
          break;
          
        case TYPE_HEXADECIMAL:
          // Format: 0xH...H or 0XH...H, can be signed
          // Otherwise similar to integer type
          if ( isValidHexadecimalDigit(ch) )
            {
            tokenStringBuilder.append( (char)ch );
            if ( tokenLength == MAX_HEXADECIMAL_DIGITS )
              error("Hexadecimal integer (" + getStringToken() + ") exceeds the max. " + MAX_HEXADECIMAL_DIGITS + " digits! Numeric result is unpredictable!");
            tokenInteger *= 0x10L;
            tokenInteger += valueOf(ch);
            tokenLength++;
            }
          else 
            {
            // Can be 0x without any valid digits
            if ( tokenLength < 1)
              error("Malformed hexadecimal integer: (" + getStringToken() + ") No hexadecimal characters were found.");
            if ( !isValidWhiteSpace(ch) )
              {
              error("Malformed hexadecimal integer: (" + getStringToken() + ") Endig was truncated.");
              findNextWhiteSpace();
              }
            else
              {
              // Correctly terminated hexadecimal integer returned - no messages.
              // Last white-space should be evaluated.
              pushBackLastRead();
              }
            return TYPE_INTEGER;
            }
          break;
          
        case TYPE_COLOR:
          // Format: 
          // 0xaarrggbb - 8 digits hexadecimal,
          // 0xrrggbb - 6 digits hexadecimal (+ 0xFF as aa)
          // 0xargb - 4 digits hexadecimal - all digits will be doubled
          // 0xrgb - 3 digits hexadecimal - all digits will be doubled (+ 0xFF as aa)
          // 0xhh - 2 digits hexadecimal - will be used for all colors (+ 0xFF as aa) - grayscale
          // 0xh - 1 digit hexadecimal - will be used for all color digits (+ 0xFF as aa) - grayscale
          // minus sign is already omitted!
          // Otherwise similar to integer type
          if ( isValidHexadecimalDigit(ch) )
            {
            int digit = valueOf(ch);
            
            tokenStringBuilder.append( (char)ch );
            tokenInteger *= 0x10L;
            tokenInteger += digit;
            // doubled digits will be added twice
            tokenIntegerDoubledDigits *= 0x10L;
            tokenIntegerDoubledDigits += digit;
            tokenIntegerDoubledDigits *= 0x10L;
            tokenIntegerDoubledDigits += digit;
            tokenLength++;
            }
          else 
            {
            // Can be 0x without any valid digits
            if ( tokenLength < 1)
              error("Malformed color value: (" + getStringToken() + ") No hexadecimal characters were found.");
            // One digit grayscale
            else if ( tokenLength == 1 ) // 0xh
              {
              tokenInteger = 0xFF000000L +
                  tokenIntegerDoubledDigits * 0x10000L +
                  tokenIntegerDoubledDigits * 0x100L +
                  tokenIntegerDoubledDigits;
              }
            // Two digits grayscale
            else if ( tokenLength == 2 ) // 0xhh
              {
              tokenInteger = 0xFF000000L +
                  tokenInteger * 0x10000L +
                  tokenInteger * 0x100L +
                  tokenInteger;
              }
            // Red-Green-Blue each has one digit
            else if ( tokenLength == 3 ) // 0xrgb
              {
              tokenInteger = 0xFF000000L + tokenIntegerDoubledDigits;
              }
            // Alpha-Red-Green-Blue each has one digit
            else if ( tokenLength == 4 ) // 0xargb
              {
              tokenInteger = tokenIntegerDoubledDigits;
              }
            // Red-Green-Blue each has two digits
            else if ( tokenLength == 6 ) // 0xrrggbb
              {
              tokenInteger = 0xFF000000L + tokenInteger;
              }
            // Alpha-Red-Green-Blue each has two digits - full color value
            else if ( tokenLength == 8 ) // 0xaarrggbb
              {
              // nothing to do, tokenInteger already contains information
              }
            else
              {
              tokenInteger = 0xFFFFFFFFL;
              error("Malformed color value: (" + getStringToken() + ") Returning opaque white color.");
              }

            if ( !isValidWhiteSpace(ch) )
              {
              error("Malformed hexadecimal integer: (" + getStringToken() + ") Endig was truncated.");
              findNextWhiteSpace();
              }
            else
              {
              // Correctly terminated hexadecimal integer returned - no messages.
              // Last white-space should be evaluated.
              pushBackLastRead();
              }
            return TYPE_INTEGER;
            }
          break;
          
        case TYPE_FRACTION:
          // Format: d...d.d...d, can be signed
          // Will be returned only as double
          // Otherwise similar to integer type
          if ( isValidDecimalDigit(ch) )
            {
            tokenStringBuilder.append( (char)ch );
            if ( tokenLength == MAX_DECIMAL_DIGITS )
              error("Fraction (" + getStringToken() + ") exceeds the max. " + MAX_DECIMAL_DIGITS + " digits! Numeric result is unpredictable!");
            tokenInteger *= 10;
            tokenInteger += valueOf( ch );
            tokenDivider*= 10;
            tokenLength++;
            }
          else 
            {
            // Can be MARK_PLUS, _MINUS, _FRACTION - but without any valid digits
            if ( tokenLength < 1)
              error("Malformed fraction: (" + getStringToken() + ") No numeric characters were found.");
            if ( !isValidWhiteSpace(ch) )
              {
              error("Malformed fraction: (" + getStringToken() + ") Endig was truncated.");
              findNextWhiteSpace();
              }
            else
              {
              // Correctly terminated fraction returned - no messages.
              // Last white-space should be evaluated.
              pushBackLastRead();
              }
            return TYPE_FRACTION;
            }
          break;
          
        case TYPE_KEYWORD:
          // Cannot start with number, otherwise all keyword chars are accepted
          // Endig from unknown character will be truncated
          if ( isValidKeyword(ch) )
            {
            tokenInteger*=TOKEN_CODE_RADIX;
            tokenInteger+= valueOf(ch);
            ch = toLowerCaseAsciiLetter(ch);
            tokenStringBuilder.append( (char)ch );
            if ( tokenLength == MAX_KEYWORD_CHARACTERS )
              error("Keyword (" + getStringToken() + ") exceeds the max. " + MAX_KEYWORD_CHARACTERS + " characters! Token code is not unique!");
            tokenLength++;
            }
          else if ( isValidWhiteSpace(ch) )
            {
            // Correctly terminated keyword returned - no messages.
            // Last white-space should be evaluated.
            pushBackLastRead();
            return TYPE_KEYWORD;
            }
          else
            {
            error("Malformed keyword: (" + getStringToken() + ") Endig was truncated.");
            findNextWhiteSpace();
            return tokenType;
            }
          break;

        case TYPE_STRING:
          // String tokens are ended with ", or terminated by EOL, EOF.
          // All escape sequences are translated to characters, which are added to the token.
          if ( ch == MARK_STRING )
            {
            return TYPE_STRING;
            }
          else if ( ch == EOL || ch == EOF )
            {
            error("String (" + getStringToken() + ") was not terminated properly!");
            pushBackLastRead();
            return TYPE_STRING;
            }
          else if ( ch == '\\' )
            {
            ch = convertEscapeSequence();
            }
          tokenStringBuilder.append( (char)ch );
          tokenLength++;
          break;
  
        case TYPE_CHARACTER:
          // Characters are always ONE character long, ended with '
          // All escape sequences are translated to character
          // Longer character sequences are skipped till the first white-space
          if ( tokenLength == 0 ) // No character yet!
            {
            if ( ch == MARK_CHARACTER )
              {
              error("Empty character constant! Null returned.");
              return TYPE_CHARACTER;
              }
            else if ( ch == EOL || ch == EOF )
              {
              error("Character constant missing! Null returned.");
              pushBackLastRead();
              return TYPE_CHARACTER;
              }
            else if ( ch == '\\' )
              {
              ch = convertEscapeSequence();
              }
            tokenStringBuilder.append( (char)ch );
            tokenInteger = (char)ch;
            tokenLength++;
            }
          else if ( tokenLength >= 1 ) // Character is ready
            {
            if ( ch == MARK_CHARACTER )
              {
              // Correctly terminated character returned - no messages
              return TYPE_CHARACTER;
              }
            else
              {
              error("Character constant " + getCharacterDescription(ch) + " is not terminated! Remaining token part skipped!");
              pushBackLastRead(); // It can be even a new line!
              findNextWhiteSpace();
              return TYPE_CHARACTER;
              }
            }
          break;

        default:
          // Impossible branch entry
          Scribe.debug( "Unknown token type: " + tokenType );
          return tokenType;
        }
      }
    }
  }
Java Source Code List

digitalgarden.bestboard.BoardParser.java
digitalgarden.bestboard.Board.java
digitalgarden.bestboard.MainActivity.java
digitalgarden.bestboard.Roller.java
digitalgarden.bestboard.TokenizerTest.java
digitalgarden.magicmerlin.scribe.Scribe.java
digitalgarden.magicmerlin.utils.Keyboard.java
digitalgarden.magicmerlin.utils.Rest.java
digitalgarden.magicmerlin.utils.Screen.java
digitalgarden.magicmerlin.utils.Tokenizer.java