com.wellsandwhistles.android.redditsp.reddit.prepared.markdown.MarkdownTokenizer.java Source code

Java tutorial

Introduction

Here is the source code for com.wellsandwhistles.android.redditsp.reddit.prepared.markdown.MarkdownTokenizer.java

Source

 package com.wellsandwhistles.android.redditsp.reddit.prepared.markdown;

 /** This file was either copied or modified from https://github.com/QuantumBadger/RedReader
  * under the Free Software Foundation General Public License version 3*/

 import org.apache.commons.lang3.StringEscapeUtils;

 import java.util.HashSet;

 public final class MarkdownTokenizer {

     // TODO support double graves

     /**
      * Token ids that stand in for markdown control characters in the token stream. Every id is
      * negative, so within the int arrays used by this class a value below zero is a token and
      * any other value is treated as a plain character/code point.
      */
     public static final int TOKEN_UNDERSCORE = -1, TOKEN_UNDERSCORE_DOUBLE = -2, TOKEN_ASTERISK = -3,
             TOKEN_ASTERISK_DOUBLE = -4, TOKEN_TILDE_DOUBLE = -5, TOKEN_CARET = -6, TOKEN_GRAVE = -7,
             TOKEN_BRACKET_SQUARE_OPEN = -8, TOKEN_BRACKET_SQUARE_CLOSE = -9, TOKEN_PAREN_OPEN = -10,
             TOKEN_PAREN_CLOSE = -11, TOKEN_UNICODE_OPEN = -12, TOKEN_UNICODE_CLOSE = -13;

     // Literal text of each token, stored at index (20 + token id); filled in by the static
     // initializer. Slots that no token maps to stay null.
     private static final char[][] reverseLookup = new char[20][];

     // Prefixes that start a bare web link. The index of the matching entry is the value returned
     // by getLinkStartType().
     private static final char[][] linkPrefixes = { "http://".toCharArray(), "https://".toCharArray(),
             "www.".toCharArray() };

     // Prefixes that start a bare subreddit/user reference. They are tried in this order and the
     // index of the first match is the value returned by getRedditLinkStartType().
     private static final char[][] linkPrefixes_reddit = { "/r/".toCharArray(), "r/".toCharArray(),
             "/u/".toCharArray(), "u/".toCharArray(), "/user/".toCharArray() };

     // Code points that clean() replaces with an ordinary space when they are produced by decoding
     // an HTML entity; populated by the static initializer.
     private static final HashSet<Integer> unicodeWhitespace = new HashSet<>();

     static {
         reverseLookup[20 + TOKEN_UNDERSCORE] = new char[] { '_' };
         reverseLookup[20 + TOKEN_UNDERSCORE_DOUBLE] = new char[] { '_', '_' };
         reverseLookup[20 + TOKEN_ASTERISK] = new char[] { '*' };
         reverseLookup[20 + TOKEN_ASTERISK_DOUBLE] = new char[] { '*', '*' };
         reverseLookup[20 + TOKEN_TILDE_DOUBLE] = new char[] { '~', '~' };
         reverseLookup[20 + TOKEN_CARET] = new char[] { '^' };
         reverseLookup[20 + TOKEN_GRAVE] = new char[] { '`' };
         reverseLookup[20 + TOKEN_BRACKET_SQUARE_OPEN] = new char[] { '[' };
         reverseLookup[20 + TOKEN_BRACKET_SQUARE_CLOSE] = new char[] { ']' };
         reverseLookup[20 + TOKEN_PAREN_OPEN] = new char[] { '(' };
         reverseLookup[20 + TOKEN_PAREN_CLOSE] = new char[] { ')' };
         reverseLookup[20 + TOKEN_UNICODE_OPEN] = new char[] { '&' };
         reverseLookup[20 + TOKEN_UNICODE_CLOSE] = new char[] { ';' };

         unicodeWhitespace.add(0x0009);
         unicodeWhitespace.add(0x000B);
         unicodeWhitespace.add(0x00A0);
         unicodeWhitespace.add(0x1680);
         unicodeWhitespace.add(0x2000);
         unicodeWhitespace.add(0x2001);
         unicodeWhitespace.add(0x2002);
         unicodeWhitespace.add(0x2003);
         unicodeWhitespace.add(0x2004);
         unicodeWhitespace.add(0x2005);
         unicodeWhitespace.add(0x2006);
         unicodeWhitespace.add(0x2007);
         unicodeWhitespace.add(0x2008);
         unicodeWhitespace.add(0x2009);
         unicodeWhitespace.add(0x200A);
         unicodeWhitespace.add(0x202F);
         unicodeWhitespace.add(0x205F);
         unicodeWhitespace.add(0x3000);
     }

     /**
      * Tells whether a code point is one of the whitespace code points registered by this class.
      *
      * @param codepoint the code point to look up
      * @return {@code true} if the code point is in the whitespace table
      */
     public static boolean isUnicodeWhitespace(int codepoint) {
         final Integer key = codepoint;
         return unicodeWhitespace.contains(key);
     }

     /**
      * Converts raw markdown text into a cleaned token stream.
      *
      * The characters are copied into an int buffer and then run through four passes that
      * alternate between two work buffers: naive tokenization, cleaning, bare-link expansion and a
      * final cleaning pass.
      *
      * @param input the text to tokenize
      * @return the buffer holding the final token stream
      */
     public static IntArrayLengthPair tokenize(final CharArrSubstring input) {

         // Both work buffers get three times the input length as capacity.
         final int capacity = input.length * 3;
         final IntArrayLengthPair bufferA = new IntArrayLengthPair(capacity);
         final IntArrayLengthPair bufferB = new IntArrayLengthPair(capacity);

         bufferA.pos = input.length;
         int idx = 0;
         while (idx < input.length) {
             bufferA.data[idx] = input.charAt(idx);
             idx++;
         }

         // Markdown is evil.

         naiveTokenize(bufferA, bufferB); // A -> B
         clean(bufferB, bufferA);         // B -> A
         linkify(bufferA, bufferB);       // A -> B
         clean(bufferB, bufferA);         // B -> A

         return bufferA;
     }

     /**
      * Copies {@code input} to {@code output}, replacing bare web links and bare reddit references
      * with the token sequence {@code [text](text)}.
      *
      * A link may only start outside of bracket/paren tokens and directly after a bracket token, a
      * space, another token or a character that is not a letter or digit.
      *
      * @param input  the token stream to scan
      * @param output receives the rewritten stream; it is cleared first
      */
     private static void linkify(final IntArrayLengthPair input, final IntArrayLengthPair output) {

         if (input.data.length > output.data.length * 3) {
             throw new RuntimeException();
         }
         output.clear();

         int bracketDepth = 0;
         boolean linkMayStart = true;

         for (int idx = 0; idx < input.pos; idx++) {

             final int current = input.data[idx];

             switch (current) {

             case TOKEN_BRACKET_SQUARE_OPEN:
             case TOKEN_PAREN_OPEN:
                 bracketDepth++;
                 output.data[output.pos++] = current;
                 linkMayStart = true;
                 break;

             case TOKEN_BRACKET_SQUARE_CLOSE:
             case TOKEN_PAREN_CLOSE:
                 bracketDepth--;
                 output.data[output.pos++] = current;
                 linkMayStart = true;
                 break;

             case ' ':
                 output.data[output.pos++] = current;
                 linkMayStart = true;
                 break;

             case 'h':
             case 'w':
             case 'r':
             case 'u':
             case '/': {

                 // 'h' and 'w' can begin a web link, the others a reddit reference.
                 int linkEnd = -1;
                 if (bracketDepth == 0 && linkMayStart) {
                     if (current == 'h' || current == 'w') {
                         linkEnd = findWebLinkEnd(input, idx);
                     } else {
                         linkEnd = findRedditLinkEnd(input, idx);
                     }
                 }

                 if (linkEnd < 0) {
                     output.data[output.pos++] = current;
                 } else {
                     emitAutoLink(input, output, idx, linkEnd);
                     // Resume scanning right after the link (the loop adds one).
                     idx = linkEnd - 1;
                 }

                 linkMayStart = false;
                 break;
             }

             default:
                 // TODO test this against reddits impl
                 linkMayStart = current < 0 || (!Character.isLetterOrDigit(current));
                 output.data[output.pos++] = current;
                 break;
             }
         }
     }

     /**
      * Finds the end of a bare web link that starts at {@code start}.
      *
      * @return the exclusive end position, or -1 if no link with at least two characters after its
      *         prefix starts there
      */
     private static int findWebLinkEnd(final IntArrayLengthPair input, final int start) {

         final int prefixType = getLinkStartType(input.data, start, input.pos);
         if (prefixType < 0) {
             return -1;
         }

         final int prefixEnd = start + linkPrefixes[prefixType].length;
         int end = prefixEnd;

         // Greedily read to space, or <>, or etc
         while (end < input.pos) {
             final int t = input.data[end];
             final boolean terminator = t == ' ' || t == '<' || t == '>' || t == TOKEN_GRAVE
                     || t == TOKEN_BRACKET_SQUARE_OPEN || t == TOKEN_BRACKET_SQUARE_CLOSE;
             if (terminator) {
                 break;
             }
             end++;
         }

         // discard many final chars if they are '.', ',', '?', ';' etc
         // THEN, discard single final char if it is '\'', '"', etc
         while (isTrailingLinkPunctuation(input.data[end - 1])) {
             end--;
         }
         if (input.data[end - 1] == '"') {
             end--;
         }
         if (input.data[end - 1] == '\'') {
             end--;
         }
         if (input.data[end - 1] == ')') {
             end--;
         }

         return (end - prefixEnd >= 2) ? end : -1;
     }

     /** Tells whether {@code c} is one of the characters stripped repeatedly from a web link's end. */
     private static boolean isTrailingLinkPunctuation(final int c) {
         return c == '.' || c == ',' || c == '?' || c == ';';
     }

     /**
      * Finds the end of a bare subreddit/user reference that starts at {@code start}.
      *
      * @return the exclusive end position, or -1 if no reference with more than two characters
      *         after its prefix starts there
      */
     private static int findRedditLinkEnd(final IntArrayLengthPair input, final int start) {

         final int prefixType = getRedditLinkStartType(input.data, start, input.pos);
         if (prefixType < 0) {
             return -1;
         }

         final int prefixEnd = start + linkPrefixes_reddit[prefixType].length;
         int end = prefixEnd;

         while (end < input.pos) {
             final int t = input.data[end];
             final boolean alphanumeric = (t >= 'a' && t <= 'z') || (t >= 'A' && t <= 'Z')
                     || (t >= '0' && t <= '9');
             final boolean underscore = t == '_' || t == TOKEN_UNDERSCORE || t == TOKEN_UNDERSCORE_DOUBLE;
             if (!(alphanumeric || underscore || t == '+' || t == '-')) {
                 break;
             }
             end++;
         }

         return (end - prefixEnd > 2) ? end : -1;
     }

     /**
      * Appends the tokens of {@code [text](text)} to {@code output}, where text is the literal
      * form of {@code input.data[start, end)}.
      */
     private static void emitAutoLink(final IntArrayLengthPair input, final IntArrayLengthPair output,
             final int start, final int end) {

         final int[] literal = revert(input.data, start, end);

         output.data[output.pos++] = TOKEN_BRACKET_SQUARE_OPEN;
         output.append(literal);
         output.data[output.pos++] = TOKEN_BRACKET_SQUARE_CLOSE;
         output.data[output.pos++] = TOKEN_PAREN_OPEN;
         output.append(literal);
         output.data[output.pos++] = TOKEN_PAREN_CLOSE;
     }

public static void clean(final IntArrayLengthPair input, final IntArrayLengthPair output) {

   // TODO use single byte array, flags
   final boolean[] toRevert = new boolean[input.pos];
   final boolean[] toDelete = new boolean[input.pos];

   int openingUnderscore = -1, openingUnderscoreDouble = -1;
   int openingAsterisk = -1, openingAsteriskDouble = -1;
   int openingTildeDouble = -1;

   int lastBracketSquareOpen = -1;

   for(int i = 0; i < input.pos; i++) {

      final int c = input.data[i];

      final boolean beforeASpace = i + 1 < input.pos && input.data[i + 1] == ' ';
      final boolean afterASpace = i > 0 && input.data[i - 1] == ' ';

      switch(c) {

         case TOKEN_UNDERSCORE:

            if(openingUnderscore < 0) {
               // Opening underscore
               if(beforeASpace) {
                  toRevert[i] = true;
               } else {
                  openingUnderscore =  i;
               }

            } else {
               // Closing underscore
               if(afterASpace) {
                  toRevert[i] = true;
               } else {
                  openingUnderscore = -1;
               }
            }

            break;

         case TOKEN_UNDERSCORE_DOUBLE:

            if(i != 0 && openingUnderscoreDouble == i - 1) {
               toRevert[openingUnderscoreDouble] = true;
               toRevert[i] = true;
               openingUnderscoreDouble = -1;

            } else {

               if(openingUnderscoreDouble < 0) {
                  // Opening double underscore
                  if(beforeASpace) {
                     toRevert[i] = true;
                  } else {
                     openingUnderscoreDouble = i;
                  }

               } else {
                  // Closing double underscore
                  if(afterASpace) {
                     toRevert[i] = true;
                  } else {
                     openingUnderscoreDouble = -1;
                  }
               }
            }

            break;

         case TOKEN_ASTERISK:

            if(openingAsterisk < 0) {
               // Opening asterisk
               if(beforeASpace) {
                  toRevert[i] = true;
               } else {
                  openingAsterisk =  i;
               }

            } else {
               // Closing asterisk
               if(afterASpace) {
                  toRevert[i] = true;
               } else {
                  openingAsterisk = -1;
               }
            }

            break;

         case TOKEN_ASTERISK_DOUBLE:

            if(i != 0 && openingAsteriskDouble == i - 1) {
               toRevert[openingAsteriskDouble] = true;
               toRevert[i] = true;
               openingAsteriskDouble = -1;

            } else {

               if(openingAsteriskDouble < 0) {
                  // Opening double asterisk
                  if(beforeASpace) {
                     toRevert[i] = true;
                  } else {
                     openingAsteriskDouble = i;
                  }

               } else {
                  // Closing double asterisk
                  if(afterASpace) {
                     toRevert[i] = true;
                  } else {
                     openingAsteriskDouble = -1;
                  }
               }
            }

            break;

         case TOKEN_TILDE_DOUBLE:

            if(i != 0 && openingTildeDouble == i - 1) {
               toRevert[openingTildeDouble] = true;
               toRevert[i] = true;
               openingTildeDouble = -1;

            } else {

               if(openingTildeDouble < 0) {
                  // Opening double tilde
                  if(beforeASpace) {
                     toRevert[i] = true;
                  } else {
                     openingTildeDouble = i;
                  }

               } else {
                  // Closing double tilde
                  if(afterASpace) {
                     toRevert[i] = true;
                  } else {
                     openingTildeDouble = -1;
                  }
               }
            }

            break;

         case TOKEN_GRAVE:

            final int openingGrave = i;
            final int closingGrave = indexOf(input.data, TOKEN_GRAVE, i + 1, input.pos);

            if(closingGrave < 0) {
               toRevert[i] = true;
            } else {

               for(int j = openingGrave + 1; j < closingGrave; j++) {
                  if(input.data[j] < 0) toRevert[j] = true;
               }

               i = closingGrave;
            }

            break;

         case TOKEN_BRACKET_SQUARE_OPEN:
            if(lastBracketSquareOpen < 0) {

               // Attempt to parse link text with well-bracketed square brackets

               final int closingSquareBracket = findCloseWellBracketed(
                     input.data,
                     TOKEN_BRACKET_SQUARE_OPEN,
                     TOKEN_BRACKET_SQUARE_CLOSE,
                     i,
                     input.pos);

               if(closingSquareBracket > i) {

                  final int parenOpenPos = indexOf(input.data, TOKEN_PAREN_OPEN, closingSquareBracket + 1, input.pos);

                  if(parenOpenPos > closingSquareBracket
                        && isSpaces(input.data, closingSquareBracket + 1, parenOpenPos)) {

                     lastBracketSquareOpen = i;

                     for(int j = i + 1; j < closingSquareBracket; j++) {
                        if(input.data[j] == TOKEN_BRACKET_SQUARE_OPEN) {
                           input.data[j] = '[';

                        } else if(input.data[j] == TOKEN_BRACKET_SQUARE_CLOSE) {
                           input.data[j] = ']';
                        }
                     }

                  } else {
                     toRevert[i] = true;
                  }

               } else {
                  toRevert[i] = true;
               }

            } else {
               toRevert[lastBracketSquareOpen] = true;
               lastBracketSquareOpen = i;
            }
            break;

         case TOKEN_BRACKET_SQUARE_CLOSE:

            if(lastBracketSquareOpen < 0) {
               toRevert[i] = true;

            } else {

               final int lastBracketSquareClose = i;

               final int parenOpenPos = indexOf(input.data, TOKEN_PAREN_OPEN,
                     lastBracketSquareClose + 1, input.pos);

               boolean linkParseSuccess = false;

               if(parenOpenPos >= 0) {

                  if(isSpaces(input.data, lastBracketSquareClose + 1, parenOpenPos)) {

                     final int parenClosePos = findParenClosePos(input, parenOpenPos + 1);

                     if(parenClosePos >= 0) {

                        linkParseSuccess = true;

                        for(int j = lastBracketSquareOpen + 1; j < lastBracketSquareClose; j++) {
                           if(input.data[j] == TOKEN_BRACKET_SQUARE_OPEN
                              || input.data[j] == TOKEN_BRACKET_SQUARE_CLOSE) {
                              toRevert[j] = true;
                           }
                        }

                        for(int j = lastBracketSquareClose + 1; j < parenOpenPos; j++) {
                           toDelete[j] = true;
                        }

                        for(int j = parenOpenPos + 1; j < parenClosePos; j++) {
                           if(input.data[j] < 0) {
                              toRevert[j] = true;
                           } else if(input.data[j] == ' ' && input.data[j-1] == ' ') {
                              toDelete[j] = true;
                           }
                        }

                        for(int j = parenOpenPos + 1; input.data[j] == ' '; j++) {
                           toDelete[j] = true;
                        }

                        for(int j = parenClosePos - 1; input.data[j] == ' '; j--) {
                           toDelete[j] = true;
                        }

                        i = parenClosePos;
                     }
                  }
               }

               if(!linkParseSuccess) {
                  toRevert[lastBracketSquareOpen] = true;
                  toRevert[lastBracketSquareClose] = true;
               }
            }

            lastBracketSquareOpen = -1;
            break;

         case TOKEN_PAREN_OPEN:
         case TOKEN_PAREN_CLOSE:
         case TOKEN_UNICODE_CLOSE:
            toRevert[i] = true;
            break;

         case TOKEN_UNICODE_OPEN:

            final int openingUnicode = i;
            final int closingUnicode = indexOf(input.data, TOKEN_UNICODE_CLOSE, i + 1,
                  Math.min(input.pos, i + 20));

            if(closingUnicode < 0) {
               toRevert[i] = true;

            } else if(input.data[i + 1] == '#') {

               if(input.data[i + 2] == 'x' && isHexDigits(input.data, openingUnicode + 3, closingUnicode)) {

                  final int codePoint = getHex(input.data, openingUnicode + 3, closingUnicode);

                  if(unicodeWhitespace.contains(codePoint)) {
                     input.data[openingUnicode] = ' ';
                  } else {
                     input.data[openingUnicode] = codePoint;
                  }

                  for(int j = openingUnicode + 1; j <= closingUnicode; j++) {
                     toDelete[j] = true;
                  }

                  i = closingUnicode;

               } else if(isDigits(input.data, openingUnicode + 2, closingUnicode)) {

                  final int codePoint = getDecimal(input.data, openingUnicode + 2, closingUnicode);

                  if(unicodeWhitespace.contains(codePoint)) {
                     input.data[openingUnicode] = ' ';
                  } else {
                     input.data[openingUnicode] = codePoint;
                  }

                  for(int j = openingUnicode + 1; j <= closingUnicode; j++) {
                     toDelete[j] = true;
                  }

                  i = closingUnicode;

               } else {
                  toRevert[i] = true;
               }

            } else {

               Integer codePoint = null;

               try {

                  final String name = new String(input.data, openingUnicode + 1, closingUnicode - openingUnicode - 1);

                  final String result = StringEscapeUtils.unescapeHtml4("&" + name + ";");

                  if(result.length() == 1) {
                     codePoint = (int) result.charAt(0);

                  } else if(name.equalsIgnoreCase("apos")) {
                     codePoint = (int) '\'';

                  } else if(name.equalsIgnoreCase("nsub")) {
                     codePoint = (int) '';
                  }

               } catch(Throwable ignore) {
                  // Ignore this
               }

               if(codePoint != null) {

                  if(unicodeWhitespace.contains(codePoint)) {
                     input.data[openingUnicode] = ' ';
                  } else {
                     input.data[openingUnicode] = codePoint;
                  }

                  for(int j = openingUnicode + 1; j <= closingUnicode; j++) {
                     toDelete[j] = true;
                  }

                  i = closingUnicode;

               } else {
                  toRevert[i] = true;
               }
            }

            break;

         case TOKEN_CARET:

            if(input.pos <= i + 1 || input.data[i + 1] == ' ') {
               toRevert[i] = true;
            }

            break;

         case ' ':

            if(i < 1 || input.data[i - 1] == ' ') {
               toDelete[i] = true;
            }

            break;
      }
   }

   if(openingUnderscore >= 0) toRevert[openingUnderscore] = true;
   if(openingUnderscoreDouble >= 0) toRevert[openingUnderscoreDouble] = true;
   if(openingAsterisk >= 0) toRevert[openingAsterisk] = true;
   if(openingAsteriskDouble >= 0) toRevert[openingAsteriskDouble] = true;
   if(openingTildeDouble >= 0) toRevert[openingTildeDouble] = true;
   if(lastBracketSquareOpen >= 0) toRevert[lastBracketSquareOpen] = true;

   for(int j = input.pos - 1; j >= 0 && input.data[j] == ' '; j--) {
      toDelete[j] = true;
   }

   output.clear();

   for(int i = 0; i < input.pos; i++) {

      if(toDelete[i]) continue;

      if(toRevert[i]) {

         final char[] revertTo = reverseLookup[20 + input.data[i]];
         output.append(revertTo);

      } else {
         output.data[output.pos++] = input.data[i];
      }
   }
}

     /**
      * Searches for the first closing-paren token at or after {@code startPos}, skipping over
      * sections enclosed in double quotes.
      *
      * @param tokens   the token stream to search
      * @param startPos the position to start searching at
      * @return the position of the closing paren, or -1 if there is none or a quoted section is
      *         not terminated
      */
     private static int findParenClosePos(final IntArrayLengthPair tokens, int startPos) {

         int cursor = startPos;

         while (cursor < tokens.pos) {

             final int current = tokens.data[cursor];

             if (current == TOKEN_PAREN_CLOSE) {
                 return cursor;
             }

             if (current == '"') {
                 // Jump to the closing quote; the increment below steps past it.
                 cursor = indexOfIgnoreEscaped(tokens, '"', cursor + 1);
                 if (cursor < 0) {
                     return -1;
                 }
             }

             cursor++;
         }

         return -1;
     }

     /**
      * Finds the first occurrence of {@code needle} at or after {@code startPos}. A backslash
      * causes the value that follows it to be skipped.
      *
      * @return the position of the needle, or -1 if it was not found
      */
     private static int indexOfIgnoreEscaped(final IntArrayLengthPair haystack, int needle, int startPos) {

         int cursor = startPos;

         while (cursor < haystack.pos) {

             final int current = haystack.data[cursor];

             if (current == '\\') {
                 // Step over the backslash and the value it escapes.
                 cursor += 2;
                 continue;
             }

             if (current == needle) {
                 return cursor;
             }

             cursor++;
         }

         return -1;
     }

     /**
      * First tokenizer pass: replaces markdown control characters with their token ids without
      * checking whether the resulting tokens are valid.
      *
      * A backslash is dropped and the value after it is copied unchanged; a backslash at the very
      * end is kept. Tab, carriage return, form feed and newline become a space. A single
      * underscore only becomes a token at the start or end of the input or next to a space.
      *
      * @param input  the raw characters
      * @param output receives the token stream; it is cleared first
      */
     public static void naiveTokenize(final IntArrayLengthPair input, final IntArrayLengthPair output) {

         output.clear();

         final int last = input.pos - 1;

         for (int idx = 0; idx < input.pos; idx++) {

             final int ch = input.data[idx];

             // True if the same value follows directly; only relevant for '*', '_' and '~'.
             final boolean doubled = idx < last && input.data[idx + 1] == ch;

             final int emitted;

             switch (ch) {

             case '*':
                 if (doubled) {
                     idx++;
                     emitted = TOKEN_ASTERISK_DOUBLE;
                 } else {
                     emitted = TOKEN_ASTERISK;
                 }
                 break;

             case '_':
                 if (doubled) {
                     idx++;
                     emitted = TOKEN_UNDERSCORE_DOUBLE;
                 } else {
                     final boolean atEdge = idx == 0 || idx == last;
                     final boolean spaceAfter = idx < last && input.data[idx + 1] == ' ';
                     final boolean spaceBefore = idx > 0 && input.data[idx - 1] == ' ';
                     emitted = (spaceAfter || spaceBefore || atEdge) ? TOKEN_UNDERSCORE : ch;
                 }
                 break;

             case '~':
                 if (doubled) {
                     idx++;
                     emitted = TOKEN_TILDE_DOUBLE;
                 } else {
                     emitted = '~';
                 }
                 break;

             case '^':
                 emitted = TOKEN_CARET;
                 break;

             case '`':
                 emitted = TOKEN_GRAVE;
                 break;

             case '[':
                 emitted = TOKEN_BRACKET_SQUARE_OPEN;
                 break;

             case ']':
                 emitted = TOKEN_BRACKET_SQUARE_CLOSE;
                 break;

             case '(':
                 emitted = TOKEN_PAREN_OPEN;
                 break;

             case ')':
                 emitted = TOKEN_PAREN_CLOSE;
                 break;

             case '&':
                 emitted = TOKEN_UNICODE_OPEN;
                 break;

             case ';':
                 emitted = TOKEN_UNICODE_CLOSE;
                 break;

             case '\\':
                 if (idx < last) {
                     // Escape: copy the following value as it is.
                     idx++;
                     emitted = input.data[idx];
                 } else {
                     emitted = '\\';
                 }
                 break;

             case '\t':
             case '\r':
             case '\f':
             case '\n':
                 emitted = ' ';
                 break;

             default:
                 emitted = ch;
                 break;
             }

             output.data[output.pos++] = emitted;
         }
     }

     /**
      * Finds the first occurrence of {@code needle} in {@code haystack[startInclusive, endExclusive)}.
      *
      * @return the index of the first match, or -1 if the range does not contain the needle
      */
     private static int indexOf(final int[] haystack, final int needle, final int startInclusive,
             final int endExclusive) {
         int cursor = startInclusive;
         while (cursor < endExclusive) {
             if (haystack[cursor] == needle) {
                 return cursor;
             }
             cursor++;
         }
         return -1;
     }

     /**
      * Searches backwards from {@code startInclusive} down to index 0 for {@code needle}.
      *
      * @return the index of the first match found, or -1 if there is none
      */
     private static int reverseIndexOf(final int[] haystack, final int needle, final int startInclusive) {
         int cursor = startInclusive;
         while (cursor >= 0) {
             if (haystack[cursor] == needle) {
                 return cursor;
             }
             cursor--;
         }
         return -1;
     }

     /**
      * Finds the bracket that closes the one at {@code startInclusive}, taking nested bracket
      * pairs into account.
      *
      * @param haystack       the values to search
      * @param openBracket    the value that opens a bracket pair
      * @param closeBracket   the value that closes a bracket pair
      * @param startInclusive the position of the opening bracket
      * @param endExclusive   the position at which the search stops
      * @return the position of the matching closing bracket, or -1 if it is not inside the range
      * @throws RuntimeException if {@code haystack[startInclusive]} is not {@code openBracket}
      */
     public static int findCloseWellBracketed(final int[] haystack, final int openBracket, final int closeBracket,
             final int startInclusive, final int endExclusive) {

         if (haystack[startInclusive] != openBracket) {
             throw new RuntimeException("Internal markdown parser error");
         }

         // Number of brackets that are currently open; 1 for the bracket at startInclusive.
         int depth = 1;

         for (int cursor = startInclusive + 1; cursor < endExclusive; cursor++) {

             final int current = haystack[cursor];

             if (current == openBracket) {
                 depth++;
             } else if (current == closeBracket && --depth == 0) {
                 return cursor;
             }
         }

         return -1;
     }

     /**
      * Checks that {@code haystack[startInclusive, endExclusive)} holds nothing but spaces.
      *
      * @return {@code true} if every value in the range is a space; also {@code true} for an
      *         empty range
      */
     private static boolean isSpaces(final int[] haystack, final int startInclusive, final int endExclusive) {
         int cursor = startInclusive;
         while (cursor < endExclusive) {
             if (haystack[cursor] != ' ') {
                 return false;
             }
             cursor++;
         }
         return true;
     }

     /**
      * Checks that {@code haystack[startInclusive, endExclusive)} holds nothing but the decimal
      * digits '0' to '9'.
      *
      * @return {@code true} if every value in the range is a digit; also {@code true} for an
      *         empty range
      */
     private static boolean isDigits(final int[] haystack, final int startInclusive, final int endExclusive) {
         int cursor = startInclusive;
         while (cursor < endExclusive) {
             final int c = haystack[cursor];
             final boolean digit = c >= '0' && c <= '9';
             if (!digit) {
                 return false;
             }
             cursor++;
         }
         return true;
     }

     /**
      * Checks that {@code haystack[startInclusive, endExclusive)} holds nothing but hexadecimal
      * digits (0-9, a-f, A-F).
      *
      * @return {@code true} if every value in the range is a hex digit; also {@code true} for an
      *         empty range
      */
     private static boolean isHexDigits(final int[] haystack, final int startInclusive, final int endExclusive) {
         int cursor = startInclusive;
         while (cursor < endExclusive) {
             final int c = haystack[cursor];
             final boolean decimal = c >= '0' && c <= '9';
             final boolean lower = c >= 'a' && c <= 'f';
             final boolean upper = c >= 'A' && c <= 'F';
             if (!(decimal || lower || upper)) {
                 return false;
             }
             cursor++;
         }
         return true;
     }

     /**
      * Reads {@code chars[startInclusive, endExclusive)} as a decimal number. The values are
      * expected to be the digits '0' to '9'; this is not checked here.
      *
      * @return the number, or 0 for an empty range
      */
     private static int getDecimal(final int[] chars, final int startInclusive, final int endExclusive) {
         int value = 0;
         int cursor = startInclusive;
         while (cursor < endExclusive) {
             value = value * 10 + (chars[cursor] - '0');
             cursor++;
         }
         return value;
     }

     /**
      * Converts one hexadecimal digit to its value. The argument is expected to be a hex digit;
      * anything that is neither '0'-'9' nor 'a'-'f' is treated as an upper-case digit.
      *
      * @param ch the digit
      * @return the value of the digit
      */
     private static int fromHex(int ch) {
         if ('0' <= ch && ch <= '9') {
             return ch - '0';
         }
         final int base = ('a' <= ch && ch <= 'f') ? 'a' : 'A';
         return 10 + ch - base;
     }

     /**
      * Reads {@code chars[startInclusive, endExclusive)} as a hexadecimal number, converting each
      * value with {@code fromHex}.
      *
      * @return the number, or 0 for an empty range
      */
     private static int getHex(final int[] chars, final int startInclusive, final int endExclusive) {
         int value = 0;
         int cursor = startInclusive;
         while (cursor < endExclusive) {
             value = value * 16 + fromHex(chars[cursor]);
             cursor++;
         }
         return value;
     }

     /**
      * Checks whether {@code needle} occurs in {@code haystack} at position
      * {@code startInclusive}. The caller has to make sure that the haystack is long enough.
      *
      * @return {@code true} if all characters of the needle match; also {@code true} for an empty
      *         needle
      */
     private static boolean equals(final int[] haystack, final char[] needle, int startInclusive) {
         int offset = 0;
         while (offset < needle.length) {
             if (needle[offset] != haystack[startInclusive + offset]) {
                 return false;
             }
             offset++;
         }
         return true;
     }

     /**
      * Determines which web link prefix, if any, starts at {@code startInclusive}.
      *
      * @param haystack       the values to inspect
      * @param startInclusive the position at which the prefix has to start
      * @param endExclusive   the end of the usable part of the haystack
      * @return the index into {@code linkPrefixes} of the first prefix that fits into the range
      *         and matches, or -1 if none does
      */
     private static int getLinkStartType(final int[] haystack, final int startInclusive, final int endExclusive) {
         final int available = endExclusive - startInclusive;
         int type = 0;
         while (type < linkPrefixes.length) {
             final char[] prefix = linkPrefixes[type];
             // The length check keeps equals() from reading past the usable range.
             if (prefix.length <= available && equals(haystack, prefix, startInclusive)) {
                 return type;
             }
             type++;
         }
         return -1;
     }

     /**
      * Determines which subreddit/user prefix, if any, starts at {@code startInclusive}.
      *
      * @param haystack       the values to inspect
      * @param startInclusive the position at which the prefix has to start
      * @param endExclusive   the end of the usable part of the haystack
      * @return the index into {@code linkPrefixes_reddit} of the first prefix that fits into the
      *         range and matches, or -1 if none does
      */
     private static int getRedditLinkStartType(final int[] haystack, final int startInclusive,
             final int endExclusive) {
         final int available = endExclusive - startInclusive;
         int type = 0;
         while (type < linkPrefixes_reddit.length) {
             final char[] prefix = linkPrefixes_reddit[type];
             // The length check keeps equals() from reading past the usable range.
             if (prefix.length <= available && equals(haystack, prefix, startInclusive)) {
                 return type;
             }
             type++;
         }
         return -1;
     }

     // TODO avoid generating new array
     /**
      * Produces the literal form of {@code tokens[startInclusive, endExclusive)}: every negative
      * value is replaced by its text from {@code reverseLookup}, every other value is copied.
      *
      * @return a newly allocated array holding the literal characters
      */
     private static int[] revert(final int[] tokens, final int startInclusive, final int endExclusive) {

         // First pass: work out how long the result is.
         int literalLength = 0;
         for (int idx = startInclusive; idx < endExclusive; idx++) {
             final int value = tokens[idx];
             literalLength += (value < 0) ? reverseLookup[20 + value].length : 1;
         }

         // Second pass: fill the result.
         final int[] literal = new int[literalLength];
         int writePos = 0;

         for (int idx = startInclusive; idx < endExclusive; idx++) {
             final int value = tokens[idx];

             if (value >= 0) {
                 literal[writePos++] = value;
                 continue;
             }

             final char[] text = reverseLookup[20 + value];
             for (int k = 0; k < text.length; k++) {
                 literal[writePos++] = text[k];
             }
         }

         return literal;
     }

 }