opennlp.tools.util.featuregen.StringPattern.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.tools.util.featuregen.StringPattern.java

Source

  /*
   * Licensed to the Apache Software Foundation (ASF) under one or more
   * contributor license agreements.  See the NOTICE file distributed with
   * this work for additional information regarding copyright ownership.
   * The ASF licenses this file to You under the Apache License, Version 2.0
   * (the "License"); you may not use this file except in compliance with
   * the License. You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */

  package opennlp.tools.util.featuregen;

  /**
   * Recognizes predefined patterns in strings.
   *
   * <p>An instance is a summary of a single token produced by {@link #recognize(String)}:
   * a set of bit flags (letter case, digits, selected punctuation, Japanese kana) plus the
   * number of decimal digits found. Both values are stored in final fields.
   */
  public class StringPattern {

      // One bit per recognized property; the bits are OR-ed together into 'pattern'.
      private static final int INITIAL_CAPITAL_LETTER = 0x1;
      private static final int ALL_CAPITAL_LETTER = 0x1 << 1;
      private static final int ALL_LOWERCASE_LETTER = 0x1 << 2;
      private static final int ALL_LETTERS = 0x1 << 3;
      private static final int ALL_DIGIT = 0x1 << 4;
      private static final int ALL_HIRAGANA = 0x1 << 5;
      private static final int ALL_KATAKANA = 0x1 << 6;
      private static final int CONTAINS_PERIOD = 0x1 << 7;
      private static final int CONTAINS_COMMA = 0x1 << 8;
      private static final int CONTAINS_SLASH = 0x1 << 9;
      private static final int CONTAINS_DIGIT = 0x1 << 10;
      private static final int CONTAINS_HYPHEN = 0x1 << 11;
      private static final int CONTAINS_LETTERS = 0x1 << 12;
      private static final int CONTAINS_UPPERCASE = 0x1 << 13;

      /** Bit set built from the constants above. */
      private final int pattern;

      /** Number of decimal digits counted in the token. */
      private final int digits;

      private StringPattern(int pattern, int digits) {
          this.pattern = pattern;
          this.digits = digits;
      }

      /**
       * Analyzes a token and returns the patterns it matches.
       *
       * <p>The token is scanned one Unicode code point at a time, so a character outside
       * the Basic Multilingual Plane is classified once as a whole and not as two
       * surrogate chars.
       *
       * <p>All "all ..." flags start out set and are cleared by the first code point that
       * contradicts them; for an empty token they therefore all remain set.
       *
       * @param token the text to analyze
       * @return the recognized pattern
       * @throws NullPointerException if {@code token} is {@code null}
       */
      public static StringPattern recognize(String token) {

          int pattern = ALL_CAPITAL_LETTER | ALL_LOWERCASE_LETTER | ALL_DIGIT | ALL_LETTERS
              | ALL_HIRAGANA | ALL_KATAKANA;

          int digits = 0;

          for (int i = 0; i < token.length(); ) {
              final int codePoint = token.codePointAt(i);
              final int type = Character.getType(codePoint);

              if (isLetterType(type)) {
                  pattern |= CONTAINS_LETTERS;
                  pattern &= ~ALL_DIGIT;

                  if (type == Character.UPPERCASE_LETTER) {
                      if (i == 0) {
                          pattern |= INITIAL_CAPITAL_LETTER;
                      }
                      pattern |= CONTAINS_UPPERCASE;
                      pattern &= ~ALL_LOWERCASE_LETTER;
                  } else {
                      pattern &= ~ALL_CAPITAL_LETTER;
                  }
              } else {
                  // Anything that is not a letter rules out every "all letters" variant.
                  pattern &= ~(ALL_LETTERS | ALL_CAPITAL_LETTER | ALL_LOWERCASE_LETTER);

                  if (type == Character.DECIMAL_DIGIT_NUMBER) {
                      pattern |= CONTAINS_DIGIT;
                      pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
                      digits++;
                  } else {
                      pattern &= ~ALL_DIGIT;
                  }

                  pattern |= punctuationFlag(codePoint);
              }

              pattern = applyScript(pattern, codePoint);

              // Advance by one or two chars depending on the width of the code point.
              i += Character.charCount(codePoint);
          }

          return new StringPattern(pattern, digits);
      }

      /** Tells whether a {@link Character#getType(int)} value is one of the letter categories. */
      private static boolean isLetterType(int type) {
          return type == Character.UPPERCASE_LETTER
              || type == Character.LOWERCASE_LETTER
              || type == Character.TITLECASE_LETTER
              || type == Character.MODIFIER_LETTER
              || type == Character.OTHER_LETTER;
      }

      /** Maps one of the tracked punctuation characters to its flag, or returns 0. */
      private static int punctuationFlag(int codePoint) {
          switch (codePoint) {
              case ',':
                  return CONTAINS_COMMA;
              case '.':
                  return CONTAINS_PERIOD;
              case '/':
                  return CONTAINS_SLASH;
              case '-':
                  return CONTAINS_HYPHEN;
              default:
                  return 0;
          }
      }

      /** Clears the script-dependent flags that the given code point contradicts. */
      private static int applyScript(int pattern, int codePoint) {
          switch (Character.UnicodeScript.of(codePoint)) {
              case HIRAGANA:
                  return pattern & ~(ALL_KATAKANA | ALL_LOWERCASE_LETTER);
              case KATAKANA:
                  return pattern & ~(ALL_HIRAGANA | ALL_LOWERCASE_LETTER);
              case HAN:
                  return pattern & ~(ALL_HIRAGANA | ALL_KATAKANA | ALL_LOWERCASE_LETTER);
              case INHERITED:
                  // Combining marks (e.g. the kana voicing marks U+3099/U+309A) take the
                  // script of their base character, so they decide nothing on their own.
                  return pattern;
              case COMMON:
                  return isKanaNeutral(codePoint)
                      ? pattern
                      : pattern & ~(ALL_HIRAGANA | ALL_KATAKANA);
              default:
                  // LATIN and every other script (Cyrillic, Greek, Arabic, ...) is
                  // neither hiragana nor katakana.
                  return pattern & ~(ALL_HIRAGANA | ALL_KATAKANA);
          }
      }

      /**
       * Tells whether a COMMON-script code point may occur inside a kana word without
       * ending its "all hiragana" / "all katakana" classification.
       */
      private static boolean isKanaNeutral(int codePoint) {
          // NOTE(review): the three character literals that were compared here had been
          // stripped from the file (they appeared as empty ''), which does not compile.
          // The set below is a reconstruction - confirm it against the upstream source.
          return codePoint == '\u30FC'    // KATAKANA-HIRAGANA PROLONGED SOUND MARK
              || codePoint == '\u30FB'    // KATAKANA MIDDLE DOT
              || codePoint == '\uFF70';   // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
      }

      /**
       * @return true if every code point is a letter (also true for an empty token).
       */
      public boolean isAllLetter() {
          return (pattern & ALL_LETTERS) != 0;
      }

      /**
       * @return true if the first code point is an upper-case letter.
       */
      public boolean isInitialCapitalLetter() {
          return (pattern & INITIAL_CAPITAL_LETTER) != 0;
      }

      /**
       * @return true if every code point is an upper-case letter (also true for an
       *     empty token).
       */
      public boolean isAllCapitalLetter() {
          return (pattern & ALL_CAPITAL_LETTER) != 0;
      }

      /**
       * @return true if the token contains no upper-case letter, no non-letter and no
       *     Han, hiragana or katakana character (also true for an empty token).
       */
      public boolean isAllLowerCaseLetter() {
          return (pattern & ALL_LOWERCASE_LETTER) != 0;
      }

      /**
       * @return true if every code point is a decimal digit (also true for an empty token).
       */
      public boolean isAllDigit() {
          return (pattern & ALL_DIGIT) != 0;
      }

      /**
       * @return true if all chars are hiragana (also true for an empty token).
       */
      public boolean isAllHiragana() {
          return (pattern & ALL_HIRAGANA) != 0;
      }

      /**
       * @return true if all chars are katakana (also true for an empty token).
       */
      public boolean isAllKatakana() {
          return (pattern & ALL_KATAKANA) != 0;
      }

      /**
       * Retrieves the number of digits.
       *
       * @return how many decimal digits the token contains.
       */
      public int digits() {
          return digits;
      }

      /**
       * @return true if the token contains a {@code '.'}.
       */
      public boolean containsPeriod() {
          return (pattern & CONTAINS_PERIOD) != 0;
      }

      /**
       * @return true if the token contains a {@code ','}.
       */
      public boolean containsComma() {
          return (pattern & CONTAINS_COMMA) != 0;
      }

      /**
       * @return true if the token contains a {@code '/'}.
       */
      public boolean containsSlash() {
          return (pattern & CONTAINS_SLASH) != 0;
      }

      /**
       * @return true if the token contains at least one decimal digit.
       */
      public boolean containsDigit() {
          return (pattern & CONTAINS_DIGIT) != 0;
      }

      /**
       * @return true if the token contains a {@code '-'}.
       */
      public boolean containsHyphen() {
          return (pattern & CONTAINS_HYPHEN) != 0;
      }

      /**
       * @return true if the token contains at least one letter.
       */
      public boolean containsLetters() {
          return (pattern & CONTAINS_LETTERS) != 0;
      }
  }