org.silverpeas.core.util.EncodingUtil.java Source code

Java tutorial

Introduction

Here is the source code for org.silverpeas.core.util.EncodingUtil.java

Source

  /*
   * Copyright (C) 2000 - 2018 Silverpeas
   *
   * This program is free software: you can redistribute it and/or modify
   * it under the terms of the GNU Affero General Public License as
   * published by the Free Software Foundation, either version 3 of the
   * License, or (at your option) any later version.
   *
   * As a special exception to the terms and conditions of version 3.0 of
   * the GPL, you may redistribute this Program in connection with Free/Libre
   * Open Source Software ("FLOSS") applications as described in Silverpeas's
   * FLOSS exception.  You should have received a copy of the text describing
   * the FLOSS exception, and it is also available here:
   * "https://www.silverpeas.org/legal/floss_exception.html"
   *
   * This program is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU Affero General Public License for more details.
   *
   * You should have received a copy of the GNU Affero General Public License
   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
   */
  package org.silverpeas.core.util;

  import com.ibm.icu.text.CharsetDetector;
  import com.ibm.icu.text.CharsetMatch;
  import org.apache.commons.codec.binary.Hex;
  import org.apache.commons.lang3.ArrayUtils;
  import org.apache.commons.lang3.CharEncoding;
  import org.apache.commons.lang3.StringUtils;
  import org.silverpeas.core.util.StringUtil;

  import java.io.UnsupportedEncodingException;
  import java.text.ParseException;
  import java.util.LinkedHashSet;
  import java.util.Set;

  public class EncodingUtil extends StringUtils {

private static final char[] PUNCTUATION =
    new char[]{'&', '\"', '\'', '{', '(', '[', '-', '|', '`', '_', '\\', '^', '@', ')', ']', '=',
        '+', '}', '?', ',', '.', ';', '/', ':', '!', '', '%', '*', '$', '', '', '', '', '',
        ''};

      /**
       * Method for trying to detect encoding.
       * @param data some data to try to detect the encoding.
       * @param declaredEncoding expected encoding.
       * @return the detected encoding.
       */
      public static String detectStringEncoding(byte[] data, String declaredEncoding)
              throws UnsupportedEncodingException {
          if (data != null) {
              String value = new String(data, declaredEncoding);
              if (hasEncodingToBeChecked(value)) {
                  return findBestEncoding(data, declaredEncoding);
              }
          }
          return declaredEncoding;
      }

      /**
       * If the value contains one character which is neither an alphanumeric, neither a whitespace
       * neither a punctuation character, then the encoding has to be checked
       * @param value the value to verify.
       * @return true if the encoding of the given value has to be checked.
       */
      private static boolean hasEncodingToBeChecked(String value) {
          if (value != null) {
              char[] chars = value.toCharArray();
              for (char currentChar : chars) {
                  if (!Character.isLetterOrDigit(currentChar) && !Character.isWhitespace(currentChar)
                          && !ArrayUtils.contains(PUNCTUATION, currentChar)) {
                      return true;
                  }
              }
          }
          return false;
      }

      /**
       * @param data some data to try to detect the encoding.
       * @param declaredEncoding expected encoding.
       * @return the best encoding or the one declared if the encoding could not have to be guessed.
       * @throws UnsupportedEncodingException
       */
      private static String findBestEncoding(byte[] data, String declaredEncoding)
              throws UnsupportedEncodingException {
          final Set<String> supportedEncodings;
          if (CharEncoding.UTF_8.equals(declaredEncoding)) {
              supportedEncodings = detectMaybeEncoding(data, CharEncoding.ISO_8859_1);
          } else {
              supportedEncodings = detectMaybeEncoding(data, CharEncoding.UTF_8);
          }
          for (String encoding : supportedEncodings) {
              String encodedData = new String(data, encoding);
              if (!hasEncodingToBeChecked(encodedData)) {
                  return encoding;
              }
          }
          return declaredEncoding;
      }

      /**
       * Method for trying to detect encoding
       * @param data some data to try to detect the encoding.
       * @param declaredEncoding expected encoding.
       * @return the possible encodings.
       */
      private static Set<String> detectMaybeEncoding(byte[] data, String declaredEncoding) {
          final CharsetDetector detector = new CharsetDetector();
          if (!StringUtil.isDefined(declaredEncoding)) {
              detector.setDeclaredEncoding(CharEncoding.ISO_8859_1);
          } else {
              detector.setDeclaredEncoding(declaredEncoding);
          }
          detector.setText(data);
          CharsetMatch[] detectedEnc = detector.detectAll();
          Set<String> encodings = new LinkedHashSet<>(detectedEnc.length);
          for (CharsetMatch detectedEncoding : detectedEnc) {
              encodings.add(detectedEncoding.getName());
          }
          return encodings;
      }

      /**
       * Decodes the specified text with hexadecimal values in bytes of those same values. The text is
       * considered to be in the UTF-8 charset.
       *
       * @param hexText the text with hexadecimal-based characters.
       * @return the binary representation of the text.
       * @throws ParseException if an odd number or illegal of characters is supplied.
       */
      public static byte[] fromHex(String hexText) throws ParseException {
          try {
              return Hex.decodeHex(hexText.toCharArray());
          } catch (Exception ex) {
              throw new ParseException(ex.getMessage(), -1);
          }
      }

      /**
       * Encodes the specified binary data into a String representing the hexadecimal values of each
       * byte in order. The String is in the UTF-8 charset.
       *
       * @param binaryData the binary data to concert in hexadecimal-based String.
       * @return a String representation of the binary data in Hexadecimal characters.
       */
      public static String asHex(byte[] binaryData) {
          return Hex.encodeHexString(binaryData);
      }

      private EncodingUtil() {
      }
  }