com.adguard.commons.utils.CharsetUtils.java Source code

Java tutorial

Introduction

Here is the source code for com.adguard.commons.utils.CharsetUtils.java

Source

/**
 This file is part of Adguard Content Blocker (https://github.com/AdguardTeam/ContentBlocker).
 Copyright  2016 Performix LLC. All rights reserved.
    
 Adguard Content Blocker is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by the
 Free Software Foundation, either version 3 of the License, or (at your option)
 any later version.
    
 Adguard Content Blocker is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
    
 You should have received a copy of the GNU General Public License along with
 Adguard Content Blocker.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.adguard.commons.utils;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Helper methods for resolving {@link Charset} instances from charset names
 * and from Content-Type header values.
 * <p>
 * The lookup methods never throw for unknown or malformed charset names:
 * they return the supplied default charset (or {@code null}) instead.
 */
public class CharsetUtils {

    /**
     * Matches code page style names such as "cp1251", "CP-1251", "cp_1251" or "cp 1251".
     * Group 2 captures the digits of the code page.
     */
    private static final Pattern CODE_PAGE_REGEX = Pattern.compile("cp([-_ ]*)?([0-9]+)", Pattern.CASE_INSENSITIVE);
    private static final Logger log = LoggerFactory.getLogger(CharsetUtils.class);

    /**
     * Prefix of the Content-Type parameter that carries the charset name
     */
    private static final String CHARSET_PARAMETER = "charset=";

    /**
     * Characters that may separate the charset value from whatever follows it
     */
    private static final String VALUE_SEPARATORS = ",;";

    /**
     * Quote characters that may wrap the charset value, e.g. {@code charset="utf-8"}
     */
    private static final String QUOTE_CHARS = "\"'";

    /**
     * Default http encoding
     */
    public static final Charset DEFAULT_HTTP_ENCODING = Charset.forName("ISO-8859-1");

    /**
     * Utf-8 encoding
     */
    public static final Charset UTF8 = Charset.forName("utf-8");

    /**
     * Extracts Charset from Content-Type header value
     *
     * @param contentType Content-Type header value
     * @return Charset or DEFAULT_HTTP_ENCODING if it is not specified or cannot be resolved
     */
    public static Charset forContentType(String contentType) {
        return forContentType(contentType, DEFAULT_HTTP_ENCODING);
    }

    /**
     * Extracts Charset from Content-Type header value.
     * <p>
     * The header is split on ';' and the first part containing "charset=" (case-insensitive)
     * decides the result. The value may be wrapped in single or double quotes and may be
     * surrounded by whitespace, e.g. {@code text/html; charset="utf-8"}.
     *
     * @param contentType    Content-Type header value (may be null or empty)
     * @param defaultCharset Will be returned if no charset found or the charset name cannot be resolved
     * @return Charset or defaultCharset
     */
    public static Charset forContentType(String contentType, Charset defaultCharset) {
        if (StringUtils.isEmpty(contentType)) {
            return defaultCharset;
        }

        for (String part : StringUtils.split(contentType, ';')) {
            String parameter = part.trim();

            // Search the original text ignoring case rather than a lower-cased copy:
            // lower-casing may change the string length, which would make the offset
            // found in the copy invalid for the original string.
            int index = StringUtils.indexOfIgnoreCase(parameter, CHARSET_PARAMETER);
            if (index == -1) {
                continue;
            }

            String rawValue = parameter.substring(index + CHARSET_PARAMETER.length());
            // The first charset parameter wins, even when its value is empty or unknown
            return forName(extractCharsetName(rawValue), defaultCharset);
        }

        return defaultCharset;
    }

    /**
     * Safely gets charset for the specified name
     *
     * @param charsetName Charset name
     * @return Charset or null
     */
    public static Charset forName(String charsetName) {
        return forName(charsetName, null);
    }

    /**
     * Safely gets charset for the specified name.
     * <p>
     * The name is trimmed and looked up with {@link Charset#forName(String)}. If that lookup
     * fails and the name looks like a code page (see CODE_PAGE_REGEX), a second lookup
     * is made with the normalized name "CP" + number.
     *
     * @param charsetName    Charset name (may be null or blank)
     * @param defaultCharset Default charset (if nothing found for specified charset name)
     * @return Charset or defaultCharset
     */
    public static Charset forName(String charsetName, Charset defaultCharset) {
        if (StringUtils.isBlank(charsetName)) {
            // Nothing to look up; avoid driving this case through exceptions
            return defaultCharset;
        }

        String name = charsetName.trim();
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException ex) {
            // IllegalCharsetNameException and UnsupportedCharsetException are both
            // subclasses of IllegalArgumentException
            Charset codePageCharset = forCodePageName(name, ex);
            return codePageCharset != null ? codePageCharset : defaultCharset;
        }
    }

    /**
     * Takes the charset name out of the text that follows "charset=": keeps everything up to
     * the first ',' or ';' and removes surrounding whitespace and quotes.
     *
     * @param rawValue Text following "charset="
     * @return Cleaned charset name or empty string if there is none
     */
    private static String extractCharsetName(String rawValue) {
        String[] tokens = StringUtils.split(rawValue, VALUE_SEPARATORS);
        if (tokens == null || tokens.length == 0) {
            // Empty value, e.g. "charset=" or "charset=,"
            return "";
        }

        return StringUtils.strip(tokens[0].trim(), QUOTE_CHARS).trim();
    }

    /**
     * Fallback lookup for code page style names which the direct lookup has rejected.
     *
     * @param charsetName   Charset name which could not be resolved directly
     * @param lookupFailure Exception thrown by the direct lookup (used for logging only)
     * @return Charset or null if the name is not a resolvable code page
     */
    private static Charset forCodePageName(String charsetName, Exception lookupFailure) {
        try {
            Matcher matcher = CODE_PAGE_REGEX.matcher(charsetName);
            if (!matcher.find()) {
                log.debug("Charset not found for {}", charsetName, lookupFailure);
                return null;
            }

            // NOTE(review): NumberUtils is a project helper not visible here; presumably it
            // yields a non-positive value when the digits cannot be parsed - confirm.
            int codePage = NumberUtils.toInteger(matcher.group(2));
            return codePage > 0 ? Charset.forName("CP" + codePage) : null;
        } catch (Exception ex) {
            // Best effort: the normalized code page name is unknown as well
            log.debug("Charset not found for {}", charsetName, ex);
            return null;
        }
    }
}