org.vosao.utils.StrUtil.java Source code

Java tutorial

Introduction

Here is the source code for org.vosao.utils.StrUtil.java

Source

/**
 * Vosao CMS. Simple CMS for Google App Engine.
 * 
 * Copyright (C) 2009-2010 Vosao development team.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * email: vosao.dev@gmail.com
 */

package org.vosao.utils;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Static string helpers: CSV packing/unpacking, gzip compression of strings,
 * and removal of markup from HTML to obtain plain text.
 */
public class StrUtil {

    private static final Log logger = LogFactory.getLog(StrUtil.class);

    /**
     * Matches a self-closed ({@code />}) meta tag whose {@code name} is
     * "description", with the {@code content} and {@code name} attributes in
     * either order. The content value is captured in group 4 (content first)
     * or group 10 (name first). Only all-lowercase or all-uppercase spellings
     * are recognised. Note that the leading {@code <?} makes the opening
     * angle bracket optional.
     */
    public static final String DESCRIPTION_REGEX = "<?(meta|META)\\s.*?((content|CONTENT)=\"(.*?)\"\\s+(name|NAME)=\"(description|DESCRIPTION)\"|(name|NAME)=\"(description|DESCRIPTION)\"\\s+(content|CONTENT)=\"(.*?)\")\\s*/>";
    public static final Pattern DESCRIPTION_PATTERN = Pattern.compile(DESCRIPTION_REGEX);

    /**
     * Same shape as {@link #DESCRIPTION_REGEX} but for a meta tag whose
     * {@code name} is "keywords"; the content value is in group 4 or 10.
     */
    public static final String KEYWORDS_REGEX = "<?(meta|META)\\s.*?((content|CONTENT)=\"(.*?)\"\\s+(name|NAME)=\"(keywords|KEYWORDS)\"|(name|NAME)=\"(keywords|KEYWORDS)\"\\s+(content|CONTENT)=\"(.*?)\")\\s*/>";
    public static final Pattern KEYWORDS_PATTERN = Pattern.compile(KEYWORDS_REGEX);

    /** Matches the closing head tag, all-lowercase or all-uppercase. */
    public static final String HEAD_CLOSE_REGEX = "</(head|HEAD)>";
    public static final Pattern HEAD_CLOSE_PATTERN = Pattern.compile(HEAD_CLOSE_REGEX);

    private static final String SCRIPT_OPEN = "<script";
    private static final String SCRIPT_CLOSE = "</script>";

    /** Any tag-like span from '<' to the nearest '>' on the same line. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");
    private static final Pattern NEWLINES_PATTERN = Pattern.compile("\n+");
    private static final Pattern TABS_PATTERN = Pattern.compile("\t+");
    private static final Pattern SPACES_PATTERN = Pattern.compile(" +");

    /**
     * Velocity template constructs stripped from search text.
     * NOTE(review): "##.*$" is compiled without MULTILINE, so '$' anchors only
     * at the end of the input; a "##" comment is removed only on the last line.
     */
    private static final Pattern[] VELOCITY_PATTERNS = compileAll("\\$\\{.*\\}", "##.*$", "#for\\s*\\(.*",
            "#set\\s*\\(.*", "\\$\\w+\\.\\w+\\(.*\\)", "\\$\\w+\\.\\w+", "#if\\s*\\(.*\\)", "#end");

    /** CDATA section delimiters, removed from search text. */
    private static final Pattern[] XML_PATTERNS = compileAll("<\\!\\[CDATA\\[", "\\]\\]>");

    /** A few HTML entities plus punctuation and newlines, replaced by a space in search text. */
    private static final Pattern[] HTML_PATTERNS = compileAll("\\&gt;", "\\&lt;", "\\&nbsp;",
            "[\\n.,`~@#$%\\^\\&*\\(\\)\\[\\]\\-\\=\\/\\|]");

    /** Compiles each regex once so callers do not recompile on every invocation. */
    private static Pattern[] compileAll(String... regexes) {
        Pattern[] compiled = new Pattern[regexes.length];
        for (int i = 0; i < regexes.length; i++) {
            compiled[i] = Pattern.compile(regexes[i]);
        }
        return compiled;
    }

    /** Applies every pattern in order, replacing each match with {@code replacement}. */
    private static String replaceEach(String text, Pattern[] patterns, String replacement) {
        String result = text;
        for (Pattern pattern : patterns) {
            result = pattern.matcher(result).replaceAll(replacement);
        }
        return result;
    }

    /** Joins the items with ',' in iteration order; no quoting or escaping is applied. */
    private static String _toCSV(Collection<String> list) {
        StringBuilder result = new StringBuilder();
        boolean first = true;
        for (String item : list) {
            if (!first) {
                result.append(',');
            }
            result.append(item);
            first = false;
        }
        return result.toString();
    }

    /**
     * Joins the set's items into a comma separated string.
     * 
     * @param list - items to join; items are not escaped.
     * @return - items separated by ',' in the set's iteration order.
     */
    public static String toCSV(Set<String> list) {
        return _toCSV(list);
    }

    /**
     * Joins the list's items into a comma separated string.
     * 
     * @param list - items to join; items are not escaped.
     * @return - items separated by ',' in list order.
     */
    public static String toCSV(List<String> list) {
        return _toCSV(list);
    }

    /**
     * Splits a comma separated string into its items.
     * 
     * @param data - comma separated string, may be null or empty.
     * @return - new mutable list of items; empty for null or empty input.
     *           Trailing empty items are dropped (String.split semantics),
     *           so "a,b,," yields [a, b].
     */
    public static List<String> fromCSV(String data) {
        List<String> result = new ArrayList<String>();
        if (!StringUtils.isEmpty(data)) {
            // split() returns the whole string as a single item when there is no comma.
            for (String s : data.split(",")) {
                result.add(s);
            }
        }
        return result;
    }

    /**
     * Gzip the input string into a byte[].
     * 
     * @param input - string to compress; encoded as UTF-8 before compression.
     * @return - gzip compressed bytes.
     * @throws IOException - if compression fails.
     */
    public static byte[] zipStringToBytes(String input) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        BufferedOutputStream bufos = new BufferedOutputStream(new GZIPOutputStream(bos));
        try {
            bufos.write(input.getBytes("UTF-8"));
        } finally {
            // Closing flushes the buffer and writes the gzip trailer; doing it in
            // finally also closes the gzip stream when the write fails.
            bufos.close();
        }
        return bos.toByteArray();
    }

    /**
     * Unzip a string out of the given gzipped byte array.
     * 
     * @param bytes - gzip compressed UTF-8 text.
     * @return - decompressed string.
     * @throws IOException - if the data is not valid gzip or reading fails.
     */
    public static String unzipStringFromBytes(byte[] bytes) throws IOException {
        BufferedInputStream bufis = new BufferedInputStream(
                new GZIPInputStream(new ByteArrayInputStream(bytes)));
        try {
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            byte[] buf = new byte[1024];
            int len;
            while ((len = bufis.read(buf)) > 0) {
                bos.write(buf, 0, len);
            }
            return bos.toString("UTF-8");
        } finally {
            // Close the gzip stream even when the data is corrupt.
            bufis.close();
        }
    }

    /**
     * Removes every script element (from "&lt;script" through the matching
     * "&lt;/script&gt;") from the given text. A script element that is never
     * closed is removed through to the end of the input. Matching is
     * case-sensitive (lowercase tags only).
     * 
     * @param s - text to clean, must not be null.
     * @return - text without script elements.
     */
    public static String removeJavascript(String s) {
        StringBuilder buf = new StringBuilder(s);
        int scriptStart = buf.indexOf(SCRIPT_OPEN);
        // >= 0 so that a script element at the very beginning is removed too.
        while (scriptStart >= 0) {
            int closeAt = buf.indexOf(SCRIPT_CLOSE, scriptStart);
            int scriptEnd = closeAt < 0 ? buf.length() : closeAt + SCRIPT_CLOSE.length();
            buf.delete(scriptStart, scriptEnd);
            // Text after the removed element now starts at scriptStart.
            scriptStart = buf.indexOf(SCRIPT_OPEN, scriptStart);
        }
        return buf.toString();
    }

    /**
     * Reduces HTML to text for searching: removes script elements and tags,
     * then Velocity constructs and CDATA delimiters, and finally replaces a
     * few entities, punctuation characters and newlines with spaces.
     * 
     * @param html - source markup, must not be null.
     * @return - remaining text; runs of spaces are not collapsed.
     */
    public static String extractSearchTextFromHTML(String html) {
        String result = TAG_PATTERN.matcher(removeJavascript(html)).replaceAll("");
        result = replaceEach(result, VELOCITY_PATTERNS, "");
        result = replaceEach(result, XML_PATTERNS, "");
        return replaceEach(result, HTML_PATTERNS, " ");
    }

    /**
     * Reduces HTML to plain text: removes script elements, replaces tags,
     * non-breaking space entities, newlines and tabs with spaces, and
     * collapses runs of spaces into one.
     * 
     * @param html - source markup, must not be null.
     * @return - plain text.
     */
    public static String extractTextFromHTML(String html) {
        String text = TAG_PATTERN.matcher(removeJavascript(html)).replaceAll(" ");
        text = text.replace("&#160;", " ").replace("&nbsp;", " ");
        text = NEWLINES_PATTERN.matcher(text).replaceAll(" ");
        text = TABS_PATTERN.matcher(text).replaceAll(" ");
        return SPACES_PATTERN.matcher(text).replaceAll(" ");
    }

    /**
     * Converts strings to Long values.
     * 
     * @param list - strings to parse.
     * @return - parsed values in input order; items that are not valid
     *           numbers are logged as errors and skipped.
     */
    public static List<Long> toLong(List<String> list) {
        List<Long> result = new ArrayList<Long>();
        for (String s : list) {
            try {
                result.add(Long.valueOf(s));
            } catch (NumberFormatException e) {
                logger.error("Wrong number format " + s);
            }
        }
        return result;
    }

    /**
     * Splits text into words on runs of spaces, newlines, tabs and the
     * characters , . : ? ! ~ #
     * 
     * @param data - text to split, must not be null.
     * @return - words; the first element is empty when the text starts with a separator.
     */
    public static String[] splitByWord(String data) {
        return data.split("[ ,.:?!~#\n\t]+");
    }

    /**
     * Unpack title from old csv format: enTitle1,ruTitle2
     * The first two characters of each item become the map key and the rest
     * the value. Items of two characters or fewer are skipped; a repeated key
     * keeps the last value.
     * @param data - data to unpack.
     * @return - result map.
     */
    public static Map<String, String> unpack06Title(String data) {
        Map<String, String> result = new HashMap<String, String>();
        if (StringUtils.isEmpty(data)) {
            return result;
        }
        for (String item : data.split(",")) {
            if (item.length() > 2) {
                result.put(item.substring(0, 2), item.substring(2));
            }
        }
        return result;
    }
}