com.sfs.DataFilter.java Source code

Introduction

Here is the source code for com.sfs.DataFilter.java
Source

/*******************************************************************************
 * Copyright (c) 2009 David Harrison.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl-3.0.html
 *
 * Contributors:
 *     David Harrison - initial API and implementation
 ******************************************************************************/
package com.sfs;

import java.text.ParseException;
import java.text.SimpleDateFormat;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Calendar;
import java.util.Set;
import java.util.HashSet;
import java.util.TreeMap;
import java.util.StringTokenizer;

import org.apache.commons.lang.StringUtils;

import org.clapper.util.html.HTMLUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.safety.Whitelist;

/**
 * The DataFilter class has a range of static utility functions for parsing and
 * filtering incoming data: HTML/XML escaping and clean-up, forgiving date,
 * currency and integer parsing, tabular text parsing and search-box
 * tokenising.
 */
public class DataFilter {

    /*
     * REPLACE_WHAT, REPLACE_TO and ALLOWED_HTML are parallel arrays: the entry
     * at index i of REPLACE_WHAT is swapped for the placeholder at index i of
     * REPLACE_TO, which is finally swapped for the tag at index i of
     * ALLOWED_HTML. All three must therefore stay the same length and order.
     */

    /** The Constant REPLACE_WHAT - the (upper-case) input tags that are allowed through. */
    private static final String[] REPLACE_WHAT = { "<P>", "</P>", "<BR>", "\n", "<STRONG>", "</STRONG>", "<EM>",
            "</EM>", "<U>", "</U>", "<OL>", "</OL>", "<UL>", "</UL>", "<LI>", "</LI>", "<SUP>", "</SUP>", "<SUB>",
            "</SUB>" };

    /** The Constant REPLACE_TO - placeholders that survive the escaping pass. */
    private static final String[] REPLACE_TO = { "%p%", "%/p%", "%br%", "%br%", "%b%", "%/b%", "%i%", "%/i%", "%u%",
            "%/u%", "%ol%", "%/ol%", "%ul%", "%/ul%", "%li%", "%/li%", "%sup%", "%/sup%", "%sub%", "%/sub%" };

    /** The Constant ALLOWED_HTML - the tags emitted in place of the placeholders. */
    private static final String[] ALLOWED_HTML = { "<p>", "</p>", "<br/>", "<br/>", "<b>", "</b>", "<i>", "</i>",
            "<u>", "</u>", "<ol>", "</ol>", "<ul>", "</ul>", "<li>", "</li>", "<sup>", "</sup>", "<sub>",
            "</sub>" };

    /** The Constant DATE_FORMATS - date patterns, tried in this order. */
    private static final String[] DATE_FORMATS = { "dd/MM/yyyy", "dd.MM.yyyy", "dd-MM-yyyy", "dd/M/yyyy",
            "dd.M.yyyy", "dd-M-yyyy", "d/M/yyyy", "d.M.yyyy", "d-M-yyyy", "EEEE dd MMMM yyyy" };

    /** The Constant TIME_FORMATS - time patterns that may prefix a date pattern. */
    private static final String[] TIME_FORMATS = { "HH:mm", "hh:mm a" };

    /** The Constant COMMON_WORDS - tokens that are ignored by the search parser. */
    private static final Set<String> COMMON_WORDS = buildCommonWords();

    /** The Constant DOUBLE_QUOTE. */
    private static final String DOUBLE_QUOTE = "\"";

    /** The Constant WHITESPACE_AND_QUOTES. */
    private static final String WHITESPACE_AND_QUOTES = " \t\r\n\"";

    /** The Constant QUOTES_ONLY. */
    private static final String QUOTES_ONLY = "\"";

    /**
     * Instantiates a new data filter. This is a utility class made up of
     * static methods, so construction always fails.
     *
     * @throws UnsupportedOperationException always
     */
    protected DataFilter() {
        throw new UnsupportedOperationException();
    }

    /**
     * Builds the set of common words once, at class initialisation.
     *
     * @return the set of words (including the empty string) to ignore
     */
    private static Set<String> buildCommonWords() {
        final String[] words = { "a", "and", "", "for", "from", "has", "i", "in", "is", "it", "of", "on", "to",
                "the", "or" };

        final Set<String> commonWords = new HashSet<String>();
        for (String word : words) {
            commonWords.add(word);
        }
        return commonWords;
    }

    /**
     * Convert the supplied HTML to plain text by parsing it with Jsoup and
     * returning the text of the resulting document.
     *
     * @param html the html
     * @return the text content of the parsed html
     */
    public static String html2Text(final String html) {
        return Jsoup.parse(html).text();
    }

    /**
     * Escapes the characters &gt;, &lt; and &amp; in the supplied string as
     * their HTML entities. All other characters are copied unchanged.
     *
     * @param original the original text, may be null
     *
     * @return the escaped text, or an empty string if original is null
     */
    public static String getHtml(final String original) {
        if (original == null) {
            return "";
        }

        // The buffer never leaves this method so the unsynchronised
        // StringBuilder is sufficient.
        final StringBuilder escaped = new StringBuilder(original.length());
        for (int i = 0; i < original.length(); i++) {
            final char c = original.charAt(i);
            switch (c) {
            case '>':
                escaped.append("&gt;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '&':
                escaped.append("&amp;");
                break;
            default:
                escaped.append(c);
                break;
            }
        }
        return escaped.toString();
    }

    /**
     * Gets the safe xml. The input is passed through
     * HTMLUtil.convertCharacterEntities (presumably replacing HTML character
     * entities with the characters they represent - confirm against the
     * org.clapper.util documentation) while the &amp;gt;, &amp;lt; and
     * &amp;amp; entities are shielded from that conversion. Any
     * {@code <br>} is also rewritten as the self-closing {@code <br/>}.
     *
     * @param input the input
     *
     * @return the safe xml, or an empty string if the input is blank
     */
    public static String getSafeXml(final String input) {

        if (StringUtils.isBlank(input)) {
            return "";
        }

        // We do not want to convert these characters for this function, so
        // hide them behind placeholders the entity converter will ignore
        String safeXML = StringUtils.replace(input, "&gt;", "#gt#;");
        safeXML = StringUtils.replace(safeXML, "&lt;", "#lt#;");
        safeXML = StringUtils.replace(safeXML, "&amp;", "#amp#;");

        safeXML = HTMLUtil.convertCharacterEntities(safeXML);
        safeXML = StringUtils.replace(safeXML, "<br>", "<br/>");

        // Now convert the placeholders back to the original entities
        safeXML = StringUtils.replace(safeXML, "#gt#;", "&gt;");
        safeXML = StringUtils.replace(safeXML, "#lt#;", "&lt;");
        safeXML = StringUtils.replace(safeXML, "#amp#;", "&amp;");

        return safeXML;
    }

    /**
     * Capitalise the first character of the input, leaving the rest as is.
     *
     * @param input the input, may be null
     *
     * @return the capitalised string, or an empty string if the input is
     *         null or empty
     */
    public static String capitaliseFirst(final String input) {
        if (input == null || input.length() == 0) {
            return "";
        }
        return Character.toUpperCase(input.charAt(0)) + input.substring(1);
    }

    /**
     * Parses the date using the supported date (and time + date) patterns.
     *
     * @param strDate the date string, may be null
     * @param defaultcurrent if true the current date/time is returned when
     *        the string is null or cannot be parsed
     *
     * @return the parsed date, the current date/time (if defaultcurrent is
     *         set and parsing failed), or null
     */
    public static Date parseDate(final String strDate, final boolean defaultcurrent) {
        Date date = null;

        if (strDate != null) {
            date = parseDate(strDate);
        }
        if (date == null && defaultcurrent) {
            date = Calendar.getInstance().getTime();
        }
        return date;
    }

    /**
     * Parses the date string and passes the result (which may be null), along
     * with the supplied format, to Formatter.numericDate.
     *
     * @param dateString the date string
     * @param format the format handed to Formatter.numericDate
     * @param defaultCurrent if true the current date/time is used when the
     *        string cannot be parsed
     *
     * @return the string returned by Formatter.numericDate
     */
    public static String parseDate(final String dateString, final String format, final boolean defaultCurrent) {
        return Formatter.numericDate(parseDate(dateString, defaultCurrent), format);
    }

    /**
     * Parses the date string and passes the result (which may be null) to
     * Formatter.conventionalDate.
     *
     * @param dateString the date string
     * @param defaultCurrent if true the current date/time is used when the
     *        string cannot be parsed
     *
     * @return the string returned by Formatter.conventionalDate
     */
    public static String parseConventionalDate(final String dateString, final boolean defaultCurrent) {
        return Formatter.conventionalDate(parseDate(dateString, defaultCurrent));
    }

    /**
     * Parses the date. Each date pattern is tried in turn; if none match,
     * every time pattern is combined with every date pattern
     * ("time date") and tried in turn. The first successful parse wins.
     *
     * @param strDate the date string, must not be null
     *
     * @return the date, or null if no pattern matched
     */
    private static Date parseDate(final String strDate) {
        for (String dateFormat : DATE_FORMATS) {
            final Date parsed = tryParse(strDate, dateFormat);
            if (parsed != null) {
                return parsed;
            }
        }

        for (String timeFormat : TIME_FORMATS) {
            for (String dateFormat : DATE_FORMATS) {
                final Date parsed = tryParse(strDate, timeFormat + " " + dateFormat);
                if (parsed != null) {
                    return parsed;
                }
            }
        }
        return null;
    }

    /**
     * Attempts to parse the value with a single pattern.
     *
     * @param value the value to parse
     * @param pattern the SimpleDateFormat pattern
     *
     * @return the date, or null if the value does not match the pattern
     */
    private static Date tryParse(final String value, final String pattern) {
        try {
            // A new formatter per call: SimpleDateFormat is not thread safe
            return new SimpleDateFormat(pattern).parse(value);
        } catch (ParseException ignored) {
            // A mismatch is expected - the caller moves on to the next pattern
            return null;
        }
    }

    /**
     * Parses the currency. Dollar signs and commas are stripped before the
     * remainder is parsed as a double.
     *
     * @param currencyVal the currency string, may be null
     *
     * @return the double, or 0 if the string is null or not a number
     */
    public static double parseCurrency(final String currencyVal) {
        if (currencyVal == null) {
            return 0;
        }

        String currencyString = StringUtils.replace(currencyVal, "$", "");
        currencyString = StringUtils.replace(currencyString, ",", "");
        try {
            return Double.parseDouble(currencyString);
        } catch (NumberFormatException ignored) {
            // Not a number - fall back to zero by design
            return 0;
        }
    }

    /**
     * Parses the integer.
     *
     * @param stringValue the string value, may be null
     * @return the int, or 0 if the string is blank or not a valid integer
     */
    public static int parseInteger(final String stringValue) {

        if (StringUtils.isBlank(stringValue)) {
            return 0;
        }
        try {
            return Integer.parseInt(stringValue);
        } catch (NumberFormatException ignored) {
            // Not an integer - fall back to zero by design
            return 0;
        }
    }

    /**
     * This function ensures no hazardous html formating is sent to the database
     * or sent back to the client. Only the specified html tags are left after
     * this filter, the rest are turned into &amp;gt; and &amp;lt;
     * <p>
     * Only the exact (upper-case) tags listed in REPLACE_WHAT are recognised;
     * they are emitted in the lower-case form listed in ALLOWED_HTML. Newline
     * characters become {@code <br/>} and &amp;nbsp; becomes a space.
     *
     * @param original the original
     *
     * @return the filtered string (an empty string if original is null)
     */
    public static String convert2XML(final String original) {

        String finalisedText = original;

        // Swap the permitted tags for placeholders that contain no < or >
        for (int i = 0; i < REPLACE_WHAT.length; i++) {
            finalisedText = StringUtils.replace(finalisedText, REPLACE_WHAT[i], REPLACE_TO[i]);
        }
        // Replace &nbsp; characters with space
        finalisedText = StringUtils.replace(finalisedText, "&nbsp;", " ");

        // Now run this modified text through the HTML filter to get
        // rid of all the bad characters
        String finalisedHtml = getHtml(finalisedText);

        // With bad characters removed bring back the xml tags we want
        for (int i = 0; i < REPLACE_TO.length; i++) {
            finalisedHtml = StringUtils.replace(finalisedHtml, REPLACE_TO[i], ALLOWED_HTML[i]);
        }
        return finalisedHtml;
    }

    /**
     * Strip html comments. Escaped comment markers (&amp;lt;!-- and --&amp;gt;)
     * are un-escaped first so that those comments are removed as well. The
     * remaining body html is cleaned with Jsoup's relaxed whitelist and
     * &amp;nbsp; entities are replaced with spaces.
     *
     * @param original the original
     *
     * @return the html without comments
     */
    public static String stripHtmlComments(final String original) {

        String html = "";

        if (StringUtils.isNotBlank(original)) {

            String input = StringUtils.replace(original, "--&gt;", "-->");
            input = StringUtils.replace(input, "&lt;!--", "<!--");

            final Document doc = Jsoup.parse(input);
            removeComments(doc);
            html = doc.body().html();
        }
        html = Jsoup.clean(html, Whitelist.relaxed());

        return StringUtils.replace(html, "&nbsp;", " ");
    }

    /**
     * Removes the comment nodes from the supplied node and, recursively, from
     * all of its descendants.
     *
     * @param node the node
     */
    private static void removeComments(final Node node) {
        int i = 0;
        while (i < node.childNodes().size()) {
            final Node child = node.childNode(i);
            if ("#comment".equals(child.nodeName())) {
                // Removing shifts the following siblings down, so the
                // index is deliberately not advanced here
                child.remove();
            } else {
                removeComments(child);
                i++;
            }
        }
    }

    /**
     * Convert text2 xml. Reverses the escaping of the &amp;gt;, &amp;lt; and
     * &amp;amp; entities.
     *
     * @param xml the xml string
     *
     * @return the un-escaped string
     */
    public static String convertText2XML(final String xml) {
        String xmlString = StringUtils.replace(xml, "&gt;", ">");
        xmlString = StringUtils.replace(xmlString, "&lt;", "<");
        // &amp; is handled last so that an escaped entity such as
        // &amp;lt; is only un-escaped one level
        xmlString = StringUtils.replace(xmlString, "&amp;", "&");

        return xmlString;
    }

    /**
     * Parses the numeric string. Every character other than the digits 0-9 is
     * discarded and the remaining digits are parsed as a single integer.
     *
     * @param numericString the numeric string, may be null
     *
     * @return the int, or 0 if there are no digits or the digits do not fit
     *         in an int
     */
    public static int parseNumericString(final String numericString) {

        final StringBuilder digits = new StringBuilder();
        if (numericString != null) {
            for (int i = 0; i < numericString.length(); i++) {
                final char c = numericString.charAt(i);
                if (c >= '0' && c <= '9') {
                    digits.append(c);
                }
            }
        }

        try {
            return Integer.parseInt(digits.toString());
        } catch (NumberFormatException ignored) {
            // No digits at all, or too many to fit in an int
            return 0;
        }
    }

    /**
     * Parses the text data into a collection. The text is parsed with
     * {@link #parseTextData(String)} and every non-blank cell is added
     * (trimmed) to the result in row then column order.
     *
     * @param text the text
     *
     * @return the collection of non-blank values, never null
     */
    public static Collection<String> parseTextDataToCollection(final String text) {

        final Collection<String> textList = new ArrayList<String>();

        if (StringUtils.isBlank(text)) {
            return textList;
        }

        final TreeMap<Integer, TreeMap<Integer, String>> parsedMap = DataFilter.parseTextData(text);

        for (TreeMap<Integer, String> colMap : parsedMap.values()) {
            for (String value : colMap.values()) {
                if (StringUtils.isNotBlank(value)) {
                    textList.add(value.trim());
                }
            }
        }
        return textList;
    }

    /**
     * Parses the text data into rows and columns. Rows are separated by
     * newlines. A row is split on tabs if that produces more than one token,
     * otherwise it is split on commas. Rows and columns are numbered from 1
     * and every row is padded with empty strings to the width of the widest
     * row. Empty rows and empty cells are skipped by the tokenizer.
     *
     * @param text the text, may be null
     *
     * @return the map of row number to (map of column number to value),
     *         never null
     */
    public static TreeMap<Integer, TreeMap<Integer, String>> parseTextData(final String text) {

        final TreeMap<Integer, TreeMap<Integer, String>> rawRows = new TreeMap<Integer, TreeMap<Integer, String>>();

        // This counter holds the maximum number of columns provided
        int maxNumberOfTokens = 0;

        if (text != null) {
            final StringTokenizer lines = new StringTokenizer(text, "\n");

            int rowNumber = 1;
            while (lines.hasMoreTokens()) {
                final String line = lines.nextToken();

                // Prefer tab separated values, fall back to comma separated
                StringTokenizer cells = new StringTokenizer(line, "\t");
                if (cells.countTokens() <= 1) {
                    cells = new StringTokenizer(line, ",");
                }
                final TreeMap<Integer, String> parsedLine = tokenizerToMap(cells);

                maxNumberOfTokens = Math.max(maxNumberOfTokens, parsedLine.size());

                rawRows.put(rowNumber, parsedLine);
                rowNumber++;
            }
        }

        // Now cycle through all the parsed data
        // Ensure that each row has the same (max) number of tokens
        final TreeMap<Integer, TreeMap<Integer, String>> parsedData =
                new TreeMap<Integer, TreeMap<Integer, String>>();

        for (Integer rowIndex : rawRows.keySet()) {
            final TreeMap<Integer, String> parsedLine = rawRows.get(rowIndex);

            // This map holds the final values
            final TreeMap<Integer, String> columnTokens = new TreeMap<Integer, String>();

            for (int columnIndex = 1; columnIndex <= maxNumberOfTokens; columnIndex++) {
                final String value = parsedLine.get(columnIndex);
                columnTokens.put(columnIndex, value != null ? value : "");
            }
            parsedData.put(rowIndex, columnTokens);
        }

        return parsedData;
    }

    /**
     * Tokenizer to map. Drains the tokenizer into a map keyed by the
     * 1-based position of each (trimmed) token.
     *
     * @param tokenizer the tokenizer, may be null
     *
     * @return the map of position to trimmed token, never null
     */
    private static TreeMap<Integer, String> tokenizerToMap(final StringTokenizer tokenizer) {

        final TreeMap<Integer, String> parsedData = new TreeMap<Integer, String>();

        if (tokenizer != null) {
            int position = 1;
            while (tokenizer.hasMoreTokens()) {
                parsedData.put(position, tokenizer.nextToken().trim());
                position++;
            }
        }
        return parsedData;
    }

    /**
     * Parse the user's search box input into a Set of String tokens.
     *
     * @param searchText the search text, may be null
     *
     * @return Set of Strings, one for each word in searchText; here "word" is
     *         defined as either a lone word surrounded by whitespace, or as a
     *         series of words surrounded by double quotes, "like this"; also,
     *         very common words (and, the, etc.) do not qualify as possible
     *         search targets. An empty set is returned for null input.
     */
    public static HashSet<String> parseSearchText(final String searchText) {
        final HashSet<String> result = new HashSet<String>();

        if (searchText == null) {
            // StringTokenizer rejects a null string with a NullPointerException
            return result;
        }

        // Delimiters are returned as tokens so the quotes can be seen
        final boolean returnTokens = true;
        String currentDelims = WHITESPACE_AND_QUOTES;
        final StringTokenizer parser = new StringTokenizer(searchText, currentDelims, returnTokens);

        while (parser.hasMoreTokens()) {
            final String token = parser.nextToken(currentDelims);
            if (isDoubleQuote(token)) {
                // Entering or leaving a quoted phrase: inside quotes,
                // whitespace no longer separates tokens
                currentDelims = flipDelimiters(currentDelims);
            } else {
                addNonTrivialWordToResult(token, result);
            }
        }
        return result;
    }

    /**
     * Use to determine if a particular word entered in the search box should be
     * discarded from the search. The comparison is case sensitive.
     *
     * @param aSearchTokenCandidate the a search token candidate
     *
     * @return true, if the candidate is one of the common words
     */
    private static boolean isCommonWord(final String aSearchTokenCandidate) {
        return COMMON_WORDS.contains(aSearchTokenCandidate);
    }

    /**
     * Text has content.
     *
     * @param aText the a text
     *
     * @return true, if the text is not null and not purely whitespace
     */
    private static boolean textHasContent(final String aText) {
        return (aText != null) && (!aText.trim().equals(""));
    }

    /**
     * Adds the trimmed token to the result, unless it is empty or a common
     * word.
     *
     * @param aToken the a token
     * @param aResult the a result
     */
    private static void addNonTrivialWordToResult(final String aToken, final Set<String> aResult) {
        if (!textHasContent(aToken)) {
            return;
        }
        final String word = aToken.trim();
        if (!isCommonWord(word)) {
            aResult.add(word);
        }
    }

    /**
     * Checks if is double quote.
     *
     * @param aToken the a token
     *
     * @return true, if is double quote
     */
    private static boolean isDoubleQuote(final String aToken) {
        return aToken.equals(DOUBLE_QUOTE);
    }

    /**
     * Flip delimiters between "whitespace and quotes" and "quotes only".
     *
     * @param aCurrentDelims the a current delims
     *
     * @return the other delimiter set
     */
    private static String flipDelimiters(final String aCurrentDelims) {
        if (aCurrentDelims.equals(WHITESPACE_AND_QUOTES)) {
            return QUOTES_ONLY;
        }
        return WHITESPACE_AND_QUOTES;
    }
}