net.triptech.buildulator.DataParser.java Source code

Java tutorial

Introduction

Here is the source code for net.triptech.buildulator.DataParser.java

Source

/*******************************************************************************
 * Copyright (c) 2012 David Harrison, Triptech Ltd.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl.html
 *
 * Contributors:
 *     David Harrison, Triptech Ltd - initial API and implementation
 ******************************************************************************/
package net.triptech.buildulator;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.TreeMap;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;

import net.triptech.buildulator.SmartTokenizer;

/**
 * The Class DataParser.
 */
public class DataParser {

    /**
     * Strip any html elements from the supplied input.
     *
     * @param htmlString the html string
     * @return the string
     */
    public static String stripHtml(final String htmlString) {
        String text = "";

        if (StringUtils.isNotBlank(htmlString)) {
            try {
                text = Jsoup.parse(htmlString).text();
            } catch (Exception e) {
                text = "";
            }
        }
        return text;
    }

    /**
     * Clean the inpput HTML.
     *
     * @param htmlString the html string
     * @return the string
     */
    public static String cleanHtml(final String htmlString) {
        String htmlOutput = "";

        if (StringUtils.isNotBlank(htmlString)) {
            try {
                htmlOutput = Jsoup.clean(htmlString, Whitelist.relaxed().addAttributes("table", "style", "class")
                        .addAttributes("th", "style", "class").addAttributes("td", "style", "class"));
            } catch (Exception e) {
                htmlOutput = "";
            }
        }
        return htmlOutput;
    }

    /**
     * Parses the text data.
     *
     * @param text the text
     *
     * @return the tree map< integer, tree map< integer, string>>
     */
    public static String[][] parseTextData(final String text) {

        TreeMap<Integer, TreeMap<Integer, String>> rowData = new TreeMap<Integer, TreeMap<Integer, String>>();

        // This counter holds the maximum number of columns provided
        int maxNumberOfTokens = 0;

        if (text != null) {
            BufferedReader in = new BufferedReader(new StringReader(text));

            String line;
            int lineCounter = 0;

            try {
                while ((line = in.readLine()) != null) {
                    TreeMap<Integer, String> parsedLine = new TreeMap<Integer, String>();

                    SmartTokenizer tabTokenizer = new SmartTokenizer(line, "\t");
                    if (tabTokenizer.countTokens() > 1) {
                        parsedLine = tokenizerToMap(tabTokenizer);
                    } else {
                        SmartTokenizer commaTokenizer = new SmartTokenizer(line, ",");
                        parsedLine = tokenizerToMap(commaTokenizer);
                    }
                    if (parsedLine.size() > maxNumberOfTokens) {
                        maxNumberOfTokens = parsedLine.size();
                    }

                    rowData.put(lineCounter, parsedLine);
                    lineCounter++;
                }
            } catch (IOException ioe) {
                // Error reading string
            }
        }

        String[][] parsedData = new String[rowData.size()][];

        // Now cycle through all the parsed data
        // Ensure that each row has the same (max) number of tokens
        for (int rowIndex : rowData.keySet()) {
            TreeMap<Integer, String> parsedLine = rowData.get(rowIndex);

            // This map holds the final values
            TreeMap<Integer, String> columnTokens = new TreeMap<Integer, String>();

            for (int i = 0; i < maxNumberOfTokens; i++) {
                String value = "";
                if (parsedLine.containsKey(i)) {
                    value = parsedLine.get(i);
                }
                columnTokens.put(i, value);
            }

            parsedData[rowIndex] = new String[columnTokens.size()];

            for (int columnIndex : columnTokens.keySet()) {
                String value = columnTokens.get(columnIndex);
                parsedData[rowIndex][columnIndex] = value;
            }
        }
        return parsedData;
    }

    /**
     * Tokenizer to map.
     *
     * @param tokenizer the tokenizer
     *
     * @return the tree map< integer, string>
     */
    private static TreeMap<Integer, String> tokenizerToMap(final SmartTokenizer tokenizer) {

        TreeMap<Integer, String> parsedData = new TreeMap<Integer, String>();

        int lineCounter = 0;
        if (tokenizer != null) {
            while (tokenizer.hasMoreTokens()) {
                String token = tokenizer.nextToken();

                parsedData.put(lineCounter, token.trim());
                lineCounter++;
            }
        }
        return parsedData;
    }

}