com.cloudera.oryx.common.text.TextUtils.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.oryx.common.text.TextUtils.java

Source

/*
 * Copyright (c) 2014, Cloudera and Intel, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */

package com.cloudera.oryx.common.text;

import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.StreamSupport;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.csv.QuoteMode;

/**
 * Text and parsing related utility methods.
 */
public final class TextUtils {

    private static final ObjectMapper MAPPER = new ObjectMapper();
    private static final CSVFormat CSV_FORMAT = CSVFormat.RFC4180.withSkipHeaderRecord().withEscape('\\');
    private static final String[] EMPTY_STRING = { "" };
    private static final Pattern TWO_DOUBLE_QUOTE_ESC = Pattern.compile("\"\"", Pattern.LITERAL);
    private static final String SLASH_QUOTE_ESC = Matcher.quoteReplacement("\\\"");

    private TextUtils() {
    }

    // Delimited --

    /**
     * @param delimited line of delimited text
     * @param delimiter delimiter to split fields on
     * @return delimited strings, parsed according to RFC 4180 but with the given delimiter
     */
    public static String[] parseDelimited(String delimited, char delimiter) {
        return doParseDelimited(delimited, formatForDelimiter(delimiter));
    }

    /**
     * @param delimited PMML-style space-delimited value string
     * @return delimited values, parsed according to PMML rules
     */
    public static String[] parsePMMLDelimited(String delimited) {
        // Although you'd think ignoreSurroundingSpaces helps here, won't work with space
        // delimiter. So manually trim below.
        String[] rawResult = doParseDelimited(delimited, formatForDelimiter(' '));
        List<String> resultList = new ArrayList<>();
        for (String raw : rawResult) {
            if (!raw.isEmpty()) {
                resultList.add(raw);
            }
        }
        return resultList.toArray(new String[resultList.size()]);
    }

    private static String[] doParseDelimited(String delimited, CSVFormat format) {
        try (CSVParser parser = CSVParser.parse(delimited, format)) {
            Iterator<CSVRecord> records = parser.iterator();
            return records.hasNext()
                    ? StreamSupport.stream(records.next().spliterator(), false).toArray(String[]::new)
                    : EMPTY_STRING;
        } catch (IOException e) {
            throw new IllegalStateException(e); // Can't happen
        }
    }

    /**
     * @param elements values to join by the delimiter to make one line of text
     * @param delimiter delimiter to put between fields
     * @return one line of text, with RFC 4180 escaping (values with comma are quoted; double-quotes
     *  are escaped by doubling) and using the given delimiter
     */
    public static String joinDelimited(Iterable<?> elements, char delimiter) {
        return doJoinDelimited(elements, formatForDelimiter(delimiter));
    }

    /**
     * @param elements values to join by space to make one line of text
     * @return one line of text, formatted according to PMML quoting rules
     *  (\" instead of "" for escaping quotes; ignore space surrounding values
     */
    public static String joinPMMLDelimited(Iterable<?> elements) {
        String rawResult = doJoinDelimited(elements, formatForDelimiter(' '));
        // Must change "" into \"
        return TWO_DOUBLE_QUOTE_ESC.matcher(rawResult).replaceAll(SLASH_QUOTE_ESC);
    }

    /**
     * @param elements numbers to join by space to make one line of text
     * @return one line of text, formatted according to PMML quoting rules
     */
    public static String joinPMMLDelimitedNumbers(Iterable<? extends Number> elements) {
        // bit of a workaround because NON_NUMERIC quote mode still quote "-1"!
        CSVFormat format = formatForDelimiter(' ').withQuoteMode(QuoteMode.NONE);
        // No quoting, no need to convert quoting
        return doJoinDelimited(elements, format);
    }

    private static CSVFormat formatForDelimiter(char delimiter) {
        CSVFormat format = CSV_FORMAT;
        if (delimiter != format.getDelimiter()) {
            format = format.withDelimiter(delimiter);
        }
        return format;
    }

    private static String doJoinDelimited(Iterable<?> elements, CSVFormat format) {
        StringWriter out = new StringWriter();
        try (CSVPrinter printer = new CSVPrinter(out, format)) {
            for (Object element : elements) {
                printer.print(element);
            }
            printer.flush();
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
        return out.toString();
    }

    /// JSON --

    /**
     * @param json line of JSON text
     * @return delimited strings, the elements of the JSON array
     * @throws IOException if JSON parsing fails
     */
    public static String[] parseJSONArray(String json) throws IOException {
        return MAPPER.readValue(json, String[].class);
    }

    /**
     * @param elements elements to be joined in a JSON string. May be any objects.
     * @return JSON representation of the list of objects
     */
    public static String joinJSON(Iterable<?> elements) {
        try {
            return MAPPER.writeValueAsString(elements);
        } catch (JsonProcessingException e) {
            throw new IllegalArgumentException(e);
        }
    }

    /**
     * @param json JSON string
     * @param clazz Java type to interpret as
     * @param <T> type that should be parsed from JSON and returned
     * @return the JSON string, parsed into the given type
     */
    public static <T> T readJSON(String json, Class<T> clazz) {
        try {
            return MAPPER.readValue(json, clazz);
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * @param value value to convert
     * @param clazz desired type to interpret as
     * @param <T> type that should be parsed from JSON and returned
     * @return the given value, reinterpreted as the given type, as if serialized/deserialized
     *  via JSON to perform the conversion
     */
    public static <T> T convertViaJSON(Object value, Class<T> clazz) {
        return MAPPER.convertValue(value, clazz);
    }

}