org.talend.dataprep.transformation.actions.text.SplitTest.java Source code

Java tutorial

Introduction

Here is the source code for org.talend.dataprep.transformation.actions.text.SplitTest.java

Source

//  ============================================================================
//
//  Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
//  This source code is available under agreement available at
//  https://github.com/Talend/data-prep/blob/master/LICENSE
//
//  You should have received a copy of the agreement
//  along with this program; if not, write to Talend SA
//  9 rue Pages 92150 Suresnes, France
//
//  ============================================================================
package org.talend.dataprep.transformation.actions.text;

import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.*;
import static org.talend.dataprep.api.dataset.ColumnMetadata.Builder.column;
import static org.talend.dataprep.transformation.actions.AbstractMetadataBaseTest.ValueBuilder.value;
import static org.talend.dataprep.transformation.actions.AbstractMetadataBaseTest.ValuesBuilder.builder;
import static org.talend.dataprep.transformation.actions.ActionMetadataTestUtils.getColumn;
import static org.talend.dataprep.transformation.actions.ActionMetadataTestUtils.getRow;

import java.io.IOException;
import java.util.*;

import org.apache.commons.lang.StringUtils;
import org.junit.Before;
import org.junit.Test;
import org.talend.dataprep.api.action.ActionDefinition;
import org.talend.dataprep.api.dataset.ColumnMetadata;
import org.talend.dataprep.api.dataset.RowMetadata;
import org.talend.dataprep.api.dataset.row.DataSetRow;
import org.talend.dataprep.api.dataset.statistics.PatternFrequency;
import org.talend.dataprep.api.dataset.statistics.Statistics;
import org.talend.dataprep.api.preparation.Action;
import org.talend.dataprep.api.type.Type;
import org.talend.dataprep.parameters.Parameter;
import org.talend.dataprep.transformation.actions.AbstractMetadataBaseTest;
import org.talend.dataprep.transformation.actions.ActionMetadataTestUtils;
import org.talend.dataprep.transformation.actions.category.ActionCategory;
import org.talend.dataprep.transformation.api.action.ActionTestWorkbench;

/**
 * Unit tests for the {@link Split} action.
 *
 * <p>
 * Every test starts from the parameters stored in the {@code splitAction.json} test resource (reloaded by
 * {@link #init()}) and overrides individual entries when needed. The expectations below imply that these
 * default parameters target column {@code 0001} and split on a single space into two parts; this is inferred
 * from the expected values, the resource itself is not visible from this class.
 * </p>
 *
 * @see Split
 */
public class SplitTest extends AbstractMetadataBaseTest {

    /** The action to test. */
    private final Split action = new Split();

    /** The action parameters, reset before each test by {@link #init()}. */
    private Map<String, String> parameters;

    /**
     * Reloads the default action parameters from the {@code splitAction.json} test resource.
     *
     * @throws IOException if the resource cannot be parsed.
     */
    @Before
    public void init() throws IOException {
        parameters = ActionMetadataTestUtils
                .parseParameters(SplitTest.class.getResourceAsStream("splitAction.json"));
    }

    /** The action must be registered under the name {@code split}. */
    @Test
    public void testName() throws Exception {
        assertEquals("split", action.getName());
    }

    /** The action must expose six parameters, including exactly one limit and a separator. */
    @Test
    public void testParameters() throws Exception {
        // named differently from the 'parameters' field to avoid shadowing it
        final List<Parameter> actionParameters = action.getParameters();
        assertEquals(6, actionParameters.size());

        final long limitCount = actionParameters.stream() //
                .filter(p -> StringUtils.equals(Split.LIMIT, p.getName())) //
                .count();
        assertEquals(1L, limitCount);

        final boolean hasSeparator = actionParameters.stream() //
                .anyMatch(p -> StringUtils.equals(Split.SEPARATOR_PARAMETER, p.getName()));
        assertTrue(hasSeparator);
    }

    /** Adapting the action to a {@code null} column or to a string column must return the action itself. */
    @Test
    public void testAdapt() throws Exception {
        assertThat(action.adapt((ColumnMetadata) null), is(action));
        final ColumnMetadata stringColumn = column().name("myColumn").id(0).type(Type.STRING).build();
        assertThat(action.adapt(stringColumn), is(action));
    }

    /** The action must belong to the split category. */
    @Test
    public void testCategory() throws Exception {
        assertThat(action.getCategory(), is(ActionCategory.SPLIT.getDisplayName()));
    }

    /** With the default parameters, the value is split in two on the first space. */
    @Test
    public void should_split_row() {
        // given
        final DataSetRow row = getRow("lorem bacon", "Bacon ipsum dolor amet swine leberkas pork belly",
                "01/01/2015");

        final Map<String, String> expectedValues = new HashMap<>();
        expectedValues.put("0000", "lorem bacon");
        expectedValues.put("0001", "Bacon ipsum dolor amet swine leberkas pork belly");
        expectedValues.put("0003", "Bacon");
        expectedValues.put("0004", "ipsum dolor amet swine leberkas pork belly");
        expectedValues.put("0002", "01/01/2015");

        // when
        splitOnce(row);

        // then
        assertEquals(expectedValues, row.values());
    }

    /** A semicolon given directly as separator parameter must be used to split the value. */
    @Test
    public void should_split_semicolon() {
        // given
        final DataSetRow row = getRow("lorem bacon", "Bacon;ipsum", "01/01/2015");

        parameters.put(Split.SEPARATOR_PARAMETER, ";");

        final Map<String, String> expectedValues = new HashMap<>();
        expectedValues.put("0000", "lorem bacon");
        expectedValues.put("0001", "Bacon;ipsum");
        expectedValues.put("0003", "Bacon");
        expectedValues.put("0004", "ipsum");
        expectedValues.put("0002", "01/01/2015");

        // when
        splitOnce(row);

        // then
        assertEquals(expectedValues, row.values());
    }

    /** An underscore given as manual string separator must be used to split the value. */
    @Test
    public void should_split_underscore() {
        // given
        final DataSetRow row = getRow("lorem bacon", "Bacon_ipsum", "01/01/2015");

        parameters.put(Split.SEPARATOR_PARAMETER, "other (string)");
        parameters.put(Split.MANUAL_SEPARATOR_PARAMETER_STRING, "_");

        final Map<String, String> expectedValues = new HashMap<>();
        expectedValues.put("0000", "lorem bacon");
        expectedValues.put("0001", "Bacon_ipsum");
        expectedValues.put("0003", "Bacon");
        expectedValues.put("0004", "ipsum");
        expectedValues.put("0002", "01/01/2015");

        // when
        splitOnce(row);

        // then
        assertEquals(expectedValues, row.values());
    }

    /** A tabulation given as manual string separator must be used to split the value. */
    @Test
    public void should_split_tab() {
        // given
        final DataSetRow row = getRow("lorem bacon", "Bacon\tipsum", "01/01/2015");

        parameters.put(Split.SEPARATOR_PARAMETER, "other (string)");
        parameters.put(Split.MANUAL_SEPARATOR_PARAMETER_STRING, "\t");

        final Map<String, String> expectedValues = new HashMap<>();
        expectedValues.put("0000", "lorem bacon");
        expectedValues.put("0001", "Bacon\tipsum");
        expectedValues.put("0003", "Bacon");
        expectedValues.put("0004", "ipsum");
        expectedValues.put("0002", "01/01/2015");

        // when
        splitOnce(row);

        // then
        assertEquals(expectedValues, row.values());
    }

    /** TDP-786: an empty manual string separator must leave the row untouched. */
    @Test
    public void test_TDP_786_empty_pattern() {
        // given
        final Map<String, String> values = threeValues("Je vais bien (tout va bien)");
        final DataSetRow row = new DataSetRow(values);

        parameters.put(Split.SEPARATOR_PARAMETER, "other (string)");
        parameters.put(Split.MANUAL_SEPARATOR_PARAMETER_STRING, "");

        // when
        splitOnce(row);

        // then
        assertEquals(values, row.values());
    }

    /** TDP-831: an invalid regular expression used as separator must leave the row untouched. */
    @Test
    public void test_TDP_831_invalid_pattern() {
        // given
        final Map<String, String> values = threeValues("Je vais bien (tout va bien)");
        final DataSetRow row = new DataSetRow(values);

        parameters.put(Split.SEPARATOR_PARAMETER, "other (regex)");
        // "(" is not a valid regex; it is supplied through the regex parameter, consistently with the other
        // "other (regex)" tests of this class, so that the invalid pattern is really the separator under test
        parameters.put(Split.MANUAL_SEPARATOR_PARAMETER_REGEX, "(");

        // when
        splitOnce(row);

        // then
        assertEquals(values, row.values());
    }

    /** In string mode, a separator that is a regex meta-character must be taken literally. */
    @Test
    public void test_string_that_looks_like_a_regex() {
        // given
        final DataSetRow row = getRow("lorem bacon", "Je vais bien (tout va bien)", "01/01/2015");

        parameters.put(Split.SEPARATOR_PARAMETER, "other (string)");
        parameters.put(Split.MANUAL_SEPARATOR_PARAMETER_STRING, "(");

        final Map<String, String> expectedValues = new HashMap<>();
        expectedValues.put("0000", "lorem bacon");
        expectedValues.put("0001", "Je vais bien (tout va bien)");
        expectedValues.put("0002", "01/01/2015");
        expectedValues.put("0003", "Je vais bien ");
        expectedValues.put("0004", "tout va bien)");

        // when
        splitOnce(row);

        // then
        assertEquals(expectedValues, row.values());
    }

    /** In regex mode, the value is split on the first match of the given pattern. */
    @Test
    public void test_split_on_regex() {
        // given
        final DataSetRow row = getRow("lorem bacon", "Je vais bien (tout va bien)", "01/01/2015");

        parameters.put(Split.SEPARATOR_PARAMETER, "other (regex)");
        parameters.put(Split.MANUAL_SEPARATOR_PARAMETER_REGEX, "bien");

        final Map<String, String> expectedValues = new HashMap<>();
        expectedValues.put("0000", "lorem bacon");
        expectedValues.put("0001", "Je vais bien (tout va bien)");
        expectedValues.put("0003", "Je vais ");
        expectedValues.put("0004", " (tout va bien)");
        expectedValues.put("0002", "01/01/2015");

        // when
        splitOnce(row);

        // then
        assertEquals(expectedValues, row.values());
    }

    /** In regex mode, an alternation pattern must be supported as separator. */
    @Test
    public void test_split_on_regex2() {
        // given
        final DataSetRow row = getRow("lorem bacon", "Je vais bien (tout va bien)", "01/01/2015");

        parameters.put(Split.SEPARATOR_PARAMETER, "other (regex)");
        parameters.put(Split.MANUAL_SEPARATOR_PARAMETER_REGEX, "bien|fff");

        final Map<String, String> expectedValues = new HashMap<>();
        expectedValues.put("0000", "lorem bacon");
        expectedValues.put("0001", "Je vais bien (tout va bien)");
        expectedValues.put("0003", "Je vais ");
        expectedValues.put("0004", " (tout va bien)");
        expectedValues.put("0002", "01/01/2015");

        // when
        splitOnce(row);

        // then
        assertEquals(expectedValues, row.values());
    }

    /**
     * TDP-876: the pattern frequencies of the columns created by the split must differ from the ones of the
     * original column.
     *
     * @see SplitTest#should_split_row()
     */
    @Test
    public void test_TDP_876() {
        // given
        final DataSetRow row = builder() //
                .with(value("lorem bacon").type(Type.STRING)) //
                .with(value("Bacon ipsum dolor amet swine leberkas pork belly").type(Type.STRING)) //
                .with(value("01/01/2015").type(Type.STRING)) //
                .build();

        // when
        ActionTestWorkbench.test(Collections.singletonList(row), //
                analyzerService, // Test requires some analysis in asserts
                actionRegistry, factory.create(action, parameters));

        // then
        final RowMetadata actual = row.getRowMetadata();
        final Statistics originalStats = actual.getById("0001").getStatistics();
        final List<PatternFrequency> originalPatterns = originalStats.getPatternFrequencies();

        assertNotEquals(originalPatterns, actual.getById("0003").getStatistics().getPatternFrequencies());
        assertNotEquals(originalPatterns, actual.getById("0004").getStatistics().getPatternFrequencies());
    }

    /** Applying the action twice must create two independent pairs of split columns. */
    @Test
    public void should_split_row_twice() {
        // given
        final DataSetRow row = getRow("lorem bacon", "Bacon ipsum dolor amet swine leberkas pork belly",
                "01/01/2015");

        final Map<String, String> expectedValues = new HashMap<>();
        expectedValues.put("0000", "lorem bacon");
        expectedValues.put("0001", "Bacon ipsum dolor amet swine leberkas pork belly");
        expectedValues.put("0005", "Bacon");
        expectedValues.put("0006", "ipsum dolor amet swine leberkas pork belly");
        expectedValues.put("0003", "Bacon");
        expectedValues.put("0004", "ipsum dolor amet swine leberkas pork belly");
        expectedValues.put("0002", "01/01/2015");

        // when
        ActionTestWorkbench.test(row, actionRegistry, factory.create(action, parameters),
                factory.create(action, parameters));

        // then
        assertEquals(expectedValues, row.values());
    }

    /**
     * A value ending with the separator must produce an empty second part.
     *
     * @see Action#getRowAction()
     */
    @Test
    public void should_split_row_with_separator_at_the_end() {
        // given
        final DataSetRow row = getRow("lorem bacon", "Bacon ", "01/01/2015");

        final Map<String, String> expectedValues = new HashMap<>();
        expectedValues.put("0000", "lorem bacon");
        expectedValues.put("0001", "Bacon ");
        expectedValues.put("0003", "Bacon");
        expectedValues.put("0004", "");
        expectedValues.put("0002", "01/01/2015");

        // when
        splitOnce(row);

        // then
        assertEquals(expectedValues, row.values());
    }

    /**
     * A value without any separator must be copied in the first part, the second one being empty.
     *
     * @see Action#getRowAction()
     */
    @Test
    public void should_split_row_no_separator() {
        // given
        final DataSetRow row = getRow("lorem bacon", "Bacon", "01/01/2015");

        final Map<String, String> expectedValues = new HashMap<>();
        expectedValues.put("0000", "lorem bacon");
        expectedValues.put("0001", "Bacon");
        expectedValues.put("0003", "Bacon");
        expectedValues.put("0004", "");
        expectedValues.put("0002", "01/01/2015");

        // when
        splitOnce(row);

        // then
        assertEquals(expectedValues, row.values());
    }

    /**
     * The split columns must be inserted right after the original column in the row metadata.
     *
     * @see Action#getRowAction()
     */
    @Test
    public void should_update_metadata() {
        // given
        final RowMetadata rowMetadata = new RowMetadata(threeColumns());

        final List<ColumnMetadata> expected = new ArrayList<>();
        expected.add(createMetadata("0000", "recipe"));
        expected.add(createMetadata("0001", "steps"));
        expected.add(createMetadata("0003", "steps_split_1"));
        expected.add(createMetadata("0004", "steps_split_2"));
        expected.add(createMetadata("0002", "last update"));

        // when
        splitOnce(rowMetadata);

        // then
        assertEquals(expected, rowMetadata.getColumns());
    }

    /**
     * Applying the action twice must insert the second pair of split columns before the first pair.
     *
     * @see Action#getRowAction()
     */
    @Test
    public void should_update_metadata_twice() {
        // given
        final RowMetadata rowMetadata = new RowMetadata(threeColumns());

        final List<ColumnMetadata> expected = new ArrayList<>();
        expected.add(createMetadata("0000", "recipe"));
        expected.add(createMetadata("0001", "steps"));
        expected.add(createMetadata("0005", "steps_split_1"));
        expected.add(createMetadata("0006", "steps_split_2"));
        expected.add(createMetadata("0003", "steps_split_1"));
        expected.add(createMetadata("0004", "steps_split_2"));
        expected.add(createMetadata("0002", "last update"));

        // when
        ActionTestWorkbench.test(rowMetadata, actionRegistry, factory.create(action, parameters),
                factory.create(action, parameters));

        // then
        assertEquals(expected, rowMetadata.getColumns());
    }

    /**
     * When the separator is not found, the whole value goes to the first part and the remaining parts (as
     * many as the limit requires) are empty.
     */
    @Test
    public void should_not_split_separator_not_found() throws IOException {
        // given
        final DataSetRow row = getRow("lorem bacon", "Bacon ipsum dolor amet swine leberkas pork belly",
                "01/01/2015");

        parameters.put(Split.SEPARATOR_PARAMETER, "-");
        parameters.put(Split.LIMIT, "4");

        final Map<String, String> expectedValues = new HashMap<>();
        expectedValues.put("0000", "lorem bacon");
        expectedValues.put("0001", "Bacon ipsum dolor amet swine leberkas pork belly");
        expectedValues.put("0003", "Bacon ipsum dolor amet swine leberkas pork belly");
        expectedValues.put("0004", "");
        expectedValues.put("0005", "");
        expectedValues.put("0006", "");
        expectedValues.put("0002", "01/01/2015");

        // when
        splitOnce(row);

        // then
        assertEquals(expectedValues, row.values());
    }

    /** An empty separator must leave the row values untouched. */
    @Test
    public void should_not_split_because_null_separator() throws IOException {
        // given
        final Map<String, String> values = threeValues("Bacon ipsum dolor amet swine leberkas pork belly");
        final DataSetRow row = new DataSetRow(values);

        parameters.put(Split.SEPARATOR_PARAMETER, "");

        // when
        splitOnce(row);

        // then
        assertEquals(values, row.values());
    }

    /** An empty separator must leave the row metadata untouched. */
    @Test
    public void should_not_update_metadata_because_null_separator() throws IOException {
        // given
        final RowMetadata rowMetadata = new RowMetadata(threeColumns());

        parameters.put(Split.SEPARATOR_PARAMETER, "");

        // when
        splitOnce(rowMetadata);

        // then
        // compare against an independently built list: comparing the metadata with itself can never fail
        assertEquals(threeColumns(), rowMetadata.getColumns());
    }

    /** String columns must be accepted by the action. */
    @Test
    public void should_accept_column() {
        assertTrue(action.acceptField(getColumn(Type.STRING)));
    }

    /** Numeric, float, date and boolean columns must be rejected by the action. */
    @Test
    public void should_not_accept_column() {
        assertFalse(action.acceptField(getColumn(Type.NUMERIC)));
        assertFalse(action.acceptField(getColumn(Type.FLOAT)));
        assertFalse(action.acceptField(getColumn(Type.DATE)));
        assertFalse(action.acceptField(getColumn(Type.BOOLEAN)));
    }

    /** The separator parameter must exist and must be allowed to be blank. */
    @Test
    public void should_have_separator_that_could_be_blank() {
        final Optional<Parameter> separator = action.getParameters().stream() //
                .filter(p -> StringUtils.equals(p.getName(), Split.SEPARATOR_PARAMETER)) //
                .findFirst();

        assertTrue(separator.isPresent());
        assertTrue(separator.get().isCanBeBlank());
    }

    /** The only declared behavior of the action must be the creation of columns. */
    @Test
    public void should_have_expected_behavior() {
        assertEquals(1, action.getBehavior().size());
        assertTrue(action.getBehavior().contains(ActionDefinition.Behavior.METADATA_CREATE_COLUMNS));
    }

    /**
     * Creates a string column metadata with fixed header size and quality figures.
     *
     * @param id the computed id of the column metadata to create.
     * @param name name of the column metadata to create.
     * @return a new column metadata
     */
    protected ColumnMetadata createMetadata(String id, String name) {
        return ColumnMetadata.Builder.column() //
                .computedId(id) //
                .name(name) //
                .type(Type.STRING) //
                .headerSize(12) //
                .empty(0) //
                .invalid(2) //
                .valid(5) //
                .build();
    }

    /** Applies the action under test once on the given row, with the current parameters. */
    private void splitOnce(DataSetRow row) {
        ActionTestWorkbench.test(row, actionRegistry, factory.create(action, parameters));
    }

    /** Applies the action under test once on the given row metadata, with the current parameters. */
    private void splitOnce(RowMetadata rowMetadata) {
        ActionTestWorkbench.test(rowMetadata, actionRegistry, factory.create(action, parameters));
    }

    /** Builds a fresh list holding the three columns (recipe, steps, last update) used by the metadata tests. */
    private List<ColumnMetadata> threeColumns() {
        final List<ColumnMetadata> columns = new ArrayList<>();
        columns.add(createMetadata("0000", "recipe"));
        columns.add(createMetadata("0001", "steps"));
        columns.add(createMetadata("0002", "last update"));
        return columns;
    }

    /** Builds the values of a three-column row whose middle column ({@code 0001}) holds {@code second}. */
    private static Map<String, String> threeValues(String second) {
        final Map<String, String> values = new HashMap<>();
        values.put("0000", "lorem bacon");
        values.put("0001", second);
        values.put("0002", "01/01/2015");
        return values;
    }

}