org.datavec.api.transform.TransformProcess.java Source code

Introduction

Here is the source code for org.datavec.api.transform.TransformProcess.java
Source

/*-
 *  * Copyright 2016 Skymind, Inc.
 *  *
 *  *    Licensed under the Apache License, Version 2.0 (the "License");
 *  *    you may not use this file except in compliance with the License.
 *  *    You may obtain a copy of the License at
 *  *
 *  *        http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  *    Unless required by applicable law or agreed to in writing, software
 *  *    distributed under the License is distributed on an "AS IS" BASIS,
 *  *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  *    See the License for the specific language governing permissions and
 *  *    limitations under the License.
 */

package org.datavec.api.transform;

import com.google.common.collect.Sets;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.ClassUtils;
import org.datavec.api.transform.filter.ConditionFilter;
import org.datavec.api.transform.transform.string.AppendStringColumnTransform;
import org.datavec.api.transform.transform.string.ConvertToString;
import org.datavec.api.util.reflections.DataVecSubTypesScanner;
import org.reflections.ReflectionUtils;
import org.reflections.Reflections;
import org.reflections.util.ClasspathHelper;
import org.reflections.util.ConfigurationBuilder;
import org.reflections.util.FilterBuilder;
import org.datavec.api.writable.*;
import org.nd4j.shade.jackson.annotation.JsonAutoDetect;
import org.nd4j.shade.jackson.annotation.JsonProperty;
import org.nd4j.shade.jackson.annotation.PropertyAccessor;
import org.nd4j.shade.jackson.core.JsonFactory;
import org.nd4j.shade.jackson.core.JsonProcessingException;
import org.nd4j.shade.jackson.databind.DeserializationFeature;
import org.nd4j.shade.jackson.databind.ObjectMapper;
import org.nd4j.shade.jackson.databind.SerializationFeature;
import org.nd4j.shade.jackson.databind.introspect.AnnotatedClass;
import org.nd4j.shade.jackson.databind.jsontype.NamedType;
import org.nd4j.shade.jackson.dataformat.yaml.YAMLFactory;
import org.nd4j.shade.jackson.datatype.joda.JodaModule;
import org.datavec.api.transform.analysis.columns.ColumnAnalysis;
import org.datavec.api.transform.condition.Condition;
import org.datavec.api.transform.filter.Filter;
import org.datavec.api.transform.rank.CalculateSortedRank;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.transform.sequence.ConvertFromSequence;
import org.datavec.api.transform.sequence.ConvertToSequence;
import org.datavec.api.transform.sequence.SequenceSplit;
import org.datavec.api.transform.sequence.window.ReduceSequenceByWindowTransform;
import org.datavec.api.transform.sequence.window.WindowFunction;
import org.datavec.api.transform.transform.categorical.CategoricalToIntegerTransform;
import org.datavec.api.transform.transform.categorical.IntegerToCategoricalTransform;
import org.datavec.api.transform.transform.categorical.StringToCategoricalTransform;
import org.datavec.api.transform.transform.column.*;
import org.datavec.api.transform.transform.condition.ConditionalCopyValueTransform;
import org.datavec.api.transform.transform.condition.ConditionalReplaceValueTransform;
import org.datavec.api.transform.transform.integer.IntegerColumnsMathOpTransform;
import org.datavec.api.transform.transform.longtransform.LongColumnsMathOpTransform;
import org.datavec.api.transform.transform.longtransform.LongMathOpTransform;
import org.datavec.api.transform.transform.normalize.Normalize;
import org.datavec.api.transform.transform.doubletransform.*;
import org.datavec.api.transform.transform.string.RemoveWhiteSpaceTransform;
import org.datavec.api.transform.transform.string.StringMapTransform;
import org.datavec.api.transform.transform.time.StringToTimeTransform;
import org.datavec.api.transform.transform.time.TimeMathOpTransform;
import org.datavec.api.transform.analysis.columns.NumericalColumnAnalysis;
import org.datavec.api.transform.sequence.SequenceComparator;
import org.datavec.api.transform.transform.categorical.CategoricalToOneHotTransform;
import lombok.Data;
import org.datavec.api.transform.analysis.DataAnalysis;
import org.datavec.api.transform.reduce.IReducer;
import org.datavec.api.transform.schema.SequenceSchema;
import org.datavec.api.transform.transform.integer.IntegerMathOpTransform;
import org.datavec.api.writable.comparator.WritableComparator;
import org.joda.time.DateTimeZone;

import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.Modifier;
import java.net.URL;
import java.util.*;
import java.util.concurrent.TimeUnit;

/**
 * A TransformProcess defines
 * an ordered list of transformations
 * to be executed on some data
 *
 * @author Alex Black
 */
@Data
@Slf4j
public class TransformProcess implements Serializable {

    private final Schema initialSchema;
    private List<DataAction> actionList;

    private static Set<Class<?>> subtypesClassCache = null;
    private static ObjectMapper jsonMapper = initMapperJson();
    private static ObjectMapper yamlMapper = initMapperYaml();

    public TransformProcess(@JsonProperty("initialSchema") Schema initialSchema,
            @JsonProperty("actionList") List<DataAction> actionList) {
        this.initialSchema = initialSchema;
        this.actionList = actionList;

        //Calculate and set the schemas for each tranformation:
        Schema currInputSchema = initialSchema;
        for (DataAction d : actionList) {
            if (d.getTransform() != null) {
                Transform t = d.getTransform();
                t.setInputSchema(currInputSchema);
                currInputSchema = t.transform(currInputSchema);
            } else if (d.getFilter() != null) {
                //Filter -> doesn't change schema. But we DO need to set the schema in the filter...
                d.getFilter().setInputSchema(currInputSchema);
            } else if (d.getConvertToSequence() != null) {
                if (currInputSchema instanceof SequenceSchema) {
                    throw new RuntimeException(
                            "Cannot convert to sequence: schema is already a sequence schema: " + currInputSchema);
                }
                ConvertToSequence cts = d.getConvertToSequence();
                cts.setInputSchema(currInputSchema);
                currInputSchema = cts.transform(currInputSchema);
            } else if (d.getConvertFromSequence() != null) {
                ConvertFromSequence cfs = d.getConvertFromSequence();
                if (!(currInputSchema instanceof SequenceSchema)) {
                    throw new RuntimeException(
                            "Cannot convert from sequence: schema is not a sequence schema: " + currInputSchema);
                }
                cfs.setInputSchema((SequenceSchema) currInputSchema);
                currInputSchema = cfs.transform((SequenceSchema) currInputSchema);
            } else if (d.getSequenceSplit() != null) {
                d.getSequenceSplit().setInputSchema(currInputSchema);
                continue; //no change to sequence schema
            } else if (d.getReducer() != null) {
                IReducer reducer = d.getReducer();
                reducer.setInputSchema(currInputSchema);
                currInputSchema = reducer.transform(currInputSchema);
            } else if (d.getCalculateSortedRank() != null) {
                CalculateSortedRank csr = d.getCalculateSortedRank();
                csr.setInputSchema(currInputSchema);
                currInputSchema = csr.transform(currInputSchema);
            } else {
                throw new RuntimeException("Unknown action: " + d);
            }
        }
    }

    private TransformProcess(Builder builder) {
        this(builder.initialSchema, builder.actionList);
    }

    public List<DataAction> getActionList() {
        return actionList;
    }

    /**
     * Get the Schema of the output data, after executing the process
     *
     * @return Schema of the output data
     */
    public Schema getFinalSchema() {
        return getSchemaAfterStep(actionList.size());
    }

    /**
     * Return the schema after executing all steps up to and including the specified step.
     * Steps are indexed from 0: so getSchemaAfterStep(0) is after one transform has been executed.
     *
     * @param step Index of the step
     * @return Schema of the data, after that (and all prior) steps have been executed
     */
    public Schema getSchemaAfterStep(int step) {
        Schema currInputSchema = initialSchema;
        int i = 0;
        for (DataAction d : actionList) {
            if (d.getTransform() != null) {
                Transform t = d.getTransform();
                currInputSchema = t.transform(currInputSchema);
            } else if (d.getFilter() != null) {
                i++;
                continue; //Filter -> doesn't change schema
            } else if (d.getConvertToSequence() != null) {
                if (currInputSchema instanceof SequenceSchema) {
                    throw new RuntimeException(
                            "Cannot convert to sequence: schema is already a sequence schema: " + currInputSchema);
                }
                ConvertToSequence cts = d.getConvertToSequence();
                currInputSchema = cts.transform(currInputSchema);
            } else if (d.getConvertFromSequence() != null) {
                ConvertFromSequence cfs = d.getConvertFromSequence();
                if (!(currInputSchema instanceof SequenceSchema)) {
                    throw new RuntimeException(
                            "Cannot convert from sequence: schema is not a sequence schema: " + currInputSchema);
                }
                currInputSchema = cfs.transform((SequenceSchema) currInputSchema);
            } else if (d.getSequenceSplit() != null) {
                continue; //Sequence split -> no change to schema
            } else if (d.getReducer() != null) {
                IReducer reducer = d.getReducer();
                currInputSchema = reducer.transform(currInputSchema);
            } else if (d.getCalculateSortedRank() != null) {
                CalculateSortedRank csr = d.getCalculateSortedRank();
                currInputSchema = csr.transform(currInputSchema);
            } else {
                throw new RuntimeException("Unknown action: " + d);
            }
            if (i++ == step)
                return currInputSchema;
        }
        return currInputSchema;
    }

    /**
     * Execute the full sequence of transformations for a single example. May return null if example is filtered
     * <b>NOTE:</b> Some TransformProcess operations cannot be done on examples individually. Most notably, ConvertToSequence
     * and ConvertFromSequence operations require the full data set to be processed at once
     *
     * @param input
     * @return
     */
    public List<Writable> execute(List<Writable> input) {
        List<Writable> currValues = input;

        for (DataAction d : actionList) {
            if (d.getTransform() != null) {
                Transform t = d.getTransform();
                currValues = t.map(currValues);

            } else if (d.getFilter() != null) {
                Filter f = d.getFilter();
                if (f.removeExample(currValues))
                    return null;
            } else if (d.getConvertToSequence() != null) {
                throw new RuntimeException(
                        "Cannot execute examples individually: TransformProcess contains a ConvertToSequence operation");
            } else if (d.getConvertFromSequence() != null) {
                throw new RuntimeException(
                        "Unexpected operation: TransformProcess contains a ConvertFromSequence operation");
            } else if (d.getSequenceSplit() != null) {
                throw new RuntimeException(
                        "Cannot execute examples individually: TransformProcess contains a SequenceSplit operation");
            } else {
                throw new RuntimeException("Unknown action: " + d);
            }
        }

        return currValues;
    }

    /**
     *
     * @param input
     * @return
     */
    public List<List<Writable>> executeSequenceToSequence(List<List<Writable>> input) {
        List<List<Writable>> currValues = input;

        for (DataAction d : actionList) {
            if (d.getTransform() != null) {
                Transform t = d.getTransform();
                currValues = t.mapSequence(currValues);

            } else if (d.getFilter() != null) {
                //                Filter f = d.getFilter();
                //                if (f.removeExample(currValues)) return null;
                throw new RuntimeException("Sequence filtering not yet implemnted here");
            } else if (d.getConvertToSequence() != null) {
                throw new RuntimeException(
                        "Cannot execute examples individually: TransformProcess contains a ConvertToSequence operation");
            } else if (d.getConvertFromSequence() != null) {
                throw new RuntimeException(
                        "Unexpected operation: TransformProcess contains a ConvertFromSequence operation");
            } else if (d.getSequenceSplit() != null) {
                throw new RuntimeException(
                        "Cannot execute examples individually: TransformProcess contains a SequenceSplit operation");
            } else {
                throw new RuntimeException("Unknown action: " + d);
            }
        }

        return currValues;
    }

    /**
     * Execute the full sequence of transformations for a single time series (sequence). May return null if example is filtered
     */
    public List<List<Writable>> executeSequence(List<List<Writable>> inputSequence) {
        return executeSequenceToSequence(inputSequence);
    }

    /**
     * Convert the TransformProcess to a JSON string
     *
     * @return TransformProcess, as JSON
     */
    public String toJson() {
        try {
            return jsonMapper.writeValueAsString(this);
        } catch (JsonProcessingException e) {
            //Ignore the first exception, try reinitializing subtypes for custom transforms etc
        }

        jsonMapper = reinitializeMapperWithSubtypes(jsonMapper);

        try {
            return jsonMapper.writeValueAsString(this);
        } catch (JsonProcessingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Convert the TransformProcess to a YAML string
     *
     * @return TransformProcess, as YAML
     */
    public String toYaml() {
        try {
            return yamlMapper.writeValueAsString(this);
        } catch (JsonProcessingException e) {
            //Ignore the first exception, try reinitializing subtypes for custom transforms etc
        }

        yamlMapper = reinitializeMapperWithSubtypes(yamlMapper);

        try {
            return yamlMapper.writeValueAsString(this);
        } catch (JsonProcessingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Deserialize a JSON String (created by {@link #toJson()}) to a TransformProcess
     *
     * @return TransformProcess, from JSON
     */
    public static TransformProcess fromJson(String json) {
        try {
            return jsonMapper.readValue(json, TransformProcess.class);
        } catch (IOException e) {
            //Ignore the first exception, try reinitializing subtypes for custom transforms etc
        }

        jsonMapper = reinitializeMapperWithSubtypes(jsonMapper);

        try {
            return jsonMapper.readValue(json, TransformProcess.class);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Deserialize a JSON String (created by {@link #toJson()}) to a TransformProcess
     *
     * @return TransformProcess, from JSON
     */
    public static TransformProcess fromYaml(String yaml) {
        try {
            return yamlMapper.readValue(yaml, TransformProcess.class);
        } catch (IOException e) {
            //Ignore the first exception, try reinitializing subtypes for custom transforms etc
        }

        yamlMapper = reinitializeMapperWithSubtypes(yamlMapper);

        try {
            return yamlMapper.readValue(yaml, TransformProcess.class);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private static ObjectMapper reinitializeMapperWithSubtypes(ObjectMapper mapper) {
        //Register concrete subtypes for JSON serialization

        List<Class<?>> classes = Arrays.<Class<?>>asList(Transform.class, Condition.class, Filter.class,
                IReducer.class);
        List<String> classNames = new ArrayList<>(6);
        for (Class<?> c : classes)
            classNames.add(c.getName());

        // First: scan the classpath and find all instances of the 'baseClasses' classes

        if (subtypesClassCache == null) {
            List<Class<?>> interfaces = Arrays.<Class<?>>asList(Transform.class, Condition.class, Filter.class,
                    IReducer.class);
            List<Class<?>> classesList = Arrays.<Class<?>>asList();

            Collection<URL> urls = ClasspathHelper.forClassLoader();
            List<URL> scanUrls = new ArrayList<>();
            for (URL u : urls) {
                String path = u.getPath();
                if (!path.matches(".*/jre/lib/.*jar")) { //Skip JRE/JDK JARs
                    scanUrls.add(u);
                }
            }

            Reflections reflections = new Reflections(
                    new ConfigurationBuilder().filterInputsBy(new FilterBuilder().exclude("^(?!.*\\.class$).*$") //Consider only .class files (to avoid debug messages etc. on .dlls, etc
                            //Exclude the following: the assumption here is that no custom functionality will ever be present
                            // under these package name prefixes.
                            .exclude("^org.nd4j.*").exclude("^org.bytedeco.*") //JavaCPP
                            .exclude("^com.fasterxml.*")//Jackson
                            .exclude("^org.apache.*") //Apache commons, Spark, log4j etc
                            .exclude("^org.projectlombok.*").exclude("^com.twelvemonkeys.*").exclude("^org.joda.*")
                            .exclude("^org.slf4j.*").exclude("^com.google.*").exclude("^org.reflections.*")
                            .exclude("^ch.qos.*") //Logback
                    ).addUrls(scanUrls).setScanners(new DataVecSubTypesScanner(interfaces, classesList)));
            org.reflections.Store store = reflections.getStore();

            Iterable<String> subtypesByName = store.getAll(DataVecSubTypesScanner.class.getSimpleName(),
                    classNames);

            Set<? extends Class<?>> subtypeClasses = Sets.newHashSet(ReflectionUtils.forNames(subtypesByName));
            subtypesClassCache = new HashSet<>();
            for (Class<?> c : subtypeClasses) {
                if (Modifier.isAbstract(c.getModifiers()) || Modifier.isInterface(c.getModifiers())) {
                    //log.info("Skipping abstract/interface: {}",c);
                    continue;
                }
                subtypesClassCache.add(c);
            }
        }

        //Second: get all currently registered subtypes for this mapper
        Set<Class<?>> registeredSubtypes = new HashSet<>();
        for (Class<?> c : classes) {
            AnnotatedClass ac = AnnotatedClass.construct(c,
                    mapper.getSerializationConfig().getAnnotationIntrospector(), null);
            Collection<NamedType> types = mapper.getSubtypeResolver().collectAndResolveSubtypes(ac,
                    mapper.getSerializationConfig(), mapper.getSerializationConfig().getAnnotationIntrospector());
            for (NamedType nt : types) {
                registeredSubtypes.add(nt.getType());
            }
        }

        //Third: register all _concrete_ subtypes that are not already registered
        List<NamedType> toRegister = new ArrayList<>();
        for (Class<?> c : subtypesClassCache) {
            //Check if it's concrete or abstract...
            if (Modifier.isAbstract(c.getModifiers()) || Modifier.isInterface(c.getModifiers())) {
                //log.info("Skipping abstract/interface: {}",c);
                continue;
            }

            if (!registeredSubtypes.contains(c)) {
                String name;
                if (ClassUtils.isInnerClass(c)) {
                    Class<?> c2 = c.getDeclaringClass();
                    name = c2.getSimpleName() + "$" + c.getSimpleName();
                } else {
                    name = c.getSimpleName();
                }
                toRegister.add(new NamedType(c, name));
                if (log.isDebugEnabled()) {
                    for (Class<?> baseClass : classes) {
                        if (baseClass.isAssignableFrom(c)) {
                            log.debug("Registering class for JSON serialization: {} as subtype of {}", c.getName(),
                                    baseClass.getName());
                            break;
                        }
                    }
                }
            }
        }

        mapper.registerSubtypes(toRegister.toArray(new NamedType[toRegister.size()]));
        //Recreate the mapper (via copy), as mapper won't use registered subtypes after first use
        mapper = mapper.copy();
        return mapper;
    }

    private static ObjectMapper initMapperJson() {
        return initMapper(new JsonFactory());
    }

    private static ObjectMapper initMapperYaml() {
        return initMapper(new YAMLFactory());
    }

    private static ObjectMapper initMapper(JsonFactory factory) {
        ObjectMapper om = new ObjectMapper(factory);
        om.registerModule(new JodaModule());
        om.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
        om.configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false);
        om.enable(SerializationFeature.INDENT_OUTPUT);
        om.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.NONE);
        om.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY);
        return om;
    }

    /**
     * Based on the input schema,
     * map raw string values to the appropriate
     * writable
     * @param values the values to convert
     * @return the transformed values based on the schema
     */
    public List<Writable> transformRawStringsToInput(String... values) {
        List<Writable> ret = new ArrayList<>();
        if (values.length != initialSchema.numColumns())
            throw new IllegalArgumentException(
                    String.format("Number of values %d does not match the number of input columns %d for schema",
                            values.length, initialSchema.numColumns()));
        for (int i = 0; i < values.length; i++) {
            switch (initialSchema.getType(i)) {
            case String:
                ret.add(new Text(values[i]));
                break;
            case Integer:
                ret.add(new IntWritable(Integer.parseInt(values[i])));
                break;
            case Double:
                ret.add(new DoubleWritable(Double.parseDouble(values[i])));
                break;
            case Float:
                ret.add(new FloatWritable(Float.parseFloat(values[i])));
                break;
            case Categorical:
                ret.add(new Text(values[i]));
                break;
            case Boolean:
                ret.add(new BooleanWritable(Boolean.parseBoolean(values[i])));
                break;
            case Time:

                break;
            case Long:
                ret.add(new LongWritable(Long.parseLong(values[i])));
            }
        }
        return ret;
    }

    /**
     * Builder class for constructing a TransformProcess
     */
    public static class Builder {

        private List<DataAction> actionList = new ArrayList<>();
        private Schema initialSchema;

        public Builder(Schema initialSchema) {
            this.initialSchema = initialSchema;
        }

        /**
         * Add a transformation to be executed after the previously-added operations have been executed
         *
         * @param transform Transform to execute
         */
        public Builder transform(Transform transform) {
            actionList.add(new DataAction(transform));
            return this;
        }

        /**
         * Add a filter operation to be executed after the previously-added operations have been executed
         *
         * @param filter Filter operation to execute
         */
        public Builder filter(Filter filter) {
            actionList.add(new DataAction(filter));
            return this;
        }

        /**
         * Add a filter operation, based on the specified condition.
         *
         * If condition is satisfied (returns true): remove the example or sequence<br>
         * If condition is not satisfied (returns false): keep the example or sequence
         *
         * @param condition Condition to filter on
         */
        public Builder filter(Condition condition) {
            return filter(new ConditionFilter(condition));
        }

        /**
         * Remove all of the specified columns, by name
         *
         * @param columnNames Names of the columns to remove
         */
        public Builder removeColumns(String... columnNames) {
            return transform(new RemoveColumnsTransform(columnNames));
        }

        /**
         * Remove all of the specified columns, by name
         *
         * @param columnNames Names of the columns to remove
         */
        public Builder removeColumns(Collection<String> columnNames) {
            return transform(new RemoveColumnsTransform(columnNames.toArray(new String[columnNames.size()])));
        }

        /**
         * Remove all columns, except for those that are specified here
         * @param columnNames    Names of the columns to keep
         */
        public Builder removeAllColumnsExceptFor(String... columnNames) {
            return transform(new RemoveAllColumnsExceptForTransform(columnNames));
        }

        /**
         * Remove all columns, except for those that are specified here
         * @param columnNames    Names of the columns to keep
         */
        public Builder removeAllColumnsExceptFor(Collection<String> columnNames) {
            return removeAllColumnsExceptFor(columnNames.toArray(new String[columnNames.size()]));
        }

        /**
         * Rename a single column
         *
         * @param oldName Original column name
         * @param newName New column name
         */
        public Builder renameColumn(String oldName, String newName) {
            return transform(new RenameColumnsTransform(oldName, newName));
        }

        /**
         * Rename multiple columns
         *
         * @param oldNames List of original column names
         * @param newNames List of new column names
         */
        public Builder renameColumns(List<String> oldNames, List<String> newNames) {
            return transform(new RenameColumnsTransform(oldNames, newNames));
        }

        /**
         * Reorder the columns using a partial or complete new ordering.
         * If only some of the column names are specified for the new order, the remaining columns will be placed at
         * the end, according to their current relative ordering
         *
         * @param newOrder Names of the columns, in the order they will appear in the output
         */
        public Builder reorderColumns(String... newOrder) {
            return transform(new ReorderColumnsTransform(newOrder));
        }

        /**
         * Duplicate a single column
         *
         * @param column Name of the column to duplicate
         * @param newName    Name of the new (duplicate) column
         */
        public Builder duplicateColumn(String column, String newName) {
            return transform(new DuplicateColumnsTransform(Collections.singletonList(column),
                    Collections.singletonList(newName)));
        }

        /**
         * Duplicate a set of columns
         *
         * @param columnNames Names of the columns to duplicate
         * @param newNames    Names of the new (duplicated) columns
         */
        public Builder duplicateColumns(List<String> columnNames, List<String> newNames) {
            return transform(new DuplicateColumnsTransform(columnNames, newNames));
        }

        /**
         * Perform a mathematical operation (add, subtract, scalar max etc) on the specified integer column, with a scalar
         *
         * @param column The integer column to perform the operation on
         * @param mathOp     The mathematical operation
         * @param scalar     The scalar value to use in the mathematical operation
         */
        public Builder integerMathOp(String column, MathOp mathOp, int scalar) {
            return transform(new IntegerMathOpTransform(column, mathOp, scalar));
        }

        /**
         * Calculate and add a new integer column by performing a mathematical operation on a number of existing columns.
         * New column is added to the end.
         *
         * @param newColumnName Name of the new/derived column
         * @param mathOp        Mathematical operation to execute on the columns
         * @param columnNames   Names of the columns to use in the mathematical operation
         */
        public Builder integerColumnsMathOp(String newColumnName, MathOp mathOp, String... columnNames) {
            return transform(new IntegerColumnsMathOpTransform(newColumnName, mathOp, columnNames));
        }

        /**
         * Perform a mathematical operation (add, subtract, scalar max etc) on the specified long column, with a scalar
         *
         * @param columnName The long column to perform the operation on
         * @param mathOp     The mathematical operation
         * @param scalar     The scalar value to use in the mathematical operation
         */
        public Builder longMathOp(String columnName, MathOp mathOp, long scalar) {
            return transform(new LongMathOpTransform(columnName, mathOp, scalar));
        }

        /**
         * Calculate and add a new long column by performing a mathematical operation on a number of existing columns.
         * New column is added to the end.
         *
         * @param newColumnName Name of the new/derived column
         * @param mathOp        Mathematical operation to execute on the columns
         * @param columnNames   Names of the columns to use in the mathematical operation
         */
        public Builder longColumnsMathOp(String newColumnName, MathOp mathOp, String... columnNames) {
            return transform(new LongColumnsMathOpTransform(newColumnName, mathOp, columnNames));
        }

        /**
         * Perform a mathematical operation (add, subtract, scalar max etc) on the specified double column, with a scalar
         *
         * @param columnName The double column to perform the operation on
         * @param mathOp     The mathematical operation
         * @param scalar     The scalar value to use in the mathematical operation
         */
        public Builder doubleMathOp(String columnName, MathOp mathOp, double scalar) {
            return transform(new DoubleMathOpTransform(columnName, mathOp, scalar));
        }

        /**
         * Calculate and add a new double column by performing a mathematical operation on a number of existing columns.
         * New column is added to the end.
         *
         * @param newColumnName Name of the new/derived column
         * @param mathOp        Mathematical operation to execute on the columns
         * @param columnNames   Names of the columns to use in the mathematical operation
         */
        public Builder doubleColumnsMathOp(String newColumnName, MathOp mathOp, String... columnNames) {
            return transform(new DoubleColumnsMathOpTransform(newColumnName, mathOp, columnNames));
        }

        /**
         * Perform a mathematical operation (such as sin(x), ceil(x), exp(x) etc) on a column
         *
         * @param columnName   Column name to operate on
         * @param mathFunction MathFunction to apply to the column
         */
        public Builder doubleMathFunction(String columnName, MathFunction mathFunction) {
            return transform(new DoubleMathFunctionTransform(columnName, mathFunction));
        }

        /**
         * Perform a mathematical operation (add, subtract, scalar min/max only) on the specified time column
         *
         * @param columnName   The integer column to perform the operation on
         * @param mathOp       The mathematical operation
         * @param timeQuantity The quantity used in the mathematical op
         * @param timeUnit     The unit that timeQuantity is specified in
         */
        public Builder timeMathOp(String columnName, MathOp mathOp, long timeQuantity, TimeUnit timeUnit) {
            return transform(new TimeMathOpTransform(columnName, mathOp, timeQuantity, timeUnit));
        }

        /**
         * Convert the specified column(s) from a categorical representation to a one-hot representation.
         * This involves the creation of multiple new columns each.
         *
         * @param columnNames Names of the categorical column(s) to convert to a one-hot representation
         */
        public Builder categoricalToOneHot(String... columnNames) {
            for (String s : columnNames) {
                transform(new CategoricalToOneHotTransform(s));
            }
            return this;
        }

        /**
         * Convert the specified column(s) from a categorical representation to an integer representation.
         * This will replace the specified categorical column(s) with an integer repreesentation, where
         * each integer has the value 0 to numCategories-1.
         *
         * @param columnNames Name of the categorical column(s) to convert to an integer representation
         */
        public Builder categoricalToInteger(String... columnNames) {
            for (String s : columnNames) {
                transform(new CategoricalToIntegerTransform(s));
            }
            return this;
        }

        /**
         * Convert the specified column from an integer representation (assume values 0 to numCategories-1) to
         * a categorical representation, given the specified state names
         *
         * @param columnName         Name of the column to convert
         * @param categoryStateNames Names of the states for the categorical column
         */
        public Builder integerToCategorical(String columnName, List<String> categoryStateNames) {
            return transform(new IntegerToCategoricalTransform(columnName, categoryStateNames));
        }

        /**
         * Convert the specified column from an integer representation to a categorical representation, given the specified
         * mapping between integer indexes and state names
         *
         * @param columnName           Name of the column to convert
         * @param categoryIndexNameMap Names of the states for the categorical column
         */
        public Builder integerToCategorical(String columnName, Map<Integer, String> categoryIndexNameMap) {
            return transform(new IntegerToCategoricalTransform(columnName, categoryIndexNameMap));
        }

        /**
         * Add a new column, where all values in the column are identical and as specified.
         *
         * @param newColumnName Name of the new column
         * @param newColumnType Type of the new column
         * @param fixedValue    Value in the new column for all records
         */
        public Builder addConstantColumn(String newColumnName, ColumnType newColumnType, Writable fixedValue) {
            return transform(new AddConstantColumnTransform(newColumnName, newColumnType, fixedValue));
        }

        /**
         * Add a new double column, where the value for that column (for all records) are identical
         *
         * @param newColumnName Name of the new column
         * @param value         Value in the new column for all records
         */
        public Builder addConstantDoubleColumn(String newColumnName, double value) {
            return addConstantColumn(newColumnName, ColumnType.Double, new DoubleWritable(value));
        }

        /**
         * Add a new integer column, where the value for that column (for all records) are identical
         *
         * @param newColumnName Name of the new column
         * @param value         Value of the new column for all records
         */
        public Builder addConstantIntegerColumn(String newColumnName, int value) {
            return addConstantColumn(newColumnName, ColumnType.Integer, new IntWritable(value));
        }

        /**
         * Add a new integer column, where the value for that column (for all records) are identical
         *
         * @param newColumnName Name of the new column
         * @param value         Value in the new column for all records
         */
        public Builder addConstantLongColumn(String newColumnName, long value) {
            return addConstantColumn(newColumnName, ColumnType.Long, new LongWritable(value));
        }

        /**
         * Convert the specified column to a string.
         * @param inputColumn the input column to convert
         * @return builder pattern
         */
        public Builder convertToString(String inputColumn) {
            return transform(new ConvertToString(inputColumn));
        }

        /**
         * Normalize the specified column with a given type of normalization
         *
         * @param column Column to normalize
         * @param type   Type of normalization to apply
         * @param da     DataAnalysis object
         */
        public Builder normalize(String column, Normalize type, DataAnalysis da) {

            ColumnAnalysis ca = da.getColumnAnalysis(column);
            if (!(ca instanceof NumericalColumnAnalysis))
                throw new IllegalStateException(
                        "Column \"" + column + "\" analysis is not numerical. " + "Column is not numerical?");

            NumericalColumnAnalysis nca = (NumericalColumnAnalysis) ca;
            double min = nca.getMinDouble();
            double max = nca.getMaxDouble();
            double mean = nca.getMean();
            double sigma = nca.getSampleStdev();

            switch (type) {
            case MinMax:
                return transform(new MinMaxNormalizer(column, min, max));
            case MinMax2:
                return transform(new MinMaxNormalizer(column, min, max, -1, 1));
            case Standardize:
                return transform(new StandardizeNormalizer(column, mean, sigma));
            case SubtractMean:
                return transform(new SubtractMeanNormalizer(column, mean));
            case Log2Mean:
                return transform(new Log2Normalizer(column, mean, min, 0.5));
            case Log2MeanExcludingMin:
                long countMin = nca.getCountMinValue();

                //mean including min value: (sum/totalCount)
                //mean excluding min value: (sum - countMin*min)/(totalCount - countMin)
                double meanExMin;
                if (ca.getCountTotal() - countMin == 0) {
                    if (ca.getCountTotal() == 0) {
                        log.warn("Normalizing with Log2MeanExcludingMin but 0 records present in analysis");
                    } else {
                        log.warn("Normalizing with Log2MeanExcludingMin but all records are the same value");
                    }
                    meanExMin = mean;
                } else {
                    meanExMin = (mean * ca.getCountTotal() - countMin * min) / (ca.getCountTotal() - countMin);
                }
                return transform(new Log2Normalizer(column, meanExMin, min, 0.5));
            default:
                throw new RuntimeException("Unknown/not implemented normalization type: " + type);
            }
        }

        /**
         * Convert a set of independent records/examples into a sequence, according to some key.
         * Within each sequence, values are ordered using the provided {@link SequenceComparator}
         *
         * @param keyColumn  Column to use as a key (values with the same key will be combined into sequences)
         * @param comparator A SequenceComparator to order the values within each sequence (for example, by time or String order)
         */
        public Builder convertToSequence(String keyColumn, SequenceComparator comparator) {
            actionList.add(new DataAction(new ConvertToSequence(keyColumn, comparator)));
            return this;
        }

        /**
         * Convert a sequence to a set of individual values (by treating each value in each sequence as a separate example)
         */
        public Builder convertFromSequence() {
            actionList.add(new DataAction(new ConvertFromSequence()));
            return this;
        }

        /**
         * Split sequences into 1 or more other sequences. Used for example to split large sequences into a set of smaller sequences
         *
         * @param split SequenceSplit that defines how splits will occur
         */
        public Builder splitSequence(SequenceSplit split) {
            actionList.add(new DataAction(split));
            return this;
        }

        /**
         * Reduce (i.e., aggregate/combine) a set of examples (typically by key).
         * <b>Note</b>: In the current implementation, reduction operations can be performed only on standard (i.e., non-sequence) data
         *
         * @param reducer Reducer to use
         */
        public Builder reduce(IReducer reducer) {
            actionList.add(new DataAction(reducer));
            return this;
        }

        /**
         * Reduce (i.e., aggregate/combine) a set of sequence examples - for each sequence individually - using a window function.
         * For example, take all records/examples in each 24-hour period (i.e., using window function), and convert them into
         * a singe value (using the reducer). In this example, the output is a sequence, with time period of 24 hours.
         *
         * @param reducer        Reducer to use to reduce each window
         * @param windowFunction Window function to find apply on each sequence individually
         */
        public Builder reduceSequenceByWindow(IReducer reducer, WindowFunction windowFunction) {
            actionList.add(new DataAction(new ReduceSequenceByWindowTransform(reducer, windowFunction)));
            return this;
        }

        /**
         * CalculateSortedRank: calculate the rank of each example, after sorting example.
         * For example, we might have some numerical "score" column, and we want to know for the rank (sort order) for each
         * example, according to that column.<br>
         * The rank of each example (after sorting) will be added in a new Long column. Indexing is done from 0; examples will have
         * values 0 to dataSetSize-1.<br>
         * <p>
         * Currently, CalculateSortedRank can only be applied on standard (i.e., non-sequence) data
         * Furthermore, the current implementation can only sort on one column
         *
         * @param newColumnName Name of the new column (will contain the rank for each example)
         * @param sortOnColumn  Column to sort on
         * @param comparator    Comparator used to sort examples
         */
        public Builder calculateSortedRank(String newColumnName, String sortOnColumn,
                WritableComparator comparator) {
            actionList.add(new DataAction(new CalculateSortedRank(newColumnName, sortOnColumn, comparator)));
            return this;
        }

        /**
         * CalculateSortedRank: calculate the rank of each example, after sorting example.
         * For example, we might have some numerical "score" column, and we want to know for the rank (sort order) for each
         * example, according to that column.<br>
         * The rank of each example (after sorting) will be added in a new Long column. Indexing is done from 0; examples will have
         * values 0 to dataSetSize-1.<br>
         * <p>
         * Currently, CalculateSortedRank can only be applied on standard (i.e., non-sequence) data
         * Furthermore, the current implementation can only sort on one column
         *
         * @param newColumnName Name of the new column (will contain the rank for each example)
         * @param sortOnColumn  Column to sort on
         * @param comparator    Comparator used to sort examples
         * @param ascending     If true: sort ascending. False: descending
         */
        public Builder calculateSortedRank(String newColumnName, String sortOnColumn, WritableComparator comparator,
                boolean ascending) {
            actionList.add(
                    new DataAction(new CalculateSortedRank(newColumnName, sortOnColumn, comparator, ascending)));
            return this;
        }

        /**
         * Convert the specified String column to a categorical column. The state names must be provided.
         *
         * @param columnName Name of the String column to convert to categorical
         * @param stateNames State names of the category
         */
        public Builder stringToCategorical(String columnName, List<String> stateNames) {
            return transform(new StringToCategoricalTransform(columnName, stateNames));
        }

        /**
         * Remove all whitespace characters from the values in the specified String column
         *
         * @param columnName Name of the column to remove whitespace from
         */
        public Builder stringRemoveWhitespaceTransform(String columnName) {
            return transform(new RemoveWhiteSpaceTransform(columnName));
        }

        /**
         * Replace one or more String values in the specified column with new values.
         * <p>
         * Keys in the map are the original values; the Values in the map are their replacements.
         * If a String appears in the data but does not appear in the provided map (as a key), that String values will
         * not be modified.
         *
         * @param columnName Name of the column in which to do replacement
         * @param mapping    Map of oldValues -> newValues
         */
        public Builder stringMapTransform(String columnName, Map<String, String> mapping) {
            return transform(new StringMapTransform(columnName, mapping));
        }

        /**
         * Convert a String column (containing a date/time String) to a time column (by parsing the date/time String)
         *
         * @param column       String column containing the date/time Strings
         * @param format       Format of the strings. Time format is specified as per http://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html
         * @param dateTimeZone Timezone of the column
         */
        public Builder stringToTimeTransform(String column, String format, DateTimeZone dateTimeZone) {
            return transform(new StringToTimeTransform(column, format, dateTimeZone));
        }

        /**
         * Append a String to a specified column
         *
         * @param column      Column to append the value to
         * @param toAppend    String to append to the end of each writable
         */
        public Builder appendStringColumnTransform(String column, String toAppend) {
            return transform(new AppendStringColumnTransform(column, toAppend));
        }

        /**
         * Replace the values in a specified column with a specified new value, if some condition holds.
         * If the condition does not hold, the original values are not modified.
         *
         * @param column    Column to operate on
         * @param newValue  Value to use as replacement, if condition is satisfied
         * @param condition Condition that must be satisfied for replacement
         */
        public Builder conditionalReplaceValueTransform(String column, Writable newValue, Condition condition) {
            return transform(new ConditionalReplaceValueTransform(column, newValue, condition));
        }

        /**
         * Replace the value in a specified column with a new value taken from another column, if a condition is satisfied/true.<br>
         * Note that the condition can be any generic condition, including on other column(s), different to the column
         * that will be modified if the condition is satisfied/true.<br>
         *
         * @param columnToReplace    Name of the column in which values will be replaced (if condition is satisfied)
         * @param sourceColumn       Name of the column from which the new values will be
         * @param condition          Condition to use
         */
        public Builder conditionalCopyValueTransform(String columnToReplace, String sourceColumn,
                Condition condition) {
            return transform(new ConditionalCopyValueTransform(columnToReplace, sourceColumn, condition));
        }

        /**
         * Create the TransformProcess object
         */
        public TransformProcess build() {
            return new TransformProcess(this);
        }

    }

}