org.datavec.transform.basic.BasicTransformExample.java Source code

Introduction

Here is the source code for org.datavec.transform.basic.BasicTransformExample.java
Source

/*******************************************************************************
 * Copyright (c) 2015-2019 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

package org.datavec.transform.basic;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.records.reader.RecordReader;
import org.datavec.api.records.reader.impl.csv.CSVRecordReader;
import org.datavec.api.transform.TransformProcess;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.Writable;
import org.datavec.spark.transform.SparkTransformExecutor;
import org.datavec.spark.transform.misc.StringToWritablesFunction;
import org.datavec.spark.transform.misc.WritablesToStringFunction;

import java.io.File;
import java.net.URL;
import java.util.List;

/**
 * Another basic preprocessing and filtering example. We download the raw iris dataset from the internet.
 * The downloaded file has two problems:
 * <ol>
 *   <li>There is an empty line at the bottom.</li>
 *   <li>The labels are strings, but we want integers.</li>
 * </ol>
 * The output matches the iris.txt file you can find in the resources of the datavec and dl4j examples.
 */
public class BasicTransformExample {

    /**
     * Runs the example: fetches the iris data file if it is not already present locally,
     * filters out empty lines, parses the remaining lines as CSV, converts the categorical
     * "Species" column to an integer, and prints the data after each stage.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if the download or any of the Spark/DataVec steps fails
     */
    public static void main(String[] args) throws Exception {

        String filename = "iris.data";
        URL url = new URL("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data");

        // Download the data set only when no local copy exists yet.
        File irisText = new File(filename);
        if (!irisText.exists()) {
            FileUtils.copyURLToFile(url, irisText);
        }

        SparkConf conf = new SparkConf();
        conf.setMaster("local[*]");
        conf.setAppName("DataVec Example");

        // try-with-resources closes the Spark context when the example finishes,
        // including when one of the steps below throws.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> stringData = sc.textFile(filename);

            // Take out empty lines, then parse each remaining line as a CSV record.
            RecordReader rr = new CSVRecordReader();
            JavaRDD<List<Writable>> parsedInputData = stringData.filter((x) -> !x.isEmpty())
                    .map(new StringToWritablesFunction(rr));

            // Print the original text file. Note the empty line at the bottom.
            List<String> inputDataCollected = stringData.collect();
            System.out.println("\n\n---- Original Data ----");
            for (String s : inputDataCollected) {
                System.out.println("'" + s + "'");
            }

            // Print the parsed records (empty lines removed), joined back into comma-separated strings.
            JavaRDD<String> processedAsString = parsedInputData.map(new WritablesToStringFunction(","));
            List<String> inputDataParsed = processedAsString.collect();
            System.out.println("\n\n---- Parsed Data ----");
            for (String s : inputDataParsed) {
                System.out.println("'" + s + "'");
            }

            // The String to label conversion. Define schema and transform:
            Schema schema = new Schema.Builder()
                    .addColumnsDouble("Sepal length", "Sepal width", "Petal length", "Petal width")
                    .addColumnCategorical("Species", "Iris-setosa", "Iris-versicolor", "Iris-virginica").build();

            TransformProcess tp = new TransformProcess.Builder(schema).categoricalToInteger("Species").build();

            // Do the transformation.
            JavaRDD<List<Writable>> processedData = SparkTransformExecutor.execute(parsedInputData, tp);

            // This is where we print the final result (which you would save to a text file).
            processedAsString = processedData.map(new WritablesToStringFunction(","));
            inputDataParsed = processedAsString.collect();
            System.out.println("\n\n---- Parsed and filtered data ----");
            for (String s : inputDataParsed) {
                System.out.println(s);
            }
        }
    }
}