com.datumbox.framework.core.common.dataobjects.DataframeMatrix.java Source code

Java tutorial

Introduction

Here is the source code for com.datumbox.framework.core.common.dataobjects.DataframeMatrix.java

Source

/**
 * Copyright (C) 2013-2017 Vasilis Vryniotis <bbriniotis@datumbox.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datumbox.framework.core.common.dataobjects;

import com.datumbox.framework.common.dataobjects.TypeInference;
import com.datumbox.framework.common.storage.interfaces.StorageEngine;
import com.datumbox.framework.common.utilities.RandomGenerator;
import org.apache.commons.math3.linear.OpenMapRealVector;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.RealVector;

import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * The DataframeMatrix class is responsible for converting a Dataframe object to a
 Matrix representation. Some of the methods on framework require working with
 matrices and this class provides the tools to achieve the necessary conversions.
 * 
 * @author Vasilis Vryniotis <bbriniotis@datumbox.com>
 */
public class DataframeMatrix {

    /**
     * We create a single storage engine for all DataframeMatrix and MapRealMatrix objects. It is necessary to define it static
     * and package protected to make it accessible to other classes such as the MapRealMatrix. This is because
     * some methods of the RealMatrix require generating new object without passing the configuration file. The engine
     * is created only once to avoid hurting performance. Thus the storageEngine is initialized once in a thread-safe manner.
     */
    static StorageEngine storageEngine;

    /**
     * Keeps record how many usages were made on the storageEngine to avoid conflicting names.
     */
    static final AtomicInteger storageId = new AtomicInteger();

    private final RealMatrix X;
    private final RealVector Y;

    /**
     * Getter for the X Matrix which contains the data of the Dataframe.
     * 
     * @return 
     */
    public RealMatrix getX() {
        return X;
    }

    /**
     * Getter for the Y vector with the values of the response variables.
     * 
     * @return 
     */
    public RealVector getY() {
        return Y;
    }

    /**
     * Private constructor which accepts as arguments the Y Vector with the values
     * of the response variables, the X matrix with the actual data and a
     * feature2ColumnId map which provides a mapping between the column name and
     * their column id in the data matrix.
     * 
     * @param Y
     * @param X
     */
    private DataframeMatrix(RealMatrix X, RealVector Y) {
        //this constructor must be private because it is used only internally
        this.Y = Y;
        this.X = X;
    }

    /**
     * Initializes the static storage engine if it's not already set.
     *
     * @param dataset
     */
    private static void setStorageEngine(Dataframe dataset) {
        //create a single storage engine for all the MapRealMatrixes
        if (storageEngine == null) {
            synchronized (DataframeMatrix.class) {
                if (storageEngine == null) {
                    String storageName = "mdf" + RandomGenerator.getThreadLocalRandomUnseeded().nextLong();
                    storageEngine = dataset.configuration.getStorageConfiguration()
                            .createStorageEngine(storageName);
                }
            }
        }
    }

    /**
     * Method used to generate a training Dataframe to a DataframeMatrix and extracts its contents
    to Matrixes. It populates the featureIdsReference map with the mappings
     * between the feature names and the column ids of the matrix. Typically used
     * to convert the training dataset.
     * 
     * @param dataset
     * @param addConstantColumn
     * @param recordIdsReference
     * @param featureIdsReference
     * @return 
     */
    public static DataframeMatrix newInstance(Dataframe dataset, boolean addConstantColumn,
            Map<Integer, Integer> recordIdsReference, Map<Object, Integer> featureIdsReference) {
        if (!featureIdsReference.isEmpty()) {
            throw new IllegalArgumentException("The featureIdsReference map should be empty.");
        }

        setStorageEngine(dataset);

        int n = dataset.size();
        int d = dataset.xColumnSize();

        if (addConstantColumn) {
            ++d;
        }

        DataframeMatrix m = new DataframeMatrix(new MapRealMatrix(n, d), new MapRealVector(n));

        if (dataset.isEmpty()) {
            return m;
        }

        boolean extractY = (dataset.getYDataType() == TypeInference.DataType.NUMERICAL);

        int featureId = 0;
        if (addConstantColumn) {
            for (int row = 0; row < n; ++row) {
                m.X.setEntry(row, featureId, 1.0); //put the constant in evey row
            }
            featureIdsReference.put(Dataframe.COLUMN_NAME_CONSTANT, featureId);
            ++featureId;
        }

        int rowId = 0;
        for (Map.Entry<Integer, Record> e : dataset.entries()) {
            Integer rId = e.getKey();
            Record r = e.getValue();
            if (recordIdsReference != null) {
                recordIdsReference.put(rId, rowId);
            }

            if (extractY) {
                m.Y.setEntry(rowId, TypeInference.toDouble(r.getY()));
            }

            for (Map.Entry<Object, Object> entry : r.getX().entrySet()) {
                Object feature = entry.getKey();
                Integer knownFeatureId = featureIdsReference.get(feature);
                if (knownFeatureId == null) {
                    featureIdsReference.put(feature, featureId);
                    knownFeatureId = featureId;

                    ++featureId;
                }

                Double value = TypeInference.toDouble(entry.getValue());
                if (value != null) {
                    m.X.setEntry(rowId, knownFeatureId, value);
                } //else the X matrix maintains the 0.0 default value
            }
            ++rowId;
        }

        return m;
    }

    /**
     * Parses a testing dataset and converts it to DataframeMatrix by using an already
    existing mapping between feature names and column ids. Typically used
     * to parse the testing or validation dataset.
     * 
     * @param newData
     * @param recordIdsReference
     * @param featureIdsReference
     * @return 
     */
    public static DataframeMatrix parseDataset(Dataframe newData, Map<Integer, Integer> recordIdsReference,
            Map<Object, Integer> featureIdsReference) {
        if (featureIdsReference.isEmpty()) {
            throw new IllegalArgumentException("The featureIdsReference map should not be empty.");
        }

        setStorageEngine(newData);

        int n = newData.size();
        int d = featureIdsReference.size();

        DataframeMatrix m = new DataframeMatrix(new MapRealMatrix(n, d), new MapRealVector(n));

        if (newData.isEmpty()) {
            return m;
        }

        boolean extractY = (newData.getYDataType() == TypeInference.DataType.NUMERICAL);

        boolean addConstantColumn = featureIdsReference.containsKey(Dataframe.COLUMN_NAME_CONSTANT);

        int rowId = 0;
        for (Map.Entry<Integer, Record> e : newData.entries()) {
            Integer rId = e.getKey();
            Record r = e.getValue();
            if (recordIdsReference != null) {
                recordIdsReference.put(rId, rowId);
            }

            if (extractY) {
                m.Y.setEntry(rowId, TypeInference.toDouble(r.getY()));
            }

            if (addConstantColumn) {
                m.X.setEntry(rowId, 0, 1.0); //add the constant column
            }
            for (Map.Entry<Object, Object> entry : r.getX().entrySet()) {
                Object feature = entry.getKey();
                Double value = TypeInference.toDouble(entry.getValue());
                if (value != null) {
                    Integer featureId = featureIdsReference.get(feature);
                    if (featureId != null) {//if the feature exists
                        m.X.setEntry(rowId, featureId, value);
                    }
                } //else the X matrix maintains the 0.0 default value
            }
            ++rowId;
        }

        return m;
    }

    /**
     * Parses a single Record and converts it to RealVector by using an already
     * existing mapping between feature names and column ids. 
     * 
     * @param r
     * @param featureIdsReference
     * @return 
     */
    public static RealVector parseRecord(Record r, Map<Object, Integer> featureIdsReference) {
        if (featureIdsReference.isEmpty()) {
            throw new IllegalArgumentException("The featureIdsReference map should not be empty.");
        }

        int d = featureIdsReference.size();

        //create an Map-backed vector only if we have available info about configuration.
        RealVector v = (storageEngine != null) ? new MapRealVector(d) : new OpenMapRealVector(d);

        boolean addConstantColumn = featureIdsReference.containsKey(Dataframe.COLUMN_NAME_CONSTANT);

        if (addConstantColumn) {
            v.setEntry(0, 1.0); //add the constant column
        }
        for (Map.Entry<Object, Object> entry : r.getX().entrySet()) {
            Object feature = entry.getKey();
            Double value = TypeInference.toDouble(entry.getValue());
            if (value != null) {
                Integer featureId = featureIdsReference.get(feature);
                if (featureId != null) {//if the feature exists
                    v.setEntry(featureId, value);
                }
            } else {
                //else the X matrix maintains the 0.0 default value
            }
        }

        return v;
    }
}