com.moscona.dataSpace.DataFrame.java Source code

Java tutorial

Introduction

Here is the source code for com.moscona.dataSpace.DataFrame.java

Source

/*
 * Copyright (c) 2015. Arnon Moscona
 *
 *     This program is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU Lesser General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 *
 *     This program is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 *
 *     You should have received a copy of the GNU General Public License
 *     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package com.moscona.dataSpace;

import com.moscona.dataSpace.exceptions.DataSpaceException;
import com.moscona.dataSpace.impl.AbstractVector;
import com.moscona.dataSpace.impl.IntegerVector;
import com.moscona.dataSpace.util.CompressedBitMap;
import com.moscona.exceptions.InvalidArgumentException;
import org.apache.commons.lang3.StringUtils;

import java.io.Serializable;
import java.util.*;

/**
 * Created: 12/9/10 3:39 PM
 * By: Arnon Moscona
 * A collection of named vectors (names in the data frame may differ from names in the data space).
 * <p>
 * Columns are kept in the order in which they were added, and every column must have the same number of
 * elements as the columns already present. One column may optionally be designated as the row ID column,
 * and any column may be flagged as a label.
 * <p>
 * NOTE(review): the only synchronized member is the private sealed-vectors check; the class as a whole does
 * not appear to be safe for concurrent modification (see the HOLD note in {@link #select}).
 */
public class DataFrame extends AbstractDataElement implements IDataElement {
    private static final long serialVersionUID = -1952904026775489258L; // incompatible change

    /** Key under which the advisory primary sort column name is stored in the frame's meta data. */
    public static final String PRIMARY_SORT_COLUMN = "primarySortColumn";

    /** Column name -> entry (vector + label flag). Must be kept in sync with {@link #order}. */
    private HashMap<String, Entry> columns;
    /** Column names in insertion order; defines the meaning of integer column indexes. */
    private ArrayList<String> order;
    /** Name of the column designated as the row ID, or null if none was set. */
    private String rowIdColumn = null;
    /** Number of rows, taken from the size of the most recently added column. */
    private int size = 0;
    /** The data space all vectors in this frame belong to (vectors from other data spaces are copied in). */
    private DataSpace dataSpace;
    /** Free-form advisory meta data; currently only holds {@link #PRIMARY_SORT_COLUMN}. */
    private HashMap<String, Object> metaData;

    /**
     * Creates an empty data frame bound to the given data space.
     * @param dataSpace the data space that vectors added to this frame must belong to
     */
    public DataFrame(DataSpace dataSpace) {
        this.dataSpace = dataSpace;
        columns = new HashMap<String, Entry>();
        order = new ArrayList<String>();
        metaData = new HashMap<String, Object>();
    }

    /**
     * Adds a column to the data frame
     * @param name the name of the column within this data frame
     * @param column the vector to add. If it does not belong to this frame's data space (same instance), the
     * result of {@code column.copyTo(dataSpace)} is stored instead of the vector itself
     * @param isLabel true if this column is considered the a label (in the analytical sense - a label is an outcome,
     * whereas non-label columns are generally features)
     * @return this (for concatenation of cbind() operations)
     * @throws DataSpaceException if the column is null, if a column with this name already exists, or if the
     * vector size differs from the size of the columns already in the frame
     */
    public DataFrame cbind(String name, IVector column, boolean isLabel) throws DataSpaceException {
        if (column == null) {
            // fail with the frame's own exception type rather than an NPE further down
            throw new DataSpaceException("Cannot add a null column (column name: \"" + name + "\")");
        }
        if (getNameSpace() != null) {
            ((DataBundle) getNameSpace()).enforceVectorMembershipRules(column);
        }
        IVector vector = column;
        if (vector.getDataSpace() != dataSpace) { // we insist on reference equality, not just equivalence
            vector = column.copyTo(dataSpace);
        }

        if (columns.containsKey(name)) {
            throw new DataSpaceException("A column named \"" + name + "\" already exists in this data frame");
        }
        if (columns.size() > 0 && vector.size() != size) {
            throw new DataSpaceException("The column size is different from the data frame size. Expected " + size
                    + " but got " + vector.size() + " new column: " + name + " (existing columns : "
                    + StringUtils.join(order, ", ") + ")");
        }

        columns.put(name, new Entry(vector, isLabel));
        order.add(name);
        size = vector.size();

        return this;
    }

    /**
     * Adds a non-label column to the data frame
     * @param name the name of the column within this data frame
     * @param column the vector to add
     * @return this (for concatenation of cbind() operations)
     * @throws DataSpaceException under the same conditions as {@link #cbind(String, IVector, boolean)}
     */
    public DataFrame cbind(String name, IVector column) throws DataSpaceException {
        return cbind(name, column, false);
    }

    /**
     * @return the number of rows in the frame; 0 if no column was added yet. Unlike {@link #getRowCount()} this
     * never throws.
     */
    public int size() {
        return size;
    }

    /**
     * Designates an existing column as the row ID column.
     * @param name the name of the column to use as the row ID
     * @return this
     * @throws DataSpaceException if the frame is empty or has no column of that name
     */
    public DataFrame setId(String name) throws DataSpaceException {
        validateColumnExists(name);
        rowIdColumn = name;
        return this;
    }

    /** Throws if the frame is empty or does not contain a column with the given name. */
    private void validateColumnExists(String name) throws DataSpaceException {
        validateNotEmpty();
        if (!columns.containsKey(name)) {
            throw new DataSpaceException("There is no column named \"" + name + "\" in this data frame");
        }
    }

    /** Throws if the frame is empty or the (zero based) column index is out of range. */
    private void validateColumnIndex(int i) throws DataSpaceException {
        validateNotEmpty();
        if (i < 0 || i >= columns.size()) {
            throw new DataSpaceException("The column index " + i + " does not exist in this data frame");
        }
    }

    /** Throws if the frame is empty or the (zero based) row index is out of range. */
    private void validateRowIndex(int i) throws DataSpaceException {
        validateNotEmpty();
        if (i < 0 || i >= getRowCount()) {
            throw new DataSpaceException("The row index " + i + " does not exist in this data frame. Row count = "
                    + getRowCount() + " row requested = " + i);
        }
    }

    /** Throws if no column was added to the frame yet. */
    private void validateNotEmpty() throws DataSpaceException {
        if (columns.size() == 0) {
            throw new DataSpaceException("No columns in this data frame");
        }
    }

    /**
     * Retrieves a column by name.
     * @param name the column name
     * @return the vector stored under that name
     * @throws DataSpaceException if the frame is empty or has no column of that name
     */
    public IVector get(String name) throws DataSpaceException {
        validateColumnExists(name);
        return columns.get(name).vector;
    }

    /**
     * Retrieves a column by its position in insertion order.
     * @param column the zero based column index
     * @return the vector at that position
     * @throws DataSpaceException if the frame is empty or the index is out of range
     */
    public IVector get(int column) throws DataSpaceException {
        validateColumnIndex(column);
        return get(order.get(column));
    }

    /**
     * @return the number of rows in the frame
     * @throws DataSpaceException if the frame has no columns (use {@link #size()} for a non-throwing variant)
     */
    public int getRowCount() throws DataSpaceException {
        validateNotEmpty();
        return size;
    }

    /**
     * Retrieves a single cell.
     * @param row the zero based row index
     * @param column the column name
     * @return the value at that row of the named column
     * @throws DataSpaceException if the frame is empty, the column does not exist, or the row is out of range
     */
    public IScalar get(int row, String column) throws DataSpaceException {
        validateColumnExists(column);
        validateRowIndex(row);
        return get(column).get(row);
    }

    /**
     * Retrieves a single cell, addressing the column by position.
     * @param row the zero based row index
     * @param column the zero based column index (insertion order)
     * @return the value at that row of the column
     * @throws DataSpaceException if the frame is empty or either index is out of range
     */
    public IScalar get(int row, int column) throws DataSpaceException {
        validateColumnIndex(column);
        return get(row, order.get(column));
    }

    /**
     * Retrieves the values of one row for a selection of columns.
     * @param row the zero based row index
     * @param requestedColumns the names of the columns to include, or null for all columns
     * @return a new map from column name to the value at that row (the map does not preserve column order)
     * @throws DataSpaceException if the frame is empty, the row is out of range, or a requested column does
     * not exist in this frame
     */
    public Map<String, IScalar> getRow(int row, Collection<String> requestedColumns) throws DataSpaceException {
        validateRowIndex(row);
        HashMap<String, IScalar> retval = new HashMap<String, IScalar>();
        Collection<String> list = (requestedColumns == null ? order : requestedColumns);
        for (String column : list) {
            Entry entry = columns.get(column);
            if (entry == null) {
                // an unknown name used to surface as a bare NullPointerException
                throw new DataSpaceException("There is no column named \"" + column + "\" in this data frame");
            }
            retval.put(column, entry.vector.get(row));
        }
        return retval;
    }

    /**
     * Retrieves the values of one row for all columns.
     * @param row the zero based row index
     * @return a new map from column name to the value at that row
     * @throws DataSpaceException if the frame is empty or the row is out of range
     */
    public Map<String, IScalar> getRow(int row) throws DataSpaceException {
        return getRow(row, null);
    }

    /**
     * Creates a row iterator over all columns.
     * @return a new RowIterator over this frame
     * @throws DataSpaceException if any vector in the frame is not sealed
     */
    public IVectorIterator<Map<String, IScalar>> iterator() throws DataSpaceException {
        validateAllVectorsAreSealed();
        return new RowIterator(this);
    }

    /**
     * Creates a row iterator restricted to the given columns.
     * @param columns the names of the columns to include (passed through to RowIterator)
     * @return a new RowIterator over this frame
     * @throws DataSpaceException if any vector in the frame is not sealed
     * @throws InvalidArgumentException declared by the RowIterator constructor (defined elsewhere)
     */
    public IVectorIterator<Map<String, IScalar>> iterator(Collection<String> columns)
            throws DataSpaceException, InvalidArgumentException {
        validateAllVectorsAreSealed();
        return new RowIterator(this, columns);
    }

    /**
     * Verifies that every vector in the frame reports itself as sealed.
     * NOTE(review): assumes every stored IVector is an AbstractVector - any other implementation would fail the
     * cast with a ClassCastException.
     * @throws DataSpaceException naming the first unsealed column found
     */
    private synchronized void validateAllVectorsAreSealed() throws DataSpaceException {
        for (Map.Entry<String, Entry> named : columns.entrySet()) {
            AbstractVector vector = (AbstractVector) named.getValue().vector;
            if (!vector.isSealed()) {
                throw new DataSpaceException(
                        "Error: attempt to iterate while at least one vector is not sealed: " + named.getKey());
            }
        }
    }

    /**
     * Creates a row iterator driven by a bit map (typically a query result).
     * NOTE(review): unlike {@link #iterator()}, the bit map path does not check here that all vectors are
     * sealed - confirm whether BitMapRowIterator performs that check itself.
     * @param result the bit map to iterate by, or null to iterate over all rows
     * @return a BitMapRowIterator for the bit map, or the plain row iterator if the bit map is null
     * @throws DataSpaceException if the bit map is null and a vector is not sealed, or as thrown by the iterator
     * constructor
     */
    public IVectorIterator<Map<String, IScalar>> iterator(IBitMap result) throws DataSpaceException {
        if (result == null) {
            return iterator();
        }
        return new BitMapRowIterator(this, result);
    }

    /**
     * Creates a row iterator driven by a bit map and restricted to the given columns.
     * @param result the bit map to iterate by, or null to iterate over all rows
     * @param columns the names of the columns to include
     * @return a BitMapRowIterator for the bit map, or the plain column-restricted iterator if the bit map is null
     * @throws DataSpaceException if the bit map is null and a vector is not sealed, or as thrown by the iterator
     * constructor
     * @throws InvalidArgumentException declared by the iterator constructors (defined elsewhere)
     */
    public IVectorIterator<Map<String, IScalar>> iterator(IBitMap result, Collection<String> columns)
            throws DataSpaceException, InvalidArgumentException {
        if (result == null) {
            return iterator(columns);
        }
        return new BitMapRowIterator(this, result, columns);
    }

    /**
     * Varargs convenience form of {@link #iterator(IBitMap, Collection)}.
     * @param result the bit map to iterate by, or null to iterate over all rows
     * @param columns the names of the columns to include; null or empty means all columns
     * @return the matching iterator
     * @throws DataSpaceException see {@link #iterator(IBitMap, Collection)}
     * @throws InvalidArgumentException see {@link #iterator(IBitMap, Collection)}
     */
    public IVectorIterator<Map<String, IScalar>> iterator(IBitMap result, String... columns)
            throws DataSpaceException, InvalidArgumentException {
        if (columns == null || columns.length == 0) {
            return iterator(result);
        } else {
            ArrayList<String> list = new ArrayList<String>(columns.length);
            Collections.addAll(list, columns);
            return iterator(result, list);
        }
    }

    /**
     * @param column the column name
     * @return true if the column was added as a label column
     * @throws DataSpaceException if the frame is empty or has no column of that name
     */
    public boolean isLabel(String column) throws DataSpaceException {
        validateColumnExists(column);
        return columns.get(column).isLabel;
    }

    /**
     * Identifies the column that is set as the row ID (if exists)
     * @return the name of the column, or null if no row ID column was set
     */
    public String getRowId() {
        return rowIdColumn;
    }

    /**
     * Returns the row ID value for a specific row
     * @param row the row number
     * @return the value of the row ID column at that row, or null if no row ID column was set
     * @throws DataSpaceException if the frame is empty or the row is out of range
     */
    public IScalar getRowId(int row) throws DataSpaceException {
        validateRowIndex(row);
        if (rowIdColumn == null) {
            return null;
        }
        return columns.get(rowIdColumn).vector.get(row);
    }

    /**
     * Given a bit map of the same length as the data frame, produces a new data frame matching only the true entries in the bit map
     * @param bitmap the row selection, applied to every column via {@code IVector.subset(bitmap)}
     * @return a new data frame in the same data space with the same column names, order, label flags and row ID
     * column. The meta data (e.g. the advisory sort column) is not carried over.
     * @throws DataSpaceException if creating a column subset or adding it to the new frame fails
     */
    public DataFrame subset(IBitMap bitmap) throws DataSpaceException {
        DataFrame retval = new DataFrame(dataSpace);

        for (String col : order) {
            Entry entry = columns.get(col);
            IVector subset = entry.vector.subset(bitmap);
            retval.cbind(col, subset, entry.isLabel);
        }

        retval.rowIdColumn = rowIdColumn;
        return retval;
    }

    /**
     * Marks the selected column as a factor by calling {@code factor(columnName + " factor")} on its vector.
     * The column entry itself is not replaced.
     * @param columnName the name of the column to factor
     * @throws DataSpaceException if the column does not exist or the vector fails to factor
     */
    public void factor(String columnName) throws DataSpaceException {
        IVector column = get(columnName);
        column.factor(columnName + " factor");
        // replaceEntry(columnName, newColumn);  //IMPORTANT - decided to get rid of IFactorValueRecord and simply mark vectors as factors for now
    }

    /** Swaps the vector of an existing column in place. Currently only referenced from commented-out code. */
    private void replaceEntry(String columnName, IVector column) {
        Entry entry = columns.get(columnName);
        entry.vector = column;
    }

    /**
     * @return always 0
     */
    @Override
    public long sizeInBytes() {
        return 0; // do nothing. The data frame is not memory managed
    }

    /**
     * Evaluates a query (an intersection of terms) against this frame.
     * Terms are applied one at a time and their results are and-ed together; evaluation stops early once the
     * accumulated result has no set bits.
     * @param query the query to run; every column it references must exist in this frame
     * @return the bit map of matching rows, or null if the query has no terms
     * @throws DataSpaceException if the query references a column that does not exist, or if applying a term fails
     */
    public IBitMap select(IQueryIntersectionSet query) throws DataSpaceException {
        // concurrency HOLD (fix before release) #IT-477 need to obtain a read lock here - maybe just create a synchronized block around getting a copy of the column name->vector mapping prior to query execution. This way we have a brief lock that would get the "image" of the data frame that would be stable for the duration of the query and would not require holding any long lived lock.
        // validate columns
        for (int i = 0; i < query.size(); i++) {
            String column = query.getColumnName(i);
            if (!columns.containsKey(column)) {
                throw new DataSpaceException("Query references non-existing column \"" + column + "\" query: \""
                        + query + "\", data frame columns: " + columnsAsString());
            }
        }

        // decide on resolution order
        ArrayList<Integer> executionOrder = resolveExecutionOrder(query);

        // execute
        IBitMap finalResult = null;
        for (int i : executionOrder) {
            String column = query.getColumnName(i);
            QueryState queryState = new QueryState(); // a fresh state object is handed to every term
            IBitMap termResult = query.applyTerm(i, columns.get(column).vector, queryState, finalResult);
            if (finalResult == null) {
                finalResult = termResult; // first term to be executed
            } else {
                finalResult = finalResult.and(termResult);
                // HOLD (fix before release)  need to go into javaEWAH and create a method that checks whether the bit map is all true or all false efficiently (faster than cardinality()) - contribute back to Daniel Lemire)
                if (finalResult.cardinality() == 0) {
                    break; // we narrowed down the result to nothing - no need to evaluate any further
                }
            }
        }

        return finalResult;
    }

    /**
     * Decides the order in which query terms are executed. Currently simply the order of the terms in the query.
     * @return the term indexes in execution order
     */
    private ArrayList<Integer> resolveExecutionOrder(IQueryIntersectionSet query) {
        ArrayList<Integer> retval = new ArrayList<Integer>();
        for (int i = 0; i < query.size(); i++) {
            // HOLD (fix before release)  query order optimization by selectivity estimate (using column stats and rough estimate of selectivity by each term) and grouping same columns
            retval.add(i);
        }
        return retval;
    }

    /** @return the column names in insertion order, comma separated (used in error messages) */
    private String columnsAsString() {
        return StringUtils.join(order, ", ");
    }

    /**
     * @return a read-only view of the column names, in no particular order. The view is unmodifiable because
     * removing a name through it would drop the column from the name->vector map without updating the column
     * order list.
     */
    public Set<String> getColumnNames() {
        return Collections.unmodifiableSet(columns.keySet());
    }

    /**
     * Advisory metadata (unverified) telling the user that the data has a primary sort on the specified column.
     * This is a contract between the creator and the user.
     * @param column the name of the column the data is sorted by
     * @throws DataSpaceException if the column does not exist
     */
    public void setSortColumn(String column) throws DataSpaceException {
        validateColumnExists(column);
        metaData.put(PRIMARY_SORT_COLUMN, column);
    }

    /**
     * @return the name of the advisory primary sort column, or null if none was set
     */
    public String getSortColumn() {
        Object col = metaData.get(PRIMARY_SORT_COLUMN);
        return col == null ? null : col.toString();
    }

    /**
     * Advisory metadata (unverified) telling the user that the data has a primary sort column.
     * This is a contract between the creator and the user.
     * @return true if a primary sort column was set
     */
    public boolean isSorted() {
        return getPrimarySortColumn() != null;
    }

    /**
     * Advisory metadata (unverified) telling the user that the data has a primary sort on the specified column.
     * This is a contract between the creator and the user.
     * @return the name of the sort column or null (indicating no sort)
     */
    private String getPrimarySortColumn() {
        return (String) metaData.get(PRIMARY_SORT_COLUMN);
    }

    /**
     * Create an integer vector populated with a the row numbers. Sets the column as the ID column
     * The new vector gets this frame's persistence type, is sealed, and is added to this frame's name space
     * (if the frame has one) before being bound to the frame.
     * @param columnName the name of the new ID column
     * @return the newly created ID vector
     * @throws DataSpaceException if there are no vectors in the data frame, or if there is already a column of the specified name
     */
    public IntegerVector createIdColumn(String columnName) throws DataSpaceException {
        if (columns.size() == 0) {
            throw new DataSpaceException(
                    "There are no column in this data frame. Cannot tell what length the ID column should be");
        }
        if (columns.containsKey(columnName)) {
            throw new DataSpaceException("This data frame already contains a column named \"" + columnName + "\"");
        }

        // create the column
        IntegerVector id = new IntegerVector(dataSpace);
        id.setPersistenceType(getPersistenceType());

        id.setDescription("ID column (row number)");
        for (int i = 0; i < size(); i++) {
            id.append(i);
        }
        id.seal();

        // figure out what name space it should be in
        INameSpace nameSpace = getNameSpace();
        if (nameSpace != null) {
            nameSpace.add(id);
        }

        //add the column and mark it as the ID
        cbind(columnName, id);
        setId(columnName);

        return id;
    }

    /**
     * Sets the description and returns this frame (covariant return to allow call chaining).
     * @param description the new description
     * @return this
     */
    @Override
    public DataFrame setDescription(String description) {
        super.setDescription(description);
        return this;
    }

    /**
     * Estimates the memory load of this frame as the sum of the first segment sizes (in bytes) of all its
     * vectors divided by the maximum size reported by the data space's memory manager.
     * @return the ratio described above
     * @throws DataSpaceException if a vector fails to report its first segment size
     */
    public double estimateMemoryLoad() throws DataSpaceException {
        double totalBytesForFirstSegments = 0.0;
        for (Entry entry : columns.values()) {
            totalBytesForFirstSegments += entry.vector.getFirsSegmentSizeInBytes();
        }
        return totalBytesForFirstSegments / dataSpace.getMemoryManager().getMaxSize();
    }

    // ----------------------------------------------------------------------------------------------------------------

    /**
     * A column of the data frame: the vector plus its label flag.
     * Declared static so that a serialized entry does not carry a hidden reference to its enclosing data frame;
     * the serialVersionUID is unchanged.
     */
    private static class Entry implements Serializable {
        private static final long serialVersionUID = -4284435669643449962L;
        public IVector vector;
        public boolean isLabel;

        private Entry(IVector vector, boolean label) {
            isLabel = label;
            this.vector = vector;
        }
    }
}