tech.tablesaw.filters.TimeDependentFilteringTest.java Source code

Java tutorial

Introduction

Here is the source code for tech.tablesaw.filters.TimeDependentFilteringTest.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package tech.tablesaw.filters;

import com.google.common.base.Stopwatch;
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeSet;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import org.apache.commons.lang3.RandomUtils;
import org.apache.commons.text.RandomStringGenerator;

import tech.tablesaw.api.DateColumn;
import tech.tablesaw.api.DoubleColumn;
import tech.tablesaw.api.StringColumn;
import tech.tablesaw.api.Table;
import tech.tablesaw.columns.dates.PackedLocalDate;
import tech.tablesaw.columns.numbers.NumberColumnFormatter;
import tech.tablesaw.table.StandardTableSliceGroup;
import tech.tablesaw.table.TableSlice;
import tech.tablesaw.table.TableSliceGroup;

import java.io.IOException;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.TimeUnit;

import static java.lang.System.out;
import static org.junit.jupiter.api.Assertions.fail;

/**
 * Tests manipulation of large (but not big) data sets.
 *
 * <p>This is a manually-run driver (see {@link #main(String[])}) rather than an annotated unit test. It
 * generates a table of random observations (concept, date, value, patient), then applies a non-temporal
 * filter and an independent temporal-event filter per patient, and finally computes a date range around
 * each remaining patient's first matching event.
 */
public class TimeDependentFilteringTest {

    private static final int CONCEPT_COUNT = 10;
    private static final int PATIENT_COUNT = 10_000;
    private static final int DATE_POOL_SIZE = 60 * 365;

    // Bounds, as epoch days, of the half-open interval [2000-01-01, 2016-01-01) random dates are drawn from
    private static final int MIN_EPOCH_DAY = (int) LocalDate.of(2000, 1, 1).toEpochDay();
    private static final int MAX_EPOCH_DAY = (int) LocalDate.of(2016, 1, 1).toEpochDay();

    // One shared generator instead of allocating a new one for every random date
    private static final Random RANDOM = new Random();

    // pools to get random test data from
    private static final List<String> conceptPool = new ArrayList<>(CONCEPT_COUNT);
    private static final IntArrayList patientIdPool = new IntArrayList(PATIENT_COUNT);
    private static final IntArrayList datePool = new IntArrayList(DATE_POOL_SIZE);

    /**
     * Generates the test table and runs the filtering pipeline against it.
     *
     * @param args unused
     * @throws Exception if test-data generation fails
     */
    public static void main(String[] args) throws Exception {

        int numberOfRecordsInTable = 100_000_000;
        Stopwatch stopwatch = Stopwatch.createStarted();

        Table t = defineSchema();
        generateTestData(t, numberOfRecordsInTable, stopwatch);

        t.setName("Observations");

        StringColumn concept = t.stringColumn("concept");

        // non temporal constraints: pick concepts that actually occur in the generated data
        String conceptA = concept.get(RandomUtils.nextInt(0, t.rowCount()));
        String conceptB = concept.get(RandomUtils.nextInt(0, t.rowCount()));

        // independent temporal constraints
        String conceptZ = concept.get(RandomUtils.nextInt(0, t.rowCount()));
        String conceptD = concept.get(RandomUtils.nextInt(0, t.rowCount()));
        DependencyFilter independentConstraintFilter = DependencyFilter.FIRST;

        // temporal dependency range constraint (days before / after the independent event)
        Range<Integer> daysConstraint = Range.closed(0, 0);

        // Non-temporal clause
        Table nt = t.where(concept.isEqualTo(conceptA).and(concept.isNotEqualTo(conceptB)));

        DoubleColumn ntPatients = nt.doubleColumn("patient");

        // Group the original table by patient id
        TableSliceGroup patients = StandardTableSliceGroup.create(t, "patient");

        // Create a list of patient sub-tables to work with.
        // TODO(lwhite): Build the copy-on-write to ViewGroups to avoid [original note is truncated here]
        List<TableSlice> patientTables = new CopyOnWriteArrayList<>(patients.getSlices());

        // Apply the independent temporal event filtering to the patient subtables and remove any that don't pass
        for (TableSlice patientTable : patients) {
            if (!passesIndependentFilters(patientTable, conceptZ, conceptD, ntPatients)) {
                patientTables.remove(patientTable);
            }
        }

        List<IndependentResult> independentResults = new ArrayList<>();

        // Working with the filtered patient tables, calculate the event dates for the independent events
        for (TableSlice patientTable : patientTables) {
            IndependentResult result = new IndependentResult();
            List<LocalDate> eventDates = eventDatesFor(patientTable, conceptZ);

            if (independentConstraintFilter == DependencyFilter.FIRST) {
                if (eventDates.isEmpty()) {
                    // this is an error: the filter above only keeps patients that have the target concept
                    fail("There are no event dates");
                } else {
                    // Get the first (chronologically earliest) event for the current patient and create a
                    // date range around it. Rows are generated in random order, so row order is not date order.
                    LocalDate date = earliest(eventDates);
                    result.addRange(Range.closed(date.minusDays(daysConstraint.lowerEndpoint()),
                            date.plusDays(daysConstraint.upperEndpoint())));
                } // TODO handle last and any cases
            }
            independentResults.add(result);
        }
    }

    /**
     * Decides whether a patient sub-table survives the independent temporal filter and the non-temporal filter.
     *
     * @param patientTable        the rows belonging to one patient; its name is parsed as the patient id
     * @param requiredConcept     concept the patient must have at least once
     * @param excludedConcept     concept the patient must not have
     * @param nonTemporalPatients patient ids that satisfied the non-temporal clause
     * @return {@code true} if the patient should be kept
     */
    private static boolean passesIndependentFilters(TableSlice patientTable, String requiredConcept,
            String excludedConcept, DoubleColumn nonTemporalPatients) {
        StringColumn patientConcepts = patientTable.stringColumn("concept");
        if (!patientConcepts.contains(requiredConcept) || patientConcepts.contains(excludedConcept)) {
            return false;
        }
        // The non-temporal constraint is applied here as well, for efficiency
        double patientId = Double.parseDouble(patientTable.name());
        return nonTemporalPatients.contains(patientId);
    }

    /**
     * Collects the dates of every row in the patient sub-table whose concept equals the target concept.
     *
     * @param patientTable  the rows belonging to one patient
     * @param targetConcept the concept to look for
     * @return the matching dates, in row-iteration order; empty if there are none
     */
    private static List<LocalDate> eventDatesFor(TableSlice patientTable, String targetConcept) {
        // Look the columns up once per patient rather than once per row
        StringColumn patientConcepts = patientTable.stringColumn("concept");
        DateColumn patientDates = patientTable.dateColumn("date");

        List<LocalDate> eventDates = new ArrayList<>();
        for (int row : patientTable) {
            if (patientConcepts.get(row).equals(targetConcept)) {
                eventDates.add(patientDates.get(row));
            }
        }
        return eventDates;
    }

    /**
     * Returns the chronologically earliest date in a non-empty list.
     *
     * @param candidates a non-empty list of dates
     * @return the earliest date
     */
    private static LocalDate earliest(List<LocalDate> candidates) {
        LocalDate earliest = candidates.get(0);
        for (LocalDate candidate : candidates) {
            if (candidate.isBefore(earliest)) {
                earliest = candidate;
            }
        }
        return earliest;
    }

    /**
     * Creates the empty "Observations" table with concept, date, value and patient columns.
     *
     * @return the empty table
     */
    private static Table defineSchema() {
        Table t;
        t = Table.create("Observations");
        StringColumn conceptId = StringColumn.create("concept");
        DateColumn date = DateColumn.create("date");
        DoubleColumn value = DoubleColumn.create("value");
        DoubleColumn patientId = DoubleColumn.create("patient");
        patientId.setPrintFormatter(NumberColumnFormatter.ints());

        t.addColumns(conceptId);
        t.addColumns(date);
        t.addColumns(value);
        t.addColumns(patientId);
        return t;
    }

    /**
     * Fills the table with random observations and prints how long that took.
     *
     * @param t                      the table to fill; must have the columns created by {@link #defineSchema()}
     * @param numberOfRecordsInTable number of rows to append
     * @param stopwatch              stopwatch to use for timing; it is reset and restarted here
     * @throws IOException declared for callers; nothing in the visible body throws it
     */
    private static void generateTestData(Table t, int numberOfRecordsInTable, Stopwatch stopwatch)
            throws IOException {
        stopwatch.reset().start();
        out.println("Generating test data");
        generateData(numberOfRecordsInTable, t);
        out.println("Time to generate " + numberOfRecordsInTable + " records: "
                + stopwatch.elapsed(TimeUnit.SECONDS) + " seconds");
    }

    /**
     * Appends {@code observationCount} rows to the table, sampling each cell from a pool of random values.
     *
     * @param observationCount number of rows to append
     * @param table            the table to append to
     */
    private static void generateData(int observationCount, Table table) {
        // create pools of random values; each pool is filled to exactly its configured size

        RandomStringGenerator generator = new RandomStringGenerator.Builder().withinRange(32, 127).build();
        while (conceptPool.size() < CONCEPT_COUNT) {
            conceptPool.add(generator.generate(30));
        }

        while (patientIdPool.size() < PATIENT_COUNT) {
            patientIdPool.add(RandomUtils.nextInt(0, 2_000_000_000));
        }

        while (datePool.size() < DATE_POOL_SIZE) {
            datePool.add(PackedLocalDate.pack(randomDate()));
        }

        DateColumn dateColumn = table.dateColumn("date");
        StringColumn conceptColumn = table.stringColumn("concept");
        DoubleColumn valueColumn = table.doubleColumn("value");
        DoubleColumn patientColumn = table.doubleColumn("patient");

        // sample from the pools to write the data
        for (int i = 0; i < observationCount; i++) {
            dateColumn.appendInternal(datePool.getInt(RandomUtils.nextInt(0, datePool.size())));
            conceptColumn.append(conceptPool.get(RandomUtils.nextInt(0, conceptPool.size())));
            valueColumn.append(RandomUtils.nextDouble(0f, 100_000f));
            // Widen explicitly to double so overload resolution cannot select a narrower numeric overload
            // for ids of up to 2e9. NOTE(review): confirm which append overloads DoubleColumn exposes.
            patientColumn.append((double) patientIdPool.getInt(RandomUtils.nextInt(0, patientIdPool.size())));
        }
    }

    // TODO(lwhite): Put this in a Test utils class
    /**
     * Returns a random date on or after 2000-01-01 and before 2016-01-01.
     *
     * @return the random date
     */
    private static LocalDate randomDate() {
        long randomDay = MIN_EPOCH_DAY + RANDOM.nextInt(MAX_EPOCH_DAY - MIN_EPOCH_DAY);
        return LocalDate.ofEpochDay(randomDay);
    }

    /** Which of a patient's matching events the temporal constraint is anchored to. */
    private enum DependencyFilter {
        FIRST, LAST, ANY
    }

    /** Holds the date ranges computed around a single patient's independent event(s). */
    private static class IndependentResult {
        private final RangeSet<LocalDate> dateRanges = TreeRangeSet.create();

        /** Adds a date range to this result's range set. */
        private void addRange(Range<LocalDate> dateRange) {
            dateRanges.add(dateRange);
        }
    }
}