com.ggvaidya.scinames.model.Dataset.java Source code

Introduction

Here is the source code for com.ggvaidya.scinames.model.Dataset.java
Source

/*
 * Copyright (C) 2017 Gaurav Vaidya <gaurav@ggvaidya.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.ggvaidya.scinames.model;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.io.input.BOMInputStream;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

import com.ggvaidya.scinames.model.rowextractors.NameExtractor;
import com.ggvaidya.scinames.model.rowextractors.NameExtractorFactory;
import com.ggvaidya.scinames.model.rowextractors.NameExtractorParseException;
import com.ggvaidya.scinames.util.ExcelImporter;
import com.ggvaidya.scinames.util.ModificationTimeProperty;
import com.ggvaidya.scinames.util.SimplifiedDate;

import javafx.beans.Observable;
import javafx.beans.property.BooleanProperty;
import javafx.beans.property.ObjectProperty;
import javafx.beans.property.ReadOnlyStringWrapper;
import javafx.beans.property.SimpleBooleanProperty;
import javafx.beans.property.SimpleObjectProperty;
import javafx.beans.property.SimpleStringProperty;
import javafx.beans.property.StringProperty;
import javafx.beans.property.StringPropertyBase;
import javafx.collections.FXCollections;
import javafx.collections.ListChangeListener;
import javafx.collections.ObservableList;
import javafx.collections.ObservableMap;
import javafx.scene.control.TableColumn;
import javafx.scene.control.TableView;

/**
 * A Dataset includes names and other data tied to those names.
 * 
 * DESIGN: Each dataset contains three pieces of information:
 *   - A list of rows, each associated with ONE Name 
 *  - A set of explicit changes that the project asserts took place at this dataset
 *  - A set of unexpected changes that have taken place since the previous checklist.
 * 
 * This means that we can get rid of our other two data types, since:
 *   - A Checklist is simply a Dataset without any data apart from name.
 *  - A ChecklistDiff is simply a set of explicit changes without any data.
 * 
 * @author Gaurav Vaidya <gaurav@ggvaidya.com>
 */
public class Dataset implements Citable, Comparable<Dataset> {
    private static final Logger LOGGER = Logger.getLogger(Dataset.class.getSimpleName());

    /* Constants */
    public static final String TYPE_DATASET = "Dataset";
    public static final String TYPE_CHECKLIST = "Checklist";

    /* Private variables */
    private Project project;
    private StringProperty nameProperty = new SimpleStringProperty();
    private ObjectProperty<SimplifiedDate> dateProperty = new SimpleObjectProperty<>(SimplifiedDate.MIN);
    private Dataset prevDataset;
    private StringProperty typeProperty = new SimpleStringProperty(TYPE_DATASET);
    private ModificationTimeProperty lastModified = new ModificationTimeProperty();

    // Data in this dataset.
    private ObservableList<DatasetColumn> columns = FXCollections.observableArrayList();
    private ObservableList<DatasetRow> rows = FXCollections.observableList(new LinkedList<>());
    private ObservableList<Change> explicitChanges = FXCollections.observableList(new LinkedList<>());
    private ObservableList<Change> implicitChanges = FXCollections.observableList(new LinkedList<>());
    private ObservableMap<String, String> properties = FXCollections.observableHashMap();

    {
        /* Make sure that certain changes trigger modifications. */
        nameProperty.addListener(c -> lastModified.modified());
        dateProperty.addListener(c -> lastModified.modified());
        typeProperty.addListener(c -> lastModified.modified());
        columns.addListener((Observable c) -> lastModified.modified());
        rows.addListener((Observable c) -> lastModified.modified());
        explicitChanges.addListener((Observable o) -> lastModified.modified());
        properties.addListener((Observable c) -> lastModified.modified());
    }

    /* Accessors */
    public Optional<Project> getProject() {
        return Optional.ofNullable(project);
    }

    public void setProject(Project p) {
        project = p;
        lastModified.modified();
    }

    public StringProperty nameProperty() {
        return nameProperty;
    }

    public String getName() {
        return nameProperty.get();
    }

    public void setName(String n) {
        nameProperty.set(n);
    }

    public ObjectProperty<SimplifiedDate> dateProperty() {
        return dateProperty;
    }

    public SimplifiedDate getDate() {
        return dateProperty.getValue();
    }

    public ModificationTimeProperty lastModifiedProperty() {
        return lastModified;
    }

    public ObservableList<DatasetColumn> getColumns() {
        return columns;
    }

    public ObservableList<DatasetRow> rowsProperty() {
        return rows;
    }

    public ObservableList<Change> explicitChangesProperty() {
        return explicitChanges;
    }

    public int getRowCount() {
        return rows.size();
    }

    public Stream<DatasetRow> getRowsAsStream() {
        return rows.stream();
    }

    public StringProperty typeProperty() {
        return typeProperty;
    }

    public String getType() {
        return typeProperty.getValue();
    }

    public boolean isChecklist() {
        return getType().equals(TYPE_CHECKLIST);
    }

    public Map<String, String> getProperties() {
        return properties;
    }

    public ObservableMap<String, String> propertiesProperty() {
        return properties;
    }

    /* Higher order accessors */
    public boolean isChangeImplicit(Change ch) {
        return implicitChanges.contains(ch);
    }

    public void makeChangeExplicit(Change ch) {
        // Only do this if the change is implicit!
        if (isChangeImplicit(ch)) {
            explicitChanges.add(ch);
            implicitChanges.remove(ch);

            LOGGER.info("Before setPreviousDataset(" + project + ", " + prevDataset + ")\n - Explicit changes: "
                    + explicitChanges + "\n - Implicit changes: " + implicitChanges);

            if (project != null)
                setPreviousDataset(Optional.of(project), Optional.ofNullable(prevDataset));

            LOGGER.info("After setPreviousDataset(" + project + ", " + prevDataset + ")\n - Explicit changes: "
                    + explicitChanges + "\n - Implicit changes: " + implicitChanges);

            lastModified.modified();
        }
    }

    @Override
    public int compareTo(Dataset tp) {
        int compare = getDate().compareTo(tp.getDate());
        if (compare != 0)
            return compare;

        // Identical dates! Let's try names.
        compare = getName().compareTo(tp.getName());
        if (compare != 0)
            return compare;

        // Identical date, identical name! Go with the smaller hashcode.
        compare = hashCode() - tp.hashCode();
        if (compare != 0)
            return compare;

        // Crap, identical hashcodes. Are we identical?
        if (this == tp)
            return 0;

        // No? Always pick us, I guess.
        return -1;
    }

    /**
     * Used to store notes associated with this change. This is actually a property ("note"), so
     * we create a StringProperty to wrap it.
     */
    public StringProperty noteProperty() {
        Dataset dataset = this;

        return new StringPropertyBase() {
            @Override
            public String getName() {
                return "note";
            }

            @Override
            public Object getBean() {
                return dataset;
            }

            @Override
            public String get() {
                return dataset.getProperties().get("note");
            }

            @Override
            public void set(String value) {
                dataset.getProperties().put("note", value);
                dataset.lastModified.modified();
            }
        };
    }

    @Override
    public void setDate(SimplifiedDate sd) {
        dateProperty.setValue(sd);
        lastModified.modified();
    }

    public void setColumns(List<DatasetColumn> cols) {
        columns.clear();
        columns.addAll(cols);
    }

    public Set<DatasetRow> getRowsByName(Name name) {
        // Make sure that our caches have been built.
        getNamesByRow();

        // And then use them caches.
        Set<DatasetRow> rows = rowsByName.get(name);
        if (rows == null)
            return new HashSet<>();
        return rows;

        /*
         * WTF is this?
         * 
        Set<Name> namesInAllRows = getNamesInAllRows();
        if(!namesInAllRows.contains(name)) return new HashSet<>();
            
        return getNamesByRow().entrySet().stream()
           .filter(entry -> entry.getValue().contains(name))
           .map(entry -> entry.getKey())
           .collect(Collectors.toSet());
        */
    }

    /* Managing previous timepoint */
    public Optional<Dataset> getPreviousDataset() {
        return Optional.ofNullable(prevDataset);
    }

    /**
     * Set the previous dataset. This is where we calculate implicit changes from the previous
     * dataset that have not been explained in our explicit changes.
     * 
     * @param proj Optionally, the project this dataset is a part of. Used for change filtering.
     * @param tp Optionally, the previous dataset. If null, means there isn't one: we should be considered
     *       the first checklist.
     */
    public void setPreviousDataset(Optional<Project> proj, Optional<Dataset> tp) {
        project = proj.orElse(null);
        prevDataset = tp.orElse(null);

        implicitChanges.clear();
        if (isChecklist()) {
            // Implicit changes don't exist for non-checklists. If we're a checklist, figure out what
            // names are new or have been removed in this checklist.

            Set<Name> names = getNamesInAllRows();
            Set<Name> prevNames;

            if (proj.isPresent() && tp.isPresent()) {
                prevNames = prevDataset.getRecognizedNames(proj.get()).collect(Collectors.toSet());
            } else {
                prevNames = new HashSet<>();
            }

            /*
             * Logically, at this point, we need to apply the change filter so that changes that
             * should be filtered out, are filtered out. However, we haven't calculated name clusters
             * at this point, so the filtering wouldn't be correct anyway.
             * 
             * So, instead, we accept all explicit changes and calculate implicit changes on that
             * basis. We then filter changes out of the explicit and implicit changes as needed.
             * 
             */

            // What names do explicit changes add or remove?
            Set<Name> addedByExplicitChanges = explicitChanges.stream().flatMap(ch -> ch.getToStream())
                    .collect(Collectors.toSet());
            Set<Name> deletedByExplicitChanges = explicitChanges.stream().flatMap(ch -> ch.getFromStream())
                    .collect(Collectors.toSet());

            // Calculate implicit changes that can't be explained by an explicit change.
            Stream<Change> additions = names.stream()
                    .filter(n -> !prevNames.contains(n) && !addedByExplicitChanges.contains(n))
                    .map(n -> new Change(this, ChangeType.ADDITION, Stream.empty(), Stream.of(n)));
            Stream<Change> deletions = prevNames.stream()
                    .filter(n -> !names.contains(n) && !deletedByExplicitChanges.contains(n))
                    .map(n -> new Change(this, ChangeType.DELETION, Stream.of(n), Stream.empty()));

            implicitChanges.addAll(additions.collect(Collectors.toList()));
            implicitChanges.addAll(deletions.collect(Collectors.toList()));
        }
    }

    /* Names management */
    /*
       There are several ways in which we could organize this. The approach
       we're going with is:
     - Each row does not know what names are in it.
     - We keep a centralized list of names for each row.
     - We also keep a centralized list of all names.
     - The ONLY way to access either of these datasets is through
       memoized functions, i.e. they will NOT be stored.
     - That way, you won't trigger name parsing until you need it;
       when you need it, it'll be done once and will be fairly efficient.
     - Updating a row or column will reset the memoized caches, so we
       don't waste any more time.
    */

    private Map<DatasetRow, Set<Name>> namesByRow = null;
    private Map<Name, Set<DatasetRow>> rowsByName = null;
    private Set<Name> namesInRows = null;

    /*
     * Tracks when the namesByRow was last modified. For some reason, we have two systems to
     * track this: you can look at namesByRow or namesByRowLastModified. getNamesByRow() looks
     * at both.
     */
    private ModificationTimeProperty namesByRowLastModified = new ModificationTimeProperty();

    {
        // If the columns or rows change, we need to reparse ALL names.
        columns.addListener((ListChangeListener.Change<? extends DatasetColumn> c) -> resetNamesCaches());
        rows.addListener((ListChangeListener.Change<? extends DatasetRow> c) -> resetNamesCaches());

        lastModified.addListener((a, b, c) -> namesByRowLastModified.modified());
    }

    public void deleteChange(Change ch) {
        if (isChangeImplicit(ch)) {
            // If ch is an implicit change, we need to update our change calculations. 
            if (project != null)
                setPreviousDataset(Optional.of(project), Optional.ofNullable(prevDataset));

        } else {
            // Explicit change. Delete!
            explicitChanges.remove(ch);

            if (project != null)
                setPreviousDataset(Optional.of(project), Optional.ofNullable(prevDataset));

            // We've been modified!
            lastModified.modified();
        }
    }

    private void resetNamesCaches() {
        LOGGER.entering(Dataset.class.getSimpleName(), "resetNamesCaches");
        namesByRow = null;
        namesInRows = null;
        rowsByName = null;

        // If this is being reset, we've probably been updated.
        lastModified.modified();

        // And if we don't know our names, neither should you.
        if (project != null)
            project.clearRecognizedNamesCache();
    }

    /**
     * The workhorse method for name parsing.
     * 
     * @return Map of rows in this dataset against all the names in each row.
     */
    public Map<DatasetRow, Set<Name>> getNamesByRow() {
        LOGGER.entering(Dataset.class.getSimpleName(), "getNamesByRow");

        if (namesByRow == null || namesByRowLastModified.isModified()) {
            LOGGER.log(Level.FINE, "Recalculating names using extractors: {0}",
                    NameExtractorFactory.serializeExtractorsToString(getNameExtractors()));

            long startTime = System.nanoTime();

            // Recalculate all.
            resetNamesCaches();

            namesByRow = new HashMap<>();
            namesInRows = new HashSet<>();
            rowsByName = new HashMap<>();

            for (DatasetRow row : rows) {
                Set<Name> names = new HashSet<>();
                if (getNameExtractors() != null && getNameExtractors().size() > 0)
                    names = NameExtractorFactory.extractNamesUsingExtractors(getNameExtractors(), row);
                namesByRow.put(row, names);
                namesInRows.addAll(names);

                for (Name n : names) {
                    if (!rowsByName.containsKey(n))
                        rowsByName.put(n, new HashSet<>());

                    rowsByName.get(n).add(row);
                }
            }
            namesByRowLastModified.saved();

            // Report on how long this took.
            double timeTaken = (System.nanoTime() - startTime) / 1e6d;
            double timePerRow = 0;
            if (rows.size() > 0)
                timePerRow = timeTaken / rows.size();

            LOGGER.log(Level.FINE, "getNamesByRow() extracted {0} in {1} seconds ({2} seconds/row) on dataset {3}",
                    new Object[] { namesInRows.size(), timeTaken, timePerRow, this });
        }

        return namesByRow;
    }

    public Set<Name> getNamesInRow(DatasetRow row) {
        // trigger parse if necessary
        Map<DatasetRow, Set<Name>> namesByRow = getNamesByRow();

        LOGGER.entering(Dataset.class.getSimpleName(), "getNamesInRow", row);

        if (namesByRow.containsKey(row))
            return namesByRow.get(row);
        else
            return new HashSet<>();
    }

    /**
     * Returns the set of all names recorded in the rows of this dataset. Note that this
     * does NOT include names referenced from the explicit changes -- please see getReferencedNames()
     * for that!
     * 
     * @return The set of all names recorded in the rows of this dataset.
     */
    public Set<Name> getNamesInAllRows() {
        // Make sure our caches are up to date.
        getNamesByRow();

        // At this point, they should be!
        return namesInRows;
    }

    /**
     * @return The inverse of getNamesByRow(): returns the set of rows associated with each name.
     */
    public Map<Name, Set<DatasetRow>> getRowsByName() {
        // Make sure our caches are up to date.
        getNamesByRow();

        // At this point, they should be!
        return rowsByName;
    }

    /*
     * Name extractors subsystem.
     * 
     * See model.rowextractors for more information.
     */

    private List<NameExtractor> nameExtractors = NameExtractorFactory.getDefaultExtractors();

    public List<NameExtractor> getNameExtractors() {
        return nameExtractors;
    }

    /**
     * @return The current name extractors as a string.
     */
    public String getNameExtractorsAsString() {
        return NameExtractorFactory.serializeExtractorsToString(nameExtractors);
    }

    /**
     * Set the current name extractors as a string.
     * 
     * @param str String represention of name extractors
     * @throws NameExtractorParseException If the string representation could not be parsed.
     */
    public void setNameExtractorsString(String str) throws NameExtractorParseException {
        nameExtractors = NameExtractorFactory.getExtractors(str);
        LOGGER.log(Level.FINE, "setNameExtractorsString() called, extractors now set to {0}",
                NameExtractorFactory.serializeExtractorsToString(nameExtractors));
        resetNamesCaches();
    }

    /**
     * Returns a Stream of all distinct names referenced from this dataset. This includes 
     * names found in dataset rows and names found in ALL explicit changes (not just 
     * filtered ones!), and nothing else.
     * 
     * @return A Stream of all distinct names referenced from this dataset.
     */
    public Stream<Name> getReferencedNames() {
        Stream<Name> namesFromData = getNamesInAllRows().stream();
        Stream<Name> namesFromChanges = explicitChanges.stream().flatMap(ch -> ch.getAllNames().stream());

        return Stream.concat(namesFromData, namesFromChanges).distinct();
    }

    /**
     * Returns a Stream of all distinct names recognized at the end of this checklist.
     * 
     * For a checklist, this is every name in every row, plus names added by explicit
     * changes (which overrule the dataset), minus names removed by explicit changes.
     * 
     * For a dataset, it's (prevDataset's recognized names) + 
     * (names added by explicit and implicit changes) - (names removed by explicit
     * and implicit changes).
     * 
     * @param proj Required for filtering changes
     * @return A Stream of recognized names as at the end of this checklist.
     */
    public Stream<Name> getRecognizedNames(Project proj) {
        // Start with names we explicitly add.
        Set<Name> addedNames = getChanges(proj).flatMap(ch -> ch.getToStream()).collect(Collectors.toSet());
        Set<Name> initialNames = new HashSet<>(addedNames);

        // If this is not a checklist, then pass through previously recognized names.
        if (prevDataset != null)
            initialNames.addAll(proj.getRecognizedNames(prevDataset));

        // Delete names we explicitly delete.
        Set<Name> deletedNames = getChanges(proj).flatMap(ch -> ch.getFromStream()).collect(Collectors.toSet());

        Set<Name> finalList = initialNames.stream().filter(n -> {
            // Filter out names that have been deleted, EXCEPT those that
            // have been explicitly added (such as in a lump or split).
            if (deletedNames.contains(n)) {
                if (addedNames.contains(n))
                    return true; // don't filter
                else
                    return false; // do filter
            } else
                return true; // don't filter
        }).collect(Collectors.toSet());

        // This should be the same as the names in a checklist!
        // Double-check!
        if (isChecklist() && !finalList.equals(getNamesInAllRows())) {
            // TODO: OKAY, so this is caused by the following scenario:
            //   - We explicitly rename "Osteocephalus vilmae" to "Hylomantis buckleyi" within a dataset
            //    - We do that because AmphibiaWeb *says* they are duplicates.
            //   - However, this dataset has rows for *both* vilmae and buckleyi.
            //   - So how?
            //      - We fix the discrepancy by recognizing all the names in the rows -- whether
            //        or not they're reflected in the changes.

            Set<Name> finalListButNotInRows = new HashSet<>(finalList);
            finalListButNotInRows.removeAll(getNamesInAllRows());

            Set<Name> rowNamesButNotFinalList = new HashSet<>(getNamesInAllRows());
            rowNamesButNotFinalList.removeAll(finalList);

            LOGGER.warning("Discrepency in calculating recognized names for " + this + ":\n"
                    + "\t - Final list but not in rows: " + finalListButNotInRows + "\n"
                    + "\t - Rows but not in final list: " + rowNamesButNotFinalList + "\n" + "\t - Name count: "
                    + initialNames.size() + " + " + addedNames.size() + " - " + deletedNames.size() + " = "
                    + (initialNames.size() + addedNames.size() - deletedNames.size()) + " (but should be "
                    + finalList.size() + ")\n"
                    + "Species in the rows but not in final count will be added to the list of recognized names.");

            finalList.addAll(rowNamesButNotFinalList);
        }

        return finalList.stream();
    }

    /* Display options: provides information on what happened in this dataset for UI purposes */

    public String getRowCountSummary() {
        Map<DatasetRow, Set<Name>> namesByRow = getNamesByRow();
        long rowsWithNames = namesByRow.values().stream().filter(names -> names.size() > 0).count();

        int rowCount = getRowCount();

        if (rowCount <= 0)
            return "No rows";
        else {
            String pcNamed;
            if (rowsWithNames == rowCount)
                pcNamed = "Completely (100%)";
            else if (rowCount - rowsWithNames < 50)
                pcNamed = String.format("%.2f%%", ((double) rowsWithNames / rowCount * 100)) + " (all but "
                        + (rowCount - rowsWithNames) + " rows)";
            else
                pcNamed = String.format("%.2f%%", ((double) rowsWithNames / rowCount * 100));

            return rowCount + " rows (" + pcNamed + " named with " + getNamesInAllRows().size()
                    + " distinct names)";
        }
    }

    // Calculating this ourselves is too slow, so we hook into Project's cache.
    public String getNameCountSummary(Project project) {
        if (isChecklist())
            return project.getRecognizedNames(this).size() + " recognized (" + getReferencedNames().count()
                    + " referenced in rows and changes)";
        else
            return getReferencedNames().count() + " referenced (" + project.getRecognizedNames(this).size()
                    + " recognized)";
    }

    public String getBinomialCountSummary(Project project) {
        if (isChecklist())
            return project.getRecognizedNames(this).stream().flatMap(n -> n.asBinomial()).distinct().count()
                    + " recognized (" + getReferencedNames().flatMap(n -> n.asBinomial()).distinct().count()
                    + " referenced)";
        else
            return getReferencedNames().flatMap(n -> n.asBinomial()).distinct().count() + " referenced ("
                    + project.getRecognizedNames(this).stream().flatMap(n -> n.asBinomial()).distinct().count()
                    + " recognized)";
    }

    /** 
     * Set up a TableView to contain the data contained in this dataset.
     * 
     * @param tv The TableView to populate.
     */
    public void displayInTableView(TableView<DatasetRow> tv) {
        // Setup table.
        tv.setEditable(false);
        //controller.setTableColumnResizeProperty(TableView.CONSTRAINED_RESIZE_POLICY);
        ObservableList<TableColumn<DatasetRow, ?>> cols = tv.getColumns();
        cols.clear();
        // We need to precalculate.
        ObservableList<DatasetRow> rows = this.rowsProperty();
        // Set up columns.
        TableColumn<DatasetRow, String> colRowName = new TableColumn<>("Name");
        colRowName.setCellValueFactory((TableColumn.CellDataFeatures<DatasetRow, String> features) -> {
            DatasetRow row = features.getValue();
            Set<Name> names = getNamesInRow(row);
            if (names.isEmpty()) {
                return new ReadOnlyStringWrapper("(None)");
            } else {
                return new ReadOnlyStringWrapper(
                        names.stream().map(name -> name.getFullName()).collect(Collectors.joining("; ")));
            }
        });
        colRowName.setPrefWidth(100.0);
        cols.add(colRowName);
        // Create a column for every column here.
        this.getColumns().forEach((DatasetColumn col) -> {
            String colName = col.getName();
            TableColumn<DatasetRow, String> colColumn = new TableColumn<>(colName);
            colColumn.setCellValueFactory((TableColumn.CellDataFeatures<DatasetRow, String> features) -> {
                DatasetRow row = features.getValue();
                String val = row.get(colName);
                return new ReadOnlyStringWrapper(val == null ? "" : val);
            });
            colColumn.setPrefWidth(100.0);
            cols.add(colColumn);
        });

        // Set table items.
        // tv.getItems().clear();
        tv.setItems(rows);
    }

    /* Change management */

    public synchronized void onChangeChanged(Optional<Project> project, Change change) {
        LOGGER.entering(Dataset.class.getSimpleName(), "project = " + project + ", change = " + change);

        if (explicitChanges.contains(change)) {
            // Explicit changes affect how implicit changes are processed;
            // so if explicit changes change, then we need to recalculate
            // implicit changes.
            setPreviousDataset(project, Optional.ofNullable(prevDataset));
        } else if (implicitChanges.contains(change)) {
            // It's an implicit change? Well, it just got promoted to 
            // an explicit change.
            implicitChanges.remove(change);
            explicitChanges.add(change);
        } else {
            // We don't know about this change? Add it now!
            explicitChanges.add(change);

            // And then refire the notification.
            onChangeChanged(project, change);
            return;
        }

        // Whatever happens, we've changed, as has the project.
        lastModified.modified();
    }

    public Stream<Change> getExplicitChanges(Project p) {
        return explicitChanges.stream().filter(p.getChangeFilter());
    }

    public Stream<Change> getImplicitChanges(Project p) {
        return implicitChanges.stream().filter(p.getChangeFilter());
    }

    public String getChangesCountSummary(Project p) {
        if (explicitChanges.size() == 0) {
            if (implicitChanges.size() == 0) {
                return "No changes";
            } else {
                // Implicit only
                return getImplicitChangesCountSummary(p);
            }
        } else {
            if (implicitChanges.size() == 0) {
                // Explicit only
                return getExplicitChangesCountSummary(p);
            } else {
                // Both explicit and implicit changes
                return getImplicitChangesCountSummary(p) + "; " + getExplicitChangesCountSummary(p);
            }
        }
    }

    public String getExplicitChangesCountSummary(Project p) {
        if (getExplicitChanges(p).count() == 0)
            return "None";

        Map<ChangeType, Long> changeCounts = getExplicitChanges(p)
                .collect(Collectors.groupingBy(Change::getType, Collectors.counting()));

        String changes_by_type = changeCounts.entrySet().stream()
                .sorted((a, b) -> b.getValue().compareTo(a.getValue())).map(e -> e.getValue() + " " + e.getKey())
                .collect(Collectors.joining(", "));

        return getExplicitChanges(p).count() + " explicit changes (" + changes_by_type + ")";
    }

    public String getImplicitChangesCountSummary(Project p) {
        if (getImplicitChanges(p).count() == 0)
            return "None";

        Map<ChangeType, Long> implicitChangeCounts = getImplicitChanges(p)
                .collect(Collectors.groupingBy(Change::getType, Collectors.counting()));

        String implicit_changes_by_type = implicitChangeCounts.entrySet().stream()
                .sorted((a, b) -> b.getValue().compareTo(a.getValue())).map(e -> e.getValue() + " " + e.getKey())
                .collect(Collectors.joining(", "));

        return getImplicitChanges(p).count() + " implicit changes (" + implicit_changes_by_type + ")";
    }

    /**
     * Return all changes associated with this dataset, explicit or implicit. You almost certainly
     * want to use getChanges(Project), which will give you all the changes after filtering those
     * the project isn't interested in -- so please be careful about using this!
     * 
     * @return Stream of all changes associated with this dataset.
     */
    public Stream<Change> getAllChanges() {
        return getAllChangesAsList().stream();
    }

    public List<Change> getAllChangesAsList() {
        List<Change> allChanges = new LinkedList<>(explicitChanges);
        allChanges.addAll(implicitChanges);
        return allChanges;
    }

    public Stream<Change> getChanges(Project project) {
        return getAllChanges().filter(project.getChangeFilter());
    }

    @Override
    public String getCitation() {
        return getName() + " (" + getDate() + ")";
    }

    @Override
    public String toString() {
        return getType() + " " + getCitation();
    }

    public String asTitle() {
        return getType() + " " + getName() + " (" + getDate() + ": " + rows.size() + " rows, "
                + getReferencedNames().count() + " referenced names, " + explicitChanges.size()
                + " explicit changes, " + implicitChanges.size() + " implicit changes)";
    }

    /* Data load */

    /**
     * Attempt to load a dataset from a file. We use regular expressions to try to guess the file type,
     * and then delegate the job out. Rather cleverly, we try extracting the names using every extractor
     * this project knows about, and then pick the one that gives us the most number of names.
     * 
     * @param proj The project doing the loading, used to get the name extractors.
     * @param f The file to open.
     * @return The dataset loaded from that file.
     * @throws IOException If there was an error loading the file.
     */
    public static Dataset loadFromFile(Project proj, File f) throws IOException {
        Dataset ds;

        // Excel file? Handle separately!
        String fileName = f.getName().toLowerCase();
        if (fileName.endsWith(".xlsx") || fileName.endsWith(".xls")) {
            ds = new ExcelImporter(f).asDataset(0);
        } else if (fileName.endsWith(".csv") || fileName.endsWith(".tsv")) {
            CSVFormat csvFormat = CSVFormat.DEFAULT;
            if (fileName.endsWith(".tsv"))
                csvFormat = CSVFormat.TDF.withQuote(null); // We need this to load the AmphibiaWeb files.

            ds = Dataset.fromCSV(csvFormat, f);
        } else {
            // Text-based file? Try using the first line to figure out what's going on.
            String firstLine;
            try (LineNumberReader r = new LineNumberReader(new FileReader(f))) {
                // Load the first line to try to identify the file type.
                firstLine = r.readLine();
            }

            // The most basic type of file is a TaxDiff file, which always
            // begins with:
            if (ChecklistDiff.pTaxDiffFirstLine.matcher(firstLine).matches()) {
                // Note that checklist diffs don't need name extractors!
                return ChecklistDiff.fromTaxDiffFile(f);
            }

            // If all else fails, try loading it as a checklist. Also don't need name extractors!
            return Checklist.fromListInFile(f);
        }

        // If we're here, we need name extractors.

        // Try all name extractors, see which one matches the most names.
        Set<List<NameExtractor>> allAvailableNameExtractors = proj.getNameExtractors();
        allAvailableNameExtractors.add(NameExtractorFactory.getDefaultExtractors());

        LOGGER.info("Starting name extractor comparisons");
        List<NameExtractor> bestExtractor = null;
        long bestExtractorCount = Long.MIN_VALUE;
        for (List<NameExtractor> extractor : allAvailableNameExtractors) {
            long count = ds.rows.stream()
                    .flatMap(row -> NameExtractorFactory.extractNamesUsingExtractors(extractor, row).stream())
                    .distinct().count();

            if (count > bestExtractorCount) {
                bestExtractorCount = count;
                bestExtractor = extractor;
            }
        }
        LOGGER.info("Finished name extractor comparisons: best extractor at " + bestExtractorCount + " names was "
                + NameExtractorFactory.serializeExtractorsToString(bestExtractor));

        try {
            ds.setNameExtractorsString(NameExtractorFactory.serializeExtractorsToString(bestExtractor));
        } catch (NameExtractorParseException ex) {
            // Forget about it. We'll go with the default.
        }

        return ds;
    }

    /* Constructor  */
    public Dataset(String name, SimplifiedDate date, String checklistType) {
        nameProperty.setValue(name);
        dateProperty.setValue(date);
        typeProperty.setValue(checklistType);
    }

    // Blank constructor
    public Dataset() {
        nameProperty.setValue("(unnamed)");
    }

    /* Serialization */

    /**
     * Load this dataset from a CSV file. We load the entire CSV file, except
     * for blank cells.
     * 
     * @param project The project to which the resulting Dataset should belong
     * @param csvFormat The CSV format of the input file.
     * @param csvFile The input file to load.
     * @param renamedColumns Rename these columns on the fly.
     * @return
     * @throws IOException 
     */
    public static Dataset fromCSV(CSVFormat csvFormat, File csvFile) throws IOException {
        Dataset dataset = new Dataset(csvFile.getName(), new SimplifiedDate(), Dataset.TYPE_CHECKLIST);

        // Get ready to filter input files.
        InputStream ins = new FileInputStream(csvFile);

        // Look for BOMs and discard!
        ins = new BOMInputStream(ins, false);

        // Convert into a Reader.
        Reader reader = new BufferedReader(new InputStreamReader(ins));

        // Load CSV
        CSVParser parser = csvFormat.withHeader().parse(reader);
        Map<String, Integer> headerMap = parser.getHeaderMap();

        dataset.setColumns(headerMap.entrySet().stream().sorted((Object o1, Object o2) -> {
            Map.Entry<String, Integer> e1 = (Map.Entry) o1;
            Map.Entry<String, Integer> e2 = (Map.Entry) o2;

            return e1.getValue().compareTo(e2.getValue());
        }).map(e -> e.getKey()).map(colName -> DatasetColumn.of(colName))
                /*
                .map(col -> {
                   // Rename any renamedColumns.
                   if(renamedColumns.containsKey(col))
                return renamedColumns.get(col);
                   else
                return col;
                })*/
                .collect(Collectors.toList()));

        dataset.rows.clear();
        dataset.rows.addAll(parser.getRecords().stream().map(record -> {
            DatasetRow row = new DatasetRow(dataset);
            row.putAll(record.toMap());
            return row;
        }).collect(Collectors.toList()));

        return dataset;
    }

    public Element serializeToElement(Document doc) {
        Element datasetElement = doc.createElement("dataset");

        datasetElement.setAttribute("name", getName());
        datasetElement.setAttribute("type", getType());
        dateProperty.getValue().setDateAttributesOnElement(datasetElement);
        datasetElement.setAttribute("nameExtractors", getNameExtractorsAsString());

        // Properties
        Element propertiesElement = doc.createElement("properties");
        for (String key : getProperties().keySet()) {
            Element propertyElement = doc.createElement("property");
            propertyElement.setAttribute("name", key);
            propertyElement.setTextContent(getProperties().get(key));
            propertiesElement.appendChild(propertyElement);
        }
        datasetElement.appendChild(propertiesElement);

        Element changesElement = doc.createElement("changes");
        for (Change ch : explicitChanges) {
            Element changeElement = ch.serializeToElement(doc);
            changesElement.appendChild(changeElement);
        }
        datasetElement.appendChild(changesElement);

        Element columnsElement = doc.createElement("columns");
        for (DatasetColumn col : columns) {
            Element columnElement = doc.createElement("column");
            columnElement.setAttribute("name", col.getName());
            columnsElement.appendChild(columnElement);
        }
        datasetElement.appendChild(columnsElement);

        Element rowsElement = doc.createElement("rows");
        for (DatasetRow row : rows) {
            Element rowElement = doc.createElement("row");

            for (DatasetColumn col : row.getColumns()) {
                // Ignore elements without a value.
                String val = row.get(col);
                if (val == null || val.equals(""))
                    continue;

                Element itemElement = doc.createElement("key");
                itemElement.setAttribute("name", col.getName());
                itemElement.setTextContent(val);
                rowElement.appendChild(itemElement);
            }

            rowsElement.appendChild(rowElement);
        }
        datasetElement.appendChild(rowsElement);

        return datasetElement;
    }

    /*
    public static Dataset serializeFromNode(Project p, Node node) throws SAXException {
           
       System.err.println(" - starting serialization of dataset from " + node + ", memory usage: " + Runtime.getRuntime().freeMemory());
           
       NamedNodeMap attr = node.getAttributes();
       String name = attr.getNamedItem("name").getNodeValue();
       SimplifiedDate date = new SimplifiedDate(node);
           
       Dataset dataset = new Dataset(name, date);
           
       NodeList children = node.getChildNodes();
       for(int x = 0; x < children.getLength(); x++) {
     Node child = children.item(x);
         
     if(child.getNodeType() != Node.ELEMENT_NODE) continue;
         
     if(child.getNodeName().equalsIgnoreCase("columns")) {
        dataset.columns.clear();
            
        NodeList columns = child.getChildNodes();
        for(int y = 0; y < columns.getLength(); y++) {
           Node column = columns.item(y);
               
           if(column.getNodeType() != Node.ELEMENT_NODE) continue;
               
           String colName = column.getAttributes().getNamedItem("name").getNodeValue();
           dataset.columns.add(DatasetColumn.of(colName));
        }
            
        continue;
     }
     else if(child.getNodeName().equalsIgnoreCase("rows")) {
        dataset.rows.clear();
            
        NodeList rows = child.getChildNodes();
        for(int y = 0; y < rows.getLength(); y++) {
           Node rowElement = rows.item(y);
               
           if(rowElement.getNodeType() != Node.ELEMENT_NODE) continue;
               
           if(!rowElement.getNodeName().equalsIgnoreCase("row"))
              throw new SAXException("Unexpected element in 'rows': " + rowElement);
               
           DatasetRow row = new DatasetRow();
               
           NodeList items = rowElement.getChildNodes();
           for(int z = 0; z < items.getLength(); z++) {
              Node item = items.item(z);
                  
              if(item.getNodeType() != Node.ELEMENT_NODE) continue;
                  
              if(!item.getNodeName().equalsIgnoreCase("key"))
                 throw new SAXException("Unexpected element in 'row': " + item);
                  
              String key = item.getAttributes().getNamedItem("name").getNodeValue();
              String value = item.getTextContent();
        
              row.put(key, value);
           }
               
           dataset.rows.add(row);
        }
     } else
        throw new SAXException("Unexpected node in 'dataset': " + child);
       }
           
       System.err.println(" - dataset loaded: " + dataset + ", memory usage: " + Runtime.getRuntime().freeMemory());
           
       return dataset;
    }*/
}