Java tutorial: the Dataframe class of the Datumbox framework
/**
 * Copyright (C) 2013-2017 Vasilis Vryniotis <bbriniotis@datumbox.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datumbox.framework.core.common.dataobjects;

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.common.concurrency.ForkJoinStream;
import com.datumbox.framework.common.concurrency.StreamMethods;
import com.datumbox.framework.common.concurrency.ThreadMethods;
import com.datumbox.framework.common.dataobjects.AssociativeArray;
import com.datumbox.framework.common.dataobjects.FlatDataList;
import com.datumbox.framework.common.dataobjects.TypeInference;
import com.datumbox.framework.common.interfaces.Copyable;
import com.datumbox.framework.core.common.interfaces.Extractable;
import com.datumbox.framework.core.common.interfaces.Savable;
import com.datumbox.framework.common.storage.abstracts.BigMapHolder;
import com.datumbox.framework.common.storage.interfaces.BigMap;
import com.datumbox.framework.common.storage.interfaces.StorageEngine;
import com.datumbox.framework.common.storage.interfaces.StorageEngine.MapType;
import com.datumbox.framework.common.storage.interfaces.StorageEngine.StorageHint;
import com.datumbox.framework.common.utilities.RandomGenerator;
import com.datumbox.framework.core.common.text.StringCleaner;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.URI;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;

/**
 * The Dataframe class stores a list of Record objects and several meta-data.
 * All Machine Learning algorithms take Dataframe objects as arguments. The
 * class has an internal static Builder class which can be used to generate
 * Dataframe objects from Text or CSV files.
 *
 * @author Vasilis Vryniotis <bbriniotis@datumbox.com>
 */
public class Dataframe implements Collection<Record>, Copyable<Dataframe>, Savable {

    /**
     * Internal name of the response variable.
     */
    public static final String COLUMN_NAME_Y = "~Y";

    /**
     * Internal name of the constant.
     */
    public static final String COLUMN_NAME_CONSTANT = "~CONSTANT";
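    /*
     * Usage sketch (illustrative, not part of the original class): a Dataframe
     * can be filled manually by wrapping feature maps into Records. The feature
     * names and values below are invented for the example, and the Configuration
     * is assumed to come from Configuration.getConfiguration(), the framework's
     * usual factory method.
     *
     *   Configuration configuration = Configuration.getConfiguration();
     *   Dataframe dataset = new Dataframe(configuration);
     *
     *   AssociativeArray xData = new AssociativeArray();
     *   xData.put("age", 30);          //hypothetical numerical feature
     *   xData.put("gender", "male");   //hypothetical categorical feature
     *   dataset.add(new Record(xData, "classA")); //second argument is the response y
     *
     *   dataset.close(); //removes the temporary storage because save() was never called
     */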
    /**
     * The Builder is a utility class which can help you build a Dataframe from
     * Text files or CSV files, or load one from disk.
     */
    public static class Builder {

        /**
         * It builds a Dataframe object from a provided list of text files. The
         * map should have the names of each class as keys and the URIs of the
         * training files as values. The files should contain one training
         * example per row. If we want to parse a Text File of unknown category,
         * pass a single URI with null as key.
         *
         * The method requires as arguments a map with the category names and the
         * locations of the training files, an instance of a TextExtractor which
         * is used to extract the keywords from the documents, and the
         * Configuration object.
         *
         * @param textFilesMap
         * @param textExtractor
         * @param configuration
         * @return
         */
        public static Dataframe parseTextFiles(Map<Object, URI> textFilesMap, Extractable textExtractor, Configuration configuration) {
            Dataframe dataset = new Dataframe(configuration);
            Logger logger = LoggerFactory.getLogger(Dataframe.Builder.class);

            int count = 0;
            for (Map.Entry<Object, URI> entry : textFilesMap.entrySet()) {
                Object theClass = entry.getKey();
                URI datasetURI = entry.getValue();

                logger.info("{}:> Dataset Parsing {} class", ++count, theClass);

                try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(datasetURI)), "UTF8"))) {
                    final int baseCounter = dataset.size(); //because we read multiple files we need to keep track of all records added earlier
                    ThreadMethods.throttledExecution(StreamMethods.enumerate(br.lines()), e -> {
                        Integer rId = baseCounter + e.getKey();
                        String line = e.getValue();

                        AssociativeArray xData = new AssociativeArray(textExtractor.extract(StringCleaner.clear(line)));
                        Record r = new Record(xData, theClass);

                        //we call below the recalculateMeta()
                        dataset.set(rId, r);
                    }, configuration.getConcurrencyConfiguration());
                }
                catch (IOException ex) {
                    throw new RuntimeException(ex);
                }
            }

            return dataset;
        }

        /**
         * Variant of parseTextFiles() which accepts multiple files per class and
         * builds binary (presence-based) bag-of-words Records: every keyword
         * extracted from a line is stored with the value 1.
         *
         * @param textFilesMap
         * @param textExtractor
         * @param configuration
         * @return
         */
        public static Dataframe parseTextFiles2(Map<Object, List<URI>> textFilesMap, Extractable textExtractor, Configuration configuration) {
            Dataframe dataset = new Dataframe(configuration);
            Logger logger = LoggerFactory.getLogger(Dataframe.Builder.class);

            int count = 0;
            for (Map.Entry<Object, List<URI>> entry : textFilesMap.entrySet()) {
                Object theClass = entry.getKey();
                for (URI datasetURI : entry.getValue()) {
                    logger.info("{}:> Dataset Parsing {} class; file:> {}", ++count, theClass, datasetURI.getPath());

                    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(datasetURI)), "UTF8"))) {
                        ThreadMethods.throttledExecution(StreamMethods.enumerate(br.lines()), e -> {
                            String line = e.getValue();

                            Map<Integer, String> xData = textExtractor.extract(line);
                            Map<Object, Object> presenceMap = new HashMap<>();
                            for (String keyword : xData.values()) {
                                presenceMap.put(keyword, 1); //mark the keyword as present
                            }

                            if (!presenceMap.isEmpty()) {
                                Record r = new Record(new AssociativeArray(presenceMap), theClass);

                                //we call below the recalculateMeta()
                                dataset.add(r);
                            }
                        }, configuration.getConcurrencyConfiguration());
                    }
                    catch (IOException ex) {
                        throw new RuntimeException(ex);
                    }
                }
            }

            return dataset;
        }
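        /*
         * Usage sketch for parseTextFiles() (illustrative): the file names below
         * are invented, and any Extractable implementation of the framework can
         * be plugged in as the keyword extractor.
         *
         *   Map<Object, URI> textFilesMap = new HashMap<>();
         *   textFilesMap.put("positive", new File("positive.txt").toURI());
         *   textFilesMap.put("negative", new File("negative.txt").toURI());
         *
         *   Extractable textExtractor = ...; //plug in a keyword extractor here
         *   Dataframe trainingData = Dataframe.Builder.parseTextFiles(textFilesMap, textExtractor, configuration);
         */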
        /**
         * It builds a Dataframe object from a CSV file; the first line of the
         * provided CSV file must have a header with the column names.
         *
         * The method accepts the following arguments: A Reader object from where
         * we will read the contents of the CSV file. The column name of the
         * response variable y. A map with the column names and their respective
         * DataTypes. The char delimiter for the columns, the char for quotes and
         * the string of the record/row separator. The number of rows to skip,
         * the maximum number of rows to read and the Configuration object.
         *
         * @param reader
         * @param yVariable
         * @param headerDataTypes
         * @param delimiter
         * @param quote
         * @param recordSeparator
         * @param skip
         * @param limit
         * @param configuration
         * @return
         */
        public static Dataframe parseCSVFile(Reader reader, String yVariable, LinkedHashMap<String, TypeInference.DataType> headerDataTypes,
                char delimiter, char quote, String recordSeparator, Long skip, Long limit, Configuration configuration) {
            Logger logger = LoggerFactory.getLogger(Dataframe.Builder.class);

            if (skip == null) {
                skip = 0L;
            }
            if (limit == null) {
                limit = Long.MAX_VALUE;
            }

            logger.info("Parsing CSV file");

            if (!headerDataTypes.containsKey(yVariable)) {
                logger.warn("The file is missing the response variable column {}.", yVariable);
            }

            TypeInference.DataType yDataType = headerDataTypes.get(yVariable);
            Map<String, TypeInference.DataType> xDataTypes = new HashMap<>(headerDataTypes); //copy header types
            xDataTypes.remove(yVariable); //remove the response variable from xDataTypes
            Dataframe dataset = new Dataframe(configuration, yDataType, xDataTypes); //use the private constructor to pass DataTypes directly and avoid updating them on the fly

            CSVFormat format = CSVFormat.RFC4180.withHeader().withDelimiter(delimiter).withQuote(quote).withRecordSeparator(recordSeparator);
            try (final CSVParser parser = new CSVParser(reader, format)) {
                ThreadMethods.throttledExecution(StreamMethods.enumerate(StreamMethods.stream(parser.spliterator(), false)).skip(skip).limit(limit), e -> {
                    Integer rId = e.getKey();
                    CSVRecord row = e.getValue();

                    if (!row.isConsistent()) {
                        logger.warn("Skipping row {} because its size does not match the header size.", row.getRecordNumber());
                    }
                    else {
                        Object y = null;
                        AssociativeArray xData = new AssociativeArray();
                        for (Map.Entry<String, TypeInference.DataType> entry : headerDataTypes.entrySet()) {
                            String column = entry.getKey();
                            TypeInference.DataType dataType = entry.getValue();

                            Object value = TypeInference.DataType.parse(row.get(column), dataType); //parse the string value according to the DataType
                            if (yVariable != null && yVariable.equals(column)) {
                                y = value;
                            }
                            else {
                                xData.put(column, value);
                            }
                        }

                        Record r = new Record(xData, y);

                        //use the internal unsafe methods to avoid the update of the Metas.
                        //The Metas are already set in the construction of the Dataframe.
                        dataset._unsafe_set(rId, r);
                    }
                }, configuration.getConcurrencyConfiguration());
            }
            catch (IOException ex) {
                throw new RuntimeException(ex);
            }
            return dataset;
        }

        /**
         * It loads a Dataframe that has already been stored.
         *
         * @param storageName
         * @param configuration
         * @return
         */
        public static Dataframe load(String storageName, Configuration configuration) {
            return new Dataframe(storageName, configuration);
        }
    }
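    /*
     * CSV parsing sketch (illustrative): the file name, column names and types
     * below are invented, and DataType values such as NUMERICAL and CATEGORICAL
     * are assumed to match the framework's TypeInference.DataType enum.
     *
     *   LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
     *   headerDataTypes.put("age", TypeInference.DataType.NUMERICAL);
     *   headerDataTypes.put("label", TypeInference.DataType.CATEGORICAL);
     *
     *   try (Reader reader = new FileReader("data.csv")) {
     *       Dataframe dataset = Dataframe.Builder.parseCSVFile(reader, "label", headerDataTypes,
     *               ',', '"', "\r\n", null, null, configuration); //null skip/limit read everything
     *   }
     *   catch (IOException ex) {
     *       throw new RuntimeException(ex);
     *   }
     */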
    /**
     * This class stores the data of the Dataframe.
     */
    private static class Data extends BigMapHolder {

        private TypeInference.DataType yDataType = null;

        private AtomicInteger atomicNextAvailableRecordId = new AtomicInteger();

        @BigMap(keyClass = Object.class, valueClass = TypeInference.DataType.class, mapType = MapType.HASHMAP, storageHint = StorageHint.IN_MEMORY, concurrent = true)
        private Map<Object, TypeInference.DataType> xDataTypes;

        @BigMap(keyClass = Integer.class, valueClass = Record.class, mapType = MapType.TREEMAP, storageHint = StorageHint.IN_DISK, concurrent = true)
        private Map<Integer, Record> records;

        /**
         * Initializes the state of the Data object.
         *
         * @param storageEngine
         */
        private Data(StorageEngine storageEngine) {
            super(storageEngine);
        }
    }

    /**
     * Contains all the data of the dataframe.
     */
    private Data data;

    /**
     * Flag that indicates whether the dataframe has been saved or loaded from disk.
     */
    private boolean stored;

    /**
     * The storage engine.
     */
    private final StorageEngine storageEngine;

    /**
     * The configuration object used to create the Dataframe. It is defined as protected to be accessible by classes
     * that extend the Dataframe or by the DataframeMatrix class which is in the same package.
     */
    protected final Configuration configuration;

    /**
     * This executor is used for the parallel processing of streams with custom
     * Thread pool.
     */
    private final ForkJoinStream streamExecutor;

    /**
     * Public constructor of Dataframe.
     *
     * @param configuration
     */
    public Dataframe(Configuration configuration) {
        this.configuration = configuration;
        storageEngine = this.configuration.getStorageConfiguration().createStorageEngine("dts" + RandomGenerator.getThreadLocalRandomUnseeded().nextLong());
        streamExecutor = new ForkJoinStream(this.configuration.getConcurrencyConfiguration());
        data = new Data(storageEngine);
        stored = false;
    }

    /**
     * Private constructor used by the Builder inner static class.
     *
     * @param storageName
     * @param configuration
     */
    private Dataframe(String storageName, Configuration configuration) {
        this.configuration = configuration;
        storageEngine = this.configuration.getStorageConfiguration().createStorageEngine(storageName);
        streamExecutor = new ForkJoinStream(this.configuration.getConcurrencyConfiguration());
        data = storageEngine.loadObject("data", Data.class);
        stored = true;
    }

    /**
     * Private constructor used by the Builder inner static class.
     *
     * @param configuration
     * @param yDataType
     * @param xDataTypes
     */
    private Dataframe(Configuration configuration, TypeInference.DataType yDataType, Map<String, TypeInference.DataType> xDataTypes) {
        this(configuration);
        this.data.yDataType = yDataType;
        this.data.xDataTypes.putAll(xDataTypes);
    }

    //Storage Methods

    /**
     * Saves the Dataframe to disk.
     *
     * @param storageName
     */
    public void save(String storageName) {
        //store the objects on storage
        storageEngine.saveObject("data", data);

        //rename the storage
        storageEngine.rename(storageName);

        //reload the data of the object
        data = storageEngine.loadObject("data", Data.class);

        //mark it as stored
        stored = true;
    }

    /**
     * Deletes the Dataframe and removes all internal variables. Once you delete a
     * dataset, the instance can no longer be used.
     */
    public void delete() {
        storageEngine.clear();
        _close();
    }

    /** {@inheritDoc} */
    @Override
    public void close() {
        if (stored) {
            //if the dataset is stored on disk, just close the storage
            _close();
        }
        else {
            //if not, try to delete it in case temporary files remained on disk
            delete();
        }
    }
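    /*
     * Lifecycle sketch (illustrative): a Dataframe keeps its records in a
     * temporary storage until save() is called; Builder.load() then reopens it
     * by name. The storage name "myDataset" is an invented example.
     *
     *   dataset.save("myDataset");
     *   dataset.close(); //the storage remains on disk because it was saved
     *
     *   Dataframe restored = Dataframe.Builder.load("myDataset", configuration);
     *   restored.delete(); //removes the stored data permanently
     */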
    /**
     * Closes the storage engine.
     */
    private void _close() {
        try {
            storageEngine.close();
        }
        catch (Exception ex) {
            throw new RuntimeException(ex);
        }
        finally {
            //Ensures that the Dataframe can't be used after _close() is called.
            data = null;
        }
    }

    //Mandatory Collection Methods

    /**
     * Returns the total number of Records of the Dataframe.
     *
     * @return
     */
    @Override
    public int size() {
        return data.records.size();
    }

    /**
     * Checks if the Dataframe is empty.
     *
     * @return
     */
    @Override
    public boolean isEmpty() {
        return data.records.isEmpty();
    }

    /**
     * Clears all the internal Records of the Dataframe. The Dataframe can be used
     * after you clear it.
     */
    @Override
    public void clear() {
        data.yDataType = null;
        data.atomicNextAvailableRecordId.set(0);
        data.xDataTypes.clear();
        data.records.clear();
    }

    /**
     * Adds a record in the Dataframe and updates the Meta data.
     *
     * @param r
     * @return
     */
    @Override
    public boolean add(Record r) {
        addRecord(r);
        return true;
    }

    /**
     * Checks if the Record exists in the Dataframe. Note that the Record is checked only
     * for its x and y components.
     *
     * @param o
     * @return
     */
    @Override
    public boolean contains(Object o) {
        return data.records.containsValue((Record) o);
    }

    /** {@inheritDoc} */
    @Override
    public boolean addAll(Collection<? extends Record> c) {
        c.forEach(this::add);
        return true;
    }

    /** {@inheritDoc} */
    @Override
    public boolean containsAll(Collection<?> c) {
        return data.records.values().containsAll(c);
    }

    /** {@inheritDoc} */
    @Override
    public Object[] toArray() {
        Object[] array = new Object[size()];
        int i = 0;
        for (Record r : values()) {
            array[i++] = r;
        }
        return array;
    }

    /** {@inheritDoc} */
    @Override
    @SuppressWarnings("unchecked")
    public <T> T[] toArray(T[] a) {
        int size = size();
        if (a.length < size) {
            a = (T[]) java.lang.reflect.Array.newInstance(a.getClass().getComponentType(), size);
        }
        int i = 0;
        for (Record r : values()) {
            a[i++] = (T) r;
        }
        return a;
    }

    /**
     * Returns a read-only iterator on the values of the Dataframe.
     *
     * @return
     */
    @Override
    public Iterator<Record> iterator() {
        return values().iterator();
    }

    /** {@inheritDoc} */
    @Override
    public Stream<Record> stream() {
        return StreamMethods.stream(values(), false);
    }

    //Optional Collection Methods

    /**
     * Removes the first occurrence of the specified element from this Dataframe,
     * if it is present. It does not update the metadata.
     *
     * @param o
     * @return
     */
    @Override
    public boolean remove(Object o) {
        Integer id = indexOf((Record) o);
        if (id == null) {
            return false;
        }
        remove(id);
        return true;
    }

    /**
     * Removes all of this collection's elements that are also contained in the
     * specified collection and updates the metadata.
     *
     * @param c
     * @return
     */
    @Override
    public boolean removeAll(Collection<?> c) {
        boolean modified = false;
        for (Object o : c) {
            modified |= remove((Record) o);
        }
        if (modified) {
            recalculateMeta();
        }
        return modified;
    }

    /**
     * Retains only the elements in this collection that are contained in the
     * specified collection and updates the meta data.
     *
     * @param c
     * @return
     */
    @Override
    public boolean retainAll(Collection<?> c) {
        boolean modified = false;
        for (Map.Entry<Integer, Record> e : entries()) {
            Integer rId = e.getKey();
            Record r = e.getValue();
            if (!c.contains(r)) {
                remove(rId);
                modified = true;
            }
        }
        if (modified) {
            recalculateMeta();
        }
        return modified;
    }
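    /*
     * Collection-interface sketch (illustrative): since Dataframe implements
     * Collection<Record>, the standard bulk operations and streams work as
     * expected. The record variables r1, r2, r3 and the "positive" label are
     * assumptions made up for the example.
     *
     *   dataset.addAll(Arrays.asList(r1, r2, r3));
     *   long positives = dataset.stream().filter(r -> "positive".equals(r.getY())).count();
     *   dataset.removeAll(Arrays.asList(r2)); //also triggers recalculateMeta()
     */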
    //Other methods

    /**
     * Removes a record with a particular id from the Dataframe but does not update
     * the metadata.
     *
     * @param id
     * @return
     */
    public Record remove(Integer id) {
        return data.records.remove(id);
    }

    /**
     * Returns the index of the first occurrence of the specified element in this
     * Dataframe, or null if this Dataframe does not contain the element.
     * WARNING: The Records are checked only for their X and Y values, not for
     * the yPredicted and yPredictedProbabilities values.
     *
     * @param o
     * @return
     */
    public Integer indexOf(Record o) {
        if (o != null) {
            for (Map.Entry<Integer, Record> e : entries()) {
                Integer rId = e.getKey();
                Record r = e.getValue();
                if (o.equals(r)) {
                    return rId;
                }
            }
        }
        return null;
    }

    /**
     * Returns a particular Record using its id.
     *
     * @param id
     * @return
     */
    public Record get(Integer id) {
        return data.records.get(id);
    }

    /**
     * Adds a Record in the Dataframe and returns its id.
     *
     * @param r
     * @return
     */
    public Integer addRecord(Record r) {
        Integer rId = _unsafe_add(r);
        updateMeta(r);
        return rId;
    }

    /**
     * Sets the record of a particular id in the dataset. If the record does not
     * exist it will be added with the specific id and the next added record will
     * have as id the next integer.
     *
     * Note that the meta-data are partially updated. This means that if the replaced
     * Record contained a column which is no longer available in the dataset,
     * the meta-data will not reflect this update (the column will continue to exist
     * in the meta-data). If this is a problem, you should call the recalculateMeta()
     * method to force them to be recalculated.
     *
     * @param rId
     * @param r
     * @return
     */
    public Integer set(Integer rId, Record r) {
        _unsafe_set(rId, r);
        updateMeta(r);
        return rId;
    }

    /**
     * Returns the total number of X columns in the Dataframe.
     *
     * @return
     */
    public int xColumnSize() {
        return data.xDataTypes.size();
    }

    /**
     * Returns the type of the response variable y.
     *
     * @return
     */
    public TypeInference.DataType getYDataType() {
        return data.yDataType;
    }

    /**
     * Returns a Map with the column names as keys and their DataTypes as values.
     *
     * @return
     */
    public Map<Object, TypeInference.DataType> getXDataTypes() {
        return Collections.unmodifiableMap(data.xDataTypes);
    }

    /**
     * It extracts the values of a particular column from all records and
     * stores them into a FlatDataList.
     *
     * @param column
     * @return
     */
    public FlatDataList getXColumn(Object column) {
        FlatDataList flatDataList = new FlatDataList();
        for (Record r : values()) {
            flatDataList.add(r.getX().get(column));
        }
        return flatDataList;
    }

    /**
     * It extracts the values of the response variable from all observations and
     * stores them into a FlatDataList.
     *
     * @return
     */
    public FlatDataList getYColumn() {
        FlatDataList flatDataList = new FlatDataList();
        for (Record r : values()) {
            flatDataList.add(r.getY());
        }
        return flatDataList;
    }
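    /*
     * Column-access sketch (illustrative, assuming the invented "age" column):
     *
     *   FlatDataList ages = dataset.getXColumn("age"); //all values of one feature
     *   FlatDataList labels = dataset.getYColumn();    //all response values
     *   TypeInference.DataType ageType = dataset.getXDataTypes().get("age");
     */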
    /**
     * Completely removes a set of columns from the dataset and updates the
     * meta-data of the Dataframe. The method internally uses threads.
     *
     * @param columnSet
     */
    public void dropXColumns(Set<Object> columnSet) {
        columnSet.retainAll(data.xDataTypes.keySet()); //keep only those columns that are already known to the Meta data of the Dataframe

        if (columnSet.isEmpty()) {
            return;
        }

        //remove all the columns from the Meta data
        data.xDataTypes.keySet().removeAll(columnSet);

        streamExecutor.forEach(StreamMethods.stream(entries(), true), e -> {
            Integer rId = e.getKey();
            Record r = e.getValue();

            AssociativeArray xData = r.getX().copy();
            boolean modified = xData.keySet().removeAll(columnSet);

            if (modified) {
                Record newR = new Record(xData, r.getY(), r.getYPredicted(), r.getYPredictedProbabilities());

                //safe to call in this context; we already updated the meta when we modified the xDataTypes
                _unsafe_set(rId, newR);
            }
        });
    }

    /**
     * It generates and returns a new Dataframe which contains a subset of this Dataframe.
     * All the Records of the returned Dataframe are copies of the original Records.
     * The method is used for k-fold cross validation and sampling. Note that the
     * Records in the new Dataframe have DIFFERENT ids from the original ones.
     *
     * @param idsCollection
     * @return
     */
    public Dataframe getSubset(FlatDataList idsCollection) {
        Dataframe d = new Dataframe(configuration);

        for (Object id : idsCollection) {
            d.add(get((Integer) id));
        }
        return d;
    }

    /**
     * It forces the recalculation of the meta-data using the Records of the dataset.
     */
    public void recalculateMeta() {
        data.yDataType = null;
        data.xDataTypes.clear();
        for (Record r : values()) {
            updateMeta(r);
        }
    }

    /** {@inheritDoc} */
    @Override
    public Dataframe copy() {
        Dataframe d = new Dataframe(configuration);

        for (Map.Entry<Integer, Record> e : entries()) {
            Integer rId = e.getKey();
            Record r = e.getValue();
            d.set(rId, r);
        }
        return d;
    }

    /**
     * Returns a read-only Iterable on the keys and Records of the Dataframe.
     *
     * @return
     */
    public Iterable<Map.Entry<Integer, Record>> entries() {
        return () -> new Iterator<Map.Entry<Integer, Record>>() {
            private final Iterator<Map.Entry<Integer, Record>> it = data.records.entrySet().iterator();

            /** {@inheritDoc} */
            @Override
            public boolean hasNext() {
                return it.hasNext();
            }

            /** {@inheritDoc} */
            @Override
            public Map.Entry<Integer, Record> next() {
                return it.next();
            }

            /** {@inheritDoc} */
            @Override
            public void remove() {
                throw new UnsupportedOperationException("This is a read-only iterator, remove operation is not supported.");
            }
        };
    }

    /**
     * Returns a read-only Iterable on the keys of the Dataframe.
     *
     * @return
     */
    public Iterable<Integer> index() {
        return () -> new Iterator<Integer>() {
            private final Iterator<Integer> it = data.records.keySet().iterator();

            /** {@inheritDoc} */
            @Override
            public boolean hasNext() {
                return it.hasNext();
            }

            /** {@inheritDoc} */
            @Override
            public Integer next() {
                return it.next();
            }

            /** {@inheritDoc} */
            @Override
            public void remove() {
                throw new UnsupportedOperationException("This is a read-only iterator, remove operation is not supported.");
            }
        };
    }

    /**
     * Returns a read-only Iterable on the values of the Dataframe.
     *
     * @return
     */
    public Iterable<Record> values() {
        return () -> new Iterator<Record>() {
            private final Iterator<Record> it = data.records.values().iterator();

            /** {@inheritDoc} */
            @Override
            public boolean hasNext() {
                return it.hasNext();
            }

            /** {@inheritDoc} */
            @Override
            public Record next() {
                return it.next();
            }

            /** {@inheritDoc} */
            @Override
            public void remove() {
                throw new UnsupportedOperationException("This is a read-only iterator, remove operation is not supported.");
            }
        };
    }
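    /*
     * Subset sketch (illustrative): getSubset() copies the selected records into
     * a new Dataframe, e.g. to draw a sample for a validation split. The
     * every-5th-record rule below is an arbitrary example.
     *
     *   FlatDataList ids = new FlatDataList();
     *   for (Integer id : dataset.index()) {
     *       if (id % 5 == 0) {
     *           ids.add(id);
     *       }
     *   }
     *   Dataframe validation = dataset.getSubset(ids);
     */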
    /**
     * Sets the record in a particular position in the dataset, WITHOUT updating
     * the internal meta-info, and returns the previous value (null if it did not exist).
     * This method is similar to set() and it allows quick updates
     * on the dataset. Nevertheless it is not advised to use this method because,
     * unless you explicitly call the recalculateMeta() method, the meta-data
     * will be corrupted. If you do use this method, MAKE sure you perform the
     * recalculation after you are done with the updates.
     *
     * @param rId
     * @param r
     * @return
     */
    public Record _unsafe_set(Integer rId, Record r) {
        //move ahead the next available id so that future additions can't overwrite this record
        data.atomicNextAvailableRecordId.updateAndGet(x -> Math.max(x, rId + 1));
        return data.records.put(rId, r);
    }

    /**
     * Adds the record in the dataset without updating the Meta. The add method
     * returns the id of the new record.
     *
     * @param r
     * @return
     */
    private Integer _unsafe_add(Record r) {
        Integer newId = data.atomicNextAvailableRecordId.getAndIncrement();
        data.records.put(newId, r);
        return newId;
    }

    /**
     * Updates the meta data of the Dataframe using the provided Record.
     * The Meta-data include the supported columns and their DataTypes.
     *
     * @param r
     */
    private void updateMeta(Record r) {
        for (Map.Entry<Object, Object> entry : r.getX().entrySet()) {
            Object column = entry.getKey();
            Object value = entry.getValue();

            if (value != null) {
                data.xDataTypes.putIfAbsent(column, TypeInference.getDataType(value));
            }
        }

        if (data.yDataType == null) {
            Object value = r.getY();
            if (value != null) {
                data.yDataType = TypeInference.getDataType(r.getY());
            }
        }
    }
}
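/*
 * Unsafe-update sketch (illustrative): the _unsafe_* methods skip the meta-data
 * bookkeeping for speed, so a manual recalculation is required afterwards. The
 * xData variable and "classB" label are assumptions made up for the example.
 *
 *   dataset._unsafe_set(0, new Record(xData, "classB"));
 *   dataset.recalculateMeta(); //restore consistent column meta-data
 */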