// Java tutorial
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    ArffLoader.java
 *    Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.converters;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.net.URL;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;

/**
 * <!-- globalinfo-start --> Reads a source that is in arff (attribute relation
 * file format) format.
 * <p/>
 * <!-- globalinfo-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision$
 * @see Loader
 */
public class ArffLoader extends AbstractFileLoader implements BatchConverter,
  IncrementalConverter, URLSourcedLoader {

  /** for serialization */
  static final long serialVersionUID = 2726929550544048587L;

  /** the file extension */
  public static String FILE_EXTENSION = Instances.FILE_EXTENSION;

  /** the file extension of gzip-compressed files */
  public static String FILE_EXTENSION_COMPRESSED = FILE_EXTENSION + ".gz";

  /** the url */
  protected String m_URL = "http://";

  /** The reader for the source file. */
  protected transient Reader m_sourceReader = null;

  /** The parser for the ARFF file */
  protected transient ArffReader m_ArffReader = null;

  /**
   * Whether the values of string attributes should be retained in memory when
   * reading incrementally
   */
  protected boolean m_retainStringVals;

  /**
   * Reads data from an ARFF file, either in incremental or batch mode.
   * <p/>
   *
   * Typical code for batch usage:
   *
   * <pre>
   * BufferedReader reader =
   *   new BufferedReader(new FileReader("/some/where/file.arff"));
   * ArffReader arff = new ArffReader(reader);
   * Instances data = arff.getData();
   * data.setClassIndex(data.numAttributes() - 1);
   * </pre>
   *
   * Typical code for incremental usage:
   *
   * <pre>
   * BufferedReader reader =
   *   new BufferedReader(new FileReader("/some/where/file.arff"));
   * ArffReader arff = new ArffReader(reader, 1000);
   * Instances data = arff.getStructure();
   * data.setClassIndex(data.numAttributes() - 1);
   * Instance inst;
   * while ((inst = arff.readInstance(data)) != null) {
   *   data.add(inst);
   * }
   * </pre>
   *
   * @author Eibe Frank (eibe@cs.waikato.ac.nz)
   * @author Len Trigg (trigg@cs.waikato.ac.nz)
   * @author fracpete (fracpete at waikato dot ac dot nz)
   * @version $Revision$
   */
  public static class ArffReader implements RevisionHandler {

    /** the tokenizer for reading the stream */
    protected StreamTokenizer m_Tokenizer;

    /** Buffer of values for sparse instance */
    protected double[] m_ValueBuffer;

    /** Buffer of indices for sparse instance */
    protected int[] m_IndicesBuffer;

    /** Indices of the string attributes in the data (filled by initBuffers) */
    protected List<Integer> m_stringAttIndices;

    /** the actual data */
    protected Instances m_Data;

    /** the number of lines read so far */
    protected int m_Lines;

    /** Whether the data is read in batch mode (as opposed to incrementally) */
    protected boolean m_batchMode = true;

    /**
     * Whether the values for string attributes will accumulate in the header
     * when reading incrementally
     */
    protected boolean m_retainStringValues = false;

    /** Field separator (single character string) to use instead of the defaults */
    protected String m_fieldSeparator;

    /** List of (single character) enclosures to use instead of the defaults */
    protected List<String> m_enclosures;

    /**
     * Reads the data completely from the reader. The data can be accessed via
     * the <code>getData()</code> method.
     *
     * @param reader the reader to use
     * @throws IOException if something goes wrong
     * @see #getData()
     */
    public ArffReader(Reader reader) throws IOException {
      m_retainStringValues = true;
      m_batchMode = true;
      m_Tokenizer = new StreamTokenizer(reader);
      initTokenizer();

      readHeader(1000);
      initBuffers();

      Instance inst;
      while ((inst = readInstance(m_Data)) != null) {
        m_Data.add(inst);
      }

      compactify();
    }

    /**
     * Reads only the header (in batch mode) and reserves the specified space
     * for instances. Further instances can be read via
     * <code>readInstance()</code>.
     *
     * @param reader the reader to use
     * @param capacity the capacity of the new dataset
     * @throws IOException if something goes wrong
     * @see #getStructure()
     * @see #readInstance(Instances)
     */
    public ArffReader(Reader reader, int capacity) throws IOException {
      this(reader, capacity, true);
    }

    /**
     * Reads only the header and reserves the specified space for instances.
     * Further instances can be read via <code>readInstance()</code>.
     *
     * @param reader the reader to use
     * @param capacity the capacity of the new dataset; must not be negative
     * @param batch true if reading in batch mode
     * @throws IOException if something goes wrong
     * @throws IllegalArgumentException if the capacity is negative
     * @see #getStructure()
     * @see #readInstance(Instances)
     */
    public ArffReader(Reader reader, int capacity, boolean batch)
      throws IOException {

      m_batchMode = batch;
      if (batch) {
        m_retainStringValues = true;
      }

      if (capacity < 0) {
        throw new IllegalArgumentException("Capacity has to be positive!");
      }

      m_Tokenizer = new StreamTokenizer(reader);
      initTokenizer();

      readHeader(capacity);
      initBuffers();
    }

    /**
     * Reads the data without header according to the specified template. The
     * data can be accessed via the <code>getData()</code> method.
     *
     * @param reader the reader to use
     * @param template the template header
     * @param lines the lines read so far
     * @param fieldSepAndEnclosures an optional array of Strings containing the
     *          field separator and enclosures to use instead of the defaults.
     *          The first entry in the array is expected to be the single
     *          character field separator to use; the remaining entries (if any)
     *          are enclosure characters to use.
     * @throws IOException if something goes wrong
     * @see #getData()
     */
    public ArffReader(Reader reader, Instances template, int lines,
      String... fieldSepAndEnclosures) throws IOException {
      this(reader, template, lines, 100, true, fieldSepAndEnclosures);

      Instance inst;
      while ((inst = readInstance(m_Data)) != null) {
        m_Data.add(inst);
      }

      compactify();
    }

    /**
     * Initializes the reader without reading the header according to the
     * specified template. The data must be read via the
     * <code>readInstance()</code> method.
     *
     * @param reader the reader to use
     * @param template the template header
     * @param lines the lines read so far
     * @param capacity the capacity of the new dataset
     * @param fieldSepAndEnclosures an optional array of Strings containing the
     *          field separator and enclosures to use instead of the defaults.
     *          The first entry in the array is expected to be the single
     *          character field separator to use; the remaining entries (if any)
     *          are enclosure characters to use.
     * @throws IOException if something goes wrong
     * @see #getData()
     */
    public ArffReader(Reader reader, Instances template, int lines,
      int capacity, String... fieldSepAndEnclosures) throws IOException {
      this(reader, template, lines, capacity, false, fieldSepAndEnclosures);
    }

    /**
     * Initializes the reader without reading the header according to the
     * specified template. The data must be read via the
     * <code>readInstance()</code> method.
     *
     * @param reader the reader to use
     * @param template the template header
     * @param lines the lines read so far
     * @param capacity the capacity of the new dataset
     * @param batch true if the data is going to be read in batch mode
     * @param fieldSepAndEnclosures an optional array of Strings containing the
     *          field separator and enclosures to use instead of the defaults.
     *          The first entry in the array is expected to be the single
     *          character field separator to use; the remaining entries (if any)
     *          are enclosure characters to use.
     * @throws IOException if something goes wrong
     * @see #getData()
     */
    public ArffReader(Reader reader, Instances template, int lines,
      int capacity, boolean batch, String... fieldSepAndEnclosures)
      throws IOException {

      m_batchMode = batch;
      if (batch) {
        m_retainStringValues = true;
      }

      if (fieldSepAndEnclosures != null && fieldSepAndEnclosures.length > 0) {
        if (fieldSepAndEnclosures[0] != null
          && fieldSepAndEnclosures[0].length() > 0) {
          m_fieldSeparator = fieldSepAndEnclosures[0];
        }

        if (fieldSepAndEnclosures.length > 1) {
          // the rest are assumed to be enclosure characters
          m_enclosures = new ArrayList<String>();
          for (int i = 1; i < fieldSepAndEnclosures.length; i++) {
            if (fieldSepAndEnclosures[i] != null
              && fieldSepAndEnclosures[i].length() > 0) {
              m_enclosures.add(fieldSepAndEnclosures[i]);
            }
          }

          // fall back to the default quote characters if nothing usable was given
          if (m_enclosures.size() == 0) {
            m_enclosures = null;
          }
        }
      }

      m_Lines = lines;
      m_Tokenizer = new StreamTokenizer(reader);
      initTokenizer();

      m_Data = new Instances(template, capacity);
      initBuffers();
    }

    /**
     * initializes the buffers for sparse instances to be read and records the
     * indices of all string attributes
     *
     * @see #m_ValueBuffer
     * @see #m_IndicesBuffer
     */
    protected void initBuffers() {
      m_ValueBuffer = new double[m_Data.numAttributes()];
      m_IndicesBuffer = new int[m_Data.numAttributes()];

      m_stringAttIndices = new ArrayList<Integer>();
      if (m_Data.checkForStringAttributes()) {
        for (int i = 0; i < m_Data.numAttributes(); i++) {
          if (m_Data.attribute(i).isString()) {
            m_stringAttIndices.add(i);
          }
        }
      }
    }

    /**
     * compactifies the data
     */
    protected void compactify() {
      if (m_Data != null) {
        m_Data.compactify();
      }
    }

    /**
     * Throws error message with line number and last token read.
     *
     * @param msg the error message to be thrown
     * @throws IOException containing the error message
     */
    protected void errorMessage(String msg) throws IOException {
      String str = msg + ", read " + m_Tokenizer.toString();
      if (m_Lines > 0) {
        // The tokenizer's description typically ends in ", line N" where N is
        // relative to this reader; swap it for the line number offset by the
        // lines consumed before this reader started. Only the trailing marker
        // is touched so that token text containing " line " stays intact.
        int marker = str.lastIndexOf(" line ");
        if (marker >= 0) {
          str = str.substring(0, marker) + " line "
            + (m_Lines + m_Tokenizer.lineno() - 1);
        }
      }
      throw new IOException(str);
    }

    /**
     * returns the current line number
     *
     * @return the current line number
     */
    public int getLineNo() {
      return m_Lines + m_Tokenizer.lineno();
    }

    /**
     * Gets next token, skipping empty lines.
     *
     * @throws IOException if reading the next token fails
     */
    protected void getFirstToken() throws IOException {
      while (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
        // skip empty lines
      }

      if ((m_Tokenizer.ttype == '\'') || (m_Tokenizer.ttype == '"')) {
        // quoted strings are treated like ordinary words
        m_Tokenizer.ttype = StreamTokenizer.TT_WORD;
      } else if ((m_Tokenizer.ttype == StreamTokenizer.TT_WORD)
        && (m_Tokenizer.sval.equals("?"))) {
        // an unquoted "?" marks a missing value
        m_Tokenizer.ttype = '?';
      }
    }

    /**
     * Gets index, checking for a premature end of line.
     *
     * @throws IOException if it finds a premature end of line
     */
    protected void getIndex() throws IOException {
      if (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
        errorMessage("premature end of line");
      }
      if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
        errorMessage("premature end of file");
      }
    }

    /**
     * Gets token and checks if it is the end of line.
     *
     * @param endOfFileOk whether EOF is OK
     * @throws IOException if it doesn't find an end of line
     */
    protected void getLastToken(boolean endOfFileOk) throws IOException {
      if ((m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL)
        && ((m_Tokenizer.ttype != StreamTokenizer.TT_EOF) || !endOfFileOk)) {
        errorMessage("end of line expected");
      }
    }

    /**
     * Gets the value of an instance's weight (if one exists)
     *
     * @return the value of the instance's weight, or NaN if no weight has been
     *         supplied in the file
     * @throws IOException if the weight declaration is malformed
     */
    protected double getInstanceWeight() throws IOException {
      double weight = Double.NaN;
      m_Tokenizer.nextToken();
      if (m_Tokenizer.ttype == StreamTokenizer.TT_EOL
        || m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
        return weight;
      }

      // see if we can read an instance weight
      if (m_Tokenizer.ttype == '{') {
        m_Tokenizer.nextToken();
        String weightS = m_Tokenizer.sval;

        // sval is null for anything that is not a word or quoted string
        // (e.g. "{}" or a line break right after the brace); report that as
        // a parse error instead of letting parseDouble(null) throw an NPE
        if (weightS == null) {
          errorMessage("Problem reading instance weight: number expected");
        }

        // try to parse weight as a double
        try {
          weight = Double.parseDouble(weightS);
        } catch (NumberFormatException e) {
          // quietly ignore
          return weight;
        }

        // see if we have the closing brace
        m_Tokenizer.nextToken();
        if (m_Tokenizer.ttype != '}') {
          errorMessage("Problem reading instance weight: } expected");
        }
      }

      return weight;
    }

    /**
     * Gets next token, checking for a premature end of line.
     *
     * @throws IOException if it finds a premature end of line
     */
    protected void getNextToken() throws IOException {
      if (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
        errorMessage("premature end of line");
      }
      if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
        errorMessage("premature end of file");
      } else if ((m_Tokenizer.ttype == '\'') || (m_Tokenizer.ttype == '"')) {
        m_Tokenizer.ttype = StreamTokenizer.TT_WORD;
      } else if ((m_Tokenizer.ttype == StreamTokenizer.TT_WORD)
        && (m_Tokenizer.sval.equals("?"))) {
        m_Tokenizer.ttype = '?';
      }
    }

    /**
     * Initializes the StreamTokenizer used for reading the ARFF file.
     */
    protected void initTokenizer() {
      m_Tokenizer.resetSyntax();
      m_Tokenizer.whitespaceChars(0, ' ');
      m_Tokenizer.wordChars(' ' + 1, '\u00FF');
      if (m_fieldSeparator != null) {
        m_Tokenizer.whitespaceChars(m_fieldSeparator.charAt(0),
          m_fieldSeparator.charAt(0));
      } else {
        m_Tokenizer.whitespaceChars(',', ',');
      }
      m_Tokenizer.commentChar('%');
      if (m_enclosures != null && m_enclosures.size() > 0) {
        for (String e : m_enclosures) {
          m_Tokenizer.quoteChar(e.charAt(0));
        }
      } else {
        m_Tokenizer.quoteChar('"');
        m_Tokenizer.quoteChar('\'');
      }
      m_Tokenizer.ordinaryChar('{');
      m_Tokenizer.ordinaryChar('}');
      m_Tokenizer.eolIsSignificant(true);
    }

    /**
     * Reads a single instance using the tokenizer and returns it.
     *
     * @param structure the dataset header information, will get updated in case
     *          of string or relational attributes
     * @return null if end of file has been reached
     * @throws IOException if the information is not read successfully
     */
    public Instance readInstance(Instances structure) throws IOException {
      return readInstance(structure, true);
    }

    /**
     * Reads a single instance using the tokenizer and returns it.
     *
     * @param structure the dataset header information, will get updated in case
     *          of string or relational attributes
     * @param flag if method should test for carriage return after each instance
     * @return null if end of file has been reached
     * @throws IOException if the information is not read successfully
     */
    public Instance readInstance(Instances structure, boolean flag)
      throws IOException {
      return getInstance(structure, flag);
    }

    /**
     * Reads a single instance using the tokenizer and returns it.
     *
     * @param structure the dataset header information, will get updated in case
     *          of string or relational attributes
     * @param flag if method should test for carriage return after each instance
     * @return null if end of file has been reached
     * @throws IOException if the information is not read successfully
     */
    protected Instance getInstance(Instances structure, boolean flag)
      throws IOException {
      m_Data = structure;

      // Check if any attributes have been declared.
      if (m_Data.numAttributes() == 0) {
        errorMessage("no header information available");
      }

      // Check if end of file reached.
      getFirstToken();
      if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
        return null;
      }

      // Parse instance
      if (m_Tokenizer.ttype == '{') {
        return getInstanceSparse(flag);
      } else {
        return getInstanceFull(flag);
      }
    }

    /**
     * Reads a single sparse instance using the tokenizer and returns it. The
     * opening brace has already been consumed by the caller.
     *
     * @param flag if method should test for carriage return after each instance
     * @return the instance that was read
     * @throws IOException if the information is not read successfully
     */
    protected Instance getInstanceSparse(boolean flag) throws IOException {
      int valIndex, numValues = 0, maxIndex = -1;

      // if reading incrementally, and we have string values, make sure that all
      // string attributes are initialized
      if (!m_batchMode && !m_retainStringValues && m_stringAttIndices != null) {
        for (int i = 0; i < m_stringAttIndices.size(); i++) {
          m_Data.attribute(m_stringAttIndices.get(i)).setStringValue(null);
        }
      }

      // Get values
      do {
        // Get index
        getIndex();
        if (m_Tokenizer.ttype == '}') {
          break;
        }

        // Is index valid?
        try {
          m_IndicesBuffer[numValues] =
            Integer.valueOf(m_Tokenizer.sval).intValue();
        } catch (NumberFormatException e) {
          errorMessage("index number expected");
        }
        if (m_IndicesBuffer[numValues] <= maxIndex) {
          errorMessage("indices have to be ordered");
        }
        if ((m_IndicesBuffer[numValues] < 0)
          || (m_IndicesBuffer[numValues] >= m_Data.numAttributes())) {
          errorMessage("index out of bounds");
        }
        maxIndex = m_IndicesBuffer[numValues];

        // Get value;
        getNextToken();

        // Check if value is missing.
        if (m_Tokenizer.ttype == '?') {
          m_ValueBuffer[numValues] = Utils.missingValue();
        } else {

          // Check if token is valid.
          if (m_Tokenizer.ttype != StreamTokenizer.TT_WORD) {
            errorMessage("not a valid value");
          }
          switch (m_Data.attribute(m_IndicesBuffer[numValues]).type()) {
          case Attribute.NOMINAL:
            // Check if value appears in header.
            valIndex =
              m_Data.attribute(m_IndicesBuffer[numValues]).indexOfValue(
                m_Tokenizer.sval);
            if (valIndex == -1) {
              errorMessage("nominal value not declared in header");
            }
            m_ValueBuffer[numValues] = valIndex;
            break;
          case Attribute.NUMERIC:
            // Check if value is really a number.
            try {
              m_ValueBuffer[numValues] =
                Double.valueOf(m_Tokenizer.sval).doubleValue();
            } catch (NumberFormatException e) {
              errorMessage("number expected");
            }
            break;
          case Attribute.STRING:
            if (m_batchMode || m_retainStringValues) {
              m_ValueBuffer[numValues] =
                m_Data.attribute(m_IndicesBuffer[numValues]).addStringValue(
                  m_Tokenizer.sval);
            } else {
              m_ValueBuffer[numValues] = 0;
              m_Data.attribute(m_IndicesBuffer[numValues]).addStringValue(
                m_Tokenizer.sval);
            }
            break;
          case Attribute.DATE:
            try {
              m_ValueBuffer[numValues] =
                m_Data.attribute(m_IndicesBuffer[numValues]).parseDate(
                  m_Tokenizer.sval);
            } catch (ParseException e) {
              errorMessage("unparseable date: " + m_Tokenizer.sval);
            }
            break;
          case Attribute.RELATIONAL:
            try {
              // the relational value is itself ARFF data without a header
              ArffReader arff =
                new ArffReader(new StringReader(m_Tokenizer.sval), m_Data
                  .attribute(m_IndicesBuffer[numValues]).relation(), 0);
              Instances data = arff.getData();
              m_ValueBuffer[numValues] =
                m_Data.attribute(m_IndicesBuffer[numValues]).addRelation(data);
            } catch (Exception e) {
              throw new IOException(e.toString() + " of line " + getLineNo(),
                e);
            }
            break;
          default:
            errorMessage("unknown attribute type in column "
              + m_IndicesBuffer[numValues]);
          }
        }
        numValues++;
      } while (true);

      double weight = 1.0;
      if (flag) {
        // check for an instance weight
        weight = getInstanceWeight();
        if (!Double.isNaN(weight)) {
          getLastToken(true);
        } else {
          weight = 1.0;
        }
      }

      // Add instance to dataset
      double[] tempValues = new double[numValues];
      int[] tempIndices = new int[numValues];
      System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues);
      System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
      Instance inst =
        new SparseInstance(weight, tempValues, tempIndices,
          m_Data.numAttributes());
      inst.setDataset(m_Data);

      return inst;
    }

    /**
     * Reads a single dense instance using the tokenizer and returns it. The
     * first value token has already been read by the caller.
     *
     * @param flag if method should test for carriage return after each instance
     * @return the instance that was read
     * @throws IOException if the information is not read successfully
     */
    protected Instance getInstanceFull(boolean flag) throws IOException {
      double[] instance = new double[m_Data.numAttributes()];
      int index;

      // Get values for all attributes.
      for (int i = 0; i < m_Data.numAttributes(); i++) {
        // Get next token
        if (i > 0) {
          getNextToken();
        }

        // Check if value is missing.
        if (m_Tokenizer.ttype == '?') {
          instance[i] = Utils.missingValue();
        } else {

          // Check if token is valid.
          if (m_Tokenizer.ttype != StreamTokenizer.TT_WORD) {
            errorMessage("not a valid value");
          }
          switch (m_Data.attribute(i).type()) {
          case Attribute.NOMINAL:
            // Check if value appears in header.
            index = m_Data.attribute(i).indexOfValue(m_Tokenizer.sval);
            if (index == -1) {
              errorMessage("nominal value not declared in header");
            }
            instance[i] = index;
            break;
          case Attribute.NUMERIC:
            // Check if value is really a number.
            try {
              instance[i] = Double.valueOf(m_Tokenizer.sval).doubleValue();
            } catch (NumberFormatException e) {
              errorMessage("number expected");
            }
            break;
          case Attribute.STRING:
            if (m_batchMode || m_retainStringValues) {
              instance[i] =
                m_Data.attribute(i).addStringValue(m_Tokenizer.sval);
            } else {
              instance[i] = 0;
              m_Data.attribute(i).setStringValue(m_Tokenizer.sval);
            }
            break;
          case Attribute.DATE:
            try {
              instance[i] = m_Data.attribute(i).parseDate(m_Tokenizer.sval);
            } catch (ParseException e) {
              errorMessage("unparseable date: " + m_Tokenizer.sval);
            }
            break;
          case Attribute.RELATIONAL:
            try {
              // the relational value is itself ARFF data without a header
              ArffReader arff =
                new ArffReader(new StringReader(m_Tokenizer.sval), m_Data
                  .attribute(i).relation(), 0);
              Instances data = arff.getData();
              instance[i] = m_Data.attribute(i).addRelation(data);
            } catch (Exception e) {
              throw new IOException(e.toString() + " of line " + getLineNo(),
                e);
            }
            break;
          default:
            errorMessage("unknown attribute type in column " + i);
          }
        }
      }

      double weight = 1.0;
      if (flag) {
        // check for an instance weight
        weight = getInstanceWeight();
        if (!Double.isNaN(weight)) {
          getLastToken(true);
        } else {
          weight = 1.0;
        }
      }

      // Add instance to dataset
      Instance inst = new DenseInstance(weight, instance);
      inst.setDataset(m_Data);

      return inst;
    }

    /**
     * Reads and stores header of an ARFF file.
     *
     * @param capacity the number of instances to reserve in the data structure
     * @throws IOException if the information is not read successfully
     */
    protected void readHeader(int capacity) throws IOException {
      m_Lines = 0;
      String relationName = "";

      // Get name of relation.
      getFirstToken();
      if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
        errorMessage("premature end of file");
      }
      if (Instances.ARFF_RELATION.equalsIgnoreCase(m_Tokenizer.sval)) {
        getNextToken();
        relationName = m_Tokenizer.sval;
        getLastToken(false);
      } else {
        errorMessage("keyword " + Instances.ARFF_RELATION + " expected");
      }

      // Create vectors to hold information temporarily.
      ArrayList<Attribute> attributes = new ArrayList<Attribute>();

      // Get attribute declarations.
      getFirstToken();
      if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
        errorMessage("premature end of file");
      }

      while (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(m_Tokenizer.sval)) {
        attributes = parseAttribute(attributes);
      }

      // Check if data part follows. We can't easily check for EOL.
      if (!Instances.ARFF_DATA.equalsIgnoreCase(m_Tokenizer.sval)) {
        errorMessage("keyword " + Instances.ARFF_DATA + " expected");
      }

      // Check if any attributes have been declared.
      if (attributes.size() == 0) {
        errorMessage("no attributes declared");
      }

      m_Data = new Instances(relationName, attributes, capacity);
    }

    /**
     * Parses the attribute declaration.
     *
     * @param attributes the current attributes vector
     * @return the new attributes vector
     * @throws IOException if the information is not read successfully
     */
    protected ArrayList<Attribute> parseAttribute(
      ArrayList<Attribute> attributes) throws IOException {
      String attributeName;
      ArrayList<String> attributeValues;

      // Get attribute name.
      getNextToken();
      attributeName = m_Tokenizer.sval;
      getNextToken();

      // Check if attribute is nominal.
      if (m_Tokenizer.ttype == StreamTokenizer.TT_WORD) {

        // Attribute is real, integer, or string.
        if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_REAL)
          || m_Tokenizer.sval
            .equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_INTEGER)
          || m_Tokenizer.sval
            .equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_NUMERIC)) {
          Attribute att = new Attribute(attributeName, attributes.size());
          att.setWeight(getAttributeWeight());
          attributes.add(att);
          readTillEOL();
        } else if (m_Tokenizer.sval
          .equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_STRING)) {
          Attribute att =
            new Attribute(attributeName, (ArrayList<String>) null,
              attributes.size());
          att.setWeight(getAttributeWeight());
          readTillEOL();
          attributes.add(att);
        } else if (m_Tokenizer.sval
          .equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_DATE)) {
          String format = null;
          m_Tokenizer.nextToken();
          if (m_Tokenizer.ttype == '{') {
            // No date format but it looks like there is an attribute weight
            m_Tokenizer.pushBack();
            Attribute att =
              new Attribute(attributeName, format, attributes.size());
            att.setWeight(getAttributeWeight());
            attributes.add(att);
            readTillEOL();
          } else if (m_Tokenizer.ttype != StreamTokenizer.TT_EOL) {
            // Looks like there is a date format
            if ((m_Tokenizer.ttype != StreamTokenizer.TT_WORD)
              && (m_Tokenizer.ttype != '\'') && (m_Tokenizer.ttype != '\"')) {
              errorMessage("not a valid date format");
            }
            format = m_Tokenizer.sval;
            Attribute att =
              new Attribute(attributeName, format, attributes.size());
            // Now check for attribute weight
            att.setWeight(getAttributeWeight());
            attributes.add(att);
            readTillEOL();
          } else {
            m_Tokenizer.pushBack();
            attributes.add(new Attribute(attributeName, format, attributes
              .size()));
          }
        } else if (m_Tokenizer.sval
          .equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_RELATIONAL)) {
          double weight = getAttributeWeight();
          readTillEOL();

          // Read attributes for subrelation
          // First, save current set of attributes
          ArrayList<Attribute> atts = attributes;
          attributes = new ArrayList<Attribute>();

          // Now, read attributes until we hit end of declaration of relational
          // value
          getFirstToken();
          if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
            errorMessage("premature end of file");
          }
          do {
            if (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(m_Tokenizer.sval)) {
              attributes = parseAttribute(attributes);
            } else if (Attribute.ARFF_END_SUBRELATION
              .equalsIgnoreCase(m_Tokenizer.sval)) {
              getNextToken();
              if (!attributeName.equalsIgnoreCase(m_Tokenizer.sval)) {
                errorMessage("declaration of subrelation " + attributeName
                  + " must be terminated by " + "@end " + attributeName);
              }
              break;
            } else {
              errorMessage("declaration of subrelation " + attributeName
                + " must be terminated by " + "@end " + attributeName);
            }
          } while (true);

          // Make relation and restore original set of attributes
          Instances relation = new Instances(attributeName, attributes, 0);
          attributes = atts;
          Attribute att =
            new Attribute(attributeName, relation, attributes.size());
          att.setWeight(weight);
          attributes.add(att);
        } else {
          errorMessage("no valid attribute type or invalid " + "enumeration");
        }
      } else {

        // Attribute is nominal.
        attributeValues = new ArrayList<String>();
        m_Tokenizer.pushBack();

        // Get values for nominal attribute.
        if (m_Tokenizer.nextToken() != '{') {
          errorMessage("{ expected at beginning of enumeration");
        }
        while (m_Tokenizer.nextToken() != '}') {
          // Also stop at end of file: the tokenizer keeps returning TT_EOF,
          // so without this check an unterminated enumeration at the end of
          // the stream would add null values forever.
          if (m_Tokenizer.ttype == StreamTokenizer.TT_EOL
            || m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
            errorMessage("} expected at end of enumeration");
          } else {
            attributeValues.add(m_Tokenizer.sval);
          }
        }
        Attribute att =
          new Attribute(attributeName, attributeValues, attributes.size());
        att.setWeight(getAttributeWeight());
        attributes.add(att);
        readTillEOL();
      }
      getLastToken(false);
      getFirstToken();
      if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
        errorMessage("premature end of file");
      }

      return attributes;
    }

    /**
     * Reads and skips all tokens before next end of line token. The
     * terminating token (end of line, or end of file if the stream ends
     * first) is pushed back so that the caller can inspect it.
     *
     * @throws IOException in case something goes wrong
     */
    protected void readTillEOL() throws IOException {
      while (m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
        // Without this check a stream that ends without a trailing newline
        // would spin forever, since nextToken() keeps returning TT_EOF.
        if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
          break;
        }
      }

      m_Tokenizer.pushBack();
    }

    /**
     * Gets the value of an attribute's weight (if one exists).
     *
     * @return the value of the attribute's weight, or 1.0 if no weight has been
     *         supplied in the file
     * @throws IOException if the weight declaration is malformed
     */
    protected double getAttributeWeight() throws IOException {
      double weight = 1.0;
      m_Tokenizer.nextToken();
      if (m_Tokenizer.ttype == StreamTokenizer.TT_EOL
        || m_Tokenizer.ttype == StreamTokenizer.TT_EOF) {
        m_Tokenizer.pushBack();
        return weight;
      }

      // see if we can read an attribute weight
      if (m_Tokenizer.ttype == '{') {
        m_Tokenizer.nextToken();

        // sval is null for anything that is not a word or quoted string
        // (e.g. "{}"); report it as a parse error rather than an NPE
        if (m_Tokenizer.sval == null) {
          errorMessage("Problem reading attribute weight: number expected");
        }
        try {
          weight = Double.parseDouble(m_Tokenizer.sval);
        } catch (NumberFormatException ex) {
          errorMessage("Problem reading attribute weight " + ex.getMessage());
        }
        m_Tokenizer.nextToken();
        if (m_Tokenizer.ttype != '}') {
          errorMessage("Problem reading attribute weight: } expected");
        }
      }
      return weight;
    }

    /**
     * Returns the header format
     *
     * @return the header format
     */
    public Instances getStructure() {
      return new Instances(m_Data, 0);
    }

    /**
     * Returns the data that was read
     *
     * @return the data
     */
    public Instances getData() {
      return m_Data;
    }

    /**
     * Set whether to retain the values of string attributes in memory (in the
     * header) when reading incrementally.
     *
     * @param retain true if string values are to be retained in memory when
     *          reading incrementally
     */
    public void setRetainStringValues(boolean retain) {
      m_retainStringValues = retain;
    }

    /**
     * Get whether to retain the values of string attributes in memory (in the
     * header) when reading incrementally.
     *
     * @return true if string values are to be retained in memory when reading
     *         incrementally
     */
    public boolean getRetainStringValues() {
      return m_retainStringValues;
    }

    /**
     * Returns the revision string.
     *
     * @return the revision
     */
    @Override
    public String getRevision() {
      return RevisionUtils.extract("$Revision$");
    }
  }

  /**
   * Returns a string describing this Loader
   *
   * @return a description of the Loader suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return "Reads a source that is in arff (attribute relation file format) "
      + "format. ";
  }

  /**
   * Tool tip text for this property
   *
   * @return the tool tip for this property
   */
  public String retainStringValsTipText() {
    return "If true then the values of string attributes are "
      + "retained in memory when reading incrementally. Leave this "
      + "set to false when using incremental classifiers in the "
      + "Knowledge Flow.";
  }

  /**
   * Set whether to retain the values of string attributes in memory (in the
   * header) when reading incrementally.
   *
   * @param retain true if string values are to be retained in memory when
   *          reading incrementally
   */
  public void setRetainStringVals(boolean retain) {
    m_retainStringVals = retain;
  }

  /**
   * Get whether to retain the values of string attributes in memory (in the
   * header) when reading incrementally.
   *
   * @return true if string values are to be retained in memory when reading
   *         incrementally
   */
  public boolean getRetainStringVals() {
    return m_retainStringVals;
  }

  /**
   * Get the file extension used for arff files
   *
   * @return the file extension
   */
  @Override
  public String getFileExtension() {
    return FILE_EXTENSION;
  }

  /**
   * Gets all the file extensions used for this type of file
   *
   * @return the file extensions
   */
  @Override
  public String[] getFileExtensions() {
    return new String[] { FILE_EXTENSION, FILE_EXTENSION_COMPRESSED };
  }

  /**
   * Returns a description of the file type.
   *
   * @return a short file description
   */
  @Override
  public String getFileDescription() {
    return "Arff data files";
  }

  /**
   * Resets the Loader ready to read a new data set or the same data set again.
   *
   * @throws IOException if something goes wrong
   */
  @Override
  public void reset() throws IOException {
    m_structure = null;
    m_ArffReader = null;
    setRetrieval(NONE);

    // re-open whichever source (file or url) was set last
    if (m_File != null && !(new File(m_File).isDirectory())) {
      setFile(new File(m_File));
    } else if (m_URL != null && !m_URL.equals("http://")) {
      setURL(m_URL);
    }
  }

  /**
   * Resets the Loader object and sets the source of the data set to be the
   * supplied url.
   *
   * @param url the source url.
   * @throws IOException if an error occurs
   */
  public void setSource(URL url) throws IOException {
    m_structure = null;
    setRetrieval(NONE);

    setSource(url.openStream());

    m_URL = url.toString();
    // make sure that the file is null so that any calls to
    // reset() work properly
    m_File = null;
  }

  /**
   * get the File specified as the source
   *
   * @return the source file
   */
  @Override
  public File retrieveFile() {
    return new File(m_File);
  }

  /**
   * sets the source File
   *
   * @param file the source file
   * @throws IOException if an error occurs
   */
  @Override
  public void setFile(File file) throws IOException {
    m_File = file.getPath();
    setSource(file);
  }

  /**
   * Set the url to load from
   *
   * @param url the url to load from
   * @throws IOException if the url can't be set.
   */
  @Override
  public void setURL(String url) throws IOException {
    m_URL = url;
    setSource(new URL(url));
  }

  /**
   * Return the current url
   *
   * @return the current url
   */
  @Override
  public String retrieveURL() {
    return m_URL;
  }

  /**
   * Resets the Loader object and sets the source of the data set to be the
   * supplied InputStream.
   *
   * @param in the source InputStream.
   * @throws IOException declared by the overridden method; nothing in this
   *           implementation is expected to throw it
   */
  @Override
  public void setSource(InputStream in) throws IOException {
    m_File = (new File(System.getProperty("user.dir"))).getAbsolutePath();
    m_URL = "http://";

    m_sourceReader = new BufferedReader(new InputStreamReader(in));
  }

  /**
   * Determines and returns (if possible) the structure (internally the header)
   * of the data set as an empty set of instances.
   *
   * @return the structure of the data set as an empty set of Instances
   * @throws IOException if an error occurs
   */
  @Override
  public Instances getStructure() throws IOException {

    if (m_structure == null) {
      if (m_sourceReader == null) {
        throw new IOException("No source has been specified");
      }

      try {
        m_ArffReader =
          new ArffReader(m_sourceReader, 1, (getRetrieval() == BATCH));
        m_ArffReader.setRetainStringValues(getRetainStringVals());
        m_structure = m_ArffReader.getStructure();
      } catch (Exception ex) {
        // keep the original exception as the cause for easier debugging
        throw new IOException(
          "Unable to determine structure as arff (Reason: " + ex.toString()
            + ").", ex);
      }
    }

    return new Instances(m_structure, 0);
  }

  /**
   * Return the full data set. If the structure hasn't yet been determined by a
   * call to getStructure then method should do so before processing the rest of
   * the data set.
   *
   * @return the full data set as a set of Instances
   * @throws IOException if there is no source or parsing fails
   */
  @Override
  public Instances getDataSet() throws IOException {

    Instances insts = null;
    try {
      if (m_sourceReader == null) {
        throw new IOException("No source has been specified");
      }
      if (getRetrieval() == INCREMENTAL) {
        throw new IOException(
          "Cannot mix getting Instances in both incremental and batch modes");
      }
      setRetrieval(BATCH);
      if (m_structure == null) {
        getStructure();
      }

      // Read all instances
      insts = new Instances(m_structure, 0);
      Instance inst;
      while ((inst = m_ArffReader.readInstance(m_structure)) != null) {
        insts.add(inst);
      }
    } finally {
      if (m_sourceReader != null) {
        // close the stream
        m_sourceReader.close();
      }
    }

    return insts;
  }

  /**
   * Read the data set incrementally---get the next instance in the data set or
   * returns null if there are no more instances to get. If the structure hasn't
   * yet been determined by a call to getStructure then method should do so
   * before returning the next instance in the data set.
   *
   * @param structure the dataset header information, will get updated in case
   *          of string or relational attributes
   * @return the next instance in the data set as an Instance object or null if
   *         there are no more instances to be read
   * @throws IOException if there is an error during parsing
   */
  @Override
  public Instance getNextInstance(Instances structure) throws IOException {

    m_structure = structure;

    if (getRetrieval() == BATCH) {
      throw new IOException(
        "Cannot mix getting Instances in both incremental and batch modes");
    }
    setRetrieval(INCREMENTAL);

    Instance current = null;
    if (m_sourceReader != null) {
      if (m_ArffReader == null) {
        // The header has not been parsed yet (getStructure() was never
        // called for this source). Parse it now so that the reader exists
        // and the stream is positioned at the data section, then carry on
        // with the caller's structure if one was supplied.
        m_structure = null;
        getStructure();
        if (structure != null) {
          m_structure = structure;
        }
      }
      current = m_ArffReader.readInstance(m_structure);
    }

    if ((m_sourceReader != null) && (current == null)) {
      try {
        // close the stream
        m_sourceReader.close();
        m_sourceReader = null;
      } catch (Exception ex) {
        // best effort: all data has been read, so a failure to close the
        // stream is reported but does not fail the call
        ex.printStackTrace();
      }
    }

    return current;
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision$");
  }

  /**
   * Main method.
   *
   * @param args should contain the name of an input file.
   */
  public static void main(String[] args) {
    runFileLoader(new ArffLoader(), args);
  }
}