org.apache.mahout.utils.vectors.arff.ARFFVectorIterable.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.mahout.utils.vectors.arff.ARFFVectorIterable.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.utils.vectors.arff;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.Locale;

import com.google.common.io.Files;
import org.apache.commons.io.Charsets;
import org.apache.mahout.math.Vector;

/**
 * Read in ARFF (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) and create {@link Vector}s
 * <p/>
 * Attribute type handling:
 * <ul>
 * <li>Numeric -> As is</li>
 * <li>Nominal -> ordinal(value) i.e. @attribute lumber {'\'(-inf-0.5]\'','\'(0.5-inf)\''}
 * will convert -inf-0.5 -> 0, and 0.5-inf -> 1</li>
 * <li>Dates -> Convert to time as a long</li>
 * <li>Strings -> Create a map of String -> long</li>
 * </ul>
 * NOTE: This class does not set the label bindings on every vector.  If you want the label
 * bindings, call {@link MapBackedARFFModel#getLabelBindings()}, as they are the same for every vector.
 */
public class ARFFVectorIterable implements Iterable<Vector> {

    private final BufferedReader buff;
    private final ARFFModel model;

    public ARFFVectorIterable(File file, ARFFModel model) throws IOException {
        this(file, Charsets.UTF_8, model);
    }

    public ARFFVectorIterable(File file, Charset encoding, ARFFModel model) throws IOException {
        this(Files.newReader(file, encoding), model);
    }

    public ARFFVectorIterable(String arff, ARFFModel model) throws IOException {
        this(new StringReader(arff), model);
    }

    public ARFFVectorIterable(Reader reader, ARFFModel model) throws IOException {
        if (reader instanceof BufferedReader) {
            buff = (BufferedReader) reader;
        } else {
            buff = new BufferedReader(reader);
        }
        //grab the attributes, then start the iterator at the first line of data
        this.model = model;

        int labelNumber = 0;
        String line;
        while ((line = buff.readLine()) != null) {
            line = line.trim();
            if (!line.startsWith(ARFFModel.ARFF_COMMENT) && !line.isEmpty()) {
                Integer labelNumInt = labelNumber;
                String[] lineParts = line.split("[\\s\\t]+", 2);

                // is it a relation name?
                if (lineParts[0].equalsIgnoreCase(ARFFModel.RELATION)) {
                    model.setRelation(ARFFType.removeQuotes(lineParts[1]));
                }
                // or an attribute
                else if (lineParts[0].equalsIgnoreCase(ARFFModel.ATTRIBUTE)) {
                    String label;
                    ARFFType type;

                    // split the name of the attribute and its description
                    String[] attrParts = lineParts[1].split("[\\s\\t]+", 2);
                    if (attrParts.length < 2)
                        throw new UnsupportedOperationException("No type for attribute found: " + lineParts[1]);

                    // label is attribute name
                    label = ARFFType.removeQuotes(attrParts[0].toLowerCase());
                    if (attrParts[1].equalsIgnoreCase(ARFFType.NUMERIC.getIndicator())) {
                        type = ARFFType.NUMERIC;
                    } else if (attrParts[1].equalsIgnoreCase(ARFFType.INTEGER.getIndicator())) {
                        type = ARFFType.INTEGER;
                    } else if (attrParts[1].equalsIgnoreCase(ARFFType.REAL.getIndicator())) {
                        type = ARFFType.REAL;
                    } else if (attrParts[1].equalsIgnoreCase(ARFFType.STRING.getIndicator())) {
                        type = ARFFType.STRING;
                    } else if (attrParts[1].toLowerCase().startsWith(ARFFType.NOMINAL.getIndicator())) {
                        type = ARFFType.NOMINAL;
                        // nominal example:
                        // @ATTRIBUTE class        {Iris-setosa,'Iris versicolor',Iris-virginica}
                        String[] classes = ARFFIterator
                                .splitCSV(attrParts[1].substring(1, attrParts[1].length() - 1));
                        for (int i = 0; i < classes.length; i++) {
                            model.addNominal(label, ARFFType.removeQuotes(classes[i]), i + 1);
                        }
                    } else if (attrParts[1].toLowerCase().startsWith(ARFFType.DATE.getIndicator())) {
                        type = ARFFType.DATE;
                        //TODO: DateFormatter map
                        DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
                        String formStr = attrParts[1].substring(ARFFType.DATE.getIndicator().length()).trim();
                        if (!formStr.isEmpty()) {
                            if (formStr.startsWith("\"")) {
                                formStr = formStr.substring(1, formStr.length() - 1);
                            }
                            format = new SimpleDateFormat(formStr, Locale.ENGLISH);
                        }
                        model.addDateFormat(labelNumInt, format);
                        //@attribute <name> date [<date-format>]
                    } else {
                        throw new UnsupportedOperationException("Invalid attribute: " + attrParts[1]);
                    }
                    model.addLabel(label, labelNumInt);
                    model.addType(labelNumInt, type);
                    labelNumber++;
                } else if (lineParts[0].equalsIgnoreCase(ARFFModel.DATA)) {
                    break; //skip it
                }
            }
        }

    }

    @Override
    public Iterator<Vector> iterator() {
        return new ARFFIterator(buff, model);
    }

    /**
     * Returns info about the ARFF content that was parsed.
     *
     * @return the model
     */
    public ARFFModel getModel() {
        return model;
    }
}