at.tuwien.ifs.somtoolbox.data.InputDataWriter.java Source code

Java tutorial

Introduction

Here is the source code for InputDataWriter.java (package at.tuwien.ifs.somtoolbox.data).

Source

/*
 * Copyright 2004-2010 Information & Software Engineering Group (188/1)
 *                     Institute of Software Technology and Interactive Systems
 *                     Vienna University of Technology, Austria
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package at.tuwien.ifs.somtoolbox.data;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.logging.Logger;

import cern.colt.matrix.DoubleMatrix1D;

import at.tuwien.ifs.somtoolbox.SOMToolboxException;
import at.tuwien.ifs.somtoolbox.input.SOMLibFileFormatException;
import at.tuwien.ifs.somtoolbox.util.FileUtils;
import at.tuwien.ifs.somtoolbox.util.StdErrProgressWriter;
import at.tuwien.ifs.somtoolbox.util.StringUtils;

/**
 * This class gathers methods to write certain {@link InputData}, {@link TemplateVector} and
 * {@link SOMLibClassInformation} in a certain number of file formats, such as SOMLib, WEKA ARFF, SOMPak and ESOM.
 * 
 * @author Rudolf Mayer
 * @version $Id: InputDataWriter.java 3848 2010-10-12 12:29:26Z mayer $
 */
public class InputDataWriter {

    /** Supported Output File Format Types */
    public static final String[] OUTPUT_FILE_FORMAT_TYPES = { SOMLibSparseInputData.getFormatName(),
            ARFFFormatInputData.getFormatName(), RandomAccessFileSOMLibInputData.getFormatName(),
            ESOMInputData.getFormatName(), SOMPAKInputData.getFormatName(), "Orange", "CSV" };

    /**
     * Writes the data to <a href="http://databionic-esom.sourceforge.net/user.html#File_formats">ESOM lrn/cls
     * format</a>.
     */
    public static void writeAsESOM(InputData data, String fileName) throws IOException, SOMLibFileFormatException {
        String fileNameLrn = StringUtils.appendExtension(fileName, ".lrn");

        // write the header, see http://databionic-esom.sourceforge.net/user.html#File_formats
        Logger.getLogger("at.tuwien.ifs.somtoolbox")
                .info("Writing input data as ESOM file to '" + fileNameLrn + "'.");
        PrintWriter writer = FileUtils.openFileForWriting("ESOM lrn", fileNameLrn, false);
        if (org.apache.commons.lang.StringUtils.isNotBlank(data.getDataSource())) {
            writer.println("# Converted from " + data.getDataSource() + ".");
        }
        writer.println("% " + data.numVectors());
        writer.println("% " + (data.dim() + 1));
        writer.println("% 9" + StringUtils.repeatString(data.dim(), "\t 1"));
        writer.println("% Key\t" + StringUtils.interleave(data.templateVector().getLabels(), "\t"));
        for (int i = 0; i < data.numVectors(); i++) {
            writer.print(String.valueOf(i + 1)); // index in the lrn file will start with 1, make sure this is in synch
            // with ESOMMapOutputter
            for (int j = 0; j < data.dim(); j++) {
                writer.print("\t" + data.getValue(i, j));
            }
            writer.println();
        }
        writer.close();

        // write the names file
        String fileNameNames = StringUtils.appendOrReplaceExtension(fileName, ".lrn", ".names");
        Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Writing names as ESOM file to '" + fileNameNames + "'.");
        writer = FileUtils.openFileForWriting("ESOM names", fileNameNames, false);
        if (org.apache.commons.lang.StringUtils.isNotBlank(data.getDataSource())) {
            writer.println("# Converted from " + data.getDataSource() + ".");
        }
        writer.println("% " + data.numVectors());
        for (int i = 0; i < data.numVectors(); i++) {
            // index in the names file starts at 1, make sure this is in synch with lrn file and ESOMMapOutputter
            writer.println(String.valueOf(i + 1) + "\t" + data.getLabel(i));
        }
        writer.close();

        if (data.classInformation() != null) {
            // guess a good filename
            String fileNameCls = StringUtils.appendOrReplaceExtension(fileName, ".lrn", ".cls");
            Logger.getLogger("at.tuwien.ifs.somtoolbox")
                    .info("Writing class info as ESOM file to '" + fileNameCls + "'.");
            writeAsESOM(data.classInformation(), fileNameCls);
        }

    }

    /**
     * Writes the class information as <a href="http://databionic-esom.sourceforge.net/user.html#File_formats">ESOM
     * cls</a> file.
     */
    public static void writeAsESOM(SOMLibClassInformation classInfo, String fileName)
            throws IOException, SOMLibFileFormatException {
        PrintWriter writer = FileUtils.openFileForWriting("ESOM class info", fileName);
        writer.println("% " + classInfo.numData);
        // write class index => class name mapping in header
        for (int i = 0; i < classInfo.numClasses(); i++) {
            writer.println("% " + i + " " + classInfo.getClassName(i));
        }
        for (String element : classInfo.getDataNames()) {
            writer.println(element + "\t" + classInfo.getClassIndexForInput(element));
        }
        writer.flush();
        writer.close();
    }

    public static void writeAsSOMLib(InputData data, String fileName) throws IOException {
        PrintWriter writer = FileUtils.openFileForWriting("Input vector file", fileName, true);
        InputDataWriter.writeHeaderToFile(writer, data.numVectors(), data.dim());
        for (int i = 0; i < data.numVectors(); i++) {
            InputDatum inputDatum = data.getInputDatum(i);
            InputDataWriter.writeInputDatumToFile(writer, inputDatum);
        }

        writer.flush();
        writer.close();
    }

    /** Writes the class information to a file. */
    public static void writeAsSOMLib(InputData data, TemplateVector templateVector,
            SOMLibClassInformation classInformation, boolean tabSeparatedClassFile, String basicFileName)
            throws IOException, SOMLibFileFormatException {
        writeAsSOMLib(data, basicFileName + ".vec");
        if (templateVector != null) {
            InputDataWriter.writeAsSOMLib(templateVector, basicFileName + ".tv");
        }
        if (classInformation != null) {
            if (tabSeparatedClassFile) {
                InputDataWriter.writeToFileTabSeparated(classInformation, basicFileName + ".cls");
            } else {
                InputDataWriter.writeAsSOMLib(classInformation, basicFileName + ".cls");
            }
        }
    }

    /** Writes the class information to a file. */
    public static void writeAsSOMLib(InputData data, TemplateVector templateVector,
            SOMLibClassInformation classInformation, String basicFileName)
            throws IOException, SOMLibFileFormatException {
        writeAsSOMLib(data, templateVector, classInformation, false, basicFileName);
    }

    /** Writes the class information to a file in SOMLib format. */
    public static void writeAsSOMLib(SOMLibClassInformation classInfo, String fileName)
            throws IOException, SOMLibFileFormatException {
        PrintWriter writer = FileUtils.openFileForWriting("SOMLib class info", fileName);
        writer.println("$TYPE class_information");
        writer.println("$NUM_CLASSES " + classInfo.numClasses());
        writer.write("$CLASS_NAMES ");
        for (int i = 0; i < classInfo.numClasses(); i++) {
            writer.write(classInfo.getClassName(i));
            if (i + 1 < classInfo.numClasses()) {
                writer.write(" ");
            }
        }
        writer.println();
        writer.println("$XDIM 2");
        writer.println("$YDIM " + classInfo.numData);
        for (String element : classInfo.getDataNames()) {
            writer.println(element + " " + classInfo.getClassName(element));
        }

        writer.flush();
        writer.close();
    }

    /** Writes the class information to a file in SOMLib format. */
    public static void writeAsSOMLib(HashMap<String, String> classInfo, HashSet<String> classNames, String fileName)
            throws IOException, SOMLibFileFormatException {
        ArrayList<String> classNamesList = new ArrayList<String>(classNames);
        Collections.sort(classNamesList);

        PrintWriter writer = FileUtils.openFileForWriting("SOMLib class info", fileName);
        writer.println("$TYPE class_information");

        writer.println("$NUM_CLASSES " + classNames.size());
        writer.println("$CLASS_NAMES " + StringUtils.toString(classNamesList, "", "", " "));
        writer.println("$XDIM 2");
        writer.println("$YDIM " + classInfo.size());
        for (String key : classInfo.keySet()) {
            writer.println(key + " " + classNamesList.indexOf(classInfo.get(key)));
        }

        writer.flush();
        writer.close();
    }

    public static void writeAsSOMLib(TemplateVector tv, String fileName) throws IOException {
        Logger.getLogger("at.tuwien.ifs.somtoolbox")
                .info("Start writing new  template vector to '" + fileName + "'.");
        PrintWriter writer = FileUtils.openFileForWriting("Template Vector", fileName, fileName.endsWith(".gz"));
        writeTempplateHeaderToFile(writer, fileName, tv.numVectors(), tv.dim(), tv.numinfo());
        for (int i = 0; i < tv.dim(); i++) {
            writeElementToFile(writer, i, tv.getElement(i));
        }
        writer.flush();
        writer.close();
        Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Finished.");
    }

    public static void writeElementToFile(PrintWriter writer, int i, TemplateVectorElement e) {
        StringBuffer b = new StringBuffer();
        int numinfo = e.getTemplateVector().numinfo();
        if (numinfo > 2) {
            b.append(" ").append(e.getDocumentFrequency());
        }
        if (numinfo > 3) {
            b.append(" ").append(e.getCollectionTermFrequency());
        }
        if (numinfo > 4) {
            b.append(" ").append(e.getMinimumTermFrequency());
        }
        if (numinfo > 5) {
            b.append(" ").append(e.getMaximumTermFrequency());
        }
        if (numinfo > 6) {
            b.append(" ").append(e.getMeanTermFrequency());
        }
        if (e.getComment() != null) {
            b.append(" ").append(e.getComment());
        }
        writer.println(i + " " + e.getLabel() + b.toString());
    }

    /**
     * Writes input data in the SOMPAK format (see
     * http://www.cis.hut.fi/projects/somtoolbox/package/docs2/som_read_data.html)
     */
    public static void writeAsSOMPAK(InputData data, String fileName) throws IOException {
        Logger.getLogger("at.tuwien.ifs.somtoolbox")
                .info("Writing input data as SOMPAK file to '" + fileName + "'.");
        PrintWriter writer = FileUtils.openFileForWriting("SOMPAK data", fileName, false);
        // number of dimensions first, simply the number
        writer.println(data.dim());
        // now all component names
        TemplateVector tv = data.templateVector();
        if (tv == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox")
                    .info("Template vector not loaded - creating a generic one.");
            tv = new SOMLibTemplateVector(data.numVectors(), data.dim());
        }
        writer.println(
                SOMPAKInputData.INDICATOR_COMPONENTS + " " + StringUtils.toString(tv.getLabels(), "", "", " "));
        // now all data, appended by the class name
        for (int i = 0; i < data.numVectors(); i++) {
            for (int j = 0; j < data.dim(); j++) {
                writer.print(data.getValue(i, j));
                if (j + 1 < data.dim()) {
                    writer.print(" ");
                }
            }
            if (data.classInformation() != null) {
                writer.print(" " + data.classInformation().getClassName(i));
            }
            writer.println();
        }
        writer.close();
    }

    /** Writes the data to <a href="http://www.cs.waikato.ac.nz/~ml/weka/arff.html">Weka ARFF format</a>. */
    public static void writeAsWekaARFF(InputData data, String fileName, boolean writeInstanceNames,
            boolean skipInputsWithoutClass) throws IOException, SOMToolboxException {
        if (data.classInformation() == null) {
            throw new SOMToolboxException("Class Information File needed for WEKA ARFF writing");
        }
        fileName = StringUtils.ensureExtension(fileName, ".arff");
        Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Writing input data as ARFF file to '" + fileName + "'.");
        PrintWriter writer = FileUtils.openFileForWriting("Weka ARFF", fileName, false);

        TemplateVector tv = data.templateVector();
        if (tv == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox")
                    .info("Template vector not loaded - creating a generic one.");
            tv = new SOMLibTemplateVector(data.numVectors(), data.dim());
        }

        String relation = fileName.substring(0, fileName.length() - 4);
        writer.println("@RELATION " + relation + "\n");
        for (int i = 0; i < tv.dim(); i++) {
            writer.println("@ATTRIBUTE " + tv.getLabel(i) + " NUMERIC");
        }

        if (writeInstanceNames) {
            writer.println("@ATTRIBUTE instanceName STRING");
        }

        writer.println(getWekaClassHeader(data.classInformation().classNames()));

        writer.println("@DATA");
        int skipCounter = 0;
        StdErrProgressWriter progress = new StdErrProgressWriter(data.numVectors(), "Writing vector ",
                data.numVectors() / 10);
        for (int i = 0; i < data.numVectors(); i++) {
            InputDatum inputDatum = data.getInputDatum(i);
            if (skipInputsWithoutClass
                    && !data.classInformation().hasClassAssignmentForName(inputDatum.getLabel())) {
                skipCounter++;
                Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Skipping datum '" + inputDatum.getLabel()
                        + "', as it has no class assigned; skipped " + skipCounter + " so far.");
                continue;
            }
            DoubleMatrix1D vector = inputDatum.getVector();
            for (int j = 0; j < data.dim(); j++) {
                writer.print(vector.get(j) + ",");
            }

            if (writeInstanceNames) {
                writer.print("'" + StringUtils.escapeForWeka(inputDatum.getLabel()) + "',");
            }

            writer.println("'" + data.classInformation().getClassName(inputDatum.getLabel()) + "'");
            progress.progress();
        }
        writer.flush();
        writer.close();
    }

    public static String getWekaClassHeader(String[] classNames) {
        String classNamesString = "";
        for (String className : classNames) {
            if (classNamesString.length() > 0) {
                classNamesString += ",";
            }
            classNamesString += "'" + StringUtils.escapeClassNameForWeka(className) + "'";
        }
        String x = "@ATTRIBUTE class {" + classNamesString + "}";
        return x;
    }

    /**
     * Writes input data in the tab-separated format used by the Orange data mining toolkit (see
     * http://www.ailab.si/orange/)
     */
    public static void writeAsOrange(InputData data, String fileName) throws IOException {
        Logger.getLogger("at.tuwien.ifs.somtoolbox")
                .info("Writing input data as Orange file to '" + fileName + "'.");
        PrintWriter writer = FileUtils.openFileForWriting("Orange data", fileName, false);

        TemplateVector tv = data.templateVector();
        if (tv == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox")
                    .info("Template vector not loaded - creating a generic one.");
            tv = new SOMLibTemplateVector(data.numVectors(), data.dim());
        }
        SOMLibClassInformation classInformation = data.classInformation();
        boolean haveClassInfo = classInformation != null;

        /* - first the tab-separated names of the features
        - then the types of the features
        - and then the indicator whether a feature is the class assignment
        e.g. for IRIS:
         sepallength sepalwidth  petallength petalwidth  class 
         continuous  continuous  continuous  continuous  discrete
                                                         class
        */

        // row 1: tab-separated label names
        writer.print(StringUtils.toString(tv.getLabels(), "", "", "\t"));
        if (haveClassInfo) { // and optionally the class
            writer.print("\tclass");
        }
        writer.println();

        // row 2: data types
        for (int i = 0; i < tv.dim(); i++) {
            writer.print("continuous"); // all are continuous
            if (i + 1 < tv.dim()) {
                writer.print("\t");
            }
        }
        if (haveClassInfo) {
            writer.print("\tdiscrete"); // just the class is discrete
        }
        writer.println();

        // row 3: indicating options to the attributes
        writer.print(StringUtils.repeatString(tv.dim() - 1, "\t"));
        if (haveClassInfo) {
            writer.print("\tclass"); // the class attribute
        }
        writer.println();

        // now the data, tab separated, and optionally with the class assignment

        // now all data, appended by the class name
        for (int i = 0; i < data.numVectors(); i++) {
            for (int j = 0; j < data.dim(); j++) {
                writer.print(data.getValue(i, j));
                if (j + 1 < data.dim()) {
                    writer.print("\t");
                }
            }
            if (haveClassInfo) {
                writer.print("\t" + classInformation.getClassName(i));
            }
            writer.println();
        }
        writer.close();
    }

    public static void writeAsCSV(InputData data, String fileName) throws IOException {
        Logger.getLogger("at.tuwien.ifs.somtoolbox").info("Writing input data as CVS to '" + fileName + "'.");
        PrintWriter writer = FileUtils.openFileForWriting("CVS", fileName, false);

        TemplateVector tv = data.templateVector();
        if (tv == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox")
                    .info("Template vector not loaded - creating a generic one.");
            tv = new SOMLibTemplateVector(data.numVectors(), data.dim());
        }
        SOMLibClassInformation classInformation = data.classInformation();
        boolean haveClassInfo = classInformation != null;

        String separator = "\t";

        // header: tab-separated label names
        writer.print(StringUtils.toString(tv.getLabels(), "", "", separator, "\""));
        if (haveClassInfo) { // and optionally the class
            writer.print(separator + "class");
        }
        writer.println();

        // data: tab separated, optionally with the class assignment
        for (int i = 0; i < data.numVectors(); i++) {
            for (int j = 0; j < data.dim(); j++) {
                writer.print(data.getValue(i, j));
                if (j + 1 < data.dim()) {
                    writer.print(separator);
                }
            }
            if (haveClassInfo) {
                writer.print(separator + classInformation.getClassName(i));
            }
            writer.println();
        }
        writer.close();
    }

    public static void writeHeaderToFile(PrintWriter writer, int numVectors, int dim) throws IOException {
        writer.println("$TYPE vec");
        writer.println("$XDIM " + numVectors);
        writer.println("$YDIM 1");
        writer.println("$VEC_DIM " + dim);
    }

    public static void writeInputDatumToFile(PrintWriter writer, InputDatum inputDatum) throws IOException {
        InputDataWriter.writeInputDatumToFile(writer, inputDatum.getLabel(), inputDatum.getVector());
    }

    public static void writeInputDatumToFile(PrintWriter writer, String label, DoubleMatrix1D vector)
            throws IOException {
        for (int i = 0; i < vector.size(); i++) {
            if (!Double.isNaN(vector.get(i))) {
                writer.write(vector.get(i) + " ");
            } else {
                writer.write("? ");
            }
        }
        writer.println(label);
    }

    public static void writeTempplateHeaderToFile(PrintWriter writer, String fileName, final int numVectors,
            final int dim, final int numInfo) throws IOException {
        writer.println("$TYPE template");
        writer.println("$XDIM " + numInfo);
        writer.println("$YDIM " + numVectors);
        writer.println("$VEC_DIM " + dim);
    }

    /** Writes the class information to a tab-separated file. */
    public static void writeToFileTabSeparated(SOMLibClassInformation classInfo, String fileName)
            throws IOException, SOMLibFileFormatException {
        PrintWriter writer = FileUtils.openFileForWriting("Tab-separated class info", fileName);
        for (String element : classInfo.getDataNames()) {
            writer.println(element + "\t" + classInfo.getClassName(element));
        }
        writer.flush();
        writer.close();
    }

    public static void write(String fName, InputData data, String outputFormat, boolean tabSeparatedClassFile,
            boolean skipInstanceNames, boolean skipInputsWithoutClass) throws IOException, SOMToolboxException {
        Logger logger = Logger.getLogger("at.tuwien.ifs.somtoolbox");
        if (outputFormat.equals(AbstractSOMLibSparseInputData.getFormatName())) {
            logger.info("Writing SOMLib Data Format.");
            InputDataWriter.writeAsSOMLib(data, data.templateVector(), data.classInformation(),
                    tabSeparatedClassFile, fName + ".vec");
        } else if (outputFormat.equals(RandomAccessFileSOMLibInputData.getFormatName())) {
            logger.info("Writing Random Access Binary Data Format.");
            RandomAccessFileSOMLibInputData.write(data, fName);
        } else if (outputFormat.equals(ARFFFormatInputData.getFormatName())) {
            logger.info("Writing ARFF Data Format, skipping instance names: " + skipInstanceNames);
            InputDataWriter.writeAsWekaARFF(data, fName, !skipInstanceNames, skipInputsWithoutClass);
        } else if (outputFormat.equals(ESOMInputData.getFormatName())) {
            logger.info("Writing ESOM Data Format");
            InputDataWriter.writeAsESOM(data, fName);
        } else if (outputFormat.equals(SOMPAKInputData.getFormatName())) {
            logger.info("Writing SOMPAK Data Format");
            InputDataWriter.writeAsSOMPAK(data, fName);
        } else if (outputFormat.equals("Orange")) {
            logger.info("Writing Orange Data Format");
            InputDataWriter.writeAsOrange(data, fName);
        } else if (outputFormat.equals("CSV")) {
            logger.info("Writing CSV Format");
            InputDataWriter.writeAsCSV(data, fName);
        } else {
            // check for logical programming mistakes, basically
            throw new SOMToolboxException(
                    "Didn't write format of type '" + outputFormat + "', most likely a programming error.");
        }
    }
}