at.tuwien.ifs.somtoolbox.apps.helper.VectorFileToRandomAccessFileConverter.java Source code

Java tutorial

Introduction

Here is the source code for at.tuwien.ifs.somtoolbox.apps.helper.VectorFileToRandomAccessFileConverter.java

Source

/*
 * Copyright 2004-2010 Information & Software Engineering Group (188/1)
 *                     Institute of Software Technology and Interactive Systems
 *                     Vienna University of Technology, Austria
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package at.tuwien.ifs.somtoolbox.apps.helper;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;

import org.apache.commons.lang.StringUtils;

import com.martiansoftware.jsap.JSAPResult;

import at.tuwien.ifs.somtoolbox.apps.config.OptionFactory;
import at.tuwien.ifs.somtoolbox.data.InputDataFileFormatConverter;
import at.tuwien.ifs.somtoolbox.data.RandomAccessFileSOMLibInputData;
import at.tuwien.ifs.somtoolbox.data.SOMLibSparseInputData;

/**
 * Converts an input file to a binary/random access input file. This class customises the handling of data read from
 * the file by storing it in a Random Access File.
 * <p>
 * This is a specific, memory-saving implementation of a conversion that could otherwise be handled with
 * {@link InputDataFileFormatConverter}.
 * </p>
 * 
 * @author Rudolf Mayer
 * @version $Id: VectorFileToRandomAccessFileConverter.java 3589 2010-05-21 10:42:01Z mayer $
 */
public class VectorFileToRandomAccessFileConverter extends SOMLibSparseInputData {

    /** Binary output file; receives the header, then the vector values, and finally the vector labels. */
    private final RandomAccessFile file;

    /**
     * Holds the value returned by {@link RandomAccessFileSOMLibInputData#writeHeader}; while it is
     * <code>false</code>, {@link #processLine(int, String[])} attempts to write the header again.
     */
    boolean headerWritten = false;

    /**
     * Converts the given input vector file into a binary file. The output file name is the input file name with a
     * trailing <code>.gz</code> removed (if present) and <code>.bin</code> appended. An already existing output file
     * is replaced.
     * 
     * @param inputVectorFile name of the input vector file to convert.
     * @throws IOException If the output binary file can't be opened, written or closed.
     */
    public VectorFileToRandomAccessFileConverter(String inputVectorFile) throws IOException {
        String outputFile = StringUtils.chomp(inputVectorFile, ".gz") + ".bin";
        File existing = new File(outputFile);
        if (existing.exists()) {
            existing.delete();
        }
        file = new RandomAccessFile(outputFile, "rw");
        try {
            // "rw" mode does not truncate: if the delete above did not succeed, make sure no bytes of a previous
            // (possibly longer) output survive behind the newly written data.
            file.setLength(0);
            // write all the vectors
            readVectorFile(inputVectorFile, false);
            // write the vector labels in the end of the file
            RandomAccessFileSOMLibInputData.writeVectorLabels(file, dataNames);
        } finally {
            // always release the file handle, also when reading or writing failed
            file.close();
        }
    }

    /**
     * Stores the information read in the random access file. Before the first vector is written, the header (number
     * of vectors and dimensionality) is written. The element at index <code>dim</code> is taken as the vector label
     * and remembered in <code>dataNames</code>; the first <code>dim</code> elements are parsed as doubles and
     * appended to the file.
     * 
     * @param documentIndex index of the vector the line belongs to.
     * @param lineElements the elements of the line: <code>dim</code> values followed by the label.
     * @throws Exception If a value can't be parsed or written.
     */
    @Override
    protected void processLine(int documentIndex, String[] lineElements) throws Exception {
        if (!headerWritten) {
            headerWritten = RandomAccessFileSOMLibInputData.writeHeader(file, numVectors(), dim());
        }
        String label = lineElements[dim].trim();
        dataNames[documentIndex] = label;
        for (int termIndex = 0; termIndex < dim; termIndex++) {
            file.writeDouble(Double.parseDouble(lineElements[termIndex]));
        }
    }

    /**
     * Intentionally empty: the vector values are written straight to the output file in
     * {@link #processLine(int, String[])}, so nothing is stored in a matrix.
     */
    @Override
    protected void initMatrix(boolean sparse) {
        // do nothing, we don't need to store anything
    }

    /**
     * Starts the conversion
     * 
     * @param args Needed program arguments:
     *            <ul>
     *            <li>-v inputVectorFile, mandatory</li>
     *            </ul>
     * @throws IOException If the input vector file can't be read, or the output binary file can't be written.
     */
    public static void main(String[] args) throws IOException {
        JSAPResult config = OptionFactory.parseResults(args, OptionFactory.getOptInputVectorFile(true));
        String inputVectorFile = config.getString("inputVectorFile");

        // the conversion is carried out entirely by the constructor
        new VectorFileToRandomAccessFileConverter(inputVectorFile);
    }

}