de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasWriter.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasWriter.java

Source

/*******************************************************************************
 * Copyright 2013
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universitt Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.dkpro.core.io.bincas;

import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.cas.impl.Serialization.serializeCAS;
import static org.apache.uima.cas.impl.Serialization.serializeCASMgr;
import static org.apache.uima.cas.impl.Serialization.serializeWithCompression;

import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.OutputStream;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.impl.CASMgrSerializer;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;

import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;

public class BinaryCasWriter extends JCasFileWriter_ImplBase {
    /**
     * Location to write the type system to. If this is not set, a file called typesystem.xml will
     * be written to the output path. If this is set, it is expected to be a file relative to the
     * current work directory or an absolute file. <br>
     * If this parameter is set, the {@link #PARAM_COMPRESSION} parameter has no effect on the type
     * system. Instead, if the type system file should be compressed or not is detected from the
     * file name extension (e.g. ".gz"). <br>
     * If this parameter is set, the type system and index repository are no longer serialized into
     * the same file as the test of the CAS. The {@link SerializedCasReader} can currently not read
     * such files. Use this only if you really know what you are doing.
     */
    public static final String PARAM_TYPE_SYSTEM_FILE = "typeSystemFile";
    @ConfigurationParameter(name = PARAM_TYPE_SYSTEM_FILE, mandatory = false)
    private File typeSystemFile;

    public static final String PARAM_FORMAT = "format";
    @ConfigurationParameter(name = PARAM_FORMAT, mandatory = true, defaultValue = "6+")
    private String format;

    public static final String PARAM_FILENAME_SUFFIX = "filenameSuffix";
    @ConfigurationParameter(name = PARAM_FILENAME_SUFFIX, mandatory = true, defaultValue = ".bin")
    private String filenameSuffix;

    private boolean typeSystemWritten;

    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        OutputStream docOS = null;
        try {
            docOS = getOutputStream(aJCas, filenameSuffix);

            if ("0".equals(format)) {
                serializeCAS(aJCas.getCas(), docOS);
            } else if ("4".equals(format)) {
                serializeWithCompression(aJCas.getCas(), docOS);
            } else if (format.startsWith("6")) {
                if ("6+".equals(format)) {
                    writeHeader(docOS);
                    writeTypeSystem(aJCas, docOS);
                }
                serializeWithCompression(aJCas.getCas(), docOS, aJCas.getTypeSystem());
            } else {
                throw new IllegalArgumentException("Unknown format [" + format + "]. Must be 0, 4,  6, or 6+");
            }

            if (!typeSystemWritten) {
                writeTypeSystem(aJCas);
                typeSystemWritten = true;
            }
        } catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        } finally {
            closeQuietly(docOS);
        }
    }

    private void writeTypeSystem(JCas aJCas) throws IOException {
        File typeOSFile;
        if (typeSystemFile != null) {
            typeOSFile = typeSystemFile;
        } else {
            typeOSFile = getTargetPath("typesystem", ".ser");
        }

        OutputStream typeOS = null;
        try {
            typeOS = CompressionUtils.getOutputStream(typeOSFile);
            writeTypeSystem(aJCas, typeOS);
        } finally {
            closeQuietly(typeOS);
        }
    }

    private void writeHeader(OutputStream aOS) throws IOException {
        byte[] header = new byte[] { 'D', 'K', 'P', 'r', 'o', '1' };
        DataOutputStream dataOS = new DataOutputStream(aOS);
        dataOS.write(header);
        dataOS.flush();
    }

    private void writeTypeSystem(JCas aJCas, OutputStream aOS) throws IOException {
        ObjectOutputStream typeOS = new ObjectOutputStream(aOS);
        CASMgrSerializer casMgrSerializer = serializeCASMgr(aJCas.getCasImpl());
        typeOS.writeObject(casMgrSerializer);
        typeOS.flush();
    }
}