de.tudarmstadt.ukp.csniper.resbuild.stuff.CasFlusher.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.csniper.resbuild.stuff.CasFlusher.java

Source

/*******************************************************************************
 * Copyright 2013
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.csniper.resbuild.stuff;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createPrimitiveDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createCollectionReader;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.zip.GZIPInputStream;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.impl.CASCompleteSerializer;
import org.apache.uima.cas.impl.CASImpl;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.jcas.tcas.Annotation;
import org.junit.Test;
import org.tukaani.xz.XZInputStream;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.fit.util.CasUtil;
import org.apache.uima.fit.util.JCasUtil;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;
import de.tudarmstadt.ukp.dkpro.core.io.bincas.SerializedCasReader;
import de.tudarmstadt.ukp.dkpro.core.io.tgrep.TGrepWriter;

/**
 * Debugging aid that dumps the annotations of a CAS as plain text, one line per annotation in the
 * form {@code [TypeSimpleName] (begin,end) coveredText}.
 * <p>
 * It can be used in two ways: through the static {@link #flush(File, OutputStream, int, int)}
 * method, which reads a serialized CAS from a file, or as an analysis engine in a pipeline, where
 * {@link #process(CAS)} dumps the annotations covered by one particular sentence to
 * {@code System.out}.
 */
public class CasFlusher extends CasAnnotator_ImplBase {
    /** Character encoding used for everything written to the output stream. */
    private static final String ENCODING = "UTF-8";

    /** Zero-based index of the sentence whose span {@link #process(CAS)} dumps. */
    private static final int SENTENCE_INDEX = 92;

    /**
     * Reads a serialized CAS from a file and writes one text line per annotation to the given
     * stream.
     *
     * @param aSerializedCas
     *            file holding a Java-serialized {@link CASCompleteSerializer}; a name ending in
     *            {@code .gz} or {@code .xz} is decompressed accordingly
     * @param aOutputStream
     *            stream the lines are written to (UTF-8); it is not closed by this method
     * @param aBegin
     *            begin offset of the span of interest
     * @param aEnd
     *            end offset of the span of interest; if either offset is negative, all
     *            annotations of the CAS are written instead of only the covered ones
     * @throws IOException
     *             if the file cannot be read or deserialized, or if writing fails
     */
    public static void flush(File aSerializedCas, OutputStream aOutputStream, int aBegin, int aEnd)
            throws IOException {
        CAS cas = readCas(aSerializedCas);

        Collection<AnnotationFS> annos;
        if (aBegin > -1 && aEnd > -1) {
            annos = CasUtil.selectCovered(cas, CasUtil.getType(cas, Annotation.class), aBegin, aEnd);
        } else {
            annos = CasUtil.selectAll(cas);
        }
        for (AnnotationFS anno : annos) {
            IOUtils.write(describe(anno), aOutputStream, ENCODING);
        }
    }

    /**
     * Scratch test that runs this class as an analysis engine over a locally stored serialized
     * CAS. The paths are machine-specific.
     */
    @Test
    public void test() throws IOException, UIMAException {
        //      File source = new File("D:\\hadoop\\output\\BNC\\serialized\\A\\A6\\A63.xml.ser.xz");
        //      flush(source, System.out, 29976, 30073);

        CollectionReader bincas = createCollectionReader(SerializedCasReader.class,
                //            SerializedCasReader.PARAM_PATH, "D:\\ukp\\data\\output\\BNC\\serialized\\A\\A0",
                //            SerializedCasReader.PARAM_PATH, "D:\\hadoop\\output\\BNC\\serialized\\A\\A0",
                SerializedCasReader.PARAM_PATH, "D:\\downloads",
                //            SerializedCasReader.PARAM_PATTERNS, new String[]{ "[+]**/A02.ser.xz" });
                SerializedCasReader.PARAM_PATTERNS, new String[] {
                        "[+]**/_user_dodinh_output_uima_output_attempt_201211181809_0459_m_000001_0_BNC_serialized_A_A0_A02.ser.xz" });

        AnalysisEngineDescription flush = createPrimitiveDescription(CasFlusher.class);

        // NOTE(review): this description is configured but never passed to runPipeline(...)
        // below - confirm whether the TGrep writer should be part of the pipeline.
        AnalysisEngineDescription tgrep = createPrimitiveDescription(TGrepWriter.class,
                TGrepWriter.PARAM_DROP_MALFORMED_TREES, true, TGrepWriter.PARAM_TARGET_LOCATION, "D:\\hadoop",
                TGrepWriter.PARAM_WRITE_COMMENTS, true);

        SimplePipeline.runPipeline(bincas, flush);
    }

    /**
     * Writes to {@code System.out} every annotation covered by the sentence at
     * {@link #SENTENCE_INDEX}, followed by the whitespace-normalized Penn trees covered by the
     * same span.
     * <p>
     * If the document has fewer sentences than that, all annotations and all Penn trees of the
     * CAS are written instead, mirroring what {@link #flush(File, OutputStream, int, int)} does
     * when it is given no valid span.
     *
     * @param aCas
     *            the CAS to dump
     * @throws AnalysisEngineProcessException
     *             if the JCas cannot be obtained or writing the output fails
     */
    @Override
    public void process(CAS aCas) throws AnalysisEngineProcessException {
        OutputStream out = System.out;
        try {
            Annotation sentence = findSentence(aCas, SENTENCE_INDEX);

            Collection<? extends Annotation> annos;
            Collection<PennTree> trees;
            if (sentence != null) {
                int begin = sentence.getBegin();
                int end = sentence.getEnd();
                annos = JCasUtil.selectCovered(aCas.getJCas(), Annotation.class, begin, end);
                trees = JCasUtil.selectCovered(aCas.getJCas(), PennTree.class, begin, end);
            } else {
                // No such sentence: dump everything rather than failing with an
                // IndexOutOfBoundsException on short documents.
                annos = JCasUtil.select(aCas.getJCas(), Annotation.class);
                trees = JCasUtil.select(aCas.getJCas(), PennTree.class);
            }

            for (Annotation anno : annos) {
                IOUtils.write(describe(anno), out, ENCODING);
            }
            for (PennTree pt : trees) {
                IOUtils.write(StringUtils.normalizeSpace(pt.getPennTree()), out, ENCODING);
            }
        } catch (CASException e) {
            // Report failures to the framework instead of only printing a stack trace.
            throw new AnalysisEngineProcessException(e);
        } catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Deserializes a CAS from a file, transparently decompressing {@code .gz} and {@code .xz}
     * files (decided by the file name).
     */
    private static CAS readCas(File aSerializedCas) throws IOException {
        CASImpl cas = new CASImpl();

        InputStream is = null;
        try {
            is = new FileInputStream(aSerializedCas);
            String name = aSerializedCas.getName();
            if (name.endsWith(".gz")) {
                is = new GZIPInputStream(is);
            } else if (name.endsWith(".xz")) {
                is = new XZInputStream(is);
            }
            // NOTE(review): native Java deserialization - only use on files from a trusted
            // source.
            ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
            // Track the outermost stream so that the finally block closes the whole chain.
            is = ois;
            CASCompleteSerializer serializer = (CASCompleteSerializer) ois.readObject();

            cas.reinit(serializer);
        } catch (ClassNotFoundException e) {
            throw new IOException(e);
        } finally {
            IOUtils.closeQuietly(is);
        }
        return cas;
    }

    /**
     * Returns the sentence at the given zero-based position in iteration order, or {@code null}
     * if the CAS contains fewer sentences.
     */
    private static Annotation findSentence(CAS aCas, int aIndex) throws CASException {
        int position = 0;
        for (Sentence sentence : JCasUtil.select(aCas.getJCas(), Sentence.class)) {
            if (position == aIndex) {
                return sentence;
            }
            position++;
        }
        return null;
    }

    /** Formats one annotation as {@code [TypeSimpleName] (begin,end) coveredText} plus newline. */
    private static String describe(AnnotationFS aAnnotation) {
        StringBuilder sb = new StringBuilder();
        sb.append("[").append(aAnnotation.getClass().getSimpleName()).append("] ");
        sb.append("(").append(aAnnotation.getBegin()).append(",").append(aAnnotation.getEnd()).append(") ");
        sb.append(aAnnotation.getCoveredText()).append("\n");
        return sb.toString();
    }
}