org.dkpro.core.io.xces.XcesXmlReader.java Source code

Java tutorial

Introduction

Here is the source code for org.dkpro.core.io.xces.XcesXmlReader.java

Source

/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.dkpro.core.io.xces;

import static org.apache.commons.io.IOUtils.closeQuietly;

import java.io.IOException;
import java.io.InputStream;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.bind.ValidationEvent;
import javax.xml.bind.ValidationEventHandler;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.jcas.JCas;
import org.dkpro.core.io.xces.models.XcesBody;
import org.dkpro.core.io.xces.models.XcesPara;
import org.dkpro.core.io.xces.models.XcesSentence;
import org.dkpro.core.io.xces.models.XcesToken;

import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * Collection reader for XCES XML files.
 * <p>
 * Every {@code body} element found in a file is unmarshalled with JAXB into an {@link XcesBody}
 * and converted into {@link Paragraph}, {@link Sentence}, {@link Token}, {@link Lemma} and
 * {@link POS} annotations. The document text is rebuilt from the token words: tokens of a sentence
 * are separated by a single space, every sentence is followed by a line break and every paragraph
 * by one additional line break.
 */
@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph",
        "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class XcesXmlReader extends JCasResourceCollectionReader_ImplBase {

    /**
     * Reads the next file of the collection into the given CAS.
     *
     * @param aJCas
     *            the CAS that receives the document text and the annotations.
     * @throws IOException
     *             if the file cannot be read, or if the XML stream or the JAXB set-up fails.
     * @throws CollectionException
     *             declared by the reader contract; not thrown directly by this method.
     */
    @Override
    public void getNext(JCas aJCas) throws IOException, CollectionException {

        Resource res = nextFile();
        initCas(aJCas, res);

        InputStream is = null;
        XMLEventReader xmlEventReader = null;

        try {
            is = CompressionUtils.getInputStream(res.getLocation(), res.getInputStream());

            XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
            // Corpus files are external input: do not resolve external entities (XXE).
            xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES,
                    Boolean.FALSE);
            xmlEventReader = xmlInputFactory.createXMLEventReader(is);

            JAXBContext context = JAXBContext.newInstance(XcesBody.class);
            Unmarshaller unmarshaller = context.createUnmarshaller();

            // Turn every validation event into an exception so that a malformed body is
            // rejected as a whole instead of being imported half-way.
            unmarshaller.setEventHandler(new ValidationEventHandler() {
                @Override
                public boolean handleEvent(ValidationEvent event) {
                    throw new RuntimeException(event.getMessage(), event.getLinkedException());
                }
            });

            JCasBuilder jb = new JCasBuilder(aJCas);

            XMLEvent e = null;
            // peek() yields null once the stream is exhausted.
            while ((e = xmlEventReader.peek()) != null) {

                if (isStartElement(e, "body")) {
                    try {
                        XcesBody paras = (XcesBody) unmarshaller
                                .unmarshal(xmlEventReader, XcesBody.class).getValue();
                        readPara(jb, paras);
                    } catch (RuntimeException ex) {
                        // Deliberately best-effort: a broken body is reported and skipped. The
                        // message goes to stderr and names the file so the failure is traceable.
                        System.err.println("Unable to parse XCES format in [" + res.getLocation()
                                + "]: " + ex);
                    }
                } else {
                    // Not a body start: skip the event.
                    xmlEventReader.next();
                }

            }
            jb.close();

        } catch (XMLStreamException ex1) {
            throw new IOException(ex1);
        } catch (JAXBException e1) {
            throw new IOException(e1);
        } finally {
            // XMLEventReader.close() does not close the underlying stream, so release both.
            if (xmlEventReader != null) {
                try {
                    xmlEventReader.close();
                } catch (XMLStreamException ignored) {
                    // Best-effort cleanup; a failure here must not mask an earlier exception.
                }
            }
            closeQuietly(is);
        }

    }

    /**
     * Converts an unmarshalled body into annotations and document text.
     * <p>
     * Schematic input format:
     *
     * <pre>
     * &lt;p id="p1"&gt;
     *   &lt;s id="s1"&gt;
     *     &lt;t id="t1" word="..." tag="PnDmFe03SgNmXx" lemma="..." /&gt;
     *     &lt;t id="t2" word="." tag="PTERM_P" lemma="." /&gt;
     *   &lt;/s&gt;
     * &lt;/p&gt;
     * </pre>
     *
     * @param jb
     *            builder that collects the document text and creates the annotations.
     * @param bodyObj
     *            the unmarshalled body; anything that is not an {@link XcesBody} (including
     *            {@code null}) is ignored.
     */
    private void readPara(JCasBuilder jb, Object bodyObj) {
        if (!(bodyObj instanceof XcesBody)) {
            return;
        }
        XcesBody body = (XcesBody) bodyObj;
        // The list may be unset when the body has no <p> children (depends on the model class).
        if (body.p == null) {
            return;
        }
        for (XcesPara para : body.p) {
            int paraStart = jb.getPosition();
            int paraEnd = paraStart;
            if (para.s != null) {
                for (XcesSentence s : para.s) {
                    // The paragraph ends where its last sentence ends.
                    paraEnd = addSentence(jb, s);
                }
            }
            Paragraph paragraph = new Paragraph(jb.getJCas(), paraStart, paraEnd);
            paragraph.addToIndexes();
            jb.add("\n");
        }
    }

    /**
     * Appends the tokens of one sentence to the builder and indexes the sentence annotation.
     *
     * @param jb
     *            builder that collects the document text and creates the annotations.
     * @param s
     *            the sentence to add; a missing token list is treated as an empty sentence.
     * @return the end offset of the created sentence annotation.
     */
    private int addSentence(JCasBuilder jb, XcesSentence s) {
        int sentStart = jb.getPosition();
        int sentEnd = sentStart;
        int tokenCount = s.xcesTokens == null ? 0 : s.xcesTokens.size();
        for (int i = 0; i < tokenCount; i++) {
            XcesToken t = s.xcesTokens.get(i);

            Token token = jb.add(t.word, Token.class);

            if (t.lemma != null) {
                Lemma lemma = new Lemma(jb.getJCas(), token.getBegin(), token.getEnd());
                lemma.setValue(t.lemma);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }
            if (t.tag != null) {
                POS pos = new POS(jb.getJCas(), token.getBegin(), token.getEnd());
                pos.setPosValue(t.tag);
                pos.addToIndexes();
                token.setPos(pos);
            }
            // Record the end before the separator so the sentence excludes trailing whitespace.
            sentEnd = jb.getPosition();
            boolean lastToken = i + 1 == tokenCount;
            if (lastToken) {
                jb.add("\n");
            } else {
                jb.add(" ");
            }
        }
        Sentence sent = new Sentence(jb.getJCas(), sentStart, sentEnd);
        sent.addToIndexes();
        return sent.getEnd();
    }

    /**
     * Tests whether an event is the start tag of an element with the given local name.
     *
     * @param aEvent
     *            the event to inspect.
     * @param aElement
     *            the local element name to compare against (namespace is not considered).
     * @return {@code true} if the event is a start element with that local name.
     */
    public static boolean isStartElement(XMLEvent aEvent, String aElement) {
        if (!aEvent.isStartElement()) {
            return false;
        }
        return aEvent.asStartElement().getName().getLocalPart().equals(aElement);
    }

}