de.tudarmstadt.ukp.csniper.webapp.search.xmi.XmiContextProvider.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.csniper.webapp.search.xmi.XmiContextProvider.java

Source

/*******************************************************************************
 * Copyright 2013
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universitt Darmstadt
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.csniper.webapp.search.xmi;

import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.zip.GZIPInputStream;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.uima.UIMAException;
import org.apache.uima.UIMAFramework;
import org.apache.uima.cas.impl.CASImpl;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.internal.util.Timer;
import org.apache.uima.jcas.JCas;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.fit.util.JCasUtil;
import org.xml.sax.SAXException;

import de.tudarmstadt.ukp.csniper.webapp.evaluation.model.EvaluationItem;
import de.tudarmstadt.ukp.csniper.webapp.evaluation.model.ItemContext;
import de.tudarmstadt.ukp.csniper.webapp.search.ContextProvider;
import de.tudarmstadt.ukp.csniper.webapp.search.CorpusService;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

public class XmiContextProvider implements ContextProvider {
    private static final ThreadLocal<JCasState> jcasThreadLocal = new ThreadLocal<JCasState>() {
        @Override
        protected JCasState initialValue() {
            try {
                JCasState state = new JCasState();

                // Try to get a bit more initial heap to improve performance.
                // See: https://issues.apache.org/jira/browse/UIMA-2385
                Properties props = new Properties();
                props.setProperty(UIMAFramework.CAS_INITIAL_HEAP_SIZE,
                        String.valueOf(CASImpl.DEFAULT_INITIAL_HEAP_SIZE * 4));
                state.jcas = CasCreationUtils.createCas(createTypeSystemDescription(), null, null, props).getJCas();
                return state;
            } catch (UIMAException e) {
                throw new IllegalStateException(e);
            }
        };
    };

    private static class JCasState {
        private String collectionId;
        private String documentId;
        private JCas jcas;
    }

    private static Log log = LogFactory.getLog(XmiContextProvider.class);

    private static final String XMI = "xmi";

    private boolean outputPos = true;
    private CorpusService corpusService;

    @Override
    public void setCorpusService(CorpusService aCorpusService) {
        corpusService = aCorpusService;
    }

    @Override
    public void setOutputPos(boolean showPos) {
        outputPos = showPos;
    }

    @Override
    public ItemContext getContext(EvaluationItem aItem, int aLeftSize, int aRightSize) throws IOException {
        Timer timer = new Timer();

        File base = new File(new File(corpusService.getRepositoryPath(), aItem.getCollectionId().toUpperCase()),
                XMI);
        String docId = aItem.getDocumentId();
        JCasState state = jcasThreadLocal.get();
        // FIXME sometimes cas is not being reused (because of state.documentId==null - is this only
        // available for a limited timed?)
        if ((state.documentId == null) || (state.collectionId == null)
                || !StringUtils.equals(state.documentId, docId)
                || !StringUtils.equals(state.collectionId, aItem.getCollectionId())) {
            timer.start();

            InputStream is = null;
            try {
                // No need to reset the CAS - the XmiCasDeserializer does that
                is = new GZIPInputStream(new FileInputStream(new File(base, docId + ".xmi.gz")));
                XmiCasDeserializer.deserialize(is, state.jcas.getCas());
                state.documentId = aItem.getDocumentId();
                state.collectionId = aItem.getCollectionId();
            } catch (IllegalStateException e) {
                throw new IOException(e);
            } catch (SAXException e) {
                throw new IOException(e);
            } finally {
                closeQuietly(is);
            }

            timer.stop();
            log.debug("Reading the XMI took " + timer.getTime() + "ms");
        } else {
            log.debug("Reusing CAS");
        }

        timer.reset();
        timer.start();

        // text offset based
        String text = state.jcas.getDocumentText();

        // Absolute offsets
        int windowBegin = Math.max(0, (int) aItem.getBeginOffset() - aLeftSize);
        int windowEnd = Math.min(text.length(), (int) aItem.getEndOffset() + aRightSize);

        // Relative offsets
        int unitBegin = (int) aItem.getBeginOffset() - windowBegin;
        int unitEnd = (int) aItem.getEndOffset() - windowBegin;

        StringBuilder windowText = new StringBuilder(text.substring(windowBegin, windowEnd));

        List<Token> tokens = JCasUtil.selectCovered(state.jcas, Token.class, (int) aItem.getBeginOffset(),
                (int) aItem.getEndOffset());
        int unitEndDisplacement = 0;
        int matchEndDisplacement = 0;
        int matchBeginDisplacement = 0;

        boolean anyMatchSet = false;
        int matchBeginOffset = aItem.getOriginalTextMatchBegin();
        int matchEndOffset = aItem.getOriginalTextMatchEnd();

        if (aItem.isOriginalMatchSet()) {
            matchBeginOffset = aItem.getOriginalTextMatchBegin();
            matchEndOffset = aItem.getOriginalTextMatchEnd();
            anyMatchSet = true;
        } else if (aItem.isTokenMatchSet()) {
            matchBeginOffset = tokens.get(aItem.getTokenMatchBegin()).getBegin();
            matchEndOffset = tokens.get(aItem.getTokenMatchEnd()).getEnd();
            anyMatchSet = true;
        }

        Collections.reverse(tokens);
        // default: output pos with tokens
        if (outputPos) {
            for (Token t : tokens) {
                if (t.getPos() != null && t.getPos().getPosValue() != null) {
                    String postfix = "/" + t.getPos().getPosValue();
                    windowText.insert(t.getEnd() - windowBegin, postfix);
                    unitEndDisplacement += postfix.length();
                    if (anyMatchSet) {
                        if ((t.getEnd() <= matchEndOffset) && (t.getBegin() >= matchBeginOffset)) {
                            matchEndDisplacement += postfix.length();
                        }
                        if (t.getEnd() <= matchBeginOffset) {
                            matchBeginDisplacement += postfix.length();
                        }
                    }
                }
            }
        }

        ItemContext ctx = new ItemContext(windowText.toString(), windowBegin, windowEnd, unitBegin,
                unitEnd + unitEndDisplacement);

        if (anyMatchSet) {
            ctx.setMatch(matchBeginOffset - windowBegin + matchBeginDisplacement,
                    matchEndOffset - windowBegin + matchBeginDisplacement + matchEndDisplacement);
        }

        ctx.setTextLength(text.length());

        timer.stop();
        log.debug("Extracting the context took " + timer.getTime() + "ms");

        return ctx;
    }
}