de.tudarmstadt.ukp.dkpro.wsd.senseval.reader.Semeval2AWReader.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.wsd.senseval.reader.Semeval2AWReader.java

Source

/*******************************************************************************
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universitt Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

package de.tudarmstadt.ukp.dkpro.wsd.senseval.reader;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.uima.collection.CollectionException;
import org.apache.uima.jcas.JCas;
import org.dom4j.Element;
import org.dom4j.Node;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.wsd.type.LexicalItemConstituent;
import de.tudarmstadt.ukp.dkpro.wsd.type.WSDItem;

/**
 * Semeval2AWReader reads the XML data sets for the Semeval-2 all-words
 * tasks.  Currently it does not handle component elements.
 * 
 * @author   Tristan Miller <miller@ukp.informatik.tu-darmstadt.de>
 */
public class Semeval2AWReader extends SensevalAWReader {
    private static final String SENTENCE_ELEMENT_NAME = "s";
    private static final String HEAD_ELEMENT_NAME = "head";

    @SuppressWarnings("unchecked")
    @Override
    public void getNext(JCas jCas) throws IOException, CollectionException {
        int offset = 0;
        String s = "";
        Element text = textIterator.next();

        for (Iterator<Element> sentenceIterator = text.elementIterator(SENTENCE_ELEMENT_NAME); sentenceIterator
                .hasNext();) {

            Map<String, WSDItem> wsdItems = new HashMap<String, WSDItem>();
            Map<String, LexicalItemConstituent> lics = new HashMap<String, LexicalItemConstituent>();
            Map<String, String> sats = new HashMap<String, String>();
            Element sentence = sentenceIterator.next();
            Sentence sentenceAnnotation = new Sentence(jCas);
            sentenceAnnotation.setBegin(offset);

            // Loop over all nodes to get the document text in order
            for (Iterator<Node> nodeIterator = sentence.nodeIterator(); nodeIterator.hasNext();) {

                Node node = nodeIterator.next();
                String nodeText = node.getText().replace('\n', ' ');
                String nodeName = node.getName();

                if (nodeName == null) {
                    offset += nodeText.length();
                    s += nodeText;
                    continue;
                }

                // If the node is a satellite, create a LexicalItemConstituent
                if (nodeName.equals(SATELLITE_ELEMENT_NAME)) {
                    String id = ((Element) node).attributeValue(ID_ATTRIBUTE_NAME);
                    lics.put(id,
                            newLexicalItemConstituent(jCas, id, LIC_TYPE_SATELLITE, offset, nodeText.length()));
                }

                // If the node is a head, create a LexicalItemConstituent and a WSDItem
                else if (nodeName.equals(HEAD_ELEMENT_NAME)) {
                    Element head = (Element) node;
                    String id = head.attributeValue(ID_ATTRIBUTE_NAME);
                    String satellites = head.attributeValue(SATELLITES_ATTRIBUTE_NAME);

                    lics.put(id, newLexicalItemConstituent(jCas, id, LIC_TYPE_HEAD, offset, nodeText.length()));
                    wsdItems.put(id, newWsdItem(jCas, id, LIC_TYPE_HEAD, offset, nodeText.length(),
                            head.attributeValue(POS_ATTRIBUTE_NAME), head.attributeValue(LEMMA_ATTRIBUTE_NAME)));

                    if (satellites != null)
                        sats.put(id, satellites);
                }

                // If the node is any other element, something is wrong
                else if (node.getNodeTypeName().equals("Entity") == false) {
                    throw new CollectionException("unknown_element", new Object[] { node.getName() });
                }

                offset += nodeText.length();
                s += nodeText;
            }

            // Add a sentence annotation
            sentenceAnnotation.setEnd(offset);
            sentenceAnnotation.addToIndexes();

            populateLexicalItemConstituents(jCas, wsdItems, lics, sats);
        }

        jCas.setDocumentText(s);

        try {
            setDocumentMetadata(jCas, text.attributeValue(ID_ATTRIBUTE_NAME));
        } catch (URISyntaxException e) {
            throw new IOException(e);
        }

        textCount++;
    }
}