di.uniba.it.tee2.extraction.TemporalExtractor.java Source code

Java tutorial

Introduction

Here is the source code for di.uniba.it.tee2.extraction.TemporalExtractor.java

Source

/**
 * Copyright (c) 2014, the TEE2 AUTHORS.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * Neither the name of the University of Bari nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007
 *
 */
package di.uniba.it.tee2.extraction;

import di.uniba.it.tee2.util.TEEUtils;
import de.unihd.dbs.heideltime.standalone.DocumentType;
import de.unihd.dbs.heideltime.standalone.HeidelTimeStandalone;
import de.unihd.dbs.heideltime.standalone.OutputType;
import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
import di.uniba.it.tee2.data.TaggedText;
import di.uniba.it.tee2.data.TimeEvent;
import java.io.StringReader;
import java.util.Calendar;
import java.util.Date;
import java.util.TimeZone;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/**
 * Extracts temporal expressions from text with HeidelTime and exposes them as
 * {@link TimeEvent}s attached to a {@link TaggedText}.
 *
 * <p>
 * Usage: construct with a language name, call {@link #init()} once, call
 * {@link #process(String)} for each text, then {@link #close()} when done.
 *
 * @author pierpaolo
 */
public class TemporalExtractor {

    private static final Logger logger = Logger.getLogger(TemporalExtractor.class.getName());

    /** Name of the root element searched for in the tagger output. */
    private static final String TIMEML_TAG = "TimeML";

    /** Name of the element that marks a temporal expression. */
    private static final String TIMEX3_TAG = "TIMEX3";

    /** Attribute of a TIMEX3 element holding the time value to normalize. */
    private static final String VALUE_ATTRIBUTE = "value";

    /** Created by {@link #init()} and released by {@link #close()}. */
    private HeidelTimeStandalone heidelTagger;

    private final Language langObj;

    /**
     * Creates an extractor for the given language. The tagger itself is not
     * created until {@link #init()} is called.
     *
     * @param language language name, resolved through
     * {@link Language#getLanguageFromString(String)}
     */
    public TemporalExtractor(String language) {
        langObj = Language.getLanguageFromString(language);
    }

    /**
     * Creates the HeidelTime tagger for the configured language, using the
     * NARRATIVES document type, TimeML output and the {@code config.props}
     * configuration file.
     */
    public void init() {
        heidelTagger = new HeidelTimeStandalone(langObj, DocumentType.NARRATIVES, OutputType.TIMEML,
                "config.props");
    }

    /**
     * @return the name of the language this extractor was created with
     */
    public String getLanguage() {
        return langObj.getName();
    }

    /**
     * Tags the given text and collects the temporal expressions found in it.
     *
     * <p>
     * The text is XML-escaped, tagged by HeidelTime, and the resulting TimeML
     * is parsed. The plain text is rebuilt from the direct children of each
     * {@code TimeML} element (text nodes and {@code TIMEX3} elements). Every
     * {@code TIMEX3} whose value can be normalized yields a {@link TimeEvent}
     * whose two offsets are positions in the rebuilt text.
     *
     * @param text the text to analyse; must not be {@code null}
     * @return the tagged text holding the rebuilt plain text, the raw TimeML
     * output and the extracted events
     * @throws IllegalArgumentException if {@code text} is {@code null}
     * @throws IllegalStateException if {@link #init()} has not been called, or
     * {@link #close()} has already been called
     * @throws Exception if tagging or parsing of the TimeML output fails
     */
    public TaggedText process(String text) throws Exception {
        if (text == null) {
            throw new IllegalArgumentException("text must not be null");
        }
        if (heidelTagger == null) {
            throw new IllegalStateException("TemporalExtractor is not initialized: call init() before process()");
        }
        Date currentTime = new Date();
        TaggedText taggedText = new TaggedText();
        text = StringEscapeUtils.escapeXml11(text);
        // Provisional value; replaced below by the text rebuilt from the parsed TimeML.
        taggedText.setText(text);
        String timemlOutput = heidelTagger.process(text, currentTime);
        taggedText.setTaggedText(timemlOutput);
        // NOTE(review): the parser runs with default settings. The shape of the tagger
        // output (e.g. whether it carries a DOCTYPE) is not visible here; confirm it
        // before hardening the factory against external entities.
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        org.w3c.dom.Document doc = builder.parse(new InputSource(new StringReader(timemlOutput)));

        StringBuilder sb = new StringBuilder();
        NodeList timemlNodes = doc.getElementsByTagName(TIMEML_TAG);
        for (int i = 0; i < timemlNodes.getLength(); i++) {
            NodeList children = timemlNodes.item(i).getChildNodes();
            for (int j = 0; j < children.getLength(); j++) {
                Node child = children.item(j);
                if (child.getNodeType() == Node.TEXT_NODE) {
                    sb.append(child.getTextContent());
                } else if (TIMEX3_TAG.equals(child.getNodeName())) {
                    String timeText = child.getTextContent();
                    String normalizedTime = normalizedTimeOf(child);
                    if (normalizedTime != null) {
                        // Offsets are taken before appending, so they point at the
                        // expression inside the rebuilt text.
                        TimeEvent event = new TimeEvent(sb.length(), sb.length() + timeText.length(),
                                normalizedTime);
                        event.setEventString(timeText);
                        taggedText.getEvents().add(event);
                    }
                    // The expression text is kept even when no event could be built.
                    sb.append(timeText);
                }
            }
        }
        taggedText.setText(sb.toString());
        return taggedText;
    }

    /**
     * Reads the {@code value} attribute of a TIMEX3 element and normalizes it.
     * Extraction is best effort: a missing attribute or a normalization
     * failure is logged at {@code FINE} level and reported as {@code null} so
     * that a single bad expression does not abort the whole document.
     *
     * @param timex a TIMEX3 element node
     * @return the normalized time, or {@code null} if it cannot be obtained
     */
    private static String normalizedTimeOf(Node timex) {
        Node valueAttribute = timex.getAttributes().getNamedItem(VALUE_ATTRIBUTE);
        if (valueAttribute == null) {
            logger.log(Level.FINE, "TIMEX3 element without a value attribute: {0}", timex.getTextContent());
            return null;
        }
        String timeValue = valueAttribute.getNodeValue();
        try {
            return TEEUtils.normalizeTime(timeValue);
        } catch (Exception ex) {
            logger.log(Level.FINE, "Error to normalize time: " + timeValue, ex);
            return null;
        }
    }

    /**
     * Drops the reference to the tagger and requests a garbage collection.
     * After this call {@link #init()} must be invoked again before
     * {@link #process(String)} can be used.
     */
    public void close() {
        heidelTagger = null;
        // Only a hint to the JVM; kept because the original author requested it explicitly.
        System.gc();
    }

}