org.bultreebank.labpipe.converters.LineConverter.java Source code

Introduction

Here is the source code for org.bultreebank.labpipe.converters.LineConverter.java
Source

/*
 * LABPipe - Natural Language Processing Pipeline for Bulgarian
 * Copyright (C) 2011 Institute for Information and Communication Technologies 
 * 
 * The development of this program was funded by the EuroMatrixPlus Project as 
 * part of the Seventh Framework Program of the European Commission.
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.bultreebank.labpipe.converters;

import de.dspin.data.textcorpus.Lemma;
import de.dspin.data.textcorpus.Lemmas;
import de.dspin.data.textcorpus.ObjectFactory;
import de.dspin.data.textcorpus.POStags;
import de.dspin.data.textcorpus.Sentence;
import de.dspin.data.textcorpus.Sentences;
import de.dspin.data.textcorpus.Tag;
import de.dspin.data.textcorpus.TextCorpus;
import de.dspin.data.textcorpus.Token;
import de.dspin.data.textcorpus.TokenRef;
import de.dspin.data.textcorpus.Tokens;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.bultreebank.labpipe.data.ClarkDocumentBuilder;
import org.bultreebank.labpipe.data.Conll;
import org.bultreebank.labpipe.data.WebLicht;
import org.bultreebank.labpipe.utils.DataUtils;
import org.bultreebank.labpipe.utils.Misc;
import org.bultreebank.labpipe.utils.ServiceConstants;
import org.bultreebank.labpipe.utils.XmlUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * <code>LineConverter</code> converts Line encoded data to the other 
 * <code>LABPipe</code> data formats: <code>Gaze</code>, {@link Conll}, 
 * <code>CLaRK</code> and {@link WebLicht}.
 *
 * @author Aleksandar Savkov
 */
public class LineConverter {

    private static final Logger logger = Logger.getLogger(LineConverter.class.getName());

    /**
     * Reads Line encoded data from the <code>InputStream</code> into and 
     * returns a {@link Conll} object containing its <code>String</code> 
     * representation.
     * 
     * @param   is  <code>InputStream</code> reading a Line encoded data.
     * @param   eosToken   end of sentence token
     * @param   conllMap    <code>Map</code> containing connections between full
     *                      BTB tags and their respective CoNLL forms.
     * @return  {@link Conll}
     * @throws UnsupportedEncodingException 
     * @throws FileNotFoundException
     * @throws IOException  
     */
    public static Conll toConll(InputStream is, String eosToken, Properties conllMap)
            throws UnsupportedEncodingException, FileNotFoundException, IOException {

        Conll conll = new Conll();
        ArrayList<String> conllSentence = new ArrayList();
        StringBuilder conllLine = new StringBuilder();

        BufferedReader br = new BufferedReader(new InputStreamReader(is, ServiceConstants.PIPE_CHARACTER_ENCODING));
        String line = null;
        int id = 1;

        while ((line = br.readLine()) != null) {

            if (!line.trim().startsWith(eosToken)) {

                try {
                    conllLine.append(DataUtils.lineTokenToConllToken(line, id, conllMap));
                    conllLine.append("\n");
                } catch (ArrayIndexOutOfBoundsException ex) {
                    id--;
                }
                id++;
                conllSentence.add(conllLine.toString());

            } else {

                id = 1;
                conllLine.append("\n");
                conllSentence.add(conllLine.toString());
                conll.add(conllSentence);

            }

            conllLine.setLength(0);

        }

        br.close();

        return conll;

    }

    /**
     * Reads Line encoded data from the <code>InputStream</code> into, converts
     * it to {@link Conll} object and writes its <code>String</code> 
     * representation into the <code>OutputStream</code>.
     * 
     * @param   is  <code>InputStream</code> reading a Line encoded data.
     * @param   os  <OutputStream</code> writing the <code>String</code> 
     *              representation of the resulting <code>CoNLL</code> object.
     * @param   eosToken   end of sentence token
     * @param   conllMap    <code>Map</code> containing connections between full
     *                      BTB tags and their respective CoNLL forms.
     */
    public static void toConll(InputStream is, OutputStream os, String eosToken, Properties conllMap) {

        try {

            BufferedReader br = new BufferedReader(new InputStreamReader(is));
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(os));

            String line;
            int num = 1;

            while ((line = br.readLine()) != null) {

                line = line.trim();

                if (line.startsWith("##")
                        || (!(line.contains(" ") || line.contains("\t")) && !line.contains(eosToken))) {
                    continue;
                }
                if (line.contains(eosToken)) {
                    num = 1;
                    bw.newLine();
                } else {
                    bw.write(DataUtils.lineTokenToConllToken(line, num, conllMap));
                    bw.newLine();
                    num++;
                }

            }
            bw.close();
            br.close();

        } catch (IOException ex) {
            logger.severe(ServiceConstants.EXCEPTION_IO);
        }

    }

    /**
     * Returns a converted Line encoded data from a <code>String</code> to a 
     * {@link Conll} object.
     * 
     * @param   lines  Line encoded data
     * @param   eosToken   end of sentence token
     * @param   conllMap    <code>Map</code> containing connections between full
     *                      BTB tags and their respective CoNLL forms.
     * @return  {@link Conll}
     */
    public static Conll toConll(String lines, String eosToken, Properties conllMap) {

        Conll conll = new Conll();

        int num = 1;

        ArrayList<String> sentence = new ArrayList(100);

        for (String line : lines.split("\n")) {

            line = line.trim();

            if (line.startsWith("##")
                    || (!(line.contains(" ") || line.contains("\t")) && !line.contains(eosToken))) {
                continue;
            }
            if (line.contains(eosToken)) {
                num = 1;
                sentence.trimToSize();
                conll.add(sentence);
                sentence = new ArrayList(100);
            } else {
                sentence.add(DataUtils.lineTokenToConllToken(line, num, conllMap));
                num++;
            }

        }

        sentence.trimToSize();

        if (sentence.size() > 0) {

            conll.add(sentence);

        }

        conll.trimToSize();

        return conll;
    }

    /**
     * Converts a Line encoded token into a CoNLL encoded token.
     * 
     * @param   line    Line encoded token
     * @param   id      ID of the token in CoNLL representation
     * @param   conllMap    <code>Map</code> containing connections between full
     *                      BTB tags and their respective CoNLL forms.
     * @return  {@link String} - CoNll encoded token
     * @throws ArrayIndexOutOfBoundsException  
     */
    public static String lineTokenToConllToken(String line, int id, Properties conllMap)
            throws ArrayIndexOutOfBoundsException {

        ArrayList<String> conll = new ArrayList(10);

        line = line.replaceAll(" ", "\t");

        String[] columns = line.split("\t");

        String token = columns[0];
        String tag = columns[1];
        String lemma = null;
        if (columns.length > 2) {
            lemma = columns[2];
        }

        // ID number
        conll.add(String.valueOf(id));

        // Token
        conll.add(token);

        // Lemma
        if (lemma != null) {
            conll.add(lemma);
        } else {
            conll.add("_");
        }

        // Short tag (BTB first letter)
        if (tag.contains("punct")) {
            conll.add("Punct");
        } else {
            conll.add(String.valueOf(tag.charAt(0)));
        }

        // Long tag
        if (tag.contains("punct") || tag.contains("Punct")) {
            conll.add("Punct");
        } else if (tag.length() > 2 && tag.charAt(1) != '-') {
            conll.add(tag.substring(0, 2));
        } else if (tag.length() > 2 && tag.charAt(1) == '-') {
            conll.add(String.valueOf(tag.charAt(0)));
        } else {
            conll.add(tag);
        }

        // Features (rest of the tag separated with pipe signs)
        if (conllMap.containsKey(tag)) { // using the map configuration

            conll.add(conllMap.getProperty(tag));

        } else { // tags not listed in the map -- failsafe

            if (tag.length() > 2 && !tag.contains("unct")) {

                conll.add(Misc.join(tag.substring(2).split(""), "|").substring(1));

            } else {

                conll.add("_");

            }

        }

        return Misc.join(conll, "\t");

    }

    /**
     * @throws ArrayIndexOutOfBoundsException 
     * @deprecated since v1.0
     */
    public static String toConllLine(String line, int id, Properties conllMap)
            throws ArrayIndexOutOfBoundsException {

        StringBuilder conll = new StringBuilder();

        line = line.replaceAll(" ", "\t");

        String[] columns = line.split("\t");

        String token = columns[0];
        String tag = columns[1];
        String lemma = null;
        if (columns.length > 2) {
            lemma = columns[2];
        }

        // ID number
        conll.append(id);
        conll.append("\t");

        // Token
        conll.append(token);
        conll.append("\t");

        // Lemma
        if (lemma != null) {
            conll.append(lemma);
        } else {
            conll.append("_");
        }
        conll.append("\t");

        // Short tag (BTB first letter)
        if (tag.contains("punct")) {
            conll.append("Punct");
        } else {
            conll.append(tag.charAt(0));
        }
        conll.append("\t");

        // Long tag
        if (tag.contains("punct") || tag.contains("Punct")) {
            conll.append("Punct");
        } else if (tag.length() > 2 && tag.charAt(1) != '-') {
            conll.append(tag.substring(0, 2));
        } else if (tag.length() > 2 && tag.charAt(1) == '-') {
            conll.append(tag.charAt(0));
        } else {
            conll.append(tag);
        }
        conll.append("\t");

        // Features (rest of the tag separated with pipe signs)
        if (conllMap.containsKey(tag)) { // using the map configuration

            conll.append(conllMap.getProperty(tag));

        } else { // tags not listed in the map -- failsafe

            if (tag.length() > 2 && !tag.contains("unct")) {

                conll.append(StringUtils.join(tag.substring(2).split(""), "|").substring(1));

            } else {

                conll.append("_");

            }

        }

        return conll.toString();

    }

    /**
     * Converts Line encoded data from the <code>InputStream</code> to Gaze data
     *  in the <code>OutputStream</code>.
     * 
     * @param   is  <code>InputStream</code> reading a Line encoded data.
     * @param   os  <OutputStream</code> writing <code>Gaze</code> encoded data.
     * @param   eosToken   end of sentence token
     * 
     */
    public static void toGaze(InputStream is, OutputStream os, String eosToken) {

        try {
            Misc.writeStream(new ByteArrayInputStream(
                    toGaze(Misc.readInputStream(is), eosToken).getBytes(ServiceConstants.PIPE_CHARACTER_ENCODING)),
                    os);
        } catch (UnsupportedEncodingException ex) {
            logger.severe(ServiceConstants.EXCEPTION_UNSUPPORTED_ENCODING);
        }

    }

    /**
     * Converts Line encoded data <code>String</code> to Gaze data.
     * 
     * @param   lines   Line encoded data.
     * @param   eosToken   end of sentence token
     * 
     * @return  {@link String} - Gaze data
     */
    public static String toGaze(String lines, String eosToken) {
        String[] linesAr = lines.split("\n");
        StringBuilder sb = new StringBuilder();

        for (String line : linesAr) {
            line = line.replace("\t", " ");
            String[] lineItems = line.split(" ");
            if (lineItems.length < 2 || lines.equals(eosToken)) {
                sb.append(ServiceConstants.GAZE_EOS_TOKEN);
                sb.append("\n");
                continue;
            }
            sb.append(lineItems[0]);
            sb.append("\t");
            sb.append(lineItems[1]);
            sb.append("\t");
            sb.append("O");
            sb.append("\n");
        }

        return sb.toString();
    }

    /**
     * Converts Line encoded data into a CLaRK document.
     * 
     * @param   lines   Line encoded data
     * @param   eosToken   end of sentence token
     * 
     * @return  {@link org.w3c.dom.Document} - CLaRK Document
     */
    public static Document toClark(String lines, String eosToken) {
        try {

            Document clarkDoc = ClarkDocumentBuilder.buildClarkDocument();

            BufferedReader br = new BufferedReader(new StringReader(lines));

            String line;

            Element root = clarkDoc.createElement("root");
            Element sentence = clarkDoc.createElement("s");
            Element token = null;
            String[] lineSegments;
            clarkDoc.appendChild(root);

            while ((line = br.readLine()) != null) {

                if (line.contains(eosToken)) {

                    root.appendChild(sentence);
                    sentence = clarkDoc.createElement("s");

                } else {

                    token = clarkDoc.createElement("tok");
                    line = line.replaceAll("\t", " ");
                    lineSegments = line.split(" ");
                    token.setTextContent(lineSegments[0]);
                    if (lineSegments.length > 1) {
                        token.setAttribute("svm", lineSegments[1]);
                    }
                    if (lineSegments.length > 2) {
                        token.setAttribute("lm", lineSegments[2]);
                    }
                    sentence.appendChild(token);

                }

            }

            return clarkDoc;

        } catch (IOException ex) {
            logger.severe(ServiceConstants.EXCEPTION_IO);
        }

        return null;
    }

    /**
     * Converts Line encoded data <code>InputStream</code> into a CLaRK document
     *  and prints it into the <code>OutputStream</code>.
     * 
     * @param   is  Line encoded data <code>InputStream</code>
     * @param   os  <code>OutputStream</code> containing the CLaRK document representation.
     * @param   eosToken    end of sentence token
     * 
     */
    public static void toClark(InputStream is, OutputStream os, String eosToken) {

        Document doc = toClark(Misc.readInputStream(is), eosToken);
        XmlUtils.print(doc, os);

    }

    /**
     * Converts Line encoded data <code>InputStream</code> into a 
     * {@link WebLicht} document.
     * 
     * @param   is  Line encoded data <code>InputStream</code>
     * @param   eosToken    end of sentence token
     * @param   doc {@link WebLicht} document receiving the data
     * 
     * @return {@link WebLicht}
     * 
     */
    public static WebLicht toWebLicht(InputStream is, String eosToken, WebLicht doc) {

        BufferedReader br = null;

        try {
            br = new BufferedReader(new InputStreamReader(is, ServiceConstants.PIPE_CHARACTER_ENCODING));
            ObjectFactory factory = new ObjectFactory();
            TextCorpus tc = factory.createTextCorpus();
            doc.setTextCorpus(tc);
            List tcList = tc.getTextOrTokensOrSentences();
            Tokens tokens = factory.createTokens();
            List<Token> tokenList = tokens.getToken();
            POStags tags = factory.createPOStags();
            List<Tag> tagList = tags.getTag();
            Lemmas lemmas = factory.createLemmas();
            List<Lemma> lemmaList = lemmas.getLemma();
            Sentences sentences = factory.createSentences();
            List<Sentence> sentenceList = sentences.getSentence();
            int index = 0;
            int id = 1;
            int sId = 1;
            String textStr = "";
            String line;
            Sentence sentence = factory.createSentence();
            sentence.setID("sent" + sId);
            sentence.setStart(id);
            while ((line = br.readLine()) != null) {
                Token token = factory.createToken();
                Tag tag = factory.createTag();
                Lemma lemma = factory.createLemma();
                Matcher m = Pattern.compile("[^\\s\\t]+").matcher(line);
                Matcher m2 = Pattern.compile("([^\\s\\t]+)(?:\\s+|\\t+)([^\\s\\t]+)").matcher(line);
                Matcher m3 = Pattern.compile(eosToken).matcher(line);
                Matcher m4 = Pattern.compile("([^\\s\\t]+)(?:\\s+|\\t+)([^\\s\\t]+)(?:\\s+|\\t+)([^\\s\\t]+)")
                        .matcher(line);

                if (m3.find()) {
                    sentence.setEnd(id - 1);
                    sentenceList.add(sentence);
                    sentence = factory.createSentence();
                    sId++;
                    sentence.setID("sent" + sId);
                    sentence.setStart(id);
                    continue;
                } else if (m.matches()) {
                    token.setValue(m.group());
                } else if (m4.matches()) {
                    token.setValue(m4.group(1));
                    tag.setValue(m4.group(2));
                    tag.setTokID(token);
                    tag.setID("tag." + id);
                    lemma.setValue(m4.group(3));
                    lemma.setTokID(token);
                    lemma.setID("lem." + id);
                } else if (m2.matches()) {
                    token.setValue(m2.group(1));
                    tag.setValue(m2.group(2));
                    tag.setTokID(token);
                    tag.setID("tag." + id);
                } else {
                    continue;
                }

                TokenRef tr = factory.createTokenRef();
                tr.setTokID(token);
                sentence.getTokenRef().add(tr);

                if (!token.getValue().matches(ServiceConstants.PRE_SPACE_PUNCT_SIGNS_PATTERN)) {
                    textStr = textStr.concat(" ");
                }
                textStr = textStr.concat(token.getValue());
                index = textStr.length();
                token.setID("tok." + id);
                token.setStart(index - token.getValue().length());
                token.setEnd(index - 1);
                tokenList.add(token);
                if (tag.getValue() != null) {
                    tagList.add(tag);
                }
                if (lemma.getValue() != null) {
                    lemmaList.add(lemma);
                }
                id++;

            }
            if (sentence.getTokenRef().size() > 0) {
                if (sentence.getEnd() == null) {
                    sentence.setEnd(sentence.getTokenRef().size());
                }
                sentenceList.add(sentence);
            }
            tcList.add(textStr);
            tcList.add(tokens);
            if (lemmaList.size() > 0) {
                tcList.add(lemmas);
            }
            tcList.add(tags);
            tcList.add(sentences);
            return doc;

        } catch (UnsupportedEncodingException ex) {
            logger.log(Level.SEVERE, ServiceConstants.EXCEPTION_UNSUPPORTED_ENCODING, ex);
        } catch (IOException ex) {
            logger.log(Level.SEVERE, ServiceConstants.EXCEPTION_IO, ex);
        } finally {
            try {
                br.close();
            } catch (IOException ex) {
                logger.log(Level.SEVERE, ServiceConstants.EXCEPTION_IO, ex);
            }
        }

        return null;
    }

    /**
     * Returns a {@link WebLicht} document built from the Line encoded data 
     * from the <code>InputStream</code>.
     * 
     * @param   is  Line encoded data <code>InputStream</code>
     * @param   eosToken    end of sentence token
     * 
     * @return {@link WebLicht}
     * 
     */
    public static WebLicht toWebLicht(InputStream is, String eosToken) {

        WebLicht doc = new WebLicht();
        return toWebLicht(is, eosToken, doc);

    }

}