elh.eus.absa.NLPpipelineWrapper.java Source code

Java tutorial

Introduction

Here is the source code for elh.eus.absa.NLPpipelineWrapper.java

Source

/*
 * Copyright 2014 Elhuyar Fundazioa
    
This file is part of EliXa.
    
EliXa is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
    
EliXa is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with EliXa.  If not, see <http://www.gnu.org/licenses/>.
 */

package elh.eus.absa;

import ixa.kaflib.KAFDocument;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Properties;

import org.apache.commons.io.FileUtils;
import org.jdom2.JDOMException;

/**
 * 
 * @author isanvi
 *
 */
public final class NLPpipelineWrapper {

    /**
     * Processes a given string with the Ixa-pipe tokenizer.
     * 
     * @param String text : input text
     * @return KAFDocument : tokenized input text in kaf format
     * 
     * @throws IOException
     * @throws JDOMException
     */
    public static KAFDocument ixaPipesTok(String text, String lang, String savePath)
            throws IOException, JDOMException {
        // Regex added to correct ixa-pipes treatment of punctuation marks : 
        // <wf id="w19" sent="1" para="1" offset="76" length="4">!!??</wf>
        // <term id="t19" type="open" lemma="!!??" pos="N" morphofeat="NCMC000">
        text = text.replaceAll("([!?])", "$1 "); //\p{Po} gets too many punctuation marks.

        //kaf document to store tokenized text
        KAFDocument kaf = new KAFDocument(lang, "v1.naf");
        KAFDocument.LinguisticProcessor newLp = kaf.addLinguisticProcessor("text", "ixa-pipe-tok-" + lang,
                "v1.naf" + "-" + "elixa");
        newLp.setBeginTimestamp();
        // objects needed to call the tokenizer
        BufferedReader breader = new BufferedReader(new StringReader(text));
        Properties tokProp = setTokenizerProperties(lang, "default", "no", "no");

        // tokenizer call
        eus.ixa.ixa.pipe.tok.Annotate tokenizer = new eus.ixa.ixa.pipe.tok.Annotate(breader, tokProp);
        tokenizer.tokenizeToKAF(kaf);
        newLp.setEndTimestamp();

        breader.close();

        System.err.println("NLPpipelineWrapper::ixaPipesTok - tokenizing ready");

        kaf.save(savePath);

        return kaf;
    }

    /**
     * Processes a given string with the Ixa-pipe tokenizer.
     * 
     * @param String text : input text
     * @return KAFDocument : tokenized input text in kaf format
     * 
     * @throws IOException
     * @throws JDOMException
     */
    public static KAFDocument ixaPipesTok(String text, String lang) throws IOException, JDOMException {
        // Regex added to correct ixa-pipes treatment of punctuation marks : 
        // <wf id="w19" sent="1" para="1" offset="76" length="4">!!??</wf>
        // <term id="t19" type="open" lemma="!!??" pos="N" morphofeat="NCMC000">
        text = text.replaceAll("([!?])", "$1 "); //\p{Po} gets too many punctuation marks.

        //kaf document to store tokenized text
        KAFDocument kaf = new KAFDocument(lang, "v1.naf");
        KAFDocument.LinguisticProcessor newLp = kaf.addLinguisticProcessor("text", "ixa-pipe-tok-" + lang,
                "v1.naf" + "-" + "elixa");
        newLp.setBeginTimestamp();
        // objects needed to call the tokenizer
        BufferedReader breader = new BufferedReader(new StringReader(text));
        Properties tokProp = setTokenizerProperties(lang, "default", "no", "no");

        // tokenizer call
        eus.ixa.ixa.pipe.tok.Annotate tokenizer = new eus.ixa.ixa.pipe.tok.Annotate(breader, tokProp);
        tokenizer.tokenizeToKAF(kaf);
        newLp.setEndTimestamp();

        breader.close();

        System.err.println("NLPpipelineWrapper::ixaPipesTok - tokenizing ready");

        return kaf;
    }

    /**
     * Processes a given string with the Ixa-pipe PoS tagger.
     * 
     * @param KAFDocument tokenizedKaf: tokenized input text in KAF format
     * @param String posModelPath : path to the pos tagger model
     * 
     * @return KAFDocument : PoStagged input text in KAF format
     * 
     * @throws IOException
     * @throws JDOMException
     */
    public static KAFDocument ixaPipesPos(KAFDocument tokenizedKaf, String posModelPath)
            throws IOException, JDOMException {

        KAFDocument.LinguisticProcessor posLp = tokenizedKaf.addLinguisticProcessor("terms",
                "ixa-pipe-pos-" + FileUtilsElh.fileName(posModelPath), "v1.naf" + "-" + "elixa");
        //pos tagger parameters
        if (!FileUtilsElh.checkFile(posModelPath)) {
            System.err.println("NLPpipelineWrapper::ixaPipesPos() - provided pos model path is problematic, "
                    + "probably pos tagging will end up badly...");
        }
        Properties posProp = setPostaggerProperties(posModelPath, tokenizedKaf.getLang(), "3", "bin", "true");
        //pos tagger call
        eus.ixa.ixa.pipe.pos.Annotate postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
        posLp.setBeginTimestamp();
        postagger.annotatePOSToKAF(tokenizedKaf);
        posLp.setEndTimestamp();

        System.err.println("NLPpipelineWrapper::ixaPipesPos - pos tagging ready");

        return tokenizedKaf;

    }

    /**
     * Processes a given string with the Ixa-pipe PoS tagger.
     * 
     * @param KAFDocument tokenizedKaf: tokenized input text in KAF format
     * @param String posModelPath : path to the pos tagger model
     * 
     * @return KAFDocument : PoStagged input text in KAF format
     * 
     * @throws IOException
     * @throws JDOMException
     */
    public static String ixaPipesPosConll(KAFDocument tokenizedKaf, String posModelPath)
            throws IOException, JDOMException {

        //KAFDocument.LinguisticProcessor posLp = tokenizedKaf.addLinguisticProcessor(
        //      "terms", "ixa-pipe-pos-"+FileUtilsElh.fileName(posModelPath), "v1.naf" + "-" + "elixa");         
        //pos tagger parameters
        if (!FileUtilsElh.checkFile(posModelPath)) {
            System.err.println("NLPpipelineWrapper::ixaPipesPos() - provided pos model path is problematic, "
                    + "probably pos tagging will end up badly...");
        }
        Properties posProp = setPostaggerProperties(posModelPath, tokenizedKaf.getLang(), "3", "bin", "true");
        //pos tagger call
        eus.ixa.ixa.pipe.pos.Annotate postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
        //posLp.setBeginTimestamp();      
        return (postagger.annotatePOSToCoNLL(tokenizedKaf));
        //posLp.setEndTimestamp();

        //System.err.println("NLPpipelineWrapper::ixaPipesPos - pos tagging ready");

        //return tokenizedKaf;      

    }

    /**
     * Processes a given string with the Ixa-pipe PoS tagger.
     * 
     * @param KAFDocument tokenizedKaf: tokenized input text in KAF format
     * @param String posModelPath : path to the pos tagger model
     * 
     * @return KAFDocument : PoStagged input text in KAF format
     * 
     * @throws IOException
     * @throws JDOMException
     */
    public static KAFDocument ixaPipesPos(KAFDocument tokenizedKaf, String posModelPath,
            eus.ixa.ixa.pipe.pos.Annotate postagger) throws IOException, JDOMException {

        KAFDocument.LinguisticProcessor posLp = tokenizedKaf.addLinguisticProcessor("terms",
                "ixa-pipe-pos-" + FileUtilsElh.fileName(posModelPath), "v1.naf" + "-" + "elixa");
        posLp.setBeginTimestamp();
        postagger.annotatePOSToKAF(tokenizedKaf);
        posLp.setEndTimestamp();

        System.err.println("NLPpipelineWrapper::ixaPipesPos - pos tagging ready");

        return tokenizedKaf;
    }

    /**
     * Processes a given string with the Ixa-pipe NERC tagger.
     * 
     * @param KAFDocument tokenizedKaf: tokenized input text in KAF format
     * @param String nercModelPath : path to the NERC tagger model
     * 
     * @return KAFDocument : NERC tagged input text in KAF format
     * 
     * @throws IOException
     * @throws JDOMException
     */
    public static KAFDocument ixaPipesNERC(KAFDocument tokenizedKaf, String nercModelPath, String lexer,
            String dictTag, String dictPath) throws IOException, JDOMException {

        KAFDocument.LinguisticProcessor nercLp = tokenizedKaf.addLinguisticProcessor("entities",
                "ixa-pipe-pos-" + FileUtilsElh.fileName(nercModelPath), "v1.naf" + "-" + "elixa");
        //pos tagger parameters
        if (!FileUtilsElh.checkFile(nercModelPath)) {
            System.err.println("NLPpipelineWrapper : ixaPipesPos() - provided pos model path is problematic, "
                    + "probably pos tagging will end up badly...");
        }
        Properties nercProp = setIxaPipesNERCProperties(nercModelPath, tokenizedKaf.getLang(), lexer, dictTag,
                dictPath);
        //pos tagger call
        eus.ixa.ixa.pipe.nerc.Annotate nerctagger = new eus.ixa.ixa.pipe.nerc.Annotate(nercProp);
        nercLp.setBeginTimestamp();
        nerctagger.annotateNEsToKAF(tokenizedKaf);
        nercLp.setEndTimestamp();

        return tokenizedKaf;

        //System.err.println("pos tagging amaituta");
    }

    /**
     * Processes a given string with the Ixa-pipe NERC tagger.
     * 
     * @param KAFDocument tokenizedKaf: tokenized input text in KAF format
     * @param String nercModelPath : path to the NERC tagger model
     * 
     * @return KAFDocument : NERC tagged input text in KAF format
     * 
     * @throws IOException
     * @throws JDOMException
     */
    public static KAFDocument ixaPipesNERC(KAFDocument tokenizedKaf, String nercModelPath,
            eus.ixa.ixa.pipe.nerc.Annotate nerctagger) throws IOException, JDOMException {

        KAFDocument.LinguisticProcessor nercLp = tokenizedKaf.addLinguisticProcessor("entities",
                "ixa-pipe-pos-" + FileUtilsElh.fileName(nercModelPath), "v1.naf" + "-" + "elixa");

        //NERC tagger call
        nercLp.setBeginTimestamp();
        nerctagger.annotateNEsToKAF(tokenizedKaf);
        nercLp.setEndTimestamp();

        return tokenizedKaf;

        //System.err.println("pos tagging amaituta");
    }

    /**
     * Tokenizes and PoS tags a given string with Ixa-pipes.
     * 
     * @param String text : input text
     * @param String lang : input text language (ISO-639 code) 
     * @return KAFDocument : PoStagged input text in KAF format
     * 
     * @throws IOException
     * @throws JDOMException
     */
    public static KAFDocument ixaPipesTokPos(String text, String lang, String posModelPath)
            throws IOException, JDOMException {
        return ixaPipesPos(ixaPipesTok(text, lang), posModelPath);
    }

    /**
     * Tokenizes and PoS tags a given string with Ixa-pipes.
     * 
     * @param String text : input text
     * @param String lang : input text language (ISO-639 code) 
     * @return KAFDocument : PoStagged input text in KAF format
     * 
     * @throws IOException
     * @throws JDOMException
     */
    public static String ixaPipesTokPosConll(String text, String lang, String posModelPath)
            throws IOException, JDOMException {
        return ixaPipesPosConll(ixaPipesTok(text, lang), posModelPath);
    }

    /**
     * Tokenizes and PoS tags a given string with Ixa-pipes.
     * 
     * @param String text : input text
     * @param String lang : input text language (ISO-639 code) 
     * @return KAFDocument : PoStagged input text in KAF format
     * 
     * @throws IOException
     * @throws JDOMException
     */
    public static KAFDocument ixaPipesTokPos(String text, String lang, String posModelPath,
            eus.ixa.ixa.pipe.pos.Annotate postagger) throws IOException, JDOMException {
        return ixaPipesPos(ixaPipesTok(text, lang), posModelPath, postagger);
    }

    /**
     * 
     * Set properties for the Ixa-pipe-tok tokenizer module
     * 
     * @param language (ISO-639 code) 
     * @param normalize
     * @param untokenizable
     * @param hardParagraph
     * 
     * @return Properties props
     * 
     */
    private static Properties setTokenizerProperties(String language, String normalize, String untokenizable,
            String hardParagraph) {
        Properties props = new Properties();
        props.setProperty("language", language);
        props.setProperty("normalize", normalize);
        props.setProperty("untokenizable", untokenizable);
        props.setProperty("hardParagraph", hardParagraph);
        return props;
    }

    /**
     * 
     * Set properties for the Ixa-pipe-pos tagger module
     * 
     * @param model
     * @param language (ISO-639 code) 
     * @param beamSize
     * @param lemmatize
     * @param multiwords
     * 
     * @return Properties props
     * 
     */
    public static Properties setPostaggerProperties(String model, String language, String beamSize,
            String lemmatize, String multiwords) {
        Properties props = new Properties();
        props.setProperty("model", model);
        props.setProperty("language", language);
        props.setProperty("beamSize", beamSize);
        props.setProperty("lemmatize", lemmatize);
        //this is a work around for ixa-pipes, because it only allows multiword matching for es and gl.
        if (!language.matches("(gl|es)")) {
            multiwords = "false";
        }
        props.setProperty("multiwords", multiwords);
        return props;
    }

    /**
     * Set a Properties object with the CLI parameters for annotation.
     * @param model the model parameter
     * @param language language parameter
     * @param lexer rule based parameter
     * @param dictTag directly tag from a dictionary
     * @param dictPath directory to the dictionaries
     * @return the properties object
     */
    public static Properties setIxaPipesNERCProperties(String model, String language, String lexer, String dictTag,
            String dictPath) {
        Properties annotateProperties = new Properties();
        annotateProperties.setProperty("model", model);
        annotateProperties.setProperty("language", language);
        annotateProperties.setProperty("ruleBasedOption", lexer);
        annotateProperties.setProperty("dictTag", dictTag);
        annotateProperties.setProperty("dictPath", dictPath);
        return annotateProperties;
    }

    public static int eustaggerCall(String taggerCommand, String string, String fname) {

        try {
            File temp = new File(fname);
            //System.err.println("eustaggerCall: created temp file: "+temp.getAbsolutePath());
            BufferedWriter bw = new BufferedWriter(new FileWriter(temp));
            bw.write(string + "\n");
            bw.close();

            String[] command = { taggerCommand, temp.getName() };
            System.err.println("Eustagger agindua: " + Arrays.toString(command));

            ProcessBuilder eustBuilder = new ProcessBuilder().command(command);
            eustBuilder.directory(new File(temp.getParent()));
            //.redirectErrorStream(true);
            Process eustagger = eustBuilder.start();
            int success = eustagger.waitFor();
            //System.err.println("eustagger succesful? "+success);
            if (success != 0) {
                System.err.println("eustaggerCall: eustagger error");
            } else {
                String tagged = fname + ".kaf";
                BufferedReader reader = new BufferedReader(new InputStreamReader(eustagger.getInputStream()));
                //new Eustagger_lite outputs to stdout. Also called ixa-pipe-pos-eu
                if (taggerCommand.contains("eustagger") || taggerCommand.contains("ixa-pipe")) {
                    Files.copy(eustagger.getInputStream(), Paths.get(tagged));
                }
                // old eustagger (euslem)
                else {
                    FileUtilsElh.renameFile(temp.getAbsolutePath() + ".etiketatua3", tagged);
                }
            }
            //
            // delete all temporal files used in the process.
            temp.delete();
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return -1;
        }

        return 0;
    }

    /**
     *  Process linguistically input sentence with ixa-pipes (tokenization and PoS tagging).
     *  A tagged file is generated for each sentence in the corpus and stored in the directory
     *  given as argument. Sentence Ids are used as file names. If a tagged file already exists 
     *  that sentence is not tagged 
     * 
     * @param nafdir : path to the directory were tagged files should be stored
     * @param posModel : model to be used by the PoS tagger
     * @throws IOException
     * @throws JDOMException
     */
    public static String tagSentence(String input, String savePathNoExt, String lang, String posModel,
            eus.ixa.ixa.pipe.pos.Annotate postagger) throws IOException, JDOMException {
        KAFDocument kafinst = new KAFDocument("", "");

        if (FileUtilsElh.checkFile(savePathNoExt + ".kaf")) {
            //System.err.println("NLPpipelineWrapper::tagSentence : file already there:"+savePathNoExt+".kaf");
            return savePathNoExt + ".kaf";
        }
        // if language is basque 'posModel' argument is used to pass the path to the basque morphological analyzer eustagger 
        else if (lang.compareToIgnoreCase("eu") == 0) {
            int ok = eustaggerCall(posModel, input, savePathNoExt);
        } else {
            kafinst = ixaPipesTokPos(input, lang, posModel, postagger);
            kafinst.save(savePathNoExt + ".kaf");
        }
        return savePathNoExt + ".kaf";
    }

}