ubic.pubmedgate.treetagger.TreeTaggerRunner.java Source code

Introduction

Here is the source code for ubic.pubmedgate.treetagger.TreeTaggerRunner.java
Source

/*
 * The WhiteText project
 * 
 * Copyright (c) 2012 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package ubic.pubmedgate.treetagger;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Corpus;
import gate.Document;
import gate.FeatureMap;
import gate.creole.ANNIEConstants;
import gate.util.OffsetComparator;
import gate.util.SimpleFeatureMapImpl;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.commons.lang.time.StopWatch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import ubic.basecode.util.FileTools;
import ubic.pubmedgate.Config;
import ubic.pubmedgate.ConnectionsDocument;
import ubic.pubmedgate.GateInterface;
import ubic.pubmedgate.editors.AbbreviationLoader;
import ubic.pubmedgate.mallet.GateReseter;

/**
 * This runs the tree tagger software with parameters from the Genia corpus.
 * 
 * @author leon
 */
public class TreeTaggerRunner {

    // make sure that GATE tokens are not duplicated - this often occurs,
    // overlapping tokens are created if the GATE
    // tokenizer is ran twice

    protected static Log log = LogFactory.getLog(TreeTaggerRunner.class);

    String annotationSetName;
    String checkText;
    boolean GATETokens;

    public TreeTaggerRunner(String annotationSetName, boolean GATETokens) {
        this.annotationSetName = annotationSetName;
        this.GATETokens = GATETokens;
    }

    /**
     * @param args
     */
    public static void main(String[] args) throws Exception {
        String annotationSet = "TreeTagger";
        // set GATE to use gate tokens instead of treetaggers tokenization,
        // untested
        boolean GATETokens = true;
        if (GATETokens) {
            annotationSet += "GATETokens";
        }

        StopWatch watch = new org.apache.commons.lang.time.StopWatch();
        watch.start();
        GateInterface p2g = new GateInterface();

        // Corpus corp = p2g.getCorp();
        // Corpus corp = p2g.getTrainingCorp();
        // Corpus corp = p2g.getNoAbbrevCorp();
        Corpus corp = p2g.getUnseenCorp();
        TreeTaggerRunner runner = new TreeTaggerRunner(annotationSet, GATETokens);

        // code to test a single document
        // Document test = ( Document ) p2g.getUnseenCorp().get( 17761 );
        // runner.runTreeTagger( new ConnectionsDocument( test ) );
        // System.exit( 1 );

        GateReseter reset = new GateReseter(GateInterface.getDocuments(corp), annotationSet);
        reset.reset();
        log.info("Done reseting");

        // 128 had a problem..
        // runner.runTreeTagger( new ConnectionsDocument( ( Document )
        // p2g.getTrainingCorp().get( 128 ) ) );
        // System.exit( 1 );

        System.out.println("Time:" + watch.toString());
        int i = 0;
        int errors = 0;
        for (ConnectionsDocument doc : GateInterface.getDocuments(corp)) {
            log.info(doc.getName());
            try {
                runner.runTreeTagger(doc);
                doc.sync();
            } catch (Exception e) {
                e.printStackTrace();
                errors++;
            }
            log.info("i:" + i++ + " Time:" + watch.toString() + " errors:" + errors);
        }

    }

    private File getFile(ConnectionsDocument document) throws Exception {
        // tokenize
        File gateTextFile = File.createTempFile("treetagger", ".txt");
        // gateTextFile.deleteOnExit();

        FileOutputStream fos = new FileOutputStream(gateTextFile);
        OutputStreamWriter osw = new OutputStreamWriter(fos);
        BufferedWriter bw = new BufferedWriter(osw);

        // get the ArticleTilte and AbstractText
        Annotation title = document.getTitle();
        String titleString = document.getAnnotationText(title);

        Annotation abstractText = document.getAbstract();
        String abstractString = document.getAnnotationText(abstractText);

        // write it
        bw.write(titleString);
        bw.newLine();
        bw.write(abstractString);

        log.info("Wrote:" + abstractString);
        // bad scope - this is per document
        checkText = titleString + System.getProperty("line.separator") + abstractString;
        // this is a fix for abstract PMID: 21294731, it uses a EM space instead
        // of a space!
        checkText = checkText.replaceAll("\u2003", " ");
        checkText = checkText.replaceAll("\u200A", " ");

        log.info("Title:" + titleString);
        log.info("abstractString:" + abstractString);

        bw.flush();
        bw.close();
        return gateTextFile;
    }

    public static String[] getTreeTaggerCommand(File inputFile) {
        // /home/leon/Desktop/WhiteTextTools/Treetagger/bin/tree-tagger -token
        // -lemma -sgml -pt-with-lemma -quiet
        // /home/leon/Desktop/WhiteTextTools/Treetagger/lib/GeniaParams.par
        String[] command = new String[8];
        command[0] = Config.config.getString("whitetext.treetagger.location");
        command[1] = "-token";
        command[2] = "-lemma";
        command[3] = "-sgml";
        command[4] = "-pt-with-lemma";
        command[5] = "-quiet";
        // param file
        command[6] = Config.config.getString("whitetext.GENIA.params");
        // input file
        command[7] = inputFile.getAbsolutePath();
        return command;
    }

    public static String[] getTokenizerCommand(File inputFile) {
        String[] command = new String[4];
        // command[0] = "/usr/bin/perl";
        command[0] = Config.config.getString("whitetext.treetagger.tokenizer"); // the
        // perl
        // script
        // -
        // tokenize.pl
        command[1] = "-w"; // replace white space with SGML
        command[2] = "-e"; // english text
        command[3] = inputFile.getAbsolutePath(); // input file
        return command;
    }

    /**
     * Based on TreeTagger.java in the GATE project
     */
    public void runTreeTagger(ConnectionsDocument document) throws Exception {
        // log.info( "Inputfile:" + inputFile.getAbsolutePath() );

        String line, word, category, lemma;
        AnnotationSet aSet;

        FeatureMap braketFeatures = new SimpleFeatureMapImpl();
        braketFeatures.put("string", ")");
        braketFeatures.put("lemma", ")");
        braketFeatures.put("category", ")");
        braketFeatures.put("length", "1");

        // remove the old annotations
        document.removeAnnotationSet(annotationSetName);

        aSet = document.getAnnotations(annotationSetName);

        // run tokenizer and save output
        File inputFile;
        if (!GATETokens) {
            inputFile = getFile(document);
            inputFile = tokenize(inputFile);
        } else {
            inputFile = gateTokenize(document);
        }
        log.info("Inputfile after tokenization:" + inputFile.getAbsolutePath());

        // run TreeTagger and save output
        ArrayList<String> lines = runTreeTagger(inputFile);

        // take one line at a time
        // check its length and the string feature and go on addition
        // features

        long position;
        if (!GATETokens) {
            position = document.getTitle().getStartNode().getOffset();
        } else {
            position = 0;
        }
        long sentenceStart = position;

        String text = new String();

        for (int i = 0; i < lines.size(); i++) {
            line = (String) lines.get(i);
            // System.out.println( line );
            StringTokenizer st = new StringTokenizer(line);

            category = null;
            lemma = null;
            word = st.nextToken();
            if (word.equals("<internal_BL>")) {
                word = " ";
            } else if (word.equals("<internal_NL>")) {
                word = System.getProperty("line.separator");
            } else {
                // if we have not sentence start, then set one
                if (sentenceStart == -1) {
                    sentenceStart = position;
                }
                if (word.startsWith("<"))
                    log.info(word);

                if (st.hasMoreTokens())
                    category = st.nextToken();
                if (category != null && st.hasMoreTokens())
                    lemma = st.nextToken();

                FeatureMap features = new SimpleFeatureMapImpl();
                features.put("string", word);
                features.put("length", word.length());

                // finally add features on the top of tokens
                if (lemma != null)
                    features.put("lemma", lemma);
                if (category != null)
                    features.put("category", category);

                long start = position;
                long end = position + word.length();

                AbbreviationLoader abbrev = new AbbreviationLoader(document);
                // so here we split into two, if we have an abbreviation we gota
                // make three tokens instead of one
                if (!GATETokens && abbrev.hasShortFormOverlap(start, end) && !word.equals(")")
                        && word.contains("(")) {
                    // turn it into three tokens
                    AnnotationSet shortAbbrevs = abbrev.getShortForms(start, end);
                    Annotation abbrevAnnotation = shortAbbrevs.iterator().next();
                    // start to
                    long abbrevStart = abbrevAnnotation.getStartNode().getOffset();
                    String endString = word.substring((int) (abbrevStart + 1 - start));
                    // log.info( endString );
                    // log.info( word );
                    // log.info( start );
                    // log.info( abbrevStart );
                    aSet.add(start, abbrevStart, ANNIEConstants.TOKEN_ANNOTATION_TYPE, features);
                    // braket
                    aSet.add(abbrevStart, abbrevStart + 1, ANNIEConstants.TOKEN_ANNOTATION_TYPE, braketFeatures);
                    // the short form
                    FeatureMap endFeatures = new SimpleFeatureMapImpl();
                    endFeatures.put("string", endString);
                    endFeatures.put("lemma", endString);
                    endFeatures.put("category", "ABBREV");
                    endFeatures.put("length", endString.length());
                    aSet.add(abbrevStart + 1, end, ANNIEConstants.TOKEN_ANNOTATION_TYPE, endFeatures);

                } else {
                    // log.info( "start=" + start + "end=" + end + " " + word );
                    aSet.add(start, end, ANNIEConstants.TOKEN_ANNOTATION_TYPE, features);
                }

                if (category.equals("SENT")) {
                    aSet.add(sentenceStart, end, ANNIEConstants.SENTENCE_ANNOTATION_TYPE, features);
                    sentenceStart = -1;
                }
            }
            text += word;
            position += word.length();
        }

        if (!text.toString().equals(checkText)) {
            log.info("Text doesnt match! something was changed in the processing, check for unicode characters");
            log.info("Original|" + checkText + "|");
            log.info("FromFile|" + text.toString() + "|");
            System.out.println("Original Size:" + checkText.length());
            System.out.println("FromFile Size:" + text.toString().length());

            for (int cPos = 0; cPos < text.length(); cPos++) {
                char file = text.charAt(cPos);
                char orig = checkText.charAt(cPos);
                if (file != orig) {
                    System.out.println("Character:" + cPos + ":file:orig");
                    System.out.println("Character:" + cPos + ":" + file + ":" + orig);
                    System.out.println("Character:" + cPos + ":" + (int) file + ":" + (int) orig);
                }
            }

            System.exit(1);
        }
    }

    public static ArrayList<String> runTreeTagger(File inputFile) throws IOException {
        String line;
        String[] command = getTreeTaggerCommand(inputFile);
        // log.info( Arrays.asList( command ) );
        Process p = Runtime.getRuntime().exec(command);

        // get tree tagger output (gate input)
        BufferedReader input = new BufferedReader(new InputStreamReader(p.getInputStream()));

        ArrayList<String> lines = new ArrayList<String>();
        while ((line = input.readLine()) != null) {
            lines.add(line);
        }
        input.close();
        p.destroy();
        return lines;
    }

    private File gateTokenize(ConnectionsDocument document) throws IOException, FileNotFoundException {
        checkText = document.getContent().toString();
        checkText = checkText.replaceAll("\u2003", " ");
        checkText = checkText.replaceAll("\u200A", " ");

        File temp = File.createTempFile("GATEtokens", ".txt");

        FileOutputStream fos = new FileOutputStream(temp);
        OutputStreamWriter osw = new OutputStreamWriter(fos);
        BufferedWriter bw = new BufferedWriter(osw);

        AnnotationSet tokens = document.getAnnotations(ConnectionsDocument.GATETOKENS);
        log.info("Token size:" + tokens.size());
        if (tokens.size() == 0) {
            log.info("Warning, no tokens in the GATETokens annotation set");
        }

        Set<String> types = new HashSet<String>();
        types.add("Token");
        types.add("SpaceToken");
        tokens = tokens.get(types);
        // sort
        List<Annotation> tokensSorted = new ArrayList<Annotation>(tokens);
        Collections.sort(tokensSorted, new OffsetComparator());
        log.info("Tokens:" + tokensSorted.size());
        for (Annotation a : tokensSorted) {
            // log.info( a.getType() );
            String line = "";
            if (a.getType().equals("Token")) {
                line = document.getAnnotationText(a);
            }
            if (a.getType().equals("SpaceToken")) {
                if (a.getFeatures().get("kind").equals("control")) {
                    line = "<internal_NL>";
                } else {
                    line = "<internal_BL>";
                }
            }
            bw.write(line);
            bw.newLine();
        }
        bw.close();
        fos.close();
        return temp;
    }

    public static File tokenize(File inputFile) throws IOException, FileNotFoundException {
        File temp = File.createTempFile("tokenizer", ".txt");
        // temp.deleteOnExit();
        String[] command = getTokenizerCommand(inputFile);
        // log.info( Arrays.asList( command ) );
        Process p = null;
        try {
            p = Runtime.getRuntime().exec(command);
        } catch (Exception e) {
            e.printStackTrace();
            log.error("Trouble executing tree tagger");
            log.error(Arrays.asList(command));
            log.error("Please set whitetext.treetagger.tokenizer");
            System.exit(1);
        }
        InputStream is = p.getInputStream();
        // is = p.getErrorStream();
        OutputStream os = new FileOutputStream(temp);
        int c;
        while ((c = is.read()) != -1) {
            os.write(c);
        }
        os.close();
        return temp;
    }
}