pltag.parser.semantics.SemanticLexicon.java Source code

Introduction

Here is the source code for pltag.parser.semantics.SemanticLexicon.java
Source

/* 
 * Copyright (C) 2015 ikonstas
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package pltag.parser.semantics;

import fig.basic.IOUtils;
import fig.basic.Indexer;
import fig.basic.LogInfo;
import fig.basic.Pair;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.collections4.map.MultiValueMap;
import pltag.corpus.ElementaryStringTree;
import pltag.corpus.IdGenerator;
import pltag.corpus.TagNodeType;
import pltag.parser.Lexicon;
import pltag.parser.Options;
import pltag.util.Utils;

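/**
 * A {@link Lexicon} whose entries are {@link LexiconEntryWithRoles} instances, i.e. lexicon trees
 * that additionally carry semantic role information. An optional second lexicon
 * ({@code lexiconWithAllRoles}) can be supplied so that entries are enriched with the roles
 * recorded there for the same tree.
 *
 * @author konstas
 */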
public class SemanticLexicon extends Lexicon {

    /** Optional companion lexicon consulted when merging entries; {@code null} unless passed to the constructor. */
    SemanticLexicon lexiconWithAllRoles;
    /** Indexer passed on to every {@link LexiconEntryWithRoles} created while reading the lexicon. */
    private final Indexer<String> roleIndexer;

    /**
     * Builds a semantic lexicon that has no companion all-roles lexicon.
     *
     * @param opts            parser options, forwarded to the base class
     * @param listOfFreqWords frequent-word list, forwarded to the base class
     * @param roleIndexer     indexer used when constructing role-aware entries
     */
    public SemanticLexicon(Options opts, Set<String> listOfFreqWords, Indexer<String> roleIndexer) {
        super(opts, listOfFreqWords);
        this.roleIndexer = roleIndexer;
    }

    /**
     * Builds a semantic lexicon backed by a companion lexicon whose role assignments are copied
     * into matching entries of this lexicon.
     *
     * @param opts                parser options, forwarded to the base class
     * @param listOfFreqWords     frequent-word list, forwarded to the base class
     * @param roleIndexer         indexer used when constructing role-aware entries
     * @param lexiconWithAllRoles companion lexicon; may be {@code null}
     */
    public SemanticLexicon(Options opts, Set<String> listOfFreqWords, Indexer<String> roleIndexer,
            SemanticLexicon lexiconWithAllRoles) {
        this(opts, listOfFreqWords, roleIndexer);
        this.lexiconWithAllRoles = lexiconWithAllRoles;
    }

    /**
     * Reads the lexicon via {@code read(filename)} and rebuilds {@code lexEntriesTree} from the
     * first map of the result.
     *
     * @param filename argument handed unchanged to {@code read(String)}
     */
    @Override
    public void processLexicon(String filename) {
        MultiValueMap<String, ?> rawEntries = read(filename)[0];
        lexEntriesTree = makeLexTrees(rawEntries, null);
    }

    /**
     * Rebuilds {@code lexEntriesTree} from in-memory lexicon lines.
     *
     * @param lines lexicon lines; a single empty string denotes an empty lexicon
     *              (occasionally the case for a predicted lexicon)
     */
    @Override
    public void processLexicon(String[] lines) {
        // An empty lexicon short-circuits to an empty map instead of being parsed.
        if (lines.length == 1 && lines[0].equals("")) {
            lexEntriesTree = new MultiValueMap<String, String>();
            return;
        }
        MultiValueMap<String, ?> rawEntries = read(lines)[0];
        lexEntriesTree = makeLexTrees(rawEntries, null);
    }

    /**
     * Prunes {@code noOfTrees} and adds tree-family entries to {@code lexEntriesTree}.
     * <ul>
     * <li>Keys that do not contain {@code "LEXEME"} are removed.</li>
     * <li>Keys with frequency below 5 are removed.</li>
     * <li>For keys with frequency of at least 100, the key is split on tab and every lexicon
     * entry whose unlexicalised entry equals the second field is registered under the first
     * field.</li>
     * </ul>
     * When {@code writeToDisk} is set, one {@code frequency<TAB>key} line per surviving
     * {@code "LEXEME"} key is written to {@code family_size_lexicon.txt}.
     *
     * @param writeToDisk whether to dump the family sizes to disk
     */
    @Override
    protected void extractFamilyLexicon(boolean writeToDisk) {
        Writer unlexSizeWriter = null;
        try {
            unlexSizeWriter = writeToDisk ? IOUtils.openOutEasy("family_size_lexicon.txt") : null;
            // Iterate over a snapshot because keys are removed from noOfTrees inside the loop.
            Collection<String> keyset = new ArrayList<String>(noOfTrees.keySet());
            for (String key : keyset) {
                if (!key.contains("LEXEME")) {
                    noOfTrees.remove(key);
                    continue;
                }
                Integer frequency = noOfTrees.get(key);
                String val = frequency.toString();
                if (unlexSizeWriter != null) {
                    unlexSizeWriter.append(val).append("\t").append(key).append("\n");
                }
                if (frequency < 5) {
                    noOfTrees.remove(key);
                } else if (frequency >= 100) {
                    String[] posUnlexTree = key.split("\t");
                    // Collected separately so lexEntriesTree is not modified while iterating its values.
                    MultiValueMap temp = new MultiValueMap();
                    for (Object obj : lexEntriesTree.values()) {
                        LexiconEntryWithRoles entry = (LexiconEntryWithRoles) obj;
                        if (entry.getUnlexEntry().equals(posUnlexTree[1])) {
                            updateEntryWithRolesMap(temp, posUnlexTree[0], entry);
                        }
                    }
                    lexEntriesTree.putAll(temp);
                }
            }
        } catch (IOException e) {
            LogInfo.error(e);
        } finally {
            // Close the writer even when an append failed, so the file handle is not leaked.
            if (unlexSizeWriter != null) {
                try {
                    unlexSizeWriter.close();
                } catch (IOException e) {
                    LogInfo.error(e);
                }
            }
        }
    }

    /**
     * Fills {@code rootNodeTreeMap}: for every lexicon entry, the root category of its
     * unlexicalised tree is mapped to a (word, entry string) pair.
     */
    @Override
    protected void extractVerificationTrees() {
        for (String word : lexEntriesTree.keySet()) {
            Collection<LexiconEntryWithRoles> entriesForWord = (Collection<LexiconEntryWithRoles>) lexEntriesTree
                    .getCollection(word);
            for (LexiconEntryWithRoles entry : entriesForWord) {
                String unlexEntry = entry.getUnlexEntry();
                rootNodeTreeMap.put(getRootCategory(unlexEntry), new Pair(word, entry.toString()));
            }
        }
    }

    /**
     * Parses lexicon lines into role-aware entries keyed on the anchor lexeme.
     * <p>
     * Every line is normalised with {@code Utils.getCatInventory} and split on runs of tabs.
     * Field 0 is the frequency, field 1 the anchor (possibly carrying a semantic frame suffix)
     * and field 3 the tree string. Lines with fewer than four fields are skipped (and reported
     * when {@code opts.verbose} is set) instead of failing on the missing field.
     * <p>
     * Besides building the returned map, the method updates {@code biWordMap},
     * {@code posTagNo} and {@code noOfTrees}; the latter receives frequencies aggregated over
     * trees that are identical once their semantics have been stripped.
     *
     * @param lines the lexicon lines
     * @return a one-element array holding the map from anchor (semantic frame stripped) to
     *         its {@link LexiconEntryWithRoles} entries
     */
    @Override
    protected MultiValueMap<String, ?>[] read(String[] lines) {
        MultiValueMap<String, LexiconEntryWithRoles> lexEntries = new MultiValueMap<String, LexiconEntryWithRoles>();
        // temporary map that aggregates frequencies of trees with stripped semantics
        Map<String, Integer> lexTreesFreqs = new HashMap<String, Integer>();
        for (String line : lines) {
            String[] lexcontent = Utils.getCatInventory(line.trim(), opts.combineNNVBcats).split("\t+");
            // Validate before touching any field: the code below indexes lexcontent[3].
            if (lexcontent.length < 4) {
                if (opts.verbose) {
                    LogInfo.error("wrong lex");
                }
                continue;
            }
            int freq = Integer.parseInt(lexcontent[0]);
            if (lexcontent[3].contains("<>")) {
                // The anchor is the token immediately preceding the first "<>" marker.
                String endswithLex = lexcontent[3].substring(0, lexcontent[3].indexOf("<>"));
                String anchor = endswithLex.substring(endswithLex.lastIndexOf(" ") + 1);
                if (!biWordMap.containsKey(anchor)) {
                    biWordMap.put(anchor, freq);
                } else {
                    biWordMap.put(anchor, biWordMap.get(anchor) + freq);
                }
                if (lexcontent[3].contains("1_1)")) {
                    // Token preceding the "1_1)" node is combined with the anchor as "anchor%token".
                    endswithLex = lexcontent[3].substring(0, lexcontent[3].indexOf("1_1)") - 1);
                    anchor += "%" + endswithLex.substring(endswithLex.lastIndexOf(" ") + 1);
                    if (!biWordMap.containsKey(anchor)) {
                        biWordMap.put(anchor, freq);
                    } else {
                        biWordMap.put(anchor, biWordMap.get(anchor) + freq);
                    }
                    // The unknown-word variants are counted once per line, not by frequency.
                    anchor = "UNK%" + endswithLex.substring(endswithLex.lastIndexOf(" ") + 1);
                    if (!biWordMap.containsKey(anchor)) {
                        biWordMap.put(anchor, 1);
                    } else {
                        biWordMap.put(anchor, biWordMap.get(anchor) + 1);
                    }
                    if (!biWordMap.containsKey("UNK")) {
                        biWordMap.put("UNK", 1);
                    } else {
                        biWordMap.put("UNK", biWordMap.get("UNK") + 1);
                    }
                }
            }
            lexcontent = lexEntryRemoveDigits(lexcontent);
            int frequency = Integer.parseInt(lexcontent[0]);
            boolean isRelation = hasSemanticFrame(lexcontent[1]);
            String anchorNoSemantics = stripSemanticFrame(lexcontent[1]);
            String treeNoSemantics = stripSemantics(lexcontent[3]);
            String posword = getPosFromTreeString(treeNoSemantics, anchorNoSemantics).toLowerCase();
            Integer oldFreq = lexTreesFreqs.get(treeNoSemantics);
            if (oldFreq != null) {
                lexTreesFreqs.put(treeNoSemantics, frequency + oldFreq);
            } else {
                lexTreesFreqs.put(treeNoSemantics, frequency);
            }
            String rawUnlexEntry = makeUnlex(lexcontent[3], anchorNoSemantics);
            LexiconEntryWithRoles entry = new LexiconEntryWithRoles(frequency, rawUnlexEntry, treeNoSemantics,
                    isRelation, roleIndexer, opts.freqBaseline);
            updateEntryWithRolesMap(lexEntries, anchorNoSemantics, entry, rawUnlexEntry);
            if (posTagNo.containsKey(posword)) {
                posTagNo.put(posword, posTagNo.get(posword) + frequency);
            } else {
                posTagNo.put(posword, frequency);
            }
        }
        // update noOfTrees map with correct aggregated frequencies; the key is prefixed with
        // "0" for frequency <= 1, otherwise with "1" (or the frequency itself under freqBaseline)
        for (Map.Entry<String, Integer> entry : lexTreesFreqs.entrySet()) {
            String val = (entry.getValue() > 1 ? (opts.freqBaseline ? String.valueOf(entry.getValue()) : "1") : "0")
                    + "\t" + entry.getKey();
            noOfTrees.put(val, entry.getValue());
        }
        return new MultiValueMap[] { lexEntries };
    }

    /**
     * Re-keys the entries produced by {@code read} on normalised (lower-cased) words and fills
     * the lookup structures {@code wordPosMap}, {@code trees} and (through
     * {@code updateNoOfTreesMap}) {@code noOfTrees}.
     * <p>
     * Each source entry is copied, its unlexicalised form is stored on the copy, and the copy is
     * registered via {@code updateEntryWithRolesMap} under either the bare lower-cased word or
     * the lower-cased "pos word" string, depending on {@code opts.goldPosTags} and
     * {@code opts.treeFamilies}.
     *
     * @param lexEntriesWithOrigWordsAsKeys entries keyed on the original anchor; values are
     *        cast to {@link LexiconEntryWithRoles}
     * @param treetype not referenced by this implementation
     * @return map from normalised key to {@link LexiconEntryWithRoles} entries
     */
    @SuppressWarnings("unchecked")
    protected MultiValueMap makeLexTrees(MultiValueMap<String, ?> lexEntriesWithOrigWordsAsKeys, String treetype) {
        MultiValueMap<String, LexiconEntryWithRoles> unlexEntriesWithNormWordsAsKeys = new MultiValueMap();
        // NOTE(review): this map is only ever written to in this method; nothing reads it any more
        // (it fed a StringTree conversion step that is no longer performed here).
        Map<String, String> origWordsToNormWordsMap = new HashMap<String, String>();

        for (String key : lexEntriesWithOrigWordsAsKeys.keySet()) {
            for (LexiconEntryWithRoles e : (Collection<LexiconEntryWithRoles>) lexEntriesWithOrigWordsAsKeys
                    .getCollection(key)) {
                // Work on a copy; the POS tag is extracted from the tree string and the tree is unlexicalised.
                LexiconEntryWithRoles entry = new LexiconEntryWithRoles(e);
                String posWord = key;
                String treeString = entry.toString();
                String unlexTree = treeString;
                // Entries under the "prediction: " key keep their key and tree string as they are.
                if (!key.equals("prediction: ")) {
                    posWord = processKey(treeString, key);
                    unlexTree = makeUnlex(treeString, key);
                }
                // Store everything after the first tab of the unlexicalised tree string.
                entry.setUnlexEntry(unlexTree.substring(unlexTree.indexOf("\t") + 1));
                updateNoOfTreesMap(treeString, posWord, unlexTree);
                String[] words = posWord.split("\t");
                // Single-token key.
                if (words.length == 1) {
                    int tempIndex = key.indexOf(" ");
                    String origWord = tempIndex != -1 ? key.substring(0, tempIndex) : key; // key with original case
                    String posWordLowerCase = posWord.toLowerCase();
                    // Drops everything up to and including the first space (the POS part, if any).
                    String wordLowerCase = posWordLowerCase.substring(posWordLowerCase.indexOf(" ") + 1);
                    if (!opts.goldPosTags && opts.treeFamilies && !posWordLowerCase.equals("prediction: ")) {
                        // Keyed on the bare word.
                        updateEntryWithRolesMap(unlexEntriesWithNormWordsAsKeys, wordLowerCase, entry);
                        origWordsToNormWordsMap.put(origWord, wordLowerCase);
                    } else {
                        // Keyed on the full "pos word" string.
                        updateEntryWithRolesMap(unlexEntriesWithNormWordsAsKeys, posWordLowerCase, entry);
                        origWordsToNormWordsMap.put(posWord.substring(0, posWord.indexOf(" ") + 1) + origWord,
                                posWordLowerCase);
                    }
                    if (!wordPosMap.containsValue(wordLowerCase, posWordLowerCase)) {
                        wordPosMap.put(wordLowerCase, posWordLowerCase);
                    }
                    trees.put(unlexTree.substring(unlexTree.indexOf("\t") + 1), posWordLowerCase);
                } // if
                // Multi-token key (e.g. "put up"): the entry is registered once under each
                // component token that passes the filter below.
                if (words.length > 1) {
                    for (String word : words) {
                        // Skips tokens of the form " *T*...", " *?*...", " *-...", " *" and " 0"
                        // (presumably trace / empty-element markers -- confirm against the corpus format).
                        if (!(word.startsWith(" *T*") || word.startsWith(" *?*") || word.startsWith(" *-")
                                || word.equals(" *") || word.equals(" 0"))) {
                            String lc = word.toLowerCase();
                            String wlc = lc.substring(lc.indexOf(" ") + 1);
                            if (!opts.goldPosTags && opts.treeFamilies) {
                                updateEntryWithRolesMap(unlexEntriesWithNormWordsAsKeys, wlc, entry);
                                origWordsToNormWordsMap.put(word.substring(word.indexOf(" ") + 1), wlc);
                            } else {
                                updateEntryWithRolesMap(unlexEntriesWithNormWordsAsKeys, lc, entry);
                                origWordsToNormWordsMap.put(word, lc); // TODO: FIX Not handling potential uppercase partial lexemes correctly
                            }
                            if (!wordPosMap.containsValue(wlc, lc)) {
                                wordPosMap.put(wlc, lc);
                            }
                            trees.put(unlexTree.substring(unlexTree.indexOf("\t") + 1), lc);
                        } // if
                    } // for
                } // if
            } // for (values)
        } // for (keys)

        // Debugging aid, disabled: printEntriesWithComplexRolesStats(unlexEntriesWithNormWordsAsKeys);
        return unlexEntriesWithNormWordsAsKeys;
    }

    /**
     * Debugging aid: prints, per key, the role signatures of every entry that has more than one
     * role signature of which at least one reports more than one role.
     *
     * @param unlexEntriesWithNormWordsAsKeys the lexicon map to inspect
     */
    private void printEntriesWithComplexRolesStats(
            MultiValueMap<String, LexiconEntryWithRoles> unlexEntriesWithNormWordsAsKeys) {
        for (String key : unlexEntriesWithNormWordsAsKeys.keySet()) {
            StringBuilder report = new StringBuilder();
            for (LexiconEntryWithRoles tree : unlexEntriesWithNormWordsAsKeys.getCollection(key)) {
                if (!(tree.getRoles().size() > 0)) {
                    continue;
                }
                boolean hasMultiRoleSignature = false;
                StringBuilder signatures = new StringBuilder("\n");
                for (RoleSignature sig : tree.getRoles()) {
                    signatures.append(sig).append(", ");
                    hasMultiRoleSignature |= sig.numOfRoles() > 1;
                }
                if (hasMultiRoleSignature && tree.getRoles().size() > 1) {
                    report.append("\t").append(signatures);
                }
            }
            if (report.length() > 0) {
                System.out.println(key + report);
            }
        }
    }

    /**
     * Moves the count stored in {@code noOfTrees} under a tree string to a
     * {@code pos<TAB>unlexicalised tree} key, accumulating with any count already there.
     * Nothing happens unless the tree string (possibly with its count prefix flipped) is a key
     * of {@code noOfTrees}, {@code posWord} contains a space, and the options request
     * POS-keyed trees ({@code goldPosTags} or {@code treeFamilies}, and not {@code posOnly}).
     *
     * @param treeString tree string whose first character is the count prefix
     * @param posWord    "pos word" string; the part before the first space is the POS tag
     * @param unlexTree  unlexicalised tree string; its first two characters are dropped
     */
    private void updateNoOfTreesMap(String treeString, String posWord, String unlexTree) {
        String countKey = treeString;
        if (!noOfTrees.containsKey(countKey)) {
            // switch potentially incorrect count prefix (it is 0 for freqs=1, 1 otherwise)
            String flippedPrefix = countKey.charAt(0) == '0' ? "1" : "0";
            countKey = flippedPrefix + countKey.substring(1);
        }
        boolean posKeyedTrees = (opts.goldPosTags || opts.treeFamilies) && !opts.posOnly;
        if (!noOfTrees.containsKey(countKey) || !posWord.contains(" ") || !posKeyedTrees) {
            return;
        }
        String pos = posWord.substring(0, posWord.indexOf(" "));
        String pureTreeKey = pos + "\t" + unlexTree.substring(2);
        if (noOfTrees.containsKey(pureTreeKey)) {
            noOfTrees.put(pureTreeKey, noOfTrees.get(pureTreeKey) + noOfTrees.get(countKey));
        } else {
            noOfTrees.put(pureTreeKey, noOfTrees.get(countKey));
        }
        noOfTrees.remove(countKey);
    }

    /**
     * Registers {@code entry} under {@code key}, merging it into an equal entry when one is
     * already stored there. When {@code lexiconWithAllRoles} is set, the entry is first
     * enriched with the matching entry found under the same key of that lexicon.
     *
     * @param map   target map
     * @param key   key to register the entry under
     * @param entry entry to add or merge
     */
    private void updateEntryWithRolesMap(MultiValueMap<String, LexiconEntryWithRoles> map, String key,
            LexiconEntryWithRoles entry) {
        // oracleAllRoles: expand entry with roles from the full lexicon (not just with roles from the gold standard tree)
        if (lexiconWithAllRoles != null) {
            Collection<LexiconEntryWithRoles> fullLexiconEntries = ((MultiValueMap<String, LexiconEntryWithRoles>) lexiconWithAllRoles.lexEntriesTree)
                    .getCollection(key);
            if (fullLexiconEntries != null && fullLexiconEntries.contains(entry)) {
                for (LexiconEntryWithRoles known : fullLexiconEntries) {
                    // syntactically same tree with (potentially) different role assignments
                    if (entry.equals(known)) {
                        entry.addEntry(known); // add roles to new entry from the existing lexicon entry
                        break;
                    }
                }
            }
        }
        Collection<LexiconEntryWithRoles> storedEntries = map.getCollection(key);
        boolean alreadyStored = storedEntries != null && storedEntries.contains(entry);
        if (!alreadyStored) {
            map.put(key, entry);
            return;
        }
        for (LexiconEntryWithRoles stored : storedEntries) {
            // syntactically same tree with (potentially) different role assignments
            if (entry.equals(stored)) {
                stored.addEntry(entry);
                break;
            }
        }
    }

    /**
     * Registers {@code entry} under {@code key}; if an equal entry is already stored there, the
     * new entry is merged into it together with its unlexicalised string that still carries
     * the semantic annotation.
     *
     * @param map                     target map
     * @param key                     key to register the entry under
     * @param entry                   entry to add or merge
     * @param unlexEntryWithSemantics string handed to {@code addEntry} when merging
     */
    private void updateEntryWithRolesMap(MultiValueMap<String, LexiconEntryWithRoles> map, String key,
            LexiconEntryWithRoles entry, String unlexEntryWithSemantics) {
        Collection<LexiconEntryWithRoles> storedEntries = map.getCollection(key);
        boolean alreadyStored = storedEntries != null && storedEntries.contains(entry);
        if (!alreadyStored) {
            map.put(key, entry);
            return;
        }
        for (LexiconEntryWithRoles stored : storedEntries) {
            // syntactically same tree with (potentially) different role assignments
            if (entry.equals(stored)) {
                stored.addEntry(entry, unlexEntryWithSemantics);
                break;
            }
        }
    }

    /**
     * Derives the lookup key for a lexicon entry.
     * <ul>
     * <li>With {@code goldPosTags} or {@code treeFamilies}: the "pos word" string obtained from
     * the tree, reduced to the tab-separated POS tags alone when {@code posOnly} is set.</li>
     * <li>Otherwise: the cut-off-corrected lower-cased key, with spaces turned into tabs when
     * the original key contains a space.</li>
     * </ul>
     *
     * @param treeString tree string of the entry
     * @param key        original lexicon key
     * @return the processed key
     */
    private String processKey(String treeString, String key) {
        if (!(opts.goldPosTags || opts.treeFamilies)) {
            // only word
            String corrected = Utils.getCutOffCorrectedMainLex(key.toLowerCase(), listOfFreqWords, opts.train,
                    opts.fullLex);
            return key.contains(" ") ? corrected.replace(" ", "\t") : corrected;
        }
        String posWord = getPosFromTreeString(treeString, key);
        if (!opts.posOnly) {
            return posWord;
        }
        // only pos tag given: keep the part before the first space of every usable token
        StringBuilder tagsOnly = new StringBuilder();
        for (String token : posWord.split("\t")) {
            if (token.contains("*") || token.equals("0")) {
                continue;
            }
            tagsOnly.append(token.substring(0, token.indexOf(" "))).append("\t");
        }
        return tagsOnly.toString().trim();
    }

    /**
     * Collects the elementary trees available for a word.
     * <p>
     * Lookup uses {@code wCor}; when that key is absent an unknown-word key is tried instead
     * ({@code "unk"}, or each lower-cased POS tag followed by {@code " unk"} under
     * {@code goldPosTags}). If neither key exists, or the lexicon is empty, the result is empty.
     * <p>
     * Stages:
     * <ol>
     * <li>Trees stored under the search key, minus rare {@code "^x"} trees (unless
     * {@code noAnalysisParse}), number words not tagged {@code CD}, and non-initial
     * upper-case words not tagged {@code NN*}/{@code JJ*}. Returns if more than 20 were found.</li>
     * <li>With {@code treeFamilies}: trees stored under each candidate POS tag whose family
     * count in {@code noOfTrees} exceeds 100. Returns if more than 6 trees were found.</li>
     * <li>Otherwise all trees stored under the remaining candidate POS tags.</li>
     * </ol>
     * Trees that fail to build ({@code makeToStringTree} returns {@code null}) and duplicates
     * are skipped in every stage.
     *
     * @param word            surface word inserted into the unlexicalised trees
     * @param wCor            key used for the lexicon lookup
     * @param posTag          POS tag(s) of the word, tab separated; overwritten locally when an
     *                        entry containing {@code "LEXEME1"} is encountered
     * @param noAnalysisParse when set, rare trees are not filtered out
     * @param wno             word position; 0 disables the upper-case filter
     * @return the matching trees, possibly empty
     */
    @Override
    public Collection<ElementaryStringTree> getEntries(String word, String wCor, String posTag,
            boolean noAnalysisParse, int wno) {
        Collection<ElementaryStringTree> treesOut = new ArrayList<ElementaryStringTree>();
        Collection<String> treeStrings = new ArrayList<String>();
        if (lexEntriesTree.isEmpty()) {
            return treesOut;
        }
        String searchWord = wCor;
        if (!lexEntriesTree.containsKey(searchWord)) {
            // Fall back to the unknown-word key.
            searchWord = "";
            if (opts.goldPosTags) {
                for (String w : posTag.split("\t")) {
                    searchWord += w.toLowerCase() + " unk";
                }
            } else {
                searchWord += "unk";
            }
        }
        if (!lexEntriesTree.containsKey(searchWord)) // TODO: FIX
        {
            return treesOut;
        }
        for (LexiconEntryWithRoles entry : (Collection<LexiconEntryWithRoles>) lexEntriesTree
                .getCollection(searchWord)) {
            String treeString = entry.toString();
            if (treeString.contains("LEXEME1")) {
                // Re-derive the POS tag from the node that dominates @LEXEME1@.
                String postag1 = treeString.substring(0, treeString.indexOf(" @LEXEME1@"));
                posTag = postag1.substring(postag1.lastIndexOf("(") + 2, postag1.lastIndexOf("^"));
            }
            String sts = posTag + "\t" + treeString.substring(treeString.indexOf("\t") + 1);
            // A "^x" tree is rare when its family is unknown or was counted fewer than 3 times.
            Integer familyCount = noOfTrees.get(sts);
            boolean rareTree = treeString.contains("^x") && (familyCount == null || familyCount < 3);
            if (this.lexEntriesTree.size() > 100 && rareTree && !noAnalysisParse) {
                continue;
            }
            if (num.matcher(word).matches() && !posTag.equals("CD")) {
                continue;
            }
            if (!word.contains("NUM") && upperCase.matcher(word).matches() && wno != 0
                    && (!posTag.startsWith("NN") && !posTag.startsWith("JJ")) && !word.equals("I")) {
                continue;
            }
            String unlexTreeString = treeString;
            treeString = insertLex(word, treeString);
            ElementaryStringTree tree = makeToStringTree(treeString, unlexTreeString, entry);
            if (tree != null && !treeStrings.contains(tree.getTreeString().substring(2))) {
                treesOut.add(tree);
                treeStrings.add(tree.getTreeString().substring(2));
            }
        } // for
        if (treesOut.size() > 20) {
            return treesOut;
        }
        HashMap<String, Integer> posTags = new HashMap<String, Integer>();
        if (!posTag.equals("") && opts.treeFamilies && !searchWord.equals("unk")) {//don't do this for prediction trees.
            if (!opts.goldPosTags) {
                posTags = getPosTags(treeStrings, searchWord);
            } else {
                posTags.put(posTag, 1);
            }
            if (posTags.size() > 1) {
                for (String ptag : posTags.keySet()) {
                    if (lexEntriesTree.containsKey(ptag)) {
                        for (LexiconEntryWithRoles entry : (Collection<LexiconEntryWithRoles>) lexEntriesTree
                                .getCollection(ptag)) {
                            String treeString = entry.toString();
                            String unlexTreeString = treeString;
                            treeString = insertLex(word, treeString);
                            ElementaryStringTree tree = makeToStringTree(treeString, unlexTreeString, entry);
                            // makeToStringTree returns null for entries it cannot build.
                            if (tree == null) {
                                continue;
                            }
                            String ts = tree.getTreeString().substring(2);
                            // A family without a recorded count is treated as too infrequent.
                            Integer familyFreq = noOfTrees.get(ptag + "\t"
                                    + unlexTreeString.substring(unlexTreeString.indexOf("\t") + 1));
                            if (!treeStrings.contains(ts) && familyFreq != null && familyFreq > 100) {
                                treesOut.add(tree);
                                treeStrings.add(ts);
                            }
                        } // for
                    } // if
                } // for
            } // if
        } // if
        if (treesOut.size() > 6) {
            return treesOut;
        } else //correct for bad gold pos tag.
        {
            if (!searchWord.equals("prediction: ") && opts.goldPosTags && opts.fullLex) {
                posTags = getPosTags(treeStrings, searchWord);
            }
            posTags.remove(searchWord);
        }
        for (String sw : posTags.keySet()) {
            if (lexEntriesTree.containsKey(sw)) {
                for (LexiconEntryWithRoles entry : (Collection<LexiconEntryWithRoles>) lexEntriesTree
                        .getCollection(sw)) {
                    String treeString = entry.toString();
                    String unlexTreeString = treeString;
                    treeString = insertLex(word, treeString);
                    ElementaryStringTree tree = makeToStringTree(treeString, unlexTreeString, entry);
                    if (tree != null && !treeStrings.contains(tree.getTreeString().substring(2))) {
                        treesOut.add(tree);
                        treeStrings.add(tree.getTreeString().substring(2));
                    }
                }
            }
        }
        return treesOut;
    }

    /**
     * Builds an {@link ElementaryStringTree} from a lexicalised tree string.
     *
     * @param treeString  lexicalised tree string; must contain {@code "("}
     * @param unlexString unlexicalised tree string stored on the resulting tree
     * @param entry       lexicon entry supplying the roles and the relation flag
     * @return the tree with head status annotated, or {@code null} when the string is invalid,
     *         conversion fails, or no anchor can be determined
     */
    protected ElementaryStringTree makeToStringTree(String treeString, String unlexString,
            LexiconEntryWithRoles entry) {
        if (treeString.contains("(")) {
            IdGenerator idgen = new IdGenerator();
            ElementaryStringTree rawTree = new ElementaryStringTree(treeString, opts.useSemantics,
                    entry.getRoles(), entry.isRelation());
            ElementaryStringTree converted = convertToTree(rawTree, idgen);
            if (converted != null) {
                converted.setTreeString(unlexString);
                // No anchor yet: try to recover one from the chopped spine.
                if (converted.getAnchor() == Integer.MIN_VALUE) {
                    converted.findChoppedSpine();
                }
                if (converted.getAnchor() != Integer.MIN_VALUE) {
                    converted.annotateHeadStatus();
                    return converted;
                }
            }
            return null;
        }
        LogInfo.error("invalid entry: " + treeString);
        return null;
    }

    /**
     * Builds a tab-separated list of "pos word" pairs for the space-separated tokens of
     * {@code key}, reading each POS tag from the node that precedes {@code token<>} in the tree
     * string. Currently only meant for trees with one lexical root (TODO).
     * <p>
     * Outside training, a token equal to {@code @LEXEME1@} makes the method return the bare POS
     * tag immediately. Tokens containing {@code "*"} or {@code "0"} are left out.
     *
     * @param treeString tree string to read POS tags from
     * @param key        space-separated lexical tokens
     * @return the trimmed "pos word" list, or just a POS tag in the {@code @LEXEME1@} case
     */
    @Override
    protected String getPosFromTreeString(String treeString, String key) {
        StringBuilder posWord = new StringBuilder();
        for (String token : key.split(" ")) {
            String lexeme = token;
            String pos = "";
            int markerIndex = treeString.indexOf(lexeme + "<>");
            if (markerIndex != -1) {
                // cut off everything from the key word onwards; the POS sits between the last "( " and "^"
                String prefix = treeString.substring(0, markerIndex);
                String rawPos = prefix.substring(prefix.lastIndexOf("( ") + 2, prefix.lastIndexOf("^"));
                pos = stripPosAndSemanticsToken(rawPos)[0]; // if there is any semantic role information, strip it out
                if (opts.train) {
                    lexeme = lexeme.toLowerCase();
                } else if (lexeme.equals("@LEXEME1@")) {
                    return pos;
                } else {
                    lexeme = Utils.getCutOffCorrectedMainLex(lexeme.toLowerCase(), listOfFreqWords, opts.train,
                            opts.fullLex);
                }
            }
            boolean skipped = lexeme.contains("*") || lexeme.contains("0");
            if (!skipped) {
                posWord.append(pos).append(" ").append(lexeme).append("\t");
            }
        }
        return posWord.toString().trim();
    }

    /**
     * Removes semantic annotations of the form {@code @...;} from a string. Scanning proceeds
     * left to right; each annotation found is removed everywhere it occurs in the string.
     * Scanning stops at the first {@code '@'} that has no {@code ';'} after it.
     *
     * @param str the input string
     * @return the string without the annotations that were found
     */
    protected String stripSemantics(String str) {
        String result = str;
        int at = result.indexOf('@');
        while (at != -1) {
            int semicolon = result.indexOf(';', at);
            if (semicolon == -1) {
                // rare false alarm case: '@' is not a delimiter but an actual lexical anchor
                break;
            }
            String annotation = result.substring(at, semicolon + 1);
            result = result.replace(annotation, "");
            at = result.indexOf('@', at);
        }
        return result;
    }

    /**
     * Splits a token of the form {@code POS@ROLE1;@ROLE2;...} into its role annotations.
     * <p>
     * For every {@code @ROLE;} annotation a two-element array {@code {pos, role}} is added,
     * where {@code pos} is the text before the first {@code '@'}. The last element of the list
     * is always a one-element array holding the input with all recognised annotations removed.
     * An {@code '@'} that is not followed by a {@code ';'} is not treated as an annotation and
     * is left in place.
     *
     * @param str the input token
     * @return the {@code {pos, role}} pairs followed by the stripped string
     */
    protected List<String[]> stripPosAndSemantics(String str) {
        List<String[]> res = new ArrayList<String[]>();
        int indexOfDelimiter = str.indexOf("@");
        if (indexOfDelimiter == -1) {
            res.add(new String[] { str });
            return res;
        }
        String pos = str.substring(0, indexOfDelimiter);
        StringBuilder stripped = new StringBuilder();
        int copiedUpTo = 0; // everything before this index has been copied or dropped
        while (indexOfDelimiter != -1) {
            int endOfRoleIndex = str.indexOf(";", indexOfDelimiter);
            if (endOfRoleIndex == -1) {
                break; // unterminated '@': not a role annotation
            }
            res.add(new String[] { pos, str.substring(indexOfDelimiter + 1, endOfRoleIndex) });
            stripped.append(str, copiedUpTo, indexOfDelimiter);
            copiedUpTo = endOfRoleIndex + 1;
            indexOfDelimiter = str.indexOf("@", copiedUpTo);
        }
        stripped.append(str, copiedUpTo, str.length());
        res.add(new String[] { stripped.toString() });

        return res;
    }

    /**
     * Return the pos tag and semantic annotation (if existing) from an input string token
     * that has the following format: POS@SEM;, e.g., NP@ARG0;
     * <p>
     * The trailing {@code ';'} is removed from the annotation only when it is present, so a
     * token such as {@code "NP@"} yields an empty annotation instead of failing.
     *
     * @param str the input string
     * @return an array of strings. The first element is the POS tag, and the second is the semantic role label
     * or an empty string if there is none.
     */
    public static String[] stripPosAndSemanticsToken(String str) // TODO: Fix tackle cases with more than one role
    {
        int indexOfDelimiter = str.indexOf("@");
        if (indexOfDelimiter == -1) {
            return new String[] { str, "" };
        }
        String role = str.substring(indexOfDelimiter + 1);
        if (role.endsWith(";")) {
            role = role.substring(0, role.length() - 1);
        }
        return new String[] { str.substring(0, indexOfDelimiter), role };
    }

    /**
     * Strips the semantic frame (if any) from an input string. A frame is recognised when the
     * first {@code '.'} occurs at index 2 or later and is followed by nothing but digits or by
     * the literal {@code XX}; this keeps the full-stop lexeme itself intact.
     *
     * @param str the input string
     * @return the text before the frame separator, or {@code str} unchanged when there is no frame
     */
    public static String stripSemanticFrame(String str) {
        int dot = str.indexOf('.');
        if (dot <= 1) {
            return str;
        }
        String suffix = str.substring(dot + 1);
        boolean isFrameId = suffix.matches("\\p{Digit}+") || suffix.equals("XX");
        return isFrameId ? str.substring(0, dot) : str;
    }

    /**
     * Tells whether a string carries a semantic frame suffix: its first {@code '.'} sits at
     * index 2 or later and is followed by digits only or by the literal {@code XX}.
     *
     * @param str the input string
     * @return {@code true} when a frame suffix is present
     */
    private boolean hasSemanticFrame(String str) {
        int dot = str.indexOf('.');
        if (dot <= 1) {
            return false;
        }
        String suffix = str.substring(dot + 1);
        return suffix.equals("XX") || suffix.matches("\\p{Digit}+");
    }

    /**
     * Strips trailing annotation from a node label.
     * <p>
     * First the label is cut at the first {@code '<'}; failing that, at a non-leading
     * {@code '!'}; failing that, at a non-leading {@code '*'} when the node is a foot node.
     * Afterwards, if the label contains {@code '_'}, it is cut at the first {@code '^'}. A label
     * that contains {@code '_'} but no {@code '^'} is returned as it stands at that point.
     *
     * @param node the node label
     * @return the label without annotation
     */
    protected static String removeAnnotation(String node) {
        if (node.indexOf("<") >= 0) {
            node = node.substring(0, node.indexOf("<"));
        } else if (node.indexOf("!") > 0) {
            node = node.substring(0, node.indexOf("!"));
        } else if (node.indexOf("*") > 0 && getNodeType(node) == TagNodeType.foot) {
            node = node.substring(0, node.indexOf("*"));
        }
        // Guard against labels with '_' but without '^', which would otherwise make substring fail.
        int caret = node.indexOf("^");
        if (node.contains("_") && caret >= 0) {
            node = node.substring(0, caret);
        }
        return node;
    }

}