pltag.parser.Lexicon.java Source code

Java tutorial

Introduction

Here is the source code for pltag.parser.Lexicon.java

Source

/* 
 * Copyright (C) 2015 ikonstas
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package pltag.parser;

import fig.basic.IOUtils;
import fig.basic.LogInfo;
import fig.basic.Pair;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.collections4.map.MultiValueMap;

import pltag.corpus.ElementaryStringTree;
import pltag.corpus.IdGenerator;
import pltag.corpus.TagNodeType;
import pltag.corpus.StringTree;
import pltag.util.Utils;

public class Lexicon {

    /** Parser options; the flags read in this class include goldPosTags, treeFamilies, posOnly, train, fullLex and useSemantics. */
    protected Options opts;
    /** Word list handed to {@code Utils.getCutOffCorrectedMainLex} when lexicon keys are normalised. */
    protected Set<String> listOfFreqWords;

    //private ArrayList<PennTree> trees = new ArrayList<PennTree>();
    //private MultiValueMap modLexentriesTree = new MultiValueMap();
    //private MultiValueMap argLexentriesTree = new MultiValueMap();
    /** Main lexicon: lower-cased lookup key -> tree strings, as produced by makeLexStrings. */
    protected MultiValueMap<String, ?> lexEntriesTree = new MultiValueMap();
    /** Lower-cased word (text after the first blank of the key) -> lower-cased full key; filled by makeLexStrings. */
    protected MultiValueMap<String, String> wordPosMap = new MultiValueMap();
    //private ArrayList structStringArray = new ArrayList();
    //private HashMap<String, Integer> lexToArraySlot = new HashMap<String, Integer>();
    /** Matches any string containing a run of digits; toNUM uses the two groups to replace that run with "NUM". */
    protected static Pattern digits = Pattern.compile("([^0-9]*)[0-9]+(.*)");
    /** A capital letter followed by letters only. Not referenced in the visible part of this class. */
    protected static Pattern upperCase = Pattern.compile("[A-Z][A-Za-z]*");
    /** Pattern over "NUM" placeholders. Not referenced in the visible part of this class. */
    protected static Pattern num = Pattern.compile("NUM([^a-z]*NUM?)[^a-z]*");
    /** Tree string with everything up to the first tab removed -> lower-cased lexicon keys using it; filled by makeLexStrings. */
    protected MultiValueMap<String, String> trees = new MultiValueMap();
    /** Accumulated frequency per "flag TAB tree" value string; filled by read, re-keyed by makeLexStrings, pruned by extractFamilyLexicon. */
    protected HashMap<String, Integer> noOfTrees = new HashMap<String, Integer>();
    /** Accumulated frequency per lower-cased result of getPosFromTreeString; filled by read. */
    protected HashMap<String, Integer> posTagNo = new HashMap<String, Integer>();

    /** Root category -> (lexicon key, tree string) pairs; filled by extractVerificationTrees. */
    protected MultiValueMap<String, Pair<String, String>> rootNodeTreeMap = new MultiValueMap();
    // NOTE(review): hard-coded count, presumably the number of tree templates of one specific lexicon; it is not
    // referenced in the visible part of this class - confirm where it is used.
    protected static int numOfTreeTemplates = 6934;
    /** Anchor (and anchor%context) counts accumulated by read. Static, so shared by every Lexicon instance. */
    protected static HashMap<String, Integer> biWordMap = new HashMap<String, Integer>();

    /**
     * Creates a lexicon without any entries; entries are loaded later via
     * one of the {@code processLexicon} methods.
     *
     * @param opts parser options consulted while reading and normalising entries
     * @param listOfFreqWords word list used when normalising lexicon keys
     */
    public Lexicon(Options opts, Set<String> listOfFreqWords) {
        this.listOfFreqWords = listOfFreqWords;
        this.opts = opts;
    }

    public Lexicon(Options opts, Set<String> listOfFreqWords, String[] lines) {
        this.opts = opts;
        this.listOfFreqWords = listOfFreqWords;
        if (lines.length == 1 && lines[0].equals("")) // empty lexicon (occasionally in predicted lexicon)
        {
            lexEntriesTree = new MultiValueMap<String, String>();
        } else {
            MultiValueMap<String, ?>[] entries = read(lines);
            lexEntriesTree = makeLexStrings(entries[0], "ARG");
            lexEntriesTree.putAll(makeLexStrings(entries[1], "MOD"));
        }
    }

    /**
     * Creates a lexicon without options or a frequent-word list; {@code opts}
     * and {@code listOfFreqWords} stay {@code null} until assigned elsewhere.
     */
    public Lexicon() {
    }

    /**
     * Loads the lexicon stored in the given file and replaces the current
     * entries with it: ARG entries first, MOD entries merged in afterwards.
     *
     * @param filename path of the lexicon file
     */
    public void processLexicon(String filename) {
        // slot 0 = ARG entries, slot 1 = MOD entries (see read)
        MultiValueMap<String, String>[] argAndMod = read(filename);
        lexEntriesTree = makeLexStrings(argAndMod[0], "ARG");
        lexEntriesTree.putAll(makeLexStrings(argAndMod[1], "MOD"));
    }

    /**
     * Loads the lexicon from already-read lines and replaces the current
     * entries with it: ARG entries first, MOD entries merged in afterwards.
     *
     * @param lines raw lexicon lines
     */
    public void processLexicon(String[] lines) {
        // slot 0 = ARG entries, slot 1 = MOD entries (see read)
        MultiValueMap<String, String>[] argAndMod = (MultiValueMap<String, String>[]) read(lines);
        lexEntriesTree = makeLexStrings(argAndMod[0], "ARG");
        lexEntriesTree.putAll(makeLexStrings(argAndMod[1], "MOD"));
    }

    /**
     * Finishes lexicon construction once all entries have been read.
     * <p>
     * When {@code writeToDisk} is set, the main lexicon is dumped to
     * {@code normal_lexicon.txt} and the tree-to-key map to
     * {@code family_lexicon.txt}, one {@code key-> [v1,v2,]} line per key.
     * Afterwards {@link #extractFamilyLexicon(boolean)} and
     * {@code removeHelps()} are run in that order.
     * <p>
     * Each writer is closed in a {@code finally} block so that a failure while
     * writing does not leak the underlying file handle. I/O errors are logged
     * and do not abort post-processing.
     *
     * @param writeToDisk whether to write the debugging dumps
     */
    public void postProcessLexicon(boolean writeToDisk) {
        if (writeToDisk) {
            try {
                Writer lexWriter = IOUtils.openOutEasy("normal_lexicon.txt");
                try {
                    for (String key : lexEntriesTree.keySet()) {
                        lexWriter.append(key).append("-> [");
                        for (Object val : lexEntriesTree.getCollection(key)) {
                            lexWriter.append(val.toString()).append(",");
                        }
                        lexWriter.append("]\n");
                    }
                } finally {
                    lexWriter.close();
                }
            } catch (IOException e) {
                LogInfo.error(e);
            }
            try {
                Writer familywriter = IOUtils.openOutEasy("family_lexicon.txt");
                try {
                    for (String key : trees.keySet()) {
                        familywriter.append(key).append("-> [");
                        for (String val : trees.getCollection(key)) {
                            familywriter.append(val).append(",");
                        }
                        familywriter.append("]\n");
                    }
                } finally {
                    familywriter.close();
                }
            } catch (IOException e) {
                LogInfo.error(e);
            }
        }
        extractFamilyLexicon(writeToDisk);
        removeHelps();
    }

    /**
     * Prunes the tree-frequency table and promotes frequent unlexicalised
     * trees into the main lexicon.
     * <p>
     * For every key of {@code noOfTrees}:
     * <ul>
     * <li>keys not containing {@code "LEXEME"} are removed;</li>
     * <li>keys with a frequency below 5 are removed;</li>
     * <li>keys with a frequency of at least 100 are split at the tab, and the
     * part after it is added to {@code lexEntriesTree} under the part before
     * it, prefixed with {@code "1\t"}.</li>
     * </ul>
     * When {@code writeToDisk} is set, every {@code LEXEME} key is also written
     * with its frequency to {@code family_size_lexicon.txt}. The writer is
     * closed in a {@code finally} block so that it is released even if writing
     * or processing fails part-way.
     *
     * @param writeToDisk whether to write the frequency dump
     */
    protected void extractFamilyLexicon(boolean writeToDisk) {
        try {
            Writer unlexSizeWriter = writeToDisk ? IOUtils.openOutEasy("family_size_lexicon.txt") : null;
            try {
                // iterate over a snapshot because entries are removed from noOfTrees inside the loop
                Collection<String> keyset = new ArrayList<String>(noOfTrees.keySet());
                for (String key : keyset) {
                    if (!key.contains("LEXEME")) {
                        noOfTrees.remove(key);
                        continue;
                    }
                    Integer frequency = noOfTrees.get(key);
                    String val = frequency.toString();
                    if (unlexSizeWriter != null) {
                        unlexSizeWriter.append(val).append("\t").append(key).append("\n");
                    }
                    if (frequency < 5) {
                        noOfTrees.remove(key);
                    } else if (frequency >= 100) {
                        String[] info = key.split("\t");
                        lexEntriesTree.put(info[0], "1\t" + info[1]);
                    }
                }
            } finally {
                if (unlexSizeWriter != null) {
                    unlexSizeWriter.close();
                }
            }
        } catch (IOException e) {
            LogInfo.error(e);
        }
    }

    /**
     * Indexes every lexicon entry by the category of its root node: for each
     * (key, tree string) pair in {@code lexEntriesTree}, a pair is added to
     * {@code rootNodeTreeMap} under the tree's root category.
     */
    protected void extractVerificationTrees() {
        for (String lexKey : lexEntriesTree.keySet()) {
            Collection<String> treeStrings = (Collection<String>) lexEntriesTree.getCollection(lexKey);
            for (String treeString : treeStrings) {
                rootNodeTreeMap.put(getRootCategory(treeString), new Pair(lexKey, treeString));
            }
        }
    }

    /**
     * Extracts the root category from a lexicon tree string: the text that
     * starts two characters after the first {@code '('} and ends just before
     * the first {@code '^'}.
     *
     * @param tree tree string in lexicon format
     * @return the root category label
     */
    public String getRootCategory(String tree) {
        final int openBracket = tree.indexOf("(");
        final int firstCaret = tree.indexOf("^");
        // skip the bracket and the blank that follows it
        final int labelStart = openBracket + 2;
        return tree.substring(labelStart, firstCaret);
    }

    /**
     * Counts the trees whose kind does not agree with the expected type label:
     * {@code "MOD"} expects auxiliary trees, {@code "ARG"} expects
     * non-auxiliary trees; any other label counts every tree as an error.
     * Each mismatch is reported on standard error.
     *
     * @param string expected type label
     * @param stringTreeMap map whose values are the trees to check
     * @return the number of mismatching trees
     */
    @SuppressWarnings("unchecked")
    protected int checkType(String string, MultiValueMap stringTreeMap) {
        int mismatches = 0;
        for (StringTree candidate : (Collection<StringTree>) stringTreeMap.values()) {
            boolean typeAgrees = (string.equals("MOD") && candidate.isAuxtree())
                    || (string.equals("ARG") && !candidate.isAuxtree());
            if (typeAgrees) {
                continue;
            }
            System.err.println(
                    "wrong tree type: " + string + candidate.getStructure(candidate.getRoot(), opts.useSemantics));
            mismatches++;
        }
        return mismatches;
    }

    /**
     * Converts a map of tree strings into a map of parsed trees.
     * <p>
     * Every tree string is parsed with {@code makeToStringTreeOld}; strings
     * that cannot be parsed are skipped. The resulting tree is stored under its
     * POS-tagged yield (or under the key itself for the {@code "prediction: "}
     * key), once per distinct tree within one source key. If that yield has
     * several tab-separated parts (entries such as "put up"), the tree is
     * additionally stored under each part.
     *
     * @param lexString map from lexeme to tree strings
     * @param treetype expected tree type, {@code "ARG"} or {@code "MOD"}
     * @return map from POS-tagged word(s) to parsed trees
     */
    @SuppressWarnings("unchecked")
    private MultiValueMap makeLexTrees(MultiValueMap lexString, String treetype) {
        MultiValueMap<String, ElementaryStringTree> parsedEntries = new MultiValueMap();
        for (String key : (Set<String>) lexString.keySet()) {
            // signatures (yield + tree) already stored for this key
            Set<String> seenForKey = new HashSet<String>();
            for (String treeString : (Collection<String>) lexString.getCollection(key)) {
                ElementaryStringTree tree = makeToStringTreeOld(treeString, treetype);
                if (tree == null) {
                    continue;
                }
                String posWord = tree.getPOStagged(tree.getRoot()).trim();
                if (key.equals("prediction: ")) {
                    posWord = key;
                }
                String signature = posWord + "@@" + tree.toString();
                if (seenForKey.add(signature)) {
                    parsedEntries.put(posWord, tree);
                }
                String[] parts = posWord.split("\t");
                if (parts.length > 1) {
                    for (String part : parts) {
                        parsedEntries.put(part, tree);
                    }
                }
            }
        }
        return parsedEntries;
    }

    /**
     * Parses a lexicon tree string into an {@link ElementaryStringTree} and
     * checks that it has the expected type.
     * <p>
     * {@code convertToTree} may return {@code null} for entries it cannot
     * build; that case is passed on as {@code null} here rather than being
     * dereferenced, matching the handling in {@code makeToStringTree}.
     *
     * @param treeString tree in lexicon string format
     * @param treetype expected type: {@code "MOD"} for auxiliary trees,
     *        {@code "ARG"} for all others
     * @return the parsed tree, or {@code null} if the string contains no
     *         bracket, cannot be converted, has no anchor, or has the wrong
     *         type
     */
    private ElementaryStringTree makeToStringTreeOld(String treeString, String treetype) {
        if (!treeString.contains("(")) {
            System.err.println("invalid entry: " + treeString);
            return null;
        }
        IdGenerator idgen = new IdGenerator();
        ElementaryStringTree tree = convertToTree(new ElementaryStringTree(treeString, opts.useSemantics), idgen);
        if (tree == null) {
            return null;
        }
        tree.findChoppedSpine();
        if (tree.getAnchor() == Integer.MIN_VALUE) {
            // no anchor was found for this tree
            return null;
        }
        tree.annotateHeadStatus();
        boolean typeMatches = (tree.isAuxtree() && treetype.equals("MOD"))
                || (!tree.isAuxtree() && treetype.equals("ARG"));
        if (!typeMatches) {
            System.err
                    .println("wrong tree type: " + treetype + tree.getStructure(tree.getRoot(), opts.useSemantics));
            return null;
        }
        return tree;
    }

    /**
     * Parses a lexicon tree string into an {@link ElementaryStringTree} and
     * stores {@code unlexString} as the tree's string form.
     * If the parsed tree has no anchor, {@code findChoppedSpine()} is run
     * before giving up.
     *
     * @param treeString tree in lexicon string format
     * @param unlexString string to store on the resulting tree
     * @return the parsed tree, or {@code null} if the string contains no
     *         bracket, cannot be converted, or still has no anchor
     */
    protected ElementaryStringTree makeToStringTree(String treeString, String unlexString) {
        final boolean hasBracket = treeString.contains("(");
        if (!hasBracket) {
            System.err.println("invalid entry: " + treeString);
            return null;
        }
        IdGenerator ids = new IdGenerator();
        ElementaryStringTree parsed = convertToTree(new ElementaryStringTree(treeString, opts.useSemantics), ids);
        if (parsed == null) {
            return null;
        }
        parsed.setTreeString(unlexString);
        if (parsed.getAnchor() == Integer.MIN_VALUE) {
            // no anchor yet: retry after searching for a chopped spine
            parsed.findChoppedSpine();
            if (parsed.getAnchor() == Integer.MIN_VALUE) {
                return null;
            }
        }
        parsed.annotateHeadStatus();
        return parsed;
    }

    /**
     * Re-keys a map of tree strings for lexicon lookup. The values of the
     * result are still Strings (the unlexicalised tree strings), not parsed trees.
     * <p>
     * For every (key, tree string) pair of the input:
     * <ul>
     * <li>a lookup key is derived: "pos word" pairs from getPosFromTreeString when
     * {@code opts.goldPosTags} or {@code opts.treeFamilies} is set (reduced to the
     * POS tags only when {@code opts.posOnly} is set as well), otherwise the word as
     * normalised by {@code Utils.getCutOffCorrectedMainLex}; the key
     * {@code "prediction: "} is used unchanged;</li>
     * <li>the tree string is unlexicalised with makeUnlex (except for
     * {@code "prediction: "});</li>
     * <li>the tree is stored under the lower-cased lookup key; multi-part keys
     * (tab-separated) are stored once under each part.</li>
     * </ul>
     * Side effects: updates {@code wordPosMap}, {@code trees} and {@code noOfTrees}.
     *
     * @param lexString map from lexeme to tree strings, as produced by read
     * @param treetype not read by this method
     * @return map from lower-cased lookup key to unlexicalised tree strings
     */
    @SuppressWarnings("unchecked")
    protected MultiValueMap makeLexStrings(MultiValueMap lexString, String treetype) {
        MultiValueMap lexTree = new MultiValueMap();
        Set<String> keys = lexString.keySet();
        // key/tree combinations already seen; shared across all keys of this call
        HashSet<String> treelist = new HashSet<String>();
        for (String key : keys) {
            Collection<String> values = lexString.getCollection(key);
            for (String treeString : values) {
                // The POS tag has to be extracted from the tree string and the tree unlexicalised.
                String posWord = key;
                String tree = treeString;
                if (!key.equals("prediction: ")) {
                    if (opts.goldPosTags || opts.treeFamilies) {//pos and word given
                        posWord = getPosFromTreeString(treeString, key);
                        if (opts.posOnly) {//only pos tag given
                            String[] words = posWord.split("\t");
                            posWord = "";
                            for (String w : words) {
                                if (w.contains("*") || w.equals("0")) {
                                    continue;
                                } else {
                                    // keep only the part before the blank, i.e. the POS tag
                                    posWord += w.substring(0, w.indexOf(" ")) + "\t";
                                }
                            }
                            posWord = posWord.trim();
                        }
                    } else {// only word
                        posWord = Utils.getCutOffCorrectedMainLex(key.toLowerCase(), listOfFreqWords, opts.train,
                                opts.fullLex);
                        if (key.contains(" ")) {
                            // multi-word keys are tab-separated from here on
                            posWord = posWord.replace(" ", "\t");
                        }
                    }
                    tree = makeUnlex(treeString, key);
                }
                // Move the frequency recorded for the lexicalised tree string to a key made of the
                // POS tag, a tab, and the unlexicalised tree without its first two characters.
                if (noOfTrees.containsKey(treeString) && posWord.contains(" ")
                        && (opts.goldPosTags || opts.treeFamilies) && !opts.posOnly) {
                    String pos = posWord.substring(0, posWord.indexOf(" "));
                    String puretree = pos + "\t" + tree.substring(2);
                    if (noOfTrees.containsKey(puretree)) {
                        noOfTrees.put(puretree, noOfTrees.get(puretree) + noOfTrees.get(treeString));
                    } else {
                        noOfTrees.put(puretree, noOfTrees.get(treeString));
                    }
                    noOfTrees.remove(treeString);

                }
                // for lexentries for "put up" etc, add into the map once per part ("put" and "up").
                String[] words = posWord.split("\t");
                // never assigned below, so the sametree == null test further down is always true
                ElementaryStringTree sametree = null;

                if (!treelist.contains(posWord + "@@" + tree) && words.length == 1) {
                    String lc = posWord.toLowerCase();
                    // wlc is the part after the first blank (the whole key when there is no blank)
                    String wlc = lc.substring(lc.indexOf(" ") + 1);
                    if (!opts.goldPosTags && opts.treeFamilies && !lc.equals("prediction: ")) {
                        lexTree.put(wlc, tree);
                    } else {
                        lexTree.put(lc, tree);
                    }
                    if (!wordPosMap.containsValue(wlc, lc)) {
                        wordPosMap.put(wlc, lc);
                    }
                    // index by the tree string without its leading field (up to the first tab)
                    trees.put(tree.substring(tree.indexOf("\t") + 1), lc);
                }
                treelist.add(posWord + "@@" + tree);
                if (words.length > 1) {
                    for (String word : words) {
                        // skip parts with an empty POS tag whose word begins with '*' or is "0"
                        if (sametree == null && !word.startsWith(" *T*") && !word.startsWith(" *?*")
                                && !word.startsWith(" *-") && !word.equals(" *") && !word.equals(" 0")) {
                            String lc = word.toLowerCase();
                            String wlc = lc.substring(lc.indexOf(" ") + 1);
                            if (!opts.goldPosTags && opts.treeFamilies) {
                                lexTree.put(wlc, tree);
                            } else {
                                lexTree.put(lc, tree);
                            }
                            if (!wordPosMap.containsValue(wlc, lc)) {
                                wordPosMap.put(wlc, lc);
                            }
                            trees.put(tree.substring(tree.indexOf("\t") + 1), lc);
                        } // if
                    } // for
                } // if
            } // for (values)
        } // for (keys)
        return lexTree;
    }

    /**
     * Unlexicalises a tree string: for the i-th blank-separated word of
     * {@code key} (counting from 1), every occurrence of {@code word<>} is
     * replaced by {@code @LEXEMEi@<>}.
     *
     * @param treeString tree in lexicon string format
     * @param key the lexeme(s) anchoring the tree, separated by blanks
     * @return the tree string with its anchors replaced by placeholders
     */
    protected String makeUnlex(String treeString, String key) {
        String result = treeString;
        final String[] lexemes = key.split(" ");
        for (int i = 0; i < lexemes.length; i++) {
            final int placeholderNo = i + 1;
            result = result.replace(lexemes[i] + "<>", "@LEXEME" + placeholderNo + "@<>");
        }
        return result;
    }

    /**
     * Builds the "pos word" lookup key for a lexicon entry.
     * <p>
     * For each blank-separated word of {@code key} that occurs as an anchor
     * ({@code word<>}) in the tree string, the POS tag is the text between the
     * last {@code "( "} and the last {@code "^"} before that anchor. The word
     * is lower-cased when training; otherwise it is normalised with
     * {@code Utils.getCutOffCorrectedMainLex}, except that for the placeholder
     * {@code @LEXEME1@} the POS tag alone is returned immediately. Words
     * containing {@code "*"} or {@code "0"} are left out.
     * Currently only intended for trees with one lexical root (TODO).
     *
     * @param treeString tree in lexicon string format
     * @param key the lexeme(s) of the entry, separated by blanks
     * @return tab-separated "pos word" pairs, trimmed
     */
    protected String getPosFromTreeString(String treeString, String key) {
        final StringBuilder joined = new StringBuilder();
        for (final String rawWord : key.split(" ")) {
            String word = rawWord;
            String pos = "";
            final int anchorAt = treeString.indexOf(rawWord + "<>");
            if (anchorAt >= 0) {
                // only look at the part of the tree that precedes the anchor
                final String beforeAnchor = treeString.substring(0, anchorAt);
                pos = beforeAnchor.substring(beforeAnchor.lastIndexOf("( ") + 2, beforeAnchor.lastIndexOf("^"));
                if (opts.train) {
                    word = rawWord.toLowerCase();
                } else if (rawWord.equals("@LEXEME1@")) {
                    return pos;
                } else {
                    word = Utils.getCutOffCorrectedMainLex(rawWord.toLowerCase(), listOfFreqWords, opts.train,
                            opts.fullLex);
                }
            }
            final boolean skipped = word.contains("*") || word.contains("0");
            if (!skipped) {
                joined.append(pos).append(" ").append(word).append("\t");
            }
        }
        return joined.toString().trim();
    }

    /**
     * Converts a string of the lexicon format into a StringTree.
     * <p>
     * The string still to be parsed is read from {@code tree.getTreeString()};
     * the method recurses over bracketed subtrees, creates the nodes with
     * {@link #makeNode}, and writes the unparsed remainder back with
     * {@code tree.setTreeString(...)}.
     * <p>
     * {@code makeNode} and the recursive call can both return {@code null} for
     * an entry they cannot build. Every such result is checked before use, so a
     * malformed entry yields {@code null} instead of a
     * {@code NullPointerException}. The same applies when a bracket is never
     * closed.
     *
     * @param tree the tree being built; carries the remaining input string
     * @param idgen source of fresh node ids
     * @return the tree with the parsed nodes added, or {@code null} if the
     *         entry cannot be converted
     */
    public static ElementaryStringTree convertToTree(ElementaryStringTree tree, IdGenerator idgen) {

        String s = tree.getTreeString().trim();
        // NOTE(review): cannot be true directly after trim(); kept as in the original
        if (s.charAt(0) == ' ') {
            s = s.substring(1);
        }
        int index = 0;
        boolean notEnd = true;
        while (s.charAt(index) != '(' && s.charAt(index) != ' ' && notEnd) {//as long as no subcategory started
            if (s.charAt(index) == ')') {//leaf
                notEnd = false;
                String catleaf = s.substring(1, index);
                catleaf = catleaf.trim();
                String[] parentAndChildren = catleaf.split(" ");
                String parent = parentAndChildren[0];
                int parentId = idgen.getNewId();
                tree = makeNode(parent, Integer.MIN_VALUE, parentId, tree);
                if (tree == null) {
                    return null;
                }

                tree.setRoot(parentId);//in recursive process: always overwrite this info! see below.
                for (int i = 1; i < parentAndChildren.length; i++) {
                    String node = parentAndChildren[i];
                    int nodeId = idgen.getNewId();
                    tree = makeNode(node, parentId, nodeId, tree);
                    if (tree == null) {
                        return null;
                    }
                }
                tree.setTreeString(s.substring(index + 1));
                return tree;
            } else {
                index++;
            }
        }
        s = s.trim();
        if (s.charAt(0) == '(') {
            s = s.substring(1);
        }
        String parent = (s.substring(0, index)).trim();//because always start after opening bracket
        String treeString = s.substring(index, s.length());
        if (parent.equals("") && treeString.length() > 0) // parse children in bracket recursively
        {
            tree.setTreeString(treeString);
            tree = convertToTree(tree, idgen);
            if (tree == null) {
                return null;
            }
            if (treeString.startsWith("  (")) {
                // wrap the parsed subtree in a new root node with an empty label
                int rootNodeId = idgen.getNewId();
                tree = makeNode("", Integer.MIN_VALUE, rootNodeId, tree);
                if (tree == null) {
                    return null;
                }
                tree.addChild(rootNodeId, tree.getRoot());
                tree.putParent(tree.getRoot(), rootNodeId);
                tree.setRoot(rootNodeId);
            }
        } // if
        else // parse sibling
        {
            treeString = treeString.trim();
            int parentId = idgen.getNewId();
            tree = makeNode(parent, Integer.MIN_VALUE, parentId, tree);
            if (tree == null) {
                return null;
            }
            while (!treeString.startsWith(")")) {
                if (treeString.startsWith("(")) // parse children in bracket recursively
                {
                    tree.setTreeString(treeString);
                    tree = convertToTree(tree, idgen); // recursion
                    if (tree == null) {
                        return null;
                    }
                    tree.putParent(tree.getRoot(), parentId); // attach parent to children
                    tree.addChild(parentId, tree.getRoot());
                    treeString = tree.getTreeString();
                    treeString = treeString.trim();
                } else // parse children within the bracket iteratively
                {
                    int blankindex = treeString.indexOf(" ");
                    int endindex = treeString.indexOf(")");
                    String child;
                    if (blankindex < endindex && blankindex > 0) {
                        child = treeString.substring(0, blankindex).trim();
                        treeString = treeString.substring(blankindex).trim();
                    } else {
                        if (endindex < 0) {
                            // unbalanced brackets: nothing sensible can be parsed from here
                            System.err.println("problem");
                            return null;
                        }
                        child = treeString.substring(0, endindex).trim();
                        treeString = treeString.substring(endindex).trim();
                    }
                    int childId = idgen.getNewId();
                    tree = makeNode(child, parentId, childId, tree);
                    if (tree == null) {
                        return null;
                    }
                }
            }
            if (treeString.startsWith(")")) {
                treeString = treeString.substring(1);
                tree.setTreeString(treeString);
                tree.setRoot(parentId);
            }
        } // else

        return tree;
    }

    /**
     * Adds one node to the tree.
     * <p>
     * The node type is derived from the raw label (see {@link #getNodeType});
     * foot and anchor nodes are registered on the tree (only the first anchor
     * is kept). An internal node whose label in front of {@code '^'} is all
     * lower-case letters is registered as subcategorised anchor. Labels
     * without a {@code '^'} are never treated as subcategorised anchors; this
     * covers the empty label that {@code convertToTree} uses for an artificial
     * root, which previously failed with a
     * {@code StringIndexOutOfBoundsException}.
     * <p>
     * The annotation, index and head markers are then stripped from the label,
     * the node is linked to its parent (if any), and its full and short
     * categories are stored.
     *
     * @param child raw node label as found in the lexicon string
     * @param parentId id of the parent node, or {@code Integer.MIN_VALUE} for none
     * @param childId id to give the new node
     * @param tree tree to add the node to
     * @return the same tree, or {@code null} if {@code tree.getIndices} rejects the label
     */
    protected static ElementaryStringTree makeNode(String child, int parentId, int childId,
            ElementaryStringTree tree) {
        tree.makeArraysBigger(childId);
        TagNodeType type = getNodeType(child);
        tree.putNodeType(childId, type);
        if (type == TagNodeType.foot) {
            tree.setFootNode(childId);
        } else if (type == TagNodeType.anchor) {
            if (tree.getAnchor() == Integer.MIN_VALUE) {
                tree.setAnchor(childId);
            }
        } else if (type == TagNodeType.internal) {
            int caret = child.indexOf("^");
            if (caret >= 0 && child.substring(0, caret).matches("[a-z]+")) {
                tree.setSubcategorisedAnchor(childId);
            }
        }
        child = removeAnnotation(child);
        child = tree.getIndices(childId, child);
        if (child == null) {
            return null; //invalid indices.
        }
        child = tree.getHeadAnnotation(childId, child);
        if (parentId != Integer.MIN_VALUE) {
            tree.addChild(parentId, childId);
            tree.putParent(childId, parentId);
        }
        tree.putFullCategory(childId, child);
        String childShort = child;
        // the short category is the part in front of the first '-' (not applied to anchors or to "--")
        if (child.matches(".*-[A-Z0-9].*") && !child.equals("--")
                && tree.getNodeType(childId) != TagNodeType.anchor) {
            childShort = child.substring(0, child.indexOf("-"));
        }
        tree.putCategory(childId, childShort);

        return tree;
    }

    /**
     * Determines the node type from a raw node label. The checks are applied
     * in this order: a label containing {@code "<>"} is an anchor, one ending
     * in {@code "!"} a substitution node, one ending in {@code "_null*"} a
     * foot node; everything else is an internal node.
     *
     * @param parent raw node label
     * @return the node type
     */
    protected static TagNodeType getNodeType(String parent) {
        if (parent.contains("<>")) {
            return TagNodeType.anchor;
        }
        if (parent.endsWith("!")) {
            return TagNodeType.subst;
        }
        if (parent.endsWith("_null*")) {
            return TagNodeType.foot;
        }
        return TagNodeType.internal;
    }

    /**
     * Strips the node-type marker from a raw node label. Only the first
     * applicable rule is used: cut at the first {@code '<'}; otherwise cut at
     * the first {@code '!'} that is not at position 0; otherwise, for foot
     * nodes, cut at the first {@code '*'} that is not at position 0.
     *
     * @param parent raw node label
     * @return the label without its marker, or the label unchanged if no rule applies
     */
    protected static String removeAnnotation(String parent) {
        final int anglePos = parent.indexOf("<");
        if (anglePos >= 0) {
            return parent.substring(0, anglePos);
        }
        final int bangPos = parent.indexOf("!");
        if (bangPos > 0) {
            return parent.substring(0, bangPos);
        }
        final int starPos = parent.indexOf("*");
        if (starPos > 0 && getNodeType(parent) == TagNodeType.foot) {
            return parent.substring(0, starPos);
        }
        return parent;
    }

    /**
     * Reads the lexicon stored in the given file.
     *
     * @param filename path of the lexicon file
     * @return the ARG entries at index 0 and the MOD entries at index 1
     */
    protected MultiValueMap[] read(String filename) {
        String[] lines = Utils.readLines(filename);
        return read(lines);
    }

    /**
     * Reads the lexicon lines and sorts entries by their type (arg or mod).
     * For each of those types, it creates a MultiValueMap that's keyed on the lexeme, and whose
     * values are the Strings that represent the trees.
     * <p>
     * Each line is expected to have at least four tab-separated fields after
     * {@code Utils.getCatInventory} has been applied: frequency, lexeme, type
     * ({@code ARG} or {@code MOD}) and tree string. Lines with fewer fields are
     * reported on standard error and skipped; the field count is checked before
     * any field is accessed.
     * <p>
     * Side effects: updates the static {@code biWordMap} as well as
     * {@code posTagNo} and {@code noOfTrees}.
     *
     * @param lines raw lexicon lines
     * @return a MultiValueMap Array, with the arg string lexicon in first position, and
     * mod string lexicon in second position.
     */
    protected MultiValueMap<String, ?>[] read(String[] lines) {
        MultiValueMap<String, String> modLexentriesString = new MultiValueMap();
        MultiValueMap<String, String> argLexentriesString = new MultiValueMap();
        for (String line : lines) {
            String[] lexcontent = Utils.getCatInventory(line.trim(), opts.combineNNVBcats).split("\t+");
            if (lexcontent.length < 4) {
                // validate before lexcontent[3] is touched anywhere below
                System.err.println("Incorrect Lexicon format: line " + line);
                continue;
            }
            int freq = Integer.parseInt(lexcontent[0]);
            if (lexcontent[3].contains("<>")) {
                // the anchor is the last blank-separated token in front of the first "<>"
                String endswithLex = lexcontent[3].substring(0, lexcontent[3].indexOf("<>"));
                String anchor = endswithLex.substring(endswithLex.lastIndexOf(" ") + 1);
                if (!biWordMap.containsKey(anchor)) {
                    biWordMap.put(anchor, freq);
                } else {
                    biWordMap.put(anchor, biWordMap.get(anchor) + freq);
                }
                if (lexcontent[3].contains("1_1)")) {
                    // also count the anchor together with the token that ends just before "1_1)"
                    endswithLex = lexcontent[3].substring(0, lexcontent[3].indexOf("1_1)") - 1);
                    anchor += "%" + endswithLex.substring(endswithLex.lastIndexOf(" ") + 1);
                    if (!biWordMap.containsKey(anchor)) {
                        biWordMap.put(anchor, freq);
                    } else {
                        biWordMap.put(anchor, biWordMap.get(anchor) + freq);
                    }
                    anchor = "UNK%" + endswithLex.substring(endswithLex.lastIndexOf(" ") + 1);
                    if (!biWordMap.containsKey(anchor)) {
                        biWordMap.put(anchor, 1);
                    } else {
                        biWordMap.put(anchor, biWordMap.get(anchor) + 1);
                    }
                    if (!biWordMap.containsKey("UNK")) {
                        biWordMap.put("UNK", 1);
                    } else {
                        biWordMap.put("UNK", biWordMap.get("UNK") + 1);
                    }
                }

            }
            lexcontent = lexEntryRemoveDigits(lexcontent);
            int baumAnz = Integer.parseInt(lexcontent[0]);
            // the stored flag is "0" for frequency 1; otherwise "1" unless opts.freqBaseline keeps the raw count
            if (lexcontent[0].equals("1")) {
                lexcontent[0] = "0";
            } else if (!opts.freqBaseline) {
                lexcontent[0] = "1";
            }
            String word = lexcontent[1];
            String val = lexcontent[0] + "\t" + lexcontent[3];
            if (lexcontent[2].equals("ARG")) {
                // NUM entries are only added once per distinct value
                if (!lexcontent[1].equals("NUM") || !argLexentriesString.containsValue(word, val)) {
                    argLexentriesString.put(word, val);
                }
            } else if (lexcontent[2].equals("MOD")) {
                if (!lexcontent[1].equals("NUM") || !modLexentriesString.containsValue(word, val)) {
                    modLexentriesString.put(word, val);
                }
            } else {
                System.err.println("Incorrect Lexicon format: line " + line);
            }
            String posword = getPosFromTreeString(lexcontent[3], word).toLowerCase();
            if (posTagNo.containsKey(posword)) {
                posTagNo.put(posword, posTagNo.get(posword) + baumAnz);
            } else {
                posTagNo.put(posword, baumAnz);
            }
            if (noOfTrees.containsKey(val)) {
                this.noOfTrees.put(val, noOfTrees.get(val) + baumAnz);
            } else {
                this.noOfTrees.put(val, baumAnz);
            }
        }

        MultiValueMap<String, String>[] entries = new MultiValueMap[2];
        entries[0] = argLexentriesString;
        entries[1] = modLexentriesString;
        return entries;
    }

    /**
     * Replaces digit runs in the leaf tokens of the tree string
     * ({@code lexcontent[3]}) by {@code NUM}, and applies the same replacement
     * to the lexeme ({@code lexcontent[1]}). The array is modified in place.
     * <p>
     * The tree string is cut at every {@code ')'}; in each piece only the last
     * blank-separated token is considered, without any {@code ^...} suffix,
     * which is re-attached afterwards.
     *
     * @param lexcontent fields of one lexicon line; index 1 is the lexeme and
     *        index 3 the tree string
     * @return the same array, with indices 1 and 3 possibly rewritten
     */
    protected String[] lexEntryRemoveDigits(String[] lexcontent) {

        // separate adjacent closing brackets so that the pieces between them are not lost by split
        lexcontent[3] = lexcontent[3].replaceAll("[)][)]", ") )");
        String[] lexemes = lexcontent[3].split("[)]");
        String resultsentence = "";
        for (String l : lexemes) {
            if (l.contains(" ")) {
                // w is the last token of the piece, including its leading blank
                String w = l.substring(l.lastIndexOf(" "), l.length());
                String indeces = "";
                if (w.contains("^")) {
                    // set the "^..." suffix aside; it is appended again unchanged
                    indeces = w.substring(w.indexOf("^"));
                    w = w.substring(0, w.indexOf("^"));
                }
                String pref = l.substring(0, l.lastIndexOf(" "));
                if (w.contains("*")) {
                    // tokens containing '*' are left untouched
                } else if (digits.matcher(w).matches() && !(w.equals(" 0") && pref.contains(":"))
                        && !w.contains("@")) {
                    // Each toNUM call replaces one digit run, so up to three runs per token are replaced.
                    // A " 0" token is kept when the text before it contains ':'.
                    w = toNUM(w);
                    lexcontent[1] = toNUM(lexcontent[1]);
                    if (digits.matcher(w).matches() && !(w.equals(" 0") && pref.contains(":"))) {
                        w = toNUM(w);
                        lexcontent[1] = toNUM(lexcontent[1]);
                        if (digits.matcher(w).matches() && !(w.equals(" 0") && pref.contains(":"))) {
                            w = toNUM(w);
                            lexcontent[1] = toNUM(lexcontent[1]);
                        }
                    }
                }
                resultsentence += pref + w + indeces + ")";
            } else {
                resultsentence += l + ")";
            }
        }
        // one more closing bracket is appended after all pieces
        // NOTE(review): presumably compensates for trailing pieces dropped by split - confirm
        resultsentence += ")";
        // undo the bracket separation introduced at the top
        resultsentence = resultsentence.replaceAll("[)] [)]", "))");
        lexcontent[3] = resultsentence;
        return lexcontent;
    }

    /**
     * Substitutes {@code NUM} for the digit run matched by the {@code digits} pattern,
     * keeping the text captured before and after it.
     *
     * @param w the token to rewrite
     * @return the rewritten token, or {@code w} unchanged when the pattern finds no match
     */
    private String toNUM(String w) {
        return digits.matcher(w).replaceAll("$1NUM$2");
    }

    /**
     * Tells whether the tree lexicon has an entry stored under the given key.
     *
     * @param word the lookup key
     * @return {@code true} if {@code lexEntriesTree} contains the key
     */
    public boolean containsKey(String word) {
        return lexEntriesTree.containsKey(word);
    }

    /**
     * Collects the lexicalised elementary trees available for a word.
     * <p>
     * Lookup proceeds in up to three stages, each adding only trees whose tree string (minus
     * its first two characters) has not been seen yet:
     * <ol>
     * <li>Entries stored under {@code wCor}, or under an unknown-word key when {@code wCor}
     * is not in the lexicon ({@code "unk"}, or with gold POS tags each lower-cased tag
     * followed by {@code " unk"}). Returns if more than 20 trees were found.</li>
     * <li>If tree families are enabled, entries stored under the candidate POS tags, kept only
     * when their recorded count exceeds 100. Returns if more than 6 trees were found.</li>
     * <li>Entries stored under the remaining candidate POS tags, without a count
     * threshold.</li>
     * </ol>
     *
     * @param word            the surface form inserted into the tree templates
     * @param wCor            the key looked up in the lexicon
     * @param posTag          the POS tag of the word; tab-separated when several tags are given
     * @param noAnalysisParse when {@code true}, rarely seen trees are not filtered out
     * @param wno             position of the word; 0 disables the upper-case filter
     * @return the collected trees; empty when the lexicon is empty or no key matches
     */
    @SuppressWarnings("unchecked")
    public Collection<ElementaryStringTree> getEntries(String word, String wCor, String posTag,
            boolean noAnalysisParse, int wno) {
        Collection<ElementaryStringTree> treesOut = new ArrayList<ElementaryStringTree>();
        Collection<String> treeStrings = new ArrayList<String>();
        if (lexEntriesTree.isEmpty()) {
            return treesOut;
        }
        String searchWord = wCor;
        if (!lexEntriesTree.containsKey(searchWord)) {
            // Fall back to the unknown-word entry.
            searchWord = "";
            if (opts.goldPosTags) {
                for (String w : posTag.split("\t")) {
                    searchWord += w.toLowerCase() + " unk";
                }
            } else {
                searchWord += "unk";
            }
        }
        if (!lexEntriesTree.containsKey(searchWord)) // TODO: FIX
        {
            return treesOut;
        }

        // Stage 1: trees stored directly under the search key.
        for (String treeString : (Collection<String>) lexEntriesTree.getCollection(searchWord)) {
            if (treeString.contains("LEXEME1")) {
                // Take the POS tag from the text immediately preceding the first lexeme slot.
                String beforeLexeme = treeString.substring(0, treeString.indexOf(" @LEXEME1@"));
                posTag = beforeLexeme.substring(beforeLexeme.lastIndexOf("(") + 2,
                        beforeLexeme.lastIndexOf("^"));
            }
            String sts = posTag + "\t" + treeString.substring(treeString.indexOf("\t") + 1);
            // With a lexicon of more than 100 keys, skip trees containing "^x" that were
            // counted fewer than 3 times (or never), unless the caller asked to keep them.
            if (!noAnalysisParse && this.lexEntriesTree.size() > 100 && treeString.contains("^x")
                    && (!noOfTrees.containsKey(sts) || noOfTrees.get(sts) < 3)) {
                continue;
            }
            if (num.matcher(word).matches() && !posTag.equals("CD")) {
                continue;
            }
            if (!word.contains("NUM") && upperCase.matcher(word).matches() && wno != 0
                    && (!posTag.startsWith("NN") && !posTag.startsWith("JJ")) && !word.equals("I")) {
                continue;
            }
            addLexicalizedTreeIfNew(word, treeString, treesOut, treeStrings);
        }
        if (treesOut.size() > 20) {
            return treesOut;
        }

        // Stage 2: tree families, i.e. trees stored under the candidate POS tags.
        HashMap<String, Integer> posTags = new HashMap<String, Integer>();
        if (!posTag.equals("") && !posTag.equals("N/A") && opts.treeFamilies
                && !searchWord.equals("unk")) { // don't do this for prediction trees.
            if (!opts.goldPosTags) {
                posTags = getPosTags(treeStrings, searchWord);
            } else {
                posTags.put(posTag, 1);
            }
            if (posTags.size() > 1) {
                for (String ptag : posTags.keySet()) {
                    if (!lexEntriesTree.containsKey(ptag)) {
                        continue;
                    }
                    for (String treeString : (Collection<String>) lexEntriesTree.getCollection(ptag)) {
                        ElementaryStringTree tree = makeToStringTree(insertLex(word, treeString),
                                treeString);
                        if (tree == null) {
                            // Must be checked before the tree is dereferenced below.
                            continue;
                        }
                        String ts = tree.getTreeString().substring(2);
                        String countKey = ptag + "\t"
                                + treeString.substring(treeString.indexOf("\t") + 1);
                        // A tree with no recorded count cannot exceed the threshold.
                        if (!treeStrings.contains(ts) && noOfTrees.containsKey(countKey)
                                && noOfTrees.get(countKey) > 100) {
                            treesOut.add(tree);
                            treeStrings.add(ts);
                        }
                    }
                }
            }
        }
        if (treesOut.size() > 6) {
            return treesOut;
        }

        // Stage 3: correct for bad gold pos tag.
        if (!searchWord.equals("prediction: ") && opts.goldPosTags && opts.fullLex) {
            posTags = getPosTags(treeStrings, searchWord);
        }
        posTags.remove(searchWord);
        for (String sw : posTags.keySet()) {
            if (!lexEntriesTree.containsKey(sw)) {
                continue;
            }
            for (String treeString : (Collection<String>) lexEntriesTree.getCollection(sw)) {
                addLexicalizedTreeIfNew(word, treeString, treesOut, treeStrings);
            }
        }
        return treesOut;
    }

    /**
     * Lexicalises one tree template with {@code word} and appends the resulting tree to
     * {@code treesOut} unless the tree could not be built or was already collected.
     */
    private void addLexicalizedTreeIfNew(String word, String unlexTreeString,
            Collection<ElementaryStringTree> treesOut, Collection<String> seenTreeStrings) {
        ElementaryStringTree tree = makeToStringTree(insertLex(word, unlexTreeString), unlexTreeString);
        if (tree == null) {
            return;
        }
        String ts = tree.getTreeString().substring(2);
        if (!seenTreeStrings.contains(ts)) {
            treesOut.add(tree);
            seenTreeStrings.add(ts);
        }
    }

    /**
     * Determines the candidate POS tags for a word from the tree strings collected so far.
     * <p>
     * Each distinct tag read from the tree strings is given the count stored in
     * {@code posTagNo} under {@code "<lower-cased tag> <searchWord>"}, or 1 when no count is
     * stored. Tags whose count times 50 is still below the highest count are dropped.
     *
     * @param treeStrings tree strings from which POS tags are extracted
     * @param searchWord  the lexicon key the tree strings were found under
     * @return a map from POS tag to its count, without the comparatively rare tags
     */
    protected HashMap<String, Integer> getPosTags(Collection<String> treeStrings, String searchWord) {
        HashMap<String, Integer> tagCounts = new HashMap<String, Integer>();
        int highest = 0;
        for (String treeString : treeStrings) {
            String tag = getPosFromTreeString(treeString, "@LEXEME1@");
            if (tagCounts.containsKey(tag)) {
                continue;
            }
            Integer stored = posTagNo.get(tag.toLowerCase() + " " + searchWord);
            int count = (stored == null) ? 1 : stored;
            tagCounts.put(tag, count);
            highest = Math.max(highest, count);
        }
        // Iterate over a snapshot of the keys so that entries can be removed from the map.
        for (String tag : new ArrayList<String>(tagCounts.keySet())) {
            if (tagCounts.get(tag) * 50 < highest) {
                tagCounts.remove(tag);
            }
        }
        return tagCounts;
    }

    /**
     * Fills the lexeme slots of a tree template with the given word(s).
     * <p>
     * {@code word} is split on tabs; the i-th part replaces every occurrence of
     * {@code @LEXEMEi@<>} (1-based). If a part contains a space, only the text after its first
     * space is inserted. Any numbered slot still left afterwards is replaced by
     * {@code *^x_x}.
     * <p>
     * The leftover slots are replaced in a single pass, so the method always terminates: text
     * containing "LEXEME" that is not a well-formed {@code @LEXEMEn@<>} slot (for example
     * when the inserted word itself is "LEXEME") is left as it is.
     *
     * @param word tab-separated word(s), each optionally prefixed by text ending in a space
     * @param tree the tree template containing {@code @LEXEMEn@<>} slots
     * @return the tree string with all well-formed slots filled
     */
    protected String insertLex(String word, String tree) {
        String treeString = tree;
        int id = 1;
        for (String w : word.split("\t")) {
            int space = w.indexOf(" ");
            if (space >= 0) {
                w = w.substring(space + 1);
            }
            treeString = treeString.replace("@LEXEME" + id + "@<>", w + "<>");
            id++;
        }
        // Cheap substring test first, so that the regex only runs when a slot may remain.
        if (treeString.contains("LEXEME")) {
            treeString = treeString.replaceAll("@LEXEME[0-9]+@<>", "*^x_x");
        }
        return treeString;
    }

    /**
     * Drops tree templates that are associated with exactly one word.
     * <p>
     * For every such template the value {@code "0\t" + template} is removed from that word's
     * lexicon entries; when that removal reports a non-null result, the template is also
     * removed from {@code trees}.
     */
    public void lexiconReduce() {
        ArrayList<String> obsoleteTemplates = new ArrayList<String>();
        for (String tree : trees.keySet()) {
            Collection<String> owners = trees.getCollection(tree);
            if (owners.size() != 1) {
                continue;
            }
            String onlyWord = owners.iterator().next();
            String removed = (String) this.lexEntriesTree.remove(onlyWord, "0\t" + tree);
            if (removed != null) {
                // Deleted after the loop so that the key set is not modified while iterating.
                obsoleteTemplates.add(tree);
            }
        }
        for (String tree : obsoleteTemplates) {
            trees.remove(tree);
        }
    }

    /**
     * Prints the words that most often share tree templates with the given lexicon key,
     * followed by all lexicon entries stored for those words.
     * <p>
     * Only entries of {@code string} with a recorded count of at least 5 are considered. Words
     * containing {@code " unk"} or containing {@code string} itself are ignored. A word only
     * becomes a candidate once it has been seen at least twice. Keys without any mapping are
     * treated as having no entries, so an unknown key prints two empty collections.
     *
     * @param string the lexicon key whose most similar words are printed to standard output
     */
    @SuppressWarnings("unchecked")
    public void getFamily(String string) {
        HashMap<String, Integer> similars = new HashMap<String, Integer>();
        int most = 0;
        HashSet<String> mostSimilar = new HashSet<String>();
        // getCollection returns null for a key without mappings.
        Collection<String> ownEntries = (Collection<String>) lexEntriesTree.getCollection(string);
        if (ownEntries != null) {
            for (String tree : ownEntries) {
                if (!this.noOfTrees.containsKey(tree) || noOfTrees.get(tree) < 5) {
                    continue;
                }
                String t = tree.substring(tree.indexOf("\t") + 1);
                Collection<String> sharers = trees.getCollection(t);
                if (sharers == null) {
                    continue;
                }
                for (String assoc : sharers) {
                    if (assoc.contains(" unk") || assoc.contains(string)) {
                        continue;
                    }
                    if (!similars.containsKey(assoc)) {
                        similars.put(assoc, 1);
                        continue;
                    }
                    int newNum = similars.get(assoc) + 1;
                    similars.put(assoc, newNum);
                    if (newNum > most) {
                        // New leader: forget the previous best words.
                        most = newNum;
                        mostSimilar.clear();
                    }
                    if (newNum == most) {
                        mostSimilar.add(assoc);
                    }
                }
            }
        }
        HashSet<String> simtrees = new HashSet<String>();
        for (String mostSimWords : mostSimilar) {
            Collection<String> entries = (Collection<String>) lexEntriesTree.getCollection(mostSimWords);
            if (entries != null) {
                simtrees.addAll(entries);
            }
        }
        System.out.println(mostSimilar + "\t");
        System.out.print(simtrees.toString() + "\n");
    }

    /**
     * Stores the number of keys in {@code trees} in {@code numOfTreeTemplates} and then
     * discards the {@code trees} map by setting it to {@code null}.
     */
    public void removeHelps() {
        // Record the count first; it cannot be read once the map reference is dropped.
        numOfTreeTemplates = trees.keySet().size();
        trees = null;
    }

    /**
     * Returns the current value of {@code numOfTreeTemplates}.
     *
     * @return the stored number of tree templates
     */
    public static int getNumOfTreeTemps() {
        final int templateCount = numOfTreeTemplates;
        return templateCount;
    }

    /**
     * Returns the size of the lexicon, measured as the number of keys in {@code wordPosMap}.
     *
     * @return the number of distinct keys in the word/POS map
     */
    public int getLexSize() {
        return wordPosMap.keySet().size();
    }

    /**
     * Looks up the count stored in {@code posTagNo} for the given key.
     *
     * @param word the key to look up
     * @return the stored count, or 0 when the key is not present
     */
    public int getPosTagNo(String word) {
        return this.posTagNo.containsKey(word) ? this.posTagNo.get(word) : 0;
    }

    /**
     * Returns the values stored in {@code wordPosMap} for the lower-cased word.
     *
     * @param word the word to look up; it is lower-cased before the lookup
     * @return whatever {@code wordPosMap.getCollection} yields for the lower-cased word
     */
    public Collection<String> getPOSs(String word) {
        final String key = word.toLowerCase();
        return wordPosMap.getCollection(key);
    }

    /**
     * Computes the ratio of two values stored in {@code biWordMap}: the value for
     * {@code category + "%" + predictedLex} divided by the value for {@code category}. When the
     * combined key is absent, the {@code "UNK%" + predictedLex} and {@code "UNK"} entries are
     * used instead.
     *
     * @param category     the category part of the combined key
     * @param predictedLex the lexeme part of the combined key
     * @return numerator value divided by denominator value
     */
    public static double lexWord(String category, String predictedLex) {
        final String pairKey = category + "%" + predictedLex;
        final boolean known = biWordMap.containsKey(pairKey);
        final String numeratorKey = known ? pairKey : "UNK%" + predictedLex;
        final String denominatorKey = known ? category : "UNK";
        double numerator = biWordMap.get(numeratorKey).doubleValue();
        double denominator = biWordMap.get(denominatorKey).doubleValue();
        return numerator / denominator;
    }

    /**
     * Gathers, for every key of {@code wordPosMap}, the elementary trees whose string form
     * contains {@code " " + category + "^"}.
     * <p>
     * Trees are obtained through {@link #getEntries(String, String, String, boolean, int)},
     * using the first value stored for the key as the lookup key and that value's text before
     * its first space as the POS tag.
     *
     * @param category the category label to search for
     * @return a multi-map from {@code wordPosMap} key to the matching trees
     */
    public MultiValueMap<String, ElementaryStringTree> getLexEntriesContaining(String category) {
        final String needle = " " + category + "^";
        MultiValueMap<String, ElementaryStringTree> matches = new MultiValueMap();
        for (String word : wordPosMap.keySet()) {
            String firstWordPos = wordPosMap.getCollection(word).iterator().next();
            String tag = firstWordPos.split(" ")[0];
            for (ElementaryStringTree candidate : getEntries(word, firstWordPos, tag, false, 0)) {
                if (candidate.toString().contains(needle)) {
                    matches.put(word, candidate);
                }
            }
        }
        return matches;
    }

    /**
     * Returns what {@code rootNodeTreeMap} stores for the given root category.
     *
     * @param rootCategory the key looked up in {@code rootNodeTreeMap}
     * @return the stored value, cast to a collection of string pairs
     */
    public Collection<Pair<String, String>> getTreeWithRootCategory(String rootCategory) {
        Collection<Pair<String, String>> matches =
                (Collection<Pair<String, String>>) rootNodeTreeMap.get(rootCategory);
        return matches;
    }

    /**
     * Returns the raw lexicon entries stored under the given key.
     *
     * @param word the lookup key
     * @return whatever {@code lexEntriesTree.getCollection} yields for the key
     */
    protected Collection<?> getEntries(String word) {
        final Collection<?> rawEntries = lexEntriesTree.getCollection(word);
        return rawEntries;
    }
}