edu.cmu.cs.lti.ark.fn.identification.FrameIdentificationRelease.java Source code

Introduction

Here is the source code for edu.cmu.cs.lti.ark.fn.identification.FrameIdentificationRelease.java
Source

/*******************************************************************************
 * Copyright (c) 2011 Dipanjan Das 
 * Language Technologies Institute, 
 * Carnegie Mellon University, 
 * All Rights Reserved.
 * 
 * FrameIdentificationRelease.java is part of SEMAFOR 2.0.
 * 
 * SEMAFOR 2.0 is free software: you can redistribute it and/or modify  it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or 
 * (at your option) any later version.
 * 
 * SEMAFOR 2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details. 
 * 
 * You should have received a copy of the GNU General Public License along
 * with SEMAFOR 2.0.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package edu.cmu.cs.lti.ark.fn.identification;

import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.io.Files;
import edu.cmu.cs.lti.ark.fn.data.prep.formats.AllLemmaTags;
import edu.cmu.cs.lti.ark.fn.data.prep.formats.Sentence;
import edu.cmu.cs.lti.ark.fn.data.prep.formats.Token;
import edu.cmu.cs.lti.ark.util.ds.Pair;
import gnu.trove.TObjectDoubleHashMap;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.List;

import static com.google.common.base.Strings.nullToEmpty;
import static java.lang.Math.exp;
import static org.apache.commons.io.IOUtils.closeQuietly;

public class FrameIdentificationRelease {
    public static Pair<IdFeatureExtractor, TObjectDoubleHashMap<String>> parseParamFile(String paramsFile)
            throws IOException {
        final List<String> lines = Files.readLines(new File(paramsFile), Charsets.UTF_8);
        final IdFeatureExtractor featureExtractor = IdFeatureExtractor.fromName(lines.get(0));
        final TObjectDoubleHashMap<String> model = readModel(lines.subList(1, lines.size()));
        return Pair.of(featureExtractor, model);
    }

    public static TObjectDoubleHashMap<String> readModel(Collection<String> featureLines) {
        final TObjectDoubleHashMap<String> model = new TObjectDoubleHashMap<String>(featureLines.size());
        int count = 0;
        for (String line : featureLines) {
            String[] fields = line.split("\t");
            final String featureName = fields[0].trim();
            final Double featureValue = Double.parseDouble(fields[1].trim());
            model.put(featureName, featureValue);
            count++;
            if (count % 100000 == 0)
                System.err.print(count + " ");
        }
        return model;
    }

    public static TObjectDoubleHashMap<String> readOldModel(String idParamsFile) throws IOException {
        TObjectDoubleHashMap<String> params = new TObjectDoubleHashMap<String>();
        int count = 0;
        String line;
        final BufferedReader input = Files.newReader(new File(idParamsFile), Charsets.UTF_8);
        try {
            while ((line = input.readLine()) != null) {
                final String[] nameAndVal = line.split("\t");
                final String[] logAndSign = nameAndVal[1].split(", ");
                final double value = exp(Double.parseDouble(logAndSign[0]));
                final double sign = Boolean.parseBoolean(logAndSign[1]) ? 1.0 : -1.0;
                params.put(nameAndVal[0], value * sign);
                count++;
                if (count % 100000 == 0)
                    System.err.print(count + " ");
            }
        } finally {
            closeQuietly(input);
        }
        return params;
    }

    public static Pair<String, String> getTokenRepresentation(String tokNum, String parse) {
        String[] tokNums = tokNum.split("_");
        List<Integer> indices = Lists.newArrayList();
        for (String tokNum1 : tokNums) {
            indices.add(Integer.parseInt(tokNum1));
        }
        return getTokenRepresentation(indices, Sentence.fromAllLemmaTagsArray(AllLemmaTags.readLine(parse)));
    }

    public static Pair<String, String> getTokenRepresentation(List<Integer> indices, Sentence sentence) {
        final List<Token> tokens = sentence.getTokens();
        List<String> actualTokens = Lists.newArrayList();
        for (int i : indices) {
            actualTokens.add(tokens.get(i).getForm());
        }
        return Pair.of(getFirstTokenAndCpos(indices, sentence), Joiner.on(" ").join(actualTokens));
    }

    public static String getFirstTokenAndCpos(List<Integer> indices, Sentence sentence) {
        if (indices.isEmpty())
            return "";
        final Token firstToken = sentence.getTokens().get(indices.get(0));
        return firstToken.getForm().toLowerCase() + "."
                + nullToEmpty(firstToken.getPostag()).substring(0, 1).toLowerCase();

    }
}