edu.cmu.cs.lti.ark.fn.evaluation.PrepareFullAnnotationJson.java Source code

Java tutorial

Introduction

Here is the source code for edu.cmu.cs.lti.ark.fn.evaluation.PrepareFullAnnotationJson.java

Source

/*******************************************************************************
 * Copyright (c) 2011 Dipanjan Das 
 * Language Technologies Institute,
 * Carnegie Mellon University, 
 * All Rights Reserved.
 * 
 * PrepareFullAnnotationJson.java is part of SEMAFOR 2.0.
 * 
 * SEMAFOR 2.0 is free software: you can redistribute it and/or modify  it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or 
 * (at your option) any later version.
 * 
 * SEMAFOR 2.0 is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details. 
 * 
 * You should have received a copy of the GNU General Public License along
 * with SEMAFOR 2.0.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package edu.cmu.cs.lti.ark.fn.evaluation;

import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.collect.*;
import edu.cmu.cs.lti.ark.fn.parsing.RankedScoredRoleAssignment;
import edu.cmu.cs.lti.ark.fn.parsing.SemaforParseResult;
import edu.cmu.cs.lti.ark.util.ds.*;

import javax.annotation.Nullable;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import static com.google.common.collect.ImmutableList.copyOf;
import static com.google.common.collect.Lists.transform;
import static edu.cmu.cs.lti.ark.fn.parsing.SemaforParseResult.Frame;
import static edu.cmu.cs.lti.ark.fn.parsing.SemaforParseResult.Frame.NamedSpanSet;
import static edu.cmu.cs.lti.ark.fn.parsing.SemaforParseResult.Frame.Span;
import static edu.cmu.cs.lti.ark.fn.utils.DataPointWithFrameElements.FrameElementAndSpan;
import static edu.cmu.cs.lti.ark.util.IntRanges.xrange;
import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.commons.io.IOUtils.readLines;

/**
 * Collates intermediate output files into a final output file
 * Writes one json object (representing a Semafor-parsed sentence) per line
 * @see edu.cmu.cs.lti.ark.fn.parsing.SemaforParseResult
 *
 * @author Sam Thomson (sthomson@cs.cmu.edu)
 */
public class PrepareFullAnnotationJson {
    private static final Function<RankedScoredRoleAssignment, Integer> getSentenceIndex = new Function<RankedScoredRoleAssignment, Integer>() {
        @Nullable
        @Override
        public Integer apply(RankedScoredRoleAssignment input) {
            return input.sentenceIdx;
        }
    };
    public static final Function<String, RankedScoredRoleAssignment> processPredictionLine = new Function<String, RankedScoredRoleAssignment>() {
        @Nullable
        @Override
        public RankedScoredRoleAssignment apply(@Nullable String input) {
            return RankedScoredRoleAssignment.fromPredictionLine(input);
        }
    };
    private static final Function<RankedScoredRoleAssignment, Range0Based> getTargetSpan = new Function<RankedScoredRoleAssignment, Range0Based>() {
        @Override
        public Range0Based apply(RankedScoredRoleAssignment input) {
            return input.targetSpan;
        }
    };

    /**
     * Generates the json representation of a set of predicted semantic parses
     * 
     * @param args Options to specify:
     *   testFEPredictionsFile
     *   testParseFile 
     *   testTokenizedFile 
     *   outputFile
     * @see #writeJsonForPredictions
     */
    public static void main(String[] args) throws Exception {
        ParseOptions options = new ParseOptions(args);
        writeJsonForPredictions(options.testFEPredictionsFile, options.testTokenizedFile, options.outputFile);
    }

    /**
     * Generates the JSON representation of a set of predicted semantic parses
     *
     * @param testFEPredictionsFile Path to MapReduce output of the parser, formatted as frame elements lines
     * @param testTokenizedFile File Original form of each sentence in the data
     * @param outputFile Where to store the resulting json
     */
    public static void writeJsonForPredictions(String testFEPredictionsFile, String testTokenizedFile,
            String outputFile) throws Exception {
        final FileReader tokenizedInput = new FileReader(testTokenizedFile);
        try {
            final FileReader feInput = new FileReader(testFEPredictionsFile);
            try {
                final BufferedWriter output = new BufferedWriter(new FileWriter(new File(outputFile)));
                try {
                    writeJsonForPredictions(tokenizedInput, feInput, output);
                } finally {
                    closeQuietly(output);
                }
            } finally {
                closeQuietly(feInput);
            }
        } finally {
            closeQuietly(tokenizedInput);
        }
    }

    public static void writeJsonForPredictions(Reader tokenizedInput, Reader frameElementsInput, Writer output)
            throws IOException {
        final List<String> tokenizedLines = readLines(tokenizedInput);
        final Multimap<Integer, RankedScoredRoleAssignment> predictions = parseRoleAssignments(
                readLines(frameElementsInput));
        writeJson(predictions, tokenizedLines, output);
    }

    /**
     * Reads predicted frame elements from testFEPredictionsFile and groups them by sentence index
     *
     * @param lines the predicted frame elements
     * @return a map from sentence num to a set of predicted frame elements for that sentence
     * @throws IOException if there is a problem reading from the file
     */
    public static Multimap<Integer, RankedScoredRoleAssignment> parseRoleAssignments(List<String> lines) {
        final List<RankedScoredRoleAssignment> roleAssignments = copyOf(transform(lines, processPredictionLine));
        // group by sentence index
        return Multimaps.index(roleAssignments, getSentenceIndex);
    }

    private static void writeJson(Multimap<Integer, RankedScoredRoleAssignment> predictions,
            List<String> tokenizedLines, Writer output) throws IOException {
        for (int i : xrange(tokenizedLines.size())) {
            final Collection<RankedScoredRoleAssignment> predictionsForSentence = predictions.get(i);
            final List<String> tokens = Arrays.asList(tokenizedLines.get(i).split(" "));
            final SemaforParseResult semaforParseResult = getSemaforParse(predictionsForSentence, tokens);
            output.write(semaforParseResult.toJson() + "\n");
        }
    }

    /**
     * Given predicted frame instances, including their frame elements, create a SemaforParseResult ready to be serialized to
     * JSON
     *
     * @param rankedScoredRoleAssignments Lines encoding predicted frames & FEs in the same format as the .sentences.frame.elements files
     */
    public static SemaforParseResult getSemaforParse(
            Collection<RankedScoredRoleAssignment> rankedScoredRoleAssignments, List<String> tokens) {
        final ArrayList<Frame> frames = Lists.newArrayList();
        // group by target span (assumes only one predicted frame per target span)
        final ImmutableListMultimap<Range0Based, RankedScoredRoleAssignment> predictionsByFrame = Multimaps
                .index(rankedScoredRoleAssignments, getTargetSpan);
        for (Range0Based targetSpan : predictionsByFrame.keySet()) {
            final List<RankedScoredRoleAssignment> predictionsForFrame = predictionsByFrame.get(targetSpan);
            final RankedScoredRoleAssignment first = predictionsForFrame.get(0);
            final NamedSpanSet target = makeSpan(first.targetSpan.start, first.targetSpan.end + 1, first.frame,
                    tokens);
            final List<Frame.ScoredRoleAssignment> scoredRoleAssignments = Lists.newArrayList();
            for (RankedScoredRoleAssignment ra : predictionsForFrame) {
                // extract frame elements
                final List<FrameElementAndSpan> frameElementsAndSpans = ra.fesAndSpans;
                final List<NamedSpanSet> frameElements = Lists.newArrayList();
                for (FrameElementAndSpan frameElementAndSpan : frameElementsAndSpans) {
                    final Range0Based range = frameElementAndSpan.span;
                    frameElements.add(makeSpan(range.start, range.end + 1, frameElementAndSpan.name, tokens));
                }
                scoredRoleAssignments.add(new Frame.ScoredRoleAssignment(ra.rank, ra.score, frameElements));
            }
            frames.add(new Frame(target, scoredRoleAssignments));
        }
        return new SemaforParseResult(frames, tokens);
    }

    private static NamedSpanSet makeSpan(int start, int end, String name, List<String> tokens) {
        final ImmutableList<Span> spans = ImmutableList
                .of(new Span(start, end, Joiner.on(" ").join(tokens.subList(start, end))));
        return new NamedSpanSet(name, spans);
    }
}