gate.crowdsource.ne.EntityAnnotationResultsImporter.java Source code

Introduction

Here is the source code for gate.crowdsource.ne.EntityAnnotationResultsImporter.java
Source

/*
 *  EntityAnnotationResultsImporter.java
 *
 *  Copyright (c) 1995-2014, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 3, June 2007 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *  
 *  $Id$
 */
package gate.crowdsource.ne;

import static gate.crowdsource.CrowdFlowerConstants.*;

import java.util.List;

import org.apache.log4j.Logger;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;

import gate.Annotation;
import gate.AnnotationSet;
import gate.FeatureMap;
import gate.Resource;
import gate.Utils;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.crowdsource.rest.CrowdFlowerClient;
import gate.util.InvalidOffsetException;

/**
 * Language analyser that fetches the judgments of a CrowdFlower entity
 * annotation job and turns them into annotations on the current document.
 *
 * <p>
 * For every snippet annotation carrying a
 * <code>&lt;resultAnnotationType&gt;_unit_id</code> feature, the judgments of
 * that unit are requested through a {@link CrowdFlowerClient}. Each judgment's
 * <code>answer</code> array is read as a list of indices into the tokens
 * contained in the snippet (in document order), and a result annotation is
 * created for each run of consecutive indices (or for each index individually
 * when <code>annotateSpans</code> is false). A span is skipped when the result
 * set already holds an annotation for the same judgment ID inside it.
 * </p>
 */
@CreoleResource(name = "Entity Annotation Results Importer", comment = "Import judgments from a CrowdFlower job created by "
        + "the Entity Annotation Job Builder as GATE annotations.", helpURL = "http://gate.ac.uk/userguide/sec:crowd:annotation:import")
public class EntityAnnotationResultsImporter extends AbstractLanguageAnalyser {

    private static final long serialVersionUID = 3424823295729835240L;

    private static final Logger log = Logger.getLogger(EntityAnnotationResultsImporter.class);

    /** CrowdFlower API key; must be non-empty by the time {@link #init()} runs. */
    private String apiKey;

    /** Identifier of the CrowdFlower job whose judgments are imported. */
    private Long jobId;

    /**
     * Type of the annotations created from judgments; also the prefix of the
     * <code>_unit_id</code> feature looked up on each snippet.
     */
    private String resultAnnotationType;

    /** Name of the annotation set that receives the result annotations. */
    private String resultASName;

    /** Annotation type representing the snippets (one snippet = one unit). */
    private String snippetAnnotationType;

    /** Annotation set where the snippets can be found. */
    private String snippetASName;

    /** Annotation type of the atomic units that workers selected. */
    private String tokenAnnotationType;

    /** Annotation set where tokens can be found. */
    private String tokenASName;

    /**
     * Whether contiguous runs of marked tokens become a single annotation.
     * A null value is treated as true, the documented default.
     */
    private Boolean annotateSpans;

    /** Client used to query CrowdFlower; created in {@link #init()}. */
    protected CrowdFlowerClient crowdFlowerClient;

    public String getApiKey() {
        return apiKey;
    }

    @CreoleParameter(comment = "CrowdFlower API key")
    public void setApiKey(String apiKey) {
        this.apiKey = apiKey;
    }

    public Long getJobId() {
        return jobId;
    }

    @RunTime
    @CreoleParameter
    public void setJobId(Long jobId) {
        this.jobId = jobId;
    }

    public String getResultAnnotationType() {
        return resultAnnotationType;
    }

    @RunTime
    @CreoleParameter
    public void setResultAnnotationType(String resultAnnotationType) {
        this.resultAnnotationType = resultAnnotationType;
    }

    public String getResultASName() {
        return resultASName;
    }

    @Optional
    @RunTime
    @CreoleParameter(defaultValue = "crowdResults")
    public void setResultASName(String resultASName) {
        this.resultASName = resultASName;
    }

    public String getSnippetAnnotationType() {
        return snippetAnnotationType;
    }

    @RunTime
    @CreoleParameter(defaultValue = "Sentence", comment = "Annotation type "
            + "representing the snippets (one snippet = one unit)")
    public void setSnippetAnnotationType(String snippetAnnotationType) {
        this.snippetAnnotationType = snippetAnnotationType;
    }

    public String getSnippetASName() {
        return snippetASName;
    }

    @Optional
    @RunTime
    @CreoleParameter(comment = "Annotation set where the snippets can be found")
    public void setSnippetASName(String snippetASName) {
        this.snippetASName = snippetASName;
    }

    public String getTokenAnnotationType() {
        return tokenAnnotationType;
    }

    @RunTime
    @CreoleParameter(defaultValue = "Token", comment = "Annotation type representing the \"tokens\" - the atomic "
            + "units that workers have selected to mark entity annotations.")
    public void setTokenAnnotationType(String tokenAnnotationType) {
        this.tokenAnnotationType = tokenAnnotationType;
    }

    public String getTokenASName() {
        return tokenASName;
    }

    @Optional
    @RunTime
    @CreoleParameter(comment = "Annotation set where tokens can be found")
    public void setTokenASName(String tokenASName) {
        this.tokenASName = tokenASName;
    }

    public Boolean getAnnotateSpans() {
        return annotateSpans;
    }

    @RunTime
    @CreoleParameter(comment = "If true (the default), create one annotation for "
            + "each contiguous run of marked tokens, if false, annotate each token "
            + "separately.", defaultValue = "true")
    public void setAnnotateSpans(Boolean annotateSpans) {
        this.annotateSpans = annotateSpans;
    }

    /**
     * Validates the API key and creates the CrowdFlower client.
     *
     * @return this resource
     * @throws ResourceInstantiationException if the API key is null or empty
     */
    @Override
    public Resource init() throws ResourceInstantiationException {
        if (apiKey == null || "".equals(apiKey)) {
            throw new ResourceInstantiationException("API Key must be set");
        }
        crowdFlowerClient = new CrowdFlowerClient(apiKey);
        return this;
    }

    /**
     * Imports the judgments for every snippet of the current document that
     * carries a unit ID feature.
     *
     * @throws ExecutionInterruptedException if the resource is interrupted
     *         before or during processing
     * @throws ExecutionException if no positive job ID is set, a unit ID
     *         feature is not numeric, a judgment refers to a token index that
     *         does not exist in the snippet, or an annotation cannot be added
     */
    @Override
    public void execute() throws ExecutionException {
        try {
            // The initial check lives inside the try so that the finally
            // block always clears the flag; otherwise an interrupt requested
            // while this resource was idle would make every later call fail.
            if (isInterrupted()) {
                throw new ExecutionInterruptedException();
            }
            if (jobId == null || jobId.longValue() <= 0) {
                throw new ExecutionException("Job ID must be provided");
            }

            // an unset parameter falls back to the documented default (true)
            boolean mergeSpans = annotateSpans == null || annotateSpans.booleanValue();
            String unitIdFeature = resultAnnotationType + "_unit_id";

            AnnotationSet tokens = getDocument().getAnnotations(tokenASName).get(tokenAnnotationType);
            AnnotationSet snippetAnnotations = getDocument().getAnnotations(snippetASName)
                    .get(snippetAnnotationType);
            AnnotationSet resultAS = getDocument().getAnnotations(resultASName);

            for (Annotation snippet : Utils.inDocumentOrder(snippetAnnotations)) {
                if (isInterrupted()) {
                    throw new ExecutionInterruptedException();
                }
                Object unitIdValue = snippet.getFeatures().get(unitIdFeature);
                if (unitIdValue == null) {
                    // report the feature that was actually looked up
                    log.warn("Found " + snippetAnnotationType + " annotation with no " + unitIdFeature
                            + " feature, ignoring");
                    continue;
                }
                long unitId = toUnitId(unitIdValue, unitIdFeature);

                // result annotations already present under this snippet, used
                // to avoid importing the same judgment twice
                AnnotationSet existingResults = Utils.getContainedAnnotations(resultAS, snippet,
                        resultAnnotationType);
                // tokens under this snippet; judgment answers index into this list
                List<Annotation> snippetTokens = Utils
                        .inDocumentOrder(Utils.getContainedAnnotations(tokens, snippet));

                JsonArray judgments = crowdFlowerClient.getJudgments(jobId, unitId);
                if (judgments == null) {
                    log.warn("Unit " + unitId + " has no judgments");
                    continue;
                }
                for (JsonElement judgmentElt : judgments) {
                    importJudgment(judgmentElt.getAsJsonObject(), unitId, snippetTokens, existingResults,
                            resultAS, mergeSpans);
                }
            }
        } finally {
            interrupted = false;
        }
    }

    /**
     * Converts the value of a unit ID feature to a long.
     *
     * @param value non-null feature value
     * @param featureName name of the feature, used in the error message
     * @return the unit ID
     * @throws ExecutionException if the value is not a Long and its string
     *         form cannot be parsed as one
     */
    private static long toUnitId(Object value, String featureName) throws ExecutionException {
        if (value instanceof Long) {
            return ((Long) value).longValue();
        }
        try {
            return Long.parseLong(value.toString());
        } catch (NumberFormatException e) {
            throw new ExecutionException(
                    "Feature " + featureName + " has non-numeric value \"" + value + "\"", e);
        }
    }

    /**
     * Creates the result annotations described by a single judgment.
     *
     * @param judgment the judgment as returned by the CrowdFlower client
     * @param unitId unit the judgment belongs to (used in messages only)
     * @param snippetTokens tokens of the snippet in document order
     * @param existingResults result annotations already present in the snippet
     * @param resultAS annotation set that receives new annotations
     * @param mergeSpans true to create one annotation per run of consecutive
     *        token indices, false for one annotation per index
     * @throws ExecutionException if an index is out of range or an annotation
     *         cannot be added
     */
    private void importJudgment(JsonObject judgment, long unitId, List<Annotation> snippetTokens,
            AnnotationSet existingResults, AnnotationSet resultAS, boolean mergeSpans)
            throws ExecutionException {
        JsonObject data = judgment.getAsJsonObject("data");
        if (data == null) {
            // nothing to read answers or comments from
            log.warn("Judgment for unit " + unitId + " has no data, ignoring");
            return;
        }
        JsonArray answer = data.getAsJsonArray("answer");
        Long judgmentId = Long.valueOf(judgment.get("id").getAsLong());
        Double trust = Double.valueOf(judgment.get("trust").getAsDouble());
        Long workerId = Long.valueOf(judgment.get("worker_id").getAsLong());
        String comment = extractComment(data);

        if (answer == null || answer.size() == 0) {
            // the judgment marks no entities
            return;
        }

        int lastIndex = answer.size() - 1;
        int spanStart = 0;
        for (int cur = 0; cur <= lastIndex; cur++) {
            // a span ends here if (a) every token is annotated on its own,
            // (b) this is the last element of the answer, or (c) the next
            // element is not this + 1
            boolean endOfSpan = !mergeSpans || cur == lastIndex
                    || answer.get(cur).getAsInt() + 1 != answer.get(cur + 1).getAsInt();
            if (!endOfSpan) {
                continue;
            }
            Annotation firstToken = tokenAt(snippetTokens, answer.get(spanStart).getAsInt(), unitId,
                    judgmentId);
            Annotation lastToken = tokenAt(snippetTokens, answer.get(cur).getAsInt(), unitId, judgmentId);
            spanStart = cur + 1;

            Long startOffset = firstToken.getStartNode().getOffset();
            Long endOffset = lastToken.getEndNode().getOffset();
            if (hasResultForJudgment(existingResults.getContained(startOffset, endOffset), judgmentId)) {
                // already imported on an earlier run
                continue;
            }
            try {
                FeatureMap features = Utils.featureMap(JUDGMENT_ID_FEATURE_NAME, judgmentId, "trust", trust,
                        "worker_id", workerId);
                if (comment != null) {
                    features.put("comment", comment);
                }
                resultAS.add(startOffset, endOffset, resultAnnotationType, features);
            } catch (InvalidOffsetException e) {
                throw new ExecutionException("Invalid offset obtained from existing annotation!", e);
            }
        }
    }

    /**
     * Reads the optional worker comment from a judgment's data object.
     *
     * @param data the <code>data</code> member of a judgment
     * @return the trimmed comment, or null if it is absent, not a JSON
     *         primitive, or empty after trimming
     */
    private static String extractComment(JsonObject data) {
        JsonElement commentElt = data.get("comment");
        if (commentElt == null || !commentElt.isJsonPrimitive()) {
            return null;
        }
        String comment = commentElt.getAsString().trim();
        if ("".equals(comment)) {
            return null;
        }
        return comment;
    }

    /**
     * Looks up a snippet token by the index given in a judgment answer.
     *
     * @param snippetTokens tokens of the snippet in document order
     * @param index index taken from the answer array
     * @param unitId unit being processed (used in the error message)
     * @param judgmentId judgment being processed (used in the error message)
     * @return the token at the given index
     * @throws ExecutionException if the index does not address a token
     */
    private static Annotation tokenAt(List<Annotation> snippetTokens, int index, long unitId,
            Long judgmentId) throws ExecutionException {
        if (index < 0 || index >= snippetTokens.size()) {
            throw new ExecutionException("Judgment " + judgmentId + " for unit " + unitId
                    + " refers to token index " + index + " but the snippet contains "
                    + snippetTokens.size() + " tokens");
        }
        return snippetTokens.get(index);
    }

    /**
     * Tells whether any of the given annotations was created from the given
     * judgment.
     *
     * @param candidates annotations to inspect
     * @param judgmentId judgment ID to look for
     * @return true if a candidate carries the judgment ID as its
     *         {@code JUDGMENT_ID_FEATURE_NAME} feature
     */
    private static boolean hasResultForJudgment(AnnotationSet candidates, Long judgmentId) {
        for (Annotation candidate : candidates) {
            if (judgmentId.equals(candidate.getFeatures().get(JUDGMENT_ID_FEATURE_NAME))) {
                return true;
            }
        }
        return false;
    }

}