gate.crowdsource.rest.CrowdFlowerClient.java Source code

Java tutorial

Introduction

Here is the source code for gate.crowdsource.rest.CrowdFlowerClient.java

Source

/*
 *  CrowdFlowerClient.java
 *
 *  Copyright (c) 1995-2014, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 3, June 2007 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *  
 *  $Id: CrowdFlowerClient.java 17796 2014-04-10 12:28:34Z ian_roberts $
 */
package gate.crowdsource.rest;

import static gate.crowdsource.CrowdFlowerConstants.*;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Utils;
import gate.util.GateRuntimeException;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;

public class CrowdFlowerClient {

    private static final Logger log = Logger.getLogger(CrowdFlowerClient.class);

    public static final String CF_ENDPOINT = "https://api.crowdflower.com/v1";

    private String apiKey;

    public CrowdFlowerClient(String apiKey) {
        this.apiKey = apiKey;
    }

    /**
     * Create a new entity classification job on CrowdFlower.
     * 
     * @param title the job title
     * @param instructions the instructions
     * @param caption a caption for the answer form, which can include an
     *          {{entity}} placeholder which will be replaced by the text
     *          of the entity being classified (e.g.
     *          "choose the most appropriate description for {{entity}}")
     * @param commonOptions common options that should be presented for
     *          all tasks, in addition to the specific options for that
     *          task extracted from the entity annotation. This is
     *          expressed as a List of [value, description] two-element
     *          Lists, one for each option.
     * @return the newly created job ID.
     * @throws IOException
     */
    public long createClassificationJob(String title, String instructions, String caption,
            List<List<String>> commonOptions) throws IOException {
        log.debug("Creating classification job");
        log.debug("title: " + title);
        log.debug("instructions: " + instructions);
        log.debug("caption: " + caption);
        log.debug("options: " + commonOptions);
        // load the custom CSS that makes entity highlighting work
        InputStream cssStream = CrowdFlowerClient.class.getResourceAsStream("gate-crowdflower.css");
        String css = null;
        try {
            css = IOUtils.toString(cssStream, "UTF-8");
        } finally {
            cssStream.close();
        }
        StringWriter cml = new StringWriter();

        // construct the CML with the specified caption and common radio
        // options
        cml.append("<h2 id=\"unit_text\">{{text}}</h2>\n\n" + "{% if detail %}\n"
                + "  <div class=\"well\">{{detail}}</div>\n" + "{% endif %}\n"
                + "<cml:radios validates=\"required\" label=\"");
        StringEscapeUtils.escapeXml(cml, caption);
        cml.append("\" name=\"answer\">\n" + "  {% for opt in options %}\n" + "    {% if opt.description %}\n"
                + "      {% assign desc = opt.description %}\n" + "    {% else %}\n"
                + "      {% assign desc = opt.value %}\n" + "    {% endif %}\n"
                + "    <cml:radio value=\"{{opt.value}}\" label=\"{{desc}}\" />\n" + "  {% endfor %}\n");
        if (commonOptions != null) {
            for (List<String> opt : commonOptions) {
                cml.append("  <cml:radio value=\"");
                StringEscapeUtils.escapeXml(cml, opt.get(0));
                cml.append("\" label=\"");
                StringEscapeUtils.escapeXml(cml, opt.get(1));
                cml.append("\" />\n");
            }
        }
        cml.append("</cml:radios>\n");

        log.debug("cml: " + cml.toString());

        log.debug("POSTing to CrowdFlower");
        JsonElement json = post("/jobs", "job[title]", title, "job[instructions]", instructions, "job[cml]",
                cml.toString(), "job[css]", css);
        log.debug("CrowdFlower returned " + json);
        try {
            return json.getAsJsonObject().get("id").getAsLong();
        } catch (Exception e) {
            throw new GateRuntimeException("Failed to create CF job");
        }
    }

    /**
     * <p>
     * Create a single unit in a classification job for the given
     * annotation. If the unit is created successfully, its ID will be
     * stored as a {@link Long} valued feature named <code>cf_unit</code>
     * on the target annotation.
     * </p>
     * 
     * <p>
     * The target annotation must have a feature named "options" whose
     * value is either a <code>Collection</code> of valid choices or a
     * <code>Map</code> where each key represents one option value and the
     * corresponding value is the human-readable description that will be
     * displayed to the annotator (e.g. the key could be an ontology
     * instance URI and the value its rdfs:label). The options will be
     * added to the unit in the order they are delivered by the iterator,
     * so if order is important you should use a collection or map with
     * predictable iteration order, e.g. {@link List} or
     * {@link LinkedHashMap}.
     * </p>
     * 
     * <p>
     * If the target annotation has a feature named "correct" then it will
     * be treated as a gold-standard unit. The "correct" feature must
     * match one of the "options" (i.e. one of the <i>keys</i> if options
     * is a Map) or one of the common options defined when the job was
     * created - typically things like "none" (none of the available
     * options is correct) or "nae" (the target is not an entity).
     * </p>
     * 
     * @param jobId the CrowdFlower job ID
     * @param doc the document containing the annotation
     * @param asName the annotation set containing the target annotation
     * @param context the "context" annotation, i.e. and annotation
     *          covering the complete snippet of text (e.g. the sentence)
     *          within which the target entity will be highlighted in the
     *          unit. This need not live in the same annotation set as the
     *          <code>target</code> but must cover the target's span.
     * @param target the annotation representing the entity to be
     *          classified.
     * @return the ID of the generated unit.
     */
    public long createClassificationUnit(long jobId, Document doc, String asName, Annotation context,
            Annotation target) {
        String text = Utils.stringFor(doc, context);
        String entity = Utils.stringFor(doc, target);
        String documentId = String.valueOf(doc.getLRPersistenceId());
        int formDataSize = 10; // text + entity + docId + asName + annId

        // insert highlight span into text
        int entityStart = (int) (Utils.start(target) - Utils.start(context));
        int entityEnd = entityStart + entity.length();
        text = text.substring(0, entityStart) + "<span class=\"gate-entity\">" + entity + "</span>"
                + text.substring(entityEnd);

        Object options = target.getFeatures().get("options");
        if (options != null) {
            if (options instanceof Map) {
                formDataSize += (4 * ((Map<?, ?>) options).size());
            } else if (options instanceof Collection) {
                formDataSize += (2 * ((Collection<?>) options).size());
            }
        }

        if (target.getFeatures().get("detail") != null) {
            formDataSize += 2;
        }

        String correctAnswer = (String) target.getFeatures().get("correct");
        String reason = (String) target.getFeatures().get("reason");
        if (correctAnswer != null) {
            formDataSize += 4; // "golden" + answer
            if (reason != null) {
                formDataSize += 2; // answer_gold_reason
            }
        }

        String[] formData = new String[formDataSize];
        int i = 0;
        formData[i++] = "unit[data][text]";
        formData[i++] = text;
        formData[i++] = "unit[data][entity]";
        formData[i++] = entity;
        formData[i++] = "unit[data][documentId]";
        formData[i++] = documentId;
        formData[i++] = "unit[data][asName]";
        formData[i++] = (asName == null ? "" : asName);
        formData[i++] = "unit[data][annId]";
        formData[i++] = String.valueOf(target.getId());
        if (options != null) {
            if (options instanceof Map) {
                for (Map.Entry<?, ?> opt : ((Map<?, ?>) options).entrySet()) {
                    formData[i++] = "unit[data][options][][value]";
                    formData[i++] = String.valueOf(opt.getKey());
                    formData[i++] = "unit[data][options][][description]";
                    formData[i++] = String.valueOf(opt.getValue());
                }
            } else if (options instanceof Collection) {
                for (Object opt : (Collection<?>) options) {
                    formData[i++] = "unit[data][options][][value]";
                    formData[i++] = String.valueOf(opt);
                }
            }
        }

        if (target.getFeatures().get("detail") != null) {
            formData[i++] = "unit[data][detail]";
            formData[i++] = target.getFeatures().get("detail").toString();
        }

        if (correctAnswer != null) {
            formData[i++] = "unit[golden]";
            formData[i++] = "true";
            formData[i++] = "unit[data][answer_gold]";
            formData[i++] = correctAnswer;
            if (reason != null) {
                formData[i++] = "unit[data][answer_gold_reason]";
                formData[i++] = reason;
            }
        }

        try {
            JsonElement json = post("/jobs/" + jobId + "/units", formData);
            long unitId = json.getAsJsonObject().get("id").getAsLong();
            // store the unit ID on the annotation
            target.getFeatures().put(UNIT_ID_FEATURE_NAME, Long.valueOf(unitId));
            return unitId;
        } catch (IOException e) {
            throw new GateRuntimeException("Could not add unit for annotation " + target, e);
        }
    }

    /**
     * Get the list of judgments for the given unit. If there are no
     * judgments, null is returned.
     * 
     * @param jobId the CrowdFlower job identifier
     * @param unitId the unit identifier
     * @return the "judgments" array for this unit, or null if none found.
     */
    public JsonArray getJudgments(long jobId, long unitId) {
        try {
            String uri = "/jobs/" + jobId + "/units/" + unitId;
            JsonElement unitResponse = get(uri);
            if (!unitResponse.isJsonObject()) {
                throw new GateRuntimeException("Response from " + uri + " was not a JSON object");
            }
            JsonElement results = unitResponse.getAsJsonObject().get("results");
            if (!results.isJsonObject()) {
                log.info("No results found for job " + jobId + " unit " + unitId);
                return null;
            }
            JsonElement judgments = results.getAsJsonObject().get("judgments");
            if (judgments.isJsonArray()) {
                return judgments.getAsJsonArray();
            } else {
                return null;
            }
        } catch (IOException e) {
            throw new GateRuntimeException("Could not retrieve unit details", e);
        }
    }

    /**
     * Create a named entity annotation job on CrowdFlower.
     * 
     * @param title the job title
     * @param instructions the instructions
     * @param caption a caption for the answer form, which should include
     *          the entity type to be annotated.
     * @param noEntitiesCaption a caption for the "there are no entities"
     *          checkbox.
     * @return the newly created job ID.
     * @throws IOException
     */
    public long createAnnotationJob(String title, String instructions, String caption, String noEntitiesCaption)
            throws IOException {
        log.debug("Creating annotation job");
        log.debug("title: " + title);
        log.debug("instructions: " + instructions);
        log.debug("caption: " + caption);

        // load the CSS that makes highlighting work
        InputStream cssStream = CrowdFlowerClient.class.getResourceAsStream("gate-crowdflower.css");
        String css = null;
        try {
            css = IOUtils.toString(cssStream, "UTF-8");
        } finally {
            cssStream.close();
        }

        // load the JavaScript that toggles the colour of tokens when
        // clicked
        InputStream jsStream = CrowdFlowerClient.class.getResourceAsStream("gate-crowdflower.js");
        String js = null;
        try {
            js = IOUtils.toString(jsStream, "UTF-8");
        } finally {
            jsStream.close();
        }

        // construct the CML
        StringWriter cml = new StringWriter();
        cml.append("<div class=\"gate-snippet\">\n" + "  <cml:checkboxes validates=\"required\" label=\"");
        StringEscapeUtils.escapeXml(cml, caption);
        cml.append("\" name=\"answer\">\n" + "    {% for tok in tokens %}\n"
                + "      <cml:checkbox label=\"{{ tok }}\" value=\"{{ forloop.index0 }}\" />\n"
                + "    {% endfor %}\n" + "  </cml:checkboxes>\n" + "</div>\n" + "{% if detail %}\n"
                + "  <div class=\"well\">{{detail}}</div>\n" + "{% endif %}\n"
                + "<div class=\"gate-no-entities\">\n"
                // TODO work out how to customize the validation error
                // message
                + "  <cml:checkbox name=\"noentities\" label=\"");
        StringEscapeUtils.escapeXml(cml, noEntitiesCaption);
        cml.append(
                "\" value=\"1\"\n" + "      only-if=\"!answer:required\" validates=\"required\"/>\n" + "</div>\n");
        log.debug("cml: " + cml.toString());

        log.debug("POSTing to CrowdFlower");
        JsonElement json = post("/jobs", "job[title]", title, "job[instructions]", instructions, "job[cml]",
                cml.toString(), "job[css]", css, "job[js]", js);
        log.debug("CrowdFlower returned " + json);
        try {
            return json.getAsJsonObject().get("id").getAsLong();
        } catch (Exception e) {
            throw new GateRuntimeException("Failed to create CF job");
        }
    }

    /**
     * Create a single unit for an entity annotation job.
     * 
     * @param jobId the CrowdFlower job ID
     * @param doc the document containing the annotation
     * @param asName the annotation set containing the snippet annotation
     * @param snippet an annotation covering the snippet of text that will
     *          be presented for annotation, typically a Sentence or Tweet
     * @param detail additional details to be shown to the annotator below
     *          the snippet, e.g. a list of URL links that they might want
     *          to follow for more information.  May be null, in which case
     *          no detail section will be added.
     * @param tokens annotations representing the individual substrings of
     *          the snippet that will be the atomic units of annotation.
     *          Typically these will be Token annotations. The supplied
     *          "tokens" should completely cover the non-whitespace
     *          content of the snippet, but need not cover all the
     *          intervening spaces.
     * @param correctAnnotations annotations representing the "correct"
     *          answer - if this parameter is not <code>null</code> then
     *          the unit will be considered as gold-standard data. This
     *          includes the case where an empty annotation set is
     *          provided, as this represents a gold snippet where the
     *          correct answer is that this snippet contains no entities.
     * @param goldReason for a gold-standard unit, the <em>reason</em> why
     *          the annotations should be considered correct. This will be
     *          shown to users as feedback if they get the gold unit
     *          wrong. Ignored for non-gold units.
     * @return the ID of the newly-created unit.
     */
    public long createAnnotationUnit(long jobId, Document doc, String asName, Annotation snippet, String detail,
            AnnotationSet tokens, AnnotationSet correctAnnotations, String goldReason) {
        String documentId = String.valueOf(doc.getLRPersistenceId());
        int formDataSize = 6; // docId + asName + annId
        List<Annotation> tokensList = Utils.inDocumentOrder(tokens);
        formDataSize += 2 * tokensList.size();
        if (detail != null) {
            formDataSize += 2;
        }
        Set<Integer> answerGold = null;
        if (correctAnnotations != null) {
            // gold unit
            answerGold = new HashSet<Integer>();
            for (Annotation a : correctAnnotations) {
                for (int i = 0; i < tokensList.size(); i++) {
                    Annotation tokenI = tokensList.get(i);
                    if (Utils.start(tokenI) >= Utils.start(a) && Utils.end(tokenI) <= Utils.end(a)) {
                        answerGold.add(i);
                    }
                }
            }
            formDataSize += 2; // "golden"
            if (answerGold.size() == 0) {
                formDataSize += 2; // noentities=1
            } else {
                formDataSize += 2 * answerGold.size(); // answer=N for each
                                                       // token
            }
            if (goldReason != null) {
                formDataSize += 2; // answer_gold_reason or
                                   // noentities_gold_reason
            }
        }

        String[] formData = new String[formDataSize];
        int i = 0;
        formData[i++] = "unit[data][documentId]";
        formData[i++] = documentId;
        formData[i++] = "unit[data][asName]";
        formData[i++] = (asName == null ? "" : asName);
        formData[i++] = "unit[data][annId]";
        formData[i++] = String.valueOf(snippet.getId());
        for (Annotation tok : tokensList) {
            formData[i++] = "unit[data][tokens][]";
            formData[i++] = Utils.stringFor(doc, tok);
        }
        if (detail != null) {
            formData[i++] = "unit[data][detail]";
            formData[i++] = detail;
        }
        if (answerGold != null) {
            formData[i++] = "unit[golden]";
            formData[i++] = "true";
            if (answerGold.size() == 0) {
                formData[i++] = "unit[data][noentities_gold]";
                formData[i++] = "1";
                if (goldReason != null) {
                    formData[i++] = "unit[data][noentities_gold_reason]";
                    formData[i++] = goldReason;
                }
            } else {
                Integer[] goldArray = answerGold.toArray(new Integer[answerGold.size()]);
                Arrays.sort(goldArray);
                for (Integer tokIndex : goldArray) {
                    formData[i++] = "unit[data][answer_gold][]";
                    formData[i++] = String.valueOf(tokIndex);
                }
                if (goldReason != null) {
                    formData[i++] = "unit[data][answer_gold_reason]";
                    formData[i++] = goldReason;
                }
            }
        }

        try {
            JsonElement json = post("/jobs/" + jobId + "/units", formData);
            long unitId = json.getAsJsonObject().get("id").getAsLong();
            return unitId;
        } catch (IOException e) {
            throw new GateRuntimeException("Could not add unit for annotation " + snippet, e);
        }
    }

    protected JsonElement post(String uri, String... formData) throws IOException {
        return request("POST", uri, formData);
    }

    protected JsonElement get(String uri) throws IOException {
        return request("GET", uri);
    }

    protected JsonElement request(String method, String uri, String... formData) throws IOException {
        if (log.isDebugEnabled()) {
            log.debug("URI: " + uri + ", formData: " + Arrays.toString(formData));
        }
        URL cfUrl = new URL(CF_ENDPOINT + uri + "?key=" + apiKey);
        HttpURLConnection connection = (HttpURLConnection) cfUrl.openConnection();
        connection.setRequestMethod(method);
        connection.setRequestProperty("Accept", "application/json");
        if (formData != null && formData.length > 0) {
            // send the form data, URL encoded
            connection.setDoOutput(true);
            connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");

            // annoyingly CrowdFlower doesn't support chunked streaming of
            // POSTs so we have to accumulate the content in a buffer, work
            // out its size and then POST with a Content-Length header
            ByteArrayOutputStream buffer = new ByteArrayOutputStream(4096);
            PrintWriter writer = new PrintWriter(new OutputStreamWriter(buffer, "UTF-8"));
            try {
                for (int i = 0; i < formData.length; i++) {
                    String fieldName = formData[i];
                    String fieldValue = formData[++i];
                    if (i > 0) {
                        writer.write("&");
                    }
                    writer.write(URLEncoder.encode(fieldName, "UTF-8"));
                    writer.write("=");
                    writer.write(URLEncoder.encode(fieldValue, "UTF-8"));
                }
            } finally {
                writer.close();
            }

            connection.setFixedLengthStreamingMode(buffer.size());
            OutputStream connectionStream = connection.getOutputStream();
            buffer.writeTo(connectionStream);
            connectionStream.close();
        }

        // parse the response as JSON
        JsonParser parser = new JsonParser();
        Reader responseReader = new InputStreamReader(connection.getInputStream(), "UTF-8");
        try {
            return parser.parse(responseReader);
        } finally {
            responseReader.close();
        }
    }
}