/*
 * Copyright (c) 2014-2018 The University Of Sheffield.
 *
 * This file is part of gateplugin-Tagger_TagMe
 * (see the project's source repository).
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this software. If not, see <http://www.gnu.org/licenses/>.
 */

package gate.tagger.tagme;

import com.fasterxml.jackson.databind.ObjectMapper;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.http.client.fluent.Content;
import org.apache.http.client.fluent.Request;
import org.apache.http.client.fluent.Response;
import org.apache.http.client.utils.URIBuilder;
import org.apache.log4j.Logger;

/** PR for using the WAT service API for entity linking. */
@CreoleResource(name = "Tagger_WAT", comment = "Annotate documents using a WAT web service",
        // icon="taggerIcon.gif",
        helpURL = "")
public class TaggerWatWS extends AbstractLanguageAnalyser {

    private static final long serialVersionUID = 5322455999996492868L;

    protected String inputASName = "";

    @CreoleParameter(comment = "Input annotation set for containing annotations, default is the default set", defaultValue = "")
    public void setInputAnnotationSet(String ias) {
        inputASName = ias;

    public String getInputAnnotationSet() {
        return inputASName;

    protected String inputType = "";

    @CreoleParameter(comment = "Only text covered by each containing annotation is annotated, default: annotate whole document", defaultValue = "")
    public void setContainingAnnotationType(String val) {
        this.containingType = val;

    public String getContainingAnnotationType() {
        return containingType;

    protected String containingType = "";

    protected String outputASName = "";

    @CreoleParameter(comment = "Output annotation set, default is default annotation set", defaultValue = "")
    public void setOutputAnnotationSet(String ias) {
        outputASName = ias;

    public String getOutputAnnotationSet() {
        return outputASName;

    protected String outputType = "";

    @CreoleParameter(comment = "The output annotation type, default is 'Lookup'", defaultValue = "Lookup")
    public void setOutputAnnotationType(String val) {
        this.outputType = val;

    public String getOutputAnnotationType() {
        return outputType;

    protected URL tagMeServiceUrl = null;

    @CreoleParameter(comment = "The URL of the web service to use", defaultValue = "")
    public void setTagMeServiceUrl(URL url) {
        tagMeServiceUrl = url;

    public URL getTagMeServiceUrl() {
        return tagMeServiceUrl;

    @CreoleParameter(comment = "The service auth token to use, required, no default", defaultValue = "")
    public void setApiKey(String key) {
        apiKey = key;

    public String getApiKey() {
        return apiKey;

    protected String apiKey = "";

    @CreoleParameter(comment = "Language code, currently supported: en,it,de", defaultValue = "en")
    public void setLanguageCode(String code) {
        languageCode = code;

    public String getLanguageCode() {
        return languageCode;

    protected String languageCode = "en";

    @CreoleParameter(comment = "Minimum value of rho: all annotations with a rho less than this will be ignored", defaultValue = "0.2")
    public void setMinRho(Double value) {
        minrho = value;

    public Double getMinRho() {
        return minrho;

    protected double minrho = 0.2;

    static final Logger logger = Logger.getLogger(TaggerWatWS.class);

    private static final Pattern patternUrl = Pattern.compile("(?iu:www\\.[\\s]+)|(?iu:https?://[^\\s]+)");
    private static final Pattern patternUser = Pattern.compile("@[^\\s]+");
    private static final String patternHashTag = "#([^\\s]+)";
    private static final String patternStringRT3 = "^(?iu:RT:) ";
    private static final String patternStringRT2 = "^(?iu:RT) ";

    // helper method to produce a String of n spaces
    private String nSpaces(int n) {
        char[] chars = new char[n];
        java.util.Arrays.fill(chars, ' ');
        return new String(chars);

    public void execute() throws ExecutionException {

    protected void doExecute(Document theDocument) throws ExecutionException {
        interrupted = false;
        if (theDocument == null) {
            throw new ExecutionException("No document to process!");
        AnnotationSet outputAS = theDocument.getAnnotations(getOutputAnnotationSet());
        if (containingType == null || containingType.isEmpty()) {
            annotateText(document, outputAS, 0, document.getContent().size());
        } else {
            AnnotationSet inputAS;
            if (inputASName == null || inputASName.isEmpty()) {
                inputAS = theDocument.getAnnotations();
            } else {
                inputAS = theDocument.getAnnotations(inputASName);
            AnnotationSet containingAnns = inputAS.get(containingType);
            for (Annotation containingAnn : containingAnns) {
                annotateText(document, outputAS, gate.Utils.start(containingAnn), gate.Utils.end(containingAnn));

    // carry out the actual annotations on the given span of text in the 
    // document.
    protected void annotateText(Document doc, AnnotationSet outputAS, long from, long to) {
        String text = "";
        try {
            text = doc.getContent().getContent(from, to).toString();
        } catch (InvalidOffsetException ex) {
            throw new GateRuntimeException("Unexpected offset exception, offsets are " + from + "/" + to);
        // send the text to the service and get back the response
        // System.out.println("DEBUG: Annotating text from="+from+", to="+to+", text="+text);
        //System.out.println("Starting offset is "+from);

        WatAnnotation[] tagmeAnnotations = getTagMeAnnotations(text);
        for (WatAnnotation tagmeAnn : tagmeAnnotations) {
            if (tagmeAnn.rho < minrho) {
            FeatureMap fm = Factory.newFeatureMap();
            fm.put("title", tagmeAnn.title);
            fm.put("rho", tagmeAnn.rho);
            if (tagmeAnn.title == null) {
                throw new GateRuntimeException("Odd: got a null title from the TagMe service" + tagmeAnn);
            } else {
                fm.put("inst", "" + recodeForDbp38(tagmeAnn.title));
            try {
                gate.Utils.addAnn(outputAS, from + tagmeAnn.start, from + tagmeAnn.end, getOutputAnnotationType(),
            } catch (Exception ex) {
                        .println("Got an exception in document " + doc.getName() + ": " + ex.getLocalizedMessage());
                System.err.println("from=" + from + ", to=" + to + " TagMeAnn=" + tagmeAnn);


    protected WatAnnotation[] getTagMeAnnotations(String text) {
        String str = retrieveServerResponse(text);
        return convertStringToTagMeAnnotations02(str);

    protected String retrieveServerResponse(String text) {
        URI uri;
        try {
            uri = new URIBuilder(getTagMeServiceUrl().toURI()).setParameter("text", text)
                    .setParameter("gcube-token", getApiKey()).setParameter("lang", getLanguageCode()).build();
        } catch (URISyntaxException ex) {
            throw new GateRuntimeException("Could not create URI for the request", ex);

        //System.err.println("DEBUG: WAT URL="+uri);
        Request req = Request.Get(uri);

        Response res = null;
        try {
            res = req.execute();
        } catch (Exception ex) {
            throw new GateRuntimeException("Problem executing HTTP request: " + req, ex);
        Content cont = null;
        try {
            cont = res.returnContent();
        } catch (Exception ex) {
            throw new GateRuntimeException("Problem getting HTTP response content: " + res, ex);
        String ret = cont.asString();
        logger.debug("WAT server response " + ret);
        return ret;

    // second version of the conversion code: this uses classes to represent
    // the format of the JSON we expect and should be less clumsy, but may 
    // be slower
    protected WatAnnotation[] convertStringToTagMeAnnotations02(String str) {
        // parse the String as JSON
        ObjectMapper mapper = new ObjectMapper();
        WatJsonData data = null;
        try {
            data = mapper.readValue(str, WatJsonData.class);
        } catch (Exception ex) {
            throw new GateRuntimeException("Problem parsing the returned JSON as TagMeJsonData " + str, ex);
        return data.annotations;

    protected static class WatAnnotation {
        public int id = 0;
        public String title = "";
        public int start = 0;
        public int end = 0;
        public double rho = 0.0;
        public String spot = "";

        public String toString() {
            return "WatAnnotation(id=" + id + ",rho=" + rho + ",title=" + title + ",offset=" + start + ", end="
                    + end + ")";

    protected static class WatJsonData {
        public Object metrics = ""; // we do not care about this one
        public WatAnnotation[] annotations = null;

    // UTILITY methods

    public static String recodeForDbp38(String uriString) {
        String ret;
        URI uri = null;
        if (uriString.startsWith("http://") || uriString.startsWith("https://")) {
            // First try to parse the string as an URI so that any superfluous 
            // percent-encodings can get decoded later
            try {
                uri = new URI(uriString);
            } catch (Exception ex) {
                throw new GateRuntimeException("Could not parse URI " + uriString, ex);
            // now use this constructor to-recode only the necessary parts
            try {
                String path = uri.getPath();
                path = path.trim();
                path = path.replaceAll(" +", "_");
                uri = new URI(uri.getScheme(), null, uri.getHost(), -1, path, uri.getQuery(), uri.getFragment());
            } catch (Exception ex) {
                throw new GateRuntimeException("Could not re-construct URI: " + uri);
            ret = uri.toString();
        } else {
            if (uriString.contains("\\u")) {
                uriString = StringEscapeUtils.unescapeJava(uriString);
            uriString = uriString.trim();
            uriString = uriString.replaceAll(" +", "_");
            // We need to %-encode colons, otherwise the getPath() method will return
            // null ...
            uriString = uriString.replaceAll(":", "%3A");
            try {
                uri = new URI(uriString);
                // decode and prepare for minimal percent encoding
                uriString = uri.getPath();
            } catch (URISyntaxException ex) {
                // do nothing: the uriString must already be ready for percent-encoding
            uriString = uriString.replaceAll(" +", "_");
            try {
                uri = new URI(null, null, null, -1, "/" + uriString, null, null);
            } catch (Exception ex) {
                throw new GateRuntimeException("Could not re-construct URI part: " + uriString);
            ret = uri.toString().substring(1);
        return ret;

} // class TaggerTagMeWS