Java tutorial
/* * Copyright (c) 2014-2018 The University Of Sheffield. * * This file is part of gateplugin-Tagger_TagMe * (see https://github.com/GateNLP/gateplugin-Tagger_TagMe). * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 2.1 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this software. If not, see <http://www.gnu.org/licenses/>. */ package gate.tagger.tagme; import com.fasterxml.jackson.databind.ObjectMapper; import gate.Annotation; import gate.AnnotationSet; import gate.Document; import gate.Factory; import gate.FeatureMap; import gate.creole.AbstractLanguageAnalyser; import gate.creole.ExecutionException; import gate.creole.metadata.CreoleParameter; import gate.creole.metadata.CreoleResource; import gate.creole.metadata.Optional; import gate.creole.metadata.RunTime; import gate.util.GateRuntimeException; import gate.util.InvalidOffsetException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringEscapeUtils; import org.apache.http.Consts; import org.apache.http.client.fluent.Content; import org.apache.http.client.fluent.Form; import org.apache.http.client.fluent.Request; import org.apache.http.client.fluent.Response; import org.apache.log4j.Logger; /** * PR for using the TagMe service api for entity linking. 
* <p>
 * The text of the whole document, or of every annotation of the configured containing type,
 * is posted to the TagMe service URL. Every annotation returned by the service whose
 * {@code rho} is at least the configured minimum is added to the output annotation set,
 * with the features {@code tagMeId}, {@code title}, {@code rho}, {@code spot},
 * {@code link_probability} and {@code inst} (a DBpedia resource URI derived from the title).
 */
@CreoleResource(
    name = "Tagger_TagMe",
    comment = "Annotate documents using a TagMe web service",
    // icon="taggerIcon.gif",
    helpURL = "https://github.com/GateNLP/gateplugin-Tagger_TagMe/wiki/Tagger_TagMe")
public class TaggerTagMeWS extends AbstractLanguageAnalyser {

  private static final long serialVersionUID = 5322455999996492868L;

  // ---------------------------------------------------------------------
  // CREOLE parameters
  // ---------------------------------------------------------------------

  protected String inputASName = "";

  @RunTime
  @Optional
  @CreoleParameter(
      comment = "Input annotation set for containing annotations, default is the default set",
      defaultValue = "")
  public void setInputAnnotationSet(String ias) {
    inputASName = ias;
  }

  public String getInputAnnotationSet() {
    return inputASName;
  }

  // Not read anywhere in this class; kept because it is protected and may be
  // referenced by subclasses.
  protected String inputType = "";

  @RunTime
  @Optional
  @CreoleParameter(
      comment = "Only text covered by each containing annotation is annotated, default: annotate whole document",
      defaultValue = "")
  public void setContainingAnnotationType(String val) {
    this.containingType = val;
  }

  public String getContainingAnnotationType() {
    return containingType;
  }

  protected String containingType = "";

  protected String outputASName = "";

  @RunTime
  @Optional
  @CreoleParameter(
      comment = "Output annotation set, default is default annotation set",
      defaultValue = "")
  public void setOutputAnnotationSet(String ias) {
    outputASName = ias;
  }

  public String getOutputAnnotationSet() {
    return outputASName;
  }

  protected String outputType = "";

  @RunTime
  @Optional
  @CreoleParameter(
      comment = "The output annotation type, default is 'Lookup'",
      defaultValue = "Lookup")
  public void setOutputAnnotationType(String val) {
    this.outputType = val;
  }

  public String getOutputAnnotationType() {
    return outputType;
  }

  protected URL tagMeServiceUrl = null;

  @RunTime
  @CreoleParameter(
      comment = "The URL of the web service to use",
      defaultValue = "https://tagme.d4science.org/tagme/tag")
  public void setTagMeServiceUrl(URL url) {
    tagMeServiceUrl = url;
  }

  public URL getTagMeServiceUrl() {
    return tagMeServiceUrl;
  }

  @RunTime
  @CreoleParameter(
      comment = "The service auth token to use, required, no default",
      defaultValue = "")
  public void setApiKey(String key) {
    apiKey = key;
  }

  public String getApiKey() {
    return apiKey;
  }

  protected String apiKey = "";

  @RunTime
  @CreoleParameter(
      comment = "Should be true if the text is a tweet or very short",
      defaultValue = "false")
  public void setIsTweet(Boolean flag) {
    isTweet = flag;
  }

  public Boolean getIsTweet() {
    return isTweet;
  }

  protected Boolean isTweet = false;

  @RunTime
  @CreoleParameter(
      comment = "Language code, currently supported: en,it,de",
      defaultValue = "en")
  public void setLanguageCode(String code) {
    languageCode = code;
  }

  public String getLanguageCode() {
    return languageCode;
  }

  protected String languageCode = "en";

  @RunTime
  @CreoleParameter(
      comment = "Epsilon: balance between context and commonness, useful range is 0.0 to 0.5",
      defaultValue = "0.3")
  public void setEpsilon(Double value) {
    epsilon = value;
  }

  public Double getEpsilon() {
    return epsilon;
  }

  protected Double epsilon = 0.3;

  @RunTime
  @CreoleParameter(
      comment = "long_text parameter sent to the service, value 0 (default) or a positive integer",
      defaultValue = "0")
  public void setLongText(Integer value) {
    long_text = value;
  }

  public Integer getLongText() {
    return long_text;
  }

  protected Integer long_text = 0;

  @RunTime
  @CreoleParameter(
      comment = "Minimum value of rho: all annotations with a rho less than this will be ignored",
      defaultValue = "0.2")
  public void setMinRho(Double value) {
    minrho = value;
  }

  public Double getMinRho() {
    return minrho;
  }

  protected double minrho = 0.2;

  // ---------------------------------------------------------------------
  // Shared constants
  // ---------------------------------------------------------------------

  static final Logger logger = Logger.getLogger(TaggerTagMeWS.class);

  // Shared JSON mapper; it is only ever used through readValue here.
  private static final ObjectMapper jsonMapper = new ObjectMapper();

  // Patterns blanked out of tweets before they are sent to the service.
  private static final Pattern patternUrl =
      Pattern.compile("(?iu:www\\.[^\\s]+)|(?iu:https?://[^\\s]+)");
  private static final Pattern patternUser = Pattern.compile("@[^\\s]+");
  private static final Pattern patternHashTag = Pattern.compile("#([^\\s]+)");
  private static final Pattern patternRT3 = Pattern.compile("^(?iu:RT:) ");
  private static final Pattern patternRT2 = Pattern.compile("^(?iu:RT) ");

  // One or more consecutive spaces, replaced by an underscore in resource names.
  private static final Pattern patternSpaces = Pattern.compile(" +");

  /** Returns a String consisting of {@code n} spaces. */
  private static String nSpaces(int n) {
    char[] chars = new char[n];
    java.util.Arrays.fill(chars, ' ');
    return new String(chars);
  }

  /**
   * Replaces every match of {@code pattern} in {@code text} by the same number of spaces,
   * so that the length of the text, and therefore every offset into it, is unchanged.
   */
  private static String blankMatches(Pattern pattern, String text) {
    Matcher m = pattern.matcher(text);
    if (!m.find()) {
      return text;
    }
    StringBuilder sb = new StringBuilder(text);
    do {
      sb.replace(m.start(), m.end(), nSpaces(m.end() - m.start()));
    } while (m.find());
    return sb.toString();
  }

  /**
   * Removes the tweet-specific patterns (leading "RT: " / "RT ", the '#' of hash tags,
   * URLs and user mentions) from the text by overwriting them with spaces.
   * The returned text always has the same length as the input so that the offsets
   * returned by the service can be mapped back onto the document directly.
   */
  private static String cleanTweetText(String text) {
    String cleaned = blankMatches(patternRT3, text);
    cleaned = blankMatches(patternRT2, cleaned);
    // keep the hash tag word itself, only the '#' becomes a space
    cleaned = patternHashTag.matcher(cleaned).replaceAll(" $1");
    cleaned = blankMatches(patternUrl, cleaned);
    cleaned = blankMatches(patternUser, cleaned);
    return cleaned;
  }

  // ---------------------------------------------------------------------
  // Processing
  // ---------------------------------------------------------------------

  @Override
  public void execute() throws ExecutionException {
    doExecute(document);
  }

  /**
   * Annotates the given document: either the whole content, or, if a containing
   * annotation type is set, the span of each annotation of that type from the
   * input annotation set.
   *
   * @param theDocument the document to process
   * @throws ExecutionException if {@code theDocument} is null
   */
  protected void doExecute(Document theDocument) throws ExecutionException {
    interrupted = false;
    if (theDocument == null) {
      throw new ExecutionException("No document to process!");
    }
    AnnotationSet outputAS = theDocument.getAnnotations(getOutputAnnotationSet());
    if (containingType == null || containingType.isEmpty()) {
      annotateText(theDocument, outputAS, 0, theDocument.getContent().size());
      return;
    }
    AnnotationSet inputAS;
    if (inputASName == null || inputASName.isEmpty()) {
      inputAS = theDocument.getAnnotations();
    } else {
      inputAS = theDocument.getAnnotations(inputASName);
    }
    AnnotationSet containingAnns = inputAS.get(containingType);
    for (Annotation containingAnn : containingAnns) {
      annotateText(theDocument, outputAS,
          gate.Utils.start(containingAnn), gate.Utils.end(containingAnn));
    }
  }

  /**
   * Carries out the actual annotation of the given span of text in the document.
   *
   * @param doc the document the text is taken from
   * @param outputAS the annotation set new annotations are added to
   * @param from start offset of the span; also added to the offsets returned by the service
   * @param to end offset of the span
   * @throws GateRuntimeException if the offsets are invalid or the service returns an
   *     annotation without a title
   */
  protected void annotateText(Document doc, AnnotationSet outputAS, long from, long to) {
    String text;
    try {
      text = doc.getContent().getContent(from, to).toString();
    } catch (InvalidOffsetException ex) {
      throw new GateRuntimeException(
          "Unexpected offset exception, offsets are " + from + "/" + to, ex);
    }
    // NOTE: there is a bug in the TagMe service which causes offset errors
    // if we use the tweet mode and there are certain patterns in the tweet.
    // The approach recommended by Francesco Piccinno is to replace those
    // patterns by spaces.
    if (getIsTweet()) {
      logger.debug("Text before cleaning: >>" + text + "<<");
      text = cleanTweetText(text);
      logger.debug("Text after cleaning: >>" + text + "<<");
    }
    for (TagMeAnnotation tagmeAnn : getTagMeAnnotations(text)) {
      if (tagmeAnn.rho < minrho) {
        continue;
      }
      if (tagmeAnn.title == null) {
        throw new GateRuntimeException(
            "Odd: got a null title from the TagMe service" + tagmeAnn);
      }
      FeatureMap fm = Factory.newFeatureMap();
      fm.put("tagMeId", tagmeAnn.id);
      fm.put("title", tagmeAnn.title);
      fm.put("rho", tagmeAnn.rho);
      fm.put("spot", tagmeAnn.spot);
      fm.put("link_probability", tagmeAnn.link_probability);
      fm.put("inst", "http://dbpedia.org/resource/" + recodeForDbp38(tagmeAnn.title));
      try {
        gate.Utils.addAnn(outputAS, from + tagmeAnn.start, from + tagmeAnn.end,
            getOutputAnnotationType(), fm);
      } catch (Exception ex) {
        // best effort: a single bad annotation must not abort the whole document
        logger.error("Got an exception in document " + doc.getName() + ": "
            + ex.getLocalizedMessage() + " from=" + from + ", to=" + to
            + " TagMeAnn=" + tagmeAnn, ex);
      }
    }
  }

  /**
   * Sends the text to the service and converts the response.
   *
   * @param text the text to annotate
   * @return the annotations found in the service response, never null
   */
  protected TagMeAnnotation[] getTagMeAnnotations(String text) {
    String str = retrieveServerResponse(text);
    return convertStringToTagMeAnnotations02(str);
  }

  /**
   * Posts the text together with the configured parameters as a form to the service URL.
   *
   * @param text the text to annotate
   * @return the body of the HTTP response as a String
   * @throws GateRuntimeException if the request fails or the response content cannot be read
   */
  protected String retrieveServerResponse(String text) {
    Request req = Request.Post(getTagMeServiceUrl().toString());
    req.addHeader("Content-Type", "application/x-www-form-urlencoded");
    req.bodyForm(
        Form.form()
            .add("text", text)
            .add("gcube-token", getApiKey())
            .add("lang", getLanguageCode())
            .add("tweet", getIsTweet().toString())
            .add("include_abstract", "false")
            .add("include_categories", "false")
            .add("include_all_spots", "false")
            .add("long_text", getLongText().toString())
            .add("epsilon", getEpsilon().toString())
            .build(),
        Consts.UTF_8);
    logger.debug("Request is " + req);
    Response res;
    try {
      res = req.execute();
    } catch (Exception ex) {
      throw new GateRuntimeException("Problem executing HTTP request: " + req, ex);
    }
    Content cont;
    try {
      cont = res.returnContent();
    } catch (Exception ex) {
      throw new GateRuntimeException("Problem getting HTTP response content: " + res, ex);
    }
    String ret = cont.asString();
    logger.debug("TagMe server response " + ret);
    return ret;
  }

  /**
   * Second version of the conversion code: this uses classes to represent the format
   * of the JSON we expect.
   *
   * @param str the JSON text returned by the service
   * @return the annotations contained in the JSON; an empty array if the JSON contains none
   * @throws GateRuntimeException if the text cannot be parsed as {@link TagMeJsonData}
   */
  protected TagMeAnnotation[] convertStringToTagMeAnnotations02(String str) {
    TagMeJsonData data;
    try {
      data = jsonMapper.readValue(str, TagMeJsonData.class);
    } catch (Exception ex) {
      throw new GateRuntimeException(
          "Problem parsing the returned JSON as TagMeJsonData " + str, ex);
    }
    // A response without an "annotations" member (or a JSON null) must not make
    // the caller's loop fail with a NullPointerException.
    if (data == null || data.annotations == null) {
      return new TagMeAnnotation[0];
    }
    return data.annotations;
  }

  /** One annotation as contained in the JSON returned by the service. */
  protected static class TagMeAnnotation {
    public int id = 0;
    public String title = "";
    public int start = 0;
    public int end = 0;
    public double rho = 0.0;
    public double link_probability = 0.0;
    public String spot = "";

    @Override
    public String toString() {
      return "TagMeAnnotation(id=" + id + ",rho=" + rho + ",title=" + title
          + ",offset=" + start + ", end=" + end + ")";
    }
  }

  /** The top-level JSON object returned by the service. */
  protected static class TagMeJsonData {
    public String timestamp = "";
    public int time = 0;
    public String api = "";
    public String lang = "";
    public String test = "";
    public TagMeAnnotation[] annotations = null;
  }

  // UTILITY methods

  /**
   * Re-codes a full http(s) URI or a bare resource name so that runs of spaces become
   * underscores and only the characters {@link URI} requires to be quoted are
   * percent-encoded.
   *
   * @param uriString a URI starting with "http://" or "https://", or a resource name / title
   * @return the re-coded URI or resource name
   * @throws GateRuntimeException if the value cannot be parsed or re-constructed as a URI
   */
  public static String recodeForDbp38(String uriString) {
    String ret;
    URI uri = null;
    if (uriString.startsWith("http://") || uriString.startsWith("https://")) {
      // First try to parse the string as an URI so that any superfluous
      // percent-encodings can get decoded later
      try {
        uri = new URI(uriString);
      } catch (Exception ex) {
        throw new GateRuntimeException("Could not parse URI " + uriString, ex);
      }
      // now use the multi-argument constructor to re-code only the necessary parts
      try {
        String path = uri.getPath().trim();
        path = patternSpaces.matcher(path).replaceAll("_");
        uri = new URI(uri.getScheme(), null, uri.getHost(), -1, path,
            uri.getQuery(), uri.getFragment());
      } catch (Exception ex) {
        throw new GateRuntimeException("Could not re-construct URI: " + uri, ex);
      }
      ret = uri.toString();
    } else {
      if (uriString.contains("\\u")) {
        uriString = StringEscapeUtils.unescapeJava(uriString);
      }
      uriString = uriString.trim();
      uriString = patternSpaces.matcher(uriString).replaceAll("_");
      // We need to %-encode colons, otherwise the getPath() method will return
      // null ...
      uriString = uriString.replace(":", "%3A");
      try {
        // decode and prepare for minimal percent encoding
        String decoded = new URI(uriString).getPath();
        if (decoded != null) {
          uriString = decoded;
        }
      } catch (URISyntaxException ignored) {
        // do nothing: the uriString must already be ready for percent-encoding
      }
      uriString = patternSpaces.matcher(uriString).replaceAll("_");
      try {
        uri = new URI(null, null, null, -1, "/" + uriString, null, null);
      } catch (Exception ex) {
        throw new GateRuntimeException("Could not re-construct URI part: " + uriString, ex);
      }
      // strip the leading slash that was only added to make the path absolute
      ret = uri.toString().substring(1);
    }
    return ret;
  }
} // class TaggerTagMeWS