edu.cmu.lti.oaqa.bioqa.providers.kb.TmToolConceptProvider.java Source code

Java tutorial

Introduction

Here is the source code for edu.cmu.lti.oaqa.bioqa.providers.kb.TmToolConceptProvider.java

Source

/*
 * Open Advancement Question Answering (OAQA) Project Copyright 2016 Carnegie Mellon University
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations
 * under the License.
 */

package edu.cmu.lti.oaqa.bioqa.providers.kb;

import com.google.common.collect.*;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import edu.cmu.lti.oaqa.baseqa.providers.kb.ConceptProvider;
import edu.cmu.lti.oaqa.ecd.config.ConfigurableProvider;
import edu.cmu.lti.oaqa.type.kb.Concept;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.ServiceUnavailableRetryStrategy;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.protocol.HttpContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import static java.util.stream.Collectors.toList;

/**
 * <p>
 *   This {@link ConceptProvider} accesses
 *   <a href="https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/PubTator/">NCBI PubTator</a> backend
 *   via <a href="https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/RESTfulAPIs.html">TmTool
 *   Restful API</a>, including <tt>tmChem</tt>, <tt>DNorm</tt>, <tt>tmVar</tt>, and
 *   <tt>GNormPlus</tt> services.
 * </p>
 * <p>
 *   We make a number of tweaks to make sure it works with tens of thousands of biomedical passages,
 *   including
 *   <ul>
 *     <li>
 *       The PubTator (tab-separated) format does not support <tt>DNorm</tt> annotation, and the
 *       BioC format (XML) can take forever to process a request.
 *       We use PubAnnotation (JSON) format.
 *       More information about the format can be found at
 *       <a href="http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/Format.html">http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/Format.html</a>.
 *     </li>
 *     <li>
 *       Offset might change, esp. with <tt>tmChem</tt> trigger. A few known issues/fixes:
 *       <ul>
 *         <li>
 *           The PubAnnotation RESTful service does not support some ASCII characters (e.g. '').
 *           We create {@link PubAnnotationConvertUtil} to normalize the text before sending the
 *           request. Sometimes, the length may also change, e.g. in the case of "Waldisphl"
 *           (3 characters added).
 *         </li>
 *         <li>
 *           '%' is a dangerous symbol. We found that if a non whitespace character after it, the
 *           returned text will have a different length, e.g. '%-' or '%4', etc.
 *         </li>
 *         <li>Disabling HTML encoding while converting to JSON.</li>
 *       </ul>
 *     </li>
 *     <li>
 *       <tt>Receive/</tt> method can only be called once.
 *     </li>
 *   </ul>
 * </p>
 *
 * @author <a href="mailto:ziy@cs.cmu.edu">Zi Yang</a> created on 3/19/16.
 */
public class TmToolConceptProvider extends ConfigurableProvider implements ConceptProvider {

    private static final String URL_PREFIX = "https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/";

    protected Set<String> triggers = ImmutableSet.of("tmChem", "DNorm", "tmVar", "GNormPlus");

    private static HttpClientBuilder clientBuilder = HttpClientBuilder.create().disableRedirectHandling()
            .setServiceUnavailableRetryStrategy(new ServiceUnavailableRetryStrategy() {

                @Override
                public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
                    int statusCode = response.getStatusLine().getStatusCode();
                    return statusCode == 404 || statusCode == 501;
                }

                @Override
                public long getRetryInterval() {
                    return 1000L;
                }
            });

    private static Gson gson = new GsonBuilder().disableHtmlEscaping().create();

    @Override
    public boolean initialize(ResourceSpecifier aSpecifier, Map<String, Object> aAdditionalParams)
            throws ResourceInitializationException {
        boolean ret = super.initialize(aSpecifier, aAdditionalParams);
        Object triggersParam = getParameterValue("triggers");
        if (triggersParam != null) {
            setTriggers(ImmutableSet.copyOf((Iterable<String>) triggersParam));
        }
        return ret;
    }

    public void setTriggers(Set<String> triggers) {
        this.triggers = triggers;
    }

    @Override
    public List<Concept> getConcepts(JCas jcas) throws AnalysisEngineProcessException {
        return getConcepts(Collections.singletonList(jcas));
    }

    @Override
    public List<Concept> getConcepts(List<JCas> jcases) throws AnalysisEngineProcessException {
        // send request
        List<String> normalizedTexts = jcases.stream().map(JCas::getDocumentText)
                .map(PubAnnotationConvertUtil::normalizeText).collect(toList());
        ListMultimap<Integer, PubAnnotation.Denotation> index2denotations = Multimaps
                .synchronizedListMultimap(ArrayListMultimap.create());
        ExecutorService es = Executors.newCachedThreadPool();
        for (String trigger : triggers) {
            es.submit(() -> {
                try {
                    List<String> denotationStrings = requestConcepts(normalizedTexts, trigger);
                    assert denotationStrings.size() == jcases.size();
                    for (int i = 0; i < jcases.size(); i++) {
                        PubAnnotation.Denotation[] denotations = gson.fromJson(denotationStrings.get(i),
                                PubAnnotation.Denotation[].class);
                        index2denotations.putAll(i, Arrays.asList(denotations));
                    }
                } catch (Exception e) {
                    throw TmToolConceptProviderException.unknownException(trigger, e);
                }
            });
        }
        es.shutdown();
        try {
            boolean status = es.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
            if (!status) {
                throw new AnalysisEngineProcessException();
            }
        } catch (InterruptedException e) {
            throw new AnalysisEngineProcessException(e);
        }
        // convert denotation strings
        List<Concept> concepts = new ArrayList<>();
        for (int i = 0; i < jcases.size(); i++) {
            JCas jcas = jcases.get(i);
            List<PubAnnotation.Denotation> denotations = index2denotations.get(i);
            try {
                concepts.addAll(PubAnnotationConvertUtil.convertDenotationsToConcepts(jcas, denotations));
            } catch (StringIndexOutOfBoundsException e) {
                throw TmToolConceptProviderException.offsetOutOfBounds(jcas.getDocumentText(), denotations, e);
            }
        }
        return concepts;
    }

    protected List<String> requestConcepts(List<String> normalizedTexts, String trigger)
            throws AnalysisEngineProcessException {
        PubAnnotation[] inputs = PubAnnotationConvertUtil.convertTextsToPubAnnotations(normalizedTexts);
        String request = gson.toJson(inputs, PubAnnotation[].class);
        String response;
        try {
            response = submitText(trigger, request);
        } catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
        PubAnnotation[] outputs = gson.fromJson("[" + response + "]", PubAnnotation[].class);
        List<PubAnnotation> sortedOutputs = Arrays.stream(outputs)
                .sorted(Comparator.comparing(pa -> Integer.parseInt(pa.getSourceid()))).collect(toList());
        List<String> denotationStrings = sortedOutputs.stream().map(PubAnnotation::getDenotations).map(gson::toJson)
                .collect(toList());
        if (denotationStrings.size() != normalizedTexts.size()) {
            throw TmToolConceptProviderException.unequalVolume(trigger, normalizedTexts.size(),
                    denotationStrings.size());
        }
        for (int i = 0; i < normalizedTexts.size(); i++) {
            String sentText = normalizedTexts.get(i);
            String recvText = PubAnnotationConvertUtil.normalizeText(sortedOutputs.get(i).getText());
            if (sentText.length() != recvText.length()) {
                throw TmToolConceptProviderException.unequalTextLength(trigger, sentText, recvText);
            }
            //      if (sentText.equals(recvText)) {
            //        throw TmToolConceptProviderException.textChanged(trigger, sentText, recvText);
            //      }
        }
        return denotationStrings;
    }

    private static String submitText(String trigger, String text) throws IOException {
        CloseableHttpClient client = clientBuilder.build();
        HttpPost post = new HttpPost(URL_PREFIX + trigger + "/Submit/");
        post.setEntity(new StringEntity(text));
        HttpResponse response = client.execute(post);
        String session = IOUtils.toString(response.getEntity().getContent());
        HttpGet get = new HttpGet(URL_PREFIX + session + "/Receive/");
        response = client.execute(get);
        return IOUtils.toString(response.getEntity().getContent());
    }

}