Java tutorial
/* * Copyright (c) 2017 Public Library of Science * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ package org.ambraproject.rhino.service.taxonomy.impl; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Charsets; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Maps; import org.ambraproject.rhino.config.RuntimeConfiguration; import org.ambraproject.rhino.model.Article; import org.ambraproject.rhino.model.ArticleCategoryAssignment; import org.ambraproject.rhino.model.ArticleIngestion; import org.ambraproject.rhino.model.ArticleRevision; import org.ambraproject.rhino.model.Category; import org.ambraproject.rhino.service.ArticleCrudService; import org.ambraproject.rhino.service.taxonomy.TaxonomyClassificationService; import org.ambraproject.rhino.service.taxonomy.TaxonomyRemoteServiceInvalidBehaviorException; import org.ambraproject.rhino.service.taxonomy.TaxonomyRemoteServiceNotAvailableException; import org.ambraproject.rhino.service.taxonomy.TaxonomyRemoteServiceNotConfiguredException; import org.ambraproject.rhino.service.taxonomy.WeightedTerm; import org.apache.commons.lang3.StringEscapeUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPost; import org.apache.http.entity.ContentType; import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.CloseableHttpClient; import org.hibernate.Query; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.orm.hibernate3.HibernateTemplate; import org.w3c.dom.Document; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import javax.xml.parsers.DocumentBuilder; import java.io.IOException; import java.io.InputStream; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import static org.ambraproject.rhino.service.impl.AmbraService.newDocumentBuilder; /** * This is a separate bean from {@link TaxonomyServiceImpl} because it has a special dependency on the remote taxonomy * server, which is useful to inject separately. * * @author Alex Kudlick Date: 7/3/12 */ @SuppressWarnings("JpaQlInspection") public class TaxonomyClassificationServiceImpl implements TaxonomyClassificationService { private static final Logger log = LoggerFactory.getLogger(TaxonomyClassificationServiceImpl.class); private static final String MESSAGE_BEGIN = "<TMMAI project='%s' location = '.'>\n" + " <Method name='getSuggestedTermsFullPathsPlos' returnType='java.util.Vector'/>\n" + " <VectorParam>\n" + " <VectorElement>\n"; private static final String MESSAGE_DOC_ELEMENT = " <doc>\n" + " <header>\n" + "%s" + " </header>\n" + " <content>\n" + " %s\n" + " </content>\n" + " </doc>\n"; private static final String MESSAGE_HEADER = " <publication-date>%s</publication-date>\n" + " <journal-title>%s</journal-title>\n" + " <article-type>%s</article-type>\n" + " <article-id pub-id-type=\"doi\">%s</article-id>\n"; private static final String MESSAGE_END = " </VectorElement>\n" + " </VectorParam>\n" + "</TMMAI>"; // Number of most-weighted category leaf nodes to associate with each article // TODO: Make configurable? private static final int CATEGORY_COUNT = 8; @Autowired private CloseableHttpClient httpClient; @Autowired private RuntimeConfiguration runtimeConfiguration; @Autowired private ArticleCrudService articleCrudService; @Autowired protected HibernateTemplate hibernateTemplate; /** * @inheritDoc */ @Override public List<WeightedTerm> classifyArticle(Article article, Document articleXml) { RuntimeConfiguration.TaxonomyConfiguration configuration = getTaxonomyConfiguration(); List<String> rawTerms = getRawTerms(articleXml, article, false /*isTextRequired*/); List<WeightedTerm> results = new ArrayList<>(rawTerms.size()); for (String rawTerm : rawTerms) { WeightedTerm entry = parseVectorElement(rawTerm); String term = entry.getPath(); if (term != null) { boolean isBlacklisted = false; for (String blacklistedCategory : configuration.getCategoryBlacklist()) { if (term.startsWith(blacklistedCategory)) { isBlacklisted = true; break; } } if (!isBlacklisted) { results.add(entry); } } } return results; } private RuntimeConfiguration.TaxonomyConfiguration getTaxonomyConfiguration() { RuntimeConfiguration.TaxonomyConfiguration configuration = runtimeConfiguration.getTaxonomyConfiguration(); if (configuration.getServer() == null || configuration.getThesaurus() == null) { throw new TaxonomyRemoteServiceNotConfiguredException(); } return configuration; } private static final ContentType APPLICATION_XML_UTF_8 = ContentType.create("application/xml", Charsets.UTF_8); /** * @inheritDoc */ @Override public List<String> getRawTerms(Document articleXml, Article article, boolean isTextRequired) { RuntimeConfiguration.TaxonomyConfiguration configuration = getTaxonomyConfiguration(); String toCategorize = getCategorizationContent(articleXml); ArticleIngestion latest = articleCrudService.readLatestRevision(article).getIngestion(); String header = String.format(MESSAGE_HEADER, new SimpleDateFormat("yyyy-MM-dd").format(latest.getPublicationDate()), latest.getJournal().getTitle(), latest.getArticleType(), article.getDoi()); String aiMessage = String.format(MESSAGE_BEGIN, configuration.getThesaurus()) + StringEscapeUtils.escapeXml10(String.format(MESSAGE_DOC_ELEMENT, header, toCategorize)) + MESSAGE_END; HttpPost post = new HttpPost(configuration.getServer().toString()); post.setEntity(new StringEntity(aiMessage, APPLICATION_XML_UTF_8)); DocumentBuilder documentBuilder = newDocumentBuilder(); Document response; try (CloseableHttpResponse httpResponse = httpClient.execute(post); InputStream stream = httpResponse.getEntity().getContent()) { response = documentBuilder.parse(stream); } catch (IOException e) { throw new TaxonomyRemoteServiceNotAvailableException(e); } catch (SAXException e) { throw new TaxonomyRemoteServiceInvalidBehaviorException( "Invalid XML returned from " + configuration.getServer(), e); } //parse result NodeList vectorElements = response.getElementsByTagName("VectorElement"); List<String> results = new ArrayList<>(vectorElements.getLength()); // Add the text that is sent to taxonomy server if isTextRequired is true if (isTextRequired) { toCategorize = StringEscapeUtils.unescapeXml(toCategorize); results.add(toCategorize); } //The first and last elements of the vector response are just MAITERMS for (int i = 1; i < vectorElements.getLength() - 1; i++) { results.add(vectorElements.item(i).getTextContent()); } if ((isTextRequired && results.size() == 1) || results.isEmpty()) { log.error("Taxonomy server returned 0 terms. " + article.getDoi()); } return results; } @Override public Collection<ArticleCategoryAssignment> getAssignmentsForArticle(Article article) { return hibernateTemplate.execute(session -> { Query query = session .createQuery("" + "FROM ArticleCategoryAssignment aca " + "WHERE aca.article = :article"); query.setParameter("article", article); return (Collection<ArticleCategoryAssignment>) query.list(); }); } private static final Splitter TAXONOMY_PATH_SPLITTER = Splitter.on('/'); private static String getTermFromPath(String path) { return Iterables.getLast(TAXONOMY_PATH_SPLITTER.split(path)); } @Override public Collection<Category> getArticleCategoriesWithTerm(Article article, String term) { Objects.requireNonNull(term); return getAssignmentsForArticle(article).stream().filter((ArticleCategoryAssignment aca) -> { String path = aca.getCategory().getPath(); return getTermFromPath(path).equals(term); }).map(ArticleCategoryAssignment::getCategory).collect(Collectors.toList()); } /** * {@inheritDoc} */ @Override public void populateCategories(ArticleRevision revision) { ArticleIngestion ingestion = revision.getIngestion(); Article article = ingestion.getArticle(); Document xml = articleCrudService.getManuscriptXml(ingestion); List<WeightedTerm> terms; String doi = article.getDoi(); boolean isAmendment = false; //todo: fix or remove this when we find a home for article types if (!isAmendment) { terms = classifyArticle(article, xml); if (terms != null && terms.size() > 0) { List<WeightedTerm> leafNodes = getDistinctLeafNodes(CATEGORY_COUNT, terms); persistCategories(leafNodes, article); } else { log.error("Taxonomy server returned 0 terms. Cannot populate Categories. " + doi); } } } /** * Determine the most heavily weighted leaf nodes, then return all terms that have one of those leaf nodes. * <p> * The returned list is in descending order by weight. The order of terms with equal weight is stably preserved from * the input list. * * @param leafCount the number of distinct leaf nodes to search for * @param weightedTerms all weighted category terms on an article * @return a list, in descending order by weight, of all terms whose leaf node is among the most heavily weighted */ @VisibleForTesting static List<WeightedTerm> getDistinctLeafNodes(int leafCount, List<WeightedTerm> weightedTerms) { List<WeightedTerm> orderedTerms = weightedTerms.stream() .sorted(Comparator.comparing(WeightedTerm::getWeight).reversed()).collect(Collectors.toList()); Set<String> mostWeightedLeaves = orderedTerms.stream().map(WeightedTerm::getLeafTerm).distinct() .limit(leafCount).collect(Collectors.toSet()); return orderedTerms.stream().filter(term -> mostWeightedLeaves.contains(term.getLeafTerm())) .collect(Collectors.toList()); } private void persistCategories(List<WeightedTerm> terms, Article article) { Set<String> termStrings = terms.stream().map(WeightedTerm::getPath).collect(Collectors.toSet()); Collection<Category> existingCategories = hibernateTemplate.execute(session -> { Query query = session.createQuery("FROM Category WHERE path IN (:terms)"); query.setParameterList("terms", termStrings); return (Collection<Category>) query.list(); }); Map<String, Category> existingCategoryMap = Maps.uniqueIndex(existingCategories, Category::getPath); Collection<ArticleCategoryAssignment> existingAssignments = getAssignmentsForArticle(article); Map<Category, ArticleCategoryAssignment> assignmentMap = Maps.uniqueIndex(existingAssignments, ArticleCategoryAssignment::getCategory); assignmentMap = new HashMap<>(assignmentMap); // Make it mutable. We will remove assignments as they are updated. for (WeightedTerm term : terms) { Category category = existingCategoryMap.get(term.getPath()); if (category == null) { /* * A new category from the taxonomy server, which is not yet persisted in our system. Create it now. * * This risks a race condition if two articles are being populated concurrently and both have the same new * category, which can cause a "MySQLIntegrityConstraintViolationException: Duplicate entry" error. */ category = new Category(); category.setPath(term.getPath()); hibernateTemplate.save(category); } ArticleCategoryAssignment assignment = assignmentMap.remove(category); if (assignment == null) { hibernateTemplate.save(new ArticleCategoryAssignment(category, article, term.getWeight())); } else { assignment.setWeight(term.getWeight()); hibernateTemplate.update(assignment); } } // Each assignment that was not removed from assignmentMap is not among the new terms, so it should be deleted. assignmentMap.values().forEach(hibernateTemplate::delete); } // There appears to be a bug in the AI getSuggestedTermsFullPath method. // It's supposed to return a slash-delimited path that starts with a slash, // like an absolute Unix file path. However, rarely, it just returns "naked" // terms without the leading slash. Discard these, since the calling // code won't be able to handle this. (Note the first slash after <TERM> in the regex) //Positive (Good term) example response: //"<TERM>/Biology and life sciences/Computational biology/Computational neuroscience/Single neuron function|(5) neuron*(5)</TERM>" //This regex: //Confirms the response is good //Finds the term and places in the result //Finds first number wrapped in parentheses after the pipe symbol and places it in the result private static final Pattern TERM_PATTERN = Pattern.compile("<TERM>\\s*(/.*)\\|\\s*\\((\\d+)\\).*</TERM>"); /** * Parses a single line of the XML response from the taxonomy server. * * @param vectorElement The text body of a line of the response * @return the term and weight of the term */ @VisibleForTesting static WeightedTerm parseVectorElement(String vectorElement) { Matcher match = TERM_PATTERN.matcher(vectorElement); if (match.find()) { String text = match.group(1); int value = Integer.parseInt(match.group(2)); return new WeightedTerm(text, value); } else { //Bad term throw new TaxonomyRemoteServiceInvalidBehaviorException("Invalid syntax: " + vectorElement); } } /** * Adds the text content of the given element to the StringBuilder, if it exists. If more than one element exists with * the given name, only appends the first one. * * @param sb StringBuilder to be modified * @param dom DOM tree of an article * @param elementName name of element to search for in the dom * @return true if the StringBuilder was modified */ @VisibleForTesting static boolean appendElementIfExists(StringBuilder sb, Document dom, String elementName) { NodeList list = dom.getElementsByTagName(elementName); if (list != null && list.getLength() > 0) { sb.append(list.item(0).getTextContent()); sb.append("\n"); return true; } else { return false; } } /** * Adds the text content of all found elements to the StringBuilder, if they exist. * * @param sb StringBuilder to be modified * @param dom DOM tree of an article * @param elementName name of element to search for in the dom * @return true if the StringBuilder was modified */ private static boolean appendAllElementsIfExists(StringBuilder sb, Document dom, String elementName) { NodeList list = dom.getElementsByTagName(elementName); if (list != null && list.getLength() > 0) { for (int a = 0; a < list.getLength(); a++) { sb.append(list.item(a).getTextContent()); sb.append("\n"); } return true; } else { return false; } } /** * Returns a string containing only the parts of the article that should be examined by the taxonomy server. For * research articles, this is presently the title, the abstract, the Materials and Methods section, and the Results * section. (If any of these sections are not present, they are not sent, but this is not a fatal error.) If none of * these sections (abstract, materials/methods, or results) are present, then this method will return the entire body * text. This is usually the case for non-research-articles, such as corrections, opinion pieces, etc. * Please not that the "getSuggestedTermsFullPathsPlos" requires the data within the "content" tag to be * XML-escaped twice. Hence, we XML escape it once in this method and once when we escape the "doc" tag in * {@link getRawTerms} method. * * @param dom DOM tree of an article * @return raw text content, XML-escaped, of the relevant article sections */ @VisibleForTesting static String getCategorizationContent(Document dom) { StringBuilder sb = new StringBuilder(); appendElementIfExists(sb, dom, "article-title"); appendAllElementsIfExists(sb, dom, "abstract"); appendElementIfExists(sb, dom, "body"); return StringEscapeUtils.escapeXml10(sb.toString().trim()); } }