Java tutorial
/*************************************************************************** * Copyright (C) 2005 Global Biodiversity Information Facility Secretariat. * All Rights Reserved. * * The contents of this file are subject to the Mozilla Public * License Version 1.1 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or * implied. See the License for the specific language governing * rights and limitations under the License. ***************************************************************************/ package org.gbif.portal.harvest.taxonomy; import java.sql.Timestamp; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.lang.Thread; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.gbif.portal.dao.RelationshipAssertionDAO; import org.gbif.portal.dao.RemoteConceptDAO; import org.gbif.portal.dao.TaxonConceptDAO; import org.gbif.portal.dao.TaxonomyDenormaliserDAO; import org.gbif.portal.model.LinnaeanRankClassification; import org.gbif.portal.model.RelationshipAssertion; import org.gbif.portal.model.RemoteConcept; import org.gbif.portal.model.TaxonConcept; import org.gbif.portal.model.TaxonConceptLite; import org.gbif.portal.model.TaxonName; /** * Utilities for dealing with taxonomies * * @author trobertson */ public class TaxonomyUtils { /** * Threshold for comparisons to decide whether concepts match. 
This has been selected so as * to be lenient in cases in which only the kingdom matches (provided there is only one * candidate classification in the kingdom). For example, comparing: * * Plantae Magnoliophyta Magnoliopsida Apiales Apiaceae Oenanthe * Plantae Tracheophyta Dicotyledoneae Umbelliferales Umbelliferae Oenanthe * 30/30 0/10 0/10 0/10 0/30 - * * This gives a total of 30/90 or 33. */ public static final int COMPARISON_THRESHOLD = 33; /** * DAOs */ protected TaxonConceptDAO taxonConceptDAO; protected RemoteConceptDAO remoteConceptDAO; protected TaxonomyDenormaliserDAO taxonomyDenormaliserDAO; protected RelationshipAssertionDAO relationshipAssertionDAO; /** * Names that should be ignored from any classification */ protected Set<String> namesToIgnoreUppercase = new HashSet<String>(); /** * The name to use when the kingdom is not known. * This will be created and all concepts stored in it * Defaults to "Unknown" */ public String nameOfUnknownKingdom = "Unknown"; /** * Logger */ protected Log logger = LogFactory.getLog(getClass()); /** * The theashold at which we decide that a match is acceptable * Defaults to 33 */ protected int confidenceThreshold = 33; /** * When paging over the concepts, use this size * Default is 10000 */ protected int pageSize = 10000; /** * Lets the user modifiy any ranks after the regular expression parsing. * This is because the marker list may get pretty long, so handling it all in RegEx is both a massive amount * of configuration, and also slow to process due to the amount of reg ex matches */ protected Map<String, Integer> infraspecifiMarkerMappingsUppercase = new HashMap<String, Integer>(); /** * Denormalise the taxonomy for the provider id given * This will clear the denormalised version of the taxonomy and then rebuild it * - Kingdom and it's children are set to self point * - Phylum and it's children are set to self point * . * . * . 
* - Species and it's children are set to self point * * Then all children of minor ranks are set, in order of rank. This can be done since the highest taxa minor ranks themselves * are either roots (e.g. no higher parents) or have the parent set from above. If there are several minor ranks between major ranks, * since this is an ordered set, they will be handled. The next major rank, should it be a child of a minor rank will also be handled. * * Users should not that this is capable of taking a long time to complete for large taxonomies (hours?) * @param id to denormalise */ public void denormalisedTaxonomyForProvider(long id) { logger.info("Clearing the taxonomy for provider[" + id + "]"); long time = System.currentTimeMillis(); taxonomyDenormaliserDAO.clearDenormalisedDataForProvider(id); logger.info("Taxonomy for provider[" + id + "] cleared in " + (((1 + System.currentTimeMillis() - time)) / 1000) + " secs"); logger.info("Getting the providers ranks"); time = System.currentTimeMillis(); List<Integer> ranks = taxonomyDenormaliserDAO.getDistinctRanksForProvider(id); logger.info("Provider has " + ranks.size() + " ranks. Determined in " + (((1 + System.currentTimeMillis() - time)) / 1000) + " secs"); for (int rank : ranks) { logger.info("Denormalising rank: " + rank); time = System.currentTimeMillis(); taxonomyDenormaliserDAO.copyParentDenormalisationForRankAndForProvider(id, rank); logger.info("Rank [" + rank + "] denormalised in " + (((1 + System.currentTimeMillis() - time)) / 1000) + " secs"); } } /** * Denormalise the taxonomy for the resource id given * This will clear the denormalised version of the taxonomy and then rebuild it * - Kingdom and it's children are set to self point * - Phylum and it's children are set to self point * . * . * . * - Species and it's children are set to self point * * Then all chilren of minor ranks are set, in order of rank. This can be done since the highest taxa minor ranks themselves * are either roots (e.g. 
no higher parents) or have the parent set from above. If there are several minor ranks between major ranks, * since this is an ordered set, they will be handled. The next major rank, should it be a child of a minor rank will also be handled. * Users should not that this is capable of taking a long time to complete for large taxonomies (hours?) * @param id to denormalise */ public void denormalisedTaxonomyForResource(long id) { logger.info("Clearing the taxonomy for resource[" + id + "]"); long time = System.currentTimeMillis(); taxonomyDenormaliserDAO.clearDenormalisedDataForResource(id); logger.info("Taxonomy for resource[" + id + "] cleared in " + (((1 + System.currentTimeMillis() - time)) / 1000) + " secs"); logger.info("Getting the resources ranks"); time = System.currentTimeMillis(); List<Integer> ranks = taxonomyDenormaliserDAO.getDistinctRanksForResource(id); logger.info("Resource has " + ranks.size() + " ranks. Determined in " + (((1 + System.currentTimeMillis() - time)) / 1000) + " secs"); for (int rank : ranks) { if (rank >= 1000) { logger.info("Denormalising rank: " + rank); time = System.currentTimeMillis(); taxonomyDenormaliserDAO.copyParentDenormalisationForRankAndForResource(id, rank); logger.info("Rank [" + rank + "] denormalised in " + (((1 + System.currentTimeMillis() - time)) / 1000) + " secs"); } } } /** * Checks that there are no duplicate ranks in the classification, removing an arbitrary one and order the list * @param classification That needs to be ordered and checked */ public void ensureNoDuplicateRanksAndOrder(List<TaxonConceptLite> classification) { logClassification(toListOfTaxonName(classification), "Classification before sorting and removal of duplicates:"); // sort the classification Collections.sort(classification, new java.util.Comparator<TaxonConceptLite>() { public int compare(TaxonConceptLite o1, TaxonConceptLite o2) { if (o1.getRank() < o2.getRank()) { return -1; } else if (o1.getRank() == o2.getRank()) { return 0; } else { 
return 1; } } }); // Iterate through the classification, removing all duplicate ranks int lastRank = -1; for (Iterator<TaxonConceptLite> it = classification.iterator(); it.hasNext();) { TaxonConceptLite concept = it.next(); if (concept.getRank().intValue() == lastRank) { // remove from the iterator to avoid java.util.ConcurrentModificationException it.remove(); } lastRank = concept.getRank().intValue(); } logClassification(toListOfTaxonName(classification), "Classification after sorting and removal of duplicates:"); } /** * Logs the classification at debug level * @param classification To log */ public void logClassification(List<TaxonName> classification, String prefix) { if (logger.isDebugEnabled()) { StringBuffer sb = new StringBuffer(); if (prefix != null) { sb.append(prefix + "\n"); } for (TaxonName concept : classification) { sb.append(" - " + concept.toString() + "\n"); } logger.debug(sb.toString().trim()); } } /** * This will create a List of TaxonConceptLite from the name, and then call * @see TaxonomyUtils.synchronise(List<TaxonConceptLite, long long); * @param classification To synchronise * @param dataProviderId That the classification is to be synchronised with * @param dataResourceId That the classification is to be synchronised with * @param taxonomicPriority The priority ranking of the classification in generating portal taxonomy and views * @return The most significant concept in the classification (regardless of whether it was newly created or not) */ public TaxonConceptLite synchroniseNames(List<TaxonName> classification, long dataProviderId, long dataResourceId, int taxonomicPriority) { return synchronise(relationshipAssertionDAO, taxonConceptDAO, toTaxonConceptLiteList(classification, dataProviderId, dataResourceId, taxonomicPriority), dataProviderId, dataResourceId); } /** * Creates a list of unsaved concepts for the names provided * @param nameList To create concepts for * @param dataProviderId The provider * @param dataResourceId The 
resource * @param taxonomicPriority The priority ranking of the classification in generating portal taxonomy and views * @return The list of concepts */ public List<TaxonConceptLite> toTaxonConceptLiteList(List<TaxonName> nameList, long dataProviderId, long dataResourceId, int taxonomicPriority) { List<TaxonConceptLite> concepts = new LinkedList<TaxonConceptLite>(); for (TaxonName name : nameList) { concepts.add(toTaxonConceptLite(name, dataProviderId, dataResourceId, taxonomicPriority)); } return concepts; } /** * Creates an accepted concept for the name and provider and resource given * @param name To use * @param dataProviderId The provider * @param dataResourceId The resource * @param taxonomicPriority The priority ranking of the classification in generating portal taxonomy and views * @return An unpersisted concept for the name provided */ public TaxonConceptLite toTaxonConceptLite(TaxonName name, long dataProviderId, long dataResourceId, int taxonomicPriority) { TaxonConceptLite concept = new TaxonConceptLite(); concept.setAccepted(true); concept.setDataProviderId(dataProviderId); concept.setDataResourceId(dataResourceId); concept.setRank(name.getRank()); concept.setTaxonName(name); concept.setPriority(taxonomicPriority); return concept; } /** * This will synchronise the classification with that of the dataResource given (This will not sync merged taxonomies - * the data resource must be known). * * The algorthim starts at the top of the classification and works down. * It finds any concepts that already exist in the classification which have a parent id in the list of previously handled * concepts. Thus when handling "Puma" from the classification Animalia, Chordata, Puma, Puma concolor it will see if there * is a "Puma" with a parent of Animalia OR Chordata. The taxonomy may not be as full as the classification provided... 
* * @param classification To synchronise * @param dataProviderId That the classification is to be synchronised with * @param dataResourceId That the classification is to be synchronised with * @return The most significant concept in the classification (regardless of whether it was newly created or not) */ public TaxonConceptLite synchronise(RelationshipAssertionDAO relationshipAssertionDAO, TaxonConceptDAO taxonConceptDAO, List<TaxonConceptLite> classification, long dataProviderId, long dataResourceId) { logger.debug("Synchronising classification to dataResourceId[" + dataResourceId + "] dataProviderId[" + dataProviderId + "]"); if (classification == null || classification.size() == 0) { logger.warn("Received an empty classification"); return null; } // Update infraspecific marker ranks - e.g. detect f. var. etc updateInfraspecificMarkerRanks(classification); // This will ensure for example that there are no K,P,G,G,S ensureNoDuplicateRanksAndOrder(classification); // the persisted classification effectively List<TaxonConceptLite> persistedConcepts = new LinkedList<TaxonConceptLite>(); // maintain a list of the persisted taxon concept ids List<Long> persistedConceptIds = new LinkedList<Long>(); // loop through the names in the classification, making sure they are persisted, // cleaning up (e.g. 
filling in) any previously persisted taxonomic tree List<TaxonName> classificationNames = new LinkedList<TaxonName>(); for (TaxonConceptLite taxonConcept : classification) { TaxonName taxonName = taxonConcept.getTaxonName(); logger.debug("Synchronising TaxonName: " + taxonName); classificationNames.add(taxonName); TaxonConceptLite persisted = taxonConcept; // the one supplied may be in the taxonomy already so ignore it in that case // or it may be from another resource if (persisted.getId() == null || persisted.getId() < 1 || (taxonConcept.getDataResourceId() != null && taxonConcept.getDataResourceId().longValue() != dataResourceId)) { // get any persisted concepts for the name and rank that have a parent concept equal to that already handled persisted = getTaxonConceptForClassification(relationshipAssertionDAO, taxonConceptDAO, dataProviderId, dataResourceId, classificationNames, COMPARISON_THRESHOLD); } if (persisted != null) { logger.debug("There is a persisted concept for " + taxonName); logger.debug("persistedConceptIds: " + persistedConceptIds); logger.debug("Persisted parent id: " + persisted.getParentId()); // the previously persisted concept may not have as much info as this one // E.g. Maybe persisted was Animalia Mammalia, but now we know Chordata is in there... 
if (persistedConceptIds.size() > 0 && !persistedConceptIds.get(persistedConceptIds.size() - 1) .equals(persisted.getParentId())) { TaxonConceptLite parent = persistedConcepts.get(persistedConcepts.size() - 1); // make sure that the new parent is actually a more significant rank Long oldParentId = persisted.getParentId(); if (oldParentId != null && oldParentId > 0) { TaxonConceptLite oldParent = taxonConceptDAO.getTaxonConcept(oldParentId); if (oldParent.getRank().intValue() < parent.getRank().intValue()) { logger.debug("Updating the parent of a previously persisted concept"); taxonConceptDAO.updateParent(persisted.getId(), parent.getId()); } else { logger.debug("Not updating the old parent as the rank is same or lower"); } } else { logger.debug("Updating the parent of a previously persisted concept"); taxonConceptDAO.updateParent(persisted.getId(), parent.getId()); } // it is soemtimes (and not very often so this extra DB hit should not be a performance problem) // the case that you import a concept that is not accepted, and then find one within the same // taxonomy that is - this means it for sure needs to be accepted if ((dataResourceId == 1 && taxonConcept.isSecondary() || taxonConcept.isAccepted()) && (!persisted.isAccepted())) { logger.debug("Updating a previously stored concept [" + persisted.getId() + "] that was not accepted to be accepted"); taxonConceptDAO.updateAccepted(persisted.getId(), true); } } persistedConcepts.add(persisted); persistedConceptIds.add(persisted.getId()); } else { logger.debug("Creating a new concept for DR[" + dataResourceId + "]: " + taxonName); TaxonConcept tc = new TaxonConcept(); tc.setTaxonName(taxonName); tc.setRank(taxonName.getRank()); tc.setDataProviderId(dataProviderId); tc.setDataResourceId(dataResourceId); if (dataResourceId == 1 && taxonConcept.isSecondary()) { tc.setAccepted(false); } else { tc.setAccepted(taxonConcept.isAccepted()); } tc.setNubConcept(taxonConcept.isNubConcept()); 
tc.setPartnerConceptId(taxonConcept.getPartnerConceptId()); tc.setSecondary(taxonConcept.isSecondary()); tc.setPriority(taxonConcept.getPriority()); if (persistedConceptIds.size() > 0) { TaxonConceptLite parent = persistedConcepts.get(persistedConcepts.size() - 1); tc.setParentId(parent.getId()); } long id = taxonConceptDAO.create(tc); persistedConcepts.add(tc); persistedConceptIds.add(id); } } return (persistedConcepts.size() > 0) ? persistedConcepts.get(persistedConcepts.size() - 1) : null; } /** * Inspects any infraspecific markers and updates the rank of the taxon name and the taxon concept * @param classification */ public void updateInfraspecificMarkerRanks(List<TaxonConceptLite> classification) { for (TaxonConceptLite tcl : classification) { String marker = tcl.getTaxonName().getInfraSpecificMarker(); if (marker != null) { logger.debug("Inspecting the infraspecific marker to see if the rank needs modified: " + marker); if (infraspecifiMarkerMappingsUppercase.containsKey(marker.toUpperCase())) { if (tcl.getTaxonName().getRank() != infraspecifiMarkerMappingsUppercase .get(marker.toUpperCase())) { logger.debug("Marker [" + marker + "] found. Setting rank from [" + tcl.getTaxonName().getRank() + "] to [" + infraspecifiMarkerMappingsUppercase.get(marker.toUpperCase()) + "]"); tcl.setRank(infraspecifiMarkerMappingsUppercase.get(marker.toUpperCase())); tcl.getTaxonName().setRank(infraspecifiMarkerMappingsUppercase.get(marker.toUpperCase())); } else { logger.debug("Marker [" + marker + "] found and is already set correctly"); } } else { logger.debug("Marker [" + marker + "] not found in list of markers to modify - rank will remain unchanged"); } } } } /** * A utility that will effectively ensure that the taxonomy from one data resource is represented fully in another. 
* For all concepts that exists in the source, the target is checked to see if there exists a concept representing the same * classification (note that the target may be a more complete classification that the source). If the concept does not exist, * then the concept is created. * * Typically this method would be used to build a NUB taxonomy. Taxonomic data resources would be imported with allowCreateKingdoms first, * and then inferred taxonomies would be imported with unknownKingdoms collated. * * This will import accepted concepts and then non accepted concepts in order of rank * * @param sourceDataResourceId The resource holding the concepts that are to be imported into the target * @param targetDataResourceId The target resource to ensure encapsualtes all concepts in the source * @param targetDataProviderId The data provider for the resource owning the taxonomy being built - this MUST own the targetDataResourceId * @param allowCreateUnknownKingdoms If this is set to false then the TaxonomyUtils.nameOfUnknownKingdom is used for any kingdom that * @param majorRanksOnly If this is set to true, then only major ranks will be imported * @param unpartneredOnly If this is set to true, then only concepts with no partner concept id will be imported * is not represented in the target taxonomy. If set to true, then the kingdoms are imported from the source. 
* @throws InterruptedException */ public void importTaxonomyFromDataResource(long sourceDataResourceId, long targetDataResourceId, long targetDataProviderId, boolean allowCreateUnknownKingdoms, boolean majorRanksOnly, boolean unpartneredOnly) throws InterruptedException { List<Integer> ranksToImport = null; if (unpartneredOnly) { ranksToImport = taxonConceptDAO.getUnpartneredRanksWithinResource(sourceDataResourceId); } else { ranksToImport = taxonConceptDAO.getRanksWithinResource(sourceDataResourceId); } logger.debug("There are " + ranksToImport.size() + " ranks to import from data resource[" + sourceDataResourceId + "]: " + ranksToImport); ExecutorService es = Executors.newCachedThreadPool(); for (int i = 0; i < ranksToImport.size(); i++) { int rank = (ranksToImport.get(i)); es.execute(new Thread(new TaxonomyThread(relationshipAssertionDAO, taxonConceptDAO, targetDataResourceId, targetDataProviderId, allowCreateUnknownKingdoms, majorRanksOnly, unpartneredOnly, sourceDataResourceId, rank))); } es.shutdown(); while (!es.isTerminated()) { try { Thread.sleep(100); } catch (InterruptedException e) { e.printStackTrace(); } } System.out.println("Finalizados todos los hilos del recurso : " + sourceDataResourceId); } /** * Imports the ranks taxonomy from one resource to another for the given parameters * @param sourceDataResourceId The source resource * @param targetDataResourceId The target resource * @param targetDataProviderId The target provider * @param allowCreateUnknownKingdoms Control flag * @param majorRanksOnly Control flag * @param rank The rank to import * @param accepted Control flag - will import only accepted / non accepted concepts * @param unpartneredOnly Control flag - to set whether we want to import only unpartnered concepts */ protected void importTaxonomyFromDataResource(RelationshipAssertionDAO relationshipAssertionDAO, TaxonConceptDAO taxonConceptDAO, long sourceDataResourceId, long targetDataResourceId, long targetDataProviderId, boolean 
allowCreateUnknownKingdoms, boolean majorRanksOnly, int rank, boolean accepted, boolean unpartnered) { boolean hasMore = true; long minId = 0; while (hasMore) { logger.info("Getting concepts of rank[" + rank + "] with minimum id[" + minId + "] and accepted[" + accepted + "] unpartneredOnly[" + unpartnered + "]"); List<List<TaxonConceptLite>> classifications = taxonConceptDAO.getClassificationsOf(rank, sourceDataResourceId, false, accepted, minId, pageSize, unpartnered); if (accepted) { logger.info("Received " + classifications.size() + " accepted concepts of rank[" + rank + "] unpartneredOnly[" + unpartnered + "]"); } else { logger.info("Received " + classifications.size() + " non accepted concepts of rank[" + rank + "] unpartneredOnly[" + unpartnered + "]"); } for (List<TaxonConceptLite> classification : classifications) { if (classification.size() > 0) { minId = classification.get(classification.size() - 1).getId(); } classification = removeUnwantedConcepts(classification); if (majorRanksOnly) { classification = removeMinorRanks(classification); } if (classification.size() > 0) { // store the importing one long importingConceptId = classification.get(classification.size() - 1).getId(); logger.debug("Finding target id"); TaxonConceptLite nub = synchroniseAtLowestJoinPoint(relationshipAssertionDAO, taxonConceptDAO, classification, targetDataProviderId, targetDataResourceId, allowCreateUnknownKingdoms); logger.debug("Target id: " + nub.getId()); logger.debug("Setting " + importingConceptId + " to partner " + nub.getId()); taxonConceptDAO.updatePartnerConcept(importingConceptId, nub.getId()); } } // see if there needs to be another page received hasMore = (classifications.size() >= pageSize); } } /** * Using the configured names to ignore, removes any concepts from the classification that are not wanted * @return The trimmed classification */ public List<TaxonConceptLite> removeUnwantedConcepts(List<TaxonConceptLite> classification) { List<TaxonConceptLite> 
newClassification = new LinkedList<TaxonConceptLite>(); for (TaxonConceptLite concept : classification) { boolean add = true; for (String nameToIgnore : getNamesToIgnoreUppercase()) { if (StringUtils.equalsIgnoreCase(nameToIgnore, concept.getTaxonName().getCanonical())) { add = false; } } if (add) { newClassification.add(concept); } } return newClassification; } /** * Using the configured names to ignore, removes any concepts from Linnaean ranks * @return The trimmed classification */ public void removeUnwantedNames(LinnaeanRankClassification classification) { if (shouldIgnore(classification.getKingdom())) { classification.setKingdom(null); } if (shouldIgnore(classification.getPhylum())) { classification.setPhylum(null); } if (shouldIgnore(classification.getKlass())) { classification.setKlass(null); } if (shouldIgnore(classification.getOrder())) { classification.setOrder(null); } if (shouldIgnore(classification.getFamily())) { classification.setFamily(null); } if (shouldIgnore(classification.getGenus())) { classification.setGenus(null); } if (shouldIgnore(classification.getScientificName())) { classification.setScientificName(null); } } protected boolean shouldIgnore(String name) { for (String nameToIgnore : getNamesToIgnoreUppercase()) { if (StringUtils.equalsIgnoreCase(nameToIgnore, name)) { return true; } } return false; } /** * Using the configured names to ignore, removes any concepts from the classification that are not wanted * @return The trimmed classification */ public List<TaxonName> removeUnwantedNames(List<TaxonName> classification) { List<TaxonName> newClassification = new LinkedList<TaxonName>(); for (TaxonName name : classification) { boolean add = true; for (String nameToIgnore : getNamesToIgnoreUppercase()) { if (StringUtils.equalsIgnoreCase(nameToIgnore, name.getCanonical())) { logger.debug("Ignoring name: " + name); add = false; } } if (add) { newClassification.add(name); } } return newClassification; } /** * Removes any minor ranks * @return 
/**
 * Removes any minor ranks, i.e. keeps only the concepts whose rank is an exact multiple of 1000.
 * The supplied list is not modified.
 *
 * @param classification the classification to filter
 * @return The trimmed classification, as a new list in the original order
 */
public List<TaxonConceptLite> removeMinorRanks(List<TaxonConceptLite> classification) {
  List<TaxonConceptLite> newClassification = new LinkedList<TaxonConceptLite>();
  for (TaxonConceptLite concept : classification) {
    // ranks that divide evenly by 1000 are kept
    if (concept.getRank() % 1000 == 0) {
      newClassification.add(concept);
    }
  }
  return newClassification;
}

/**
 * This will merge the given classification into the target resource taxonomy.
 * Only the required concepts are created from the lowest join point. That is to say a classification of
 *
 * - Animalia Chordata Felidae Puma Puma concolor
 *
 * Merged into a taxonomy containing
 *
 * - Animalia Felidae Puma
 *
 * Would only create "Puma concolor", since the join point is Puma
 *
 * Three cases are handled:
 * 1. A concept for the classification is already found in the target: the classification is still
 *    synchronised so that higher ranks are filled in.
 * 2. No concept is found but a join point is: the concepts ranked below the join point are
 *    synchronised together with the join point's persisted ancestry.
 * 3. Neither is found: the whole classification is synchronised, below either its own kingdom
 *    (when allowed) or the "unknown" kingdom.
 *
 * The concepts in the supplied classification (and the list itself, in case 3) may be modified.
 *
 * @param relationshipAssertionDAO DAO passed through to the lookups and synchronisation
 * @param taxonConceptDAO DAO used for concept lookups
 * @param classification To synchronise
 * @param targetProviderId The target provider
 * @param targetResourceId The target taxonomy resource
 * @param createUnknownKingdoms Flag to determine if kingdoms can be created during the synchronising process
 * @return The most significant concept in the classification (regardless of
 *         whether it was newly created or not)
 */
public TaxonConceptLite synchroniseAtLowestJoinPoint(RelationshipAssertionDAO relationshipAssertionDAO,
    TaxonConceptDAO taxonConceptDAO, List<TaxonConceptLite> classification, long targetProviderId,
    long targetResourceId, boolean createUnknownKingdoms) {
  TaxonConceptLite targetConcept = getTaxonConceptForClassification(relationshipAssertionDAO, taxonConceptDAO,
      targetProviderId, targetResourceId, toListOfTaxonName(classification), COMPARISON_THRESHOLD);
  if (targetConcept != null) {
    logger.debug("The concept already exists in target resource [id: " + targetResourceId + "]: "
        + toListOfTaxonName(classification));
    // still synchronise to ensure that any higher ranks are filled in
    if (!createUnknownKingdoms) {
      for (TaxonConceptLite sourceTaxonConcept : classification) {
        // rank 1000 is treated as the kingdom rank throughout this method
        if (sourceTaxonConcept.getRank() == 1000) {
          if (taxonConceptDAO.getTaxonConcept(sourceTaxonConcept.getTaxonName().getCanonical(), 1000,
              targetResourceId, null) == null) {
            // The classification includes an unknown kingdom - replace it
            if (logger.isDebugEnabled())
              logger.debug("Replacing the unknown kingdom " + sourceTaxonConcept.getTaxonName().getCanonical());
            sourceTaxonConcept.setTaxonName(new TaxonName(getNameOfUnknownKingdom(), null, 1000));
            sourceTaxonConcept.setAccepted(false);
            sourceTaxonConcept.setPriority(100);
            // only the first kingdom-ranked concept missing from the target is replaced
            break;
          }
        }
      }
    }
    targetConcept = synchronise(relationshipAssertionDAO, taxonConceptDAO, classification, targetProviderId,
        targetResourceId);

    // Handle the situation in which this concept was added to the nub as a non-accepted secondary
    // taxon but the taxon in the supplied classification is not secondary.
    if (targetResourceId == 1) {
      TaxonConceptLite mostSignificantConcept = classification.get(classification.size() - 1);
      if (!mostSignificantConcept.isSecondary() && targetConcept.isSecondary()
          && !targetConcept.isAccepted()) {
        processSecondaryConcepts(mostSignificantConcept, targetConcept);
      }
    }

    return targetConcept;

  } else {
    logger.debug("Not found in target resource [id: " + targetResourceId
        + "] - determining join point to resource for classification: " + toListOfTaxonName(classification));
    TaxonConceptLite targetJoinPoint = getJoinPoint(taxonConceptDAO, toListOfTaxonName(classification), null,
        targetProviderId, targetResourceId);

    if (targetJoinPoint != null) {
      logger.debug("Join point: " + targetJoinPoint.getTaxonName());
      if (targetJoinPoint.getRank() == 1000) {
        logger.debug("This classification is joining onto kingdom in the target taxonomy: "
            + toListOfTaxonName(classification));
      }

      // build a new list of the concepts to synchronise
      List<TaxonConceptLite> toSync = new LinkedList<TaxonConceptLite>();
      for (TaxonConceptLite concept : classification) {
        // only concepts ranked below the join point need to be added to the target
        if (concept.getRank() > targetJoinPoint.getRank()) {
          if (!createUnknownKingdoms && concept.getRank() == 1000) {
            concept.setTaxonName(new TaxonName(getNameOfUnknownKingdom(), null, 1000));
            concept.setAccepted(false);
            concept.setPriority(100);
          }
          // remember the source concept as the partner, then clear the id so that
          // synchronise does not treat the concept as already persisted
          concept.setPartnerConceptId(concept.getId());
          concept.setNubConcept(true);
          concept.setId(null);
          toSync.add(concept);
        }
      }

      // prepend the join point and its ancestors so the list starts from the persisted tree
      // NOTE(review): a concept whose parent id is null is not added by this loop, so the
      // root of the persisted tree is left out of toSync - confirm that this is intended
      TaxonConceptLite conceptToAdd = targetJoinPoint;
      while (conceptToAdd != null && conceptToAdd.getParentId() != null) {
        toSync.add(0, conceptToAdd);
        conceptToAdd = taxonConceptDAO.getTaxonConceptLite(conceptToAdd.getParentId());
      }

      logger.debug("Synchronising: " + toListOfTaxonName(toSync));
      TaxonConceptLite insertedConcept = synchronise(relationshipAssertionDAO, taxonConceptDAO, toSync,
          targetProviderId, targetResourceId);

      // For the nub taxonomy, handle secondary concepts
      if (targetResourceId == 1) {
        processSecondaryConcepts(insertedConcept, targetJoinPoint);
      }

      return insertedConcept;

    } else {
      logger.debug("No join point found for: " + toListOfTaxonName(classification));
      // every concept will be new in the target: keep the source id as the partner id
      for (TaxonConceptLite concept : classification) {
        concept.setPartnerConceptId(concept.getId());
        concept.setNubConcept(true);
        concept.setId(null);
      }

      // if it's trying to make a kingdom, check it is allowed
      if (classification.get(0).getRank() == 1000 && createUnknownKingdoms) {
        logger.warn(
            "Classification cannot be joined onto the target in any way - (creating a new kingdom): "
                + toListOfTaxonName(classification));
        return synchronise(relationshipAssertionDAO, taxonConceptDAO, classification, targetProviderId,
            targetResourceId);
      } else {
        TaxonConceptLite oldKingdom = null;
        // removed the kingdom that can't be created if any
        if (classification.get(0).getRank() == 1000) {
          oldKingdom = classification.remove(0);
        }
        // add an unknown kingdom
        TaxonConceptLite tcl = new TaxonConceptLite();
        tcl.setTaxonName(new TaxonName(getNameOfUnknownKingdom(), null, 1000));
        tcl.setRank(1000);
        tcl.setNubConcept(true);
        tcl.setAccepted(false);
        tcl.setPriority(100);
        // carry over the partner of the kingdom that was removed, when there was one
        tcl.setPartnerConceptId(oldKingdom == null ? null : oldKingdom.getPartnerConceptId());
        classification.add(0, tcl);
        logger.debug(
            "Classification cannot be joined onto the target in any way - (using the \"unknown\" kingdom): "
                + toListOfTaxonName(classification));
        return synchronise(relationshipAssertionDAO, taxonConceptDAO, classification, targetProviderId,
            targetResourceId);
      }
    }
  }
}
* @param insertedConcept * @param parentConcept */ private void processSecondaryConcepts(TaxonConceptLite testConcept, TaxonConceptLite conceptToProcess) { // There is nothing to do if the new concept is secondary if (!testConcept.isSecondary() && testConcept.isAccepted()) { // This concept is not secondary - ensure the entire hierarchy is accepted while (conceptToProcess != null && conceptToProcess.isSecondary() && !conceptToProcess.isAccepted()) { conceptToProcess.setAccepted(true); taxonConceptDAO.updateAccepted(conceptToProcess.getId(), true); conceptToProcess = taxonConceptDAO.getTaxonConceptLite(conceptToProcess.getParentId()); } } } /** * Gets the join join point for the classification provided in the target providers taxonomy. * This should be called AFTER it has been deduced that there is no concept in the target taxonomy representing the lowest * taxa in the classification. This is required because this method will ignore the lowest taxa and go up the * tree to find the first non clashing classification in the target taxonomy. * Should the provided classification not have a kingdom, then the one provided is added. This is for safety checking, * as without the kingdom, the results could be undesirable (Note, that it will except a NULL kingdom which will not be added - this should * not be supplied unless you REALLY know what you are doing...) * @param classification To find where it can be merged to * @param kingdom To use should the classification not have one already. 
The kingdom is added to the classification * should it be missing * @return The taxon concept within the target that can be merged to, at the appropriate rank */ public TaxonConceptLite getJoinPoint(TaxonConceptDAO taxonConceptDAO, List<TaxonName> classification, String kingdom, long targetProviderId, long targetResourceId) { TaxonConceptLite targetConcept = null; // get the kingdom in there if needbe if (StringUtils.isNotEmpty(kingdom)) { if (classification.get(0).getRank() != 1000) { classification.add(0, new TaxonName(kingdom, null, 1000)); } } else { if (classification.get(0).getRank() != 1000) { classification.add(0, new TaxonName(getNameOfUnknownKingdom(), null, 1000)); } } // It must be at least 2 big - Kingdom + something, or else it is just a kingdom // Since this is called AFTER it has been deduced that the concept does not exist then // what we actually have is a kingdom that is not in the target - HIGHLY SUSPICIOUS // Thus we only deal with classifications larger that 2 if (classification.size() >= 2) { // build a copy of the classification to shrink down - don't modify the original // ignoring the last one as we know that it is not in the target List<TaxonName> workingClassification = new LinkedList<TaxonName>(classification); workingClassification.remove(workingClassification.size() - 1); // go from lowest to highest taxa finding a point at which this can be merged into the target for (int i = workingClassification.size() - 1; i >= 0; i--) { targetConcept = getTaxonConceptForClassification(relationshipAssertionDAO, taxonConceptDAO, targetProviderId, targetResourceId, workingClassification, COMPARISON_THRESHOLD); if (targetConcept != null) { break; } else { workingClassification.remove(workingClassification.size() - 1); } } } if (targetConcept != null) { logger.debug("Target taxonomy join point found: " + targetConcept.getTaxonName() + " for classification: " + classification); } else { logger.debug( "No concept join point found for classification in 
the target taxonomy: " + classification); } return targetConcept; } /** * Gets the taxon concept for the classification provided if it exists. * * This method is intended to be used in the following subtly different situations: * * 1. When importing the taxonomy for a well-managed taxonomic database. In this case * it is expected that each taxon will consistently appear with the same classification * in all records in which it appears. If the same taxon name appears in multiple * slightly different records, it is to be expected that the instances are cases of * homonymy and should be preserved as separate entities. * * 2. When importing the taxonomy for a database not intended to be taxonomically * authoritative (e.g. a collection database). In this case the assumption is that * the data may include varying classifications for the same taxon and that the * system should be more cautious about presenting them as different. * * 3. When merging the taxonomy from a well-managed taxonomic database into the portal * nub taxonomy. In this case the classifications may not match those from other * (even authoritative) sources but the portal should be able to maintain any * distinctions made in the taxonomic database itself. * * 4. When merging the taxonomy from a database not intended to be taxonomically * into the portal nub taxonomy. In this case the classifications will often not * match those from other sources and the portal should not assume that they * represent different taxa unless there are very strong reasons to do so. * * The key requirements to handle these cases are as follows: * * A. In situations 1 and 2, the requirement is to import the resource * taxonomy with as much fidelity as possible. The method should therefore * preserve every apparent classification and only merge those which provide * different compatible subsets of the same classification. 
If the resource * taxonomy is not at all well-managed, this may mean that there are many different * representations for the same taxon in different locations in the same * taxonomy. This will be handled when the taxonomy is tied to the portal * nub taxonomy. More generally, when importing taxonomies, this method should * only return completely compatible matches from the same resource's taxonomy. * Little harm will befall the portal from over-distinguishing taxa at * this point, since the important stage will be in situation 3, when the * dataset is merged into the portal taxonomy. * * REQUIREMENT - INCLUDE A MODE THAT FINDS ONLY FULLY COMPATIBLE CLASSIFICATIONS * * NOTE: special rules may be required for some databases that require unique * handling. The hardest cases will relate to records representing different * concepts for the same taxon name with the same classification. Special * processing outside this method will be required to handle such cases. * * B. In situation 4 on the other hand, the requirement is to minimise the number * of cases in which the same taxon is split into multiple locations in the * taxonomy. The method should therefore be able to determine whether a * suitable join point already exists and to use it, even if a significant * proportion of the classification is different. * * REQUIREMENT - INCLUDE A MODE THAT SELECTS THE MOST SUITABLE CLASSIFICATION IF * ONE EXISTS * * C. In situation 3, the requirement is again to reuse an existing taxon if one * is suitable, but the method should not conflate taxa that have been explicitly * separated by the resource. * * REQUIREMENT - INCLUDE A MODE THAT SELECTS THE MOST SUITABLE CLASSIFICATION * BUT RESPECTS DIFFERENT TAXA SHARING THE SAME NAME WITHIN THE * SOURCE CLASSIFICATION * * D. 
In situations 3 and 4, the portal should detect cases in which it is not * possible safely to merge a candidate taxon with any of the existing taxa under * the given name, and should create a special taxon concept to store the * ambiguous information. It should avoid multiplying these disambiguation * concepts for the same name, since otherwise the taxonomy will become * impossibly complex. * * REQUIREMENT - INCLUDE A MODE THAT CAN CREATE DISAMBIGUATION TAXA AS NEEDED * * These requirements are handled as follows: * * i. If the request is to find a concept in taxonomies other than the portal * taxonomy, full compatibility is required (i.e. rejecting any classification * with a different name in the same position). Otherwise no match is returned. * This addresses requirement A. * * ii. If the request is to find a concept in the portal taxonomy (resource 1), this * method will find the most suitable classification using a reasonably lenient * matching algorithm (threshold set to 33). This addresses requirement B. * * iii. If the matching algorithm in ii. cannot distinguish between multiple concepts, * and the request is for the portal taxonomy, a disambiguation taxon is returned. * This addresses requirement D. * * iv. In cases in which the resource taxonomy may include homonyms, it is the * responsibility of code using this method to determine which resource concepts * to associate with existing concepts, and which resource concepts require new * concepts. This cannot be handled at this level. This addresses requirement * C. * * NOTE: Import of authoritative taxonomies requires additional logic around this * method to ensure correct import and merging of homonyms. 
* * @param targetProviderId To define what the target taxonomy is * @param targetResourceId To define what the target taxonomy is * @param classification That needs to be allocated a target Concept Id * @param threshold Minimum acceptable measure for classificationsComparator() * @return The target concept or null if non found */ public TaxonConceptLite getTaxonConceptForClassification(RelationshipAssertionDAO relationshipAssertionDAO, TaxonConceptDAO taxonConceptDAO, Long targetProviderId, Long targetResourceId, List<TaxonName> classification, int threshold) { // This method is simply a public wrapper around the potentially recursive // implementation. It should always be called externally in such a way that // it can recurse, but it needs to be able to invoke itself with or without // further recursion. boolean disambiguate = (targetResourceId == 1); boolean requireFullCompatibility = (targetResourceId != 1); return getTaxonConceptForClassification(relationshipAssertionDAO, taxonConceptDAO, targetProviderId, targetResourceId, classification, threshold, disambiguate, requireFullCompatibility, true); } /** * Private implementation allowing for recursion * * @param targetProviderId To define what the target taxonomy is * @param targetResourceId To define what the target taxonomy is * @param classification That needs to be allocated a target Concept Id * @param threshold Minimum acceptable measure for classificationsComparator() * (use COMPARISON_THRESHOLD for normal behaviour) * @param disambiguate True if the method should create disambiguation concepts * @param requireFullCompatibility True if only fully compatible matches are allowed * (N.B. 
leave threshold at its normal level when using this option) * @param recurse True if recursion is allowed * @return The target concept or null if non found */ private TaxonConceptLite getTaxonConceptForClassification(RelationshipAssertionDAO relationshipAssertionDAO, TaxonConceptDAO taxonConceptDAO, Long targetProviderId, Long targetResourceId, List<TaxonName> classification, int threshold, boolean disambiguate, boolean requireFullCompatibility, boolean recurse) { TaxonConceptLite targetConcept = null; if (classification != null && classification.size() > 0) { TaxonName mostSignificantName = classification.get(classification.size() - 1); List<List<TaxonConceptLite>> targetOptions = null; if (targetResourceId != null) { targetOptions = taxonConceptDAO.getClassificationsOf(mostSignificantName.getCanonical(), // be lenient - get the one that does not care authorship... //mostSignificantName.getAuthor(), mostSignificantName.getRank(), targetResourceId); } else { targetOptions = taxonConceptDAO.getClassificationsWithinProviderOf( mostSignificantName.getCanonical(), // be lenient - get the one that does not care authorship... //mostSignificantName.getAuthor(), mostSignificantName.getRank(), targetProviderId); } // We measure the classifications as two sets, those that are accepted as part of the taxonomy and those // that are not. If there is a suitable accepted taxon, we always take it over a better matching // unaccepted taxon. This is because many of these unaccepted taxa will have been added precisely // because they appear in the hierarchy of the present record - using these would mean that the // merged taxonomy would just include all classifications from all resources without any filter. 
int bestMeasure = -1; int bestUnacceptedMeasure = -1; List<List<TaxonConceptLite>> bestClassifications = null; List<List<TaxonConceptLite>> bestUnacceptedClassifications = null; for (List<TaxonConceptLite> targetClassificationTC : targetOptions) { List<TaxonName> target = toListOfTaxonName(targetClassificationTC); int measure = classificationsComparator(target, classification, mostSignificantName.getRank() - 1, requireFullCompatibility); if (targetClassificationTC.get(targetClassificationTC.size() - 1).isAccepted()) { if (measure > bestMeasure) { bestMeasure = measure; bestClassifications = new LinkedList<List<TaxonConceptLite>>(); } if (measure == bestMeasure) { bestClassifications.add(targetClassificationTC); } } else { if (measure > bestUnacceptedMeasure) { bestUnacceptedMeasure = measure; bestUnacceptedClassifications = new LinkedList<List<TaxonConceptLite>>(); } if (measure == bestUnacceptedMeasure) { bestUnacceptedClassifications.add(targetClassificationTC); } } } // No accepted taxa - try unaccepted if (bestMeasure < threshold) { bestMeasure = bestUnacceptedMeasure; bestClassifications = bestUnacceptedClassifications; } if (bestMeasure >= threshold) { if (bestClassifications.size() == 1) { List<TaxonConceptLite> bestClassification = bestClassifications.get(0); targetConcept = bestClassification.get(bestClassification.size() - 1); } else { if (recurse) { // We have an embarassment of riches here - let's see if we can choose // one of these with a little more work... // If the supplied classification includes another real taxon name, let's // see if we can find a taxon concept for it. 
int ancestorIndex; for (ancestorIndex = classification.size() - 2; ancestorIndex >= 0; ancestorIndex--) { TaxonName ancestorName = classification.get(ancestorIndex); if (ancestorName != null && ancestorName.getCanonical() != null && !namesToIgnoreUppercase .contains(ancestorName.getCanonical().toUpperCase())) { break; } } if (ancestorIndex >= 0) { // Here is the one to try - note that this is recursive // Create a shorter classification to test List<TaxonName> ancestorClassification = new LinkedList<TaxonName>(); int i = 0; for (TaxonName name : classification) { ancestorClassification.add(name); if (i == ancestorIndex) { break; } else { i++; } } // Call ourselves recursively to find the ancestor, but do not trigger // disambiguation at the ancestor level TaxonConceptLite ancestorConcept = getTaxonConceptForClassification( relationshipAssertionDAO, taxonConceptDAO, targetProviderId, targetResourceId, ancestorClassification, threshold, false, requireFullCompatibility, true); if (ancestorConcept != null) { // We managed to find the immediate ancestor of this name in the classification // Let's build a classification based on that concept's classification with the // original name from the request and then try using that (with both recursion // and disambiguation disabled). List<TaxonConceptLite> ancestorConcepts = getClassificationConcepts(taxonConceptDAO, ancestorConcept.getId()); List<TaxonName> newClassification = toListOfTaxonName(ancestorConcepts); newClassification.add(mostSignificantName); targetConcept = getTaxonConceptForClassification(relationshipAssertionDAO, taxonConceptDAO, targetProviderId, targetResourceId, newClassification, threshold, disambiguate, requireFullCompatibility, false); } } } // If we still cannot distinguish the classifications should we set up a // disambiguation concept for them? 
if (targetConcept == null && disambiguate) { if (targetConcept == null) { targetConcept = createDisambiguationConcept(relationshipAssertionDAO, taxonConceptDAO, targetProviderId, targetResourceId, bestClassifications); } } } } } return targetConcept; } /** * A disambigution concept is a taxon concept with very low priority (so it is never * shown in search and browse operations) which is created to attach information to * the nub when the classification for the information is ambiguous between one or * more real taxon concepts. The disambiguation concept will include isAmbiguousSynonym * relationships with all the potentially matching concepts. * * This method checks for the any existing disambiguation concept for the * name in the given resource and otherwise creates a new one. It them makes sure that * this concept has appropriate relationships with all the potentially matching concepts. * * @param dataProviderId id of data provider for which disambiguation concept is required * @param dataResourceId id of data resource for which disambiguation concept is required * @param classifications classifications which need to be disambiguated * @return disambiguation concept */ private TaxonConceptLite createDisambiguationConcept(RelationshipAssertionDAO relationshipAssertionDAO, TaxonConceptDAO taxonConceptDAO, Long dataProviderId, Long dataResourceId, List<List<TaxonConceptLite>> classifications) { TaxonConcept disambiguationConcept = null; TaxonName name = classifications.get(0).get(classifications.get(0).size() - 1).getTaxonName(); disambiguationConcept = taxonConceptDAO.getDisambiguationConcept(name.getCanonical(), dataResourceId); List<RelationshipAssertion> existingAssertions = null; if (disambiguationConcept == null) { disambiguationConcept = new TaxonConcept(); disambiguationConcept.setDataProviderId(dataProviderId); disambiguationConcept.setDataResourceId(dataResourceId); disambiguationConcept.setTaxonName(name); 
disambiguationConcept.setPriority(TaxonConceptDAO.DISAMBIGUATION_PRIORITY); disambiguationConcept.setAccepted(false); disambiguationConcept.setNubConcept(dataResourceId == 1); disambiguationConcept.setRank(name.getRank()); taxonConceptDAO.create(disambiguationConcept); existingAssertions = new LinkedList<RelationshipAssertion>(); } else { existingAssertions = relationshipAssertionDAO .getRelationshipAssertionsForFromConcept(disambiguationConcept.getId()); } for (List<TaxonConceptLite> classification : classifications) { long toConceptId = classification.get(classification.size() - 1).getId(); boolean exists = false; for (RelationshipAssertion assertion : existingAssertions) { if (assertion.getRelationshipType() == 1 && assertion.getToConceptId() == toConceptId) { exists = true; } } if (!exists) { relationshipAssertionDAO.create(disambiguationConcept.getId(), toConceptId, 1); } } return disambiguationConcept; } /** * Checks that the root of the sourceList classification is in the target classification, provided that the * root concept is greater than or equal to the lowestRankToCheck. Should the root in the sourceList be a lower taxa * than that provided, the check is ignored. Thus this can be used to "Check if the kingdoms are the same, if there is * one in the sourceList". * @param sourceList That is being checked * @param targetList That the source is to be compared to (E.g. 
A Nub classification) * @param lowestRankToCheck The lowest rank of taxa that should be checked in the source classification * @param rankMustExist Set to true if there must be a match made in the root node * @return true if there is a clash, false if there is no clash */ public boolean rootSourceConceptClashesWithTarget(List<TaxonName> sourceList, List<TaxonConceptLite> targetList, int lowestRankToCheck, boolean rankMustExist) { TaxonName root = sourceList.get(0); boolean ranksCompared = false; // indicator to see if there has been a match made - they may not clash but no comparisons are made... for (TaxonConceptLite targetConcept : targetList) { if (targetConcept.getRank() > lowestRankToCheck) { // only check down to this rank break; } else if (targetConcept.getTaxonName().getRank() == root.getRank()) { ranksCompared = true; if (!StringUtils.equals(targetConcept.getTaxonName().getCanonical(), root.getCanonical())) { logger.debug("The root names do do match, so the classifications clash. Rank[" + root.getRank() + "], Source[" + root.getCanonical() + "], Target[" + targetConcept.getTaxonName().getCanonical() + "]"); return true; } } } // if they were not compared but should have been if (rankMustExist && !ranksCompared) { return true; } return false; } /** * A utility to get a confidence rating of how equal 2 classifications appear. 
* The procedure is as follows: * * - Find the major taxa in each classification (those with ranks that are exact multiples of 1000) * - Align the taxa in case of the same name being assigned to different ranks * - Assign top ratings (up to 100) for fully matching classifications or classifications which * differ only because of a null in one or the other classification (-1 per null) * - Assign high ratings (up to 95) if the genus and family match and neither is null (on the grounds * that homonymous genera in homonymous families seem +/- impossibly unlikely) - give extra marks * if the root of the family name differs from the root of the genus name * - Otherwise assign a rating based on the individual rank matches, weighted to give extra for * matching kingdom and family and subtracting small amounts for null elements. * * @param sourceList The source classification to compare to the target * @param targetList The target classification would typically be the backbone/nub classification that you are matching against * It is normal that this would be from the most complete taxonomy - e.g. Nub * @param lowestRankInclusive The lowest rank (inclusive) to compare down to within the source classification. * It should be noted that if you specify 7000 (Species) but only supply a source classification with a Kingdom and Phylum, * then only 2 ranks will be used in the percentage calculation. 
* @param requireFullCompatibility if true, reject any classification with any incompatible names * @return A percentage that represents the confidence of match of the classifications */ public int classificationsComparator(List<TaxonName> sourceList, List<TaxonName> targetList, int lowestRankInclusive, boolean requireFullCompatibility) { // Keep rank in range if (lowestRankInclusive < 1000 || lowestRankInclusive > 8000) { lowestRankInclusive = 8000; } // Get the names tidily into arrays to simplify later steps - ignore ignorable names String source[] = new String[lowestRankInclusive / 1000]; String target[] = new String[lowestRankInclusive / 1000]; for (TaxonName name : sourceList) { if (name.getRank() % 1000 == 0) { int index = (name.getRank() / 1000) - 1; if (index >= 0 && index < source.length && !namesToIgnoreUppercase.contains(name.getCanonical().toUpperCase())) { source[index] = name.getCanonical(); } } } for (TaxonName name : targetList) { if (name.getRank() % 1000 == 0) { int index = (name.getRank() / 1000) - 1; if (index >= 0 && index < target.length && !namesToIgnoreUppercase.contains(name.getCanonical().toUpperCase())) { target[index] = name.getCanonical(); } } } // Align intermediate taxa (in case of taxa shifting rank in the Kingdom to Family range, but not handling suffix changes) - we only compare down to family - even this may be overkill // This just runs through both taxonomies in turn moving any names found lower in the second taxonomy to the level from the first taxonomy // This just checks for ranks higher than genus int lastToCompare = (source.length < 5) ? 
source.length - 1 : 4; for (int s = 0; s < lastToCompare; s++) { if (source[s] != null) { if (target[s] == null || !source[s].equals(target[s])) { for (int i = s + 1; i <= lastToCompare; i++) { if (target[i] != null && source[s].equals(target[i])) { // Move array elements target[s] = target[i]; while (i > s) { target[i--] = null; } break; } } } } } for (int s = 0; s < lastToCompare; s++) { if (target[s] != null) { if (source[s] == null || !target[s].equals(source[s])) { for (int i = s + 1; i <= lastToCompare; i++) { if (source[i] != null && target[s].equals(source[i])) { // Move array elements source[s] = source[i]; while (i > s) { source[i--] = null; } break; } } } } } // Now assign a value in the range 0 (no match) to 100 (total match) for the relationship between the classifications int measure = 0; if (compareRange(source, target, 0, source.length - 1, true)) { // Full match allowing for nulls measure = 100; // Deduct a little for each null for (int i = 0; i < source.length; i++) { if (source[i] == null || target[i] == null) { measure--; } } } else if (requireFullCompatibility) { measure = 0; } else if (compareRange(source, target, 4, 5, false)) { // genus and family match - very good sign // Get start of genus int genusLength = source[5].length(); String genusRoot = source[5].substring(0, genusLength > 3 ? source[5].length() - 2 : 1); if (!source[4].startsWith(genusRoot)) { // Genus and family have different roots - even more significant measure = 95; } else { measure = 90; } } else { // Track null matches separately because they count for nothing if there is no other match // Note that totally null matches will already have been handled. 
int nullMeasure = 0; int maxPossible = 30; // See if kingdoms match boolean nullKingdom = source[0] == null || target[0] == null; boolean kingdomMatch = nullKingdom || StringUtils.equals(source[0], target[0]); if (kingdomMatch) { if (nullKingdom) { nullMeasure += 15; } else { measure += 30; } } for (int i = 1; i < source.length; i++) { int value = (i == 4) ? 30 : 10; if (source[i] == null || target[i] == null) { nullMeasure += value - 1; } else if (source[i].equals(target[i])) { measure += value; } maxPossible += value; } if (measure > 0) { // We have some reason for thinking a match may exist - add in the null matches measure += nullMeasure; } measure = (measure * 100) / maxPossible; } /* StringBuffer sb = new StringBuffer(); for (TaxonName s : sourceList) { sb.append(s.getCanonical()); sb.append(" "); } sb.append(" <--> "); for (TaxonName s : targetList) { sb.append(s.getCanonical()); sb.append(" "); } sb.append("MEASURE: "); sb.append(measure); logger.debug(sb.toString()); */ return measure; } /** * Compare a subrange of two arrays of strings * @param source Array * @param target Array * @param startIndex (inclusive) * @param endIndex (inclusive) * @param treatNullsAsMatch * @return true if all strings match */ private boolean compareRange(String[] source, String[] target, int startIndex, int endIndex, boolean treatNullsAsMatch) { boolean match = true; if (startIndex >= 0 && endIndex < source.length) { for (int i = startIndex; i <= endIndex; i++) { if (source[i] == null || target[i] == null) { if (!treatNullsAsMatch) { match = false; break; } } else { if (!source[i].equals(target[i])) { match = false; break; } } } } else { match = false; } return match; } /** * Utility to convert a list of concepts to a list of the contained names * This is particularly useful as a name list may be logged directly * @param conceptList To convert * @return The List of TaxonName that the concept list represents */ public List<TaxonName> toListOfTaxonName(List<TaxonConceptLite> 
conceptList) { List<TaxonName> nameList = new LinkedList<TaxonName>(); for (TaxonConceptLite concept : conceptList) { nameList.add(concept.getTaxonName()); } return nameList; } /** * @return Returns the taxonConceptDAO. */ public TaxonConceptDAO getTaxonConceptDAO() { return taxonConceptDAO; } /** * @param taxonConceptDAO The taxonConceptDAO to set. */ public void setTaxonConceptDAO(TaxonConceptDAO taxonConceptDAO) { this.taxonConceptDAO = taxonConceptDAO; } /** * @return Returns the taxonomyDenormaliserDAO. */ public TaxonomyDenormaliserDAO getTaxonomyDenormaliserDAO() { return taxonomyDenormaliserDAO; } /** * @param taxonomyDenormaliserDAO The taxonomyDenormaliserDAO to set. */ public void setTaxonomyDenormaliserDAO(TaxonomyDenormaliserDAO taxonomyDenormaliserDAO) { this.taxonomyDenormaliserDAO = taxonomyDenormaliserDAO; } /** * @return Returns the namesToIgnoreUppercase. */ public Set<String> getNamesToIgnoreUppercase() { return namesToIgnoreUppercase; } /** * @param namesToIgnoreUppercase The namesToIgnoreUppercase to set. */ public void setNamesToIgnoreUppercase(Set<String> namesToIgnoreUppercase) { this.namesToIgnoreUppercase = namesToIgnoreUppercase; } /** * @return Returns the confidenceThreshold. */ public int getConfidenceThreshold() { return confidenceThreshold; } /** * @param confidenceThreshold The confidenceThreshold to set. */ public void setConfidenceThreshold(int confidenceThreshold) { this.confidenceThreshold = confidenceThreshold; } /** * @return Returns the nameOfUnknownKingdom. */ public String getNameOfUnknownKingdom() { return nameOfUnknownKingdom; } /** * @param nameOfUnknownKingdom The nameOfUnknownKingdom to set. 
*/ public void setNameOfUnknownKingdom(String nameOfUnknownKingdom) { this.nameOfUnknownKingdom = nameOfUnknownKingdom; } /** * @param remoteId * @param dataResourceId * @return */ public List<RemoteConcept> findRemoteConceptsByRemoteIdAndIdTypeAndDataResourceId(String remoteId, long idType, long dataResourceId) { return remoteConceptDAO.findByRemoteIdAndIdTypeAndDataResourceId(remoteId, idType, dataResourceId); } /** * This method deals only with remote concepts of type 1 ("local ids") * @param tc * @param remoteConceptId */ public Long synchroniseRemoteConcepts(TaxonConceptLite tc, String remoteConceptId) { Long id = null; if (remoteConceptId != null) { List<RemoteConcept> conceptsForTaxon = remoteConceptDAO.findByTaxonConceptId(tc.getId()); RemoteConcept remoteConcept = null; if (conceptsForTaxon != null) { for (RemoteConcept concept : conceptsForTaxon) { if (concept.getIdType() == 1) { remoteConcept = concept; id = remoteConcept.getId(); remoteConcept.setRemoteId(remoteConceptId); remoteConcept.setModified(new Timestamp(System.currentTimeMillis())); remoteConceptDAO.updateOrCreate(remoteConcept); } } } if (remoteConcept == null) { remoteConcept = new RemoteConcept(tc.getId(), 1, remoteConceptId); id = remoteConceptDAO.create(remoteConcept); } } return id; } /** * Link taxon concepts (identified via the ids for associated remote concept records) * with a parent taxon concept (identified via the remoteId string for the parent) * * @param parentConceptId * @param childConceptIds */ public void linkTaxonConceptsToParent(Long parentConceptId, List<Long> childConceptIds) { taxonConceptDAO.linkTaxonConceptsToParent(parentConceptId, childConceptIds); } /** * @return the remoteConceptDAO */ public RemoteConceptDAO getRemoteConceptDAO() { return remoteConceptDAO; } /** * @param remoteConceptDAO the remoteConceptDAO to set */ public void setRemoteConceptDAO(RemoteConceptDAO remoteConceptDAO) { this.remoteConceptDAO = remoteConceptDAO; } /** * Clean up out-of-date remote 
concepts * @param dataResourceId * @param timer */ public void deleteRemoteConceptsOlderThan(Long dataResourceId, Long timer) { remoteConceptDAO.deleteRemoteConceptsOlderThan(dataResourceId, timer); } /** * If taxon_name and taxon_concept records have rank 0 but their parents have * assigned ranks, set the rank to a value lower than the parent rank. * * This method needs to loop to ensure that the ranks are cleared even if they * are grandchildren or even more remote from the nearest ranked ancestor * * @param dataResourceId */ public void updateUnknownRanks(Long dataResourceId) { // Don't continue forever... for (int i = 0; i < 10; i++) { if (!taxonConceptDAO.updateUnknownRanks(dataResourceId)) { // No ranks to update... break; } } } /** * @param taxonConceptId * @return */ public List<TaxonConceptLite> getClassificationConcepts(TaxonConceptDAO taxonConceptDAO, long taxonConceptId) { return taxonConceptDAO.getClassificationConcepts(taxonConceptId); } /** * @param taxonConceptId * @return */ public List<TaxonConceptLite> getClassificationConcepts(long taxonConceptId) { return taxonConceptDAO.getClassificationConcepts(taxonConceptId); } /** * @param parentId * @return */ public TaxonConceptLite getTaxonConceptLite(Long id) { return taxonConceptDAO.getTaxonConceptLite(id); } /** * @param taxonConceptId * @param rank * @return */ public void updateRank(TaxonConceptLite concept, Integer rank) { concept.setRank(rank); taxonConceptDAO.updateRank(concept.getId(), rank); } /** * @return Returns the pageSize. */ public int getPageSize() { return pageSize; } /** * @param pageSize The pageSize to set. 
*/
    public void setPageSize(int pageSize) {
        this.pageSize = pageSize;
    }

    /**
     * @return the relationshipAssertionDAO
     */
    public RelationshipAssertionDAO getRelationshipAssertionDAO() {
        return this.relationshipAssertionDAO;
    }

    /**
     * @param relationshipAssertionDAO the relationshipAssertionDAO to set
     */
    public void setRelationshipAssertionDAO(RelationshipAssertionDAO relationshipAssertionDAO) {
        this.relationshipAssertionDAO = relationshipAssertionDAO;
    }

    /**
     * @return Returns the infraspecifiMarkerMappingsUppercase (the map itself, not a copy).
     */
    public Map<String, Integer> getInfraspecifiMarkerMappingsUppercase() {
        return this.infraspecifiMarkerMappingsUppercase;
    }

    /**
     * @param infraspecifiMarkerMappingsUppercase The infraspecifiMarkerMappingsUppercase to set.
     */
    public void setInfraspecifiMarkerMappingsUppercase(Map<String, Integer> infraspecifiMarkerMappingsUppercase) {
        this.infraspecifiMarkerMappingsUppercase = infraspecifiMarkerMappingsUppercase;
    }
}