ubic.gemma.core.loader.expression.geo.DatasetCombiner.java Source code

Java tutorial


Here is the source code for ubic.gemma.core.loader.expression.geo.DatasetCombiner.java


/*
 * The Gemma project
 *
 * Copyright (c) 2006 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ubic.gemma.core.loader.expression.geo;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tools.ant.filters.StringInputStream;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import ubic.basecode.math.StringDistance;
import ubic.basecode.util.StringUtil;
import ubic.gemma.core.loader.entrez.EutilFetch;
import ubic.gemma.core.loader.expression.geo.model.*;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.*;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Class to handle cases where there are multiple GEO datasets for a single actual experiment. This can occur in at
 * least two ways:
 * <ol>
 * <li>There is a single GSE (e.g., GSE674) but two datasets (GDS472, GDS473). This can happen when there are two
 * different microarrays used such as the "A" and "B" HG-U133 Affymetrix arrays. (Each GDS can only refer to a single
 * platform)</li>
 * <li>Rarely, there can be two series, as well as two data sets, for the situation described above. These are
 * 'pathological' (due to incorrect data entry by a user, back in the day) and GEO folks should be removing them
 * eventually.</li>
 * </ol>
 * <p>
 * One major problem is figuring out which samples (GSMs) correspond across the datasets. In the example of GSE674,
 * there are samples like C6-U133A (in GDS472) and C6-133B (in GDS473), which apparently, but not "officially"
 * correspond to the same biological RNA. The difficulty is that there is no fail-proof way to determine which samples
 * match up. We do the best we can by using the edit distance between the sample names. Ties can be a problem but for
 * now the samples are sorted and the first best match is the one kept, on the assumption that corresponding samples
 * will have lower numbers. (that is, sample 12929 will match with 12945, not 12955, if the edit distance among the
 * choices is the same).
 * </p>
 * Another problem is that there is no way to go from GDS--&gt;GSE--&gt;other GDS without scraping the GEO web site.
 *
 * @author pavlidis
 */
@SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use
public class DatasetCombiner {

    static final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    private static final String PUNCTUATION_REGEXP = "[()\\s-._]";
     * Careful, GEO changes this sometimes.
    private static final String GSE_RECORD_REGEXP = "(GSE\\d+)";
    private static final String ENTREZ_GEO_QUERY_URL_SUFFIX = "[Accession]&cmd=search";
    private static final String ENTREZ_GEO_QUERY_URL_BASE = "https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gds&term=";
    private static final Log log = LogFactory.getLog(DatasetCombiner.class.getName());
     * Threshold normalized similarity between two strings before we bother to make a match. The normalized similarity
     * is the ratio between the unnormalized edit distance and the length of the longer of the two strings. This is used
     * as a maximum distance (the pair of descriptors must be at least this close).
     * Setting this correctly is important if there are to be singletons (samples that don't match to others)
    private static final double SIMILARITY_THRESHOLD = 0.5;
     * Used to help ignore identifiers of microarrays in sample titles.
    private static final Map<String, Collection<String>> microarrayNameStrings = new HashMap<>();

    // Registers the microarray-name keys and, per key, the variant strings that earn a matching bonus.
    // Only "chip" has variants; the other keys are registered with empty sets.
    static {
        // note : all lower case!
        DatasetCombiner.microarrayNameStrings.put("u133", new HashSet<String>());
        DatasetCombiner.microarrayNameStrings.put("u95", new HashSet<String>());
        DatasetCombiner.microarrayNameStrings.put("u74", new HashSet<String>());
        DatasetCombiner.microarrayNameStrings.put("v2", new HashSet<String>());
        DatasetCombiner.microarrayNameStrings.put("chip", new HashSet<String>());
        DatasetCombiner.microarrayNameStrings.get("chip").add("chip a");
        DatasetCombiner.microarrayNameStrings.get("chip").add("chip b");
        DatasetCombiner.microarrayNameStrings.get("chip").add("chip c");
    }

    // Maps of sample accessions to other useful bits.
    // NOTE(review): nothing in the visible code writes to accToPlatform - confirm whether it is populated elsewhere.
    private final LinkedHashMap<String, String> accToPlatform = new LinkedHashMap<>();
    // sample accession -> sample title (only recorded when the title is not blank)
    private final LinkedHashMap<String, String> accToTitle = new LinkedHashMap<>();
    // sample accession -> accession of the owning GeoData (a dataset, or a platform when no datasets are used)
    private final LinkedHashMap<String, String> accToDataset = new LinkedHashMap<>();
    // sample accession -> first organism of the sample's first platform (only recorded when not blank)
    private final LinkedHashMap<String, String> accToOrganism = new LinkedHashMap<>();
    // sample accession -> title of the sample within its dataset; values may be null
    private final LinkedHashMap<String, String> accToSecondaryTitle = new LinkedHashMap<>();
    // when false, no matching is attempted and every sample gets its own correspondence entry
    private boolean doSampleMatching;

    /**
     * Creates a combiner that attempts to match samples across datasets/platforms.
     */
    public DatasetCombiner() {
        this.doSampleMatching = true;
    }

    /**
     * @param doSampleMatching if false, no matching is attempted and each sample is kept on its own
     */
    public DatasetCombiner(boolean doSampleMatching) {
        this.doSampleMatching = doSampleMatching;
    }

    /**
     * Given GEO series ids, find all associated data sets.
     *
     * @param seriesAccessions accessions
     * @return a collection of associated GDS accessions. If no GDS is found, the collection will be empty.
     */
    public static Collection<String> findGDSforGSE(Collection<String> seriesAccessions) {
         * go from GSE to GDS, using screen scraping.
        // http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gds&term=GSE674[Accession]&cmd=search
        // grep on "GDS[[digits]] record"
        Collection<String> associatedDatasetAccessions = new HashSet<>();

        for (String seriesAccession : seriesAccessions) {
        return associatedDatasetAccessions;


    /**
     * @param seriesAccession series accession
     * @return GDSs that correspond to the given series. It will be empty if there is no GDS matching.
     */
    public static Collection<String> findGDSforGSE(String seriesAccession) {

        Collection<String> associatedDatasetAccessions = new HashSet<>();
        try {
            String details = EutilFetch.fetch("gds", seriesAccession, 100);
            if (details.equalsIgnoreCase("no results")) {
                return associatedDatasetAccessions;
            XPathFactory xf = XPathFactory.newInstance();
            XPath xpath = xf.newXPath();

             * Get all Items of type GDS that are from a DocSum with an Item entryType of GDS.
            XPathExpression xgds = xpath.compile(
                    "/eSummaryResult/DocSum[Item/@Name=\"entryType\" and (Item=\"GDS\")]/Item[@Name=\"GDS\"][1]/text()");

            DocumentBuilder builder = DatasetCombiner.factory.newDocumentBuilder();

             * Bug 2690. There must be a better way.
            details = details.replaceAll("encoding=\"UTF-8\"", "");
            try (StringInputStream sis = new StringInputStream(StringUtils.trim(details))) {

                Document document = builder.parse(sis);

                NodeList result = (NodeList) xgds.evaluate(document, XPathConstants.NODESET);
                for (int i = 0; i < result.getLength(); i++) {
                    String nodeValue = result.item(i).getNodeValue();
                    // if ( nodeValue.contains( ";" ) ) continue; //
                    associatedDatasetAccessions.add("GDS" + nodeValue);

                return associatedDatasetAccessions;


        } catch (IOException e) {
            throw new RuntimeException("Could not parse XML data from remote server", e);
        } catch (ParserConfigurationException | SAXException | XPathExpressionException e) {
            throw new RuntimeException("XML parsing error of remote data", e);

    /**
     * Given a GDS, find the corresponding GSEs (there can be more than one in rare cases).
     *
     * @param datasetAccession dataset accession
     * @return Collection of series this data set is derived from (this is almost always just a single item).
     */
    public static Collection<String> findGSEforGDS(String datasetAccession) {
         * go from GDS to GSE, using screen scraping.
        // http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gds&term=GSE674[Accession]&cmd=search
        // grep on "GDS[[digits]] record"
        URL url = null;

        Pattern pat = Pattern.compile(DatasetCombiner.GSE_RECORD_REGEXP);

        Collection<String> associatedSeriesAccession = new HashSet<>();

        try {
            url = new URL(DatasetCombiner.ENTREZ_GEO_QUERY_URL_BASE + datasetAccession
                    + DatasetCombiner.ENTREZ_GEO_QUERY_URL_SUFFIX);
            URLConnection conn = url.openConnection();

            try (InputStream is = conn.getInputStream();
                    BufferedReader br = new BufferedReader(new InputStreamReader(is))) {

                String line;
                while ((line = br.readLine()) != null) {
                    Matcher mat = pat.matcher(line);
                    if (mat.find()) {
                        String capturedAccession = mat.group(1);
        } catch (MalformedURLException e) {
            DatasetCombiner.log.error(e, e);
            throw new RuntimeException("Invalid URL " + url, e);
        } catch (IOException e) {
            DatasetCombiner.log.error(e, e);
            throw new RuntimeException("Could not get data from remote server", e);

        if (associatedSeriesAccession.size() == 0) {
            throw new IllegalStateException("No GSE found for " + datasetAccession);

        return associatedSeriesAccession;


    public static Map<GeoPlatform, List<GeoSample>> getPlatformSampleMap(GeoSeries geoSeries) {
        Map<GeoPlatform, List<GeoSample>> platformSamples = new HashMap<>();

        for (GeoSample sample : geoSeries.getSamples()) {

            for (GeoPlatform platform : sample.getPlatforms()) {
                if (!platformSamples.containsKey(platform)) {
                    platformSamples.put(platform, new ArrayList<GeoSample>());
        return platformSamples;

    /**
     * Given a GEO dataset id, find all GDS ids that are associated with it.
     *
     * @param datasetAccession the geo accession
     * @return all GDS associated with the given accession
     */
    public Collection<String> findGDSforGDS(String datasetAccession) {
        return DatasetCombiner.findGDSforGSE(DatasetCombiner.findGSEforGDS(datasetAccession));

    /**
     * Try to line up samples across datasets.
     *
     * @param dataSets datasets
     * @return sample correspondence
     */
    public GeoSampleCorrespondence findGSECorrespondence(Collection<GeoDataset> dataSets) {

        if (dataSets == null)
            return null;
        if (dataSets.size() == 0) {
            throw new IllegalArgumentException("No datasets!");


        int numDatasets = dataSets.size();
        return this.findCorrespondence(numDatasets);

    /**
     * Try to line up samples across datasets contained in a series.
     *
     * @param series geo series
     * @return geo sample correspondence
     */
    public GeoSampleCorrespondence findGSECorrespondence(GeoSeries series) {
        Collection<GeoDataset> datasets = series.getDatasets();
        if (datasets != null && datasets.size() > 0) {

            // make sure all samples are accounted for - just informative
            Collection<GeoSample> missed = new HashSet<>();
            for (GeoSample sample : series.getSamples()) {
                if (!this.accToDataset.containsKey(sample.getGeoAccession())) {
            if (!missed.isEmpty()) {
                DatasetCombiner.log.warn("There were one or more samples missing from the datasets: "
                        + StringUtils.join(missed, " | "));
            return this.findGSECorrespondence(datasets);

        int numPlatforms = this.fillAccessionMaps(series);
        return this.findCorrespondence(numPlatforms);

    /**
     * See bug 1672 for why this is needed.
     *
     * @param dataSets datasets
     */
    private void checkPlatformsMatchSeries(Collection<GeoDataset> dataSets) {
        for (GeoDataset dataset : dataSets) {
            boolean found = false;
            GeoPlatform platform = dataset.getPlatform();

            if (dataset.getSeries().size() == 0)

            Collection<GeoPlatform> seenPlatforms = new HashSet<>();

            for (GeoSeries series : dataset.getSeries()) {
                for (GeoSample sample : series.getSamples()) {
                    if (sample.getPlatforms().contains(platform)) {
                        found = true;

            if (!found) {

                if (seenPlatforms.size() == 1) {
                    DatasetCombiner.log.warn(dataset + " is associated with wrong platform? " + platform
                            + ", switching it to use series platform " + seenPlatforms.iterator().next());
                } else {
                     * Maybe there is a way to handle this, but not worth it. Dataset uses the wrong platform.
                    throw new IllegalStateException(platform + " on dataset " + dataset
                            + " is not used at all by associated series, can't determine correct platform as series uses more than one.");


    private int computeDistance(String trimmedTest, String trimmedTarget) {

        return StringDistance.editDistance(trimmedTarget, trimmedTest);


    private void fillAccessionMap(GeoSample sample, GeoData owner) {
        String title = sample.getTitle();
        if (StringUtils.isNotBlank(title)) {
            accToTitle.put(sample.getGeoAccession(), title);
        accToDataset.put(sample.getGeoAccession(), owner.getGeoAccession());
        accToSecondaryTitle.put(sample.getGeoAccession(), sample.getTitleInDataset()); // could be null.
        String organism = this.getSampleOrganism(sample);
        if (StringUtils.isNotBlank(organism)) {
            accToOrganism.put(sample.getGeoAccession(), organism);

    private void fillAccessionMaps(Collection<GeoDataset> dataSets) {

        for (GeoDataset dataset : dataSets) {
            GeoPlatform platform = dataset.getPlatform();
            assert platform != null;
            if (dataset.getSubsets().size() == 0) {
                assert dataset.getSeries().size() > 0;
                for (GeoSeries series : dataset.getSeries()) {
                    for (GeoSample sample : series.getSamples()) {

                        if (sample.getPlatforms().size() == 0)
                        this.fillAccessionMap(sample, dataset);
            } else {
                for (GeoSubset subset : dataset.getSubsets()) {
                    for (GeoSample sample : subset.getSamples()) {

                        if (sample.getPlatforms().size() == 0)

                        this.fillAccessionMap(sample, dataset);

    /**
     * This is used if there are no 'datasets' (GDS) to work with; we just use platforms.
     *
     * @param series geo series
     * @return platform sample size
     */
    private int fillAccessionMaps(GeoSeries series) {

        Map<GeoPlatform, List<GeoSample>> platformSamples = DatasetCombiner.getPlatformSampleMap(series);

        for (GeoPlatform platform : platformSamples.keySet()) {
            for (GeoSample sample : platformSamples.get(platform)) {
                assert sample != null : "Null sample for platform " + platform.getDescription();
                this.fillAccessionMap(sample, platform);

        return platformSamples.keySet().size();

    /**
     * This is the main point where comparisons are made.
     *
     * @param numDatasetsOrPlatforms number of datasets or platforms
     * @return geo sample correspondence
     */
    private GeoSampleCorrespondence findCorrespondence(int numDatasetsOrPlatforms) {
        GeoSampleCorrespondence result = new GeoSampleCorrespondence();


        final List<String> sampleAccs = new ArrayList<>(accToDataset.keySet());
        assert sampleAccs.size() > 0;

        if (numDatasetsOrPlatforms <= 1 || !this.doSampleMatching) {
            DatasetCombiner.log.debug("Each bioassay will get a distinct biomaterial");
            for (String sample : sampleAccs) {
                result.addCorrespondence(sample, null);
            return result;

        String commonPrefix = StringUtil.commonPrefix(accToTitle.values());
        if (commonPrefix != null) {
            DatasetCombiner.log.debug("Common prefix = " + commonPrefix);
            commonPrefix = commonPrefix.toLowerCase();
        String commonSuffix = StringUtil.commonSuffix(accToTitle.values());
        if (commonSuffix != null) {
            DatasetCombiner.log.debug("Common suffix = " + commonSuffix);
            commonSuffix = commonSuffix.toLowerCase();

        // using the sorted order helps find the right matches.

        Map<String, Collection<String>> alreadyMatched = new HashMap<>();

        // do it by data set, so we constrain comparing items in _this_ data set to ones in _other_ data sets (or other
        // platforms)
        // The inner loops are just to get the samples in the data set (platform) being considered.
        //noinspection MismatchedQueryAndUpdateOfCollection // I do not dare to touch this method without proper refactoring
        Collection<String> alreadyTestedDatasetsOrPlatforms = new HashSet<>();

        List<String> dataSets = new ArrayList<>(new HashSet<>(accToDataset.values()));

        List<String> platforms = new ArrayList<>(new HashSet<>(accToPlatform.values()));

        List<String> valuesToUse;
        LinkedHashMap<String, String> accToDatasetOrPlatform;

        if (dataSets.size() > 0) {
            valuesToUse = dataSets;
            this.sortDataSets(sampleAccs, valuesToUse);
            accToDatasetOrPlatform = accToDataset;
            DatasetCombiner.log.debug(dataSets.size() + " datasets");
        } else {
            valuesToUse = platforms;
            this.sortPlatforms(sampleAccs, valuesToUse);
            accToDatasetOrPlatform = accToPlatform;
            DatasetCombiner.log.debug(platforms.size() + " platforms");

        this.processDatasets(result, sampleAccs, commonPrefix, commonSuffix, alreadyMatched,
                alreadyTestedDatasetsOrPlatforms, valuesToUse, accToDatasetOrPlatform);

        return result;

    private void processDatasets(GeoSampleCorrespondence result, List<String> sampleAccs, String commonPrefix,
            String commonSuffix, Map<String, Collection<String>> alreadyMatched,
            Collection<String> alreadyTestedDatasetsOrPlatforms, List<String> valuesToUse,
            LinkedHashMap<String, String> accToDatasetOrPlatform) {

        Collection<String> allMatched = new HashSet<>();

        // we start with the smallest dataset/platform.
        for (String datasetOrPlatformA : valuesToUse) {
            DatasetCombiner.log.debug("Finding matches for samples in " + datasetOrPlatformA);

            // for each sample in this data set...
            for (int j = 0; j < sampleAccs.size(); j++) {
                this.processSample(result, sampleAccs, commonPrefix, commonSuffix, alreadyMatched, valuesToUse,
                        accToDatasetOrPlatform, allMatched, datasetOrPlatformA, j);

    private void processSample(GeoSampleCorrespondence result, List<String> sampleAccs, String commonPrefix,
            String commonSuffix, Map<String, Collection<String>> alreadyMatched, List<String> valuesToUse,
            LinkedHashMap<String, String> accToDatasetOrPlatform, Collection<String> allMatched,
            String datasetOrPlatformA, int j) {

        String targetAcc = sampleAccs.get(j);

        // skip samples that are not in this data set.
        if (!accToDataset.get(targetAcc).equals(datasetOrPlatformA)) {
        if (allMatched.contains(targetAcc))

        if (!accToTitle.containsKey(targetAcc)) {
        String targetTitle = accToTitle.get(targetAcc).toLowerCase();
        String targetSecondaryTitle = null;
        if (accToSecondaryTitle.get(targetAcc) != null) {
            targetSecondaryTitle = accToSecondaryTitle.get(targetAcc).toLowerCase();

        DatasetCombiner.log.debug("Target: " + targetAcc + " (" + datasetOrPlatformA + ") " + targetTitle
                + (targetSecondaryTitle == null ? "" : " a.k.a " + targetSecondaryTitle));
        if (StringUtils.isBlank(targetTitle))
            throw new IllegalArgumentException("Can't have blank titles for samples");

        Collection<String> bonusWords = this.getMicroarrayStringsToMatch(targetTitle);

         * For each of the other data sets
        boolean wasTied = false;
        for (String datasetOrPlatformB : valuesToUse) {
            wasTied = this.processSecondDataset(result, sampleAccs, commonPrefix, commonSuffix, alreadyMatched,
                    accToDatasetOrPlatform, allMatched, datasetOrPlatformA, wasTied, targetAcc, targetTitle,
                    targetSecondaryTitle, bonusWords, datasetOrPlatformB);

    private boolean processSecondDataset(GeoSampleCorrespondence result, List<String> sampleAccs,
            String commonPrefix, String commonSuffix, Map<String, Collection<String>> alreadyMatched,
            LinkedHashMap<String, String> accToDatasetOrPlatform, Collection<String> allMatched,
            String datasetOrPlatformA, boolean wasTied, String targetAcc, String targetTitle,
            String targetSecondaryTitle, Collection<String> bonusWords, String datasetOrPlatformB) {

        if (datasetOrPlatformB.equals(datasetOrPlatformA)) {
            return wasTied;

        // initialize data structure.
        if (alreadyMatched.get(targetAcc) == null) {
            alreadyMatched.put(targetAcc, new HashSet<String>());

         * Keep us from getting multiple matches.
        if (alreadyMatched.get(targetAcc).contains(datasetOrPlatformB)) {
            return wasTied;

        wasTied = this.findBestSampleHit(result, sampleAccs, commonPrefix, commonSuffix, alreadyMatched,
                accToDatasetOrPlatform, allMatched, datasetOrPlatformA, wasTied, targetAcc, targetTitle,
                targetSecondaryTitle, bonusWords, datasetOrPlatformB);

        return wasTied;

    private boolean findBestSampleHit(GeoSampleCorrespondence result, List<String> sampleAccs, String commonPrefix,
            String commonSuffix, Map<String, Collection<String>> alreadyMatched,
            LinkedHashMap<String, String> accToDatasetOrPlatform, Collection<String> allMatched,
            String datasetOrPlatformA, boolean wasTied, String targetAcc, String targetTitle,
            String targetSecondaryTitle, Collection<String> bonusWords, String datasetOrPlatformB) {
        // find the best match in this data set.
        double minDistance = Double.MAX_VALUE;
        String bestMatch = null;
        String bestMatchAcc = null;

        int numTested = 0;
        for (String testAcc : sampleAccs) {

            if (this.checkCanSkip(alreadyMatched, accToDatasetOrPlatform, allMatched, datasetOrPlatformA, targetAcc,
                    datasetOrPlatformB, testAcc))


            if (!accToTitle.containsKey(testAcc)) {

            String testTitle = accToTitle.get(testAcc).toLowerCase();
            String testSecondaryTitle = null;
            if (accToSecondaryTitle.get(testAcc) != null) {
                testSecondaryTitle = accToSecondaryTitle.get(testAcc).toLowerCase();

            if (StringUtils.isBlank(testTitle))
                throw new IllegalArgumentException("Can't have blank titles for samples");

            double bonus = this.calculateBonus(targetTitle, bonusWords, testTitle);

             * If one name is much longer than the other, presumably the author didn't use the same naming
             * scheme for all samples; we need to trim the longer one to match the shorter one; we use the
             * prefix.
            String trimmedTest = testTitle;
            String trimmedTarget = targetTitle;

            if (commonPrefix != null) {
                trimmedTest = trimmedTest.replaceFirst("^" + Pattern.quote(commonPrefix), "");
                trimmedTarget = trimmedTarget.replaceFirst("^" + Pattern.quote(commonPrefix), "");
            if (commonSuffix != null) {
                trimmedTest = trimmedTest.replaceFirst(Pattern.quote(commonSuffix) + "$", "");
                trimmedTarget = trimmedTarget.replaceFirst(Pattern.quote(commonSuffix) + "$", "");

            // remove some punctuation
            trimmedTest = trimmedTest.replaceAll(DatasetCombiner.PUNCTUATION_REGEXP, "");
            trimmedTarget = trimmedTarget.replaceAll(DatasetCombiner.PUNCTUATION_REGEXP, "");

            // Computing the distance
            double distance = this.computeDistance(trimmedTest, trimmedTarget);

            distance -= bonus;

            double normalizedDistance = distance / Math.max(trimmedTarget.length(), trimmedTest.length());

            double secondaryDistance;
            if (targetSecondaryTitle != null && testSecondaryTitle != null) {
                secondaryDistance = this.computeDistance(targetSecondaryTitle, testSecondaryTitle);

                if (secondaryDistance < distance) {
                    distance = secondaryDistance;
                    normalizedDistance = distance
                            / Math.max(targetSecondaryTitle.length(), testSecondaryTitle.length());

            if (!this.meetsMinimalThreshold(normalizedDistance)) {

            // better than last one?
            if (distance > minDistance) {

            // Try to resolve the tie. Messy, yes.
            if (distance == minDistance) {
                wasTied = true;
                double prefixWeightedDistanceA = StringDistance.prefixWeightedHammingDistance(targetAcc,
                        bestMatchAcc, 1.0);
                double prefixWeightedDistanceB = StringDistance.prefixWeightedHammingDistance(targetAcc, testAcc,
                if (prefixWeightedDistanceA == prefixWeightedDistanceB) {
                    double suffixWeightedDistanceA = StringDistance.suffixWeightedHammingDistance(targetAcc,
                            bestMatchAcc, 1.0);
                    double suffixWeightedDistanceB = StringDistance.suffixWeightedHammingDistance(targetAcc,
                            testAcc, 1.0);
                    if (prefixWeightedDistanceA == prefixWeightedDistanceB) {
                        continue; // still tied, keep old one
                    } else if (suffixWeightedDistanceA < suffixWeightedDistanceB) {
                        // new one is better.
                        minDistance = distance;
                        bestMatch = testTitle;
                        bestMatchAcc = testAcc;
                        DatasetCombiner.log.debug("Current best match (tie broken): " + testAcc + " ("
                                + datasetOrPlatformB + ") " + testTitle + (testSecondaryTitle == null ? ""
                                        : " a.k.a " + testSecondaryTitle + ", distance = " + distance));
                        wasTied = false;
                    if (suffixWeightedDistanceA > suffixWeightedDistanceB) {
                        // old one is still better.
                        wasTied = false;
                        //noinspection UnnecessaryContinue // better for readability
                } else if (prefixWeightedDistanceA > prefixWeightedDistanceB) {
                    // new one is better.
                    minDistance = distance;
                    bestMatch = testTitle;
                    bestMatchAcc = testAcc;
                    DatasetCombiner.log.debug("Current best match (tie broken): " + testAcc + " ("
                            + datasetOrPlatformB + ") " + testTitle + (testSecondaryTitle == null ? ""
                                    : " a.k.a " + testSecondaryTitle + ", distance = " + distance));
                    wasTied = false;
                } else if (prefixWeightedDistanceA < prefixWeightedDistanceB) {
                    wasTied = false;
                    //noinspection UnnecessaryContinue // better for readability
                    continue; // old best is still better.
            } else {
                // definite new winner no tie
                minDistance = distance;
                bestMatch = testTitle;
                bestMatchAcc = testAcc;
                DatasetCombiner.log.debug("Current best match: " + testAcc + " (" + datasetOrPlatformB + ") "
                        + testTitle + (testSecondaryTitle == null ? ""
                                : " a.k.a " + testSecondaryTitle + ", distance = " + distance));
                wasTied = false;

        } // end loop over samples in second data set.
        DatasetCombiner.log.debug("Tested " + numTested + " samples");

         * Now have the best hit for sample from the outer dataset, in the inner data set.
        if (bestMatchAcc == null || wasTied) {
            if (DatasetCombiner.log.isDebugEnabled())
                DatasetCombiner.log.debug("No match found in " + datasetOrPlatformB + " for " + targetAcc + "\t"
                        + targetTitle + " (" + datasetOrPlatformA
                        + ") (This can happen if sample was not run on all the platforms used; or if there were ties that could not be broken; or when we were unable to match)");
            result.addCorrespondence(targetAcc, null);
        } else {
            if (DatasetCombiner.log.isDebugEnabled())
                DatasetCombiner.log.debug("Match:\n" + targetAcc + "\t" + targetTitle + " ("
                        + accToDataset.get(targetAcc) + ")" + "\n" + bestMatchAcc + "\t" + bestMatch + " ("
                        + accToDataset.get(bestMatchAcc) + ")" + " (Distance: " + minDistance + ")");
            result.addCorrespondence(targetAcc, bestMatchAcc);
        return wasTied;

    private boolean checkCanSkip(Map<String, Collection<String>> alreadyMatched,
            LinkedHashMap<String, String> accToDatasetOrPlatform, Collection<String> allMatched,
            String datasetOrPlatformA, String targetAcc, String datasetOrPlatformB, String testAcc) {
        return allMatched.contains(testAcc) || !this.shouldTest(accToDatasetOrPlatform, alreadyMatched,
                datasetOrPlatformA, targetAcc, datasetOrPlatformB, testAcc);

    private double calculateBonus(String targetTitle, Collection<String> bonusWords, String testTitle) {
        double bonus = 0.0;
        for (String n : bonusWords) {
            if (testTitle.contains(n)) {
                DatasetCombiner.log.debug(testTitle + " gets a bonus in matching " + targetTitle);
                bonus = 1; // this basically means we discount that difference.
        return bonus;

    /**
     * @param title title
     * @return stop-strings relating to microarray names.
     */
    private Collection<String> getMicroarrayStringsToMatch(String title) {
        Collection<String> result = new HashSet<>();
        for (String key : DatasetCombiner.microarrayNameStrings.keySet()) {
            if (title.contains(key)) {
                for (String value : DatasetCombiner.microarrayNameStrings.get(key)) {
                    if (title.contains(value)) {
        return result;

    private String getSampleOrganism(GeoSample sample) {
        Collection<GeoPlatform> platforms = sample.getPlatforms();
        assert platforms.size() > 0 : sample + " had no platform assigned";
        GeoPlatform platform = platforms.iterator().next();
        Collection<String> organisms = platform.getOrganisms();
        assert organisms.size() > 0;
        return organisms.iterator().next();

    private boolean meetsMinimalThreshold(double normalizedDistance) {
        return !(normalizedDistance > DatasetCombiner.SIMILARITY_THRESHOLD);

    /**
     * Implements constraints on samples to test.
     *
     * @param accToDatasetOrPlatform (depending on which we are using, platforms or data sets)
     * @return should test
     */
    private boolean shouldTest(LinkedHashMap<String, String> accToDatasetOrPlatform,
            Map<String, Collection<String>> alreadyMatched, String datasetA, String targetAcc, String datasetB,
            String testAcc) {
        boolean shouldTest = true;

        // initialize data structure.
        if (alreadyMatched.get(testAcc) == null) {
            alreadyMatched.put(testAcc, new HashSet<String>());

        // only use samples from the current test dataset.
        if (!accToDatasetOrPlatform.get(testAcc).equals(datasetB)) {
            shouldTest = false;

        // disallow multiple matches.
        if (alreadyMatched.get(testAcc).contains(datasetA)) {
            // log.debug( testAcc + " already matched to a sample in " + datasetA + ", skipping" );
            shouldTest = false;

        if (!accToOrganism.get(targetAcc).equals(accToOrganism.get(testAcc))) {
            DatasetCombiner.log.debug(testAcc + " From wrong organism");
            shouldTest = false;
        return shouldTest;

    private void sortDataSets(final List<String> sampleAccs, List<String> dataSets) {
        this.sortMap(sampleAccs, dataSets, accToDataset);

    private void sortPlatforms(final List<String> sampleAccs, List<String> platforms) {
        this.sortMap(sampleAccs, platforms, accToPlatform);

    private void sortMap(final List<String> sampleAccs, final List<String> objects, final Map<String, String> map) {
        Collections.sort(objects, new Comparator<String>() {
            public int compare(String arg0, String arg1) {
                int numSamples0 = 0;
                int numSamples1 = 0;
                for (String targetAcc : sampleAccs) {
                    // skip samples that are not in this data set.
                    if (map.get(targetAcc).equals(arg0)) {
                    } else if (map.get(targetAcc).equals(arg1)) {

                return Integer.compare(numSamples0, numSamples1);
