// Java tutorial
/* * The Gemma project * * Copyright (c) 2006 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.loader.expression.geo; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.hsqldb.lib.StringInputStream; import org.w3c.dom.Document; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import ubic.basecode.math.StringDistance; import ubic.basecode.util.StringUtil; import ubic.gemma.loader.entrez.EutilFetch; import ubic.gemma.loader.expression.geo.model.GeoData; import 
ubic.gemma.loader.expression.geo.model.GeoDataset; import ubic.gemma.loader.expression.geo.model.GeoPlatform; import ubic.gemma.loader.expression.geo.model.GeoSample; import ubic.gemma.loader.expression.geo.model.GeoSeries; import ubic.gemma.loader.expression.geo.model.GeoSubset; /** * Class to handle cases where there are multiple GEO dataset for a single actual experiment. This can occur in at least * two ways: * <ol> * <li>There is a single GSE (e.g., GSE674) but two datasets (GDS472, GDS473). This can happen when there are two * different microarrays used such as the "A" and B" HG-U133 Affymetrix arrays. (Each GDS can only refer to a single * platform)</li> * <li>Rarely, there can be two series, as well as two data sets, for the situation described above. These are * 'pathological' (due to incorrect data entry by a user, back in the day) and GEO folks should be removing them * eventually.</li> * </ol> * <p> * One major problem is figuring out which samples (GSMs) correspond across the datasets. In the example of GSE674, * there are samples like C6-U133A (in GDS472) and C6-133B (in GDS473), which apparently, but not "officially" * correspond to the same biological RNA. The difficulty is that there is no fail-proof way to determine which samples * match up. We do the best we can by using the edit distance between the sample names. Ties can be a problem but for * now the samples are sorted and the first best match is the one kept, on the assumption that corresponding samples * will have lower numbers. (that is, sample 12929 will match with 12945, not 12955, if the edit distance among the * choices is the same). * </p> * <p> * Another problem is that there is no way to go from GDS-->GSE-->other GDS without scraping the GEO web site. 
* * @author pavlidis * @version $Id: DatasetCombiner.java,v 1.46 2012/11/02 22:10:56 paul Exp $ */ public class DatasetCombiner { private static final String PUNCTUATION_REGEXP = "[\\(\\)\\s-\\._]"; /** * Careful, GEO changes this sometimes. */ private static final String GSE_RECORD_REGEXP = "(GSE\\d+)"; /** * */ private static final String ENTREZ_GEO_QUERY_URL_SUFFIX = "[Accession]&cmd=search"; /** * */ private static final String ENTREZ_GEO_QUERY_URL_BASE = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gds&term="; private boolean doSampleMatching = true; /** * Threshold normalized similarity between two strings before we bother to make a match. The normalized similarity * is the ratio between the unnormalized edit distance and the length of the longer of the two strings. This is used * as a maximum distance (the pair of descriptors must be at least this close). * <p> * Setting this correctly is important if there are to be singletons (samples that don't match to others) */ private final double SIMILARITY_THRESHOLD = 0.5; public DatasetCombiner(boolean doSampleMatching) { this.doSampleMatching = doSampleMatching; } public DatasetCombiner() { this.doSampleMatching = true; } private static Log log = LogFactory.getLog(DatasetCombiner.class.getName()); // Maps of sample accessions to other useful bits. LinkedHashMap<String, String> accToPlatform = new LinkedHashMap<String, String>(); LinkedHashMap<String, String> accToTitle = new LinkedHashMap<String, String>(); LinkedHashMap<String, String> accToDataset = new LinkedHashMap<String, String>(); LinkedHashMap<String, String> accToOrganism = new LinkedHashMap<String, String>(); LinkedHashMap<String, String> accToSecondaryTitle = new LinkedHashMap<String, String>(); /** * Given a GDS, find the corresponding GSEs (there can be more than one in rare cases). * * @param datasetAccession * @return Collection of series this data set is derived from (this is almost always just a single item). 
*/ public static Collection<String> findGSEforGDS(String datasetAccession) { /* * go from GDS to GSE, using screen scraping. */ // http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gds&term=GSE674[Accession]&cmd=search // grep on "GDS[[digits]] record" URL url = null; Pattern pat = Pattern.compile(GSE_RECORD_REGEXP); Collection<String> associatedSeriesAccession = new HashSet<String>(); try { url = new URL(ENTREZ_GEO_QUERY_URL_BASE + datasetAccession + ENTREZ_GEO_QUERY_URL_SUFFIX); URLConnection conn = url.openConnection(); conn.connect(); InputStream is = conn.getInputStream(); BufferedReader br = new BufferedReader(new InputStreamReader(is)); String line = null; while ((line = br.readLine()) != null) { Matcher mat = pat.matcher(line); if (mat.find()) { String capturedAccession = mat.group(1); associatedSeriesAccession.add(capturedAccession); } } is.close(); } catch (MalformedURLException e) { log.error(e, e); throw new RuntimeException("Invalid URL " + url, e); } catch (IOException e) { log.error(e, e); throw new RuntimeException("Could not get data from remote server", e); } if (associatedSeriesAccession.size() == 0) { throw new IllegalStateException("No GSE found for " + datasetAccession); } return associatedSeriesAccession; } /** * Given a GEO dataset it, find all GDS ids that are associated with it. * * @param seriesAccession * @return */ public Collection<String> findGDSforGDS(String datasetAccession) { return findGDSforGSE(findGSEforGDS(datasetAccession)); } /** * Given GEO series ids, find all associated data sets. * * @param seriesAccession * @return a collection of associated GDS accessions. If no GDS is found, the collection will be empty. */ public static Collection<String> findGDSforGSE(Collection<String> seriesAccessions) { /* * go from GSE to GDS, using screen scraping. 
*/ // http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gds&term=GSE674[Accession]&cmd=search // grep on "GDS[[digits]] record" Collection<String> associatedDatasetAccessions = new HashSet<String>(); for (String seriesAccession : seriesAccessions) { associatedDatasetAccessions.addAll(findGDSforGSE(seriesAccession)); } return associatedDatasetAccessions; } static DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); /** * @param seriesAccession * @return GDSs that correspond to the given series. It will be empty if there is no GDS matching. */ public static Collection<String> findGDSforGSE(String seriesAccession) { Collection<String> associatedDatasetAccessions = new HashSet<String>(); try { String details = EutilFetch.fetch("gds", seriesAccession, 100); if (details.equalsIgnoreCase("no results")) { return associatedDatasetAccessions; } XPathFactory xf = XPathFactory.newInstance(); XPath xpath = xf.newXPath(); /* * Get all Items of type GDS that are from a DocSum with an Item entryType of GDS. */ XPathExpression xgds = xpath.compile( "/eSummaryResult/DocSum[Item/@Name=\"entryType\" and (Item=\"GDS\")]/Item[@Name=\"GDS\"][1]/text()"); DocumentBuilder builder = factory.newDocumentBuilder(); /* * Bug 2690. There must be a better way. 
*/ details = details.replaceAll("encoding=\"UTF-8\"", ""); StringInputStream sis = new StringInputStream(StringUtils.trim(details)); Document document = builder.parse(sis); NodeList result = (NodeList) xgds.evaluate(document, XPathConstants.NODESET); for (int i = 0; i < result.getLength(); i++) { String nodeValue = result.item(i).getNodeValue(); // if ( nodeValue.contains( ";" ) ) continue; // associatedDatasetAccessions.add("GDS" + nodeValue); } return associatedDatasetAccessions; } catch (IOException e) { throw new RuntimeException("Could not parse XML data from remote server", e); } catch (XPathExpressionException e) { throw new RuntimeException("XML parsing error of remote data", e); } catch (ParserConfigurationException e) { throw new RuntimeException("XML parsing error of remote data", e); } catch (SAXException e) { throw new RuntimeException("XML parsing error of remote data", e); } } /** * Try to line up samples across datasets contained in a series. * * @param series * @return */ public GeoSampleCorrespondence findGSECorrespondence(GeoSeries series) { Collection<GeoDataset> datasets = series.getDatasets(); if (datasets != null && datasets.size() > 0) { fillAccessionMaps(datasets); // make sure all samples are accounted for - just informative Collection<GeoSample> missed = new HashSet<GeoSample>(); for (GeoSample sample : series.getSamples()) { if (!this.accToDataset.containsKey(sample.getGeoAccession())) { missed.add(sample); } } if (!missed.isEmpty()) { log.warn("There were one or more samples missing from the datasets: " + StringUtils.join(missed, " | ")); } return findGSECorrespondence(datasets); } int numPlatforms = fillAccessionMaps(series); return findCorrespondence(numPlatforms); } /** * Try to line up samples across datasets. 
* * @param dataSets */ public GeoSampleCorrespondence findGSECorrespondence(Collection<GeoDataset> dataSets) { if (dataSets == null) return null; if (dataSets.size() == 0) { throw new IllegalArgumentException("No datasets!"); } checkPlatformsMatchSeries(dataSets); fillAccessionMaps(dataSets); int numDatasets = dataSets.size(); return findCorrespondence(numDatasets); } /** * See bug 1672 for why this is needed. * * @param dataSets */ private void checkPlatformsMatchSeries(Collection<GeoDataset> dataSets) { for (GeoDataset dataset : dataSets) { boolean found = false; GeoPlatform platform = dataset.getPlatform(); if (dataset.getSeries().size() == 0) continue; Collection<GeoPlatform> seenPlatforms = new HashSet<GeoPlatform>(); for (GeoSeries series : dataset.getSeries()) { for (GeoSample sample : series.getSamples()) { if (sample.getPlatforms().contains(platform)) { found = true; } seenPlatforms.addAll(sample.getPlatforms()); } } if (!found) { if (seenPlatforms.size() == 1) { log.warn(dataset + " is associated with wrong platform? " + platform + ", switching it to use series platform " + seenPlatforms.iterator().next()); dataset.setPlatform(seenPlatforms.iterator().next()); } else { /* * Maybe there is a way to handle this, but not worth it. Dataset uses the wrong platform. */ throw new IllegalStateException(platform + " on dataset " + dataset + " is not used at all by associated series, can't determine correct platform as series uses more than one."); } } } } /** * This is the main point where comparisons are made. 
* * @param numDatasetsOrPlatforms * @param accToTitle * @param accToDataset * @return */ private GeoSampleCorrespondence findCorrespondence(int numDatasetsOrPlatforms) { GeoSampleCorrespondence result = new GeoSampleCorrespondence(); result.setAccToTitleMap(accToTitle); final List<String> sampleAccs = new ArrayList<String>(accToDataset.keySet()); assert sampleAccs.size() > 0; if (numDatasetsOrPlatforms <= 1 || !this.doSampleMatching) { log.debug("Each bioassay will get a distinct biomaterial"); for (String sample : sampleAccs) { result.addCorrespondence(sample, null); } return result; } String commonPrefix = StringUtil.commonPrefix(accToTitle.values()); if (commonPrefix != null) { log.debug("Common prefix = " + commonPrefix); commonPrefix = commonPrefix.toLowerCase(); } String commonSuffix = StringUtil.commonSuffix(accToTitle.values()); if (commonSuffix != null) { log.debug("Common suffix = " + commonSuffix); commonSuffix = commonSuffix.toLowerCase(); } // using the sorted order helps find the right matches. Collections.sort(sampleAccs); Map<String, Collection<String>> alreadyMatched = new HashMap<String, Collection<String>>(); // do it by data set, so we constrain comparing items in _this_ data set to ones in _other_ data sets (or other // platforms) // The inner loops are just to get the samples in the data set (platform) being considered. 
Collection<String> alreadyTestedDatasetsOrPlatforms = new HashSet<String>(); List<String> dataSets = new ArrayList<String>(); dataSets.addAll(new HashSet<String>(accToDataset.values())); List<String> platforms = new ArrayList<String>(); platforms.addAll(new HashSet<String>(accToPlatform.values())); List<String> valuesToUse; LinkedHashMap<String, String> accToDatasetOrPlatform; if (dataSets.size() > 0) { valuesToUse = dataSets; sortDataSets(sampleAccs, valuesToUse); accToDatasetOrPlatform = accToDataset; result.setAccToDatasetOrPlatformMap(accToDataset); log.debug(dataSets.size() + " datasets"); } else { valuesToUse = platforms; sortPlatforms(sampleAccs, valuesToUse); accToDatasetOrPlatform = accToPlatform; result.setAccToDatasetOrPlatformMap(accToPlatform); log.debug(platforms.size() + " platforms"); } // we start with the smallest dataset/platform. Collection<String> allMatched = new HashSet<String>(); for (String datasetOrPlatformA : valuesToUse) { alreadyTestedDatasetsOrPlatforms.add(datasetOrPlatformA); log.debug("Finding matches for samples in " + datasetOrPlatformA); // for each sample in this data set... for (int j = 0; j < sampleAccs.size(); j++) { boolean wasTied = false; String targetAcc = sampleAccs.get(j); // skip samples that are not in this data set. if (!accToDataset.get(targetAcc).equals(datasetOrPlatformA)) { continue; } if (allMatched.contains(targetAcc)) continue; if (!accToTitle.containsKey(targetAcc)) { continue; } String targetTitle = accToTitle.get(targetAcc).toLowerCase(); String targetSecondaryTitle = null; if (accToSecondaryTitle.get(targetAcc) != null) { targetSecondaryTitle = accToSecondaryTitle.get(targetAcc).toLowerCase(); } log.debug("Target: " + targetAcc + " (" + datasetOrPlatformA + ") " + targetTitle + (targetSecondaryTitle == null ? 
"" : " a.k.a " + targetSecondaryTitle)); if (StringUtils.isBlank(targetTitle)) throw new IllegalArgumentException("Can't have blank titles for samples"); Collection<String> bonusWords = getMicroarrayStringsToMatch(targetTitle); // log.debug( bonusWords.size() + " bonus words" ); /* * For each of the other data sets */ for (String datasetOrPlatformB : valuesToUse) { // if ( alreadyTestedDatasets.contains( datasetB ) ) { // log.debug( "Skip self" ); // continue; // } if (datasetOrPlatformB.equals(datasetOrPlatformA)) { continue; } // initialize data structure. if (alreadyMatched.get(targetAcc) == null) { alreadyMatched.put(targetAcc, new HashSet<String>()); } /* * Keep us from getting multiple matches. */ if (alreadyMatched.get(targetAcc).contains(datasetOrPlatformB)) { continue; } // find the best match in this data set. double mindistance = Double.MAX_VALUE; String bestMatch = null; String bestMatchAcc = null; int numTested = 0; for (int i = 0; i < sampleAccs.size(); i++) { String testAcc = sampleAccs.get(i); if (allMatched.contains(testAcc)) continue; boolean shouldTest = shouldTest(accToDatasetOrPlatform, alreadyMatched, datasetOrPlatformA, targetAcc, datasetOrPlatformB, testAcc); if (!shouldTest) continue; numTested++; if (!accToTitle.containsKey(testAcc)) { continue; } String testTitle = accToTitle.get(testAcc).toLowerCase(); String testSecondaryTitle = null; if (accToSecondaryTitle.get(testAcc) != null) { testSecondaryTitle = accToSecondaryTitle.get(testAcc).toLowerCase(); } if (StringUtils.isBlank(testTitle)) throw new IllegalArgumentException("Can't have blank titles for samples"); double bonus = 0.0; bonusWords.addAll(getMicroarrayStringsToMatch(testTitle)); for (String n : bonusWords) { if (testTitle.contains(n)) { log.debug(testTitle + " gets a bonus in matching " + targetTitle); bonus = 1; // this basically means we discount that difference. 
break; } } /* * If one name is much longer than the other, presumably the author didn't use the same naming * scheme for all samples; we need to trim the longer one to match the shorter one; we use the * prefix. */ String trimmedTest = testTitle; String trimmedTarget = targetTitle; if (commonPrefix != null) { trimmedTest = trimmedTest.replaceFirst("^" + Pattern.quote(commonPrefix), ""); trimmedTarget = trimmedTarget.replaceFirst("^" + Pattern.quote(commonPrefix), ""); } if (commonSuffix != null) { trimmedTest = trimmedTest.replaceFirst(Pattern.quote(commonSuffix) + "$", ""); trimmedTarget = trimmedTarget.replaceFirst(Pattern.quote(commonSuffix) + "$", ""); } // remove some punctuation trimmedTest = trimmedTest.replaceAll(PUNCTUATION_REGEXP, ""); trimmedTarget = trimmedTarget.replaceAll(PUNCTUATION_REGEXP, ""); // Computing the distance double distance = computeDistance(trimmedTest, trimmedTarget); distance -= bonus; double normalizedDistance = distance / Math.max(trimmedTarget.length(), trimmedTest.length()); double secondaryDistance = Double.MAX_VALUE; if (targetSecondaryTitle != null && testSecondaryTitle != null) { secondaryDistance = computeDistance(targetSecondaryTitle, testSecondaryTitle); if (secondaryDistance < distance) { distance = secondaryDistance; normalizedDistance = distance / Math.max(targetSecondaryTitle.length(), testSecondaryTitle.length()); } } if (!meetsMinimalThreshold(normalizedDistance)) { continue; } // better than last one? if (distance > mindistance) { // log.debug( "Didn't beat best previous match, " + bestMatch ); continue; } // handle ties if (distance == mindistance) { wasTied = true; /* * Try to resolve the tie. Messy, yes. 
*/ double prefixWeightedDistanceA = StringDistance.prefixWeightedHammingDistance(targetAcc, bestMatchAcc, 1.0); double prefixWeightedDistanceB = StringDistance.prefixWeightedHammingDistance(targetAcc, testAcc, 1.0); if (prefixWeightedDistanceA == prefixWeightedDistanceB) { double suffixWeightedDistanceA = StringDistance .suffixWeightedHammingDistance(targetAcc, bestMatchAcc, 1.0); double suffixWeightedDistanceB = StringDistance .suffixWeightedHammingDistance(targetAcc, testAcc, 1.0); if (prefixWeightedDistanceA == prefixWeightedDistanceB) { continue; // still tied, keep old one } else if (suffixWeightedDistanceA < suffixWeightedDistanceB) { // new one is better. mindistance = distance; bestMatch = testTitle; bestMatchAcc = testAcc; log.debug("Current best match (tie broken): " + testAcc + " (" + datasetOrPlatformB + ") " + testTitle + (testSecondaryTitle == null ? "" : " a.k.a " + testSecondaryTitle + ", distance = " + distance)); wasTied = false; } if (suffixWeightedDistanceA > suffixWeightedDistanceB) { // old one is still better. wasTied = false; continue; } } else if (prefixWeightedDistanceA > prefixWeightedDistanceB) { // new one is better. mindistance = distance; bestMatch = testTitle; bestMatchAcc = testAcc; log.debug("Current best match (tie broken): " + testAcc + " (" + datasetOrPlatformB + ") " + testTitle + (testSecondaryTitle == null ? "" : " a.k.a " + testSecondaryTitle + ", distance = " + distance)); wasTied = false; } else if (prefixWeightedDistanceA < prefixWeightedDistanceB) { wasTied = false; continue; // old best is still better. } } else { // definite new winner no tie mindistance = distance; bestMatch = testTitle; bestMatchAcc = testAcc; log.debug("Current best match: " + testAcc + " (" + datasetOrPlatformB + ") " + testTitle + (testSecondaryTitle == null ? "" : " a.k.a " + testSecondaryTitle + ", distance = " + distance)); wasTied = false; } } // end loop over samples in second data set. 
log.debug("Tested " + numTested + " samples"); /* * Now have the best hit for sample from the outer dataset, in the inner data set. */ if (bestMatchAcc == null || wasTied) { if (log.isDebugEnabled()) log.debug("No match found in " + datasetOrPlatformB + " for " + targetAcc + "\t" + targetTitle + " (" + datasetOrPlatformA + ") (This can happen if sample was not run on all the platforms used; or if there were ties that could not be broken; or when we were unable to match)"); result.addCorrespondence(targetAcc, null); allMatched.add(targetAcc); } else { if (log.isDebugEnabled()) log.debug("Match:\n" + targetAcc + "\t" + targetTitle + " (" + accToDataset.get(targetAcc) + ")" + "\n" + bestMatchAcc + "\t" + bestMatch + " (" + accToDataset.get(bestMatchAcc) + ")" + " (Distance: " + mindistance + ")"); result.addCorrespondence(targetAcc, bestMatchAcc); alreadyMatched.get(bestMatchAcc).add(datasetOrPlatformA); alreadyMatched.get(targetAcc).add(datasetOrPlatformB); allMatched.add(targetAcc); allMatched.add(bestMatchAcc); } } // loop second data sets } // loop over samples in first data set } // loop over data sets log.debug(result); return result; } private void sortDataSets(final List<String> sampleAccs, List<String> dataSets) { Collections.sort(dataSets, new Comparator<String>() { @Override public int compare(String arg0, String arg1) { int numSamples0 = 0; int numSamples1 = 0; for (int j = 0; j < sampleAccs.size(); j++) { String targetAcc = sampleAccs.get(j); // skip samples that are not in this data set. 
if (accToDataset.get(targetAcc).equals(arg0)) { numSamples0++; } else if (accToDataset.get(targetAcc).equals(arg1)) { numSamples1++; } } if (numSamples0 == numSamples1) { return 0; } else if (numSamples0 < numSamples1) { return -1; } else { return 1; } } }); } private void sortPlatforms(final List<String> sampleAccs, List<String> platforms) { Collections.sort(platforms, new Comparator<String>() { @Override public int compare(String arg0, String arg1) { int numSamples0 = 0; int numSamples1 = 0; for (int j = 0; j < sampleAccs.size(); j++) { String targetAcc = sampleAccs.get(j); // skip samples that are not in this data set. if (accToPlatform.get(targetAcc).equals(arg0)) { numSamples0++; } else if (accToPlatform.get(targetAcc).equals(arg1)) { numSamples1++; } } if (numSamples0 == numSamples1) { return 0; } else if (numSamples0 < numSamples1) { return -1; } else { return 1; } } }); } /** * Identify stop-strings relating to microarray names. * * @param title * @return */ private Collection<String> getMicroarrayStringsToMatch(String title) { Collection<String> result = new HashSet<String>(); for (String key : microarrayNameStrings.keySet()) { if (title.contains(key)) { for (String value : microarrayNameStrings.get(key)) { if (title.contains(value)) { result.add(value); } } } } return result; } /** * Implements constraints on samples to test. * * @param accToDatasetOrPlatform (depending on which we are using, platforms or data sets) * @param alreadyMatched * @param allmatched * @param datasetA * @param targetAcc * @param datasetB * @param testAcc * @return */ private boolean shouldTest(LinkedHashMap<String, String> accToDatasetOrPlatform, Map<String, Collection<String>> alreadyMatched, String datasetA, String targetAcc, String datasetB, String testAcc) { boolean shouldTest = true; // initialize data structure. if (alreadyMatched.get(testAcc) == null) { alreadyMatched.put(testAcc, new HashSet<String>()); } // only use samples from the current test dataset. 
if (!accToDatasetOrPlatform.get(testAcc).equals(datasetB)) { shouldTest = false; } // disallow multiple matches. if (alreadyMatched.get(testAcc).contains(datasetA)) { // log.debug( testAcc + " already matched to a sample in " + datasetA + ", skipping" ); shouldTest = false; } if (!accToOrganism.get(targetAcc).equals(accToOrganism.get(testAcc))) { log.debug(testAcc + " From wrong organism"); shouldTest = false; } return shouldTest; } private boolean meetsMinimalThreshold(double normalizedDistance) { if (normalizedDistance > SIMILARITY_THRESHOLD) { return false; } return true; } /** * compute the distance. * * @param trimmedTest * @param trimmedTarget * @return */ private int computeDistance(String trimmedTest, String trimmedTarget) { return StringDistance.editDistance(trimmedTarget, trimmedTest); } /** * @param geoSeries * @return */ public static Map<GeoPlatform, List<GeoSample>> getPlatformSampleMap(GeoSeries geoSeries) { Map<GeoPlatform, List<GeoSample>> platformSamples = new HashMap<GeoPlatform, List<GeoSample>>(); for (GeoSample sample : geoSeries.getSamples()) { for (GeoPlatform platform : sample.getPlatforms()) { if (!platformSamples.containsKey(platform)) { platformSamples.put(platform, new ArrayList<GeoSample>()); } platformSamples.get(platform).add(sample); } } return platformSamples; } /** * @param dataSets */ private void fillAccessionMaps(Collection<GeoDataset> dataSets) { for (GeoDataset dataset : dataSets) { GeoPlatform platform = dataset.getPlatform(); assert platform != null; platform.getOrganisms().add(dataset.getOrganism()); if (dataset.getSubsets().size() == 0) { assert dataset.getSeries().size() > 0; for (GeoSeries series : dataset.getSeries()) { for (GeoSample sample : series.getSamples()) { if (sample.getPlatforms().size() == 0) sample.addPlatform(platform); fillAccessionMap(sample, dataset); } } } else { for (GeoSubset subset : dataset.getSubsets()) { for (GeoSample sample : subset.getSamples()) { if (sample.getPlatforms().size() == 0) 
sample.addPlatform(platform); fillAccessionMap(sample, dataset); } } } } } /** * This is used if there are no 'datasets' (GDS) to work with; we just use platforms. * * @param series * @param accToTitle * @param accToOwneracc * @param accToOrganism * @return */ private int fillAccessionMaps(GeoSeries series) { Map<GeoPlatform, List<GeoSample>> platformSamples = getPlatformSampleMap(series); for (GeoPlatform platform : platformSamples.keySet()) { for (GeoSample sample : platformSamples.get(platform)) { assert sample != null : "Null sample for platform " + platform.getDescription(); fillAccessionMap(sample, platform); } } return platformSamples.keySet().size(); } /** * @param sample * @param accToTitle * @param accToDataset */ private void fillAccessionMap(GeoSample sample, GeoData owner) { String title = sample.getTitle(); if (StringUtils.isNotBlank(title)) { accToTitle.put(sample.getGeoAccession(), title); } accToDataset.put(sample.getGeoAccession(), owner.getGeoAccession()); accToSecondaryTitle.put(sample.getGeoAccession(), sample.getTitleInDataset()); // could be null. String organism = getSampleOrganism(sample); if (StringUtils.isNotBlank(organism)) { accToOrganism.put(sample.getGeoAccession(), organism); } } /** * @param sample * @return */ private String getSampleOrganism(GeoSample sample) { Collection<GeoPlatform> platforms = sample.getPlatforms(); assert platforms.size() > 0 : sample + " had no platform assigned"; GeoPlatform platform = platforms.iterator().next(); Collection<String> organisms = platform.getOrganisms(); assert organisms.size() > 0; String organism = organisms.iterator().next(); return organism; } /** * Used to help ignore identifiers of microarrays in sample titles. */ private static Map<String, Collection<String>> microarrayNameStrings = new HashMap<String, Collection<String>>(); static { // note : all lower case! 
microarrayNameStrings.put("u133", new HashSet<String>()); microarrayNameStrings.put("u95", new HashSet<String>()); microarrayNameStrings.put("u74", new HashSet<String>()); microarrayNameStrings.put("v2", new HashSet<String>()); microarrayNameStrings.put("chip", new HashSet<String>()); microarrayNameStrings.get("u133").add("u133A"); microarrayNameStrings.get("u133").add("u133B"); microarrayNameStrings.get("u95").add("u95A"); microarrayNameStrings.get("u95").add("u95B"); microarrayNameStrings.get("u95").add("u95C"); microarrayNameStrings.get("u95").add("u95D"); microarrayNameStrings.get("u95").add("u95E"); microarrayNameStrings.get("u74").add("u74A"); microarrayNameStrings.get("u74").add("u74B"); microarrayNameStrings.get("u74").add("u74C"); microarrayNameStrings.get("v2").add("av2"); microarrayNameStrings.get("v2").add("av2"); microarrayNameStrings.get("v2").add("av2"); microarrayNameStrings.get("chip").add("chip a"); microarrayNameStrings.get("chip").add("chip b"); microarrayNameStrings.get("chip").add("chip c"); microarrayNameStrings.get("chip").add("chipa"); microarrayNameStrings.get("chip").add("chipb"); microarrayNameStrings.get("chip").add("chipc"); } }