Java tutorial
/* * The Gemma project * * Copyright (c) 2006 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.core.loader.expression.geo; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import ubic.gemma.core.loader.expression.geo.fetcher.DatasetFetcher; import ubic.gemma.core.loader.expression.geo.fetcher.PlatformFetcher; import ubic.gemma.core.loader.expression.geo.fetcher.SeriesFetcher; import ubic.gemma.core.loader.expression.geo.model.*; import ubic.gemma.core.loader.util.fetcher.Fetcher; import ubic.gemma.core.loader.util.sdo.SourceDomainObjectGenerator; import ubic.gemma.model.common.description.DatabaseEntry; import ubic.gemma.model.common.description.ExternalDatabase; import ubic.gemma.model.common.description.LocalFile; import java.io.IOException; import java.util.Collection; import java.util.HashSet; import java.util.Map; /** * Handle fetching and parsing GEO files. * * @author pavlidis */ @SuppressWarnings({ "WeakerAccess", "unused" }) // Possible external use public class GeoDomainObjectGenerator implements SourceDomainObjectGenerator { protected static Log log = LogFactory.getLog(GeoDomainObjectGenerator.class.getName()); protected Fetcher datasetFetcher; protected Fetcher seriesFetcher; protected Fetcher platformFetcher; protected GeoFamilyParser parser; private boolean processPlatformsOnly; private boolean doSampleMatching = true; public GeoDomainObjectGenerator() { this.initialize(); } /** * @param geoAccession, either a GPL, GDS or GSE value. * @return If processPlatformsOnly is true, a collection of GeoPlatforms. Otherwise a Collection of series (just * one). If the accession is a GPL then processPlatformsOnly is set to true and any sample data is ignored. */ @Override public Collection<? extends GeoData> generate(String geoAccession) { GeoDomainObjectGenerator.log .info("Generating objects for " + geoAccession + " using " + this.getClass().getSimpleName()); Collection<GeoData> result = new HashSet<>(); if (geoAccession.startsWith("GPL")) { this.processPlatformsOnly = true; GeoPlatform platform = this.processPlatform(geoAccession); result.add(platform); } else if (geoAccession.startsWith("GDS")) { // common starting point. Collection<String> seriesAccessions = DatasetCombiner.findGSEforGDS(geoAccession); if (processPlatformsOnly) { return this.processSeriesPlatforms(seriesAccessions); } GeoDomainObjectGenerator.log.info(geoAccession + " corresponds to " + seriesAccessions); for (String seriesAccession : seriesAccessions) { GeoSeries series = this.processSeries(seriesAccession); if (series == null) continue; result.add(series); } } else if (geoAccession.startsWith("GSE")) { if (processPlatformsOnly) { return this.processSeriesPlatforms(geoAccession); } GeoSeries series = this.processSeries(geoAccession); if (series == null) return result; result.add(series); return result; } else { throw new IllegalArgumentException( "Cannot handle accession: " + geoAccession + ", must be a GDS, GSE or GPL"); } return result; } /** * Determine the set of external accession values that will be generated during parsing. This can be used to * pre-empty time-consuming fetch and download of data we already have. * * @param geoAccession geo accession * @return database entries */ public Collection<DatabaseEntry> getProjectedAccessions(String geoAccession) { ExternalDatabase ed = ExternalDatabase.Factory.newInstance(); ed.setName("GEO"); Collection<DatabaseEntry> accessions = new HashSet<>(); // DatabaseEntry StringBuilder seriesAccession = new StringBuilder(); if (geoAccession.startsWith("GSE")) { seriesAccession = new StringBuilder(geoAccession); } else if (geoAccession.startsWith("GPL")) { GeoDomainObjectGenerator.log.warn( "Determining if the data already exist for a GPL (" + geoAccession + ") is not implemented."); return null; } else if (geoAccession.startsWith("GDS")) { Collection<String> seriesAccessions = DatasetCombiner.findGSEforGDS(geoAccession); if (seriesAccessions == null || seriesAccessions.size() == 0) { throw new InvalidAccessionException("There is no series (GSE) for the accession " + geoAccession); } for (String string : seriesAccessions) { seriesAccession.append(string).append(","); } seriesAccession = new StringBuilder(StringUtils.removeEnd(seriesAccession.toString(), ",")); } else { if (StringUtils.isBlank(geoAccession)) { throw new InvalidAccessionException("GEO accession must not be blank. Enter a GSE, GDS or GPL"); } throw new InvalidAccessionException("'" + geoAccession + "' is not understood by Gemma; must be a GSE, GDS or GPL. Did you choose the right source database?"); } DatabaseEntry de = DatabaseEntry.Factory.newInstance(ed); de.setAccession(seriesAccession.toString()); accessions.add(de); return accessions; } /** * Initialize fetchers, clear out any data that was already generated by this Generator. */ public void initialize() { parser = new GeoFamilyParser(); datasetFetcher = new DatasetFetcher(); seriesFetcher = new SeriesFetcher(); platformFetcher = new PlatformFetcher(); } /** * Process a data set and add it to the series * * @param series series * @param dataSetAccession dataset accession */ public void processDataSet(GeoSeries series, String dataSetAccession) { GeoDomainObjectGenerator.log.info("Processing " + dataSetAccession); GeoDataset gds = this.processDataSet(dataSetAccession); assert gds != null; boolean ok = this.checkDatasetMatchesSeries(series, gds); if (!ok) { GeoDomainObjectGenerator.log.warn(dataSetAccession + " does not use a platform associated with the series " + series + ", ignoring."); return; } series.addDataSet(gds); gds.getSeries().add(series); } public void setDatasetFetcher(Fetcher df) { this.datasetFetcher = df; } public void setDoSampleMatching(boolean doSampleMatching) { this.doSampleMatching = doSampleMatching; } /** * @param platformFetcher The platformFetcher to set. */ public void setPlatformFetcher(Fetcher platformFetcher) { this.platformFetcher = platformFetcher; } public void setProcessPlatformsOnly(boolean b) { this.processPlatformsOnly = b; } /** * @param seriesFetcher The seriesFetcher to set. */ public void setSeriesFetcher(Fetcher seriesFetcher) { this.seriesFetcher = seriesFetcher; } /** * It is possible for the GDS to use a platform not used by the GSE. Yep. GSE2121 is on GPL81, and is associated * with GDS1862; but GSE2122 (GPL11) is not, but GDS1862 is linked to GSE2122 anyway. There is no superseries * relationship there. * * @param series series * @param gds geo dataset * @return true if the dataset uses a platform that the series uses. */ private boolean checkDatasetMatchesSeries(GeoSeries series, GeoDataset gds) { boolean ok = false; GeoPlatform platform = gds.getPlatform(); assert platform != null; for (GeoSample s : series.getSamples()) { for (GeoPlatform p : s.getPlatforms()) { if (p.equals(platform)) { ok = true; } } } return ok; } private String fetchDataSetToLocalFile(String geoDataSetAccession) { Collection<LocalFile> result = datasetFetcher.fetch(geoDataSetAccession); if (result == null) return null; if (result.size() != 1) { throw new IllegalStateException( "Got " + result.size() + " files for " + geoDataSetAccession + ", expected only one."); } LocalFile dataSetFile = (result.iterator()).next(); String dataSetPath; dataSetPath = dataSetFile.getLocalURL().getPath(); return dataSetPath; } /** * Process a data set from an accession values * * @param dataSetAccession dataset accession * @return A GeoDataset object */ private GeoDataset processDataSet(String dataSetAccession) { if (!dataSetAccession.startsWith("GDS")) { throw new IllegalArgumentException("Invalid GEO dataset accession " + dataSetAccession); } String dataSetPath = this.fetchDataSetToLocalFile(dataSetAccession); GeoDataset gds; try { gds = this.processDataSet(dataSetAccession, dataSetPath); } catch (IOException e) { throw new RuntimeException(e); } return gds; } /** * Parse a GEO GDS file, return the extracted GeoDataset. * * @param geoDataSetAccession geo dataset accession * @param dataSetPath dataset path * @return GeoDataset * @throws IOException if there is a problem while manipulating the file */ private GeoDataset processDataSet(String geoDataSetAccession, String dataSetPath) throws IOException { parser.parse(dataSetPath); // first result is where we start. GeoParseResult results = (GeoParseResult) parser.getResults().iterator().next(); Map<String, GeoDataset> datasetMap = results.getDatasets(); if (!datasetMap.containsKey(geoDataSetAccession)) { throw new IllegalStateException("Failed to get parse of " + geoDataSetAccession); } return datasetMap.get(geoDataSetAccession); } private GeoPlatform processPlatform(String geoAccession) { assert platformFetcher != null; Collection<LocalFile> platforms = platformFetcher.fetch(geoAccession); if (platforms == null) { throw new RuntimeException("No series file found for " + geoAccession); } LocalFile platformFile = (platforms.iterator()).next(); String platformPath; platformPath = platformFile.getLocalURL().getPath(); parser.setProcessPlatformsOnly(true); try { parser.parse(platformPath); } catch (IOException e1) { throw new RuntimeException(e1); } return ((GeoParseResult) parser.getResults().iterator().next()).getPlatformMap().get(geoAccession); } /** * Download and parse a GEO series. * * @param seriesAccession series accession */ private GeoSeries processSeries(String seriesAccession) { Collection<LocalFile> fullSeries = seriesFetcher.fetch(seriesAccession); if (fullSeries == null) { GeoDomainObjectGenerator.log.warn("No series file found for " + seriesAccession); return null; } LocalFile seriesFile = (fullSeries.iterator()).next(); String seriesPath = seriesFile.getLocalURL().getPath(); parser.setProcessPlatformsOnly(this.processPlatformsOnly); try { parser.parse(seriesPath); } catch (IOException e1) { throw new RuntimeException(e1); } // Only allow one series... GeoSeries series = ((GeoParseResult) parser.getResults().iterator().next()).getSeriesMap() .get(seriesAccession); if (series == null) { throw new RuntimeException("No series was parsed for " + seriesAccession); } Collection<String> datasetsToProcess = DatasetCombiner.findGDSforGSE(seriesAccession); if (datasetsToProcess != null) { for (String dataSetAccession : datasetsToProcess) { this.processDataSet(series, dataSetAccession); } } DatasetCombiner datasetCombiner = new DatasetCombiner(this.doSampleMatching); GeoSampleCorrespondence correspondence = datasetCombiner.findGSECorrespondence(series); assert correspondence != null; series.setSampleCorrespondence(correspondence); return series; } /** * Download and parse GEO platform(s) using series accession(s). * * @param seriesAccessions series accessions */ private Collection<GeoPlatform> processSeriesPlatforms(Collection<String> seriesAccessions) { for (String seriesAccession : seriesAccessions) { this.processSeriesPlatforms(seriesAccession); } return ((GeoParseResult) parser.getResults().iterator().next()).getPlatformMap().values(); } /** * @param seriesAccession series accession */ private Collection<GeoPlatform> processSeriesPlatforms(String seriesAccession) { Collection<LocalFile> fullSeries = seriesFetcher.fetch(seriesAccession); if (fullSeries == null) { throw new RuntimeException("No series file found for " + seriesAccession); } LocalFile seriesFile = (fullSeries.iterator()).next(); String seriesPath; seriesPath = seriesFile.getLocalURL().getPath(); parser.setProcessPlatformsOnly(this.processPlatformsOnly); try { parser.parse(seriesPath); } catch (IOException e1) { throw new RuntimeException(e1); } return ((GeoParseResult) parser.getResults().iterator().next()).getPlatformMap().values(); } }