Java tutorial
/*
 * The Gemma project
 *
 * Copyright (c) 2006-2010 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.core.loader.expression.geo;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.Test;
import ubic.gemma.core.loader.expression.geo.model.GeoDataset;
import ubic.gemma.core.loader.expression.geo.model.GeoSeries;

import java.io.InputStream;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.zip.GZIPInputStream;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * Tests for {@link DatasetCombiner}: looking up the GDS accessions for a GSE, and grouping the samples of one or
 * more parsed GEO datasets/series into sets of corresponding samples.
 * <p>
 * Most tests parse gzipped SOFT files from the test classpath; {@link #testFindGDSGrouping()} and
 * {@link #testFindGSE267()} call {@code DatasetCombiner.findGDSforGSE} directly and are skipped (with a warning) when
 * the failure looks like a connectivity problem.
 *
 * @author pavlidis
 */
public class DatasetCombinerTest {

    private static final Log log = LogFactory.getLog(DatasetCombinerTest.class.getName());

    /**
     * Classpath directory under which all the SOFT test files used here live.
     */
    private static final String GEO_DATA_ROOT = "/data/loader/expression/geo/";

    @Test
    public void testFindGDSGrouping() {
        Collection<String> result;
        try {
            result = DatasetCombiner.findGDSforGSE("GSE674");
        } catch (RuntimeException e) {
            if (isSkippableNetworkFailure(e)) {
                return;
            }
            throw e;
        }
        assertEquals(2, result.size());
        assertTrue(result.contains("GDS472") && result.contains("GDS473"));
    }

    @Test
    public void testFindGSE13() throws Exception {
        GeoParseResult parseResult = this.parseGeoResources( "GSE13Short/GDS44.soft.gz",
                "GSE13Short/GSE13_family.soft.gz", "GSE13Short/GDS52.soft.gz" );

        Collection<GeoDataset> gds = parseResult.getDatasets().values();
        assertEquals(2, gds.size());

        GeoSampleCorrespondence result = new DatasetCombiner().findGSECorrespondence(gds);
        DatasetCombinerTest.log.debug(result);

        int numBioMaterials = countGroups(result, 1, 2);

        assertTrue(result.getCorrespondingSamples("GSM623").contains("GSM650"));
        assertTrue(result.getCorrespondingSamples("GSM612").contains("GSM638"));
        assertEquals(1, result.getCorrespondingSamples("GSM618").size());
        assertEquals(33, numBioMaterials); // used to be 28
    }

    @Test
    public void testFindGSE267() {
        Collection<String> result;
        try {
            result = DatasetCombiner.findGDSforGSE("GSE267");
        } catch (RuntimeException e) {
            // Same lookup as testFindGDSGrouping, so tolerate the same connectivity failures.
            if (isSkippableNetworkFailure(e)) {
                return;
            }
            throw e;
        }
        assertEquals(0, result.size());
    }

    /*
     * Has multiple platforms; only the series family file is parsed, so the correspondence is computed from the
     * series alone rather than from datasets.
     */
    @Test
    public void testFindGSE3193() throws Exception {
        GeoParseResult parseResult = this.parseGeoResources( "GSE3193Short/GSE3193_family.soft.gz" );

        GeoSeries gse = parseResult.getSeries().values().iterator().next();

        GeoSampleCorrespondence result = new DatasetCombiner().findGSECorrespondence(gse);
        DatasetCombinerTest.log.debug(result);

        // Group sizes are deliberately not checked here, only the number of groups.
        int numBioMaterials = countGroups(result);
        assertEquals(57, numBioMaterials); // note, i'm not at all sure these are right! this used to be 60.
    }

    @Test
    public void testFindGSE469() throws Exception {
        GeoParseResult parseResult = this.parseGeoResources( "GSE469Short/GDS233.soft.gz",
                "GSE469Short/GSE469_family.soft.gz", "GSE469Short/GDS234.soft.gz" );

        Collection<GeoDataset> gds = parseResult.getDatasets().values();
        assertEquals(2, gds.size());

        GeoSampleCorrespondence result = new DatasetCombiner().findGSECorrespondence(gds);
        DatasetCombinerTest.log.debug(result);

        int numBioMaterials = countGroups(result, 1, 2);

        // there are some questionable matches, but I can't really tell!
        assertEquals(54, numBioMaterials);
        assertEquals(1, result.getCorrespondingSamples("GSM4301").size());
    }

    @Test
    public void testFindGSE493() throws Exception {
        GeoParseResult parseResult = this.parseGeoResources( "GSE493Short/GDS215.soft.gz",
                "GSE493Short/GSE493_family.soft.gz", "GSE493Short/GDS258.soft.gz" );

        Collection<GeoDataset> gds = parseResult.getDatasets().values();
        assertEquals(2, gds.size());

        GeoSampleCorrespondence result = new DatasetCombiner().findGSECorrespondence(gds);
        DatasetCombinerTest.log.debug(result);

        int numBioMaterials = countGroups(result, 1, 2);

        // there are some questionable matches, but I can't really tell!
        assertEquals(10, numBioMaterials);
        assertTrue(result.getCorrespondingSamples("GSM4362").contains("GSM4363"));
        assertTrue(result.getCorrespondingSamples("GSM4366").contains("GSM4368"));
        assertEquals(1, result.getCorrespondingSamples("GSM4371").size());
    }

    /*
     * Fairly hard case; twelve samples, 3 array designs, each sample run on each array design
     */
    @Test
    public void testFindGSE611() throws Exception {
        GeoParseResult parseResult = this.parseGeoResources( "GSE611Short/GDS428.soft.gz",
                "GSE611Short/GSE611_family.soft.gz", "GSE611Short/GDS429.soft.gz", "GSE611Short/GDS430.soft.gz" );

        Collection<GeoDataset> gds = parseResult.getDatasets().values();
        assertEquals(3, gds.size());

        GeoSampleCorrespondence result = new DatasetCombiner().findGSECorrespondence(gds);

        int numBioMaterials = countGroups(result, 3);
        assertEquals(4, numBioMaterials);
        DatasetCombinerTest.log.debug(result);
    }

    @Test
    public void testFindGSE88() throws Exception {
        GeoParseResult parseResult = this.parseGeoResources( "GSE88Short/GDS184.soft.gz",
                "GSE88Short/GSE88_family.soft.gz" );

        Collection<GeoDataset> gds = firstDatasetLinkedToFirstSeries(parseResult);

        GeoSampleCorrespondence result = new DatasetCombiner().findGSECorrespondence(gds);
        DatasetCombinerTest.log.debug(result);

        int numBioMaterials = countGroups(result, 1);
        assertEquals(31, numBioMaterials);
    }

    @Test
    public void testFindGSE91() throws Exception {
        GeoParseResult parseResult = this.parseGeoResources( "GSE91Short/GDS168.soft.gz",
                "GSE91Short/GSE91_family.soft.gz", "GSE91Short/GDS169.soft.gz" );

        Collection<GeoDataset> gds = parseResult.getDatasets().values();
        assertEquals(2, gds.size());

        GeoSampleCorrespondence result = new DatasetCombiner().findGSECorrespondence(gds);
        DatasetCombinerTest.log.debug(result);

        int numBioMaterials = countGroups(result, 1, 2);
        assertEquals(9, numBioMaterials);

        assertTrue(result.getCorrespondingSamples("GSM2560").contains("GSM2561"));
        assertTrue(result.getCorrespondingSamples("GSM2573").contains("GSM2574"));
        assertEquals(1, result.getCorrespondingSamples("GSM2564").size());
    }

    @Test
    public void testFindGSECorrespondence() throws Exception {
        GeoParseResult parseResult = this.parseGeoResources( "twoDatasets/GDS472.soft.gz",
                "twoDatasets/GSE674_family.soft.gz", "twoDatasets/GDS473.soft.gz" );

        Collection<GeoDataset> gds = parseResult.getDatasets().values();
        assertEquals(2, gds.size());

        GeoSampleCorrespondence result = new DatasetCombiner().findGSECorrespondence(gds);
        DatasetCombinerTest.log.debug(result);

        assertEquals(15, result.size());

        // these are just all the sample names.
        String[] keys = new String[] { "GSM10354", "GSM10355", "GSM10356", "GSM10359", "GSM10360", "GSM10361",
                "GSM10362", "GSM10363", "GSM10364", "GSM10365", "GSM10366", "GSM10367", "GSM10368", "GSM10369",
                "GSM10370", "GSM10374", "GSM10375", "GSM10376", "GSM10377", "GSM10378", "GSM10379", "GSM10380",
                "GSM10381", "GSM10382", "GSM10383", "GSM10384", "GSM10385", "GSM10386", "GSM10387", "GSM10388" };
        for (String string : keys) {
            assertEquals("Wrong result for " + string + ", expected 2", 2,
                    result.getCorrespondingSamples(string).size());
        }
        assertTrue(result.getCorrespondingSamples("GSM10354").contains("GSM10374"));
        assertTrue(result.getCorrespondingSamples("GSM10374").contains("GSM10354"));
    }

    /*
     * This has just a single data set but results in a "no platform assigned" error.
     */
    @Test
    public void testGDS186() throws Exception {
        GeoParseResult parseResult = this.parseGeoResources( "gse106Short/GDS186.soft.gz",
                "gse106Short/GSE106.soft.gz" );

        Collection<GeoDataset> gds = firstDatasetLinkedToFirstSeries(parseResult);

        GeoSampleCorrespondence result = new DatasetCombiner().findGSECorrespondence(gds);
        DatasetCombinerTest.log.debug(result);

        int numBioMaterials = countGroups(result, 1);
        assertEquals(11, numBioMaterials);
    }

    /*
     * A difficult case, lots of singletons.
     */
    @Test
    public void testGSE465() throws Exception {
        GeoParseResult parseResult = this.parseGeoResources( "GSE465Short/GDS214.soft.gz",
                "GSE465Short/GSE465_family.soft.gz", "GSE465Short/GDS262.soft.gz", "GSE465Short/GDS263.soft.gz",
                "GSE465Short/GDS264.soft.gz", "GSE465Short/GDS265.soft.gz", "GSE465Short/GDS270.soft.gz" );

        Collection<GeoDataset> gds = parseResult.getDatasets().values();
        assertEquals(6, gds.size());

        GeoSampleCorrespondence result = new DatasetCombiner().findGSECorrespondence(gds);
        DatasetCombinerTest.log.debug(result);

        int numBioMaterials = countGroups(result, 1, 2, 6, 5);
        assertEquals(30, numBioMaterials);
    }

    /**
     * Decides whether a failure from {@code DatasetCombiner.findGDSforGSE} should cause the calling test to be
     * skipped rather than failed, and logs the reason if so.
     *
     * @param e the exception thrown by the lookup
     * @return true if the cause is an {@link java.net.UnknownHostException}, or an {@link java.io.IOException}
     *         whose message mentions "503"; false otherwise (the caller should rethrow)
     */
    private static boolean isSkippableNetworkFailure(RuntimeException e) {
        Throwable cause = e.getCause();
        if (cause instanceof java.net.UnknownHostException) {
            DatasetCombinerTest.log.warn("Test skipped due to unknown host exception");
            return true;
        }
        if (cause instanceof java.io.IOException) {
            // getMessage() may legitimately be null; don't let that mask the original failure with an NPE.
            String message = cause.getMessage();
            if (message != null && message.contains("503")) {
                DatasetCombinerTest.log.warn("Test skipped due to 503 from NCBI");
                DatasetCombinerTest.log.error(e, e);
                return true;
            }
        }
        return false;
    }

    /**
     * Feeds the given gzipped SOFT files, in the order given, to a single new {@link GeoFamilyParser} and returns
     * the first result it reports.
     *
     * @param resourceNames paths relative to {@link #GEO_DATA_ROOT}
     * @return the first element of the parser's results
     * @throws IllegalStateException if a resource cannot be found on the classpath
     * @throws Exception             on any failure to read or parse a resource
     */
    private GeoParseResult parseGeoResources(String... resourceNames) throws Exception {
        GeoFamilyParser parser = new GeoFamilyParser();
        for (String resourceName : resourceNames) {
            String path = GEO_DATA_ROOT + resourceName;
            // The raw stream is managed separately so it is closed even if the GZIP header cannot be read.
            try (InputStream raw = this.getClass().getResourceAsStream(path)) {
                if (raw == null) {
                    throw new IllegalStateException("Test resource not found on classpath: " + path);
                }
                try (InputStream is = new GZIPInputStream(raw)) {
                    parser.parse(is);
                }
            }
        }
        return (GeoParseResult) parser.getResults().iterator().next();
    }

    /**
     * Takes the first dataset and the first series of the parse result, adds the series to the dataset's series
     * collection, and returns that dataset wrapped in a one-element collection.
     *
     * @param parseResult a parse result containing at least one dataset and one series
     * @return a collection holding only the (now linked) first dataset
     */
    private static Collection<GeoDataset> firstDatasetLinkedToFirstSeries(GeoParseResult parseResult) {
        GeoDataset gd = parseResult.getDatasets().values().iterator().next();
        GeoSeries gse = parseResult.getSeries().values().iterator().next();
        gd.getSeries().add(gse);
        Collection<GeoDataset> gds = new HashSet<>();
        gds.add(gd);
        return gds;
    }

    /**
     * Counts the sample groups in a correspondence, optionally asserting that every group has one of the allowed
     * sizes.
     *
     * @param correspondence the correspondence to walk
     * @param allowedSizes   permitted group sizes; when empty, group sizes are not checked
     * @return the number of groups iterated over
     */
    private static int countGroups(GeoSampleCorrespondence correspondence, int... allowedSizes) {
        int numGroups = 0;
        Iterator<Set<String>> it = correspondence.iterator();
        while (it.hasNext()) {
            int size = it.next().size();
            if (allowedSizes.length > 0) {
                assertTrue("Unexpected group size: " + size, isOneOf(size, allowedSizes));
            }
            numGroups++;
        }
        return numGroups;
    }

    /**
     * @return true if {@code value} equals any element of {@code candidates}
     */
    private static boolean isOneOf(int value, int[] candidates) {
        for (int candidate : candidates) {
            if (candidate == value) {
                return true;
            }
        }
        return false;
    }
}