Java tutorial
/* * The Gemma project * * Copyright (c) 2006 University of British Columbia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package ubic.gemma.loader.expression.geo; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.FutureTask; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.beanutils.BeanUtils; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.WordUtils; import org.apache.commons.lang.time.StopWatch; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import ubic.basecode.util.FileTools; import ubic.gemma.loader.expression.geo.model.GeoChannel; import ubic.gemma.loader.expression.geo.model.GeoContact; import ubic.gemma.loader.expression.geo.model.GeoData; import ubic.gemma.loader.expression.geo.model.GeoDataset; import ubic.gemma.loader.expression.geo.model.GeoPlatform; import 
ubic.gemma.loader.expression.geo.model.GeoReplication; import ubic.gemma.loader.expression.geo.model.GeoSample; import ubic.gemma.loader.expression.geo.model.GeoSeries; import ubic.gemma.loader.expression.geo.model.GeoSubset; import ubic.gemma.loader.expression.geo.model.GeoValues; import ubic.gemma.loader.expression.geo.model.GeoVariable; import ubic.gemma.loader.util.parser.Parser; /** * Class for parsing GSE and GDS files from NCBI GEO. See {@link http * ://www.ncbi.nlm.nih.gov/projects/geo/info/soft2.html} for format information. * * @author keshav * @author pavlidis * @version $Id: GeoFamilyParser.java,v 1.94 2013/04/18 22:53:10 paul Exp $ */ public class GeoFamilyParser implements Parser<Object> { /** * */ private static final char FIELD_DELIM = '\t'; private static final int MAX_WARNINGS = 100; private static Log log = LogFactory.getLog(GeoFamilyParser.class.getName()); private Integer previousNumTokens = null; /** * For each platform, the map of column names to column numbers in the data. */ private Map<GeoPlatform, Map<String, Integer>> quantitationTypeKey = new HashMap<GeoPlatform, Map<String, Integer>>(); /** * This is used to put the data in the right place later. We know the actual column is where it is NOW, for this * sample, but in our data structure we put it where we EXPECT it to be (where it was the first time we saw it). * This is our attempt to fix problems with columns moving around from sample to sample. 
*/ private Map<GeoPlatform, Map<Integer, Integer>> quantitationTypeTargetColumn = new HashMap<GeoPlatform, Map<Integer, Integer>>(); boolean alreadyWarnedAboutClobbering = false; boolean alreadyWarnedAboutInconsistentColumnOrder = false; boolean alreadyWarnedAboutDuplicateColumnName = false; private String currentDatasetAccession; private String currentPlatformAccession; private String currentSampleAccession; private String currentSeriesAccession; private String currentSubsetAccession; private int dataSetDataLines = 0; private boolean haveReadPlatformHeader = false; private boolean haveReadSampleDataHeader = false; private boolean inDatabase = false; private boolean inDataset = false; private boolean inDatasetTable = false; private boolean inPlatform = false; private boolean inPlatformTable = false; private boolean inSample = false; private boolean inSampleTable = false; private boolean inSeries = false; private boolean inSeriesTable = false; private boolean inSubset = false; private int parsedLines; private int platformLines = 0; private GeoParseResult results = new GeoParseResult(); private int sampleDataLines = 0; private int seriesDataLines = 0; private boolean processPlatformsOnly; private int numWarnings = 0; /* * Elements seen for the 'current sample'. 
*/ private Collection<String> processedDesignElements = new HashSet<String>(); /** * */ private Collection<Integer> wantedQuantitationTypes = new HashSet<Integer>(); /** * */ private boolean aggressiveQuantitationTypeRemoval; /* * (non-Javadoc) * * @see ubic.gemma.loader.loaderutils.Parser#getResults() */ @Override public Collection<Object> getResults() { Collection<Object> r = new HashSet<Object>(); r.add(this.results); return r; } /* * (non-Javadoc) * * @see ubic.gemma.loader.loaderutils.Parser#parse(java.io.File) */ @Override public void parse(File f) throws IOException { InputStream a = new FileInputStream(f); this.parse(a); a.close(); } /* * (non-Javadoc) * * @see ubic.gemma.loader.loaderutils.Parser#parse(java.io.InputStream) */ @Override public void parse(InputStream is) throws IOException { if (is == null) { throw new IOException("Inputstream was null"); } if (is.available() == 0) { throw new IOException("No bytes to read from the input stream."); } final BufferedReader dis = new BufferedReader(new InputStreamReader(is)); log.debug("Parsing...."); final ExecutorService executor = Executors.newSingleThreadExecutor(); FutureTask<Exception> future = new FutureTask<Exception>(new Callable<Exception>() { @Override public Exception call() { try { return doParse(dis); } catch (Exception e) { log.error(e, e); return e; } } }); executor.execute(future); executor.shutdown(); while (!future.isDone() && !future.isCancelled()) { try { TimeUnit.SECONDS.sleep(5L); } catch (InterruptedException e) { // probably cancelled. 
return; } log.info(parsedLines + " lines parsed."); } try { Exception e = future.get(); if (e != null) { log.error(e.getMessage()); throw new RuntimeException(e.getCause()); } } catch (ExecutionException e) { throw new RuntimeException("Parse failed", e.getCause()); } catch (java.util.concurrent.CancellationException e) { throw new RuntimeException("Parse was cancelled", e.getCause()); } catch (InterruptedException e) { throw new RuntimeException("Parse was interrupted", e.getCause()); } executor.shutdownNow(); assert future.isDone(); // assert executor.isTerminated(); log.info("Done parsing."); } /* * (non-Javadoc) * * @see ubic.gemma.loader.loaderutils.Parser#parse(java.lang.String) */ @Override public void parse(String fileName) throws IOException { InputStream is = FileTools.getInputStreamFromPlainOrCompressedFile(fileName); parse(is); is.close(); } /** * @param accession * @param string */ public void sampleTypeSet(String accession, String string) { GeoSample sample = results.getSampleMap().get(accession); if (string.equalsIgnoreCase("cDNA")) { sample.setType("RNA"); } else if (string.equalsIgnoreCase("RNA")) { sample.setType("RNA"); } else if (string.equalsIgnoreCase("genomic")) { sample.setType("genomic"); } else if (string.equalsIgnoreCase("protein")) { sample.setType("protein"); } else if (string.equalsIgnoreCase("mixed")) { sample.setType("mixed"); } else if (string.equalsIgnoreCase("SAGE")) { sample.setType("SAGE"); } else if (string.equalsIgnoreCase("MPSS") || string.equalsIgnoreCase("SRA")) { sample.setType("MPSS"); } else if (string.equalsIgnoreCase("SARST")) { sample.setType("protein"); } else if (string.equalsIgnoreCase("other")) { sample.setType("other"); } else { throw new IllegalArgumentException("Unknown sample type " + string); // sample.setType("other"); } } public void setAgressiveQtRemoval(boolean aggressiveQuantitationTypeRemoval) { this.aggressiveQuantitationTypeRemoval = aggressiveQuantitationTypeRemoval; } /** * @param b */ public void 
setProcessPlatformsOnly(boolean b) { this.processPlatformsOnly = b; } /** * Check to make sure data has been added for all the design elements, and all quantitation types. This is necessary * where the data for some design elements is omitted. This can happen if there is some variability between the * samples in terms of what design elements they have. Important: This has to be called IMMEDIATELY after the data * for the sample is read in, so the values get added in the right place. * * @param currentSample */ private void addMissingData(GeoSample currentSample) { /* * Skip if we're not going to use the data. */ if (!currentSample.hasUsableData()) { log.info("Sample is not expected to have any data"); return; } if (currentSample.getPlatforms().size() > 1) { log.warn("Multi-platform sample: " + currentSample); } GeoPlatform samplePlatform = currentSample.getPlatforms().iterator().next(); assert samplePlatform != null; Collection<String> designElementNames = samplePlatform.getColumnData(samplePlatform.getIdColumnName()); if (designElementNames == null) throw new IllegalStateException(samplePlatform + " did not have recognizable id column"); if (log.isDebugEnabled()) log.debug("Checking " + currentSample + " for missing design elements on " + samplePlatform); GeoValues values = results.getSeriesMap().get(currentSeriesAccession).getValues(); Collection<Object> qTypeIndexes = values.getQuantitationTypes(samplePlatform); int countMissing = 0; String lastMissingValue = null; for (String el : designElementNames) { if (!processedDesignElements.contains(el)) { countMissing++; lastMissingValue = el; for (Object i : qTypeIndexes) { values.addValue(currentSample, (Integer) i, el, " "); } if (log.isDebugEnabled()) log.debug("Added data missing from sample=" + currentSample + " for probe=" + el + " on " + samplePlatform); } } if (countMissing > 0) { log.warn("Added data missing for " + countMissing + " probes for sample=" + currentSample + " on " + samplePlatform + "; last probe 
with missing data was " + lastMissingValue); } } /** * Add a new sample to the results. * * @param sampleAccession */ private void addNewSample(String sampleAccession) { if (log.isDebugEnabled()) log.debug("Adding new sample " + sampleAccession); GeoSample newSample = new GeoSample(); newSample.setGeoAccession(sampleAccession); results.getSampleMap().put(sampleAccession, newSample); } /** * @param value */ private void addSeriesSample(String value) { if (!results.getSampleMap().containsKey(value)) { log.debug("New sample (for series): " + value); addNewSample(value); } log.debug("Adding sample: " + value + " to series " + currentSeriesAccession); results.getSeriesMap().get(currentSeriesAccession).addSample(results.getSampleMap().get(value)); } /** * @param target * @param property * @param value */ private void addTo(Object target, String property, Object value) { try { if (value == null) { log.warn("Value is null for target=" + target + " property=" + property); return; } if (target == null) { log.warn("Target is null for value=" + value + " property=" + property); return; } if (property == null) { log.warn("Property is null for value=" + value + " target=" + target); } Method adder = target.getClass().getMethod("addTo" + WordUtils.capitalize(property), new Class[] { value.getClass() }); adder.invoke(target, new Object[] { value }); } catch (SecurityException e) { throw new RuntimeException(e); } catch (IllegalArgumentException e) { throw new RuntimeException(e); } catch (NoSuchMethodException e) { throw new RuntimeException(e); } catch (IllegalAccessException e) { throw new RuntimeException(e); } catch (InvocationTargetException e) { throw new RuntimeException(e); } } private void checkDataCompleteness() { if (currentSampleAccession != null) { GeoSample currentSample = this.results.getSampleMap().get(currentSampleAccession); assert currentSample != null; if (currentSample.getPlatforms().size() > 1) { log.warn("Can't check for data completeness when sample uses 
more than one platform."); } else { addMissingData(currentSample); } validate(); } } /** * @param contact * @param property * @param value */ private void contactSet(GeoContact contact, String property, Object value) { if (contact == null) throw new IllegalArgumentException(); try { BeanUtils.setProperty(contact, property, value); } catch (IllegalAccessException e) { throw new RuntimeException(e); } catch (InvocationTargetException e) { throw new RuntimeException(e); } } /** * @param object * @param property * @param value */ private void contactSet(Object object, String property, Object value) { if (object instanceof GeoContact) { contactSet((GeoContact) object, property, value); } else if (object instanceof GeoData) { GeoContact contact = ((GeoData) object).getContact(); contactSet(contact, property, value); } } private GeoDataset currentDataset() { return this.results.getDatasetMap().get(currentDatasetAccession); } private GeoPlatform currentPlatform() { return this.results.getPlatformMap().get(currentPlatformAccession); } private GeoSample currentSample() { return this.results.getSampleMap().get(currentSampleAccession); } private GeoSeries currentSeries() { return this.results.getSeriesMap().get(currentSeriesAccession); } /** * @param accession * @param property * @param value */ private void datasetSet(String accession, String property, Object value) { GeoDataset dataset = results.getDatasetMap().get(accession); if (dataset == null) throw new IllegalArgumentException("Unknown dataset " + accession); if (property.equals("experimentType")) { value = GeoDataset.convertStringToExperimentType((String) value); } else if (property.equals("platformType")) { value = GeoDataset.convertStringToPlatformType((String) value); } else if (property.equals("sampleType")) { value = GeoDataset.convertStringToSampleType((String) value); } else if (property.equals("valueType")) { value = GeoDataset.convertStringToValueType((String) value); } try { BeanUtils.setProperty(dataset, 
property, value); } catch (IllegalAccessException e) { log.error(e, e); throw new RuntimeException(e); } catch (InvocationTargetException e) { log.error(e, e); throw new RuntimeException(e); } } /** * @param dis * @throws IOException */ private Exception doParse(BufferedReader dis) { if (dis == null) { throw new RuntimeException("Null reader"); } this.numWarnings = 0; haveReadPlatformHeader = false; haveReadSampleDataHeader = false; alreadyWarnedAboutClobbering = false; alreadyWarnedAboutInconsistentColumnOrder = false; alreadyWarnedAboutDuplicateColumnName = false; String line = ""; parsedLines = 0; processedDesignElements.clear(); StopWatch timer = new StopWatch(); timer.start(); try { while ((line = dis.readLine()) != null) { if (StringUtils.isBlank(line)) { continue; } parseLine(line); if (++parsedLines % 20000 == 0 && Thread.currentThread().isInterrupted()) { dis.close(); // clean up throw new java.util.concurrent.CancellationException( "Thread was terminated during parsing. " + this.getClass()); } } tidyUp(); } catch (Exception e) { log.error("Parsing failed (Cancelled?) :" + e.getMessage()); /* * This happens if there was a cancellation. */ throw new RuntimeException(e); } timer.stop(); if (timer.getTime() > 10000) { // 10 s log.info("Parsed total of " + parsedLines + " lines in " + String.format("%.2gs", timer.getTime() / 1000.0)); } log.debug(this.platformLines + " platform lines"); log.debug(this.seriesDataLines + " series data lines"); log.debug(this.dataSetDataLines + " data set data lines"); log.debug(this.sampleDataLines + " sample data lines"); return null; } /** * Check for problems and fix them. */ private void tidyUp() { checkForAndFixMissingColumnNames(); } /** * Due to bug 2326, when a sample has no data at all. 
* * @param sampleMap */ private void checkForAndFixMissingColumnNames() { Map<String, GeoSample> sampleMap = this.results.getSampleMap(); List<String> representativeColumnNames = null; GeoSample representativeSample = null; for (GeoSample sam : sampleMap.values()) { if (!sam.getColumnNames().isEmpty()) { representativeColumnNames = sam.getColumnNames(); representativeSample = sam; break; } } if (representativeColumnNames == null || representativeSample == null) { return; } for (GeoSample sam : sampleMap.values()) { if (sam.getColumnNames().isEmpty()) { int i = 0; for (String colName : representativeColumnNames) { sam.addColumnName(colName); sam.getColumnDescriptions().add(representativeSample.getColumnDescriptions().get(i)); i++; } } } } /** * @param line * @return */ private int extractChannelNumber(String line) { int chIndex = line.lastIndexOf("_ch"); if (chIndex < 0) return 1; // that's okay, there is only one channel. String candidateInt = line.substring(chIndex + 3, chIndex + 4); try { return Integer.parseInt(candidateInt); } catch (NumberFormatException e) { return 1; } } /** * Turns a line in the format #key = value into a column name and description. This is used to handle lines such as * (in a platform section of a GSE file): * * <pre> * #SEQ_LEN = Sequence length * </pre> * * @param line * @param dataToAddTo GeoData object, must not be null. */ private void extractColumnIdentifier(String line, GeoData dataToAddTo) { if (dataToAddTo == null) throw new IllegalArgumentException("Data cannot be null"); Map<String, String> res = extractKeyValue(line); if (res == null) return; String columnName = res.keySet().iterator().next(); dataToAddTo.addColumnName(columnName); dataToAddTo.getColumnDescriptions().add(res.get(columnName)); if (log.isDebugEnabled()) log.debug("Adding " + columnName + " to column names for " + dataToAddTo); } /** * Extract a key and value pair from a line in the format #key = value. * * @param line. 
* @return Map containing the String key and String value. Return null if it is misformatted. */ private Map<String, String> extractKeyValue(String line) { if (!line.startsWith("#")) throw new IllegalArgumentException("Wrong type of line"); Map<String, String> result = new HashMap<String, String>(); String fixed = line.substring(line.indexOf('#') + 1); String[] tokens = fixed.split("=", 2); if (tokens.length != 2) { log.warn("Invalid key-value line, expected an '=' somewhere, got: '" + line + "'"); return null; } String key = tokens[0]; String value = tokens[1]; key = StringUtils.strip(key); value = StringUtils.strip(value); result.put(key, value); return result; } /** * Extract a value from a line in the format xxxx=value. * * @param line * @return String following the first occurrence of '=', or null if there is no '=' in the String. */ private String extractValue(String line) { int eqIndex = line.indexOf('='); if (eqIndex < 0) { return null; // that's okay, there are lines that just indicate the end of sections. } return StringUtils.strip(line.substring(eqIndex + 1)); } /** * Parse a line to extract an integer <em>n</em> from the a variable description line like "!Series_variable_[n] = * age" * * @param line * @return int * @throws Exception if the line doesn't fit the format. */ private int extractVariableNumber(String line) { Pattern p = Pattern.compile("_(\\d+)$"); Matcher m = p.matcher(line); if (m.matches()) { try { return Integer.parseInt(line.substring(m.start(1))); } catch (NumberFormatException e) { throw new IllegalArgumentException("Wrong kind of string: " + line); } } throw new IllegalArgumentException("Wrong kind of string: " + line); } /** * This is run once for each sample, to try to figure out where the data are for each quantitation type (because it * can vary from sample to sample). Each platform in the series gets its own reference. * <p> * Note that the first column is the "ID_REF"; the first 'real' quantitation type gets column number 0. 
This initialization is run for each sample.
     * <p>
     * Side effects: clears and refills {@code wantedQuantitationTypes} and {@code quantitationTypeTargetColumn}, adds
     * to {@code quantitationTypeKey}, and registers each column with the series' {@link GeoValues}.
     * 
     * @throws IllegalStateException if there is no series for the current series accession
     */
    private void initializeQuantitationTypes() {
        wantedQuantitationTypes.clear();
        quantitationTypeTargetColumn.clear();

        GeoSeries geoSeries = results.getSeriesMap().get(currentSeriesAccession);
        if (geoSeries == null) {
            throw new IllegalStateException("No series is being parsed");
        }
        GeoValues values = geoSeries.getValues();

        // Running column position, kept per platform (only the sample's first platform is used below).
        Map<GeoPlatform, Integer> currentIndex = new HashMap<GeoPlatform, Integer>();
        Collection<String> seenColumnNames = new HashSet<String>();

        /*
         * In some data sets, the quantitation types are not in the same columns in different samples. ARRRGH!
         */
        Collection<GeoPlatform> platforms = this.currentSample().getPlatforms();
        if (platforms.size() > 1) {
            log.warn("Multiple platforms for " + this.currentSample());
        }
        GeoPlatform platformForSample = platforms.iterator().next();
        log.debug("Initializing quantitation types for " + currentSample() + ", Platform=" + platformForSample);

        if (currentSample().getColumnNames().isEmpty()) {
            /*
             * We need to fill in dummy values.
             */
            geoSeries.getValues().addSample(currentSample());
        }

        for (String columnName : currentSample().getColumnNames()) {
            boolean isWanted = values.isWantedQuantitationType(columnName, this.aggressiveQuantitationTypeRemoval);
            if (!isWanted) log.debug(columnName + " will not be included in final data");

            if (!currentIndex.containsKey(platformForSample)) {
                currentIndex.put(platformForSample, 0);
            }

            // Offset by one so that the first column gets -1 and the next one gets 0.
            int actualColumnNumber = currentIndex.get(platformForSample) - 1;

            /*
             * In some datasets (e.g. GSE432) the column names are not distinct. ARRRGH. We try to salvage the situation
             * by adding a suffix to the name.
             */
            if (seenColumnNames.contains(columnName)) {
                if (!alreadyWarnedAboutDuplicateColumnName) {
                    log.warn("\n---------- WARNING ------------\n" + columnName + " appears more than once for sample "
                            + currentSample()
                            + ", it will be mangled to make it unique.\nThis usually indicates a problem with the GEO file format! (future similar warnings for this data set suppressed)\n");
                    alreadyWarnedAboutDuplicateColumnName = true;
                }
                /*
                 * This method of mangling the name means that the repeated name had better show up in the same column
                 * each time. If it doesn't, then things are REALLY confused.
                 */
                columnName = columnName + "___" + actualColumnNumber;
            }

            // Make sure both per-platform maps exist before they are read below.
            initMaps(platformForSample);

            /*
             * Stores the column index for the column name.
             */
            Map<String, Integer> qtMapForPlatform = quantitationTypeKey.get(platformForSample);

            /*
             * Once we've seen a column, we check to see if it is in the same place as before.
             */
            Integer desiredColumnNumber = actualColumnNumber;
            if (qtMapForPlatform.containsKey(columnName)) {
                desiredColumnNumber = qtMapForPlatform.get(columnName);
                // Integer compared with int: the Integer is unboxed, so this compares values.
                if (desiredColumnNumber != actualColumnNumber) {
                    if (!alreadyWarnedAboutInconsistentColumnOrder) {
                        log.warn("\n---------- POSSIBLE GEO FILE FORMAT PROBLEM WARNING! ------------\n" + columnName
                                + " is not in previous column " + desiredColumnNumber + ":\nFor sample "
                                + currentSample() + ", it is in column " + actualColumnNumber
                                + ". This usually isn't a problem but it's worth checking to make sure data isn't misaligned"
                                + " (future warnings for this data set suppressed)\n");
                        alreadyWarnedAboutInconsistentColumnOrder = true;
                    }
                    /*
                     * This is used to put the data in the right place later. We know the actual column is where it is
                     * NOW, for this sample, but in our data structure we put it where we EXPECT it to be (where it was
                     * the first time we saw it). This is our attempt to fix problems with columns moving around.
                     */
                    quantitationTypeTargetColumn.get(platformForSample).put(actualColumnNumber, desiredColumnNumber);
                }
                values.addQuantitationType(platformForSample, columnName, desiredColumnNumber);
            } else {
                /*
                 * First time we see this column name (for the platform for the current sample). Normally we assume it
                 * just goes at the column index we're at. However, make sure that there isn't another column name in
                 * this sample that should be at the same index. We have to 'look ahead'. This isn't the usual case, but
                 * it isn't rare either. (Example: GSE3500)
                 */
                boolean clobbers = willClobberOtherQuantitationType(columnName, actualColumnNumber, qtMapForPlatform);
                if (clobbers) {
                    // we need to put it at the end - at the highest index we know about.
                    Collection<Integer> allIndexes = qtMapForPlatform.values();
                    int max = -1;
                    for (Integer v : allIndexes) {
                        if (v > max) {
                            max = v;
                        }
                    }
                    desiredColumnNumber = max + 1;
                    quantitationTypeTargetColumn.get(platformForSample).put(actualColumnNumber, desiredColumnNumber);
                    if (!alreadyWarnedAboutClobbering) {
                        log.warn("\n---------- POSSIBLE GEO FILE FORMAT PROBLEM WARNING! ------------\n"
                                + "Current column name " + columnName + " reassigned to index " + desiredColumnNumber
                                + " to avoid clobbering. This usually isn't a problem but it's worth checking to make sure data isn't misaligned "
                                + "(future similar warnings for this data set suppressed)\n");
                        alreadyWarnedAboutClobbering = true;
                    }
                }
                log.debug(columnName + " ---> " + desiredColumnNumber);
                qtMapForPlatform.put(columnName, desiredColumnNumber);
                values.addQuantitationType(platformForSample, columnName, desiredColumnNumber);
            }

            /*
             * Some quantitation types are skipped to save space.
             */
            if (!isWanted) {
                if (log.isDebugEnabled())
                    log.debug("Data column " + columnName + " will be skipped for " + currentSample()
                            + " - it is an 'unwanted' quantitation type (column number "
                            + currentIndex.get(platformForSample) + ", " + desiredColumnNumber
                            + "the quantitation type.)");
            } else {
                wantedQuantitationTypes.add(desiredColumnNumber);
            }

            // Record the (possibly mangled) name so later duplicates in this sample are detected.
            seenColumnNames.add(columnName);

            // update the current index, note that it is platform-specific.
            currentIndex.put(platformForSample, currentIndex.get(platformForSample) + 1);
        } // end iteration over column names.
    }

    /**
     * Make sure {@code quantitationTypeKey} and {@code quantitationTypeTargetColumn} each have an entry (an empty map
     * if none existed) for the given platform.
     * 
     * @param platformForSample platform to create the entries for
     */
    private void initMaps(GeoPlatform platformForSample) {
        if (!quantitationTypeKey.containsKey(platformForSample)) {
            quantitationTypeKey.put(platformForSample, new HashMap<String, Integer>());
        }
        if (!quantitationTypeTargetColumn.containsKey(platformForSample)) {
            quantitationTypeTargetColumn.put(platformForSample, new HashMap<Integer, Integer>());
        }
    }

    /**
     * @param index column number to check
     * @return true if the index is among the wanted quantitation types recorded for the current sample
     */
    private boolean isWantedQuantitationType(int index) {
        return wantedQuantitationTypes.contains(index);
    }

    /**
     * Set the last update date on a platform, series or sample. Objects of any other type are ignored.
     * 
     * @param object a GeoPlatform, GeoSeries or GeoSample
     * @param value the date, as read from the file
     */
    private void lastUpdateDateSet(Object object, String value) {
        if (object instanceof GeoPlatform)
            ((GeoPlatform) object).setLastUpdateDate(value);
        else if (object instanceof GeoSeries)
            ((GeoSeries) object).setLastUpdateDate(value);
        else if (object instanceof GeoSample) ((GeoSample) object).setLastUpdateDate(value);
    }

    /**
     * Parse the column identifier strings from a GDS or GSE file.
     * <p>
     * In GSE files, in a 'platform' section, these become column descriptions for the platform descriptors.
     * <p>
     * For samples in GSE files, they become values for the data in the sample. For example
     * 
     * <pre>
     * #ID_REF = probe id
     * #VALUE = RMA value
     * </pre>
     * <p>
     * FIXME For subsets, these lines are ignored (do they even occur?). In 'series' sections of GSE files, the data are
     * kept (but does this occur?) .
     * <p>
     * In GDS files, if we are in a 'dataset' section, these become "titles" for the samples if they aren't already
     * provided. Here is an example.
     * 
     * <pre>
     * #GSM549 = Value for GSM549: lexA vs. wt, before UV treatment, MG1655; src: 0' wt, before UV treatment, 25 ug total RNA, 2 ug pdN6<->0' lexA, before UV 25 ug total RNA, 2 ug pdN6
     * #GSM542 = Value for GSM542: lexA 20' after NOuv vs. 0', MG1655; src: 0', before UV treatment, 25 ug total RNA, 2 ug pdN6<->lexA 20 min after NOuv, 25 ug total RNA, 2 ug pdN6
     * #GSM543 = Value for GSM543: lexA 60' after NOuv vs.
0', MG1655; src: 0', before UV treatment, 25 ug total RNA, 2 ug pdN6<->lexA 60 min after NOuv, 25 ug total RNA, 2 ug pdN6 * </pre> * * @param line */ private void parseColumnIdentifier(String line) { if (inPlatform) { extractColumnIdentifier(line, currentPlatform()); } else if (inSample) { if (!processPlatformsOnly) { extractColumnIdentifier(line, currentSample()); } } else if (inSeries) { if (!processPlatformsOnly) extractColumnIdentifier(line, currentSeries()); } else if (inSubset) { // nothing. } else if (inDataset) { if (processPlatformsOnly) return; /* * Datasets give titles to samples that sometimes differ from the ones given in the GSE files. Sometimes * these are useful to keep around (for matching across data sets), so we store it in an "auxiliary" title. */ extractColumnIdentifier(line, currentDataset()); Map<String, String> res = extractKeyValue(line); String potentialSampleAccession = res.keySet().iterator().next(); String potentialTitle = res.get(potentialSampleAccession); // First add the sample if we haven't seen it before. if (potentialSampleAccession.startsWith("GSM") && !results.getSampleMap().containsKey(potentialSampleAccession)) { this.addNewSample(potentialSampleAccession); } // Set the titleInDataset if (potentialSampleAccession.startsWith("GSM") && !StringUtils.isBlank(potentialTitle)) { potentialTitle = potentialTitle.substring(potentialTitle.indexOf(':') + 2); // throw out the sampleSet(potentialSampleAccession, "titleInDataset", potentialTitle); } } else { throw new IllegalStateException("Wrong state to deal with '" + line + "'"); } } /** * Parse a line in a 'dataset' section of a GDS file. This is metadata about the experiment. 
*
     * The flag at the start of the line (compared case-insensitively, by prefix) selects what is done with the value.
     * Because matching is by prefix, longer flags must be tested before the flags they start with (e.g.
     * "!dataset_platform_organism" before "!dataset_platform"). Nothing is done when only platforms are being
     * processed.
     * 
     * @param line the complete line
     * @param value the value already extracted from the line
     */
    private void parseDatasetLine(String line, String value) {
        if (this.processPlatformsOnly) return;

        /*
         * DATASET
         */
        if (startsWithIgnoreCase(line, "!Dataset_title")) {
            datasetSet(currentDatasetAccession, "title", value);
        } else if (startsWithIgnoreCase(line, "!dataset_description")) {
            // NOTE(review): this stores the description in "title", replacing any title set before. Confirm whether
            // a separate description property was intended.
            datasetSet(currentDatasetAccession, "title", value);
        } else if (startsWithIgnoreCase(line, "!dataset_maximum_probes")) {
            datasetSet(currentDatasetAccession, "numProbes", value);
        } else if (startsWithIgnoreCase(line, "!dataset_order")) {
            datasetSet(currentDatasetAccession, "order", value);
        } else if (startsWithIgnoreCase(line, "!dataset_organism")) { // note, no longer used?
            datasetSet(currentDatasetAccession, "organism", value);
        } else if (startsWithIgnoreCase(line, "!dataset_platform_organism")) {
            // redundant, we get this from the series
            // results.getPlatformMap().get( currentDatasetPlatformAccession ).addToOrganisms( value );
        } else if (startsWithIgnoreCase(line, "!dataset_platform_technology_type")) {
            // results.getPlatformMap().get( currentDatasetPlatformAccession ).setTechnology( value );
            // we also get this from the platform directly.
            //
            // NOTE(review): a second branch for this same flag, further down the chain, used to set "platformType"
            // but could never be reached because this one always matches first. It has been removed; the effective
            // behavior (ignore the line) is unchanged.
        } else if (startsWithIgnoreCase(line, "!dataset_platform")) {
            if (!results.getPlatformMap().containsKey(value)) {
                results.getPlatformMap().put(value, new GeoPlatform());
                results.getPlatformMap().get(value).setGeoAccession(value);
            }
            results.getDatasetMap().get(currentDatasetAccession).setPlatform(results.getPlatformMap().get(value));
            // currentDatasetPlatformAccession = value;
        } else if (startsWithIgnoreCase(line, "!dataset_probe_type")) { // obsolete
            datasetSet(currentDatasetAccession, "platformType", value);
        } else if (startsWithIgnoreCase(line, "!dataset_reference_series")) {
            if (!results.getSeriesMap().containsKey(value)) {
                log.debug("Adding series " + value);
                results.getSeriesMap().put(value, new GeoSeries());
                results.getSeriesMap().get(value).setGeoAccession(value);
            }

            // FIXME this is really a bug: the same series comes up more than once, but empty in some case.
            GeoSeries series = results.getSeriesMap().get(value);
            // Log only when the dataset really does reference the series already (the test used to be inverted).
            if (results.getDatasetMap().get(currentDatasetAccession).getSeries().contains(series)) {
                log.debug(currentDatasetAccession + " already has reference to series " + value);
            }

            if (series.getSamples() != null && series.getSamples().size() > 0) {
                results.getDatasetMap().get(currentDatasetAccession).addSeries(series);
            } else {
                log.warn("Empty series " + series);
            }
        } else if (startsWithIgnoreCase(line, "!dataset_total_samples")) {
            datasetSet(currentDatasetAccession, "numSamples", value);
        } else if (startsWithIgnoreCase(line, "!dataset_sample_count")) {
            // is this the same as "total_samples"?
            datasetSet(currentDatasetAccession, "numSamples", value);
        } else if (startsWithIgnoreCase(line, "!dataset_update_date")) {
            datasetSet(currentDatasetAccession, "updateDate", value);
        } else if (startsWithIgnoreCase(line, "!dataset_value_type")) {
            datasetSet(currentDatasetAccession, "valueType", value);
        } else if (startsWithIgnoreCase(line, "!dataset_completeness")) {
            datasetSet(currentDatasetAccession, "completeness", value);
        } else if (startsWithIgnoreCase(line, "!dataset_experiment_type")) {
            // this is now "platform type"? in new GEO files?
            datasetSet(currentDatasetAccession, "experimentType", value);
        } else if (startsWithIgnoreCase(line, "!dataset_type")) {
            datasetSet(currentDatasetAccession, "datasetType", value);
        } else if (startsWithIgnoreCase(line, "!dataset_feature_count")) {
            datasetSet(currentDatasetAccession, "featureCount", value);
        } else if (startsWithIgnoreCase(line, "!dataset_sample_organism")) {
            // note, redundant with 'organism'.
            datasetSet(currentDatasetAccession, "organism", value);
        } else if (startsWithIgnoreCase(line, "!dataset_sample_type")) {
            datasetSet(currentDatasetAccession, "sampleType", value);
        } else if (startsWithIgnoreCase(line, "!dataset_pubmed_id")) {
            datasetSet(currentDatasetAccession, "pubmedId", value);
        } else if (startsWithIgnoreCase(line, "!dataset_table_begin")) {
            this.inDatasetTable = true;
            // haveReadDatasetDataHeader = false;
        } else if (startsWithIgnoreCase(line, "!dataset_table_end")) {
            this.inDatasetTable = false;
        } else if (startsWithIgnoreCase(line, "!dataset_channel_count")) {
            datasetSet(currentDatasetAccession, "channelCount", Integer.parseInt(value));
        } else {
            log.error("Unknown flag in dataset: " + line);
        }
    }

    /** * @param line */ private void parseLine(String line) { if (StringUtils.isBlank(line)) return; if (line.startsWith("^")) { if (startsWithIgnoreCase(line, "^DATABASE")) { inDatabase = true; inSubset = false; inDataset = false; inSample = false; inPlatform = false; inSeries = false; } else if (startsWithIgnoreCase(line, "^SAMPLE")) { processedDesignElements.clear(); inSample = true; inSubset = false; inDataset = false; inDatabase = false; inPlatform = false; inSeries = false; if (this.processPlatformsOnly) return; String value = extractValue(line); currentSampleAccession = value; log.debug("Starting new sample " + value); if (results.getSampleMap().containsKey(value)) return; addNewSample(value); } else if (startsWithIgnoreCase(line, "^PLATFORM")) { inPlatform = true; inSubset = false; inDataset = false; inDatabase = false; inSample = false; inSeries = false; String value = extractValue(line); currentPlatformAccession = value; if (results.getPlatformMap().containsKey(value)) return; GeoPlatform platform = new GeoPlatform(); platform.setGeoAccession(value); results.getPlatformMap().put(value, platform); log.info("Starting platform " + platform); } else if (startsWithIgnoreCase(line, "^SERIES")) { inSeries = true; inSubset = false; inDataset = false; inPlatform = false; inSample = false; inDatabase = false; 
if (this.processPlatformsOnly) return; String value = extractValue(line); currentSeriesAccession = value; if (results.getSeriesMap().containsKey(value)) return; GeoSeries series = new GeoSeries(); series.setGeoAccession(value); results.getSeriesMap().put(value, series); log.debug("In series " + series); } else if (startsWithIgnoreCase(line, "^DATASET")) { inDataset = true; inSubset = false; inSeries = false; inPlatform = false; inSample = false; inDatabase = false; if (this.processPlatformsOnly) return; String value = extractValue(line); currentDatasetAccession = value; if (results.getDatasetMap().containsKey(value)) return; GeoDataset ds = new GeoDataset(); ds.setGeoAccession(value); results.getDatasetMap().put(value, ds); log.debug("In dataset " + ds); } else if (startsWithIgnoreCase(line, "^SUBSET")) { inSubset = true; inDataset = false; inSeries = false; inPlatform = false; inSample = false; inDatabase = false; if (this.processPlatformsOnly) return; String value = extractValue(line); currentSubsetAccession = value; if (results.getSubsetMap().containsKey(value)) return; GeoSubset ss = new GeoSubset(); ss.setGeoAccession(value); ss.setOwningDataset(results.getDatasetMap().get(this.currentDatasetAccession)); results.getDatasetMap().get(this.currentDatasetAccession).addSubset(ss); results.getSubsetMap().put(value, ss); log.debug("In subset " + ss); } else { log.error("Unknown flag in subset: " + line); } } else { parseRegularLine(line); } } /** * If a line does not have the same number of fields as the column headings, it is skipped. * * @param line */ private void parsePlatformLine(String line) { if (!haveReadPlatformHeader) { haveReadPlatformHeader = true; return; } GeoPlatform currentPlatform = results.getPlatformMap().get(currentPlatformAccession); assert currentPlatform != null; /* * Skip platform information when it is not going to be usable, unless we are ONLY parsing a platform. */ // Actually this isn't as important, since we filter out bad elements. 
// if ( !processPlatformsOnly && !currentPlatform.useDataFromGeo() ) { // return; // } String[] tokens = StringUtils.splitPreserveAllTokens(line, FIELD_DELIM); List<String> columnNames = currentPlatform.getColumnNames(); int numColumns = columnNames.size(); if (numColumns != tokens.length && numWarnings < MAX_WARNINGS) { log.warn("Wrong number of tokens in line (" + tokens.length + ", expected " + numColumns + "), line was '" + line + "'; Possible corrupt file or invalid format?"); numWarnings++; if (numWarnings == MAX_WARNINGS) { log.warn("Further warnings suppressed"); } return; } GeoPlatform platform = currentPlatform; for (int i = 0; i < tokens.length; i++) { String token = tokens[i]; String columnName = columnNames.get(i); platform.addToColumnData(columnName, token); } platformLines++; } /** * Parse a line in a 'platform' section of a GSE file. This deals with meta-data about the platform. * * @param line * @param value */ private void parsePlatformLine(String line, String value) { /*************************************************************************************************************** * PLATFORM **************************************************************************************************************/ if (startsWithIgnoreCase(line, "!Platform_title")) { platformSet(currentPlatformAccession, "title", value); } else if (startsWithIgnoreCase(line, "!Platform_geo_accession")) { currentPlatformAccession = value; } else if (startsWithIgnoreCase(line, "!Platform_status")) { platformSet(currentPlatformAccession, "status", value); } else if (startsWithIgnoreCase(line, "!Platform_manufacturer")) { platformSet(currentPlatformAccession, "manufacturer", value); } else if (startsWithIgnoreCase(line, "!Platform_manufacture_protocol")) { platformSet(currentPlatformAccession, "manufactureProtocol", value); } else if (startsWithIgnoreCase(line, "!Platform_submission_date")) { platformSet(currentPlatformAccession, "submissionDate", value); } else if 
(startsWithIgnoreCase(line, "!Platform_technology")) { platformSet(currentPlatformAccession, "technology", value); } else if (startsWithIgnoreCase(line, "!Platform_distribution")) { platformSet(currentPlatformAccession, "distribution", value); } else if (startsWithIgnoreCase(line, "!Platform_organism")) { platformAddTo(currentPlatformAccession, "organisms", value); } else if (startsWithIgnoreCase(line, "!Platform_description")) { platformAddTo(currentPlatformAccession, "description", value); } else if (startsWithIgnoreCase(line, "!Platform_contact_name")) { platformContactSet(currentPlatformAccession, "name", value); } else if (startsWithIgnoreCase(line, "!Platform_contact_email")) { platformContactSet(currentPlatformAccession, "email", value); } else if (startsWithIgnoreCase(line, "!Platform_contact_institute")) { platformContactSet(currentPlatformAccession, "institute", value); } else if (startsWithIgnoreCase(line, "!Platform_contact_laboratory")) { platformContactSet(currentPlatformAccession, "laboratory", value); } else if (startsWithIgnoreCase(line, "!Platform_contact_department")) { platformContactSet(currentPlatformAccession, "department", value); } else if (startsWithIgnoreCase(line, "!Platform_contact_address")) { // may not be used any more. 
platformContactSet(currentPlatformAccession, "address", value); } else if (startsWithIgnoreCase(line, "!Platform_contact_city")) { platformContactSet(currentPlatformAccession, "city", value); } else if (startsWithIgnoreCase(line, "!Platform_contact_zip/postal_code")) { platformContactSet(currentPlatformAccession, "postCode", value); } else if (startsWithIgnoreCase(line, "!Platform_contact_state")) { platformContactSet(currentPlatformAccession, "state", value); } else if (startsWithIgnoreCase(line, "!Platform_contact_country")) { platformContactSet(currentPlatformAccession, "country", value); } else if (startsWithIgnoreCase(line, "!Platform_contact_phone")) { platformContactSet(currentPlatformAccession, "phone", value); } else if (startsWithIgnoreCase(line, "!Platform_contact_web_link")) { platformContactSet(currentPlatformAccession, "webLink", value); } else if (startsWithIgnoreCase(line, "!Platform_support")) { // use this (maybe) } else if (startsWithIgnoreCase(line, "!Platform_coating")) { // use this (maybe) } else if (startsWithIgnoreCase(line, "!Platform_contact_fax")) { platformContactSet(currentSeriesAccession, "fax", value); } else if (startsWithIgnoreCase(line, "!Platform_web_link")) { platformSet(currentPlatformAccession, "webLink", value); } else if (startsWithIgnoreCase(line, "!Platform_sample_id")) { platformSet(currentPlatformAccession, "id", value); } else if (startsWithIgnoreCase(line, "!Platform_table_begin")) { inPlatformTable = true; haveReadPlatformHeader = false; } else if (startsWithIgnoreCase(line, "!Platform_table_end")) { inPlatformTable = false; } else if (startsWithIgnoreCase(line, "!Platform_contributor")) { // noop. This is the name of the person who submitted the platform. } else if (startsWithIgnoreCase(line, "!Platform_series_id")) { // no-op. This identifies which series were run on this platform. We don't care to get this // information this way. } else if (startsWithIgnoreCase(line, "!Platform_data_row_count")) { // nothing. 
However, if this is zero, we might be able to skip later steps. } else if (startsWithIgnoreCase(line, "!Platform_catalog_number")) { // do nothing } else if (startsWithIgnoreCase(line, "!Platform_last_update_date")) { platformLastUpdateDate(currentPlatformAccession, value); } else if (startsWithIgnoreCase(line, "!Platform_supplementary_file")) { platformSupplementaryFileSet(currentPlatformAccession, value); } else if (startsWithIgnoreCase(line, "!Platform_pubmed_id")) { // do nothing. for now. } else if (startsWithIgnoreCase(line, "!Platform_relation")) { // no op for now. Links to other platforms this is derived from. } else if (startsWithIgnoreCase(line, "!Platform_taxid")) { // no op for now.... } else { log.error("Unknown flag in platform: " + line); } } /** * Parse lines in GSE and GDS files. Lines are classified into three types: * <ul> * <li>Starting with "!". These indicate meta data. * <li>Starting with "#". These indicate descriptions of columns in a data table. * <li>Starting with anything else, primarily (only?) data tables (expression data or platform probe annotations). * </ul> * * @param line */ private void parseRegularLine(String line) { if (line.startsWith("!")) { String value = extractValue(line); if (inSample) { parseSampleLine(line, value); } else if (inSeries) { parseSeriesLine(line, value); } else if (inDatabase) { // we are going to ignore these lines. } else if (inPlatform) { parsePlatformLine(line, value); } else if (inDataset) { inDatasetTable = true; parseDatasetLine(line, value); } else if (inSubset) { parseSubsetLine(line, value); } else { throw new IllegalStateException("Unknown flag: " + line); } } else if (line.startsWith("#")) { parseColumnIdentifier(line); } else { if (inPlatformTable) { parsePlatformLine(line); } else if (inSampleTable) { parseSampleDataLine(line); } else if (inSeriesTable) { // we ignore this and use the sample data instead. 
// parseSeriesDataLine( line ); } else if (inDatasetTable) { // parseDataSetDataLine( line ); // we ignore this and use the sample data instead. } else if (inSubset) { // do nothing. } else { // throw new IllegalStateException( "Wrong state to deal with '" + line + "'" ); } } } /** * The data for one sample is all the values for each quantitation type. * <p> * Important implementation note: In the sample table sections of GSEXXX_family files, the first column is always * ID_REF, according to the kind folks at NCBI. If this changes, this code will BREAK. * <p> * Similarly, the column names between the different samples are not necessarily the same, but we trust that they * all refer to the same quantitation types in the same order, for a given platform. That is, the nth column for * this sample 'means' the same thing as the nth column for another sample in this series (on the same platform). If * that isn't true, this will be BROKEN. However, we do try to sort it out if we can. * * @param line * @see initializeQuantitationTypes */ private void parseSampleDataLine(String line) { if (StringUtils.isBlank(line)) return; if (!haveReadSampleDataHeader) { haveReadSampleDataHeader = true; previousNumTokens = null; initializeQuantitationTypes(); return; } GeoSample sample = results.getSampleMap().get(currentSampleAccession); /* * skip this step if it's not a supported platform type (RNA-seq, exon arrays: we put the data in later) */ if (!sample.hasUsableData()) { return; } String[] tokens = StringUtils.splitPreserveAllTokens(line, FIELD_DELIM); assert tokens != null; /* * This can happen in some files that are mildly corrupted. -- we have to ignore it. 
*/ if (tokens.length <= 1 && numWarnings < MAX_WARNINGS) { log.error("Parse error, sample data line has too few elements (" + tokens.length + "), line was '" + line + "'"); numWarnings++; if (numWarnings == MAX_WARNINGS) { log.warn("Further warnings suppressed"); } return; } if (previousNumTokens != null && tokens.length != previousNumTokens) { log.warn("Last line had " + (previousNumTokens - 1) + " quantitation types, this one has " + (tokens.length - 1)); } previousNumTokens = tokens.length; if (results.getSeriesMap().get(currentSeriesAccession) == null) { return; // this happens if we are parsing a GPL file. } GeoPlatform platformForSample = sample.getPlatforms().iterator().next(); // slow GeoValues values = results.getSeriesMap().get(currentSeriesAccession).getValues(); String designElement = tokens[0]; // ID_REF. For bug 1709, adding toLower() will fix this. Map<Integer, Integer> map = quantitationTypeTargetColumn.get(platformForSample); for (int i = 1; i < tokens.length; i++) { String value = tokens[i]; int qtIndex = i - 1; /* * This map tells us which column this quantitation type is SUPPOSED to go in. */ if (map.containsKey(qtIndex)) qtIndex = map.get(qtIndex); if (!isWantedQuantitationType(qtIndex)) { continue; } if (log.isTraceEnabled()) { log.trace("Adding: " + value + " to quantitationType " + (qtIndex) + " for " + designElement); } values.addValue(sample, qtIndex, designElement, value); processedDesignElements.add(designElement); } sampleDataLines++; } /** * Parse a line from a sample section of a GSE file. These contain details about the samples and the 'raw' data for * the sample. 
* * @param line * @param value */ private void parseSampleLine(String line, String value) { if (this.processPlatformsOnly) return; /*************************************************************************************************************** * SAMPLE **************************************************************************************************************/ if (startsWithIgnoreCase(line, "!sample_table_begin")) { inSampleTable = true; haveReadSampleDataHeader = false; } else if (startsWithIgnoreCase(line, "!sample_table_end")) { checkDataCompleteness(); inSampleTable = false; } else if (startsWithIgnoreCase(line, "!Sample_title")) { if (this.inDataset) { sampleSet(currentSampleAccession, "titleInDataset", value); } else { sampleSet(currentSampleAccession, "title", value); } } else if (startsWithIgnoreCase(line, "!Sample_geo_accession")) { currentSampleAccession = value; if (!results.getSampleMap().containsKey(currentSampleAccession)) { log.debug("New sample " + currentSampleAccession); results.getSampleMap().put(currentSampleAccession, new GeoSample()); } } else if (startsWithIgnoreCase(line, "!Sample_status")) { sampleSet(currentSampleAccession, "status", value); } else if (startsWithIgnoreCase(line, "!Sample_submission_date")) { sampleSet(currentSampleAccession, "submissionDate", value); } else if (startsWithIgnoreCase(line, "!Sample_channel_count")) { int numExtraChannelsNeeded = Integer.parseInt(value) - 1; for (int i = 0; i < numExtraChannelsNeeded; i++) { results.getSampleMap().get(currentSampleAccession).addChannel(); } sampleSet(currentSampleAccession, "channelCount", Integer.parseInt(value)); } else if (startsWithIgnoreCase(line, "!Sample_source_name")) { int channel = extractChannelNumber(line); sampleChannelSet(currentSampleAccession, "sourceName", channel, value); } else if (startsWithIgnoreCase(line, "!Sample_organism")) { int channel = extractChannelNumber(line); sampleChannelSet(currentSampleAccession, "organism", channel, value); } else if 
(startsWithIgnoreCase(line, "!Sample_biomaterial_provider")) { int channel = extractChannelNumber(line); sampleChannelSet(currentSampleAccession, "bioMaterialProvider", channel, value); } else if (startsWithIgnoreCase(line, "!Sample_treatment_protocol")) { int channel = extractChannelNumber(line); sampleChannelAddTo(currentSampleAccession, "treatmentProtocol", channel, value); } else if (startsWithIgnoreCase(line, "!Sample_molecule")) { int channel = extractChannelNumber(line); sampleChannelSet(currentSampleAccession, "molecule", channel, value); } else if (startsWithIgnoreCase(line, "!Sample_growth_protocol")) { int channel = extractChannelNumber(line); sampleChannelAddTo(currentSampleAccession, "growthProtocol", channel, value); } else if (startsWithIgnoreCase(line, "!sample_extract_protocol")) { int channel = extractChannelNumber(line); sampleChannelAddTo(currentSampleAccession, "extractProtocol", channel, value); } else if (startsWithIgnoreCase(line, "!Sample_hyb_protocol")) { sampleAddTo(currentSampleAccession, "hybProtocol", value); } else if (startsWithIgnoreCase(line, "!Sample_scan_protocol")) { sampleAddTo(currentSampleAccession, "scanProtocol", value); } else if (startsWithIgnoreCase(line, "!Sample_data_processing")) { sampleAddTo(currentSampleAccession, "dataProcessing", value); } else if (startsWithIgnoreCase(line, "!Sample_description")) { sampleAddTo(currentSampleAccession, "description", value); } else if (startsWithIgnoreCase(line, "!Sample_label_protocol")) { int channel = extractChannelNumber(line); sampleChannelSet(currentSampleAccession, "labelProtocol", channel, value); } else if (startsWithIgnoreCase(line, "!Sample_label")) { int channel = extractChannelNumber(line); sampleChannelSet(currentSampleAccession, "label", channel, value); } else if (startsWithIgnoreCase(line, "!Sample_characteristics")) { int channel = extractChannelNumber(line); GeoSample sample = results.getSampleMap().get(currentSampleAccession); 
sample.getChannel(channel).addCharacteristic(value); } else if (startsWithIgnoreCase(line, "!Sample_platform_id")) { sampleSet(currentSampleAccession, "id", value); if (results.getPlatformMap().containsKey(value)) { results.getSampleMap().get(currentSampleAccession).addPlatform(results.getPlatformMap().get(value)); } } else if (startsWithIgnoreCase(line, "!Sample_contact_name")) { sampleContactSet(currentSampleAccession, "name", value); } else if (startsWithIgnoreCase(line, "!Sample_contact_email")) { sampleContactSet(currentSampleAccession, "email", value); } else if (startsWithIgnoreCase(line, "!Sample_contact_institute")) { sampleContactSet(currentSampleAccession, "institute", value); } else if (startsWithIgnoreCase(line, "!Sample_contact_laboratory")) { sampleContactSet(currentSampleAccession, "laboratory", value); } else if (startsWithIgnoreCase(line, "!Sample_contact_department")) { sampleContactSet(currentSampleAccession, "department", value); } else if (startsWithIgnoreCase(line, "!Sample_contact_address")) { sampleContactSet(currentSampleAccession, "address", value); } else if (startsWithIgnoreCase(line, "!Sample_contact_city")) { sampleContactSet(currentSampleAccession, "city", value); } else if (startsWithIgnoreCase(line, "!Sample_contact_state")) { sampleContactSet(currentSampleAccession, "state", value); } else if (startsWithIgnoreCase(line, "!Sample_contact_country")) { sampleContactSet(currentSampleAccession, "country", value); } else if (startsWithIgnoreCase(line, "!Sample_contact_zip/postal_code")) { sampleContactSet(currentSampleAccession, "postCode", value); } else if (startsWithIgnoreCase(line, "!Sample_contact_phone")) { sampleContactSet(currentSampleAccession, "phone", value); } else if (startsWithIgnoreCase(line, "!Sample_contact_web_link")) { sampleContactSet(currentSampleAccession, "webLink", value); } else if (startsWithIgnoreCase(line, "!Sample_contact_fax")) { sampleContactSet(currentSeriesAccession, "fax", value); } else if 
(startsWithIgnoreCase(line, "!Sample_series_id")) { if (results.getSeriesMap().containsKey(value)) { results.getSeriesMap().get(value).addSample(results.getSampleMap().get(currentSampleAccession)); } seriesSet(currentSeriesAccession, "seriesId", value); results.getSampleMap().get(currentSampleAccession).addSeriesAppearsIn(value); } else if (startsWithIgnoreCase(line, "!Sample_supplementary_file")) { sampleSupplementaryFileSet(currentSampleAccession, value); } else if (startsWithIgnoreCase(line, "!Sample_last_update_date")) { sampleLastUpdateDate(currentSampleAccession, value); } else if (startsWithIgnoreCase(line, "!Sample_data_row_count")) { if (value.equals("0")) { /* * Empty sample, we won't get any data and this messes things up later. */ log.warn("No data for sample " + currentSampleAccession); initializeQuantitationTypes(); checkDataCompleteness(); // because we don't get the table_end. } } else if (startsWithIgnoreCase(line, "!Sample_type")) { sampleTypeSet(currentSampleAccession, value); } else if (startsWithIgnoreCase(line, "!Sample_comment")) { // noop. } else if (startsWithIgnoreCase(line, "!Sample_taxid_ch")) { // noop. } else if (startsWithIgnoreCase(line, "!Sample_relation")) { // noop, for now. Example is "!Sample_relation = Reanalyzed by: GSE26971" in GSE12093 } else if (startsWithIgnoreCase(line, "!Sample_instrument_model")) { // e.g. Illumina HiSeq 2000 } else if (startsWithIgnoreCase(line, "!Sample_library_selection")) { // e.g. 'cDNA' } else if (startsWithIgnoreCase(line, "!Sample_library_source")) { // e.g. 'transcriptomic' } else if (startsWithIgnoreCase(line, "!Sample_library_strategy")) { // e.g. 'RNA-seq' if (value.equals("RNA-seq")) { sampleSet(currentSampleAccession, "mightNotHaveDataInFile", true); } } else if (startsWithIgnoreCase(line, "!Sample_anchor")) { // e.g. 
NlaIII for SAGE } else if (startsWithIgnoreCase(line, "!Sample_tag_length")) { // SAGE } else if (startsWithIgnoreCase(line, "!Sample_tag_count")) { // SAGE } else { log.error("Unknown flag in sample: " + line); } } /** * Parse a line from the "series" section of a GSE file. This contains annotations about the series. * * @param line * @param value */ private void parseSeriesLine(String line, String value) { if (this.processPlatformsOnly) return; /*************************************************************************************************************** * SERIES **************************************************************************************************************/ if (startsWithIgnoreCase(line, "!Series_title")) { seriesSet(currentSeriesAccession, "title", value); } else if (startsWithIgnoreCase(line, "!Series_geo_accession")) { currentSeriesAccession = value; } else if (startsWithIgnoreCase(line, "!Series_status")) { seriesSet(currentSeriesAccession, "status", value); } else if (startsWithIgnoreCase(line, "!Series_submission_date")) { seriesSet(currentSeriesAccession, "submissionDate", value); } else if (startsWithIgnoreCase(line, "!Series_pubmed_id")) { seriesAddTo(currentSeriesAccession, "pubmedIds", value); } else if (startsWithIgnoreCase(line, "!Series_overall_design")) { // FIXME add support for this description. 
} else if (startsWithIgnoreCase(line, "!Series_relation")) { if (value.toLowerCase().startsWith("superseries")) { log.info(" ** SuperSeries detected **"); seriesSet(currentSeriesAccession, "isSuperSeries", true); } else if (value.toLowerCase().startsWith("subseries")) { log.info(" ** Subseries detected **"); seriesSet(currentSeriesAccession, "isSubSeries", true); } } else if (startsWithIgnoreCase(line, "!Series_summary")) { if (value.toLowerCase().startsWith("this superseries")) { log.info(" ** SuperSeries detected **"); seriesSet(currentSeriesAccession, "isSuperSeries", true); } else if (value.toLowerCase().startsWith("gse") && results.getSeriesMap().get(currentSeriesAccession).isSuperSeries()) { String[] fields = value.split(":", 2); if (fields.length != 2) { throw new IllegalStateException("Expected a colon in " + value); } results.getSeriesMap().get(currentSeriesAccession).addSubSeries(fields[0]); } else if (value.toLowerCase().contains("keyword")) { String keyword = extractValue(value); seriesAddTo(currentSeriesAccession, "keyWords", keyword); } else { seriesAddTo(currentSeriesAccession, "summary", value); } } else if (startsWithIgnoreCase(line, "!Series_type")) { seriesSet(currentSeriesAccession, "overallDesign", value); } else if (startsWithIgnoreCase(line, "!Series_contributor")) { GeoContact contributer = new GeoContact(); String[] nameFields = StringUtils.split(value, ","); contributer.setName(StringUtils.join(nameFields, " ")); results.getSeriesMap().get(currentSeriesAccession).addContributer(contributer); } else if (startsWithIgnoreCase(line, "!Series_sample_id")) { addSeriesSample(value); } else if (startsWithIgnoreCase(line, "!Series_contact_name")) { seriesContactSet(currentSeriesAccession, "name", value); } else if (startsWithIgnoreCase(line, "!Series_contact_email")) { seriesContactSet(currentSeriesAccession, "email", value); } else if (startsWithIgnoreCase(line, "!Series_contact_institute")) { seriesContactSet(currentSeriesAccession, "institute", 
value); } else if (startsWithIgnoreCase(line, "!Series_contact_laboratory")) { seriesContactSet(currentSeriesAccession, "laboratory", value); } else if (startsWithIgnoreCase(line, "!Series_contact_department")) { seriesContactSet(currentSeriesAccession, "department", value); } else if (startsWithIgnoreCase(line, "!Series_contact_address")) { // may not be used any longer. seriesContactSet(currentSeriesAccession, "address", value); } else if (startsWithIgnoreCase(line, "!Series_contact_state")) { // new seriesContactSet(currentSeriesAccession, "state", value); } else if (startsWithIgnoreCase(line, "!Series_contact_zip/postal_code")) { // new seriesContactSet(currentSeriesAccession, "postCode", value); } else if (startsWithIgnoreCase(line, "!Series_contact_country")) { // new seriesContactSet(currentSeriesAccession, "country", value); } else if (startsWithIgnoreCase(line, "!Series_contact_city")) { seriesContactSet(currentSeriesAccession, "city", value); } else if (startsWithIgnoreCase(line, "!Series_contact_phone")) { seriesContactSet(currentSeriesAccession, "phone", value); } else if (startsWithIgnoreCase(line, "!Series_contact_fax")) { seriesContactSet(currentSeriesAccession, "fax", value); } else if (startsWithIgnoreCase(line, "!Series_contact_web_link")) { seriesContactSet(currentSeriesAccession, "webLink", value); } else if (startsWithIgnoreCase(line, "!series_platform_id")) { seriesSet(currentSeriesAccession, "platformId", value); } else if (startsWithIgnoreCase(line, "!series_table_begin")) { inSeriesTable = true; } else if (startsWithIgnoreCase(line, "!series_table_end")) { inSeriesTable = false; } else if (startsWithIgnoreCase(line, "!Series_variable_description_")) { Integer variableId = new Integer(extractVariableNumber(line)); results.getSeriesMap().get(currentSeriesAccession).getVariables().get(variableId).setDescription(value); } else if (startsWithIgnoreCase(line, "!Series_variable_sample_list_")) { parseSeriesVariableSampleListLine(line, value); } 
else if (startsWithIgnoreCase(line, "!Series_variable_repeats_")) { Integer variableId = new Integer(extractVariableNumber(line)); results.getSeriesMap().get(currentSeriesAccession).getReplicates().get(variableId) .setRepeats(GeoReplication.convertStringToRepeatType(value)); } else if (startsWithIgnoreCase(line, "!Series_variable_repeats_sample_list")) { parseSeriesVariableRepeatsSampleListLine(line, value); } else if (startsWithIgnoreCase(line, "!Series_web_link")) { // seriesSet( currentSeriesAccession, "platformId", value ); } else if (startsWithIgnoreCase(line, "!Series_variable_")) { Integer variableId = extractVariableNumber(line); GeoVariable v = new GeoVariable(); v.setType(GeoVariable.convertStringToType(value)); results.getSeriesMap().get(currentSeriesAccession).addToVariables(variableId, v); } else if (startsWithIgnoreCase(line, "!Series_supplementary_file")) { seriesSupplementaryFileSet(currentSeriesAccession, value); } else if (startsWithIgnoreCase(line, "!Series_last_update_date")) { seriesLastUpdateDate(currentSeriesAccession, value); } else if (startsWithIgnoreCase(line, "!Series_citation")) { // no-op. This should be redundant with the pubmed info and is hard to parse anyway } else if (startsWithIgnoreCase(line, "!Series_platform_taxid")) { // no-op for now } else if (startsWithIgnoreCase(line, "!Series_sample_taxid")) { // no-op for now. 
} else { log.error("Unknown flag in series: " + line); } } /** * @param line * @param value */ private void parseSeriesVariableRepeatsSampleListLine(String line, String value) { Integer variableId = extractVariableNumber(line); GeoReplication var = currentSeries().getReplicates().get(variableId); Collection<String> samples = Arrays.asList(StringUtils.split(value, ", ")); for (String string : samples) { GeoSample sam = results.getSampleMap().get(string); var.addToRepeatsSampleList(sam); sam.addReplication(var); } } /** * @param line * @param value */ private void parseSeriesVariableSampleListLine(String line, String value) { Integer variableId = extractVariableNumber(line); GeoVariable var = currentSeries().getVariables().get(variableId); Collection<String> samples = Arrays.asList(StringUtils.split(value, ",")); for (String string : samples) { GeoSample sam = results.getSampleMap().get(string); var.addToVariableSampleList(sam); sam.addVariable(var); } } /** * Parse a line from a "subset" section of a GDS file. This section contains information about experimental subsets * within a dataset. These usually correspond to different factor values such as "drug-treated" vs. "placebo". * * @param line * @param value */ private void parseSubsetLine(String line, String value) { /*************************************************************************************************************** * SUBSET **************************************************************************************************************/ if (startsWithIgnoreCase(line, "!Dataset_title")) { subsetSet(currentSubsetAccession, "title", value); } else if (startsWithIgnoreCase(line, "!subset_dataset_id")) { subsetSet(currentSubsetAccession, "dataSet", value); } else if (startsWithIgnoreCase(line, "!subset_description")) { subsetAddTo(currentSubsetAccession, "description", value); } else if (startsWithIgnoreCase(line, "!subset_sample_id")) { // This should yield a list of samples we have already seen. 
String[] values = value.split(","); for (int i = 0; i < values.length; i++) { String sampleAccession = values[i]; if (!results.getSampleMap().containsKey(sampleAccession)) { addNewSample(sampleAccession); } if (log.isDebugEnabled()) log.debug("Adding sample: " + sampleAccession + " to subset " + currentSubsetAccession); results.getSubsetMap().get(currentSubsetAccession) .addSample(results.getSampleMap().get(sampleAccession)); } } else if (startsWithIgnoreCase(line, "!subset_type")) { subsetSet(currentSubsetAccession, "type", value); } else { log.error("Unknown flag: " + line); } } /** * @param accession * @param property * @param value */ private void platformAddTo(String accession, String property, Object value) { GeoPlatform platform = results.getPlatformMap().get(accession); if (platform == null) throw new IllegalArgumentException("Unknown platform " + accession); addTo(platform, property, value); } /** * @param accession * @param property * @param value */ private void platformContactSet(String accession, String property, Object value) { GeoPlatform platform = results.getPlatformMap().get(accession); contactSet(platform, property, value); } /** * @param accession * @param value */ private void platformLastUpdateDate(String accession, String value) { GeoPlatform platform = results.getPlatformMap().get(accession); lastUpdateDateSet(platform, value); } /** * @param accession * @param property * @param value */ private void platformSet(String accession, String property, Object value) { GeoPlatform platform = results.getPlatformMap().get(accession); if (platform == null) throw new IllegalArgumentException("Unknown platform " + accession); if (property.equals("technology")) { assert value instanceof String; value = GeoDataset.convertStringToPlatformType((String) value); } try { BeanUtils.setProperty(platform, property, value); } catch (IllegalAccessException e) { log.error(e, e); throw new RuntimeException(e); } catch (InvocationTargetException e) { log.error(e, e); 
throw new RuntimeException(e); } } /** * @param accession * @param value */ private void platformSupplementaryFileSet(String accession, String value) { GeoPlatform platform = results.getPlatformMap().get(accession); supplementaryFileSet(platform, value); } /** * @param currentSampleAccession2 * @param string * @param value */ private void sampleAddTo(String accession, String property, Object value) { GeoSample sample = results.getSampleMap().get(accession); if (sample == null) throw new IllegalArgumentException("Unknown sample " + accession); addTo(sample, property, value); } /** * @param currentSampleAccession2 * @param string * @param channel * @param value */ private void sampleChannelAddTo(String sampleAccession, String property, int channel, String value) { GeoSample sample = results.getSampleMap().get(sampleAccession); this.addTo(sample.getChannel(channel), property, value); } /** * @param currentSampleAccession2 * @param string * @param channel * @param value */ private void sampleChannelSet(String sampleAccession, String property, int channel, Object value) { GeoSample sample = results.getSampleMap().get(sampleAccession); if (property.equals("molecule")) { value = GeoChannel.convertStringToMolecule((String) value); } try { BeanUtils.setProperty(sample.getChannel(channel), property, value); } catch (IllegalAccessException e) { log.error(e, e); throw new RuntimeException(e); } catch (InvocationTargetException e) { log.error(e, e); throw new RuntimeException(e); } } /** * @param accession * @param property * @param value */ private void sampleContactSet(String accession, String property, Object value) { GeoSample sample = results.getSampleMap().get(accession); contactSet(sample, property, value); } /** * @param accession * @param value */ private void sampleLastUpdateDate(String accession, String value) { GeoSample sample = results.getSampleMap().get(accession); lastUpdateDateSet(sample, value); } /** * @param accession * @param property * @param value */ 
private void sampleSet(String accession, String property, Object value) { GeoSample sample = results.getSampleMap().get(accession); if (sample == null) throw new IllegalArgumentException("Unknown sample " + accession); try { BeanUtils.setProperty(sample, property, value); } catch (IllegalAccessException e) { throw new RuntimeException(e); } catch (InvocationTargetException e) { throw new RuntimeException(e); } } /** * @param accession * @param value */ private void sampleSupplementaryFileSet(String accession, String value) { GeoSample sample = results.getSampleMap().get(accession); supplementaryFileSet(sample, value); } /** * @param accession * @param property * @param value */ private void seriesAddTo(String accession, String property, Object value) { GeoSeries series = results.getSeriesMap().get(accession); if (series == null) throw new IllegalArgumentException("Unknown series " + accession); addTo(series, property, value); } /** * @param accession * @param property * @param value */ private void seriesContactSet(String accession, String property, Object value) { GeoSeries series = results.getSeriesMap().get(accession); contactSet(series, property, value); } /** * @param accession * @param value */ private void seriesLastUpdateDate(String accession, String value) { GeoSeries series = results.getSeriesMap().get(accession); lastUpdateDateSet(series, value); } /** * @param accession * @param property * @param value */ private void seriesSet(String accession, String property, Object value) { GeoSeries series = results.getSeriesMap().get(accession); if (series == null) throw new IllegalArgumentException("Unknown series " + accession); try { BeanUtils.setProperty(series, property, value); } catch (IllegalAccessException e) { log.error(e, e); throw new RuntimeException(e); } catch (InvocationTargetException e) { log.error(e, e); throw new RuntimeException(e); } } /** * @param accession * @param value */ private void seriesSupplementaryFileSet(String accession, String 
value) { GeoSeries series = results.getSeriesMap().get(accession); supplementaryFileSet(series, value); } /** * @param line * @param string * @return */ private boolean startsWithIgnoreCase(String string, String pattern) { // it will never be the same string. return string.regionMatches(true, 0, pattern, 0, pattern.length()); } /** * @param accession * @param property * @param value */ private void subsetAddTo(String accession, String property, Object value) { GeoSubset subset = results.getSubsetMap().get(accession); if (subset == null) throw new IllegalArgumentException("Unknown subset " + accession); addTo(subset, property, value); } /** * @param accession * @param property * @param value */ private void subsetSet(String accession, String property, Object value) { GeoSubset subset = results.getSubsetMap().get(accession); if (subset == null) throw new IllegalArgumentException("Unknown subset " + accession); if (property.equals("type")) { value = GeoVariable.convertStringToType((String) value); } try { BeanUtils.setProperty(subset, property, value); } catch (IllegalAccessException e) { log.error(e, e); throw new RuntimeException(e); } catch (InvocationTargetException e) { log.error(e, e); throw new RuntimeException(e); } } /** * @param series * @param value */ private void supplementaryFileSet(Object object, String value) { if (object instanceof GeoSeries) ((GeoSeries) object).setSupplementaryFile(value); else if (object instanceof GeoPlatform) ((GeoPlatform) object).setSupplementaryFile(value); else if (object instanceof GeoSample) ((GeoSample) object).setSupplementaryFile(value); } private void validate() { GeoValues values = results.getSeriesMap().get(currentSeriesAccession).getValues(); values.validate(); } /** * @param columnName * @param actualColumnNumber * @param qtMapForPlatform The map of * @return */ private boolean willClobberOtherQuantitationType(String columnName, int actualColumnNumber, Map<String, Integer> qtMapForPlatform) { boolean clobbers = 
false; for (String name : currentSample().getColumnNames()) { if (name.equals(columnName)) continue; if (!qtMapForPlatform.containsKey(name)) continue; Integer checkColInd = qtMapForPlatform.get(name); if (checkColInd == actualColumnNumber) { // log.warn( "Current column name " + columnName // + " is new for the current platform, would be going in the index previously occupied by " // + name ); clobbers = true; break; } } return clobbers; } }