List of usage examples for org.apache.commons.lang StringUtils splitPreserveAllTokens
public static String[] splitPreserveAllTokens(String str, String separatorChars)
Splits the provided text into an array, separators specified, preserving all tokens, including empty tokens created by adjacent separators.
From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java
@Override public void importDesign(ExpressionExperiment experiment, InputStream is, boolean dryRun) throws IOException { this.mgedOntologyService = this.ontologyService.getMgedOntologyService(); log.debug("Parsing input file"); boolean readHeader = false; BufferedReader r = new BufferedReader(new InputStreamReader(is)); String line = null;//from w ww . ja va 2 s .c o m if (mgedOntologyService == null) { throw new IllegalStateException("Please set the MGED OntologyService, thanks."); } ExperimentalDesign experimentalDesign = experiment.getExperimentalDesign(); if (!experimentalDesign.getExperimentalFactors().isEmpty()) { log.warn("Experimental design already has factors, import will add new ones"); } experimentalDesign.setDescription("Parsed from file."); List<String> experimentalFactorLines = new ArrayList<String>(); String sampleHeaderLine = ""; List<String> factorValueLines = new ArrayList<String>(); while ((line = r.readLine()) != null) { if (line.startsWith(EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR)) { experimentalFactorLines.add(line); } else if (line.startsWith("#") || StringUtils.isBlank(line)) { continue; } else if (!readHeader) { sampleHeaderLine = line; readHeader = true; } else { factorValueLines.add(line); } } String[] headerFields = StringUtils.splitPreserveAllTokens(sampleHeaderLine, "\t"); Collection<BioMaterial> experimentBioMaterials = this.bioMaterialService.findByExperiment(experiment); validateFileComponents(experimentalFactorLines, sampleHeaderLine, factorValueLines); validateExperimentalFactorFileContent(experimentalFactorLines, sampleHeaderLine); validateFactorFileContent(experimentalFactorLines.size(), factorValueLines); validateBioMaterialFileContent(experiment, experimentBioMaterials, factorValueLines); // build up the composite: create experimental factor then add the experimental value addExperimentalFactorsToExperimentalDesign(experimentalDesign, experimentalFactorLines, headerFields, factorValueLines); 
experimentalDesignService.update(experimentalDesign); // a bit tricky as there is an assumption that the first biomaterial in the bioassay set is the relevent one; // safer to use biomaterial collection returned; cannot guarantee order of objects in collection. Collection<BioMaterial> bioMaterialsWithFactorValues = addFactorValuesToBioMaterialsInExpressionExperiment( experiment, experimentBioMaterials, experimentalDesign, factorValueLines, headerFields); for (BioMaterial bioMaterial : bioMaterialsWithFactorValues) { this.bioMaterialService.update(bioMaterial); // just a debugging sanity check. BioMaterial bbm = this.bioMaterialService.load(bioMaterial.getId()); if (log.isDebugEnabled()) log.debug(bbm + ": " + bbm.getFactorValues().size() + " factor values: " + StringUtils.join(bbm.getFactorValues(), " ; ")); } }
From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Add the factor values to the biomaterial * /*from w ww . ja v a2 s. c o m*/ * @param experiment * @param experimentBioMaterials Current expression experiment's biomaterials. * @param experimentalDesign experimental design * @param factorValueLines Lines from file containing factor values and biomaterial ids * @param headerFields * @return Collection of biomaterials associated with this experiment, this is returned as the biomaterial is in a * bioassay (first one retrieved) */ private Collection<BioMaterial> addFactorValuesToBioMaterialsInExpressionExperiment( ExpressionExperiment experiment, Collection<BioMaterial> experimentBioMaterials, ExperimentalDesign experimentalDesign, List<String> factorValueLines, String[] headerFields) { log.debug("Adding factors values to biomaterials: " + experimentalDesign.getId()); Collection<ExperimentalFactor> experimentalFactorsInExperiment = experimentalDesign .getExperimentalFactors(); Collection<BioMaterial> biomaterialsWithFactorValuesInExperiment = new HashSet<BioMaterial>(); Collection<BioMaterial> seenBioMaterials = new HashSet<BioMaterial>(); Map<ExperimentalFactor, Collection<BioMaterial>> factorsAssociatedWithBioMaterials = new HashMap<ExperimentalFactor, Collection<BioMaterial>>(); for (String factorValueLine : factorValueLines) { String[] factorValueFields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t"); String externalId = null; boolean hasExternalId = headerFields[1].toUpperCase().equals("EXTERNALID"); if (hasExternalId) { externalId = factorValueFields[1]; } BioMaterial currentBioMaterial = getBioMaterialFromExpressionExperiment(experiment, experimentBioMaterials, factorValueFields[0], externalId); if (currentBioMaterial == null) { throw new IllegalStateException("No biomaterial for " + factorValueFields[0]); } if (seenBioMaterials.contains(currentBioMaterial)) { throw new IllegalArgumentException( "A biomaterial occurred more than once in the file: " + currentBioMaterial); } 
seenBioMaterials.add(currentBioMaterial); int start = 1; if (hasExternalId) { start = 2; } for (int i = start; i < factorValueFields.length; i++) { ExperimentalFactor currentExperimentalFactor = null; String currentExperimentalFactorName = StringUtils.strip(headerFields[i]); FactorValue currentFactorValue = null; String currentFactorValueValue = StringUtils.strip(factorValueFields[i]); if (StringUtils.isBlank(currentFactorValueValue)) { // Missing value. Note that catching 'NA' etc. is hard, because they could be valid strings. continue; } for (ExperimentalFactor experimentalFactor : experimentalFactorsInExperiment) { if (experimentalFactor.getName().equals(currentExperimentalFactorName)) { currentExperimentalFactor = experimentalFactor; } } if (currentExperimentalFactor == null) throw new IllegalStateException("No factor matches column " + currentExperimentalFactorName); Collection<FactorValue> factorValuesInCurrentExperimentalFactor = currentExperimentalFactor .getFactorValues(); for (FactorValue factorValue : factorValuesInCurrentExperimentalFactor) { if (factorValue.getValue().trim().equalsIgnoreCase(currentFactorValueValue.trim())) { currentFactorValue = factorValue; } } if (currentFactorValue == null) { log.error("Current factor value not found " + currentExperimentalFactor + currentFactorValueValue); } else { if (!checkForDuplicateFactorOnBioMaterial(currentBioMaterial, currentFactorValue)) { currentBioMaterial.getFactorValues().add(currentFactorValue); } else { // already got warned. } } log.debug("Added factor value " + currentFactorValue + " to biomaterial " + currentBioMaterial); biomaterialsWithFactorValuesInExperiment.add(currentBioMaterial); if (!factorsAssociatedWithBioMaterials.containsKey(currentExperimentalFactor)) { factorsAssociatedWithBioMaterials.put(currentExperimentalFactor, new HashSet<BioMaterial>()); } factorsAssociatedWithBioMaterials.get(currentExperimentalFactor).add(currentBioMaterial); } } /* * Check if every biomaterial got used. 
Worth a warning, at least. */ for (ExperimentalFactor ef : factorsAssociatedWithBioMaterials.keySet()) { if (!factorsAssociatedWithBioMaterials.get(ef).containsAll(experimentBioMaterials)) { log.warn( "File did not contain values for all factor - biomaterial combinations: Missing at least one for " + ef + " [populated " + factorsAssociatedWithBioMaterials.get(ef).size() + "/" + experimentBioMaterials.size() + " ]"); } } return biomaterialsWithFactorValuesInExperiment; }
From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Get a map of experimental values keyed on experimental factor name * //from ww w. j av a2 s. c o m * @param headerFields * @param factorValueLines * @return map of experimental factor values keyed on experimental factor */ private Map<String, Set<String>> getMapFactorSampleValues(String[] headerFields, List<String> factorValueLines) { Map<String, Set<String>> factorSampleValues = new HashMap<String, Set<String>>(); for (String factorValueLine : factorValueLines) { String[] factorValueFields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t"); for (int i = 1; i < headerFields.length; i++) { // get the key String value = headerFields[i]; value = StringUtils.strip(value); String factorValue = StringUtils.strip(factorValueFields[i]); Set<String> listFactorValues = factorSampleValues.get(value); if (listFactorValues == null) { listFactorValues = new HashSet<String>(); } listFactorValues.add(factorValue); factorSampleValues.put(value, listFactorValues); } } return factorSampleValues; }
From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Check that the biomaterial is in the file and in the experiment. It is arguable whether this should be an * exception. I think it has to be to make sure that simple errors in the format are caught. But it's inconvenient * for cases where a single 'design' file is to be used for multiple microarray studies. Biomaterial ids should * match what is stored/* www.ja va 2 s .c o m*/ * * @param experiment Current experiment * @param factorValueLines Lines containing biomaterial names and their factor values */ private void validateBioMaterialFileContent(ExpressionExperiment experiment, Collection<BioMaterial> bioMaterials, List<String> factorValueLines) throws IllegalArgumentException { for (String factorValueLine : factorValueLines) { String[] vals = StringUtils.splitPreserveAllTokens(factorValueLine, '\t'); if (vals.length < 2) { throw new IllegalArgumentException( "Expected a file with at least two columns separated by tabs, got " + factorValueLine); } BioMaterial bioMaterialInFile = getBioMaterialFromExpressionExperiment(experiment, bioMaterials, vals[0], vals[1]); if (bioMaterialInFile == null) { throw new IllegalArgumentException( "The uploaded file has a biomaterial name that does not match the study: " + StringUtils.splitPreserveAllTokens(factorValueLine, "\t")[0] + " (formatted based on on input: "); } } }
From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Validates that factor values given in file for each biomaterial match the number of experimental factor values * expected./* w w w . j av a 2 s. co m*/ * * @para numberOfExperimentalFactors * @param factorValueList Represents lines of file containing factor values for a biomaterial */ private void validateFactorFileContent(Integer numberOfExperimentalFactors, List<String> factorValueList) throws IOException { for (String factorValueLine : factorValueList) { String[] fields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t"); if (fields.length > numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED) { throw new IOException( "Expected no more than " + (numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED) + " columns based on EF descriptions (plus id column), got " + fields.length); } if (fields.length <= numberOfExperimentalFactors) { throw new IOException("Expected at least " + (numberOfExperimentalFactors + 1) + " columns based on EF descriptions (plus id column), got " + fields.length); } } }
From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Validates that the sample header is correctly formatted. Checks that the experimental factors defined in the * header match those in the experimental factor file lines. * /* ww w . j a va 2 s.co m*/ * @param experimentalFactorValueNames * @param numberOfExperimentalFactors * @param sampleHeaderLine * @throws IOException Validation fails. */ private void validateSampleHeaderFileContent(Set<String> experimentalFactorValueNames, Integer numberOfExperimentalFactors, String sampleHeaderLine) throws IOException { String[] headerFields = StringUtils.splitPreserveAllTokens(sampleHeaderLine, "\t"); // we might have the ids, and the external id. if (headerFields.length > numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED) { throw new IOException("Expected " + (numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED) + " columns based on EF descriptions (plus id column), got " + headerFields.length); } for (int i = 1; i < headerFields.length; i++) { String value = headerFields[i]; value = StringUtils.strip(value); if (value.equals("ExternalID")) { // that's fine. continue; } if (!experimentalFactorValueNames.contains(value)) { throw new IOException("Expected to find an EF matching the column heading '" + value + "'"); } } }
From source file:ubic.gemma.loader.genome.FastaParser.java
/** * The following formats are supported//from w w w .j av a 2 s . c om * <ul> * <li>GenBank: gi|gi-number|gb|accession|locus * <li>EMBL Data Library : gi|gi-number|emb|accession|locus * <li>DDBJ, DNA Database of Japan : gi|gi-number|dbj|accession|locus * <li>NBRF PIR : pir||entry * <li>Protein Research Foundation : prf||name * <li>SWISS-PROT : sp|accession|name * <li>Brookhaven Protein Data Bank (1) : pdb|entry|chain * <li>Brookhaven Protein Data Bank (2) : entry:chain|PDBID|CHAIN|SEQUENCE * <li>Patents : pat|country|number * <li>GenInfo Backbone Id bbs|number * <li>General database identifier : gnl|database|identifier * <li>NCBI Reference Sequence : ref|accession|locus * <li>Local Sequence identifier : lcl|identifier * <li>NIA 15k and 7k sets : H[0-9A-Z]{1-9}-\d | alternate (example: >H4002F12-5 ) * <li>Generic: probeid * </ul> * * @param bioSequence * @param header * @return */ private boolean parseDeflineHeader(BioSequence bioSequence, String header) { // one of the genbank formats. String[] split = StringUtils.splitPreserveAllTokens(header, "|;"); String firstTag = split[0]; // assert firstTag.startsWith( ">" ); // assert firstTag.length() > 1; firstTag = StringUtils.removeStart(firstTag, ">"); // FIXME check for array lengths, throw illegal argument exceptions. if (firstTag.equals("gi")) { bioSequence.setDescription(split[4]); String genbankAcc = split[3]; // with version number, possibly DatabaseEntry genbank = ExternalDatabaseUtils.getGenbankAccession(genbankAcc); bioSequence.setName(genbank.getAccession()); // without version number. 
bioSequence.setSequenceDatabaseEntry(genbank); } else if (firstTag.equals("pir")) { bioSequence.setName(split[1]); } else if (firstTag.equals("sp")) { bioSequence.setName(split[1]); bioSequence.setDescription(split[2]); } else if (firstTag.equals("ref")) { bioSequence.setName(split[1]); bioSequence.setDescription(split[2]); } else if (firstTag.equals("lcl")) { bioSequence.setName(split[1]); } else if (firstTag.equals("pdb")) { bioSequence.setName(split[1]); bioSequence.setDescription(split[2]); } else if (firstTag.equals("gnl")) { bioSequence.setName(split[2]); } else if (firstTag.equals("entry:chain")) { bioSequence.setName(split[1]); } else if (firstTag.matches(NIA_HEADER_REGEX)) { return parseNIA(bioSequence, header); } else { // generic. bioSequence.setName(split[0]); if (split.length > 1) bioSequence.setDescription(split[1]); // log.warn( "Defline-style FASTA header in unrecognized format, started with " + firstTag ); // return false; } return true; }
From source file:ubic.gemma.loader.genome.gene.ExternalFileGeneLoaderServiceImpl.java
/**
 * Splits one line of the gene file into its tab-separated fields.
 *
 * @param line A line from the gene file
 * @return the fields of the line, or {@code null} when the line is blank or starts with "#"
 * @throws IOException if the line has fewer than 2 tab-separated columns
 */
private String[] readLine(String line) throws IOException {
    // Blank lines and comment lines carry no data.
    boolean skippable = StringUtils.isBlank(line) || line.startsWith("#");
    if (skippable) {
        return null;
    }

    String[] columns = StringUtils.splitPreserveAllTokens(line, '\t');
    if (columns.length >= 2) {
        return columns;
    }
    throw new IOException("Illegal format, expected at least 2 columns, got " + columns.length);
}
From source file:ubic.gemma.loader.genome.gene.ncbi.homology.HomologeneServiceImpl.java
/** * @param is/*w w w .jav a 2s .c om*/ * @throws IOException */ protected void parseHomologGeneFile(InputStream is) throws IOException { BufferedReader br = new BufferedReader(new InputStreamReader(is)); String line = null; while ((line = br.readLine()) != null) { if (StringUtils.isBlank(line) || line.startsWith(COMMENT_CHARACTER)) { continue; } String[] fields = StringUtils.splitPreserveAllTokens(line, DELIMITING_CHARACTER); Integer taxonId = Integer.parseInt(fields[1]); Long groupId; Long geneId; try { groupId = Long.parseLong(fields[0]); geneId = Long.parseLong(fields[2]); } catch (NumberFormatException e) { log.warn("Unparseable line from homologene: " + line); continue; } String geneSymbol = fields[3]; if (!group2Gene.containsKey(groupId)) { group2Gene.put(groupId, new ArrayList<Long>()); } group2Gene.get(groupId).add(geneId); if (!gene2Group.containsKey(geneId)) { gene2Group.put(geneId, groupId); } else { log.warn("Duplicate gene ID encountered. Skipping: geneID=" + geneId + " ,taxonID = " + taxonId + " ,geneSymbol = " + geneSymbol); } } ready.set(true); log.info("Gene Homology successfully loaded: " + gene2Group.keySet().size() + " genes covered in " + group2Gene.keySet().size() + " groups"); }
From source file:ubic.gemma.loader.genome.gene.ncbi.NcbiGene2AccessionParser.java
@Override public NCBIGene2Accession parseOneLine(String line) { String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); if (fields.length != NCBI_GENE2ACCESSION_FIELDS_PER_ROW) { throw new IllegalArgumentException("Line is not in the right format: has " + fields.length + " fields, expected " + NCBI_GENE2ACCESSION_FIELDS_PER_ROW); }//from w w w .j a v a 2 s .co m NCBIGene2Accession currentAccession = processFields(fields); if (currentAccession == null) { return null; } addResult(currentAccession); // really doesn't serve much of a purpose /* * Only some genes are relevant - for example, we might have filtered them by taxon. */ if (geneInfo != null && !geneInfo.containsKey(currentAccession.getGeneId())) { return null; } // if the current gene Id is different from this current one, then // we are done with the gene Id. Push the geneCollection into the queue. if (lastGeneId != null && !lastGeneId.equalsIgnoreCase(currentAccession.getGeneId())) { // push the gene set to the queue try { queue.put(geneData); } catch (InterruptedException e) { throw new RuntimeException(e); } // clear the gene set geneData = new NcbiGeneData(); if (geneInfo != null) geneInfo.remove(lastGeneId); } assert currentAccession.getGeneId() != null; // we're either starting a new one, or continuing with an old one. lastGeneId = currentAccession.getGeneId(); geneData.addAccession(currentAccession); geneData.setGeneInfo(geneInfo.get(currentAccession.getGeneId())); // this will be a trailing accession.? return currentAccession; }