List of usage examples for org.apache.commons.lang3.StringUtils.splitPreserveAllTokens
public static String[] splitPreserveAllTokens(final String str, final String separatorChars)
Splits the provided text into an array, separators specified, preserving all tokens, including empty tokens created by adjacent separators.
From source file:ubic.gemma.core.analysis.service.ArrayDesignAnnotationServiceImpl.java
private static Map<Long, Collection<Gene>> parseAnnotationFile(Map<Long, Collection<Gene>> results, InputStream is, Map<String, Long> probeNameToId) { try {//from ww w. j a v a 2 s . co m BufferedReader br = new BufferedReader(new InputStreamReader(is)); String line; while ((line = br.readLine()) != null) { if (StringUtils.isBlank(line) || line.startsWith(ArrayDesignAnnotationServiceImpl.COMMENT_CHARACTER)) { continue; } String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); if (fields.length < 3) continue; // means there are no gene annotations. String probeName = fields[0]; if (!probeNameToId.containsKey(probeName)) continue; Long probeId = probeNameToId.get(probeName); List<String> geneSymbols = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[1], '|')); List<String> geneNames = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[2], '|')); if (geneSymbols.size() != geneNames.size()) { ArrayDesignAnnotationServiceImpl.log.warn( "Annotation file format error: Unequal number of gene symbols and names for probe=" + probeName + ", skipping row"); continue; } List<String> gemmaGeneIds = null; List<String> ncbiIds = null; if (fields.length > 4) { // new style. fields[3] is the GO annotations. 
gemmaGeneIds = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[4], '|')); } if (fields.length > 5) { ncbiIds = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[5], '|')); } for (int i = 0; i < geneSymbols.size(); i++) { String symbol = geneSymbols.get(i); String name = geneNames.get(i); if (StringUtils.isBlank(symbol)) { continue; } String[] symbolsB = StringUtils.split(symbol, ','); String[] namesB = StringUtils.split(name, '$'); for (int j = 0; j < symbolsB.length; j++) { String s = symbolsB[j]; Gene g = Gene.Factory.newInstance(); g.setOfficialSymbol(s); try { if (gemmaGeneIds != null) { g.setId(Long.parseLong(gemmaGeneIds.get(j))); } if (ncbiIds != null) { g.setNcbiGeneId(Integer.parseInt(ncbiIds.get(j))); } } catch (NumberFormatException e) { // oh well, couldn't populate extra info. } if (namesB.length >= j + 1) { String n = namesB[j]; g.setName(n); } results.get(probeId).add(g); } } } return results; } catch (IOException e) { throw new RuntimeException(e); } }
From source file:ubic.gemma.core.loader.association.NCBIGene2GOAssociationParser.java
/** * Note that "-" means a missing value, which in practice only occurs in the "qualifier" and "pubmed" columns. * * @param line line//from w w w. j a v a2 s .com * @return Object */ @SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use public Gene2GOAssociation mapFromGene2GO(String line) { String[] values = StringUtils.splitPreserveAllTokens(line, "\t"); if (line.startsWith(NCBIGene2GOAssociationParser.COMMENT_INDICATOR)) return null; if (values.length < 8) return null; Integer taxonId; try { taxonId = Integer.parseInt(values[TAX_ID]); } catch (NumberFormatException e) { throw new RuntimeException(e); } if (!taxaNcbiIds.containsKey(taxonId)) { return null; } Gene gene = Gene.Factory.newInstance(); gene.setNcbiGeneId(Integer.parseInt(values[GENE_ID])); gene.setTaxon(taxaNcbiIds.get(taxonId)); Characteristic oe = Characteristic.Factory.newInstance(); String value = values[GO_ID].replace(":", "_"); oe.setValueUri(GeneOntologyService.BASE_GO_URI + value); oe.setValue(value); // g2GOAss.setSource( ncbiGeneDb ); GOEvidenceCode evcode = null; String evidenceCode = values[EVIDENCE_CODE]; if (!(StringUtils.isBlank(evidenceCode) || evidenceCode.equals("-"))) { if (NCBIGene2GOAssociationParser.ignoredEvidenceCodes.contains(evidenceCode)) { return null; } evcode = GOEvidenceCode.fromString(evidenceCode); } Gene2GOAssociation g2GOAss = Gene2GOAssociation.Factory.newInstance(gene, oe, evcode); try { queue.put(g2GOAss); } catch (InterruptedException e) { throw new RuntimeException(e); } return g2GOAss; }
From source file:ubic.gemma.core.loader.expression.arrayDesign.ArrayDesignParser.java
@Override public ArrayDesign parseOneLine(String line) { ArrayDesign ad = ArrayDesign.Factory.newInstance(); String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); ad.setName(fields[0]);/* w w w. j a v a 2s. co m*/ ad.setDescription(fields[5]); Taxon t = Taxon.Factory.newInstance(); t.setCommonName(fields[4].toLowerCase()); t.setIsGenesUsable(true); // assumption ad.setPrimaryTaxon(t); Contact manufacturer = Contact.Factory.newInstance(); manufacturer.setName(fields[1]); ad.setDesignProvider(manufacturer); ad.setAdvertisedNumberOfDesignElements(Integer.parseInt(fields[4])); return ad; }
From source file:ubic.gemma.core.loader.expression.arrayDesign.ArrayDesignProbeMapperServiceImpl.java
@Override public void processArrayDesign(ArrayDesign arrayDesign, Taxon taxon, File source, ExternalDatabase sourceDB, boolean ncbiIds) throws IOException { if (taxon == null && !ncbiIds) { throw new IllegalArgumentException("You must provide a taxon unless passing ncbiIds = true"); }//from ww w .j a v a 2 s . co m if (arrayDesign.getTechnologyType().equals(TechnologyType.GENELIST) || arrayDesign.getTechnologyType().equals(TechnologyType.SEQUENCING) || arrayDesign.getTechnologyType().equals(TechnologyType.OTHER)) { throw new IllegalArgumentException( "Do not use this service to process platforms that do not use an probe-based technology."); } try (BufferedReader b = new BufferedReader(new FileReader(source))) { String line; int numSkipped = 0; ArrayDesignProbeMapperServiceImpl.log.info("Removing any old associations"); arrayDesignService.deleteGeneProductAssociations(arrayDesign); while ((line = b.readLine()) != null) { if (StringUtils.isBlank(line)) { continue; } if (line.startsWith("#")) { continue; } String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); if (fields.length != 3) { throw new IOException("Illegal format, expected three columns, got " + fields.length); } String probeId = fields[0]; String seqName = fields[1]; /* * FIXME. We have to allow NCBI gene ids here. 
*/ String geneSymbol = fields[2]; if (StringUtils.isBlank(geneSymbol)) { numSkipped++; continue; } CompositeSequence c = compositeSequenceService.findByName(arrayDesign, probeId); if (c == null) { if (ArrayDesignProbeMapperServiceImpl.log.isDebugEnabled()) ArrayDesignProbeMapperServiceImpl.log .debug("No probe found for '" + probeId + "' on " + arrayDesign + ", skipping"); numSkipped++; continue; } // a probe can have more than one gene associated with it if so they are piped | Collection<Gene> geneListProbe = new HashSet<>(); // indicate multiple genes Gene geneDetails; StringTokenizer st = new StringTokenizer(geneSymbol, "|"); while (st.hasMoreTokens()) { String geneToken = st.nextToken().trim(); if (ncbiIds) { geneDetails = geneService.findByNCBIId(Integer.parseInt(geneToken)); } else { geneDetails = geneService.findByOfficialSymbol(geneToken, taxon); } if (geneDetails != null) { geneListProbe.add(geneDetails); } } if (geneListProbe.size() == 0) { ArrayDesignProbeMapperServiceImpl.log .warn("No gene(s) found for '" + geneSymbol + "' in " + taxon + ", skipping"); numSkipped++; continue; } else if (geneListProbe.size() > 1) { // this is a common situation, when the geneSymbol actually has |-separated genes, so no need to // make a // lot of fuss. ArrayDesignProbeMapperServiceImpl.log .debug("More than one gene found for '" + geneSymbol + "' in " + taxon); } BioSequence bs = c.getBiologicalCharacteristic(); if (bs != null) { if (StringUtils.isNotBlank(seqName)) { bs = bioSequenceService.thaw(bs); if (!bs.getName().equals(seqName)) { ArrayDesignProbeMapperServiceImpl.log.warn("Sequence name '" + seqName + "' given for " + probeId + " does not match existing entry " + bs.getName() + ", skipping"); numSkipped++; continue; } } // otherwise we assume everything is okay. } else { // create one based on the text provided. 
if (StringUtils.isBlank(seqName)) { ArrayDesignProbeMapperServiceImpl.log.warn( "You must provide sequence names for probes which are not already mapped. probeName=" + probeId + " had no sequence associated and no name provided; skipping"); numSkipped++; continue; } bs = BioSequence.Factory.newInstance(); bs.setName(seqName); bs.setTaxon(taxon); bs.setDescription("Imported from annotation file"); // Placeholder. bs.setType(SequenceType.OTHER); bs = bioSequenceService.create(bs); c.setBiologicalCharacteristic(bs); compositeSequenceService.update(c); } assert bs != null; assert bs.getId() != null; for (Gene gene : geneListProbe) { gene = geneService.thaw(gene); if (gene.getProducts().size() == 0) { ArrayDesignProbeMapperServiceImpl.log.warn("There are no gene products for " + gene + ", it cannot be mapped to probes. Skipping"); numSkipped++; continue; } for (GeneProduct gp : gene.getProducts()) { AnnotationAssociation association = AnnotationAssociation.Factory.newInstance(); association.setBioSequence(bs); association.setGeneProduct(gp); association.setSource(sourceDB); annotationAssociationService.create(association); } } } arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId()); this.deleteOldFiles(arrayDesign); ArrayDesignProbeMapperServiceImpl.log.info( "Completed association processing for " + arrayDesign + ", " + numSkipped + " were skipped"); } }
From source file:ubic.gemma.core.loader.expression.arrayDesign.CompositeSequenceParser.java
@Override public CompositeSequence parseOneLine(String line) { String[] tokens = StringUtils.splitPreserveAllTokens(line, '\t'); if (tokens.length != 3) { return null; }// w w w .j a v a 2 s .co m String probeid = tokens[0]; String genbankAcc = tokens[1]; String description = tokens[2]; CompositeSequence result = CompositeSequence.Factory.newInstance(); result.setName(probeid); result.setDescription(description); DatabaseEntry dbEntry = ExternalDatabaseUtils.getGenbankAccession(genbankAcc); BioSequence biologicalCharacteristic = BioSequence.Factory.newInstance(); biologicalCharacteristic.setName(genbankAcc); // this will be changed later, typically. // this will be changed later, typically. biologicalCharacteristic.setDescription(description + " (From platform source)"); biologicalCharacteristic.setSequenceDatabaseEntry(dbEntry); result.setBiologicalCharacteristic(biologicalCharacteristic); return result; }
From source file:ubic.gemma.core.loader.expression.geo.GeoFamilyParser.java
/** * If a line does not have the same number of fields as the column headings, it is skipped. * * @param line line/* w w w . j a va 2s . c o m*/ */ private void parsePlatformLine(String line) { if (!haveReadPlatformHeader) { haveReadPlatformHeader = true; return; } GeoPlatform currentPlatform = results.getPlatformMap().get(currentPlatformAccession); assert currentPlatform != null; /* * Skip platform information when it is not going to be usable, unless we are ONLY parsing a platform. */ // Actually this isn't as important, since we filter out bad elements. // if ( !processPlatformsOnly && !currentPlatform.useDataFromGeo() ) { // return; // } String[] tokens = StringUtils.splitPreserveAllTokens(line, GeoFamilyParser.FIELD_DELIM); List<String> columnNames = currentPlatform.getColumnNames(); int numColumns = columnNames.size(); if (numColumns != tokens.length && numWarnings < GeoFamilyParser.MAX_WARNINGS) { GeoFamilyParser.log.warn("Wrong number of tokens in line (" + tokens.length + ", expected " + numColumns + "), line was '" + line + "'; Possible corrupt file or invalid format?"); numWarnings++; if (numWarnings == GeoFamilyParser.MAX_WARNINGS) { GeoFamilyParser.log.warn("Further warnings suppressed"); } return; } for (int i = 0; i < tokens.length; i++) { String token = tokens[i]; String columnName = columnNames.get(i); currentPlatform.addToColumnData(columnName, token); } platformLines++; }
From source file:ubic.gemma.core.loader.expression.geo.GeoFamilyParser.java
/** * The data for one sample is all the values for each quantitation type. * Important implementation note: In the sample table sections of GSEXXX_family files, the first column is always * ID_REF, according to the kind folks at NCBI. If this changes, this code will BREAK. * Similarly, the column names between the different samples are not necessarily the same, but we trust that they * all refer to the same quantitation types in the same order, for a given platform. That is, the nth column for * this sample 'means' the same thing as the nth column for another sample in this series (on the same platform). If * that isn't true, this will be BROKEN. However, we do try to sort it out if we can. * * @param line line// www . jav a 2s . c om */ private void parseSampleDataLine(String line) { if (StringUtils.isBlank(line)) return; if (!haveReadSampleDataHeader) { haveReadSampleDataHeader = true; previousNumTokens = null; this.initializeQuantitationTypes(); return; } GeoSample sample = results.getSampleMap().get(currentSampleAccession); /* * skip this step if it's not a supported platform type (RNA-seq, exon arrays: we put the data in later) */ if (!sample.hasUsableData()) { return; } String[] tokens = StringUtils.splitPreserveAllTokens(line, GeoFamilyParser.FIELD_DELIM); assert tokens != null; /* * This can happen in some files that are mildly corrupted. -- we have to ignore it. 
*/ if (tokens.length <= 1 && numWarnings < GeoFamilyParser.MAX_WARNINGS) { GeoFamilyParser.log.error("Parse error, sample data line has too few elements (" + tokens.length + "), line was '" + line + "'"); numWarnings++; if (numWarnings == GeoFamilyParser.MAX_WARNINGS) { GeoFamilyParser.log.warn("Further warnings suppressed"); } return; } if (previousNumTokens != null && tokens.length != previousNumTokens) { GeoFamilyParser.log.warn("Last line had " + (previousNumTokens - 1) + " quantitation types, this one has " + (tokens.length - 1)); } previousNumTokens = tokens.length; if (results.getSeriesMap().get(currentSeriesAccession) == null) { return; // this happens if we are parsing a GPL file. } GeoPlatform platformForSample = sample.getPlatforms().iterator().next(); // slow GeoValues values = results.getSeriesMap().get(currentSeriesAccession).getValues(); String designElement = tokens[0]; // ID_REF. For bug 1709, adding toLower() will fix this. Map<Integer, Integer> map = quantitationTypeTargetColumn.get(platformForSample); for (int i = 1; i < tokens.length; i++) { String value = tokens[i]; int qtIndex = i - 1; /* * This map tells us which column this quantitation type is SUPPOSED to go in. */ if (map.containsKey(qtIndex)) qtIndex = map.get(qtIndex); if (!this.isWantedQuantitationType(qtIndex)) { continue; } if (GeoFamilyParser.log.isTraceEnabled()) { GeoFamilyParser.log .trace("Adding: " + value + " to quantitationType " + (qtIndex) + " for " + designElement); } values.addValue(sample, qtIndex, designElement, value); processedDesignElements.add(designElement); } sampleDataLines++; }
From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java
@Override @Transactional//from w ww . ja v a 2 s . c o m public void importDesign(ExpressionExperiment experiment, InputStream is) throws IOException { this.efoService = this.ontologyService.getExperimentalFactorOntologyService(); ExperimentalDesignImporterImpl.log.debug("Parsing input file"); boolean readHeader = false; BufferedReader r = new BufferedReader(new InputStreamReader(is)); String line; // experiment = expressionExperimentService.thawBioAssays( experiment ); ExperimentalDesign experimentalDesign = experiment.getExperimentalDesign(); if (!experimentalDesign.getExperimentalFactors().isEmpty()) { ExperimentalDesignImporterImpl.log .warn("Experimental design already has factors, import will add new ones"); } experimentalDesign.setDescription("Parsed from file."); List<String> experimentalFactorLines = new ArrayList<>(); String sampleHeaderLine = ""; List<String> factorValueLines = new ArrayList<>(); while ((line = r.readLine()) != null) { if (line.startsWith(ExperimentalDesignImporterImpl.EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR)) { experimentalFactorLines.add(line); } else if (line.startsWith("#") || StringUtils.isBlank(line)) { //noinspection UnnecessaryContinue // Better for readability continue; } else if (!readHeader) { sampleHeaderLine = line; readHeader = true; } else { factorValueLines.add(line); } } String[] headerFields = StringUtils.splitPreserveAllTokens(sampleHeaderLine, "\t"); Collection<BioMaterial> experimentBioMaterials = this.bioMaterialService.findByExperiment(experiment); this.validateFileComponents(experimentalFactorLines, sampleHeaderLine, factorValueLines); this.validateExperimentalFactorFileContent(experimentalFactorLines, sampleHeaderLine); this.validateFactorFileContent(experimentalFactorLines.size(), factorValueLines); this.validateBioMaterialFileContent(experimentBioMaterials, factorValueLines); // build up the composite: create experimental factor then add the experimental value 
this.addExperimentalFactorsToExperimentalDesign(experimentalDesign, experimentalFactorLines, headerFields, factorValueLines); assert !experimentalDesign.getExperimentalFactors().isEmpty(); assert !experiment.getExperimentalDesign().getExperimentalFactors().isEmpty(); experimentalDesignService.update(experimentalDesign); Collection<BioMaterial> bioMaterialsWithFactorValues = this .addFactorValuesToBioMaterialsInExpressionExperiment(experimentBioMaterials, experimentalDesign, factorValueLines, headerFields); for (BioMaterial bioMaterial : bioMaterialsWithFactorValues) { this.bioMaterialService.update(bioMaterial); } }
From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Add the factor values to the biomaterials * * @param experimentBioMaterials Current expression experiment's biomaterials. * @param experimentalDesign experimental design * @param factorValueLines Lines from file containing factor values and biomaterial ids * @param headerFields header fields * @return Collection of biomaterials associated with this experiment, this is returned as * the biomaterial is in a * bioassay (first one retrieved) *//*from w w w. j a va2s.c o m*/ private Collection<BioMaterial> addFactorValuesToBioMaterialsInExpressionExperiment( Collection<BioMaterial> experimentBioMaterials, ExperimentalDesign experimentalDesign, List<String> factorValueLines, String[] headerFields) { ExperimentalDesignImporterImpl.log .debug("Adding factors values to biomaterials: " + experimentalDesign.getId()); Collection<ExperimentalFactor> experimentalFactorsInExperiment = experimentalDesign .getExperimentalFactors(); Collection<BioMaterial> biomaterialsWithFactorValuesInExperiment = new HashSet<>(); Collection<BioMaterial> seenBioMaterials = new HashSet<>(); Map<ExperimentalFactor, Collection<BioMaterial>> factorsAssociatedWithBioMaterials = new HashMap<>(); for (String factorValueLine : factorValueLines) { String[] factorValueFields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t"); String externalId = null; boolean hasExternalId = headerFields[1].toUpperCase().equals("EXTERNALID"); if (hasExternalId) { externalId = factorValueFields[1]; } BioMaterial currentBioMaterial = this.getBioMaterialFromExpressionExperiment(experimentBioMaterials, factorValueFields[0], externalId); if (currentBioMaterial == null) { // this could just be due to extras. 
throw new IllegalStateException( "No biomaterial for " + factorValueFields[0] + ", " + factorValueFields[1]); } if (seenBioMaterials.contains(currentBioMaterial)) { throw new IllegalArgumentException( "A biomaterial occurred more than once in the file: " + currentBioMaterial); } seenBioMaterials.add(currentBioMaterial); int start = 1; if (hasExternalId) { start = 2; } for (int i = start; i < factorValueFields.length; i++) { ExperimentalFactor currentExperimentalFactor = null; String currentExperimentalFactorName = StringUtils.strip(headerFields[i]); FactorValue currentFactorValue = null; String currentFVtext = StringUtils.strip(factorValueFields[i]); if (StringUtils.isBlank(currentFVtext)) { // Missing value. Note that catching 'NA' etc. is hard, because they could be valid strings. continue; } for (ExperimentalFactor experimentalFactor : experimentalFactorsInExperiment) { if (experimentalFactor.getName().equals(currentExperimentalFactorName)) { currentExperimentalFactor = experimentalFactor; } } if (currentExperimentalFactor == null) throw new IllegalStateException("No factor matches column " + currentExperimentalFactorName); Collection<FactorValue> factorValuesInCurrentExperimentalFactor = currentExperimentalFactor .getFactorValues(); for (FactorValue factorValue : factorValuesInCurrentExperimentalFactor) { String fvv = factorValue.getValue(); if (StringUtils.isBlank(fvv)) { // try characteristics; this would be a mess if there are more than one. if (factorValue.getCharacteristics().size() == 1) { fvv = factorValue.getCharacteristics().iterator().next().getValue(); if (StringUtils.isBlank(fvv)) { continue; // we can't match to factor values that lack a value string. } } } if (fvv.trim().equalsIgnoreCase(currentFVtext)) { currentFactorValue = factorValue; } } /* * If we can't find the factorvalue that matches this, we don't get a value for this biomaterial. 
*/ if (currentFactorValue == null) { ExperimentalDesignImporterImpl.log.error("No factor value for " + currentExperimentalFactor + " matches the text value=" + currentFVtext); } else { if (!this.checkForDuplicateFactorOnBioMaterial(currentBioMaterial, currentFactorValue)) { currentBioMaterial.getFactorValues().add(currentFactorValue); } } ExperimentalDesignImporterImpl.log.debug( "Added factor value " + currentFactorValue + " to biomaterial " + currentBioMaterial); biomaterialsWithFactorValuesInExperiment.add(currentBioMaterial); if (!factorsAssociatedWithBioMaterials.containsKey(currentExperimentalFactor)) { factorsAssociatedWithBioMaterials.put(currentExperimentalFactor, new HashSet<BioMaterial>()); } factorsAssociatedWithBioMaterials.get(currentExperimentalFactor).add(currentBioMaterial); } } /* * Check if every biomaterial got used. Worth a warning, at least. */ for (ExperimentalFactor ef : factorsAssociatedWithBioMaterials.keySet()) { if (!factorsAssociatedWithBioMaterials.get(ef).containsAll(experimentBioMaterials)) { ExperimentalDesignImporterImpl.log.warn( "File did not contain values for all factor - biomaterial combinations: Missing at least one for " + ef + " [populated " + factorsAssociatedWithBioMaterials.get(ef).size() + "/" + experimentBioMaterials.size() + " ]"); } } return biomaterialsWithFactorValuesInExperiment; }
From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Get a map of experimental values keyed on experimental factor name * * @param headerFields header fields * @param factorValueLines factor value lines * @return map of experimental factor values keyed on experimental factor *///from w w w . j a v a 2s.c om private Map<String, Set<String>> getMapFactorSampleValues(String[] headerFields, List<String> factorValueLines) { Map<String, Set<String>> factorSampleValues = new HashMap<>(); for (String factorValueLine : factorValueLines) { String[] factorValueFields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t"); for (int i = 1; i < headerFields.length; i++) { // get the key String value = headerFields[i]; value = StringUtils.strip(value); String factorValue = StringUtils.strip(factorValueFields[i]); Set<String> listFactorValues = factorSampleValues.get(value); if (listFactorValues == null) { listFactorValues = new HashSet<>(); } listFactorValues.add(factorValue); factorSampleValues.put(value, listFactorValues); } } return factorSampleValues; }