List of usage examples for org.apache.commons.lang3 StringUtils strip
public static String strip(final String str)
Strips whitespace from the start and end of a String.
This is similar to #trim(String), but removes whitespace as defined by Character.isWhitespace(char), whereas trim removes control characters (char <= 32).
From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Validates that the input for experimental factors is correct: Experimental factor file line should be for e.g. * #$Run time : Category=EnvironmentalHistory Type=categorical Checks there is a colon, between experimental factor * and category and that category is correctly formatted. * * @param sampleHeaderLine Lines in file corresponding to order of experimental factors * @param experimentalFactorLines The lines in the file corresponding to experimental factors. * @throws IOException Experimental factor lines were not correctly format. *///from w w w . ja v a2 s.co m private void validateExperimentalFactorFileContent(List<String> experimentalFactorLines, String sampleHeaderLine) throws IOException { Set<String> experimentalFactorValueNames = new HashSet<>(); // validate experimental factor lines for (String line : experimentalFactorLines) { String[] fields = line.split(":"); if (fields.length != 2) { throw new IOException( "EF description must have two fields with a single ':' in between (" + line + ")"); } String factorName = StringUtils.strip(fields[0].replaceFirst( Pattern.quote(ExperimentalDesignImporterImpl.EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR) + "\\s*", "")); experimentalFactorValueNames.add(factorName); String category = StringUtils.strip(fields[1]); String[] descriptions = StringUtils.split(category); if (descriptions.length != 2) { throw new IOException("EF details should have the format 'Category=CATEGORY Type=TYPE'"); } } this.validateSampleHeaderFileContent(experimentalFactorValueNames, experimentalFactorLines.size(), sampleHeaderLine); }
From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Validates that the sample header is correctly formatted. Checks that the experimental factors defined in the * header match those in the experimental factor file lines. * * @param experimentalFactorValueNames experimental factor value names * @param numberOfExperimentalFactors number fo EFs * @param sampleHeaderLine sample header line * @throws IOException Validation fails. *//* w w w . ja v a2 s . co m*/ private void validateSampleHeaderFileContent(Set<String> experimentalFactorValueNames, Integer numberOfExperimentalFactors, String sampleHeaderLine) throws IOException { String[] headerFields = StringUtils.splitPreserveAllTokens(sampleHeaderLine, "\t"); // we might have the ids, and the external id. if (headerFields.length > numberOfExperimentalFactors + ExperimentalDesignImporterImpl.NUMBER_OF_EXTRA_COLUMNS_ALLOWED) { throw new IOException("Expected " + (numberOfExperimentalFactors + ExperimentalDesignImporterImpl.NUMBER_OF_EXTRA_COLUMNS_ALLOWED) + " columns based on EF descriptions (plus id column), got " + headerFields.length); } for (int i = 1; i < headerFields.length; i++) { String value = headerFields[i]; value = StringUtils.strip(value); if (value.equals("ExternalID")) { // that's fine. continue; } if (!experimentalFactorValueNames.contains(value)) { throw new IOException("Expected to find an EF matching the column heading '" + value + "'"); } } }
From source file:ubic.gemma.core.loader.genome.FastaParser.java
/** * <pre>//from w ww . j a v a 2 s . c o m * Affymetrix targets or collapsed sequence target:array:probeset; * Affymetrix "style" file target:probename * Affymetrix probe probe:array:probeset:xcoord:ycoord; Interrogation_Position=XXXX; Antisense; * Affymetrix consensus/exemplar exemplar:array:probeset; gb|accession; gb:accession /DEF=Homo sapiens metalloprotease-like, disintegrin-like, cysteine-rich protein 2 delta (ADAM22) mRNA, alternative splice product, complete cds. /FEA=mRNA /GEN=ADAM22 /PROD=metalloprotease-like, * Affymetrix-like format array:probe or other string containing ':'. * </pre> * * @param bioSequence BA * @param header header * @return boolean always true */ @SuppressWarnings("SameReturnValue") // Consistency with other similar methods private boolean parseAffyHeader(BioSequence bioSequence, String header) { // affymetrix format String[] split = StringUtils.split(header, ":;"); String firstTag = StringUtils.removeStart(split[0], ">"); switch (firstTag) { case "probe": bioSequence.setName(split[1] + ":" + split[2] + ":" + split[3] + ":" + split[4]); break; case "target": // split[1] = array name or probe name // split[2] = probe name if (split.length > 2) { bioSequence.setName(split[2]); } else { bioSequence.setName(split[1]); } break; case "exemplar": bioSequence.setName(split[1] + ":" + split[2]); bioSequence.setDescription(split[3]); break; default: // This is the case if the xxxx:xxxx format is used on non-affy bioSequence.setName(StringUtils.removeStart(header, ">")); return true; } for (String string : split) { string = StringUtils.strip(string); // fill in the sequence database entry if (string.startsWith("gb|") || string.startsWith("gb:")) { String[] splits = StringUtils.split(string, ":|"); String genbankAcc = splits[1]; DatabaseEntry genbank = ExternalDatabaseUtils.getGenbankAccession(genbankAcc); bioSequence.setName(genbank.getAccession()); bioSequence.setSequenceDatabaseEntry(genbank); if (RecordParser.log.isDebugEnabled()) 
RecordParser.log.debug("Got genbank accession " + genbankAcc + " for " + bioSequence.getName()); break; } } return true; }
From source file:ubic.gemma.core.loader.genome.taxon.TaxonParser.java
/**
 * Parses one '|'-delimited line and records the information it carries on the taxon with the NCBI id found
 * in the first field. A taxon is created (with genes marked as not usable) the first time an id is seen and
 * is reused for later lines with the same id.
 * <p>
 * Field 3 is the name class: {@code scientific name} sets the scientific name and
 * {@code genbank common name} sets the common name, both taken from field 1. Other name classes leave the
 * taxon unchanged.
 * <p>
 * NOTE(review): the layout looks like NCBI taxonomy {@code names.dmp}, where fields are padded with tabs
 * around the '|' delimiter; confirm against the caller. This is why every field is stripped before use.
 *
 * @param line the line to parse
 * @return the taxon (new or previously registered) for the NCBI id on this line
 */
@Override
public Taxon parseOneLine(String line) {
    String[] fields = StringUtils.splitPreserveAllTokens(line, '|');

    int ncbiid = Integer.parseInt(StringUtils.strip(fields[0]));

    if (!results.containsKey(ncbiid)) {
        Taxon created = Taxon.Factory.newInstance();
        created.setNcbiId(ncbiid);
        created.setIsGenesUsable(false);
        results.put(ncbiid, created);
    }
    Taxon taxon = results.get(ncbiid);

    String tag = StringUtils.strip(fields[3]);
    if (tag.equals("scientific name")) {
        taxon.setScientificName(StringUtils.strip(fields[1]));
    } else if (tag.equals("genbank common name")) {
        // Strip here as well: previously the common name kept the whitespace surrounding the delimiter,
        // unlike the scientific name.
        taxon.setCommonName(StringUtils.strip(fields[1]));
    }

    return taxon;
}
From source file:ubic.gemma.core.loader.pazar.PazarParser.java
@Override public PazarRecord parseOneLine(String line) { if (line == null || line.isEmpty()) return null; if (line.startsWith("TF_PAZAR_ID")) return null; String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); if (fields.length < 2) return null; PazarRecord r = new PazarRecord(); r.setPazarTfId(StringUtils.strip(fields[0])); r.setTfAcc(fields[1]);/* w w w . j ava 2 s . co m*/ r.setSpecies(fields[2]); r.setPazarTargetGeneId(fields[3]); r.setTargetGeneAcc(fields[4]); r.setProject(fields[6]); r.setPubMedId(fields[7]); // r.setMethod(fields[8); return r; }
From source file:ubic.gemma.core.search.GeneSetSearchImpl.java
@Override public GeneSet findByGoId(String goId, Taxon taxon) { OntologyTerm goTerm = geneOntologyService.getTermForId(StringUtils.strip(goId)); if (goTerm == null) { return null; }//from w ww.j a v a 2s .c o m // if taxon is null, this returns a geneset with genes from different taxons return this.goTermToGeneSet(goTerm, taxon); }
From source file:ubic.gemma.core.search.GeneSetSearchImpl.java
@Override public Collection<GeneSet> findByGoTermName(String goTermName, Taxon taxon, Integer maxGoTermsProcessed, Integer maxGeneSetSize) { Collection<? extends OntologyResource> matches = this.geneOntologyService .findTerm(StringUtils.strip(goTermName)); Collection<GeneSet> results = new HashSet<>(); for (OntologyResource t : matches) { assert t instanceof OntologyTerm; if (taxon == null) { Collection<GeneSet> sets = this.goTermToGeneSets((OntologyTerm) t, maxGeneSetSize); results.addAll(sets);/*w w w. java 2 s . com*/ // noinspection StatementWithEmptyBody // FIXME should we count each species as one go? if (maxGoTermsProcessed != null && results.size() > maxGoTermsProcessed) { // return results; } } else { GeneSet converted = this.goTermToGeneSet(t, taxon, maxGeneSetSize); // converted will be null if its size is more than maxGeneSetSize if (converted != null) { results.add(converted); } } if (maxGoTermsProcessed != null && results.size() > maxGoTermsProcessed) { return results; } } return results; }
From source file:ubic.gemma.core.search.GeneSetSearchImpl.java
/**
 * Finds gene sets by name, ignoring whitespace around the given name.
 *
 * @param name the gene set name to look for
 * @return whatever the gene set service returns for the stripped name
 */
@Override
public Collection<GeneSet> findByName(String name) {
    String strippedName = StringUtils.strip(name);
    return geneSetService.findByName(strippedName);
}
From source file:ubic.gemma.core.search.GeneSetSearchImpl.java
/**
 * Finds gene sets by name for a taxon, ignoring whitespace around the given name.
 *
 * @param name  the gene set name to look for
 * @param taxon the taxon passed through to the gene set service
 * @return whatever the gene set service returns for the stripped name and taxon
 */
@Override
public Collection<GeneSet> findByName(String name, Taxon taxon) {
    String strippedName = StringUtils.strip(name);
    return geneSetService.findByName(strippedName, taxon);
}
From source file:ubic.gemma.core.search.GeneSetSearchImpl.java
/**
 * Searches the ontology for phenotypes matching the query (surrounding whitespace is stripped first), finds
 * the candidate genes of each matching phenotype, and wraps every non-empty gene collection in a transient
 * {@link GeneSetValueObject}.
 * <p>
 * Timing information is logged at info level when the individual steps exceed 200 ms, 500 ms and 1000 ms of
 * total elapsed time, respectively.
 *
 * @param phenotypeQuery the phenotype search string
 * @param taxon          the taxon passed to the candidate gene lookup
 * @return one gene set per phenotype that has candidate genes; empty if no phenotype matches
 */
@Override
public Collection<GeneSetValueObject> findByPhenotypeName(String phenotypeQuery, Taxon taxon) {
    StopWatch timer = new StopWatch();
    timer.start();

    Collection<CharacteristicValueObject> phenotypes = phenotypeAssociationManagerService
            .searchOntologyForPhenotypes(StringUtils.strip(phenotypeQuery), null);

    Collection<GeneSetValueObject> geneSets = new HashSet<>();
    if (phenotypes.isEmpty()) {
        return geneSets;
    }

    if (timer.getTime() > 200) {
        GeneSetSearchImpl.log.info("Find phenotypes: " + timer.getTime() + "ms");
    }

    GeneSetSearchImpl.log.debug(" Converting CharacteristicValueObjects collection(size:" + phenotypes.size()
            + ") into GeneSets for phenotype query " + phenotypeQuery);

    // Index the phenotypes by URI so they can be matched up with the gene lookup results.
    Map<String, CharacteristicValueObject> phenotypesByUri = new HashMap<>();
    for (CharacteristicValueObject phenotype : phenotypes) {
        phenotypesByUri.put(phenotype.getValueUri(), phenotype);
    }

    Map<String, Collection<? extends GeneValueObject>> genesByUri = phenotypeAssociationManagerService
            .findCandidateGenesForEach(phenotypesByUri.keySet(), taxon);

    if (timer.getTime() > 500) {
        GeneSetSearchImpl.log.info("Find phenotype genes done at " + timer.getTime() + "ms");
    }

    for (Map.Entry<String, Collection<? extends GeneValueObject>> entry : genesByUri.entrySet()) {
        Collection<? extends GeneValueObject> gvos = entry.getValue();
        if (gvos.isEmpty()) {
            continue;
        }

        Collection<Long> geneIds = EntityUtils.getIds(gvos);
        CharacteristicValueObject phenotype = phenotypesByUri.get(entry.getKey());
        // Taxon details are taken from the first gene of the collection.
        GeneValueObject firstGene = gvos.iterator().next();

        GeneSetValueObject transientGeneSet = new GeneSetValueObject();
        transientGeneSet.setName(this.uri2phenoID(phenotype));
        transientGeneSet.setDescription(phenotype.getValue());
        transientGeneSet.setGeneIds(geneIds);
        transientGeneSet.setTaxonId(firstGene.getTaxonId());
        transientGeneSet.setTaxonName(firstGene.getTaxonCommonName());
        geneSets.add(transientGeneSet);
    }

    if (timer.getTime() > 1000) {
        GeneSetSearchImpl.log.info("Loaded " + phenotypes.size() + " phenotype gene sets for query "
                + phenotypeQuery + " in " + timer.getTime() + "ms");
    }

    return geneSets;
}