List of usage examples for org.apache.commons.lang3 StringUtils strip
public static String strip(final String str)
Strips whitespace from the start and end of a String.
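This is similar to StringUtils.trim(String), but it removes whitespace as defined by Character.isWhitespace(char), whereas trim removes control characters (char <= 32). A null input String returns null.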
From source file:ubic.gemma.core.datastructure.matrix.ExpressionDataWriterUtils.java
/**
 * Builds a name for a factor value in which whitespace and hyphens are replaced with underscores.
 * <p>
 * The text comes from the first source that applies: the characteristics (each value stripped), otherwise the
 * measurement value, otherwise the stripped plain value when it is not blank. The collected text is trimmed
 * before the replacements are applied.
 *
 * @param factorValue FV
 * @return replaced string
 */
public static String constructFactorValueName(FactorValue factorValue) {
    StringBuilder name = new StringBuilder();

    if (!factorValue.getCharacteristics().isEmpty()) {
        final boolean severalCharacteristics = factorValue.getCharacteristics().size() > 1;
        for (Characteristic characteristic : factorValue.getCharacteristics()) {
            name.append(StringUtils.strip(characteristic.getValue()));
            if (severalCharacteristics) {
                // The separator is written after every characteristic, including the last one.
                name.append(" | ");
            }
        }
    } else if (factorValue.getMeasurement() != null) {
        name.append(factorValue.getMeasurement().getValue());
    } else if (StringUtils.isNotBlank(factorValue.getValue())) {
        name.append(StringUtils.strip(factorValue.getValue()));
    }

    return name.toString().trim().replaceAll("-", "_").replaceAll("\\s", "_");
}
From source file:ubic.gemma.core.expression.experiment.service.ExpressionExperimentSearchServiceImpl.java
@Override public Collection<ExpressionExperimentValueObject> searchExpressionExperiments(List<String> query) { Set<ExpressionExperimentValueObject> all = new HashSet<>(); Set<ExpressionExperimentValueObject> prev = null; Set<ExpressionExperimentValueObject> current; for (String s : query) { s = StringUtils.strip(s); if (prev == null) { prev = new HashSet<>(this.searchExpressionExperiments(s)); all = new HashSet<>(prev); continue; }// ww w . ja v a2s .c o m current = new HashSet<>(this.searchExpressionExperiments(s)); all = Sets.intersection(all, current); } return all; }
From source file:ubic.gemma.core.genome.gene.service.GeneSearchServiceImpl.java
@Override public Map<String, GeneValueObject> searchMultipleGenesGetMap(Collection<String> query, Long taxonId) { Taxon taxon = taxonService.load(taxonId); if (taxon == null) throw new IllegalArgumentException("No such taxon with id=" + taxonId); // this deals with the simple cases. For remainder we look a little harder Map<String, GeneValueObject> queryToGenes = geneService.findByOfficialSymbols(query, taxonId); for (String line : query) { line = StringUtils.strip(line); if (StringUtils.isBlank(line)) { continue; }//from w w w .j a v a 2 s . c o m String queryAsKey = line.toLowerCase(); if (queryToGenes.containsKey(queryAsKey)) { // already found. continue; } if (queryToGenes.size() >= GeneSearchServiceImpl.MAX_GENES_PER_QUERY) { GeneSearchServiceImpl.log .warn("Too many genes, stopping (limit=" + GeneSearchServiceImpl.MAX_GENES_PER_QUERY + ')'); break; } // searching one gene at a time is a bit slow; we do a quick search for symbols. SearchSettings settings = SearchSettingsImpl.geneSearch(line, taxon); List<SearchResult> geneSearchResults = searchService.speedSearch(settings).get(Gene.class); if (geneSearchResults == null || geneSearchResults.isEmpty()) { // an empty set is an indication of no results. queryToGenes.put(queryAsKey, null); } else if (geneSearchResults.size() == 1) { // Just one result so add it Gene g = (Gene) geneSearchResults.iterator().next().getResultObject(); queryToGenes.put(queryAsKey, new GeneValueObject(g)); } else { // Multiple results need to find best one // Usually if there is more than 1 results the search term was a official symbol and picked up matches // like grin1, grin2, grin3, grin (given the search term was grin) for (SearchResult sr : geneSearchResults) { Gene srGene = (Gene) sr.getResultObject(); if (srGene.getTaxon().equals(taxon) && srGene.getOfficialSymbol().equalsIgnoreCase(line)) { queryToGenes.put(queryAsKey, new GeneValueObject(srGene)); break; // found so done } } } } return queryToGenes; }
From source file:ubic.gemma.core.loader.entrez.pubmed.PubMedXMLParser.java
/** * Fill in information about the book: Publisher, Editor(s), Publication year * * @param bibRef bib ref/* w ww . java 2 s . c o m*/ * @param record record */ private void processBookRecord(BibliographicReference bibRef, Node record) { NodeList recordNodes = record.getChildNodes(); for (int p = 0; p < recordNodes.getLength(); p++) { Node item = recordNodes.item(p); if (!(item instanceof Element)) { continue; } String name = item.getNodeName(); switch (name) { case "ArticleTitle": // this is the title of the chapter. bibRef.setTitle(StringUtils.strip(XMLUtils.getTextValue((Element) item))); break; case "Book": this.processBookInfo(bibRef, item); break; case "AuthorList": bibRef.setAuthorList(this.extractAuthorList(item.getChildNodes())); break; case "Abstract": bibRef.setAbstractText(""); NodeList abstractTextSections = item.getChildNodes(); for (int q = 0; q < abstractTextSections.getLength(); q++) { Node jitem = abstractTextSections.item(q); if (!(jitem instanceof Element)) { continue; } if (jitem.getNodeName().equals("AbstractText")) { bibRef.setAbstractText( bibRef.getAbstractText() + (XMLUtils.getTextValue((Element) jitem)) + " "); } bibRef.setAbstractText(bibRef.getAbstractText().trim()); } break; case "PMID": this.processAccession(bibRef, item); break; case "ContributionDate": /* * Unusual, but happens for books that are updated with new sections. We use this instead of the * publication date. */ this.extractBookPublicationYear(bibRef, item); break; default: log.warn("Unrecognized node name " + name); } } }
From source file:ubic.gemma.core.loader.expression.geo.GeoFamilyParser.java
/** * Extract a key and value pair from a line in the format #key = value. * * @param line line/*from w ww . j a v a 2 s . c o m*/ * @return Map containing the String key and String value. Return null if it is misformatted. */ private Map<String, String> extractKeyValue(String line) { if (!line.startsWith("#")) throw new IllegalArgumentException("Wrong type of line"); Map<String, String> result = new HashMap<>(); String fixed = line.substring(line.indexOf('#') + 1); String[] tokens = fixed.split("=", 2); if (tokens.length != 2) { GeoFamilyParser.log.warn("Invalid key-value line, expected an '=' somewhere, got: '" + line + "'"); return null; } String key = tokens[0]; String value = tokens[1]; key = StringUtils.strip(key); value = StringUtils.strip(value); result.put(key, value); return result; }
From source file:ubic.gemma.core.loader.expression.geo.GeoFamilyParser.java
/** * Extract a value from a line in the format xxxx=value. * * @param line line/*from w ww . j a v a2 s . c o m*/ * @return String following the first occurrence of '=', or null if there is no '=' in the String. */ private String extractValue(String line) { int eqIndex = line.indexOf('='); if (eqIndex < 0) { return null; // that's okay, there are lines that just indicate the end of sections. } return StringUtils.strip(line.substring(eqIndex + 1)); }
From source file:ubic.gemma.core.loader.expression.geo.service.GeoBrowser.java
/**
 * Retrieves and parses tab delimited file from GEO. File contains pageSize GEO records starting from startPage.
 * <p>
 * The first line is read as the column header and used to locate the Accession, Title, Sample Count,
 * Contact, Taxonomy, Release Date and Series Type columns of every following line. An empty response (no
 * header line) yields an empty list.
 *
 * @param startPage start page
 * @param pageSize  page size
 * @return list of GeoRecords
 * @throws IOException              if there is a problem while manipulating the file
 * @throws ParseException           if there is a parsing problem
 * @throws IllegalArgumentException if either argument is negative
 */
public List<GeoRecord> getRecentGeoRecords(int startPage, int pageSize) throws IOException, ParseException {
    if (startPage < 0 || pageSize < 0)
        throw new IllegalArgumentException("Values must be greater than zero ");

    List<GeoRecord> records = new ArrayList<>();
    URL url;
    try {
        url = new URL(GEO_BROWSE_URL + startPage + GEO_BROWSE_SUFFIX + pageSize);
    } catch (MalformedURLException e) {
        throw new RuntimeException("Invalid URL: " + GEO_BROWSE_URL + startPage + GEO_BROWSE_SUFFIX + pageSize,
                e);
    }

    URLConnection conn = url.openConnection();
    conn.connect();

    // NOTE(review): the reader uses the platform default charset - confirm the encoding GEO sends.
    try (InputStream is = conn.getInputStream();
            BufferedReader br = new BufferedReader(new InputStreamReader(is))) {

        // We are getting a tab delimited file.
        // Read columns headers.
        String headerLine = br.readLine();
        if (headerLine == null) {
            // Empty response: there is no header to split and nothing to parse.
            GeoBrowser.log.warn("No records obtained");
            return records;
        }
        String[] headers = StringUtil.csvSplit(headerLine);

        // Map column names to their indices (handy later).
        Map<String, Integer> columnNameToIndex = new HashMap<>();
        for (int i = 0; i < headers.length; i++) {
            columnNameToIndex.put(headers[i], i);
        }

        // Read the rest of the file.
        String line;
        while ((line = br.readLine()) != null) {
            String[] fields = StringUtil.csvSplit(line);

            GeoRecord geoRecord = new GeoRecord();
            geoRecord.setGeoAccession(fields[columnNameToIndex.get("Accession")]);
            geoRecord.setTitle(StringUtils.strip(
                    fields[columnNameToIndex.get("Title")].replaceAll(GeoBrowser.FLANKING_QUOTES_REGEX, "")));

            String sampleCountS = fields[columnNameToIndex.get("Sample Count")];
            if (StringUtils.isNotBlank(sampleCountS)) {
                try {
                    geoRecord.setNumSamples(Integer.parseInt(sampleCountS));
                } catch (NumberFormatException e) {
                    // keep the original exception as the cause so the failure stays traceable
                    throw new RuntimeException("Could not parse sample count: " + sampleCountS, e);
                }
            } else {
                GeoBrowser.log.warn("No sample count for " + geoRecord.getGeoAccession());
            }

            geoRecord.setContactName(
                    fields[columnNameToIndex.get("Contact")].replaceAll(GeoBrowser.FLANKING_QUOTES_REGEX, ""));

            String[] taxons = fields[columnNameToIndex.get("Taxonomy")]
                    .replaceAll(GeoBrowser.FLANKING_QUOTES_REGEX, "").split(";");
            geoRecord.getOrganisms().addAll(Arrays.asList(taxons));

            Date date = DateUtils.parseDate(fields[columnNameToIndex.get("Release Date")]
                    .replaceAll(GeoBrowser.FLANKING_QUOTES_REGEX, ""), DATE_FORMATS);
            geoRecord.setReleaseDate(date);

            geoRecord.setSeriesType(fields[columnNameToIndex.get("Series Type")]);

            records.add(geoRecord);
        }
    }

    if (records.isEmpty()) {
        GeoBrowser.log.warn("No records obtained");
    }
    return records;
}
From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * This method reads the file line e.g. $Run time : Category=environmental_history Type=categorical and creates * experimental factors from it and adds them to the experimental design. * NOTE that this doesn't have the ability to add values to existing factors, which might be desirable. * * @param experimentalDesign Experimental design for this expression experiment * @param experimentalFactorFileLines List of strings representing lines from input file containing experimental * factors * @param headerFields Sample header line split on tab. * @param factorValueLines Lines containing biomaterial names and their factor values *///from w w w . j a va 2s . c o m private void addExperimentalFactorsToExperimentalDesign(ExperimentalDesign experimentalDesign, List<String> experimentalFactorFileLines, String[] headerFields, List<String> factorValueLines) { int maxWait = 0; if (efoService.isEnabled()) { while (!efoService.isOntologyLoaded()) { try { Thread.sleep(10000); if (maxWait++ > 10) { ExperimentalDesignImporterImpl.log.error("EFO is not loaded and gave up waiting"); break; // this is okay, we can get by using OntologyTermSimple. 
} } catch (InterruptedException e) { e.printStackTrace(); } } } Collection<OntologyTerm> terms = ontologyService.getCategoryTerms(); if (experimentalDesign.getExperimentalFactors() == null) { experimentalDesign.setExperimentalFactors(new HashSet<ExperimentalFactor>()); } Map<String, Set<String>> mapFactorSampleValues = this.getMapFactorSampleValues(headerFields, factorValueLines); for (String experimentalFactorFileLine : experimentalFactorFileLines) { // $Run time : Category=EnvironmentalHistory Type=categorical String[] experimentalFactorfields = experimentalFactorFileLine.split(":"); String factorValue = (StringUtils.strip(experimentalFactorfields[0].replaceFirst( Pattern.quote(ExperimentalDesignImporterImpl.EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR) + "\\s*", ""))).trim(); String categoryAndType = StringUtils.strip(experimentalFactorfields[1]); String[] categoryAndTypeFields = StringUtils.split(categoryAndType); // e.g. Category=EnvironmentalHistory String category = categoryAndTypeFields[0]; // e.g. EnvironmentalHistory String categoryValue = StringUtils.split(category, "=")[1]; ExperimentalFactor experimentalFactorFromFile = ExperimentalFactor.Factory.newInstance(); experimentalFactorFromFile.setExperimentalDesign(experimentalDesign); Characteristic vc = this.termForCategoryLookup(categoryValue, terms); // e.g. Category=EnvironmentalHistory String categoryTypeValue = categoryAndTypeFields[1]; String factorType = StringUtils.split(categoryTypeValue, "=")[1]; // vc.setCategory( categoryType ); experimentalFactorFromFile.setCategory(vc); experimentalFactorFromFile.setName(factorValue); experimentalFactorFromFile.setDescription(factorValue); experimentalFactorFromFile.setType( factorType.equalsIgnoreCase("CATEGORICAL") ? 
FactorType.CATEGORICAL : FactorType.CONTINUOUS); this.addFactorValuesToExperimentalFactor(experimentalFactorFromFile, mapFactorSampleValues, factorType); if (!this.checkForDuplicateExperimentalFactorOnExperimentalDesign(experimentalDesign, experimentalFactorFromFile)) { experimentalDesign.getExperimentalFactors().add(experimentalFactorFromFile); ExperimentalDesignImporterImpl.log.info("Added " + experimentalFactorFromFile); } } }
From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Add the factor values to the biomaterials * * @param experimentBioMaterials Current expression experiment's biomaterials. * @param experimentalDesign experimental design * @param factorValueLines Lines from file containing factor values and biomaterial ids * @param headerFields header fields * @return Collection of biomaterials associated with this experiment, this is returned as * the biomaterial is in a * bioassay (first one retrieved) *///from w w w . jav a 2 s. co m private Collection<BioMaterial> addFactorValuesToBioMaterialsInExpressionExperiment( Collection<BioMaterial> experimentBioMaterials, ExperimentalDesign experimentalDesign, List<String> factorValueLines, String[] headerFields) { ExperimentalDesignImporterImpl.log .debug("Adding factors values to biomaterials: " + experimentalDesign.getId()); Collection<ExperimentalFactor> experimentalFactorsInExperiment = experimentalDesign .getExperimentalFactors(); Collection<BioMaterial> biomaterialsWithFactorValuesInExperiment = new HashSet<>(); Collection<BioMaterial> seenBioMaterials = new HashSet<>(); Map<ExperimentalFactor, Collection<BioMaterial>> factorsAssociatedWithBioMaterials = new HashMap<>(); for (String factorValueLine : factorValueLines) { String[] factorValueFields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t"); String externalId = null; boolean hasExternalId = headerFields[1].toUpperCase().equals("EXTERNALID"); if (hasExternalId) { externalId = factorValueFields[1]; } BioMaterial currentBioMaterial = this.getBioMaterialFromExpressionExperiment(experimentBioMaterials, factorValueFields[0], externalId); if (currentBioMaterial == null) { // this could just be due to extras. 
throw new IllegalStateException( "No biomaterial for " + factorValueFields[0] + ", " + factorValueFields[1]); } if (seenBioMaterials.contains(currentBioMaterial)) { throw new IllegalArgumentException( "A biomaterial occurred more than once in the file: " + currentBioMaterial); } seenBioMaterials.add(currentBioMaterial); int start = 1; if (hasExternalId) { start = 2; } for (int i = start; i < factorValueFields.length; i++) { ExperimentalFactor currentExperimentalFactor = null; String currentExperimentalFactorName = StringUtils.strip(headerFields[i]); FactorValue currentFactorValue = null; String currentFVtext = StringUtils.strip(factorValueFields[i]); if (StringUtils.isBlank(currentFVtext)) { // Missing value. Note that catching 'NA' etc. is hard, because they could be valid strings. continue; } for (ExperimentalFactor experimentalFactor : experimentalFactorsInExperiment) { if (experimentalFactor.getName().equals(currentExperimentalFactorName)) { currentExperimentalFactor = experimentalFactor; } } if (currentExperimentalFactor == null) throw new IllegalStateException("No factor matches column " + currentExperimentalFactorName); Collection<FactorValue> factorValuesInCurrentExperimentalFactor = currentExperimentalFactor .getFactorValues(); for (FactorValue factorValue : factorValuesInCurrentExperimentalFactor) { String fvv = factorValue.getValue(); if (StringUtils.isBlank(fvv)) { // try characteristics; this would be a mess if there are more than one. if (factorValue.getCharacteristics().size() == 1) { fvv = factorValue.getCharacteristics().iterator().next().getValue(); if (StringUtils.isBlank(fvv)) { continue; // we can't match to factor values that lack a value string. } } } if (fvv.trim().equalsIgnoreCase(currentFVtext)) { currentFactorValue = factorValue; } } /* * If we can't find the factorvalue that matches this, we don't get a value for this biomaterial. 
*/ if (currentFactorValue == null) { ExperimentalDesignImporterImpl.log.error("No factor value for " + currentExperimentalFactor + " matches the text value=" + currentFVtext); } else { if (!this.checkForDuplicateFactorOnBioMaterial(currentBioMaterial, currentFactorValue)) { currentBioMaterial.getFactorValues().add(currentFactorValue); } } ExperimentalDesignImporterImpl.log.debug( "Added factor value " + currentFactorValue + " to biomaterial " + currentBioMaterial); biomaterialsWithFactorValuesInExperiment.add(currentBioMaterial); if (!factorsAssociatedWithBioMaterials.containsKey(currentExperimentalFactor)) { factorsAssociatedWithBioMaterials.put(currentExperimentalFactor, new HashSet<BioMaterial>()); } factorsAssociatedWithBioMaterials.get(currentExperimentalFactor).add(currentBioMaterial); } } /* * Check if every biomaterial got used. Worth a warning, at least. */ for (ExperimentalFactor ef : factorsAssociatedWithBioMaterials.keySet()) { if (!factorsAssociatedWithBioMaterials.get(ef).containsAll(experimentBioMaterials)) { ExperimentalDesignImporterImpl.log.warn( "File did not contain values for all factor - biomaterial combinations: Missing at least one for " + ef + " [populated " + factorsAssociatedWithBioMaterials.get(ef).size() + "/" + experimentBioMaterials.size() + " ]"); } } return biomaterialsWithFactorValuesInExperiment; }
From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Get a map of experimental values keyed on experimental factor name * * @param headerFields header fields * @param factorValueLines factor value lines * @return map of experimental factor values keyed on experimental factor */// w ww . j av a 2 s .c om private Map<String, Set<String>> getMapFactorSampleValues(String[] headerFields, List<String> factorValueLines) { Map<String, Set<String>> factorSampleValues = new HashMap<>(); for (String factorValueLine : factorValueLines) { String[] factorValueFields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t"); for (int i = 1; i < headerFields.length; i++) { // get the key String value = headerFields[i]; value = StringUtils.strip(value); String factorValue = StringUtils.strip(factorValueFields[i]); Set<String> listFactorValues = factorSampleValues.get(value); if (listFactorValues == null) { listFactorValues = new HashSet<>(); } listFactorValues.add(factorValue); factorSampleValues.put(value, listFactorValues); } } return factorSampleValues; }