Usage examples for org.apache.commons.lang.StringUtils.splitPreserveAllTokens
public static String[] splitPreserveAllTokens(String str, String separatorChars)
Splits the provided text into an array, separators specified, preserving all tokens, including empty tokens created by adjacent separators.
public static String unreverseUrl(String reversedUrl) { StringBuilder buf = new StringBuilder(reversedUrl.length() + 2); int pathBegin = reversedUrl.indexOf('/'); if (pathBegin == -1) pathBegin = reversedUrl.length(); String sub = reversedUrl.substring(0, pathBegin); String[] splits = StringUtils.splitPreserveAllTokens(sub, ':'); // {<reversed host>, <port>, <protocol>} buf.append(splits[1]); // add protocol buf.append("://"); reverseAppendSplits(splits[0], buf); // splits[0] is reversed // host// ww w . j av a2s . c om if (splits.length == 3) { // has a port buf.append(':'); buf.append(splits[2]); } buf.append(reversedUrl.substring(pathBegin)); return buf.toString(); }
/** * @param bis/*from w ww . jav a 2 s. c o m*/ * @param activeGenes * @throws IOException */ protected void readAgilent(InputStream bis, Set<String> activeGenes) throws IOException { if (bis == null) { throw new IOException("Inputstream was null"); } BufferedReader dis = new BufferedReader(new InputStreamReader(bis)); Collection<String> probeIds = new ArrayList<String>(); String classIds = null; String header = dis.readLine(); int numFields = getAgilentNumFields(header); int probeIndex = getAgilentProbeIndex(header); int goIndex = getAgilentGoIndex(header); int geneNameIndex = getAgilentGeneNameIndex(header); int geneSymbolIndex = getAgilentGeneSymbolIndex(header); tick(); assert (numFields > probeIndex + 1 && numFields > geneSymbolIndex + 1); Pattern pat = Pattern.compile("[0-9]+"); // loop through rows. Makes hash map of probes to go, and map of go to // probes. int n = 0; String line = ""; while ((line = dis.readLine()) != null) { if (Thread.currentThread().isInterrupted()) { dis.close(); throw new CancellationException(); } String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); if (fields.length < probeIndex + 1 || fields.length < geneSymbolIndex + 1) { continue; // skip lines that don't meet criteria. 
} String probe = fields[probeIndex]; String gene = fields[geneSymbolIndex]; if (activeGenes != null && !activeGenes.contains(gene)) { continue; } storeProbeAndGene(probeIds, probe, gene); /* read gene description */ String description = fields[geneNameIndex].intern(); if (!description.startsWith("GO:")) { probeToDescription.put(probe.intern(), description.intern()); } else { probeToDescription.put(probe.intern(), NO_DESCRIPTION); } if (fields.length < goIndex + 1) { continue; } classIds = fields[goIndex]; if (StringUtils.isNotBlank(classIds)) { String[] goinfo = classIds.split("\\|"); for (String element : goinfo) { String goi = element.intern(); parseGoTerm(probe, pat, goi); } } if (messenger != null && n % 500 == 0) { messenger.showStatus("Read " + n + " probes"); try { Thread.sleep(10); } catch (InterruptedException e) { dis.close(); throw new RuntimeException("Interrupted"); } } n++; } /* Fill in the genegroupreader and the classmap */ dis.close(); tick(); resetSelectedProbes(); if (probeToGeneName.size() == 0 || geneSetToProbeMap.size() == 0) { throw new IllegalArgumentException( "The gene annotations had invalid information. Please check the format."); } }
/** * @param arrayDesign/* w w w. j av a2 s. c o m*/ * @return Map of composite sequence ids to an array of delimited strings: [probe name,genes symbol, gene Name, * gemma gene id, ncbi id] for a given probe id. format of string is geneSymbol then geneNames same as found * in annotation file */ public static Map<Long, String[]> readAnnotationFileAsString(ArrayDesign arrayDesign) { Map<Long, String[]> results = new HashMap<Long, String[]>(); File f = new File(ANNOT_DATA_DIR + mungeFileName(arrayDesign.getShortName()) + STANDARD_FILE_SUFFIX + ANNOTATION_FILE_SUFFIX); if (!f.canRead()) {"Gene annotations are not available from " + f); return results; } Map<String, Long> probeNameToId = new HashMap<String, Long>(); int FIELDS_PER_GENE = 5; // used to be 3, now is 5; for (CompositeSequence cs : arrayDesign.getCompositeSequences()) { results.put(cs.getId(), new String[FIELDS_PER_GENE]); if (probeNameToId.containsKey(cs.getName())) { log.warn("Duplicate probe name: " + cs.getName()); } probeNameToId.put(cs.getName(), cs.getId()); } try {"Reading annotations from: " + f); InputStream is = FileTools.getInputStreamFromPlainOrCompressedFile(f.getAbsolutePath()); BufferedReader br = new BufferedReader(new InputStreamReader(is)); String line = null; while ((line = br.readLine()) != null) { if (StringUtils.isBlank(line) || line.startsWith(COMMENT_CHARACTER)) { continue; } String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); if (fields.length < 3) continue; // means there are no gene annotations. String probeName = fields[0]; if (!probeNameToId.containsKey(probeName)) continue; Long probeId = probeNameToId.get(probeName); results.get(probeId)[0] = probeName; // Probe Name (redundant!) results.get(probeId)[1] = fields[1]; // Gene Symbol results.get(probeId)[2] = fields[2]; // Gene Name // fields[3] is the GO annotations, we skip that. 
if (fields.length > 4) { results.get(probeId)[3] = fields[4]; // Gemma Id } if (fields.length > 5) { results.get(probeId)[4] = fields[5]; // NCBI id. } } is.close(); return results; } catch (FileNotFoundException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } }
/** * @param results//w w w.ja v a 2s .c om * @param f * @param probeNameToId * @return */ private static Map<Long, Collection<Gene>> parseAnnotationFile(Map<Long, Collection<Gene>> results, InputStream is, Map<String, Long> probeNameToId) { try { BufferedReader br = new BufferedReader(new InputStreamReader(is)); String line = null; while ((line = br.readLine()) != null) { if (StringUtils.isBlank(line) || line.startsWith(COMMENT_CHARACTER)) { continue; } String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); if (fields.length < 3) continue; // means there are no gene annotations. String probeName = fields[0]; if (!probeNameToId.containsKey(probeName)) continue; Long probeId = probeNameToId.get(probeName); List<String> geneSymbols = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[1], '|')); List<String> geneNames = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[2], '|')); if (geneSymbols.size() != geneNames.size()) { log.warn("Annotation file format error: Unequal number of gene symbols and names for probe=" + probeName + ", skipping row"); continue; } List<String> gemmaGeneIds = null; List<String> ncbiIds = null; if (fields.length > 4) { // new style. fields[3] is the GO annotations. 
gemmaGeneIds = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[4], '|')); } if (fields.length > 5) { ncbiIds = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[5], '|')); } for (int i = 0; i < geneSymbols.size(); i++) { String symbol = geneSymbols.get(i); String name = geneNames.get(i); if (StringUtils.isBlank(symbol)) { continue; } String[] symbolsb = StringUtils.split(symbol, ','); String[] namesb = StringUtils.split(name, '$'); for (int j = 0; j < symbolsb.length; j++) { String s = symbolsb[j]; Gene g = Gene.Factory.newInstance(); g.setOfficialSymbol(s); try { if (gemmaGeneIds != null) { g.setId(Long.parseLong(gemmaGeneIds.get(j))); } if (ncbiIds != null) { g.setNcbiGeneId(Integer.parseInt(ncbiIds.get(j))); } } catch (NumberFormatException e) { // oh well, couldn't populate extra info. } if (namesb.length >= j + 1) { String n = namesb[j]; g.setName(n); } results.get(probeId).add(g); } } } return results; } catch (FileNotFoundException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } }
/** * Note that "-" means a missing value, which in practice only occurs in the "qualifier" and "pubmed" columns. * //from www.j a va2 s . c o m * @param line * @param taxa to use * @return Object */ public Gene2GOAssociation mapFromGene2GO(String line) { String[] values = StringUtils.splitPreserveAllTokens(line, "\t"); if (line.startsWith(COMMENT_INDICATOR)) return null; if (values.length < 8) return null; Integer taxonId = null; try { taxonId = Integer.parseInt(values[TAX_ID]); } catch (NumberFormatException e) { throw new RuntimeException(e); } if (!taxaNcbiIds.containsKey(taxonId)) { return null; } Gene2GOAssociation g2GOAss = Gene2GOAssociation.Factory.newInstance(); Gene gene = Gene.Factory.newInstance(); gene.setNcbiGeneId(Integer.parseInt(values[GENE_ID])); gene.setTaxon(taxaNcbiIds.get(taxonId)); VocabCharacteristic oe = VocabCharacteristic.Factory.newInstance(); String value = values[GO_ID].replace(":", "_"); oe.setValueUri(GeneOntologyService.BASE_GO_URI + value); oe.setValue(value); // g2GOAss.setSource( ncbiGeneDb ); g2GOAss.setGene(gene); g2GOAss.setOntologyEntry(oe); String evidenceCode = values[EVIDENCE_CODE]; if (!(StringUtils.isBlank(evidenceCode) || evidenceCode.equals("-"))) { if (ignoredEvidenceCodes.contains(evidenceCode)) { return null; } g2GOAss.setEvidenceCode(GOEvidenceCode.fromString(evidenceCode)); } try { queue.put(g2GOAss); } catch (InterruptedException e) { throw new RuntimeException(e); } return g2GOAss; }
@Override public ArrayDesign parseOneLine(String line) { ArrayDesign ad = ArrayDesign.Factory.newInstance(); String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); ad.setName(fields[0]);//w w w .j a v a2 s . co m ad.setDescription(fields[5]); Taxon t = Taxon.Factory.newInstance(); t.setCommonName(fields[4].toLowerCase()); t.setIsSpecies(true); // assumption. t.setIsGenesUsable(true); // assumption ad.setPrimaryTaxon(t); Contact manufacturer = Contact.Factory.newInstance(); manufacturer.setName(fields[1]); ad.setDesignProvider(manufacturer); ad.setAdvertisedNumberOfDesignElements(Integer.parseInt(fields[4])); return ad; }
@Override public void processArrayDesign(ArrayDesign arrayDesign, Taxon taxon, File source, ExternalDatabase sourceDB, boolean ncbiIds) throws IOException { if (taxon == null && !ncbiIds) { throw new IllegalArgumentException("You must provide a taxon unless passing ncbiIds = true"); }//from w ww . ja va 2 s .c om if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) { throw new IllegalArgumentException( "Do not use this service to process platforms that do not use an probe-based technology."); } BufferedReader b = new BufferedReader(new FileReader(source)); String line = null; int numSkipped = 0;"Removing any old associations"); arrayDesignService.deleteGeneProductAssociations(arrayDesign); while ((line = b.readLine()) != null) { if (StringUtils.isBlank(line)) { continue; } if (line.startsWith("#")) { continue; } String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); if (fields.length != 3) { throw new IOException("Illegal format, expected three columns, got " + fields.length); } String probeId = fields[0]; String seqName = fields[1]; /* * FIXME. We have to allow NCBI gene ids here. 
*/ String geneSymbol = fields[2]; if (StringUtils.isBlank(geneSymbol)) { numSkipped++; continue; } CompositeSequence c = compositeSequenceService.findByName(arrayDesign, probeId); if (c == null) { if (log.isDebugEnabled()) log.debug("No probe found for '" + probeId + "' on " + arrayDesign + ", skipping"); numSkipped++; continue; } // a probe can have more than one gene associated with it if so they are piped | Collection<Gene> geneListProbe = new HashSet<Gene>(); // indicate multiple genes Gene geneDetails = null; StringTokenizer st = new StringTokenizer(geneSymbol, "|"); while (st.hasMoreTokens()) { String geneToken = st.nextToken().trim(); if (ncbiIds) { geneDetails = geneService.findByNCBIId(Integer.parseInt(geneToken)); } else { geneDetails = geneService.findByOfficialSymbol(geneToken, taxon); } if (geneDetails != null) { geneListProbe.add(geneDetails); } } if (geneListProbe.size() == 0) { log.warn("No gene(s) found for '" + geneSymbol + "' in " + taxon + ", skipping"); numSkipped++; continue; } else if (geneListProbe.size() > 1) { // this is a common situation, when the geneSymbol actually has |-separated genes, so no need to make a // lot of fuss. log.debug("More than one gene found for '" + geneSymbol + "' in " + taxon); } BioSequence bs = c.getBiologicalCharacteristic(); if (bs != null) { if (StringUtils.isNotBlank(seqName)) { bs = bioSequenceService.thaw(bs); if (!bs.getName().equals(seqName)) { log.warn("Sequence name '" + seqName + "' given for " + probeId + " does not match existing entry " + bs.getName() + ", skipping"); numSkipped++; continue; } } // otherwise we assume everything is okay. } else { // create one based on the text provided. if (StringUtils.isBlank(seqName)) { log.warn("You must provide sequence names for probes which are not already mapped. 
probeName=" + probeId + " had no sequence associated and no name provided; skipping"); numSkipped++; continue; } bs = BioSequence.Factory.newInstance(); bs.setName(seqName); bs.setTaxon(taxon); bs.setDescription("Imported from annotation file"); // Placeholder. bs.setType(SequenceType.OTHER); bs = bioSequenceService.create(bs); c.setBiologicalCharacteristic(bs); // fixme: possibly move outside the loop if that's faster. compositeSequenceService.update(c); } assert bs.getId() != null; for (Gene gene : geneListProbe) { gene = geneService.thaw(gene); if (gene.getProducts().size() == 0) { log.warn( "There are no gene products for " + gene + ", it cannot be mapped to probes. Skipping"); numSkipped++; continue; } for (GeneProduct gp : gene.getProducts()) { AnnotationAssociation association = AnnotationAssociation.Factory.newInstance(); association.setBioSequence(bs); association.setGeneProduct(gp); association.setSource(sourceDB); annotationAssociationService.create(association); } } } arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId()); this.deleteOldFiles(arrayDesign);"Completed association processing for " + arrayDesign + ", " + numSkipped + " were skipped"); b.close(); }
@Override public CompositeSequence parseOneLine(String line) { String[] tokens = StringUtils.splitPreserveAllTokens(line, '\t'); if (tokens.length != 3) { return null; }// www .j ava 2 s. c o m String probeid = tokens[0]; String genbankAcc = tokens[1]; String description = tokens[2]; CompositeSequence result = CompositeSequence.Factory.newInstance(); result.setName(probeid); result.setDescription(description); DatabaseEntry dbEntry = ExternalDatabaseUtils.getGenbankAccession(genbankAcc); BioSequence biologicalCharacteristic = BioSequence.Factory.newInstance(); biologicalCharacteristic.setName(genbankAcc); // this will be changed later, typically. biologicalCharacteristic.setTaxon(taxon); // this will be changed later, typically. biologicalCharacteristic.setDescription(description + " (From platform source)"); biologicalCharacteristic.setSequenceDatabaseEntry(dbEntry); result.setBiologicalCharacteristic(biologicalCharacteristic); return result; }
/** * If a line does not have the same number of fields as the column headings, it is skipped. * /*from w w w . j a v a 2 s. c om*/ * @param line */ private void parsePlatformLine(String line) { if (!haveReadPlatformHeader) { haveReadPlatformHeader = true; return; } GeoPlatform currentPlatform = results.getPlatformMap().get(currentPlatformAccession); assert currentPlatform != null; /* * Skip platform information when it is not going to be usable, unless we are ONLY parsing a platform. */ // Actually this isn't as important, since we filter out bad elements. // if ( !processPlatformsOnly && !currentPlatform.useDataFromGeo() ) { // return; // } String[] tokens = StringUtils.splitPreserveAllTokens(line, FIELD_DELIM); List<String> columnNames = currentPlatform.getColumnNames(); int numColumns = columnNames.size(); if (numColumns != tokens.length && numWarnings < MAX_WARNINGS) { log.warn("Wrong number of tokens in line (" + tokens.length + ", expected " + numColumns + "), line was '" + line + "'; Possible corrupt file or invalid format?"); numWarnings++; if (numWarnings == MAX_WARNINGS) { log.warn("Further warnings suppressed"); } return; } GeoPlatform platform = currentPlatform; for (int i = 0; i < tokens.length; i++) { String token = tokens[i]; String columnName = columnNames.get(i); platform.addToColumnData(columnName, token); } platformLines++; }
/** * The data for one sample is all the values for each quantitation type. * <p>//w ww.j a va 2 s . com * Important implementation note: In the sample table sections of GSEXXX_family files, the first column is always * ID_REF, according to the kind folks at NCBI. If this changes, this code will BREAK. * <p> * Similarly, the column names between the different samples are not necessarily the same, but we trust that they * all refer to the same quantitation types in the same order, for a given platform. That is, the nth column for * this sample 'means' the same thing as the nth column for another sample in this series (on the same platform). If * that isn't true, this will be BROKEN. However, we do try to sort it out if we can. * * @param line * @see initializeQuantitationTypes */ private void parseSampleDataLine(String line) { if (StringUtils.isBlank(line)) return; if (!haveReadSampleDataHeader) { haveReadSampleDataHeader = true; previousNumTokens = null; initializeQuantitationTypes(); return; } GeoSample sample = results.getSampleMap().get(currentSampleAccession); /* * skip this step if it's not a supported platform type (RNA-seq, exon arrays: we put the data in later) */ if (!sample.hasUsableData()) { return; } String[] tokens = StringUtils.splitPreserveAllTokens(line, FIELD_DELIM); assert tokens != null; /* * This can happen in some files that are mildly corrupted. -- we have to ignore it. 
*/ if (tokens.length <= 1 && numWarnings < MAX_WARNINGS) { log.error("Parse error, sample data line has too few elements (" + tokens.length + "), line was '" + line + "'"); numWarnings++; if (numWarnings == MAX_WARNINGS) { log.warn("Further warnings suppressed"); } return; } if (previousNumTokens != null && tokens.length != previousNumTokens) { log.warn("Last line had " + (previousNumTokens - 1) + " quantitation types, this one has " + (tokens.length - 1)); } previousNumTokens = tokens.length; if (results.getSeriesMap().get(currentSeriesAccession) == null) { return; // this happens if we are parsing a GPL file. } GeoPlatform platformForSample = sample.getPlatforms().iterator().next(); // slow GeoValues values = results.getSeriesMap().get(currentSeriesAccession).getValues(); String designElement = tokens[0]; // ID_REF. For bug 1709, adding toLower() will fix this. Map<Integer, Integer> map = quantitationTypeTargetColumn.get(platformForSample); for (int i = 1; i < tokens.length; i++) { String value = tokens[i]; int qtIndex = i - 1; /* * This map tells us which column this quantitation type is SUPPOSED to go in. */ if (map.containsKey(qtIndex)) qtIndex = map.get(qtIndex); if (!isWantedQuantitationType(qtIndex)) { continue; } if (log.isTraceEnabled()) { log.trace("Adding: " + value + " to quantitationType " + (qtIndex) + " for " + designElement); } values.addValue(sample, qtIndex, designElement, value); processedDesignElements.add(designElement); } sampleDataLines++; }