List of usage examples for org.apache.commons.lang3 StringUtils splitPreserveAllTokens
public static String[] splitPreserveAllTokens(final String str, final String separatorChars)
Splits the provided text into an array, separators specified, preserving all tokens, including empty tokens created by adjacent separators.
From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Check that the biomaterial is in the file and in the experiment. It is arguable whether this should be an * exception. I think it has to be to make sure that simple errors in the format are caught. But it's inconvenient * for cases where a single 'design' file is to be used for multiple microarray studies. Biomaterial ids should * match what is stored//from w ww. j a v a 2 s .c o m * * @param factorValueLines Lines containing biomaterial names and their factor values */ private void validateBioMaterialFileContent(Collection<BioMaterial> bioMaterials, List<String> factorValueLines) throws IllegalArgumentException { for (String factorValueLine : factorValueLines) { String[] vals = StringUtils.splitPreserveAllTokens(factorValueLine, '\t'); if (vals.length < 2) { throw new IllegalArgumentException( "Expected a file with at least two columns separated by tabs, got " + factorValueLine); } BioMaterial bioMaterialInFile = this.getBioMaterialFromExpressionExperiment(bioMaterials, vals[0], vals[1]); if (bioMaterialInFile == null) { // these might just be "extras" but we're being strict throw new IllegalArgumentException( "The uploaded file has a biomaterial name/ID that does not match the study: " + vals[0] + ", " + vals[1]); } } }
From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java
/**
 * Checks that each factor-value line has a column count compatible with the number of experimental factors.
 * <p>
 * A line must have more columns than there are experimental factors (the factors plus the id column) and no
 * more than the factor count plus the permitted number of extra columns.
 *
 * @param numberOfExperimentalFactors number of experimental factors
 * @param factorValueList             tab-delimited lines of the file holding the factor values of a biomaterial
 * @throws IOException if a line has too many or too few columns
 */
private void validateFactorFileContent(Integer numberOfExperimentalFactors, List<String> factorValueList)
        throws IOException {
    for (String row : factorValueList) {
        int columnCount = StringUtils.splitPreserveAllTokens(row, "\t").length;
        int maxColumns = numberOfExperimentalFactors
                + ExperimentalDesignImporterImpl.NUMBER_OF_EXTRA_COLUMNS_ALLOWED;

        if (columnCount > maxColumns) {
            throw new IOException("Expected no more than " + maxColumns
                    + " columns based on EF descriptions (plus id column), got " + columnCount);
        }
        if (columnCount <= numberOfExperimentalFactors) {
            throw new IOException("Expected at least " + (numberOfExperimentalFactors + 1)
                    + " columns based on EF descriptions (plus id column), got " + columnCount);
        }
    }
}
From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java
/** * Validates that the sample header is correctly formatted. Checks that the experimental factors defined in the * header match those in the experimental factor file lines. * * @param experimentalFactorValueNames experimental factor value names * @param numberOfExperimentalFactors number fo EFs * @param sampleHeaderLine sample header line * @throws IOException Validation fails. *//* w w w . java2 s .c o m*/ private void validateSampleHeaderFileContent(Set<String> experimentalFactorValueNames, Integer numberOfExperimentalFactors, String sampleHeaderLine) throws IOException { String[] headerFields = StringUtils.splitPreserveAllTokens(sampleHeaderLine, "\t"); // we might have the ids, and the external id. if (headerFields.length > numberOfExperimentalFactors + ExperimentalDesignImporterImpl.NUMBER_OF_EXTRA_COLUMNS_ALLOWED) { throw new IOException("Expected " + (numberOfExperimentalFactors + ExperimentalDesignImporterImpl.NUMBER_OF_EXTRA_COLUMNS_ALLOWED) + " columns based on EF descriptions (plus id column), got " + headerFields.length); } for (int i = 1; i < headerFields.length; i++) { String value = headerFields[i]; value = StringUtils.strip(value); if (value.equals("ExternalID")) { // that's fine. continue; } if (!experimentalFactorValueNames.contains(value)) { throw new IOException("Expected to find an EF matching the column heading '" + value + "'"); } } }
From source file:ubic.gemma.core.loader.genome.FastaParser.java
/** * The following formats are supported//from www .j a v a 2 s .c o m * <ul> * <li>GenBank: gi|gi-number|gb|accession|locus * <li>EMBL Data Library : gi|gi-number|emb|accession|locus * <li>DDBJ, DNA Database of Japan : gi|gi-number|dbj|accession|locus * <li>NBRF PIR : pir||entry * <li>Protein Research Foundation : prf||name * <li>SWISS-PROT : sp|accession|name * <li>Brookhaven Protein Data Bank (1) : pdb|entry|chain * <li>Brookhaven Protein Data Bank (2) : entry:chain|PDBID|CHAIN|SEQUENCE * <li>Patents : pat|country|number * <li>GenInfo Backbone Id bbs|number * <li>General database identifier : gnl|database|identifier * <li>NCBI Reference Sequence : ref|accession|locus * <li>Local Sequence identifier : lcl|identifier * <li>NIA 15k and 7k sets : H[0-9A-Z]{1-9}-\d | alternate (example: >H4002F12-5 ) * <li>Generic: probeid * </ul> * * @param bioSequence BA * @param header header * @return boolean */ private boolean parseDeflineHeader(BioSequence bioSequence, String header) { // one of the genbank formats. String[] split = StringUtils.splitPreserveAllTokens(header, "|;"); String firstTag = split[0]; // assert firstTag.startsWith( ">" ); // assert firstTag.length() > 1; firstTag = StringUtils.removeStart(firstTag, ">"); if (firstTag.equals("gi")) { bioSequence.setDescription(split[4]); String genbankAcc = split[3]; // with version number, possibly DatabaseEntry genbank = ExternalDatabaseUtils.getGenbankAccession(genbankAcc); bioSequence.setName(genbank.getAccession()); // without version number. 
bioSequence.setSequenceDatabaseEntry(genbank); } else if (firstTag.equals("pir")) { bioSequence.setName(split[1]); } else if (firstTag.equals("sp")) { bioSequence.setName(split[1]); bioSequence.setDescription(split[2]); } else if (firstTag.equals("ref")) { bioSequence.setName(split[1]); bioSequence.setDescription(split[2]); } else if (firstTag.equals("lcl")) { bioSequence.setName(split[1]); } else if (firstTag.equals("pdb")) { bioSequence.setName(split[1]); bioSequence.setDescription(split[2]); } else if (firstTag.equals("gnl")) { bioSequence.setName(split[2]); } else if (firstTag.equals("entry:chain")) { bioSequence.setName(split[1]); } else if (firstTag.matches(FastaParser.NIA_HEADER_REGEX)) { return this.parseNIA(bioSequence, header); } else { // generic. bioSequence.setName(split[0]); if (split.length > 1) bioSequence.setDescription(split[1]); // log.warn( "Defline-style FASTA header in unrecognized format, started with " + firstTag ); // return false; } return true; }
From source file:ubic.gemma.core.loader.genome.gene.ExternalFileGeneLoaderServiceImpl.java
/**
 * Splits one line of a gene file into its tab-separated columns.
 *
 * @param line a line from the gene file
 * @return the columns of the line (empty columns are preserved), or {@code null} if the line is blank or is a
 *         comment starting with {@code #}
 * @throws IOException if the line has fewer than two columns
 */
private String[] readLine(String line) throws IOException {
    boolean skippable = StringUtils.isBlank(line) || line.startsWith("#");
    if (skippable) {
        return null;
    }

    String[] columns = StringUtils.splitPreserveAllTokens(line, '\t');
    if (columns.length >= 2) {
        return columns;
    }
    throw new IOException("Illegal format, expected at least 2 columns, got " + columns.length);
}
From source file:ubic.gemma.core.loader.genome.gene.ncbi.homology.HomologeneServiceImpl.java
@Override public void parseHomologeneFile(InputStream is) throws IOException { BufferedReader br = new BufferedReader(new InputStreamReader(is)); String line;//from w w w . j ava2s. c o m while ((line = br.readLine()) != null) { if (StringUtils.isBlank(line) || line.startsWith(HomologeneServiceImpl.COMMENT_CHARACTER)) { continue; } String[] fields = StringUtils.splitPreserveAllTokens(line, HomologeneServiceImpl.DELIMITING_CHARACTER); Integer taxonId = Integer.parseInt(fields[1]); Long groupId; Long geneId; try { groupId = Long.parseLong(fields[0]); geneId = Long.parseLong(fields[2]); } catch (NumberFormatException e) { HomologeneServiceImpl.log.warn("Unparseable line from homologene: " + line); continue; } String geneSymbol = fields[3]; if (!group2Gene.containsKey(groupId)) { group2Gene.put(groupId, new ArrayList<Long>()); } if (!group2Gene.get(groupId).contains(geneId)) { group2Gene.get(groupId).add(geneId); } else { HomologeneServiceImpl.log.warn( "Duplicate gene ID encountered (group2Gene). Skipping: geneID=" + geneId + " , taxonID = " + taxonId + " , geneSymbol = " + geneSymbol + " for group " + groupId); } if (!gene2Group.containsKey(geneId)) { gene2Group.put(geneId, groupId); } else { HomologeneServiceImpl.log.warn( "Duplicate gene ID encountered (gene2Group). Skipping: geneID=" + geneId + " , taxonID = " + taxonId + " , geneSymbol = " + geneSymbol + " for group " + groupId); } } ready.set(true); HomologeneServiceImpl.log.info("Gene Homology successfully loaded: " + gene2Group.keySet().size() + " genes covered in " + group2Gene.keySet().size() + " groups"); }
From source file:ubic.gemma.core.loader.genome.gene.ncbi.NcbiGene2AccessionParser.java
@Override public NCBIGene2Accession parseOneLine(String line) { String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); if (fields.length < NcbiGene2AccessionParser.NCBI_GENE2ACCESSION_FIELDS_PER_ROW) { throw new IllegalArgumentException("Line is not in the right format: has " + fields.length + " fields, expected " + NcbiGene2AccessionParser.NCBI_GENE2ACCESSION_FIELDS_PER_ROW); }//from w ww.jav a 2 s . c om NCBIGene2Accession currentAccession = this.processFields(fields); if (currentAccession == null) { return null; } this.addResult(currentAccession); // really doesn't serve much of a purpose /* * Only some genes are relevant - for example, we might have filtered them by taxon. */ if (geneInfo != null && !geneInfo.containsKey(currentAccession.getGeneId())) { return null; } // if the current gene Id is different from this current one, then // we are done with the gene Id. Push the geneCollection into the queue. if (lastGeneId != null && !lastGeneId.equalsIgnoreCase(currentAccession.getGeneId())) { // push the gene set to the queue try { queue.put(geneData); } catch (InterruptedException e) { throw new RuntimeException(e); } // clear the gene set geneData = new NcbiGeneData(); if (geneInfo != null) geneInfo.remove(lastGeneId); } assert currentAccession.getGeneId() != null; // we're either starting a new one, or continuing with an old one. lastGeneId = currentAccession.getGeneId(); geneData.addAccession(currentAccession); geneData.setGeneInfo(geneInfo.get(currentAccession.getGeneId())); // this will be a trailing accession.? return currentAccession; }
From source file:ubic.gemma.core.loader.genome.gene.ncbi.NcbiGene2AccessionParser.java
private NCBIGene2Accession processFields(String[] fields) { NCBIGene2Accession newGene = new NCBIGene2Accession(); try {//from w w w .ja v a 2 s . c o m /* * Skip lines that refer to locations in non-reference assemblies. */ if (fields[12].startsWith("Alternate assembly")) { return null; } newGene.setGeneId(fields[1]); if (!hasStarted) { assert startingNcbiId != null; if (startingNcbiId.equals(Integer.parseInt(fields[1]))) { log.info("Found the starting gene " + startingNcbiId); hasStarted = true; } else { return null; } } // #Format: // tax_id 0 // GeneID 1 // status 2 // RNA_nucleotide_accession.version 3 // RNA_nucleotide_gi 4 // protein_accession.version 5 // protein_gi 6 // genomic_nucleotide_accession.version 7 // genomic_nucleotide_gi 8 // start_position_on_the_genomic_accession 9 // end_position_on_the_genomic_accession 10 // orientation 11 // assembly 12 // mature_peptide_accession.version 13 // mature_peptide_gi 14 // Symbol 15 newGene.setTaxId(Integer.parseInt(fields[0])); newGene.setStatus(fields[2].equals("-") ? null : fields[2]); newGene.setRnaNucleotideAccession(fields[3].equals("-") ? null : fields[3]); newGene.setRnaNucleotideGI(fields[4].equals("-") ? null : fields[4]); newGene.setProteinAccession(fields[5].equals("-") ? null : fields[5]); newGene.setProteinGI(fields[6].equals("-") ? null : fields[6]); newGene.setGenomicNucleotideAccession(fields[7].equals("-") ? null : fields[7]); newGene.setGenomicNucleotideGI(fields[8].equals("-") ? null : fields[8]); newGene.setStartPosition(fields[9].equals("-") ? null : Long.parseLong(fields[9])); newGene.setEndPosition(fields[10].equals("-") ? null : Long.parseLong(fields[10])); newGene.setOrientation(fields[11].equals("?") ? 
null : fields[11]); // set accession version numbers (additional parsing) // the assumption is that the string is delimited by a dot // and it only has one dot with one version number (ie GS001.1, not GS001.1.1) // RNA String rnaAccession = newGene.getRnaNucleotideAccession(); if (StringUtils.isNotBlank(rnaAccession)) { String[] tokens = StringUtils.splitPreserveAllTokens(rnaAccession, '.'); switch (tokens.length) { case 1: newGene.setRnaNucleotideAccession(tokens[0]); newGene.setRnaNucleotideAccessionVersion(null); break; case 2: newGene.setRnaNucleotideAccession(tokens[0]); newGene.setRnaNucleotideAccessionVersion(tokens[1]); break; default: throw new UnsupportedOperationException("Don't know how to deal with " + rnaAccession); } } else { newGene.setRnaNucleotideAccessionVersion(null); newGene.setRnaNucleotideAccessionVersion(null); } // protein String proteinAccession = newGene.getProteinAccession(); if (StringUtils.isNotBlank(proteinAccession)) { String[] tokens = StringUtils.splitPreserveAllTokens(proteinAccession, '.'); switch (tokens.length) { case 1: newGene.setProteinAccession(tokens[0]); newGene.setProteinAccessionVersion(null); break; case 2: newGene.setProteinAccession(tokens[0]); newGene.setProteinAccessionVersion(tokens[1]); break; default: throw new UnsupportedOperationException("Don't know how to deal with " + proteinAccession); } } else { newGene.setProteinAccessionVersion(null); newGene.setProteinAccessionVersion(null); } // Genome (chromosome information) String genomicAccession = newGene.getGenomicNucleotideAccession(); if (StringUtils.isNotBlank(genomicAccession)) { String[] tokens = StringUtils.splitPreserveAllTokens(genomicAccession, '.'); switch (tokens.length) { case 1: newGene.setGenomicNucleotideAccession(tokens[0]); newGene.setGenomicNucleotideAccessionVersion(null); break; case 2: newGene.setGenomicNucleotideAccession(tokens[0]); newGene.setGenomicNucleotideAccessionVersion(tokens[1]); break; default: throw new 
UnsupportedOperationException("Don't know how to deal with " + genomicAccession); } } else { newGene.setGenomicNucleotideAccessionVersion(null); newGene.setGenomicNucleotideAccessionVersion(null); } } catch (NumberFormatException e) { throw new RuntimeException(e); } return newGene; }
From source file:ubic.gemma.core.loader.genome.gene.ncbi.NcbiGeneInfoParser.java
@Override public NCBIGeneInfo parseOneLine(String line) { String[] fields = StringUtils.splitPreserveAllTokens(line, '\t'); if (fields.length != NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW) { //noinspection StatementWithEmptyBody // backwards compatibility, old format, hopefully okay if (fields.length == 13 || fields.length == 14 || fields.length == 15) { // They keep adding fields at the end...we only need the first few. } else {//from w ww . j av a 2 s . c o m throw new FileFormatException("Line + " + line + " is not in the right format: has " + fields.length + " fields, expected " + NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW); } } NCBIGeneInfo geneInfo = new NCBIGeneInfo(); try { // Skip taxa that we don't support. int taxonId = Integer.parseInt(fields[0]); if (filter && ncbiTaxonIds != null) { if (!ncbiTaxonIds.contains(taxonId)) { return null; } } // See ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/README // #Format: // tax_id // GeneID // Symbol // LocusTag // Synonyms // dbXrefs, separated by | // chromosome // map_location // description // type_of_gene // Symbol_from_nomenclature_authority // Full_name_from_nomenclature_authority // Nomenclature_status // Other_designations // Modification_date // Feature type geneInfo.setTaxId(taxonId); geneInfo.setGeneId(fields[1]); geneInfo.setDefaultSymbol(fields[2]); geneInfo.setLocusTag(fields[3]); String[] synonyms = StringUtils.splitPreserveAllTokens(fields[4], '|'); for (String synonym : synonyms) { if (synonym.equals("-")) continue; geneInfo.addToSynonyms(synonym); } if (!fields[5].equals("-")) { String[] dbXRefs = StringUtils.splitPreserveAllTokens(fields[5], '|'); for (String dbXr : dbXRefs) { String[] dbF = StringUtils.split(dbXr, ':'); if (dbF.length != 2) { /* * Annoyingly, HGCN identifiers now have the format HGNC:X where X is an integer. This is * apparent from downloading files from HGCN (http://www.genenames.org/cgi-bin/statistics). Same * situation for MGI * * Therefore we have a special case. 
*/ if (dbF.length == 3 && (dbF[1].equals("HGNC") || dbF[1].equals("MGI"))) { dbF[1] = dbF[1] + ":" + dbF[2]; } else { // we're very stringent to avoid data corruption. throw new FileFormatException( "Expected 2 fields, got " + dbF.length + " from '" + dbXr + "'"); } } geneInfo.addToDbXRefs(dbF[0], dbF[1]); } } geneInfo.setChromosome(fields[6]); geneInfo.setMapLocation(fields[7]); geneInfo.setDescription(fields[8]); geneInfo.setGeneType(NCBIGeneInfo.typeStringToGeneType(fields[9])); geneInfo.setSymbolIsFromAuthority(!fields[10].equals("-")); geneInfo.setNameIsFromAuthority(!fields[11].equals("-")); geneInfo.setNomenclatureStatus(fields[12].equals("-") ? NomenclatureStatus.UNKNOWN : fields[11].equals("O") ? NomenclatureStatus.OFFICIAL : NomenclatureStatus.INTERIM); // ignore 14th field for now - it stores alternate protein names // ignore 15th, modification date } catch (NumberFormatException e) { throw new FileFormatException(e); } return geneInfo; }
From source file:ubic.gemma.core.loader.genome.taxon.TaxonParser.java
@Override public Taxon parseOneLine(String line) { String[] fields = StringUtils.splitPreserveAllTokens(line, '|'); int ncbiid = Integer.parseInt(StringUtils.strip(fields[0])); if (!results.containsKey(ncbiid)) { Taxon t = Taxon.Factory.newInstance(); t.setNcbiId(ncbiid);//from w ww . ja v a 2 s .c om t.setIsGenesUsable(false); results.put(ncbiid, t); } String tag = StringUtils.strip(fields[3]); if (tag.equals("scientific name")) { results.get(ncbiid).setScientificName(StringUtils.strip(fields[1])); } else if (tag.equals("genbank common name")) { results.get(ncbiid).setCommonName(fields[1]); } return results.get(ncbiid); }