Example usage for org.apache.commons.lang3 StringUtils splitPreserveAllTokens

List of usage examples for org.apache.commons.lang3 StringUtils splitPreserveAllTokens

Introduction

On this page you can find example usages of org.apache.commons.lang3 StringUtils splitPreserveAllTokens.

Prototype

public static String[] splitPreserveAllTokens(final String str, final String separatorChars) 

Source Link

Document

Splits the provided text into an array, separators specified, preserving all tokens, including empty tokens created by adjacent separators.

Usage

From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Verifies that every biomaterial referenced in the design file can be matched to a biomaterial of the
 * experiment. A mismatch is treated as an error so that simple formatting mistakes are caught, at the cost of
 * not being able to reuse a single 'design' file for several microarray studies. Biomaterial ids should match
 * what is stored.
 *
 * @param  bioMaterials             biomaterials that the file content is matched against
 * @param  factorValueLines         lines containing biomaterial names and their factor values
 * @throws IllegalArgumentException if a line has fewer than two tab-separated columns, or if its first two
 *                                  columns do not identify a known biomaterial
 */
private void validateBioMaterialFileContent(Collection<BioMaterial> bioMaterials, List<String> factorValueLines)
        throws IllegalArgumentException {

    for (String row : factorValueLines) {
        String[] columns = StringUtils.splitPreserveAllTokens(row, '\t');
        if (columns.length < 2) {
            throw new IllegalArgumentException(
                    "Expected a file with at least two columns separated by tabs, got " + row);
        }

        String firstColumn = columns[0];
        String secondColumn = columns[1];
        BioMaterial match = this.getBioMaterialFromExpressionExperiment(bioMaterials, firstColumn,
                secondColumn);
        if (match != null) {
            continue;
        }

        // An unmatched row might just be an "extra", but the check is intentionally strict.
        throw new IllegalArgumentException(
                "The uploaded file has a biomaterial name/ID that does not match the study: " + firstColumn
                        + ", " + secondColumn);
    }
}

From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Checks the column count of every factor value line: a line must have more columns than there are
 * experimental factors (the id column comes on top), and no more than the number of factors plus
 * {@code NUMBER_OF_EXTRA_COLUMNS_ALLOWED}.
 *
 * @param  numberOfExperimentalFactors number of experimental factors
 * @param  factorValueList             lines of the file containing factor values for a biomaterial
 * @throws IOException                 if a line has too many or too few tab-separated columns
 */
private void validateFactorFileContent(Integer numberOfExperimentalFactors, List<String> factorValueList)
        throws IOException {
    for (String row : factorValueList) {
        int columnCount = StringUtils.splitPreserveAllTokens(row, "\t").length;
        int factorCount = numberOfExperimentalFactors;

        // Upper bound first, then lower bound, so the reported problem matches the original ordering.
        if (columnCount > factorCount + ExperimentalDesignImporterImpl.NUMBER_OF_EXTRA_COLUMNS_ALLOWED) {
            throw new IOException("Expected no more than "
                    + (factorCount + ExperimentalDesignImporterImpl.NUMBER_OF_EXTRA_COLUMNS_ALLOWED)
                    + " columns based on EF descriptions (plus id column), got " + columnCount);
        } else if (factorCount >= columnCount) {
            throw new IOException("Expected at least " + (factorCount + 1)
                    + " columns based on EF descriptions (plus id column), got " + columnCount);
        }
    }
}

From source file:ubic.gemma.core.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Checks the format of the sample header line. The header may not have more columns than the number of
 * experimental factors plus {@code NUMBER_OF_EXTRA_COLUMNS_ALLOWED}, and every column heading after the first
 * must either be {@code ExternalID} or name a known experimental factor.
 *
 * @param  experimentalFactorValueNames experimental factor value names
 * @param  numberOfExperimentalFactors  number of EFs
 * @param  sampleHeaderLine             sample header line
 * @throws IOException                  Validation fails.
 */
private void validateSampleHeaderFileContent(Set<String> experimentalFactorValueNames,
        Integer numberOfExperimentalFactors, String sampleHeaderLine) throws IOException {
    String[] headings = StringUtils.splitPreserveAllTokens(sampleHeaderLine, "\t");

    // Besides the factors there may be the ids, and the external id.
    if (headings.length > numberOfExperimentalFactors
            + ExperimentalDesignImporterImpl.NUMBER_OF_EXTRA_COLUMNS_ALLOWED) {
        throw new IOException("Expected "
                + (numberOfExperimentalFactors + ExperimentalDesignImporterImpl.NUMBER_OF_EXTRA_COLUMNS_ALLOWED)
                + " columns based on EF descriptions (plus id column), got " + headings.length);
    }

    // Column 0 is skipped; only the headings that follow it are checked.
    for (int column = 1; column < headings.length; column++) {
        String heading = StringUtils.strip(headings[column]);

        // An external id column is always acceptable.
        boolean isExternalId = heading.equals("ExternalID");
        if (!isExternalId && !experimentalFactorValueNames.contains(heading)) {
            throw new IOException("Expected to find an EF matching the column heading '" + heading + "'");
        }
    }
}

From source file:ubic.gemma.core.loader.genome.FastaParser.java

/**
 * Fills in the name (and, where available, the description) of a sequence from a defline-style FASTA header.
 * The header is split on {@code |} and {@code ;} with empty fields preserved, and the first field (minus a
 * leading {@code >}) selects the format. The following formats are supported
 * <ul>
 * <li>GenBank: gi|gi-number|gb|accession|locus
 * <li>EMBL Data Library : gi|gi-number|emb|accession|locus
 * <li>DDBJ, DNA Database of Japan : gi|gi-number|dbj|accession|locus
 * <li>NBRF PIR : pir||entry
 * <li>Protein Research Foundation : prf||name
 * <li>SWISS-PROT : sp|accession|name
 * <li>Brookhaven Protein Data Bank (1) : pdb|entry|chain
 * <li>Brookhaven Protein Data Bank (2) : entry:chain|PDBID|CHAIN|SEQUENCE
 * <li>Patents : pat|country|number
 * <li>GenInfo Backbone Id bbs|number
 * <li>General database identifier : gnl|database|identifier
 * <li>NCBI Reference Sequence : ref|accession|locus
 * <li>Local Sequence identifier : lcl|identifier
 * <li>NIA 15k and 7k sets : H[0-9A-Z]{1-9}-\d | alternate (example: &gt;H4002F12-5 )
 * <li>Generic: probeid
 * </ul>
 * Tags without a dedicated branch below (for example {@code prf}, {@code pat}, {@code bbs}) are handled by the
 * generic case: the first field becomes the name and the second, if present, the description.
 *
 * @param  bioSequence sequence to populate
 * @param  header      the FASTA header line
 * @return             the result of the NIA parser for NIA-style headers, {@code true} otherwise
 */
private boolean parseDeflineHeader(BioSequence bioSequence, String header) {
    // Empty fields are preserved, so "a||b" yields three tokens with an empty one in the middle.
    String[] split = StringUtils.splitPreserveAllTokens(header, "|;");

    String firstTag = StringUtils.removeStart(split[0], ">");

    if (firstTag.equals("gi")) {
        // one of the genbank formats.
        bioSequence.setDescription(split[4]);
        String genbankAcc = split[3]; // with version number, possibly
        DatabaseEntry genbank = ExternalDatabaseUtils.getGenbankAccession(genbankAcc);
        bioSequence.setName(genbank.getAccession()); // without version number.
        bioSequence.setSequenceDatabaseEntry(genbank);
    } else if (firstTag.equals("pir")) {
        // Documented form is "pir||entry": the preserved empty field sits at index 1 and the entry at index 2.
        // Headers with a single bar ("pir|entry") keep using index 1.
        boolean entryIsThirdField = split.length > 2 && StringUtils.isEmpty(split[1]);
        bioSequence.setName(entryIsThirdField ? split[2] : split[1]);
    } else if (firstTag.equals("sp")) {
        bioSequence.setName(split[1]);
        bioSequence.setDescription(split[2]);
    } else if (firstTag.equals("ref")) {
        bioSequence.setName(split[1]);
        bioSequence.setDescription(split[2]);
    } else if (firstTag.equals("lcl")) {
        bioSequence.setName(split[1]);
    } else if (firstTag.equals("pdb")) {
        bioSequence.setName(split[1]);
        bioSequence.setDescription(split[2]);
    } else if (firstTag.equals("gnl")) {
        bioSequence.setName(split[2]);
    } else if (firstTag.equals("entry:chain")) {
        bioSequence.setName(split[1]);
    } else if (firstTag.matches(FastaParser.NIA_HEADER_REGEX)) {
        return this.parseNIA(bioSequence, header);
    } else {
        // generic: unrecognized formats are accepted rather than rejected.
        bioSequence.setName(split[0]);
        if (split.length > 1) {
            bioSequence.setDescription(split[1]);
        }
    }
    return true;
}

From source file:ubic.gemma.core.loader.genome.gene.ExternalFileGeneLoaderServiceImpl.java

/**
 * Splits one line of the gene file into its tab-separated fields, keeping empty fields.
 *
 * @param  line        A line from the gene file
 * @return             the fields of the line, or {@code null} if the line is blank or starts with {@code #}
 * @throws IOException if the line has fewer than 2 tab-separated columns
 */
private String[] readLine(String line) throws IOException {
    // Blank lines and '#' comment lines carry no data.
    boolean skippable = StringUtils.isBlank(line) || line.startsWith("#");
    if (skippable) {
        return null;
    }

    String[] columns = StringUtils.splitPreserveAllTokens(line, '\t');
    if (columns.length >= 2) {
        return columns;
    }
    throw new IOException("Illegal format, expected at least 2 columns, got " + columns.length);
}

From source file:ubic.gemma.core.loader.genome.gene.ncbi.homology.HomologeneServiceImpl.java

/**
 * Reads a Homologene file line by line and records, for each data line, the gene in {@code group2Gene} and
 * {@code gene2Group}. Blank lines and comment lines are skipped. Lines that have fewer than four fields, or
 * whose group, taxon or gene id is not numeric, are logged and skipped. Duplicate gene ids are logged and not
 * added again. When the whole stream has been read the service is flagged as ready.
 *
 * @param  is          stream to read the Homologene data from; it is not closed by this method
 * @throws IOException if reading from the stream fails
 */
@Override
public void parseHomologeneFile(InputStream is) throws IOException {

    BufferedReader br = new BufferedReader(new InputStreamReader(is));
    String line;

    while ((line = br.readLine()) != null) {

        if (StringUtils.isBlank(line) || line.startsWith(HomologeneServiceImpl.COMMENT_CHARACTER)) {
            continue;
        }
        String[] fields = StringUtils.splitPreserveAllTokens(line, HomologeneServiceImpl.DELIMITING_CHARACTER);

        // Fields 0-3 (group id, taxon id, gene id, gene symbol) are all read below.
        if (fields.length < 4) {
            HomologeneServiceImpl.log.warn("Unparseable line from homologene: " + line);
            continue;
        }

        Integer taxonId;
        Long groupId;
        Long geneId;
        try {
            // All three ids are parsed together so that any malformed number skips just this line.
            groupId = Long.parseLong(fields[0]);
            taxonId = Integer.parseInt(fields[1]);
            geneId = Long.parseLong(fields[2]);
        } catch (NumberFormatException e) {
            HomologeneServiceImpl.log.warn("Unparseable line from homologene: " + line);
            continue;
        }
        String geneSymbol = fields[3];

        if (!group2Gene.containsKey(groupId)) {
            group2Gene.put(groupId, new ArrayList<Long>());
        }
        if (!group2Gene.get(groupId).contains(geneId)) {
            group2Gene.get(groupId).add(geneId);
        } else {
            HomologeneServiceImpl.log.warn(
                    "Duplicate gene ID encountered (group2Gene).  Skipping: geneID=" + geneId + " , taxonID = "
                            + taxonId + " , geneSymbol = " + geneSymbol + " for group " + groupId);
        }

        if (!gene2Group.containsKey(geneId)) {
            gene2Group.put(geneId, groupId);
        } else {
            HomologeneServiceImpl.log.warn(
                    "Duplicate gene ID encountered (gene2Group).  Skipping: geneID=" + geneId + " , taxonID = "
                            + taxonId + " , geneSymbol = " + geneSymbol + " for group " + groupId);
        }
    }
    ready.set(true);
    HomologeneServiceImpl.log.info("Gene Homology successfully loaded: " + gene2Group.keySet().size()
            + " genes covered in " + group2Gene.keySet().size() + " groups");

}

From source file:ubic.gemma.core.loader.genome.gene.ncbi.NcbiGene2AccessionParser.java

/**
 * Parses one tab-separated gene2accession line and adds the accession to the gene data being accumulated.
 * When the gene id differs from the one seen on the previous accepted line, the accumulated gene data is put on
 * the queue and a fresh accumulator is started.
 *
 * @param  line                     one line of the gene2accession file
 * @return                          the parsed accession, or {@code null} if the line was skipped by
 *                                  {@code processFields} or its gene is not present in {@code geneInfo}
 * @throws IllegalArgumentException if the line has fewer fields than expected
 */
@Override
public NCBIGene2Accession parseOneLine(String line) {
    String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

    if (fields.length < NcbiGene2AccessionParser.NCBI_GENE2ACCESSION_FIELDS_PER_ROW) {
        throw new IllegalArgumentException("Line is not in the right format: has " + fields.length
                + " fields, expected " + NcbiGene2AccessionParser.NCBI_GENE2ACCESSION_FIELDS_PER_ROW);
    }

    NCBIGene2Accession currentAccession = this.processFields(fields);

    if (currentAccession == null) {
        return null;
    }

    this.addResult(currentAccession); // really doesn't serve much of a purpose

    /*
     * Only some genes are relevant - for example, we might have filtered them by taxon.
     */
    if (geneInfo != null && !geneInfo.containsKey(currentAccession.getGeneId())) {
        return null;
    }

    // if the current gene Id is different from this current one, then
    // we are done with the gene Id. Push the geneCollection into the queue.
    if (lastGeneId != null && !lastGeneId.equalsIgnoreCase(currentAccession.getGeneId())) {
        // push the gene set to the queue
        try {
            queue.put(geneData);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        // clear the gene set
        geneData = new NcbiGeneData();
        if (geneInfo != null) {
            geneInfo.remove(lastGeneId);
        }
    }

    assert currentAccession.getGeneId() != null;

    // we're either starting a new one, or continuing with an old one.
    lastGeneId = currentAccession.getGeneId();
    geneData.addAccession(currentAccession);
    // geneInfo is treated as optional by the checks above, so it must not be dereferenced blindly here.
    if (geneInfo != null) {
        geneData.setGeneInfo(geneInfo.get(currentAccession.getGeneId()));
    }

    // this will be a trailing accession.?
    return currentAccession;
}

From source file:ubic.gemma.core.loader.genome.gene.ncbi.NcbiGene2AccessionParser.java

/**
 * Builds an accession record from the fields of one gene2accession line.
 * <p>
 * Lines whose assembly field starts with "Alternate assembly" are skipped. Until the configured starting gene
 * id has been seen, all other lines are skipped as well. A "-" in a field is stored as {@code null} (for
 * orientation the placeholder is "?"). RNA, protein and genomic accessions are split into accession and
 * version.
 *
 * @param  fields                        the tab-separated fields of the line; index 12 must exist
 * @return                               the populated record, or {@code null} if the line is skipped
 * @throws RuntimeException              wrapping a {@link NumberFormatException} if a numeric field is
 *                                       malformed
 * @throws UnsupportedOperationException if an accession contains more than one dot
 */
private NCBIGene2Accession processFields(String[] fields) {
    NCBIGene2Accession newGene = new NCBIGene2Accession();
    try {

        /*
         * Skip lines that refer to locations in non-reference assemblies.
         */
        if (fields[12].startsWith("Alternate assembly")) {
            return null;
        }

        newGene.setGeneId(fields[1]);

        if (!hasStarted) {
            assert startingNcbiId != null;
            if (startingNcbiId.equals(Integer.parseInt(fields[1]))) {
                log.info("Found the starting gene " + startingNcbiId);
                hasStarted = true;
            } else {
                return null;
            }
        }

        // #Format:
        // tax_id 0
        // GeneID 1
        // status 2
        // RNA_nucleotide_accession.version 3
        // RNA_nucleotide_gi 4
        // protein_accession.version 5
        // protein_gi 6
        // genomic_nucleotide_accession.version 7
        // genomic_nucleotide_gi 8
        // start_position_on_the_genomic_accession 9
        // end_position_on_the_genomic_accession 10
        // orientation 11
        // assembly 12
        // mature_peptide_accession.version 13
        // mature_peptide_gi 14
        // Symbol 15

        newGene.setTaxId(Integer.parseInt(fields[0]));
        newGene.setStatus(fields[2].equals("-") ? null : fields[2]);
        newGene.setRnaNucleotideAccession(fields[3].equals("-") ? null : fields[3]);
        newGene.setRnaNucleotideGI(fields[4].equals("-") ? null : fields[4]);
        newGene.setProteinAccession(fields[5].equals("-") ? null : fields[5]);
        newGene.setProteinGI(fields[6].equals("-") ? null : fields[6]);
        newGene.setGenomicNucleotideAccession(fields[7].equals("-") ? null : fields[7]);
        newGene.setGenomicNucleotideGI(fields[8].equals("-") ? null : fields[8]);
        newGene.setStartPosition(fields[9].equals("-") ? null : Long.parseLong(fields[9]));
        newGene.setEndPosition(fields[10].equals("-") ? null : Long.parseLong(fields[10]));
        newGene.setOrientation(fields[11].equals("?") ? null : fields[11]);

        // set accession version numbers (additional parsing)
        // RNA
        String[] rna = splitAccessionVersion(newGene.getRnaNucleotideAccession());
        newGene.setRnaNucleotideAccession(rna[0]);
        newGene.setRnaNucleotideAccessionVersion(rna[1]);

        // protein
        String[] protein = splitAccessionVersion(newGene.getProteinAccession());
        newGene.setProteinAccession(protein[0]);
        newGene.setProteinAccessionVersion(protein[1]);

        // Genome (chromosome information)
        String[] genomic = splitAccessionVersion(newGene.getGenomicNucleotideAccession());
        newGene.setGenomicNucleotideAccession(genomic[0]);
        newGene.setGenomicNucleotideAccessionVersion(genomic[1]);

    } catch (NumberFormatException e) {
        throw new RuntimeException(e);
    }
    return newGene;
}

/**
 * Splits an "accession.version" string into its two parts. The assumption is that the string is delimited by
 * a dot and has at most one dot with one version number (ie GS001.1, not GS001.1.1).
 *
 * @param  accession                     accession, possibly with a version suffix; may be {@code null} or blank
 * @return                               a two-element array: the accession without version, and the version
 *                                       ({@code null} if absent); both elements are {@code null} for a blank
 *                                       input
 * @throws UnsupportedOperationException if the accession contains more than one dot
 */
private static String[] splitAccessionVersion(String accession) {
    if (StringUtils.isBlank(accession)) {
        // A blank accession is cleared along with its version. The earlier code set the version to null
        // twice here, which left a blank accession string in place.
        return new String[] { null, null };
    }

    String[] tokens = StringUtils.splitPreserveAllTokens(accession, '.');
    switch (tokens.length) {
    case 1:
        return new String[] { tokens[0], null };
    case 2:
        return tokens;
    default:
        throw new UnsupportedOperationException("Don't know how to deal with " + accession);
    }
}

From source file:ubic.gemma.core.loader.genome.gene.ncbi.NcbiGeneInfoParser.java

/**
 * Parses one tab-separated line of the NCBI gene_info file.
 * <p>
 * Besides the expected number of fields, lines with 13, 14 or 15 fields are accepted (older formats); only the
 * first 13 fields are read.
 *
 * @param  line                one line of the gene_info file
 * @return                     the parsed gene info, or {@code null} if taxon filtering is enabled and the
 *                             line's taxon is not among the supported ones
 * @throws FileFormatException if the field count is not accepted, a dbXref does not have the expected
 *                             structure, or the taxon id is not numeric
 */
@Override
public NCBIGeneInfo parseOneLine(String line) {
    String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

    // They keep adding fields at the end...we only need the first few, so older/shorter layouts are tolerated.
    boolean legacyFieldCount = fields.length == 13 || fields.length == 14 || fields.length == 15;
    if (fields.length != NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW && !legacyFieldCount) {
        throw new FileFormatException("Line + " + line + " is not in the right format: has " + fields.length
                + " fields, expected " + NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW);
    }

    NCBIGeneInfo geneInfo = new NCBIGeneInfo();
    try {

        // Skip taxa that we don't support.
        int taxonId = Integer.parseInt(fields[0]);
        if (filter && ncbiTaxonIds != null) {
            if (!ncbiTaxonIds.contains(taxonId)) {
                return null;
            }
        }

        // See ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/README
        // #Format (field index):
        // tax_id 0
        // GeneID 1
        // Symbol 2
        // LocusTag 3
        // Synonyms 4
        // dbXrefs, separated by | 5
        // chromosome 6
        // map_location 7
        // description 8
        // type_of_gene 9
        // Symbol_from_nomenclature_authority 10
        // Full_name_from_nomenclature_authority 11
        // Nomenclature_status 12
        // Other_designations 13
        // Modification_date 14
        // Feature type 15

        geneInfo.setTaxId(taxonId);
        geneInfo.setGeneId(fields[1]);
        geneInfo.setDefaultSymbol(fields[2]);
        geneInfo.setLocusTag(fields[3]);
        String[] synonyms = StringUtils.splitPreserveAllTokens(fields[4], '|');
        for (String synonym : synonyms) {
            if (synonym.equals("-")) {
                continue;
            }
            geneInfo.addToSynonyms(synonym);
        }

        if (!fields[5].equals("-")) {
            String[] dbXRefs = StringUtils.splitPreserveAllTokens(fields[5], '|');
            for (String dbXr : dbXRefs) {
                String[] dbF = StringUtils.split(dbXr, ':');
                if (dbF.length != 2) {
                    /*
                     * Annoyingly, HGCN identifiers now have the format HGNC:X where X is an integer. This is
                     * apparent from downloading files from HGCN (http://www.genenames.org/cgi-bin/statistics). Same
                     * situation for MGI
                     *
                     * Therefore we have a special case.
                     */
                    if (dbF.length == 3 && (dbF[1].equals("HGNC") || dbF[1].equals("MGI"))) {
                        dbF[1] = dbF[1] + ":" + dbF[2];
                    } else {
                        // we're very stringent to avoid data corruption.
                        throw new FileFormatException(
                                "Expected 2 fields, got " + dbF.length + " from '" + dbXr + "'");
                    }
                }
                geneInfo.addToDbXRefs(dbF[0], dbF[1]);
            }
        }

        geneInfo.setChromosome(fields[6]);
        geneInfo.setMapLocation(fields[7]);
        geneInfo.setDescription(fields[8]);
        geneInfo.setGeneType(NCBIGeneInfo.typeStringToGeneType(fields[9]));
        geneInfo.setSymbolIsFromAuthority(!fields[10].equals("-"));
        geneInfo.setNameIsFromAuthority(!fields[11].equals("-"));
        // The status lives in field 12; field 11 is the full name and must not be consulted for "O".
        String nomenclatureStatus = fields[12];
        geneInfo.setNomenclatureStatus(nomenclatureStatus.equals("-") ? NomenclatureStatus.UNKNOWN
                : nomenclatureStatus.equals("O") ? NomenclatureStatus.OFFICIAL : NomenclatureStatus.INTERIM);
        // ignore 14th field for now - it stores alternate protein names
        // ignore 15th, modification date
    } catch (NumberFormatException e) {
        throw new FileFormatException(e);
    }
    return geneInfo;
}

From source file:ubic.gemma.core.loader.genome.taxon.TaxonParser.java

/**
 * Parses one {@code |}-separated line of taxon names and updates the taxon with that NCBI id in
 * {@code results}, creating it (with genes marked as not usable) the first time the id is seen. A line tagged
 * "scientific name" sets the scientific name; a line tagged "genbank common name" sets the common name; lines
 * with any other tag leave the names untouched.
 *
 * @param  line                  one line of the names file; the id is read from field 0, the name from field 1
 *                               and the tag from field 3
 * @return                       the taxon for the id on this line
 * @throws NumberFormatException if the first field is not an integer
 */
@Override
public Taxon parseOneLine(String line) {
    String[] fields = StringUtils.splitPreserveAllTokens(line, '|');

    int ncbiid = Integer.parseInt(StringUtils.strip(fields[0]));

    if (!results.containsKey(ncbiid)) {
        Taxon t = Taxon.Factory.newInstance();
        t.setNcbiId(ncbiid);
        t.setIsGenesUsable(false);
        results.put(ncbiid, t);
    }

    // Fields are stripped before use; the tag comparison below only matches once padding is removed.
    String tag = StringUtils.strip(fields[3]);
    String name = StringUtils.strip(fields[1]);
    if (tag.equals("scientific name")) {
        results.get(ncbiid).setScientificName(name);
    } else if (tag.equals("genbank common name")) {
        // Stripped like the scientific name, so the common name is not stored with its padding.
        results.get(ncbiid).setCommonName(name);
    }

    return results.get(ncbiid);

}