Example usage for org.apache.commons.lang StringUtils splitPreserveAllTokens

Introduction

This page collects example usages of the splitPreserveAllTokens method of org.apache.commons.lang.StringUtils.

Prototype

public static String[] splitPreserveAllTokens(String str, String separatorChars)

Source Link

Document

Splits the provided text into an array, separators specified, preserving all tokens, including empty tokens created by adjacent separators.

Usage

From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Imports an experimental design from a text stream into the given experiment.
 * <p>
 * Input lines are sorted into three groups: lines starting with
 * {@code EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR} describe experimental factors; any other line that
 * starts with '#' or is blank is ignored; the first remaining line is the tab-delimited sample header and all
 * later ones are factor value rows. After validation, the factors are added to the design, the design is
 * updated, and factor values are attached to the experiment's biomaterials, which are then updated one by one.
 *
 * @param experiment experiment whose experimental design and biomaterials are modified
 * @param is stream holding the design file; it is not closed by this method
 * @param dryRun not referenced anywhere in this method body
 * @throws IOException if the stream cannot be read or a validation step reports a format problem
 * @throws IllegalStateException if no MGED ontology service could be obtained
 */
@Override
public void importDesign(ExpressionExperiment experiment, InputStream is, boolean dryRun) throws IOException {
    this.mgedOntologyService = this.ontologyService.getMgedOntologyService();

    log.debug("Parsing input file");

    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    if (this.mgedOntologyService == null) {
        throw new IllegalStateException("Please set the MGED OntologyService, thanks.");
    }

    ExperimentalDesign design = experiment.getExperimentalDesign();
    if (!design.getExperimentalFactors().isEmpty()) {
        log.warn("Experimental design already has factors, import will add new ones");
    }
    design.setDescription("Parsed from file.");

    // Sort the file content into factor descriptions, the sample header and the data rows.
    List<String> factorDescriptionLines = new ArrayList<String>();
    List<String> dataLines = new ArrayList<String>();
    String headerLine = "";
    boolean headerSeen = false;

    for (String current = reader.readLine(); current != null; current = reader.readLine()) {
        if (current.startsWith(EXPERIMENTAL_FACTOR_DESCRIPTION_LINE_INDICATOR)) {
            factorDescriptionLines.add(current);
            continue;
        }
        if (current.startsWith("#") || StringUtils.isBlank(current)) {
            // other comments and empty lines carry no data
            continue;
        }
        if (headerSeen) {
            dataLines.add(current);
        } else {
            headerLine = current;
            headerSeen = true;
        }
    }

    String[] headerColumns = StringUtils.splitPreserveAllTokens(headerLine, "\t");
    Collection<BioMaterial> knownBioMaterials = this.bioMaterialService.findByExperiment(experiment);

    // Reject malformed input before anything is modified.
    validateFileComponents(factorDescriptionLines, headerLine, dataLines);
    validateExperimentalFactorFileContent(factorDescriptionLines, headerLine);
    validateFactorFileContent(factorDescriptionLines.size(), dataLines);
    validateBioMaterialFileContent(experiment, knownBioMaterials, dataLines);

    // Build up the composite: create the experimental factors, then add their values.
    addExperimentalFactorsToExperimentalDesign(design, factorDescriptionLines, headerColumns, dataLines);
    experimentalDesignService.update(design);

    // The biomaterial collection fetched above is used rather than relying on the order of biomaterials in
    // the bioassay set, which cannot be guaranteed.
    Collection<BioMaterial> annotatedBioMaterials = addFactorValuesToBioMaterialsInExpressionExperiment(
            experiment, knownBioMaterials, design, dataLines, headerColumns);

    for (BioMaterial annotated : annotatedBioMaterials) {
        this.bioMaterialService.update(annotated);

        // Debugging sanity check: reload and report what was stored.
        BioMaterial reloaded = this.bioMaterialService.load(annotated.getId());
        if (log.isDebugEnabled()) {
            log.debug(reloaded + ": " + reloaded.getFactorValues().size() + " factor values: "
                    + StringUtils.join(reloaded.getFactorValues(), " ; "));
        }
    }
}

From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Add the factor values to the biomaterial
 * /*from w  ww  . ja v  a2 s. c  o m*/
 * @param experiment
 * @param experimentBioMaterials Current expression experiment's biomaterials.
 * @param experimentalDesign experimental design
 * @param factorValueLines Lines from file containing factor values and biomaterial ids
 * @param headerFields
 * @return Collection of biomaterials associated with this experiment, this is returned as the biomaterial is in a
 *         bioassay (first one retrieved)
 */
private Collection<BioMaterial> addFactorValuesToBioMaterialsInExpressionExperiment(
        ExpressionExperiment experiment, Collection<BioMaterial> experimentBioMaterials,
        ExperimentalDesign experimentalDesign, List<String> factorValueLines, String[] headerFields) {
    log.debug("Adding factors values to biomaterials: " + experimentalDesign.getId());
    Collection<ExperimentalFactor> experimentalFactorsInExperiment = experimentalDesign
            .getExperimentalFactors();
    Collection<BioMaterial> biomaterialsWithFactorValuesInExperiment = new HashSet<BioMaterial>();

    Collection<BioMaterial> seenBioMaterials = new HashSet<BioMaterial>();

    Map<ExperimentalFactor, Collection<BioMaterial>> factorsAssociatedWithBioMaterials = new HashMap<ExperimentalFactor, Collection<BioMaterial>>();

    for (String factorValueLine : factorValueLines) {
        String[] factorValueFields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t");

        String externalId = null;
        boolean hasExternalId = headerFields[1].toUpperCase().equals("EXTERNALID");
        if (hasExternalId) {
            externalId = factorValueFields[1];
        }
        BioMaterial currentBioMaterial = getBioMaterialFromExpressionExperiment(experiment,
                experimentBioMaterials, factorValueFields[0], externalId);

        if (currentBioMaterial == null) {
            throw new IllegalStateException("No biomaterial for " + factorValueFields[0]);
        }

        if (seenBioMaterials.contains(currentBioMaterial)) {
            throw new IllegalArgumentException(
                    "A biomaterial occurred more than once in the file: " + currentBioMaterial);
        }

        seenBioMaterials.add(currentBioMaterial);

        int start = 1;
        if (hasExternalId) {
            start = 2;
        }

        for (int i = start; i < factorValueFields.length; i++) {
            ExperimentalFactor currentExperimentalFactor = null;
            String currentExperimentalFactorName = StringUtils.strip(headerFields[i]);

            FactorValue currentFactorValue = null;
            String currentFactorValueValue = StringUtils.strip(factorValueFields[i]);

            if (StringUtils.isBlank(currentFactorValueValue)) {
                // Missing value. Note that catching 'NA' etc. is hard, because they could be valid strings.
                continue;
            }

            for (ExperimentalFactor experimentalFactor : experimentalFactorsInExperiment) {
                if (experimentalFactor.getName().equals(currentExperimentalFactorName)) {
                    currentExperimentalFactor = experimentalFactor;
                }
            }

            if (currentExperimentalFactor == null)
                throw new IllegalStateException("No factor matches column " + currentExperimentalFactorName);

            Collection<FactorValue> factorValuesInCurrentExperimentalFactor = currentExperimentalFactor
                    .getFactorValues();

            for (FactorValue factorValue : factorValuesInCurrentExperimentalFactor) {
                if (factorValue.getValue().trim().equalsIgnoreCase(currentFactorValueValue.trim())) {
                    currentFactorValue = factorValue;
                }
            }

            if (currentFactorValue == null) {
                log.error("Current factor value not found " + currentExperimentalFactor
                        + currentFactorValueValue);
            } else {
                if (!checkForDuplicateFactorOnBioMaterial(currentBioMaterial, currentFactorValue)) {
                    currentBioMaterial.getFactorValues().add(currentFactorValue);
                } else {
                    // already got warned.
                }
            }
            log.debug("Added factor value " + currentFactorValue + " to biomaterial " + currentBioMaterial);
            biomaterialsWithFactorValuesInExperiment.add(currentBioMaterial);

            if (!factorsAssociatedWithBioMaterials.containsKey(currentExperimentalFactor)) {
                factorsAssociatedWithBioMaterials.put(currentExperimentalFactor, new HashSet<BioMaterial>());
            }
            factorsAssociatedWithBioMaterials.get(currentExperimentalFactor).add(currentBioMaterial);

        }

    }

    /*
     * Check if every biomaterial got used. Worth a warning, at least.
     */
    for (ExperimentalFactor ef : factorsAssociatedWithBioMaterials.keySet()) {
        if (!factorsAssociatedWithBioMaterials.get(ef).containsAll(experimentBioMaterials)) {
            log.warn(
                    "File did not contain values for all factor - biomaterial combinations: Missing at least one for "
                            + ef + " [populated " + factorsAssociatedWithBioMaterials.get(ef).size() + "/"
                            + experimentBioMaterials.size() + " ]");
        }
    }

    return biomaterialsWithFactorValuesInExperiment;
}

From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Collects, for every header column after the first, the distinct values found in that column.
 * <p>
 * Header names and cell values are stripped of surrounding whitespace before use. Every row is expected to
 * have at least as many tab-separated fields as the header.
 *
 * @param headerFields tab-split header line; entries from index 1 onward become map keys
 * @param factorValueLines tab-delimited data rows
 * @return map from stripped column name to the set of stripped values seen in that column
 */
private Map<String, Set<String>> getMapFactorSampleValues(String[] headerFields,
        List<String> factorValueLines) {
    Map<String, Set<String>> valuesByColumn = new HashMap<String, Set<String>>();

    for (String row : factorValueLines) {
        String[] cells = StringUtils.splitPreserveAllTokens(row, "\t");

        int column = 1;
        while (column < headerFields.length) {
            String columnName = StringUtils.strip(headerFields[column]);
            String cell = StringUtils.strip(cells[column]);

            // lazily create the bucket for a column the first time it is seen
            if (!valuesByColumn.containsKey(columnName)) {
                valuesByColumn.put(columnName, new HashSet<String>());
            }
            valuesByColumn.get(columnName).add(cell);

            column++;
        }
    }

    return valuesByColumn;
}

From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Check that the biomaterial is in the file and in the experiment. It is arguable whether this should be an
 * exception. I think it has to be to make sure that simple errors in the format are caught. But it's inconvenient
 * for cases where a single 'design' file is to be used for multiple microarray studies. Biomaterial ids should
 * match what is stored/*  www.ja va  2  s  .c  o m*/
 * 
 * @param experiment Current experiment
 * @param factorValueLines Lines containing biomaterial names and their factor values
 */
private void validateBioMaterialFileContent(ExpressionExperiment experiment,
        Collection<BioMaterial> bioMaterials, List<String> factorValueLines) throws IllegalArgumentException {

    for (String factorValueLine : factorValueLines) {
        String[] vals = StringUtils.splitPreserveAllTokens(factorValueLine, '\t');
        if (vals.length < 2) {
            throw new IllegalArgumentException(
                    "Expected a file with at least two columns separated by tabs, got " + factorValueLine);
        }
        BioMaterial bioMaterialInFile = getBioMaterialFromExpressionExperiment(experiment, bioMaterials,
                vals[0], vals[1]);
        if (bioMaterialInFile == null) {
            throw new IllegalArgumentException(
                    "The uploaded file has a biomaterial name that does not match the study: "
                            + StringUtils.splitPreserveAllTokens(factorValueLine, "\t")[0]
                            + " (formatted based on on input: ");
        }
    }
}

From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Validates that factor values given in file for each biomaterial match the number of experimental factor values
 * expected./*  w  w  w  . j av  a 2  s.  co  m*/
 * 
 * @para numberOfExperimentalFactors
 * @param factorValueList Represents lines of file containing factor values for a biomaterial
 */
private void validateFactorFileContent(Integer numberOfExperimentalFactors, List<String> factorValueList)
        throws IOException {
    for (String factorValueLine : factorValueList) {
        String[] fields = StringUtils.splitPreserveAllTokens(factorValueLine, "\t");
        if (fields.length > numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED) {
            throw new IOException(
                    "Expected no more than " + (numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED)
                            + " columns based on EF descriptions (plus id column), got " + fields.length);
        }
        if (fields.length <= numberOfExperimentalFactors) {
            throw new IOException("Expected at least " + (numberOfExperimentalFactors + 1)
                    + " columns based on EF descriptions (plus id column), got " + fields.length);

        }
    }
}

From source file:ubic.gemma.loader.expression.simple.ExperimentalDesignImporterImpl.java

/**
 * Validates that the sample header is correctly formatted. Checks that the experimental factors defined in the
 * header match those in the experimental factor file lines.
 * /*  ww  w . j  a  va 2  s.co m*/
 * @param experimentalFactorValueNames
 * @param numberOfExperimentalFactors
 * @param sampleHeaderLine
 * @throws IOException Validation fails.
 */
private void validateSampleHeaderFileContent(Set<String> experimentalFactorValueNames,
        Integer numberOfExperimentalFactors, String sampleHeaderLine) throws IOException {
    String[] headerFields = StringUtils.splitPreserveAllTokens(sampleHeaderLine, "\t");

    // we might have the ids, and the external id.
    if (headerFields.length > numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED) {
        throw new IOException("Expected " + (numberOfExperimentalFactors + NUMBER_OF_EXTRA_COLUMNS_ALLOWED)
                + " columns based on EF descriptions (plus id column), got " + headerFields.length);
    }

    for (int i = 1; i < headerFields.length; i++) {

        String value = headerFields[i];

        value = StringUtils.strip(value);

        if (value.equals("ExternalID")) {
            // that's fine.
            continue;
        }

        if (!experimentalFactorValueNames.contains(value)) {
            throw new IOException("Expected to find an EF matching the column heading '" + value + "'");
        }

    }

}

From source file:ubic.gemma.loader.genome.FastaParser.java

/**
 * Parses a defline-style FASTA header (fields separated by '|' or ';') and fills in the sequence's name and,
 * where the format provides one, its description. The following formats are supported
 * <ul>
 * <li>GenBank: gi|gi-number|gb|accession|locus
 * <li>EMBL Data Library : gi|gi-number|emb|accession|locus
 * <li>DDBJ, DNA Database of Japan : gi|gi-number|dbj|accession|locus
 * <li>NBRF PIR : pir||entry
 * <li>Protein Research Foundation : prf||name
 * <li>SWISS-PROT : sp|accession|name
 * <li>Brookhaven Protein Data Bank (1) : pdb|entry|chain
 * <li>Brookhaven Protein Data Bank (2) : entry:chain|PDBID|CHAIN|SEQUENCE
 * <li>Patents : pat|country|number
 * <li>GenInfo Backbone Id bbs|number
 * <li>General database identifier : gnl|database|identifier
 * <li>NCBI Reference Sequence : ref|accession|locus
 * <li>Local Sequence identifier : lcl|identifier
 * <li>NIA 15k and 7k sets : H[0-9A-Z]{1-9}-\d | alternate (example: &gt;H4002F12-5 )
 * <li>Generic: probeid
 * </ul>
 * Tags without a dedicated branch (e.g. prf, pat, bbs) are handled by the generic case.
 *
 * @param bioSequence sequence object to populate
 * @param header the FASTA header line
 * @return the result of the NIA parser for NIA-style headers, otherwise true
 * @throws IllegalArgumentException if the header is null or empty, or has fewer fields than its format
 *         requires
 */
private boolean parseDeflineHeader(BioSequence bioSequence, String header) {
    // Empty tokens are preserved so that forms like "pir||entry" keep their field positions.
    String[] split = StringUtils.splitPreserveAllTokens(header, "|;");
    if (split == null || split.length == 0) {
        throw new IllegalArgumentException("FASTA header was null or empty");
    }

    String firstTag = StringUtils.removeStart(split[0], ">");

    if (firstTag.equals("gi")) {
        // one of the genbank formats.
        requireDeflineFields(split, 5, header);
        bioSequence.setDescription(split[4]);
        String genbankAcc = split[3]; // with version number, possibly
        DatabaseEntry genbank = ExternalDatabaseUtils.getGenbankAccession(genbankAcc);
        bioSequence.setName(genbank.getAccession()); // without version number.
        bioSequence.setSequenceDatabaseEntry(genbank);
    } else if (firstTag.equals("pir")) {
        requireDeflineFields(split, 2, header);
        // In the documented form "pir||entry" the second field is empty and the entry is the third one;
        // fall back to it so the name is not set to an empty string.
        if (StringUtils.isBlank(split[1]) && split.length > 2) {
            bioSequence.setName(split[2]);
        } else {
            bioSequence.setName(split[1]);
        }
    } else if (firstTag.equals("sp") || firstTag.equals("ref") || firstTag.equals("pdb")) {
        requireDeflineFields(split, 3, header);
        bioSequence.setName(split[1]);
        bioSequence.setDescription(split[2]);
    } else if (firstTag.equals("lcl") || firstTag.equals("entry:chain")) {
        requireDeflineFields(split, 2, header);
        bioSequence.setName(split[1]);
    } else if (firstTag.equals("gnl")) {
        requireDeflineFields(split, 3, header);
        bioSequence.setName(split[2]);
    } else if (firstTag.matches(NIA_HEADER_REGEX)) {
        return parseNIA(bioSequence, header);
    } else {
        // generic.
        // NOTE(review): the name comes from split[0], i.e. before any leading '>' was removed — confirm
        // that callers pass headers without it.
        bioSequence.setName(split[0]);
        if (split.length > 1) {
            bioSequence.setDescription(split[1]);
        }
    }
    return true;
}

/**
 * Fails with a descriptive error when a defline has fewer fields than its format requires.
 */
private void requireDeflineFields(String[] split, int required, String header) {
    if (split.length < required) {
        throw new IllegalArgumentException("Malformed FASTA defline, expected at least " + required
                + " fields but got " + split.length + ": " + header);
    }
}

From source file:ubic.gemma.loader.genome.gene.ExternalFileGeneLoaderServiceImpl.java

/**
 * Splits one line of the gene file into its tab-separated fields, keeping empty fields.
 *
 * @param line A line from the gene file
 * @return the fields of the line, or null if the line is blank or starts with '#'
 * @throws IOException if the line has fewer than 2 tab-separated columns
 */
private String[] readLine(String line) throws IOException {
    // blank lines and comment lines carry no data
    boolean skippable = StringUtils.isBlank(line) || line.startsWith("#");
    if (skippable) {
        return null;
    }

    String[] columns = StringUtils.splitPreserveAllTokens(line, '\t');
    if (columns.length >= 2) {
        return columns;
    }
    throw new IOException("Illegal format, expected at least 2 columns, got " + columns.length);
}

From source file:ubic.gemma.loader.genome.gene.ncbi.homology.HomologeneServiceImpl.java

/**
 * @param is/*w w  w .jav a  2s  .c  om*/
 * @throws IOException
 */
protected void parseHomologGeneFile(InputStream is) throws IOException {

    BufferedReader br = new BufferedReader(new InputStreamReader(is));
    String line = null;

    while ((line = br.readLine()) != null) {

        if (StringUtils.isBlank(line) || line.startsWith(COMMENT_CHARACTER)) {
            continue;
        }
        String[] fields = StringUtils.splitPreserveAllTokens(line, DELIMITING_CHARACTER);

        Integer taxonId = Integer.parseInt(fields[1]);
        Long groupId;
        Long geneId;
        try {
            groupId = Long.parseLong(fields[0]);
            geneId = Long.parseLong(fields[2]);
        } catch (NumberFormatException e) {
            log.warn("Unparseable line from homologene: " + line);
            continue;
        }
        String geneSymbol = fields[3];

        if (!group2Gene.containsKey(groupId)) {
            group2Gene.put(groupId, new ArrayList<Long>());
        }
        group2Gene.get(groupId).add(geneId);

        if (!gene2Group.containsKey(geneId)) {
            gene2Group.put(geneId, groupId);
        } else {
            log.warn("Duplicate gene ID encountered.  Skipping: geneID=" + geneId + " ,taxonID = " + taxonId
                    + " ,geneSymbol = " + geneSymbol);
        }
    }
    ready.set(true);
    log.info("Gene Homology successfully loaded: " + gene2Group.keySet().size() + " genes covered in "
            + group2Gene.keySet().size() + " groups");

}

From source file:ubic.gemma.loader.genome.gene.ncbi.NcbiGene2AccessionParser.java

/**
 * Parses one tab-delimited gene2accession line and adds the accession to the gene currently being assembled.
 * When the gene id differs from that of the previous accepted line, the data collected so far is put on the
 * queue and a new collection is started.
 *
 * @param line one line of the gene2accession file
 * @return the parsed accession, or null if the fields could not be processed or the gene is not among the
 *         genes of interest
 * @throws IllegalArgumentException if the line does not have the expected number of fields
 * @throws RuntimeException if the thread is interrupted while waiting to put data on the queue
 */
@Override
public NCBIGene2Accession parseOneLine(String line) {
    String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

    if (fields.length != NCBI_GENE2ACCESSION_FIELDS_PER_ROW) {
        throw new IllegalArgumentException("Line is not in the right format: has " + fields.length
                + " fields, expected " + NCBI_GENE2ACCESSION_FIELDS_PER_ROW);
    }

    NCBIGene2Accession currentAccession = processFields(fields);

    if (currentAccession == null) {
        return null;
    }

    addResult(currentAccession); // really doesn't serve much of a purpose

    /*
     * Only some genes are relevant - for example, we might have filtered them by taxon.
     */
    if (geneInfo != null && !geneInfo.containsKey(currentAccession.getGeneId())) {
        return null;
    }

    // if the current gene Id is different from this current one, then
    // we are done with the gene Id. Push the geneCollection into the queue.
    if (lastGeneId != null && !lastGeneId.equalsIgnoreCase(currentAccession.getGeneId())) {
        // push the gene set to the queue
        try {
            queue.put(geneData);
        } catch (InterruptedException e) {
            // restore the interrupt flag so code further up the stack can still observe it
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        // clear the gene set
        geneData = new NcbiGeneData();
        if (geneInfo != null) {
            geneInfo.remove(lastGeneId);
        }
    }

    assert currentAccession.getGeneId() != null;

    // we're either starting a new one, or continuing with an old one.
    lastGeneId = currentAccession.getGeneId();
    geneData.addAccession(currentAccession);
    // geneInfo is treated as optional above, so guard here too instead of failing with a null pointer.
    if (geneInfo != null) {
        geneData.setGeneInfo(geneInfo.get(currentAccession.getGeneId()));
    }

    // this will be a trailing accession.?
    return currentAccession;
}