Example usage for org.apache.commons.lang StringUtils splitPreserveAllTokens

Introduction

In this page you can find the example usage for org.apache.commons.lang StringUtils splitPreserveAllTokens.

Prototype

public static String[] splitPreserveAllTokens(String str, String separatorChars)

Source Link

Document

Splits the provided text into an array, separators specified, preserving all tokens, including empty tokens created by adjacent separators.

Usage

From source file:tv.icntv.grade.film.utils.TableUtil.java

/**
 * Rebuilds a URL from its reversed form.
 * <p>
 * The part of {@code reversedUrl} before the first '/' is split on ':'. Token 0 is handed to
 * {@code reverseAppendSplits} as the reversed host, token 1 is emitted as the protocol and, only when
 * exactly three tokens are present, token 2 is emitted as the port. Everything from the first '/'
 * onwards is appended unchanged.
 *
 * @param reversedUrl the reversed URL; must contain at least {@code <reversed host>:<protocol>}
 * @return the URL in {@code protocol://host[:port][/path]} form
 */
public static String unreverseUrl(String reversedUrl) {
    final StringBuilder url = new StringBuilder(reversedUrl.length() + 2);

    final int slashAt = reversedUrl.indexOf('/');
    final int authorityEnd = (slashAt == -1) ? reversedUrl.length() : slashAt;

    // {<reversed host>, <protocol>, <port>} -- the port is optional
    final String[] parts = StringUtils.splitPreserveAllTokens(reversedUrl.substring(0, authorityEnd), ':');

    url.append(parts[1]).append("://");
    reverseAppendSplits(parts[0], url); // host, un-reversed into the buffer
    if (parts.length == 3) {
        url.append(':').append(parts[2]);
    }
    return url.append(reversedUrl.substring(authorityEnd)).toString();
}

From source file:ubic.basecode.bio.geneset.GeneAnnotations.java

/**
 * @param bis/*from   w ww .  jav a  2 s.  c  o  m*/
 * @param activeGenes
 * @throws IOException
 */
protected void readAgilent(InputStream bis, Set<String> activeGenes) throws IOException {
    if (bis == null) {
        throw new IOException("Inputstream was null");
    }
    BufferedReader dis = new BufferedReader(new InputStreamReader(bis));
    Collection<String> probeIds = new ArrayList<String>();
    String classIds = null;

    String header = dis.readLine();
    int numFields = getAgilentNumFields(header);
    int probeIndex = getAgilentProbeIndex(header);
    int goIndex = getAgilentGoIndex(header);
    int geneNameIndex = getAgilentGeneNameIndex(header);
    int geneSymbolIndex = getAgilentGeneSymbolIndex(header);

    tick();
    assert (numFields > probeIndex + 1 && numFields > geneSymbolIndex + 1);
    Pattern pat = Pattern.compile("[0-9]+");
    // loop through rows. Makes hash map of probes to go, and map of go to
    // probes.
    int n = 0;
    String line = "";
    while ((line = dis.readLine()) != null) {

        if (Thread.currentThread().isInterrupted()) {
            dis.close();
            throw new CancellationException();
        }

        String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
        if (fields.length < probeIndex + 1 || fields.length < geneSymbolIndex + 1) {
            continue; // skip lines that don't meet criteria.
        }

        String probe = fields[probeIndex];
        String gene = fields[geneSymbolIndex];

        if (activeGenes != null && !activeGenes.contains(gene)) {
            continue;
        }

        storeProbeAndGene(probeIds, probe, gene);

        /* read gene description */

        String description = fields[geneNameIndex].intern();
        if (!description.startsWith("GO:")) {
            probeToDescription.put(probe.intern(), description.intern());
        } else {
            probeToDescription.put(probe.intern(), NO_DESCRIPTION);
        }

        if (fields.length < goIndex + 1) {
            continue;
        }

        classIds = fields[goIndex];

        if (StringUtils.isNotBlank(classIds)) {
            String[] goinfo = classIds.split("\\|");
            for (String element : goinfo) {
                String goi = element.intern();
                parseGoTerm(probe, pat, goi);
            }
        }

        if (messenger != null && n % 500 == 0) {
            messenger.showStatus("Read " + n + " probes");
            try {
                Thread.sleep(10);
            } catch (InterruptedException e) {
                dis.close();
                throw new RuntimeException("Interrupted");
            }
        }
        n++;

    }

    /* Fill in the genegroupreader and the classmap */
    dis.close();
    tick();
    resetSelectedProbes();

    if (probeToGeneName.size() == 0 || geneSetToProbeMap.size() == 0) {
        throw new IllegalArgumentException(
                "The gene annotations had invalid information. Please check the format.");
    }

}

From source file:ubic.gemma.analysis.service.ArrayDesignAnnotationServiceImpl.java

/**
 * @param arrayDesign/*  w  w w. j  av  a2 s.  c  o  m*/
 * @return Map of composite sequence ids to an array of delimited strings: [probe name,genes symbol, gene Name,
 *         gemma gene id, ncbi id] for a given probe id. format of string is geneSymbol then geneNames same as found
 *         in annotation file
 */
public static Map<Long, String[]> readAnnotationFileAsString(ArrayDesign arrayDesign) {
    Map<Long, String[]> results = new HashMap<Long, String[]>();
    File f = new File(ANNOT_DATA_DIR + mungeFileName(arrayDesign.getShortName()) + STANDARD_FILE_SUFFIX
            + ANNOTATION_FILE_SUFFIX);
    if (!f.canRead()) {
        log.info("Gene annotations are not available from " + f);
        return results;
    }

    Map<String, Long> probeNameToId = new HashMap<String, Long>();

    int FIELDS_PER_GENE = 5; // used to be 3, now is 5;

    for (CompositeSequence cs : arrayDesign.getCompositeSequences()) {
        results.put(cs.getId(), new String[FIELDS_PER_GENE]);
        if (probeNameToId.containsKey(cs.getName())) {
            log.warn("Duplicate probe name: " + cs.getName());
        }
        probeNameToId.put(cs.getName(), cs.getId());
    }

    try {
        log.info("Reading annotations from: " + f);
        InputStream is = FileTools.getInputStreamFromPlainOrCompressedFile(f.getAbsolutePath());
        BufferedReader br = new BufferedReader(new InputStreamReader(is));
        String line = null;

        while ((line = br.readLine()) != null) {
            if (StringUtils.isBlank(line) || line.startsWith(COMMENT_CHARACTER)) {
                continue;
            }
            String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

            if (fields.length < 3)
                continue; // means there are no gene annotations.

            String probeName = fields[0];

            if (!probeNameToId.containsKey(probeName))
                continue;
            Long probeId = probeNameToId.get(probeName);

            results.get(probeId)[0] = probeName; // Probe Name (redundant!)
            results.get(probeId)[1] = fields[1]; // Gene Symbol
            results.get(probeId)[2] = fields[2]; // Gene Name

            // fields[3] is the GO annotations, we skip that.

            if (fields.length > 4) {
                results.get(probeId)[3] = fields[4]; // Gemma Id
            }

            if (fields.length > 5) {
                results.get(probeId)[4] = fields[5]; // NCBI id.
            }

        }

        is.close();

        return results;
    } catch (FileNotFoundException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:ubic.gemma.analysis.service.ArrayDesignAnnotationServiceImpl.java

/**
 * Parses a tab-delimited annotation stream into genes per probe.
 * <p>
 * Blank lines, lines starting with {@code COMMENT_CHARACTER}, lines with fewer than three fields and
 * lines whose probe name is not a key of {@code probeNameToId} are ignored. Fields 1 and 2 hold
 * '|'-separated gene symbols and gene names; a row where the two counts differ is skipped with a
 * warning. When present, fields 4 and 5 hold '|'-separated Gemma gene ids and NCBI ids (field 3, the
 * GO annotations, is not read). Inside one '|' group the symbols are further split on ',' and the
 * names on '$'.
 *
 * @param results map from probe id to the collection that receives the parsed genes.
 *        NOTE(review): assumes an entry already exists for every id in {@code probeNameToId} - confirm
 *        against the caller
 * @param is stream to read; it is not closed by this method
 * @param probeNameToId probe name to probe id
 * @return the {@code results} map that was passed in
 */
private static Map<Long, Collection<Gene>> parseAnnotationFile(Map<Long, Collection<Gene>> results,
        InputStream is, Map<String, Long> probeNameToId) {
    try {
        BufferedReader reader = new BufferedReader(new InputStreamReader(is));

        for (String row = reader.readLine(); row != null; row = reader.readLine()) {
            if (StringUtils.isBlank(row) || row.startsWith(COMMENT_CHARACTER)) {
                continue;
            }

            String[] columns = StringUtils.splitPreserveAllTokens(row, '\t');
            if (columns.length < 3) {
                continue; // no gene annotations on this row
            }

            String probeName = columns[0];
            if (!probeNameToId.containsKey(probeName)) {
                continue;
            }
            Long probeId = probeNameToId.get(probeName);

            List<String> symbolGroups = Arrays.asList(StringUtils.splitPreserveAllTokens(columns[1], '|'));
            List<String> nameGroups = Arrays.asList(StringUtils.splitPreserveAllTokens(columns[2], '|'));

            if (symbolGroups.size() != nameGroups.size()) {
                log.warn("Annotation file format error: Unequal number of gene symbols and names for probe="
                        + probeName + ", skipping row");
                continue;
            }

            // new-style files carry ids after the GO column (columns[3])
            List<String> gemmaGeneIds = columns.length > 4
                    ? Arrays.asList(StringUtils.splitPreserveAllTokens(columns[4], '|'))
                    : null;
            List<String> ncbiIds = columns.length > 5
                    ? Arrays.asList(StringUtils.splitPreserveAllTokens(columns[5], '|'))
                    : null;

            for (int group = 0; group < symbolGroups.size(); group++) {
                String symbolGroup = symbolGroups.get(group);
                String nameGroup = nameGroups.get(group);

                if (StringUtils.isBlank(symbolGroup)) {
                    continue;
                }

                String[] symbols = StringUtils.split(symbolGroup, ',');
                String[] names = StringUtils.split(nameGroup, '$');

                for (int k = 0; k < symbols.length; k++) {
                    Gene gene = Gene.Factory.newInstance();
                    gene.setOfficialSymbol(symbols[k]);

                    // NOTE(review): the id lists are indexed by position within the ','-group, not
                    // by '|'-group - confirm this matches the file format
                    try {
                        if (gemmaGeneIds != null) {
                            gene.setId(Long.parseLong(gemmaGeneIds.get(k)));
                        }
                        if (ncbiIds != null) {
                            gene.setNcbiGeneId(Integer.parseInt(ncbiIds.get(k)));
                        }
                    } catch (NumberFormatException ignored) {
                        // best effort: the gene is kept without the extra ids
                    }

                    if (k < names.length) {
                        gene.setName(names[k]);
                    }

                    results.get(probeId).add(gene);
                }
            }
        }

        return results;
    } catch (IOException e) {
        // FileNotFoundException is an IOException, so one handler wraps both
        throw new RuntimeException(e);
    }
}

From source file:ubic.gemma.loader.association.NCBIGene2GOAssociationParser.java

/**
 * Converts one tab-delimited line into a {@link Gene2GOAssociation} and puts it on {@code queue}.
 * <p>
 * Note that "-" means a missing value, which in practice only occurs in the "qualifier" and "pubmed" columns.
 *
 * @param line the line to parse
 * @return the association that was queued, or null when the line starts with {@code COMMENT_INDICATOR},
 *         has fewer than 8 fields, refers to a taxon that is not a key of {@code taxaNcbiIds}, or carries
 *         an evidence code contained in {@code ignoredEvidenceCodes}
 * @throws RuntimeException if the taxon id is not an integer, or if the thread is interrupted while
 *         queueing (the interrupt status is restored before throwing)
 */
public Gene2GOAssociation mapFromGene2GO(String line) {

    // check for comments first so they are never split
    if (line.startsWith(COMMENT_INDICATOR))
        return null;

    String[] values = StringUtils.splitPreserveAllTokens(line, "\t");

    if (values.length < 8)
        return null;

    Integer taxonId = null;
    try {
        taxonId = Integer.parseInt(values[TAX_ID]);
    } catch (NumberFormatException e) {
        throw new RuntimeException(e);
    }

    if (!taxaNcbiIds.containsKey(taxonId)) {
        return null;
    }

    Gene2GOAssociation g2GOAss = Gene2GOAssociation.Factory.newInstance();

    Gene gene = Gene.Factory.newInstance();
    gene.setNcbiGeneId(Integer.parseInt(values[GENE_ID]));

    gene.setTaxon(taxaNcbiIds.get(taxonId));
    VocabCharacteristic oe = VocabCharacteristic.Factory.newInstance();
    String value = values[GO_ID].replace(":", "_");
    oe.setValueUri(GeneOntologyService.BASE_GO_URI + value);
    oe.setValue(value);

    // g2GOAss.setSource( ncbiGeneDb );

    g2GOAss.setGene(gene);
    g2GOAss.setOntologyEntry(oe);

    String evidenceCode = values[EVIDENCE_CODE];

    if (!(StringUtils.isBlank(evidenceCode) || evidenceCode.equals("-"))) {

        if (ignoredEvidenceCodes.contains(evidenceCode)) {
            return null;
        }

        g2GOAss.setEvidenceCode(GOEvidenceCode.fromString(evidenceCode));
    }

    try {
        queue.put(g2GOAss);
    } catch (InterruptedException e) {
        // restore the flag: wrapping the exception alone would hide the interruption from callers
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
    }

    return g2GOAss;
}

From source file:ubic.gemma.loader.expression.arrayDesign.ArrayDesignParser.java

/**
 * Builds an {@link ArrayDesign} from one tab-delimited line.
 * <p>
 * Columns read: 0 = platform name, 1 = manufacturer name, 3 = taxon common name, 4 = advertised number of
 * design elements, 5 = description.
 * <p>
 * NOTE(review): the taxon used to be read from column 4, which is also parsed as the integer element
 * count, so it could never hold a taxon name. Column 3 is assumed to be the taxon (with column 2
 * presumably the short name) - confirm against the input file format.
 *
 * @param line the line to parse; must have at least six tab-separated columns
 * @return a new ArrayDesign populated from the line
 * @throws NumberFormatException if column 4 is not an integer
 */
@Override
public ArrayDesign parseOneLine(String line) {
    ArrayDesign ad = ArrayDesign.Factory.newInstance();
    String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
    ad.setName(fields[0]);
    ad.setDescription(fields[5]);

    Taxon t = Taxon.Factory.newInstance();
    t.setCommonName(fields[3].toLowerCase());
    t.setIsSpecies(true); // assumption.
    t.setIsGenesUsable(true); // assumption
    ad.setPrimaryTaxon(t);

    Contact manufacturer = Contact.Factory.newInstance();
    manufacturer.setName(fields[1]);
    ad.setDesignProvider(manufacturer);

    ad.setAdvertisedNumberOfDesignElements(Integer.parseInt(fields[4]));
    return ad;
}

From source file:ubic.gemma.loader.expression.arrayDesign.ArrayDesignProbeMapperServiceImpl.java

/**
 * Associates the probes of a platform with gene products, using a tab-delimited file with exactly three
 * columns per line: probe id, sequence name, and one or more '|'-separated gene symbols (or NCBI gene ids
 * when {@code ncbiIds} is true). Blank lines and lines starting with '#' are ignored.
 * <p>
 * Existing gene product associations of the platform are deleted before the file is processed. The file
 * reader is closed on every exit path, including failures part-way through the file.
 *
 * @param arrayDesign platform to process; must use a probe-based technology
 * @param taxon taxon used to look up gene symbols; may be null only if {@code ncbiIds} is true
 * @param source the annotation file
 * @param sourceDB database recorded as the source of each created association
 * @param ncbiIds true if the third column holds NCBI gene ids rather than official symbols
 * @throws IOException if the file cannot be read or a line does not have three columns
 * @throws IllegalArgumentException if the taxon is missing or the platform's technology type is NONE
 */
@Override
public void processArrayDesign(ArrayDesign arrayDesign, Taxon taxon, File source, ExternalDatabase sourceDB,
        boolean ncbiIds) throws IOException {

    if (taxon == null && !ncbiIds) {
        throw new IllegalArgumentException("You must provide a taxon unless passing ncbiIds = true");
    }

    if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
        throw new IllegalArgumentException(
                "Do not use this service to process platforms that do not use an probe-based technology.");
    }

    BufferedReader b = new BufferedReader(new FileReader(source));
    try {
        String line = null;
        int numSkipped = 0;

        log.info("Removing any old associations");
        arrayDesignService.deleteGeneProductAssociations(arrayDesign);

        while ((line = b.readLine()) != null) {

            if (StringUtils.isBlank(line)) {
                continue;
            }
            if (line.startsWith("#")) {
                continue;
            }

            String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
            if (fields.length != 3) {
                throw new IOException("Illegal format, expected three columns, got " + fields.length);
            }

            String probeId = fields[0];
            String seqName = fields[1];

            /*
             * FIXME. We have to allow NCBI gene ids here.
             */
            String geneSymbol = fields[2];

            if (StringUtils.isBlank(geneSymbol)) {
                numSkipped++;
                continue;
            }

            CompositeSequence c = compositeSequenceService.findByName(arrayDesign, probeId);

            if (c == null) {
                if (log.isDebugEnabled())
                    log.debug("No probe found for '" + probeId + "' on " + arrayDesign + ", skipping");
                numSkipped++;
                continue;
            }

            // a probe can have more than one gene associated with it if so they are piped |
            Collection<Gene> geneListProbe = new HashSet<Gene>();

            // indicate multiple genes
            Gene geneDetails = null;

            StringTokenizer st = new StringTokenizer(geneSymbol, "|");
            while (st.hasMoreTokens()) {
                String geneToken = st.nextToken().trim();
                if (ncbiIds) {
                    geneDetails = geneService.findByNCBIId(Integer.parseInt(geneToken));
                } else {
                    geneDetails = geneService.findByOfficialSymbol(geneToken, taxon);
                }
                if (geneDetails != null) {
                    geneListProbe.add(geneDetails);
                }
            }

            if (geneListProbe.size() == 0) {
                log.warn("No gene(s) found for '" + geneSymbol + "' in " + taxon + ", skipping");
                numSkipped++;
                continue;
            } else if (geneListProbe.size() > 1) {
                // this is a common situation, when the geneSymbol actually has |-separated genes, so no need to make a
                // lot of fuss.
                log.debug("More than one gene found for '" + geneSymbol + "' in " + taxon);
            }

            BioSequence bs = c.getBiologicalCharacteristic();

            if (bs != null) {
                if (StringUtils.isNotBlank(seqName)) {
                    bs = bioSequenceService.thaw(bs);
                    if (!bs.getName().equals(seqName)) {
                        log.warn("Sequence name '" + seqName + "' given for " + probeId
                                + " does not match existing entry " + bs.getName() + ", skipping");
                        numSkipped++;
                        continue;
                    }

                }
                // otherwise we assume everything is okay.
            } else {
                // create one based on the text provided.
                if (StringUtils.isBlank(seqName)) {
                    log.warn("You must provide sequence names for probes which are not already mapped. probeName="
                            + probeId + " had no sequence associated and no name provided; skipping");
                    numSkipped++;
                    continue;
                }

                bs = BioSequence.Factory.newInstance();
                bs.setName(seqName);
                bs.setTaxon(taxon);
                bs.setDescription("Imported from annotation file");

                // Placeholder.
                bs.setType(SequenceType.OTHER);

                bs = bioSequenceService.create(bs);

                c.setBiologicalCharacteristic(bs);

                // fixme: possibly move outside the loop if that's faster.
                compositeSequenceService.update(c);
            }

            assert bs.getId() != null;
            for (Gene gene : geneListProbe) {
                gene = geneService.thaw(gene);
                if (gene.getProducts().size() == 0) {
                    log.warn(
                            "There are no gene products for " + gene + ", it cannot be mapped to probes. Skipping");
                    numSkipped++;
                    continue;
                }
                for (GeneProduct gp : gene.getProducts()) {
                    AnnotationAssociation association = AnnotationAssociation.Factory.newInstance();
                    association.setBioSequence(bs);
                    association.setGeneProduct(gp);
                    association.setSource(sourceDB);
                    annotationAssociationService.create(association);
                }

            }

        }

        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());

        this.deleteOldFiles(arrayDesign);

        log.info("Completed association processing for " + arrayDesign + ", " + numSkipped + " were skipped");
    } finally {
        // previously the reader was only closed when the whole method succeeded
        b.close();
    }
}

From source file:ubic.gemma.loader.expression.arrayDesign.CompositeSequenceParser.java

/**
 * Converts one tab-delimited line of the form {@code probe id, GenBank accession, description} into a
 * {@link CompositeSequence} with an attached {@link BioSequence}.
 *
 * @param line the line to parse
 * @return the composite sequence, or null if the line does not have exactly three fields
 */
@Override
public CompositeSequence parseOneLine(String line) {
    final String[] columns = StringUtils.splitPreserveAllTokens(line, '\t');
    if (columns.length != 3) {
        return null;
    }

    final String probeName = columns[0];
    final String accession = columns[1];
    final String probeDescription = columns[2];

    final CompositeSequence probe = CompositeSequence.Factory.newInstance();
    probe.setName(probeName);
    probe.setDescription(probeDescription);

    final DatabaseEntry genbankEntry = ExternalDatabaseUtils.getGenbankAccession(accession);

    // Name and description are provisional; they are typically replaced later.
    final BioSequence sequence = BioSequence.Factory.newInstance();
    sequence.setName(accession);
    sequence.setTaxon(taxon);
    sequence.setDescription(probeDescription + " (From platform source)");
    sequence.setSequenceDatabaseEntry(genbankEntry);

    probe.setBiologicalCharacteristic(sequence);
    return probe;
}

From source file:ubic.gemma.loader.expression.geo.GeoFamilyParser.java

/**
 * If a line does not have the same number of fields as the column headings, it is skipped.
 * /*from   w  w w .  j  a  v  a 2  s.  c  om*/
 * @param line
 */
private void parsePlatformLine(String line) {

    if (!haveReadPlatformHeader) {
        haveReadPlatformHeader = true;
        return;
    }
    GeoPlatform currentPlatform = results.getPlatformMap().get(currentPlatformAccession);
    assert currentPlatform != null;

    /*
     * Skip platform information when it is not going to be usable, unless we are ONLY parsing a platform.
     */
    // Actually this isn't as important, since we filter out bad elements.
    // if ( !processPlatformsOnly && !currentPlatform.useDataFromGeo() ) {
    // return;
    // }

    String[] tokens = StringUtils.splitPreserveAllTokens(line, FIELD_DELIM);

    List<String> columnNames = currentPlatform.getColumnNames();
    int numColumns = columnNames.size();

    if (numColumns != tokens.length && numWarnings < MAX_WARNINGS) {
        log.warn("Wrong number of tokens in line (" + tokens.length + ", expected " + numColumns
                + "), line was '" + line + "'; Possible corrupt file or invalid format?");
        numWarnings++;
        if (numWarnings == MAX_WARNINGS) {
            log.warn("Further warnings suppressed");
        }

        return;
    }

    GeoPlatform platform = currentPlatform;

    for (int i = 0; i < tokens.length; i++) {
        String token = tokens[i];
        String columnName = columnNames.get(i);
        platform.addToColumnData(columnName, token);
    }
    platformLines++;
}

From source file:ubic.gemma.loader.expression.geo.GeoFamilyParser.java

/**
 * The data for one sample is all the values for each quantitation type.
 * <p>
 * Important implementation note: In the sample table sections of GSEXXX_family files, the first column is always
 * ID_REF, according to the kind folks at NCBI. If this changes, this code will BREAK.
 * <p>
 * Similarly, the column names between the different samples are not necessarily the same, but we trust that they
 * all refer to the same quantitation types in the same order, for a given platform. That is, the nth column for
 * this sample 'means' the same thing as the nth column for another sample in this series (on the same platform). If
 * that isn't true, this will be BROKEN. However, we do try to sort it out if we can.
 * <p>
 * Blank lines are ignored and the first non-blank line is consumed as the header. Lines with at most one
 * token are always ignored; an error is logged for them only until {@code MAX_WARNINGS} is reached.
 *
 * @param line the line to parse
 * @see #initializeQuantitationTypes()
 */
private void parseSampleDataLine(String line) {

    if (StringUtils.isBlank(line))
        return;

    if (!haveReadSampleDataHeader) {
        haveReadSampleDataHeader = true;
        previousNumTokens = null;
        initializeQuantitationTypes();
        return;
    }

    GeoSample sample = results.getSampleMap().get(currentSampleAccession);

    /*
     * skip this step if it's not a supported platform type (RNA-seq, exon arrays: we put the data in later)
     */
    if (!sample.hasUsableData()) {
        return;
    }

    String[] tokens = StringUtils.splitPreserveAllTokens(line, FIELD_DELIM);

    assert tokens != null;

    /*
     * This can happen in some files that are mildly corrupted. -- we have to ignore it.
     */
    if (tokens.length <= 1) {
        // Only the logging is rate-limited. The line is ignored regardless, so it no longer counts
        // as a data line after the warning limit has been reached.
        if (numWarnings < MAX_WARNINGS) {
            log.error("Parse error, sample data line has too few elements (" + tokens.length + "), line was '"
                    + line + "'");
            numWarnings++;
            if (numWarnings == MAX_WARNINGS) {
                log.warn("Further warnings suppressed");
            }
        }
        return;
    }

    if (previousNumTokens != null && tokens.length != previousNumTokens) {
        log.warn("Last line had " + (previousNumTokens - 1) + " quantitation types, this one has "
                + (tokens.length - 1));
    }

    previousNumTokens = tokens.length;

    if (results.getSeriesMap().get(currentSeriesAccession) == null) {
        return; // this happens if we are parsing a GPL file.
    }

    GeoPlatform platformForSample = sample.getPlatforms().iterator().next(); // slow

    GeoValues values = results.getSeriesMap().get(currentSeriesAccession).getValues();

    String designElement = tokens[0]; // ID_REF. For bug 1709, adding toLower() will fix this.
    Map<Integer, Integer> map = quantitationTypeTargetColumn.get(platformForSample);

    for (int i = 1; i < tokens.length; i++) {
        String value = tokens[i];
        int qtIndex = i - 1;

        /*
         * This map tells us which column this quantitation type is SUPPOSED to go in.
         */

        if (map.containsKey(qtIndex))
            qtIndex = map.get(qtIndex);
        if (!isWantedQuantitationType(qtIndex)) {
            continue;
        }

        if (log.isTraceEnabled()) {
            log.trace("Adding: " + value + " to  quantitationType " + (qtIndex) + " for " + designElement);
        }
        values.addValue(sample, qtIndex, designElement, value);
        processedDesignElements.add(designElement);
    }

    sampleDataLines++;
}