Example usage for org.apache.commons.lang StringUtils splitPreserveAllTokens

List of usage examples for org.apache.commons.lang StringUtils splitPreserveAllTokens

Introduction

This page collects usage examples for org.apache.commons.lang StringUtils splitPreserveAllTokens.

Prototype

public static String[] splitPreserveAllTokens(String str, String separatorChars) 

Source Link

Document

Splits the provided text into an array, separators specified, preserving all tokens, including empty tokens created by adjacent separators.

Usage

From source file:tv.icntv.grade.film.utils.TableUtil.java

/**
 * Rebuilds a URL from its reversed form.
 * <p>
 * Everything before the first '/' is split on ':' (empty tokens preserved). The code uses the tokens as
 * {@code {<reversed host>, <protocol>[, <port>]}}: token 1 is written first followed by "://", token 0 is
 * handed to {@code reverseAppendSplits}, and token 2 -- only when exactly three tokens are present -- is
 * appended after a ':'. The remainder of the input, starting at the first '/', is appended unchanged.
 *
 * @param reversedUrl the reversed URL; must contain at least two ':'-separated tokens before the path
 * @return the reassembled URL
 */
public static String unreverseUrl(String reversedUrl) {
    int slash = reversedUrl.indexOf('/');
    int authorityEnd = (slash == -1) ? reversedUrl.length() : slash;

    // {<reversed host>, <protocol>, <port>}
    String[] parts = StringUtils.splitPreserveAllTokens(reversedUrl.substring(0, authorityEnd), ':');

    StringBuilder url = new StringBuilder(reversedUrl.length() + 2);
    url.append(parts[1]).append("://"); // protocol
    reverseAppendSplits(parts[0], url); // parts[0] is the reversed host
    if (parts.length == 3) {
        // a port was given
        url.append(':').append(parts[2]);
    }
    url.append(reversedUrl.substring(authorityEnd));
    return url.toString();
}

From source file:ubic.basecode.bio.geneset.GeneAnnotations.java

/**
 * @param bis/*from   w ww .  jav a  2 s.  c  o  m*/
 * @param activeGenes
 * @throws IOException
 */
protected void readAgilent(InputStream bis, Set<String> activeGenes) throws IOException {
    if (bis == null) {
        throw new IOException("Inputstream was null");
    }
    BufferedReader dis = new BufferedReader(new InputStreamReader(bis));
    Collection<String> probeIds = new ArrayList<String>();
    String classIds = null;

    String header = dis.readLine();
    int numFields = getAgilentNumFields(header);
    int probeIndex = getAgilentProbeIndex(header);
    int goIndex = getAgilentGoIndex(header);
    int geneNameIndex = getAgilentGeneNameIndex(header);
    int geneSymbolIndex = getAgilentGeneSymbolIndex(header);

    tick();
    assert (numFields > probeIndex + 1 && numFields > geneSymbolIndex + 1);
    Pattern pat = Pattern.compile("[0-9]+");
    // loop through rows. Makes hash map of probes to go, and map of go to
    // probes.
    int n = 0;
    String line = "";
    while ((line = dis.readLine()) != null) {

        if (Thread.currentThread().isInterrupted()) {
            dis.close();
            throw new CancellationException();
        }

        String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
        if (fields.length < probeIndex + 1 || fields.length < geneSymbolIndex + 1) {
            continue; // skip lines that don't meet criteria.
        }

        String probe = fields[probeIndex];
        String gene = fields[geneSymbolIndex];

        if (activeGenes != null && !activeGenes.contains(gene)) {
            continue;
        }

        storeProbeAndGene(probeIds, probe, gene);

        /* read gene description */

        String description = fields[geneNameIndex].intern();
        if (!description.startsWith("GO:")) {
            probeToDescription.put(probe.intern(), description.intern());
        } else {
            probeToDescription.put(probe.intern(), NO_DESCRIPTION);
        }

        if (fields.length < goIndex + 1) {
            continue;
        }

        classIds = fields[goIndex];

        if (StringUtils.isNotBlank(classIds)) {
            String[] goinfo = classIds.split("\\|");
            for (String element : goinfo) {
                String goi = element.intern();
                parseGoTerm(probe, pat, goi);
            }
        }

        if (messenger != null && n % 500 == 0) {
            messenger.showStatus("Read " + n + " probes");
            try {
                Thread.sleep(10);
            } catch (InterruptedException e) {
                dis.close();
                throw new RuntimeException("Interrupted");
            }
        }
        n++;

    }

    /* Fill in the genegroupreader and the classmap */
    dis.close();
    tick();
    resetSelectedProbes();

    if (probeToGeneName.size() == 0 || geneSetToProbeMap.size() == 0) {
        throw new IllegalArgumentException(
                "The gene annotations had invalid information. Please check the format.");
    }

}

From source file:ubic.gemma.analysis.service.ArrayDesignAnnotationServiceImpl.java

/**
 * @param arrayDesign/*  w  w w. j  av  a2 s.  c  o  m*/
 * @return Map of composite sequence ids to an array of delimited strings: [probe name,genes symbol, gene Name,
 *         gemma gene id, ncbi id] for a given probe id. format of string is geneSymbol then geneNames same as found
 *         in annotation file
 */
public static Map<Long, String[]> readAnnotationFileAsString(ArrayDesign arrayDesign) {
    Map<Long, String[]> results = new HashMap<Long, String[]>();
    File f = new File(ANNOT_DATA_DIR + mungeFileName(arrayDesign.getShortName()) + STANDARD_FILE_SUFFIX
            + ANNOTATION_FILE_SUFFIX);
    if (!f.canRead()) {
        log.info("Gene annotations are not available from " + f);
        return results;
    }

    Map<String, Long> probeNameToId = new HashMap<String, Long>();

    int FIELDS_PER_GENE = 5; // used to be 3, now is 5;

    for (CompositeSequence cs : arrayDesign.getCompositeSequences()) {
        results.put(cs.getId(), new String[FIELDS_PER_GENE]);
        if (probeNameToId.containsKey(cs.getName())) {
            log.warn("Duplicate probe name: " + cs.getName());
        }
        probeNameToId.put(cs.getName(), cs.getId());
    }

    try {
        log.info("Reading annotations from: " + f);
        InputStream is = FileTools.getInputStreamFromPlainOrCompressedFile(f.getAbsolutePath());
        BufferedReader br = new BufferedReader(new InputStreamReader(is));
        String line = null;

        while ((line = br.readLine()) != null) {
            if (StringUtils.isBlank(line) || line.startsWith(COMMENT_CHARACTER)) {
                continue;
            }
            String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

            if (fields.length < 3)
                continue; // means there are no gene annotations.

            String probeName = fields[0];

            if (!probeNameToId.containsKey(probeName))
                continue;
            Long probeId = probeNameToId.get(probeName);

            results.get(probeId)[0] = probeName; // Probe Name (redundant!)
            results.get(probeId)[1] = fields[1]; // Gene Symbol
            results.get(probeId)[2] = fields[2]; // Gene Name

            // fields[3] is the GO annotations, we skip that.

            if (fields.length > 4) {
                results.get(probeId)[3] = fields[4]; // Gemma Id
            }

            if (fields.length > 5) {
                results.get(probeId)[4] = fields[5]; // NCBI id.
            }

        }

        is.close();

        return results;
    } catch (FileNotFoundException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:ubic.gemma.analysis.service.ArrayDesignAnnotationServiceImpl.java

/**
 * Parses a tab-delimited annotation stream and adds the genes found for each known probe to {@code results}.
 * <p>
 * Columns used: 0 = probe name, 1 = gene symbols, 2 = gene names, 4 = Gemma gene ids (optional), 5 = NCBI gene
 * ids (optional); column 3 (GO annotations) is ignored. Columns 1, 2, 4 and 5 are split on '|'. Each symbol
 * group is further split on ',' and each name group on '$'. Blank lines, comment lines, rows with fewer than
 * three columns and rows for unknown probes are skipped. Gene ids are filled in on a best-effort basis: a
 * missing or non-numeric id leaves the corresponding value unset.
 * <p>
 * The stream is not closed here.
 *
 * @param results map of probe id to gene collection; must already contain a collection for every id in
 *        {@code probeNameToId}
 * @param is stream to read the annotations from
 * @param probeNameToId map of probe name to probe id
 * @return {@code results}, the same instance that was passed in
 * @throws RuntimeException wrapping any {@link IOException} raised while reading
 */
private static Map<Long, Collection<Gene>> parseAnnotationFile(Map<Long, Collection<Gene>> results,
        InputStream is, Map<String, Long> probeNameToId) {
    try {

        BufferedReader br = new BufferedReader(new InputStreamReader(is));
        String line;

        while ((line = br.readLine()) != null) {
            if (StringUtils.isBlank(line) || line.startsWith(COMMENT_CHARACTER)) {
                continue;
            }
            String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

            if (fields.length < 3) {
                continue; // means there are no gene annotations.
            }

            String probeName = fields[0];

            if (!probeNameToId.containsKey(probeName)) {
                continue;
            }
            Long probeId = probeNameToId.get(probeName);

            List<String> geneSymbols = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[1], '|'));
            List<String> geneNames = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[2], '|'));

            if (geneSymbols.size() != geneNames.size()) {
                log.warn("Annotation file format error: Unequal number of gene symbols and names for probe="
                        + probeName + ", skipping row");
                continue;
            }

            List<String> gemmaGeneIds = null;
            List<String> ncbiIds = null;

            if (fields.length > 4) { // new style. fields[3] is the GO annotations.
                gemmaGeneIds = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[4], '|'));
            }
            if (fields.length > 5) {
                ncbiIds = Arrays.asList(StringUtils.splitPreserveAllTokens(fields[5], '|'));
            }

            for (int i = 0; i < geneSymbols.size(); i++) {

                String symbol = geneSymbols.get(i);
                String name = geneNames.get(i);

                if (StringUtils.isBlank(symbol)) {
                    continue;
                }

                String[] symbolsb = StringUtils.split(symbol, ',');
                String[] namesb = StringUtils.split(name, '$');

                for (int j = 0; j < symbolsb.length; j++) {

                    Gene g = Gene.Factory.newInstance();
                    g.setOfficialSymbol(symbolsb[j]);

                    try {
                        // The id lists may be shorter than the symbol list; without the bounds checks a short
                        // list aborted the whole parse with an IndexOutOfBoundsException, although ids are
                        // optional extra information.
                        if (gemmaGeneIds != null && j < gemmaGeneIds.size()) {
                            g.setId(Long.parseLong(gemmaGeneIds.get(j)));
                        }

                        if (ncbiIds != null && j < ncbiIds.size()) {
                            g.setNcbiGeneId(Integer.parseInt(ncbiIds.get(j)));
                        }
                    } catch (NumberFormatException ignored) {
                        // oh well, couldn't populate extra info.
                    }

                    if (namesb.length >= j + 1) {
                        g.setName(namesb[j]);
                    }

                    results.get(probeId).add(g);
                }
            }
        }

        return results;
    } catch (IOException e) {
        // Also covers FileNotFoundException, which was previously caught separately and wrapped identically.
        throw new RuntimeException(e);
    }
}

From source file:ubic.gemma.loader.association.NCBIGene2GOAssociationParser.java

/**
 * Converts one tab-delimited gene2go line into an association and places it on the queue.
 * <p>
 * Note that "-" means a missing value, which in practice only occurs in the "qualifier" and "pubmed" columns.
 * <p>
 * Returns null, without queueing anything, for comment lines, lines with fewer than 8 columns, lines whose
 * taxon id is not a key of {@code taxaNcbiIds}, and lines whose evidence code is in
 * {@code ignoredEvidenceCodes}.
 *
 * @param line a single line of the gene2go file
 * @return the association that was queued, or null if the line was skipped
 * @throws RuntimeException if the taxon id is not an integer, or if the thread is interrupted while queueing
 *         (the interrupt flag is restored in that case)
 * @throws NumberFormatException if the gene id is not an integer
 */
public Gene2GOAssociation mapFromGene2GO(String line) {

    // Check for comments first so comment lines are not split needlessly.
    if (line.startsWith(COMMENT_INDICATOR)) {
        return null;
    }

    String[] values = StringUtils.splitPreserveAllTokens(line, "\t");

    if (values.length < 8) {
        return null;
    }

    Integer taxonId;
    try {
        taxonId = Integer.parseInt(values[TAX_ID]);
    } catch (NumberFormatException e) {
        throw new RuntimeException(e);
    }

    if (!taxaNcbiIds.containsKey(taxonId)) {
        return null;
    }

    Gene2GOAssociation g2GOAss = Gene2GOAssociation.Factory.newInstance();

    Gene gene = Gene.Factory.newInstance();
    gene.setNcbiGeneId(Integer.parseInt(values[GENE_ID]));

    gene.setTaxon(taxaNcbiIds.get(taxonId));
    VocabCharacteristic oe = VocabCharacteristic.Factory.newInstance();
    // The GO id is used with '_' in place of ':' both as the value and in the URI.
    String value = values[GO_ID].replace(":", "_");
    oe.setValueUri(GeneOntologyService.BASE_GO_URI + value);
    oe.setValue(value);

    g2GOAss.setGene(gene);
    g2GOAss.setOntologyEntry(oe);

    String evidenceCode = values[EVIDENCE_CODE];

    if (!(StringUtils.isBlank(evidenceCode) || evidenceCode.equals("-"))) {

        if (ignoredEvidenceCodes.contains(evidenceCode)) {
            return null;
        }

        g2GOAss.setEvidenceCode(GOEvidenceCode.fromString(evidenceCode));
    }

    try {
        queue.put(g2GOAss);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so code further up the stack can still see the interruption.
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
    }

    return g2GOAss;
}

From source file:ubic.gemma.loader.expression.arrayDesign.ArrayDesignParser.java

/**
 * Builds an {@link ArrayDesign} from one tab-delimited line.
 * <p>
 * Columns read: 0 = name, 1 = manufacturer name, 3 = taxon common name, 4 = advertised number of design
 * elements (must be an integer), 5 = description.
 *
 * @param line tab-delimited line with at least six columns
 * @return the populated array design
 * @throws NumberFormatException if column 4 is not an integer
 */
@Override
public ArrayDesign parseOneLine(String line) {
    ArrayDesign ad = ArrayDesign.Factory.newInstance();
    String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
    ad.setName(fields[0]);
    ad.setDescription(fields[5]);

    Taxon t = Taxon.Factory.newInstance();
    // fields[4] is parsed as an integer below, so it cannot also hold the taxon name (the previous code read
    // it for both).
    // NOTE(review): the taxon is assumed to live in column 3 -- confirm against the input file format.
    t.setCommonName(fields[3].toLowerCase());
    t.setIsSpecies(true); // assumption.
    t.setIsGenesUsable(true); // assumption
    ad.setPrimaryTaxon(t);

    Contact manufacturer = Contact.Factory.newInstance();
    manufacturer.setName(fields[1]);
    ad.setDesignProvider(manufacturer);

    ad.setAdvertisedNumberOfDesignElements(Integer.parseInt(fields[4]));
    return ad;
}

From source file:ubic.gemma.loader.expression.arrayDesign.ArrayDesignProbeMapperServiceImpl.java

/**
 * Replaces the probe-to-gene associations of a platform with those listed in a three-column, tab-delimited
 * file (probe id, sequence name, gene symbols or NCBI ids separated by '|').
 * <p>
 * Existing gene product associations of the platform are deleted first. Blank lines and lines starting with
 * '#' are ignored. Rows are skipped (and counted) when the gene column is blank, the probe is not found on the
 * platform, no gene can be resolved, the given sequence name conflicts with the probe's existing sequence, or a
 * probe without a sequence has no sequence name in the file. The source file reader is closed on every exit
 * path.
 *
 * @param arrayDesign platform to process
 * @param taxon taxon used to look up gene symbols; may be null only if {@code ncbiIds} is true
 * @param source the annotation file
 * @param sourceDB database recorded as the source of each created association
 * @param ncbiIds if true, the gene column holds NCBI gene ids instead of official symbols
 * @throws IOException if the file cannot be read or a data row does not have exactly three columns
 * @throws IllegalArgumentException if {@code taxon} is null while {@code ncbiIds} is false, or if the
 *         platform's technology type is {@code NONE}
 */
@Override
public void processArrayDesign(ArrayDesign arrayDesign, Taxon taxon, File source, ExternalDatabase sourceDB,
        boolean ncbiIds) throws IOException {

    if (taxon == null && !ncbiIds) {
        throw new IllegalArgumentException("You must provide a taxon unless passing ncbiIds = true");
    }

    if (arrayDesign.getTechnologyType().equals(TechnologyType.NONE)) {
        throw new IllegalArgumentException(
                "Do not use this service to process platforms that do not use an probe-based technology.");
    }

    BufferedReader b = new BufferedReader(new FileReader(source));
    try {
        String line;
        int numSkipped = 0;

        log.info("Removing any old associations");
        arrayDesignService.deleteGeneProductAssociations(arrayDesign);

        while ((line = b.readLine()) != null) {

            if (StringUtils.isBlank(line)) {
                continue;
            }
            if (line.startsWith("#")) {
                continue;
            }

            String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');
            if (fields.length != 3) {
                throw new IOException("Illegal format, expected three columns, got " + fields.length);
            }

            String probeId = fields[0];
            String seqName = fields[1];

            /*
             * FIXME. We have to allow NCBI gene ids here.
             */
            String geneSymbol = fields[2];

            if (StringUtils.isBlank(geneSymbol)) {
                numSkipped++;
                continue;
            }

            CompositeSequence c = compositeSequenceService.findByName(arrayDesign, probeId);

            if (c == null) {
                if (log.isDebugEnabled())
                    log.debug("No probe found for '" + probeId + "' on " + arrayDesign + ", skipping");
                numSkipped++;
                continue;
            }

            // a probe can have more than one gene associated with it if so they are piped |
            Collection<Gene> geneListProbe = new HashSet<Gene>();

            StringTokenizer st = new StringTokenizer(geneSymbol, "|");
            while (st.hasMoreTokens()) {
                String geneToken = st.nextToken().trim();
                Gene geneDetails;
                if (ncbiIds) {
                    geneDetails = geneService.findByNCBIId(Integer.parseInt(geneToken));
                } else {
                    geneDetails = geneService.findByOfficialSymbol(geneToken, taxon);
                }
                if (geneDetails != null) {
                    geneListProbe.add(geneDetails);
                }
            }

            if (geneListProbe.size() == 0) {
                log.warn("No gene(s) found for '" + geneSymbol + "' in " + taxon + ", skipping");
                numSkipped++;
                continue;
            } else if (geneListProbe.size() > 1) {
                // this is a common situation, when the geneSymbol actually has |-separated genes, so no need to
                // make a lot of fuss.
                log.debug("More than one gene found for '" + geneSymbol + "' in " + taxon);
            }

            BioSequence bs = c.getBiologicalCharacteristic();

            if (bs != null) {
                if (StringUtils.isNotBlank(seqName)) {
                    bs = bioSequenceService.thaw(bs);
                    if (!bs.getName().equals(seqName)) {
                        log.warn("Sequence name '" + seqName + "' given for " + probeId
                                + " does not match existing entry " + bs.getName() + ", skipping");
                        numSkipped++;
                        continue;
                    }

                }
                // otherwise we assume everything is okay.
            } else {
                // create one based on the text provided.
                if (StringUtils.isBlank(seqName)) {
                    log.warn("You must provide sequence names for probes which are not already mapped. probeName="
                            + probeId + " had no sequence associated and no name provided; skipping");
                    numSkipped++;
                    continue;
                }

                bs = BioSequence.Factory.newInstance();
                bs.setName(seqName);
                bs.setTaxon(taxon);
                bs.setDescription("Imported from annotation file");

                // Placeholder.
                bs.setType(SequenceType.OTHER);

                bs = bioSequenceService.create(bs);

                c.setBiologicalCharacteristic(bs);

                // fixme: possibly move outside the loop if that's faster.
                compositeSequenceService.update(c);
            }

            assert bs.getId() != null;
            for (Gene gene : geneListProbe) {
                gene = geneService.thaw(gene);
                if (gene.getProducts().size() == 0) {
                    log.warn(
                            "There are no gene products for " + gene + ", it cannot be mapped to probes. Skipping");
                    numSkipped++;
                    continue;
                }
                // One association per gene product, all pointing at the probe's sequence.
                for (GeneProduct gp : gene.getProducts()) {
                    AnnotationAssociation association = AnnotationAssociation.Factory.newInstance();
                    association.setBioSequence(bs);
                    association.setGeneProduct(gp);
                    association.setSource(sourceDB);
                    annotationAssociationService.create(association);
                }

            }

        }

        arrayDesignReportService.generateArrayDesignReport(arrayDesign.getId());

        this.deleteOldFiles(arrayDesign);

        log.info("Completed association processing for " + arrayDesign + ", " + numSkipped + " were skipped");
    } finally {
        // Previously the reader was only closed on the success path and leaked whenever an exception
        // (including the "Illegal format" IOException above) was thrown.
        b.close();
    }
}

From source file:ubic.gemma.loader.expression.arrayDesign.CompositeSequenceParser.java

/**
 * Builds a {@link CompositeSequence} from a tab-delimited line of exactly three columns: probe id, Genbank
 * accession and description.
 *
 * @param line the line to parse
 * @return the new composite sequence with an attached biological characteristic, or null if the line does not
 *         have exactly three columns
 */
@Override
public CompositeSequence parseOneLine(String line) {
    String[] columns = StringUtils.splitPreserveAllTokens(line, '\t');
    if (columns.length != 3) {
        return null;
    }

    String probeName = columns[0];
    String accession = columns[1];
    String text = columns[2];

    CompositeSequence probe = CompositeSequence.Factory.newInstance();
    probe.setName(probeid(probeName));
    probe.setDescription(text);

    // Name and description of the sequence will typically be changed later.
    BioSequence sequence = BioSequence.Factory.newInstance();
    sequence.setName(accession);
    sequence.setTaxon(taxon);
    sequence.setDescription(text + " (From platform source)");
    sequence.setSequenceDatabaseEntry(ExternalDatabaseUtils.getGenbankAccession(accession));

    probe.setBiologicalCharacteristic(sequence);
    return probe;
}

/** Identity helper: returns the probe id unchanged. */
private static String probeid(String id) {
    return id;
}

From source file:ubic.gemma.loader.expression.geo.GeoFamilyParser.java

/**
 * If a line does not have the same number of fields as the column headings, it is skipped.
 * /*from   w  w w .  j  a  v  a 2  s.  c  om*/
 * @param line
 */
private void parsePlatformLine(String line) {

    if (!haveReadPlatformHeader) {
        haveReadPlatformHeader = true;
        return;
    }
    GeoPlatform currentPlatform = results.getPlatformMap().get(currentPlatformAccession);
    assert currentPlatform != null;

    /*
     * Skip platform information when it is not going to be usable, unless we are ONLY parsing a platform.
     */
    // Actually this isn't as important, since we filter out bad elements.
    // if ( !processPlatformsOnly && !currentPlatform.useDataFromGeo() ) {
    // return;
    // }

    String[] tokens = StringUtils.splitPreserveAllTokens(line, FIELD_DELIM);

    List<String> columnNames = currentPlatform.getColumnNames();
    int numColumns = columnNames.size();

    if (numColumns != tokens.length && numWarnings < MAX_WARNINGS) {
        log.warn("Wrong number of tokens in line (" + tokens.length + ", expected " + numColumns
                + "), line was '" + line + "'; Possible corrupt file or invalid format?");
        numWarnings++;
        if (numWarnings == MAX_WARNINGS) {
            log.warn("Further warnings suppressed");
        }

        return;
    }

    GeoPlatform platform = currentPlatform;

    for (int i = 0; i < tokens.length; i++) {
        String token = tokens[i];
        String columnName = columnNames.get(i);
        platform.addToColumnData(columnName, token);
    }
    platformLines++;
}

From source file:ubic.gemma.loader.expression.geo.GeoFamilyParser.java

/**
 * The data for one sample is all the values for each quantitation type.
 * <p>
 * Important implementation note: In the sample table sections of GSEXXX_family files, the first column is always
 * ID_REF, according to the kind folks at NCBI. If this changes, this code will BREAK.
 * <p>
 * Similarly, the column names between the different samples are not necessarily the same, but we trust that they
 * all refer to the same quantitation types in the same order, for a given platform. That is, the nth column for
 * this sample 'means' the same thing as the nth column for another sample in this series (on the same platform). If
 * that isn't true, this will be BROKEN. However, we do try to sort it out if we can.
 * <p>
 * Blank lines are ignored and the first non-blank line is treated as the header. Lines with at most one field
 * are always skipped; an error is logged for them until {@code MAX_WARNINGS} messages have been issued.
 *
 * @param line a line from the sample data table
 * @see #initializeQuantitationTypes()
 */
private void parseSampleDataLine(String line) {

    if (StringUtils.isBlank(line))
        return;

    if (!haveReadSampleDataHeader) {
        haveReadSampleDataHeader = true;
        previousNumTokens = null;
        initializeQuantitationTypes();
        return;
    }

    GeoSample sample = results.getSampleMap().get(currentSampleAccession);

    /*
     * skip this step if it's not a supported platform type (RNA-seq, exon arrays: we put the data in later)
     */
    if (!sample.hasUsableData()) {
        return;
    }

    String[] tokens = StringUtils.splitPreserveAllTokens(line, FIELD_DELIM);

    assert tokens != null;

    /*
     * This can happen in some files that are mildly corrupted. -- we have to ignore it. Only the logging is
     * throttled; previously such lines stopped being skipped once the warning limit had been reached and were
     * then counted as data lines.
     */
    if (tokens.length <= 1) {
        if (numWarnings < MAX_WARNINGS) {
            log.error("Parse error, sample data line has too few elements (" + tokens.length + "), line was '"
                    + line + "'");
            numWarnings++;
            if (numWarnings == MAX_WARNINGS) {
                log.warn("Further warnings suppressed");
            }
        }
        return;
    }

    if (previousNumTokens != null && tokens.length != previousNumTokens) {
        log.warn("Last line had " + (previousNumTokens - 1) + " quantitation types, this one has "
                + (tokens.length - 1));
    }

    previousNumTokens = tokens.length;

    if (results.getSeriesMap().get(currentSeriesAccession) == null) {
        return; // this happens if we are parsing a GPL file.
    }

    GeoPlatform platformForSample = sample.getPlatforms().iterator().next(); // slow

    GeoValues values = results.getSeriesMap().get(currentSeriesAccession).getValues();

    String designElement = tokens[0]; // ID_REF. For bug 1709, adding toLower() will fix this.
    Map<Integer, Integer> map = quantitationTypeTargetColumn.get(platformForSample);

    for (int i = 1; i < tokens.length; i++) {
        String value = tokens[i];
        int qtIndex = i - 1;

        /*
         * This map tells us which column this quantitation type is SUPPOSED to go in.
         */

        if (map.containsKey(qtIndex))
            qtIndex = map.get(qtIndex);
        if (!isWantedQuantitationType(qtIndex)) {
            continue;
        }

        if (log.isTraceEnabled()) {
            log.trace("Adding: " + value + " to  quantitationType " + (qtIndex) + " for " + designElement);
        }
        values.addValue(sample, qtIndex, designElement, value);
        processedDesignElements.add(designElement);
    }

    sampleDataLines++;
}