Example usage for java.lang CharSequence subSequence

Introduction

On this page you can find example usages of java.lang.CharSequence.subSequence.

Prototype

CharSequence subSequence(int start, int end);

Source Link

Document

Returns a CharSequence that is a subsequence of this sequence.

Usage

From source file:org.archive.extractor.RegexHTMLLinkExtractor.java

/**
 * Walks every attribute of one tag and routes its value to the matching
 * handler: link, embed or script-code processing. Values from the
 * CLASSID/DATA, ARCHIVE and CODE groups are collected and, once all
 * attributes have been read, resolved against any CODEBASE value and
 * processed as embeds.
 *
 * @param element name of the tag being examined
 * @param cs      character sequence the attribute pattern is run against
 * @return {@code true} when {@code next} holds fewer entries on exit than
 *         it did on entry
 */
protected boolean processGeneralTag(CharSequence element, CharSequence cs) {

    Matcher attrMatcher = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);

    // Only filled in when CODEBASE / CLASSID / DATA / ARCHIVE / CODE attributes show up
    String codebase = null;
    ArrayList<String> resources = null;
    long sizeAtEntry = next.size();

    while (attrMatcher.find()) {
        // Take the first of groups 12 and 13 that took part in the match, else group 14
        final int valueGroup;
        if (attrMatcher.start(12) > -1) {
            valueGroup = 12;
        } else if (attrMatcher.start(13) > -1) {
            valueGroup = 13;
        } else {
            valueGroup = 14;
        }
        final int valueStart = attrMatcher.start(valueGroup);
        final int valueEnd = attrMatcher.end(valueGroup);
        final CharSequence value = cs.subSequence(valueStart, valueEnd);

        if (attrMatcher.start(2) > -1) {
            // HREF
            final LinkContext hrefContext = new HTMLLinkContext(element, attrMatcher.group(2));
            final String tagName = element.toString();
            if (tagName.equalsIgnoreCase(LINK)) {
                // <LINK> elements are embeds (css, ico, etc)
                processEmbed(value, hrefContext);
            } else {
                if (tagName.equalsIgnoreCase(BASE)) {
                    // <BASE href> replaces the base URI used for later resolution
                    try {
                        base = UURIFactory.getInstance(value.toString());
                    } catch (URIException e) {
                        extractErrorListener.noteExtractError(e, source, value);
                    }
                }
                // every non-LINK HREF (BASE included) is a link
                processLink(value, hrefContext);
            }
        } else if (attrMatcher.start(3) > -1) {
            // ACTION
            processLink(value, new HTMLLinkContext(element, attrMatcher.group(3)));
        } else if (attrMatcher.start(4) > -1) {
            // ON____
            processScriptCode(value); // TODO: context?
        } else if (attrMatcher.start(5) > -1) {
            // SRC etc.
            processEmbed(value, new HTMLLinkContext(element, attrMatcher.group(5)));
        } else if (attrMatcher.start(6) > -1) {
            // CODEBASE
            // TODO: more HTML deescaping?
            codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
            final LinkContext codebaseContext = new HTMLLinkContext(element, attrMatcher.group(6));
            processEmbed(codebase, codebaseContext);
        } else if (attrMatcher.start(7) > -1) {
            // CLASSID, DATA
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            resources.add(value.toString());
        } else if (attrMatcher.start(8) > -1) {
            // ARCHIVE: whitespace-separated list, each entry is its own resource
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            for (String archive : TextUtils.split(WHITESPACE, value)) {
                resources.add(archive);
            }
        } else if (attrMatcher.start(9) > -1) {
            // CODE
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            // An applet's code value gets '.class' appended unless it
            // already ends that way.
            final String code = value.toString();
            final boolean isApplet = element.toString().toLowerCase().equals(APPLET);
            final boolean needsClassExt = isApplet && !code.toLowerCase().endsWith(CLASSEXT);
            resources.add(needsClassExt ? code + CLASSEXT : code);
        } else if (attrMatcher.start(10) > -1) {
            // VALUE: only followed when it looks like a URI path
            if (TextUtils.matches(LIKELY_URI_PATH, value)) {
                processLink(value, new HTMLLinkContext(element, attrMatcher.group(10)));
            }
        }
        // Any other attribute (group 11) is ignored for now: path- or
        // script-looking strings should be vanishingly rare there and/or
        // symptomatic of page bugs.
    }
    TextUtils.recycleMatcher(attrMatcher);

    // NOTE(review): the results below are true only when next shrank during this
    // call; if the intent is "something was extracted", the comparison looks
    // inverted - confirm against callers before relying on it.
    if (resources == null) {
        return sizeAtEntry > next.size();
    }

    // handle codebase/resources
    UURI codebaseURI = null;
    String res = null;
    try {
        if (codebase != null) {
            // TODO: Pass in the charset.
            codebaseURI = UURIFactory.getInstance(base, codebase);
        }
        for (String resource : resources) {
            res = resource;
            // TODO: more HTML deescaping?
            res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP);
            if (codebaseURI != null) {
                res = codebaseURI.resolve(res).toString();
            }
            processEmbed(res, new HTMLLinkContext(element.toString())); // TODO: include attribute too
        }
    } catch (URIException e) {
        extractErrorListener.noteExtractError(e, source, codebase);
    } catch (IllegalArgumentException e) {
        DevUtils.logger.log(Level.WARNING,
                "processGeneralTag()\ncodebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(),
                e);
    }
    return sizeAtEntry > next.size();
}

From source file:com.cyberway.issue.extractor.RegexpHTMLLinkExtractor.java

/**
 * Walks every attribute of one tag and routes its value to the matching
 * handler: link, embed or script-code processing. Values from the
 * CLASSID/DATA, ARCHIVE and CODE groups are collected and, once all
 * attributes have been read, resolved against any CODEBASE value and
 * processed as embeds.
 *
 * @param element name of the tag being examined
 * @param cs      character sequence the attribute pattern is run against
 * @return {@code true} when {@code next} holds fewer entries on exit than
 *         it did on entry
 */
protected boolean processGeneralTag(CharSequence element, CharSequence cs) {

    Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);

    // Just in case it's an OBJECT or APPLET tag
    String codebase = null;
    ArrayList<String> resources = null;
    long tally = next.size();

    while (attr.find()) {
        // Take the first of groups 12 and 13 that took part in the match, else group 14
        int valueGroup = (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14;
        int start = attr.start(valueGroup);
        int end = attr.end(valueGroup);
        CharSequence value = cs.subSequence(start, end);
        if (attr.start(2) > -1) {
            // HREF
            CharSequence context = Link.elementContext(element, attr.group(2));
            if (element.toString().equalsIgnoreCase(LINK)) {
                // <LINK> elements treated as embeds (css, ico, etc)
                processEmbed(value, context);
            } else {
                if (element.toString().equalsIgnoreCase(BASE)) {
                    // <BASE href> replaces the base URI used for later resolution
                    try {
                        base = UURIFactory.getInstance(value.toString());
                    } catch (URIException e) {
                        extractErrorListener.noteExtractError(e, source, value);
                    }
                }
                // other HREFs treated as links
                processLink(value, context);
            }
        } else if (attr.start(3) > -1) {
            // ACTION
            CharSequence context = Link.elementContext(element, attr.group(3));
            processLink(value, context);
        } else if (attr.start(4) > -1) {
            // ON____
            processScriptCode(value); // TODO: context?
        } else if (attr.start(5) > -1) {
            // SRC etc.
            CharSequence context = Link.elementContext(element, attr.group(5));
            processEmbed(value, context);
        } else if (attr.start(6) > -1) {
            // CODEBASE
            // TODO: more HTML deescaping?
            codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
            CharSequence context = Link.elementContext(element, attr.group(6));
            processEmbed(codebase, context);
        } else if (attr.start(7) > -1) {
            // CLASSID, DATA
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            resources.add(value.toString());
        } else if (attr.start(8) > -1) {
            // ARCHIVE: whitespace-separated list, each entry is its own resource
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            for (String archive : TextUtils.split(WHITESPACE, value)) {
                resources.add(archive);
            }
        } else if (attr.start(9) > -1) {
            // CODE
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            // If element is applet and code value does not end with
            // '.class' then append '.class' to the code value.
            if (element.toString().toLowerCase().equals(APPLET)
                    && !value.toString().toLowerCase().endsWith(CLASSEXT)) {
                resources.add(value.toString() + CLASSEXT);
            } else {
                resources.add(value.toString());
            }

        } else if (attr.start(10) > -1) {
            // VALUE: only followed when it looks like a URI path
            if (TextUtils.matches(LIKELY_URI_PATH, value)) {
                CharSequence context = Link.elementContext(element, attr.group(10));
                processLink(value, context);
            }

        } else if (attr.start(11) > -1) {
            // any other attribute
            // ignore for now
            // could probe for path- or script-looking strings, but
            // those should be vanishingly rare in other attributes,
            // and/or symptomatic of page bugs
        }
    }
    TextUtils.recycleMatcher(attr);

    // NOTE(review): the results below are true only when next shrank during this
    // call; if the intent is "something was extracted", the operands look
    // reversed - confirm against callers before relying on it.

    // handle codebase/resources
    if (resources == null) {
        return (tally - next.size()) > 0;
    }
    UURI codebaseURI = null;
    String res = null;
    try {
        if (codebase != null) {
            // TODO: Pass in the charset.
            codebaseURI = UURIFactory.getInstance(base, codebase);
        }
        // Typed for-each replaces the former raw Iterator + toString() round-trip.
        for (String resource : resources) {
            // res is assigned before de-escaping so the warning below reports the raw value on failure
            res = resource;
            // TODO: more HTML deescaping?
            res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP);
            if (codebaseURI != null) {
                res = codebaseURI.resolve(res).toString();
            }
            processEmbed(res, element); // TODO: include attribute too
        }
    } catch (URIException e) {
        extractErrorListener.noteExtractError(e, source, codebase);
    } catch (IllegalArgumentException e) {
        DevUtils.logger.log(Level.WARNING,
                "processGeneralTag()\n" + "codebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(),
                e);
    }
    return (tally - next.size()) > 0;
}

From source file:org.archive.crawler.extractor.ExtractorHTML.java

/**
 * Dispatches an HREF-style value: a value matching {@code JAVASCRIPT} is
 * handed to script-code processing, anything else is added as a
 * navigation link and counted.
 *
 * @param curi    CrawlURI being processed
 * @param value   attribute value to handle
 * @param context context passed along when the value is added as a link
 */
protected void processLink(CrawlURI curi, final CharSequence value, CharSequence context) {
    if (!TextUtils.matches(JAVASCRIPT, value)) {
        if (logger.isLoggable(Level.FINEST)) {
            logger.finest("link: " + value.toString() + " from " + curi);
        }
        addLinkFromString(curi, value, context, Link.NAVLINK_HOP);
        this.numberOfLinksExtracted++;
        return;
    }
    // Drop the first 11 characters - presumably the "javascript:" prefix;
    // TODO confirm this agrees with what the JAVASCRIPT pattern accepts.
    final CharSequence scriptCode = value.subSequence(11, value.length());
    processScriptCode(curi, scriptCode);
}

From source file:com.cyberway.issue.crawler.extractor.ExtractorHTML.java

/**
 * Reads the name, http-equiv and content attributes of a META tag and acts
 * on two cases: a "robots" tag (its content is stored on the CrawlURI and
 * may end extraction) and a "refresh" tag (the text after the first '=' in
 * its content is added as a link).
 *
 * @param curi CrawlURI we're processing.
 * @param cs Sequence from underlying ReplayCharSequence. This
 * is TRANSIENT data. Make a copy if you want the data to live outside
 * of this extractors' lifetime.
 * @return {@code true} when a robots meta tag whose content contains
 * "nofollow" or "none" was seen and the robots honoring policy is either
 * absent or neither IGNORE nor CUSTOM; {@code false} otherwise.
 */
protected boolean processMeta(CrawlURI curi, CharSequence cs) {
    String name = null;
    String httpEquiv = null;
    String content = null;

    Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
    while (attr.find()) {
        // Take the first of groups 14 and 15 that took part in the match, else group 16
        final int valueGroup;
        if (attr.start(14) > -1) {
            valueGroup = 14;
        } else if (attr.start(15) > -1) {
            valueGroup = 15;
        } else {
            valueGroup = 16;
        }
        final CharSequence value = cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
        final String attrName = attr.group(1);
        if (attrName.equalsIgnoreCase("name")) {
            name = value.toString();
        } else if (attrName.equalsIgnoreCase("http-equiv")) {
            httpEquiv = value.toString();
        } else if (attrName.equalsIgnoreCase("content")) {
            content = value.toString();
        }
        // TODO: handle other stuff
    }
    TextUtils.recycleMatcher(attr);

    // Both handled cases need a content attribute
    if (content == null) {
        return false;
    }

    // Look for the 'robots' meta-tag
    if ("robots".equalsIgnoreCase(name)) {
        curi.putString(A_META_ROBOTS, content);
        RobotsHonoringPolicy policy = getSettingsHandler().getOrder().getRobotsHonoringPolicy();
        final String lowered = content.toLowerCase();
        final boolean honorsMetaRobots = policy == null
                || !(policy.isType(curi, RobotsHonoringPolicy.IGNORE)
                        || policy.isType(curi, RobotsHonoringPolicy.CUSTOM));
        if (honorsMetaRobots && (lowered.contains("nofollow") || lowered.contains("none"))) {
            // 'nofollow' or 'none' with a policy that is not IGNORE or
            // CUSTOM ends html extraction
            logger.fine("HTML extraction skipped due to robots meta-tag for: " + curi.toString());
            return true;
        }
        return false;
    }

    if (!"refresh".equalsIgnoreCase(httpEquiv)) {
        return false;
    }

    // The refresh target is whatever follows the first '='
    final int equalsAt = content.indexOf("=");
    if (equalsAt < 0) {
        return false;
    }
    final String refreshUri = content.substring(equalsAt + 1);
    try {
        curi.createAndAddLinkRelativeToBase(refreshUri, "meta", Link.REFER_HOP);
    } catch (URIException e) {
        if (getController() != null) {
            getController().logUriError(e, curi.getUURI(), refreshUri);
        } else {
            logger.info("Failed createAndAddLinkRelativeToBase " + curi + ", " + cs + ", " + refreshUri
                    + ": " + e);
        }
    }
    return false;
}

From source file:org.getobjects.appserver.core.WOMessage.java

/**
 * Appends the characters of {@code _s} from index {@code _start}
 * (inclusive) to {@code _end} (exclusive) by passing them to
 * {@code appendContentHTMLString}.
 *
 * @param _s     sequence to take the characters from; {@code null} is
 *               treated as the four characters "null", as the
 *               {@link Appendable} contract requires
 * @param _start index of the first character to append
 * @param _end   index just past the last character to append
 * @return this object
 * @throws IOException declared by the signature; whether the delegate can
 *                     raise it is not visible from this method
 */
public Appendable append(final CharSequence _s, int _start, int _end) throws IOException {
    // Appendable.append(CharSequence, int, int): a null sequence behaves like "null"
    final CharSequence source = (_s != null) ? _s : "null";
    this.appendContentHTMLString(source.subSequence(_start, _end).toString());
    return this;
}

From source file:org.archive.modules.extractor.ExtractorHTML.java

/**
 * Dispatches an HREF-style value: a value matching {@code JAVASCRIPT} is
 * handed to script-code processing, anything else is added as a
 * navigation link and counted.
 *
 * @param curi    CrawlURI being processed
 * @param value   attribute value to handle
 * @param context context passed along when the value is added as a link
 */
protected void processLink(CrawlURI curi, final CharSequence value, CharSequence context) {
    final boolean isJavascript = TextUtils.matches(JAVASCRIPT, value);
    if (isJavascript) {
        // Drop the first 11 characters - presumably the "javascript:" prefix;
        // TODO confirm this agrees with what the JAVASCRIPT pattern accepts.
        final CharSequence scriptCode = value.subSequence(11, value.length());
        processScriptCode(curi, scriptCode);
        return;
    }

    if (logger.isLoggable(Level.FINEST)) {
        logger.finest("link: " + value.toString() + " from " + curi);
    }
    addLinkFromString(curi, value, context, Hop.NAVLINK);
    numberOfLinksExtracted.incrementAndGet();
}

From source file:org.archive.modules.extractor.ExtractorHTML.java

/**
 * Process style text./*from  w w w .  j ava  2s . co  m*/
 * @param curi CrawlURI we're processing.
 * @param sequence Sequence from underlying ReplayCharSequence. This
 * is TRANSIENT data. Make a copy if you want the data to live outside
 * of this extractors' lifetime.
 * @param endOfOpenTag
 */
protected void processStyle(CrawlURI curi, CharSequence sequence, int endOfOpenTag) {
    // First, get attributes of script-open tag as per any other tag.
    processGeneralTag(curi, sequence.subSequence(0, 6), sequence.subSequence(0, endOfOpenTag));

    // then, parse for URIs
    numberOfLinksExtracted.addAndGet(
            ExtractorCSS.processStyleCode(this, curi, sequence.subSequence(endOfOpenTag, sequence.length())));
}

From source file:com.jecelyin.editor.v2.core.text.TextUtils.java

/**
 * Debugging aid that prints the spans attached to a CharSequence, one
 * line per span: the covered text, the span object's identity hash and
 * class name, its start-end range and its flags. A CharSequence that is
 * not a Spanned is printed whole on a single line.
 *
 * @param cs      the text to inspect
 * @param printer receives each output line
 * @param prefix  prepended to every output line
 */
public static void dumpSpans(CharSequence cs, Printer printer, String prefix) {
    if (!(cs instanceof Spanned)) {
        printer.println(prefix + cs + ": (no spans)");
        return;
    }

    final Spanned spanned = (Spanned) cs;
    for (Object span : spanned.getSpans(0, cs.length(), Object.class)) {
        final int spanStart = spanned.getSpanStart(span);
        final int spanEnd = spanned.getSpanEnd(span);
        final String identity = Integer.toHexString(System.identityHashCode(span));
        final String typeName = span.getClass().getCanonicalName();
        printer.println(prefix + cs.subSequence(spanStart, spanEnd) + ": " + identity + " " + typeName
                + " (" + spanStart + "-" + spanEnd + ") fl=#" + spanned.getSpanFlags(span));
    }
}

From source file:org.archive.modules.extractor.ExtractorHTML.java

/**
 * Process a script element: the open tag's attributes are handled like
 * those of any other tag, then the remainder of the sequence is handed to
 * script-code processing.
 *
 * @param curi         CrawlURI we're processing
 * @param sequence     character sequence holding the open tag followed by the code
 * @param endOfOpenTag index in {@code sequence} at which the open tag
 *                     ends and the code begins
 */
protected void processScript(CrawlURI curi, CharSequence sequence, int endOfOpenTag) {
    // Attributes of the script-open tag first, as per any other tag;
    // the element name is taken as the first 6 characters.
    final CharSequence element = sequence.subSequence(0, 6);
    final CharSequence openTag = sequence.subSequence(0, endOfOpenTag);
    processGeneralTag(curi, element, openTag);

    // Then best-effort string-analysis heuristics over whatever code
    // follows the open tag (false positives are OK).
    final CharSequence code = sequence.subSequence(endOfOpenTag, sequence.length());
    processScriptCode(curi, code);
}

From source file:edu.cornell.med.icb.goby.util.SimulateBisulfiteReads.java

/**
 * Simulates {@code numRepeats} reads of {@code readLength} bases taken from
 * {@code segmentBases} and writes one four-line record per read to
 * {@code writer}. Each 'C' in a read is either kept/mutated (when drawn as
 * methylated) or converted to 'T' (when not methylated and
 * {@code bisulfiteTreatment} is on).
 *
 * @param segmentBases bases of the segment the reads are drawn from
 * @param from         offset added to read-relative positions to obtain
 *                     the zero-based genomic position
 * @param writer       destination for the generated records; flushed
 *                     before returning
 * @throws IOException if writing to or flushing {@code writer} fails
 */
protected void process(CharSequence segmentBases, int from, Writer writer) throws IOException {

    final int segmentLength = segmentBases.length();
    for (int repeatCount = 0; repeatCount < numRepeats; repeatCount++) {
        // NOTE(review): when the segment is shorter than readLength the
        // subSequence call below will presumably throw - confirm whether
        // callers guarantee segmentLength >= readLength.
        int startReadPosition = choose(0, Math.max(0, segmentLength - 1 - readLength));
        boolean matchedReverseStrand = doReverseStrand && doForwardStrand ? random.nextBoolean()
                : doReverseStrand;
        // matchedReverseStrand can only be true when doReverseStrand is, so the
        // single combination to skip is a forward read with the forward strand off.
        if (!matchedReverseStrand && !doForwardStrand) {
            continue;
        }

        final CharSequence selectedReadRegion = segmentBases.subSequence(startReadPosition,
                startReadPosition + readLength);
        CharSequence readBases = matchedReverseStrand ? reverseComplement(selectedReadRegion)
                : selectedReadRegion;

        MutableString sequenceInitial = new MutableString();
        MutableString sequenceTreated = new MutableString();
        MutableString log = new MutableString();
        IntArrayList mutatedPositions = new IntArrayList();

        for (int i = 0; i < readLength; i++) {

            char base = readBases.charAt(i);
            // genomic position is zero-based; on the reverse strand read index i
            // maps back from the end of the selected region
            int genomicPosition = matchedReverseStrand ? readLength - (i + 1) + from + startReadPosition
                    : i + startReadPosition + from;
            sequenceInitial.append(base);

            if (base == 'C') {

                boolean isBaseMethylated = random
                        .nextDouble() <= getMethylationRateAtPosition(matchedReverseStrand, genomicPosition);

                if (isBaseMethylated) {
                    // base is methylated, stays a C on forward or reverse strand
                    if (!bisulfiteTreatment) {
                        // without bisulfite treatment, introduce mutation C -> G instead
                        base = 'G';
                    }
                    // bases that are methylated are protected and stay C on the forward strand. They would also
                    // be seen as G on the opposite strand if the sequencing protocol did not respect strandness
                    log.append(bisulfiteTreatment ? "met: " : "mut: ");
                    log.append(genomicPosition + 1); // write 1-based position
                    log.append(' ');

                    log.append("read-index: ");
                    log.append(i + 1);
                    log.append(' ');
                    mutatedPositions.add(genomicPosition);

                } else {
                    // bases that are not methylated are changed to T through the bisulfite and PCR conversion steps
                    if (bisulfiteTreatment) {
                        base = 'T';
                    }
                }
            }
            sequenceTreated.append(base);
        }
        MutableString coveredPositions = new MutableString();
        // every base of the read gets the same quality character (score 40)
        MutableString qualityScores = new MutableString();
        for (int i = 0; i < readLength; i++) {
            final char c = QualityEncoding.ILLUMINA.phredQualityScoreToAsciiEncoding((byte) 40);
            qualityScores.append(c);
        }
        // zero-based positions covered by the read:
        IntArrayList readCoveredPositions = new IntArrayList();

        for (int i = startReadPosition + from; i < startReadPosition + from + readLength; i++) {
            // positions are written 1-based
            coveredPositions.append(i + 1);
            coveredPositions.append(" ");
            readCoveredPositions.add(i);
        }

        // sanity check: every recorded position must lie within the read
        readCoveredPositions.retainAll(mutatedPositions);
        assert readCoveredPositions.size() == mutatedPositions
                .size() : "positions mutated or changed must be covered by read.";
        writer.write(String.format("@%d reference: %s startPosition: %d strand: %s %s %s%n%s%n+%n%s%n",
                repeatCount, refChoice, startReadPosition, matchedReverseStrand ? "-1" : "+1", log,
                coveredPositions, complement(sequenceTreated), qualityScores));
    }
    writer.flush();

}