List of usage examples for java.lang.CharSequence.subSequence
/**
 * Returns the part of this sequence between two indices.
 *
 * @param start index of the first character to include
 * @param end index just past the last character to include
 * @return the requested subsequence
 */
CharSequence subSequence(int start, int end);
From source file:org.archive.extractor.RegexHTMLLinkExtractor.java
protected boolean processGeneralTag(CharSequence element, CharSequence cs) { Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs); // Just in case it's an OBJECT or APPLET tag String codebase = null;/*from w w w .jav a 2s . co m*/ ArrayList<String> resources = null; long tally = next.size(); while (attr.find()) { int valueGroup = (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14; int start = attr.start(valueGroup); int end = attr.end(valueGroup); CharSequence value = cs.subSequence(start, end); if (attr.start(2) > -1) { // HREF LinkContext context = new HTMLLinkContext(element, attr.group(2)); if (element.toString().equalsIgnoreCase(LINK)) { // <LINK> elements treated as embeds (css, ico, etc) processEmbed(value, context); } else { if (element.toString().equalsIgnoreCase(BASE)) { try { base = UURIFactory.getInstance(value.toString()); } catch (URIException e) { extractErrorListener.noteExtractError(e, source, value); } } // other HREFs treated as links processLink(value, context); } } else if (attr.start(3) > -1) { // ACTION LinkContext context = new HTMLLinkContext(element, attr.group(3)); processLink(value, context); } else if (attr.start(4) > -1) { // ON____ processScriptCode(value); // TODO: context? } else if (attr.start(5) > -1) { // SRC etc. LinkContext context = new HTMLLinkContext(element, attr.group(5)); processEmbed(value, context); } else if (attr.start(6) > -1) { // CODEBASE // TODO: more HTML deescaping? 
codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP); LinkContext context = new HTMLLinkContext(element, attr.group(6)); processEmbed(codebase, context); } else if (attr.start(7) > -1) { // CLASSID, DATA if (resources == null) { resources = new ArrayList<String>(); } resources.add(value.toString()); } else if (attr.start(8) > -1) { // ARCHIVE if (resources == null) { resources = new ArrayList<String>(); } String[] multi = TextUtils.split(WHITESPACE, value); for (int i = 0; i < multi.length; i++) { resources.add(multi[i]); } } else if (attr.start(9) > -1) { // CODE if (resources == null) { resources = new ArrayList<String>(); } // If element is applet and code value does not end with // '.class' then append '.class' to the code value. if (element.toString().toLowerCase().equals(APPLET) && !value.toString().toLowerCase().endsWith(CLASSEXT)) { resources.add(value.toString() + CLASSEXT); } else { resources.add(value.toString()); } } else if (attr.start(10) > -1) { // VALUE if (TextUtils.matches(LIKELY_URI_PATH, value)) { LinkContext context = new HTMLLinkContext(element, attr.group(10)); processLink(value, context); } } else if (attr.start(11) > -1) { // any other attribute // ignore for now // could probe for path- or script-looking strings, but // those should be vanishingly rare in other attributes, // and/or symptomatic of page bugs } } TextUtils.recycleMatcher(attr); // handle codebase/resources if (resources == null) { return (tally - next.size()) > 0; } Iterator<String> iter = resources.iterator(); UURI codebaseURI = null; String res = null; try { if (codebase != null) { // TODO: Pass in the charset. codebaseURI = UURIFactory.getInstance(base, codebase); } while (iter.hasNext()) { res = iter.next().toString(); // TODO: more HTML deescaping? 
res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP); if (codebaseURI != null) { res = codebaseURI.resolve(res).toString(); } processEmbed(res, new HTMLLinkContext(element.toString())); // TODO: include attribute too } } catch (URIException e) { extractErrorListener.noteExtractError(e, source, codebase); } catch (IllegalArgumentException e) { DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" + "codebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(), e); } return (tally - next.size()) > 0; }
From source file:com.cyberway.issue.extractor.RegexpHTMLLinkExtractor.java
protected boolean processGeneralTag(CharSequence element, CharSequence cs) { Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs); // Just in case it's an OBJECT or APPLET tag String codebase = null;/*w w w. ja v a 2 s. co m*/ ArrayList<String> resources = null; long tally = next.size(); while (attr.find()) { int valueGroup = (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14; int start = attr.start(valueGroup); int end = attr.end(valueGroup); CharSequence value = cs.subSequence(start, end); if (attr.start(2) > -1) { // HREF CharSequence context = Link.elementContext(element, attr.group(2)); if (element.toString().equalsIgnoreCase(LINK)) { // <LINK> elements treated as embeds (css, ico, etc) processEmbed(value, context); } else { if (element.toString().equalsIgnoreCase(BASE)) { try { base = UURIFactory.getInstance(value.toString()); } catch (URIException e) { extractErrorListener.noteExtractError(e, source, value); } } // other HREFs treated as links processLink(value, context); } } else if (attr.start(3) > -1) { // ACTION CharSequence context = Link.elementContext(element, attr.group(3)); processLink(value, context); } else if (attr.start(4) > -1) { // ON____ processScriptCode(value); // TODO: context? } else if (attr.start(5) > -1) { // SRC etc. CharSequence context = Link.elementContext(element, attr.group(5)); processEmbed(value, context); } else if (attr.start(6) > -1) { // CODEBASE // TODO: more HTML deescaping? 
codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP); CharSequence context = Link.elementContext(element, attr.group(6)); processEmbed(codebase, context); } else if (attr.start(7) > -1) { // CLASSID, DATA if (resources == null) { resources = new ArrayList<String>(); } resources.add(value.toString()); } else if (attr.start(8) > -1) { // ARCHIVE if (resources == null) { resources = new ArrayList<String>(); } String[] multi = TextUtils.split(WHITESPACE, value); for (int i = 0; i < multi.length; i++) { resources.add(multi[i]); } } else if (attr.start(9) > -1) { // CODE if (resources == null) { resources = new ArrayList<String>(); } // If element is applet and code value does not end with // '.class' then append '.class' to the code value. if (element.toString().toLowerCase().equals(APPLET) && !value.toString().toLowerCase().endsWith(CLASSEXT)) { resources.add(value.toString() + CLASSEXT); } else { resources.add(value.toString()); } } else if (attr.start(10) > -1) { // VALUE if (TextUtils.matches(LIKELY_URI_PATH, value)) { CharSequence context = Link.elementContext(element, attr.group(10)); processLink(value, context); } } else if (attr.start(11) > -1) { // any other attribute // ignore for now // could probe for path- or script-looking strings, but // those should be vanishingly rare in other attributes, // and/or symptomatic of page bugs } } TextUtils.recycleMatcher(attr); // handle codebase/resources if (resources == null) { return (tally - next.size()) > 0; } Iterator iter = resources.iterator(); UURI codebaseURI = null; String res = null; try { if (codebase != null) { // TODO: Pass in the charset. codebaseURI = UURIFactory.getInstance(base, codebase); } while (iter.hasNext()) { res = iter.next().toString(); // TODO: more HTML deescaping? 
res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP); if (codebaseURI != null) { res = codebaseURI.resolve(res).toString(); } processEmbed(res, element); // TODO: include attribute too } } catch (URIException e) { extractErrorListener.noteExtractError(e, source, codebase); } catch (IllegalArgumentException e) { DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" + "codebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(), e); } return (tally - next.size()) > 0; }
From source file:org.archive.crawler.extractor.ExtractorHTML.java
/** * Handle generic HREF cases.//from w w w .j a va 2 s . c om * * @param curi * @param value * @param context */ protected void processLink(CrawlURI curi, final CharSequence value, CharSequence context) { // System.out.println("I'm processLink "+curi.toString()+""); if (TextUtils.matches(JAVASCRIPT, value)) { processScriptCode(curi, value.subSequence(11, value.length())); } else { if (logger.isLoggable(Level.FINEST)) { logger.finest("link: " + value.toString() + " from " + curi); } addLinkFromString(curi, value, context, Link.NAVLINK_HOP); this.numberOfLinksExtracted++; } }
From source file:com.cyberway.issue.crawler.extractor.ExtractorHTML.java
/** * Process metadata tags.//from w w w .jav a 2s . com * @param curi CrawlURI we're processing. * @param cs Sequence from underlying ReplayCharSequence. This * is TRANSIENT data. Make a copy if you want the data to live outside * of this extractors' lifetime. * @return True robots exclusion metatag. */ protected boolean processMeta(CrawlURI curi, CharSequence cs) { Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs); String name = null; String httpEquiv = null; String content = null; while (attr.find()) { int valueGroup = (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16; CharSequence value = cs.subSequence(attr.start(valueGroup), attr.end(valueGroup)); if (attr.group(1).equalsIgnoreCase("name")) { name = value.toString(); } else if (attr.group(1).equalsIgnoreCase("http-equiv")) { httpEquiv = value.toString(); } else if (attr.group(1).equalsIgnoreCase("content")) { content = value.toString(); } // TODO: handle other stuff } TextUtils.recycleMatcher(attr); // Look for the 'robots' meta-tag if ("robots".equalsIgnoreCase(name) && content != null) { curi.putString(A_META_ROBOTS, content); RobotsHonoringPolicy policy = getSettingsHandler().getOrder().getRobotsHonoringPolicy(); String contentLower = content.toLowerCase(); if ((policy == null || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE) && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM))) && (contentLower.indexOf("nofollow") >= 0 || contentLower.indexOf("none") >= 0)) { // if 'nofollow' or 'none' is specified and the // honoring policy is not IGNORE or CUSTOM, end html extraction logger.fine("HTML extraction skipped due to robots meta-tag for: " + curi.toString()); return true; } } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) { int urlIndex = content.indexOf("=") + 1; if (urlIndex > 0) { String refreshUri = content.substring(urlIndex); try { curi.createAndAddLinkRelativeToBase(refreshUri, "meta", Link.REFER_HOP); } catch (URIException e) { if (getController() != 
null) { getController().logUriError(e, curi.getUURI(), refreshUri); } else { logger.info("Failed createAndAddLinkRelativeToBase " + curi + ", " + cs + ", " + refreshUri + ": " + e); } } } } return false; }
From source file:org.getobjects.appserver.core.WOMessage.java
/**
 * Appends a range of the given character sequence by converting it to a
 * string and passing it to {@code appendContentHTMLString}.
 *
 * @param _s sequence to take the characters from; as required by the
 *           {@link Appendable} contract, {@code null} is handled as if it
 *           were the four characters {@code "null"}
 * @param _start index of the first character to append
 * @param _end index just past the last character to append
 * @return this object
 * @throws IOException declared by {@link Appendable}
 * @throws IndexOutOfBoundsException if the range is invalid for the sequence
 */
public Appendable append(final CharSequence _s, int _start, int _end) throws IOException {
    // Appendable specifies "null" as the content of a null sequence; without
    // this substitution a null argument would fail with a NullPointerException.
    final CharSequence source = (_s != null) ? _s : "null";
    this.appendContentHTMLString(source.subSequence(_start, _end).toString());
    return this;
}
From source file:org.archive.modules.extractor.ExtractorHTML.java
/**
 * Handles a generic HREF value.
 *
 * <p>A value matching {@code JAVASCRIPT} is passed on as script code with its
 * first 11 characters removed (presumably the {@code javascript:} prefix -
 * confirm against the pattern). Any other value is added as a navigation
 * link and the extracted-link counter is incremented.
 *
 * @param curi URI currently being processed
 * @param value attribute value to handle
 * @param context context passed along with the link
 */
protected void processLink(CrawlURI curi, final CharSequence value, CharSequence context) {
    final boolean isScript = TextUtils.matches(JAVASCRIPT, value);
    if (isScript) {
        final CharSequence scriptBody = value.subSequence(11, value.length());
        processScriptCode(curi, scriptBody);
        return;
    }

    if (logger.isLoggable(Level.FINEST)) {
        logger.finest("link: " + value.toString() + " from " + curi);
    }
    addLinkFromString(curi, value, context, Hop.NAVLINK);
    numberOfLinksExtracted.incrementAndGet();
}
From source file:org.archive.modules.extractor.ExtractorHTML.java
/** * Process style text./*from w w w . j ava 2s . co m*/ * @param curi CrawlURI we're processing. * @param sequence Sequence from underlying ReplayCharSequence. This * is TRANSIENT data. Make a copy if you want the data to live outside * of this extractors' lifetime. * @param endOfOpenTag */ protected void processStyle(CrawlURI curi, CharSequence sequence, int endOfOpenTag) { // First, get attributes of script-open tag as per any other tag. processGeneralTag(curi, sequence.subSequence(0, 6), sequence.subSequence(0, endOfOpenTag)); // then, parse for URIs numberOfLinksExtracted.addAndGet( ExtractorCSS.processStyleCode(this, curi, sequence.subSequence(endOfOpenTag, sequence.length()))); }
From source file:com.jecelyin.editor.v2.core.text.TextUtils.java
/** * Debugging tool to print the spans in a CharSequence. The output will * be printed one span per line. If the CharSequence is not a Spanned, * then the entire string will be printed on a single line. *///from ww w .j a v a 2s . c om public static void dumpSpans(CharSequence cs, Printer printer, String prefix) { if (cs instanceof Spanned) { Spanned sp = (Spanned) cs; Object[] os = sp.getSpans(0, cs.length(), Object.class); for (int i = 0; i < os.length; i++) { Object o = os[i]; printer.println(prefix + cs.subSequence(sp.getSpanStart(o), sp.getSpanEnd(o)) + ": " + Integer.toHexString(System.identityHashCode(o)) + " " + o.getClass().getCanonicalName() + " (" + sp.getSpanStart(o) + "-" + sp.getSpanEnd(o) + ") fl=#" + sp.getSpanFlags(o)); } } else { printer.println(prefix + cs + ": (no spans)"); } }
From source file:org.archive.modules.extractor.ExtractorHTML.java
protected void processScript(CrawlURI curi, CharSequence sequence, int endOfOpenTag) { // first, get attributes of script-open tag // as per any other tag processGeneralTag(curi, sequence.subSequence(0, 6), sequence.subSequence(0, endOfOpenTag)); // then, apply best-effort string-analysis heuristics // against any code present (false positives are OK) processScriptCode(curi, sequence.subSequence(endOfOpenTag, sequence.length())); }
From source file:edu.cornell.med.icb.goby.util.SimulateBisulfiteReads.java
protected void process(CharSequence segmentBases, int from, Writer writer) throws IOException { int segmentLength = segmentBases.length(); for (int repeatCount = 0; repeatCount < numRepeats; repeatCount++) { int startReadPosition = choose(0, Math.max(0, segmentBases.length() - 1 - readLength)); boolean matchedReverseStrand = doReverseStrand && doForwardStrand ? random.nextBoolean() : doReverseStrand;//w ww . ja va 2 s .co m if (matchedReverseStrand && !doReverseStrand) continue; if (!matchedReverseStrand && !doForwardStrand) continue; final CharSequence selectedReadRegion = segmentBases.subSequence(startReadPosition, startReadPosition + readLength); CharSequence readBases = matchedReverseStrand ? reverseComplement(selectedReadRegion) : selectedReadRegion; MutableString sequenceInitial = new MutableString(); MutableString sequenceTreated = new MutableString(); MutableString log = new MutableString(); IntArrayList mutatedPositions = new IntArrayList(); for (int i = 0; i < readLength; i++) { char base = readBases.charAt(i); // genomic position is zero-based int genomicPosition = matchedReverseStrand ? readLength - (i + 1) + from + startReadPosition : i + startReadPosition + from; sequenceInitial.append(base); if (base == 'C') { boolean isBaseMethylated = random .nextDouble() <= getMethylationRateAtPosition(matchedReverseStrand, genomicPosition); if (isBaseMethylated) { // base is methylated, stays a C on forward or reverse strand if (!bisulfiteTreatment) { // mutate base to G // introduce mutation C -> G base = 'G'; } // bases that are methylated are protected and stay C on the forward strand. They would also // be seen as G on the opposite strand if the sequencing protocol did not respect strandness log.append(bisulfiteTreatment ? 
"met: " : "mut: "); log.append(genomicPosition + 1); // write 1-based position log.append(' '); log.append("read-index: "); log.append(i + 1); log.append(' '); mutatedPositions.add(genomicPosition); } else { // bases that are not methylated are changed to T through the bisulfite and PCR conversion steps if (bisulfiteTreatment) { base = 'T'; } } } sequenceTreated.append(base); } MutableString coveredPositions = new MutableString(); MutableString qualityScores = new MutableString(); for (int i = 0; i < readLength; i++) { final char c = QualityEncoding.ILLUMINA.phredQualityScoreToAsciiEncoding((byte) 40); qualityScores.append(c); } // zero-based positions covered by the read: IntArrayList readCoveredPositions = new IntArrayList(); for (int i = startReadPosition + from; i < startReadPosition + from + readLength; i++) { // positions are written 1-based coveredPositions.append(i + 1); coveredPositions.append(" "); readCoveredPositions.add(i); } readCoveredPositions.retainAll(mutatedPositions); assert readCoveredPositions.size() == mutatedPositions .size() : "positions mutated or changed must be covered by read."; // System.out.printf("initial: %s%nbis: %s%n", sequenceInitial, sequenceTreated); writer.write(String.format("@%d reference: %s startPosition: %d strand: %s %s %s%n%s%n+%n%s%n", repeatCount, refChoice, startReadPosition, matchedReverseStrand ? "-1" : "+1", log, coveredPositions, complement(sequenceTreated), qualityScores)); } writer.flush(); }