// Java tutorial
/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules.extractor; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Iterator; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import org.apache.commons.httpclient.URIException; import org.archive.io.ReplayCharSequence; import org.archive.modules.CrawlMetadata; import org.archive.modules.CrawlURI; import org.archive.modules.net.RobotsPolicy; import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.util.DevUtils; import org.archive.util.TextUtils; import org.archive.util.UriUtils; import org.springframework.beans.factory.InitializingBean; import org.springframework.beans.factory.annotation.Autowired; /** * Basic link-extraction, from an HTML content-body, * using regular expressions. * * NOTE: This processor may open a ReplayCharSequence from the * CrawlURI's Recorder, without closing that ReplayCharSequence, to allow * reuse by later processors in sequence. In the usual (Heritrix) case, a * call after all processing to the Recorder's endReplays() method ensures * timely close of any reused ReplayCharSequences. 
Reuse of this processor * elsewhere should ensure a similar cleanup call to Recorder.endReplays() * occurs. * * TODO: Compare against extractors based on HTML parsing libraries for * accuracy, completeness, and speed. * * @contributor gojomo */ public class ExtractorHTML extends ContentExtractor implements InitializingBean { @SuppressWarnings("unused") private static final long serialVersionUID = 2L; private static Logger logger = Logger.getLogger(ExtractorHTML.class.getName()); private final static String MAX_ELEMENT_REPLACE = "MAX_ELEMENT"; private final static String MAX_ATTR_NAME_REPLACE = "MAX_ATTR_NAME"; private final static String MAX_ATTR_VAL_REPLACE = "MAX_ATTR_VAL"; public final static String A_META_ROBOTS = "meta-robots"; public final static String A_FORM_OFFSETS = "form-offsets"; { setMaxElementLength(64); } public int getMaxElementLength() { return (Integer) kp.get("maxElementLength"); } public void setMaxElementLength(int max) { kp.put("maxElementLength", max); } /** * Relevant tag extractor. * * <p> * This pattern extracts either: * </p> * <ul> * <li>(1) whole <script>...</script> or * <li>(2) <style>...</style> or * <li>(3) <meta ...> or * <li>(4) any other open-tag with at least one attribute (eg matches * "<a href='boo'>" but not "</a>" or "<br>") * </ul> * <p> * groups: * </p> * <ul> * <li>1: SCRIPT SRC=foo>boo</SCRIPT * <li>2: just script open tag * <li>3: STYLE TYPE=moo>zoo</STYLE * <li>4: just style open tag * <li>5: entire other tag, without '<' '>' * <li>6: element * <li>7: META * <li>8: !-- comment -- * </ul> * * <p> * HER-1998 - Modified part 8 to allow conditional html comments. * Conditional HTML comment example: * "<!--[if expression]> HTML <![endif]-->" * </p> * * <p> * This technique is commonly used to reference CSS & JavaScript that * are designed to deal with the quirks of a specific version of Internet * Explorer. 
There is another syntax for conditional comments which already * gets parsed by the regex since it doesn't start with "<!--" Ex. * <!if expression> HTML <!endif> * </p> * * <p> * https://en.wikipedia.org/wiki/Conditional_Comments * </p> */ // version w/ less unnecessary backtracking static final String RELEVANT_TAG_EXTRACTOR = "(?is)<(?:((script[^>]*+)>.*?</script)" + // 1, 2 "|((style[^>]*+)>.*?</style)" + // 3, 4 "|(((meta)|(?:\\w{1," + MAX_ELEMENT_REPLACE + "}))\\s+[^>]*+)" + // 5, 6, 7 "|(!--(?!\\[if|>).*?--))>"; // 8 // version w/ problems with unclosed script tags // static final String RELEVANT_TAG_EXTRACTOR = // "(?is)<(?:((script.*?)>.*?</script)|((style.*?)>.*?</style)|(((meta)|(?:\\w+))\\s+.*?)|(!--.*?--))>"; // // this pattern extracts 'href' or 'src' attributes from // // any open-tag innards matched by the above // static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile( // "(?is)(\\w+)(?:\\s+|(?:\\s.*?\\s))(?:(href)|(src))\\s*=(?:(?:\\s*\"(.+?)\")|(?:\\s*'(.+?)')|(\\S+))"); // // // this pattern extracts 'robots' attributes // static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile( // "(?is)(\\w+)\\s+.*?(?:(robots))\\s*=(?:(?:\\s*\"(.+)\")|(?:\\s*'(.+)')|(\\S+))"); { setMaxAttributeNameLength(64); // 64 chars } public int getMaxAttributeNameLength() { return (Integer) kp.get("maxAttributeNameLength"); } public void setMaxAttributeNameLength(int max) { kp.put("maxAttributeNameLength", max); } { setMaxAttributeValLength(2048); // 2K } public int getMaxAttributeValLength() { return (Integer) kp.get("maxAttributeValLength"); } public void setMaxAttributeValLength(int max) { kp.put("maxAttributeValLength", max); } // TODO: perhaps cut to near MAX_URI_LENGTH // this pattern extracts attributes from any open-tag innards // matched by the above. 
attributes known to be URIs of various // sorts are matched specially static final String EACH_ATTRIBUTE_EXTRACTOR = "(?is)\\s?((href)|(action)|(on\\w*)" // 1, 2, 3, 4 + "|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" // ... + "|(?:usemap)|(?:profile)|(?:datasrc))" // 5 + "|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9 + "|(value)|(style)|(method)" // 10, 11, 12 + "|([-\\w]{1," + MAX_ATTR_NAME_REPLACE + "}))" // 13 + "\\s*=\\s*" + "(?:(?:\"(.{0," + MAX_ATTR_VAL_REPLACE + "}?)(?:\"|$))" // 14 + "|(?:'(.{0," + MAX_ATTR_VAL_REPLACE + "}?)(?:'|$))" // 15 + "|(\\S{1," + MAX_ATTR_VAL_REPLACE + "}))"; // 16 // groups: // 1: attribute name // 2: HREF - single URI relative to doc base, or occasionally javascript: // 3: ACTION - single URI relative to doc base, or occasionally javascript: // 4: ON[WHATEVER] - script handler // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC // single URI relative to doc base // 6: CODEBASE - a single URI relative to doc base, affecting other // attributes // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied) // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE // (if supplied) // 9: CODE - a single URI relative to the CODEBASE (is specified). // 10: VALUE - often includes a uri path on forms // 11: STYLE - inline attribute style info // 12: METHOD - form GET/POST // 13: any other attribute // 14: double-quote delimited attr value // 15: single-quote delimited attr value // 16: space-delimited attr value static final String WHITESPACE = "\\s"; static final String CLASSEXT = ".class"; static final String APPLET = "applet"; static final String BASE = "base"; static final String LINK = "link"; static final String FRAME = "frame"; static final String IFRAME = "iframe"; /** * If true, FRAME/IFRAME SRC-links are treated as embedded resources (like * IMG, 'E' hop-type), otherwise they are treated as navigational links. * Default is true. 
*/ { setTreatFramesAsEmbedLinks(true); } public boolean getTreatFramesAsEmbedLinks() { return (Boolean) kp.get("treatFramesAsEmbedLinks"); } public void setTreatFramesAsEmbedLinks(boolean asEmbeds) { kp.put("treatFramesAsEmbedLinks", asEmbeds); } /** * If true, URIs appearing as the ACTION attribute in HTML FORMs are * ignored. Default is false. */ { setIgnoreFormActionUrls(false); } public boolean getIgnoreFormActionUrls() { return (Boolean) kp.get("ignoreFormActionUrls"); } public void setIgnoreFormActionUrls(boolean ignoreActions) { kp.put("ignoreFormActionUrls", ignoreActions); } /** * If true, only ACTION URIs with a METHOD of GET (explicit or implied) * are extracted. Default is true. */ { setExtractOnlyFormGets(true); } public boolean getExtractOnlyFormGets() { return (Boolean) kp.get("extractOnlyFormGets"); } public void setExtractOnlyFormGets(boolean onlyGets) { kp.put("extractOnlyFormGets", onlyGets); } /** * If true, in-page Javascript is scanned for strings that * appear likely to be URIs. This typically finds both valid * and invalid URIs, and attempts to fetch the invalid URIs * sometimes generates webmaster concerns over odd crawler * behavior. Default is true. */ { setExtractJavascript(true); } public boolean getExtractJavascript() { return (Boolean) kp.get("extractJavascript"); } public void setExtractJavascript(boolean extractJavascript) { kp.put("extractJavascript", extractJavascript); } /** * If true, strings that look like URIs found in unusual places (such as * form VALUE attributes) will be extracted. This typically finds both valid * and invalid URIs, and attempts to fetch the invalid URIs sometimes * generate webmaster concerns over odd crawler behavior. Default is true. 
*/ { setExtractValueAttributes(true); } public boolean getExtractValueAttributes() { return (Boolean) kp.get("extractValueAttributes"); } public void setExtractValueAttributes(boolean extractValueAttributes) { kp.put("extractValueAttributes", extractValueAttributes); } /** * If true, URIs which end in typical non-HTML extensions (such as .gif) * will not be scanned as if it were HTML. Default is true. */ { setIgnoreUnexpectedHtml(true); } public boolean getIgnoreUnexpectedHtml() { return (Boolean) kp.get("ignoreUnexpectedHtml"); } public void setIgnoreUnexpectedHtml(boolean ignoreUnexpectedHtml) { kp.put("ignoreUnexpectedHtml", ignoreUnexpectedHtml); } /** * CrawlMetadata provides the robots honoring policy to use when * considering a robots META tag. */ protected CrawlMetadata metadata; public CrawlMetadata getMetadata() { return metadata; } @Autowired public void setMetadata(CrawlMetadata provider) { this.metadata = provider; } /** * Javascript extractor to use to process inline javascript. Autowired if * available. If null, links will not be extracted from inline javascript. 
*/ transient protected ExtractorJS extractorJS; public ExtractorJS getExtractorJS() { return extractorJS; } @Autowired public void setExtractorJS(ExtractorJS extractorJS) { this.extractorJS = extractorJS; } // TODO: convert to Strings private String relevantTagPattern; private String eachAttributePattern; public ExtractorHTML() { } public void afterPropertiesSet() { String regex = RELEVANT_TAG_EXTRACTOR; regex = regex.replace(MAX_ELEMENT_REPLACE, Integer.toString(getMaxElementLength())); this.relevantTagPattern = regex; regex = EACH_ATTRIBUTE_EXTRACTOR; regex = regex.replace(MAX_ATTR_NAME_REPLACE, Integer.toString(getMaxAttributeNameLength())); regex = regex.replace(MAX_ATTR_VAL_REPLACE, Integer.toString(getMaxAttributeValLength())); this.eachAttributePattern = regex; } protected void processGeneralTag(CrawlURI curi, CharSequence element, CharSequence cs) { Matcher attr = TextUtils.getMatcher(eachAttributePattern, cs); // Just in case it's an OBJECT or APPLET tag String codebase = null; ArrayList<String> resources = null; // Just in case it's a FORM CharSequence action = null; CharSequence actionContext = null; CharSequence method = null; // Just in case it's a VALUE whose interpretation depends on accompanying NAME CharSequence valueVal = null; CharSequence valueContext = null; CharSequence nameVal = null; final boolean framesAsEmbeds = getTreatFramesAsEmbedLinks(); final boolean ignoreFormActions = getIgnoreFormActionUrls(); final boolean extractValueAttributes = getExtractValueAttributes(); final String elementStr = element.toString(); while (attr.find()) { int valueGroup = (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 
15 : 16; int start = attr.start(valueGroup); int end = attr.end(valueGroup); assert start >= 0 : "Start is: " + start + ", " + curi; assert end >= 0 : "End is :" + end + ", " + curi; CharSequence value = cs.subSequence(start, end); CharSequence attrName = cs.subSequence(attr.start(1), attr.end(1)); value = TextUtils.unescapeHtml(value); if (attr.start(2) > -1) { CharSequence context; // HREF if ("a".equals(element) && TextUtils.matches("(?i).*data-remote\\s*=\\s*([\"'])true.*\\1", cs)) { context = "a[data-remote='true']/@href"; } else { context = elementContext(element, attr.group(2)); } if ("a[data-remote='true']/@href".equals(context) || elementStr.equalsIgnoreCase(LINK)) { // <LINK> elements treated as embeds (css, ico, etc) processEmbed(curi, value, context); } else { // other HREFs treated as links processLink(curi, value, context); } if (elementStr.equalsIgnoreCase(BASE)) { try { UURI base = UURIFactory.getInstance(value.toString()); curi.setBaseURI(base); } catch (URIException e) { logUriError(e, curi.getUURI(), value); } } } else if (attr.start(3) > -1) { // ACTION if (!ignoreFormActions) { action = value; actionContext = elementContext(element, attr.group(3)); // handling finished only at end (after METHOD also collected) } } else if (attr.start(4) > -1) { // ON____ processScriptCode(curi, value); // TODO: context? } else if (attr.start(5) > -1) { // SRC etc. CharSequence context = elementContext(element, attr.group(5)); // true, if we expect another HTML page instead of an image etc. final Hop hop; if (!framesAsEmbeds && (elementStr.equalsIgnoreCase(FRAME) || elementStr.equalsIgnoreCase(IFRAME))) { hop = Hop.NAVLINK; } else { hop = Hop.EMBED; } processEmbed(curi, value, context, hop); } else if (attr.start(6) > -1) { // CODEBASE codebase = (value instanceof String) ? 
(String) value : value.toString(); CharSequence context = elementContext(element, attr.group(6)); processLink(curi, codebase, context); } else if (attr.start(7) > -1) { // CLASSID, DATA if (resources == null) { resources = new ArrayList<String>(); } resources.add(value.toString()); } else if (attr.start(8) > -1) { // ARCHIVE if (resources == null) { resources = new ArrayList<String>(); } String[] multi = TextUtils.split(WHITESPACE, value); for (int i = 0; i < multi.length; i++) { resources.add(multi[i]); } } else if (attr.start(9) > -1) { // CODE if (resources == null) { resources = new ArrayList<String>(); } // If element is applet and code value does not end with // '.class' then append '.class' to the code value. if (elementStr.equalsIgnoreCase(APPLET) && !value.toString().toLowerCase().endsWith(CLASSEXT)) { resources.add(value.toString() + CLASSEXT); } else { resources.add(value.toString()); } } else if (attr.start(10) > -1) { // VALUE, with possibility of URI // store value, context for handling at end valueVal = value; valueContext = elementContext(element, attr.group(10)); } else if (attr.start(11) > -1) { // STYLE inline attribute // then, parse for URIs numberOfLinksExtracted.addAndGet(ExtractorCSS.processStyleCode(this, curi, value)); } else if (attr.start(12) > -1) { // METHOD method = value; // form processing finished at end (after ACTION also collected) } else if (attr.start(13) > -1) { if ("NAME".equalsIgnoreCase(attrName.toString())) { // remember 'name' for end-analysis nameVal = value; } if ("FLASHVARS".equalsIgnoreCase(attrName.toString())) { // consider FLASHVARS attribute immediately valueContext = elementContext(element, attr.group(13)); considerQueryStringValues(curi, value, valueContext, Hop.SPECULATIVE); } // any other attribute // ignore for now // could probe for path- or script-looking strings, but // those should be vanishingly rare in other attributes, // and/or symptomatic of page bugs } } TextUtils.recycleMatcher(attr); // handle 
codebase/resources if (resources != null) { Iterator<String> iter = resources.iterator(); UURI codebaseURI = null; String res = null; try { if (codebase != null) { // TODO: Pass in the charset. codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase); } while (iter.hasNext()) { res = iter.next().toString(); res = (String) TextUtils.unescapeHtml(res); if (codebaseURI != null) { res = codebaseURI.resolve(res).toString(); } processEmbed(curi, res, element); // TODO: include attribute too } } catch (URIException e) { curi.getNonFatalFailures().add(e); } catch (IllegalArgumentException e) { DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" + "codebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(), e); } } // finish handling form action, now method is available if (action != null) { if (method == null || "GET".equalsIgnoreCase(method.toString()) || !getExtractOnlyFormGets()) { processLink(curi, action, actionContext); } } // finish handling VALUE if (valueVal != null) { if ("PARAM".equalsIgnoreCase(elementStr) && nameVal != null && "flashvars".equalsIgnoreCase(nameVal.toString())) { // special handling for <PARAM NAME='flashvars" VALUE=""> String queryStringLike = valueVal.toString(); // treat value as query-string-like "key=value[&key=value]*" pairings considerQueryStringValues(curi, queryStringLike, valueContext, Hop.SPECULATIVE); } else { // regular VALUE handling if (extractValueAttributes) { considerIfLikelyUri(curi, valueVal, valueContext, Hop.NAVLINK); } } } } /** * Consider a query-string-like collections of key=value[&key=value] * pairs for URI-like strings in the values. Where URI-like strings are * found, add as discovered outlink. 
* * @param curi origin CrawlURI * @param queryString query-string-like string * @param valueContext page context where found */ protected void considerQueryStringValues(CrawlURI curi, CharSequence queryString, CharSequence valueContext, Hop hop) { for (String pairString : queryString.toString().split("&")) { String[] encodedKeyVal = pairString.split("="); if (encodedKeyVal.length == 2) try { String value = URLDecoder.decode(encodedKeyVal[1], "UTF-8"); considerIfLikelyUri(curi, value, valueContext, hop); } catch (IllegalArgumentException e) { // still consider values rejected by URLDecoder considerIfLikelyUri(curi, encodedKeyVal[1], valueContext, hop); } catch (UnsupportedEncodingException e) { throw new AssertionError("all jvms must support UTF-8, and yet somehow this happened: " + e); } } } /** * Consider whether a given string is URI-like. If so, add as discovered * outlink. * * @param curi origin CrawlURI * @param queryString query-string-like string * @param valueContext page context where found */ protected void considerIfLikelyUri(CrawlURI curi, CharSequence candidate, CharSequence valueContext, Hop hop) { if (UriUtils.isVeryLikelyUri(candidate)) { addLinkFromString(curi, candidate, valueContext, hop); } } /** * Extract the (java)script source in the given CharSequence. * * @param curi source CrawlURI * @param cs CharSequence of javascript code */ protected void processScriptCode(CrawlURI curi, CharSequence cs) { if (getExtractorJS() != null && getExtractJavascript()) { numberOfLinksExtracted.addAndGet(getExtractorJS().considerStrings(this, curi, cs)); } } static final String JAVASCRIPT = "(?i)^javascript:.*"; /** * Handle generic HREF cases. 
* * @param curi * @param value * @param context */ protected void processLink(CrawlURI curi, final CharSequence value, CharSequence context) { if (TextUtils.matches(JAVASCRIPT, value)) { processScriptCode(curi, value.subSequence(11, value.length())); } else { if (logger.isLoggable(Level.FINEST)) { logger.finest("link: " + value.toString() + " from " + curi); } addLinkFromString(curi, value, context, Hop.NAVLINK); numberOfLinksExtracted.incrementAndGet(); } } protected void addLinkFromString(CrawlURI curi, CharSequence uri, CharSequence context, Hop hop) { try { // We do a 'toString' on context because its a sequence from // the underlying ReplayCharSequence and the link its about // to become a part of is expected to outlive the current // ReplayCharSequence. HTMLLinkContext hc = HTMLLinkContext.get(context.toString()); int max = getExtractorParameters().getMaxOutlinks(); addRelativeToBase(curi, max, uri.toString(), hc, hop); } catch (URIException e) { logUriError(e, curi.getUURI(), uri); } } protected final void processEmbed(CrawlURI curi, CharSequence value, CharSequence context) { processEmbed(curi, value, context, Hop.EMBED); } protected void processEmbed(CrawlURI curi, final CharSequence value, CharSequence context, Hop hop) { if (logger.isLoggable(Level.FINEST)) { logger.finest("embed (" + hop.getHopChar() + "): " + value.toString() + " from " + curi); } addLinkFromString(curi, (value instanceof String) ? 
(String) value : value.toString(), context, hop); numberOfLinksExtracted.incrementAndGet(); } protected boolean shouldExtract(CrawlURI uri) { if (getIgnoreUnexpectedHtml()) { try { // HTML was not expected (eg a GIF was expected) so ignore // (as if a soft 404) if (!isHtmlExpectedHere(uri)) { return false; } } catch (URIException e) { logger.severe("Failed expectedHTML test: " + e.getMessage()); // assume it's okay to extract } } String mime = uri.getContentType().toLowerCase(); if (mime.startsWith("text/html") || mime.startsWith("application/xhtml") || mime.startsWith("text/vnd.wap.wml") || mime.startsWith("application/vnd.wap.wml") || mime.startsWith("application/vnd.wap.xhtml")) { return true; } String contentPrefixLC = uri.getRecorder().getContentReplayPrefixString(1000).toLowerCase(); if (contentPrefixLC.contains("<html") || contentPrefixLC.contains("<!doctype html")) { return true; } return false; } public boolean innerExtract(CrawlURI curi) { if (!curi.containsContentTypeCharsetDeclaration()) { String contentPrefix = curi.getRecorder().getContentReplayPrefixString(1000); Charset contentDeclaredEncoding = getContentDeclaredCharset(curi, contentPrefix); if (!curi.getRecorder().getCharset().equals(contentDeclaredEncoding) && contentDeclaredEncoding != null) { String newContentPrefix = curi.getRecorder().getContentReplayPrefixString(1000, contentDeclaredEncoding); Charset reflexiveCharset = getContentDeclaredCharset(curi, newContentPrefix); if (contentDeclaredEncoding.equals(reflexiveCharset)) { // content-declared charset is self-consistent; use curi.getAnnotations().add("usingCharsetInHTML:" + contentDeclaredEncoding); curi.getRecorder().setCharset(contentDeclaredEncoding); } else { // error: declared charset not evident once put into effect curi.getAnnotations().add("inconsistentCharsetInHTML:" + contentDeclaredEncoding); // so, ignore in favor of original default } } } try { ReplayCharSequence cs = curi.getRecorder().getContentReplayCharSequence(); // 
Extract all links from the charsequence extract(curi, cs); if (cs.getDecodeExceptionCount() > 0) { curi.getNonFatalFailures().add(cs.getCodingException()); } // Set flag to indicate that link extraction is completed. return true; } catch (IOException e) { curi.getNonFatalFailures().add(e); logger.log(Level.WARNING, "Failed get of replay char sequence in " + Thread.currentThread().getName(), e); } return false; } // 1. look for <meta http-equiv="content-type"...> // 2. if not found then look for <meta charset=""> // 3. if not found then <?xml encoding=""...?> protected Charset getContentDeclaredCharset(CrawlURI curi, String contentPrefix) { String charsetName = null; // <meta http-equiv="content-type" content="text/html; charset=iso-8859-1"> Matcher matcher = TextUtils.getMatcher("(?is)<meta\\s+[^>]*http-equiv\\s*=\\s*['\"]content-type['\"][^>]*>", contentPrefix); if (matcher.find()) { String metaContentType = matcher.group(); TextUtils.recycleMatcher(matcher); matcher = TextUtils.getMatcher("charset=([^'\";\\s>]+)", metaContentType); if (matcher.find()) { charsetName = matcher.group(1); } TextUtils.recycleMatcher(matcher); } if (charsetName == null) { // <meta charset="utf-8"> matcher = TextUtils.getMatcher("(?si)<meta\\s+[^>]*charset=['\"]([^'\";\\s>]+)['\"]", contentPrefix); if (matcher.find()) { charsetName = matcher.group(1); TextUtils.recycleMatcher(matcher); } else { // <?xml version="1.0" encoding="utf-8"?> matcher = TextUtils.getMatcher("(?is)<\\?xml\\s+[^>]*encoding=['\"]([^'\"]+)['\"]", contentPrefix); if (matcher.find()) { charsetName = matcher.group(1); } else { return null; // none found } TextUtils.recycleMatcher(matcher); } } try { return Charset.forName(charsetName); } catch (IllegalArgumentException iae) { logger.log(Level.INFO, "Unknown content-encoding '" + charsetName + "' declared; using default"); curi.getAnnotations().add("unsatisfiableCharsetInHTML:" + charsetName); return null; } } /** * Run extractor. 
* This method is package visible to ease testing. * @param curi CrawlURI we're processing. * @param cs Sequence from underlying ReplayCharSequence. This * is TRANSIENT data. Make a copy if you want the data to live outside * of this extractors' lifetime. */ protected void extract(CrawlURI curi, CharSequence cs) { Matcher tags = TextUtils.getMatcher(relevantTagPattern, cs); while (tags.find()) { if (Thread.interrupted()) { break; } if (tags.start(8) > 0) { // comment match // for now do nothing } else if (tags.start(7) > 0) { // <meta> match int start = tags.start(5); int end = tags.end(5); assert start >= 0 : "Start is: " + start + ", " + curi; assert end >= 0 : "End is :" + end + ", " + curi; if (processMeta(curi, cs.subSequence(start, end))) { // meta tag included NOFOLLOW; abort processing break; } } else if (tags.start(5) > 0) { // generic <whatever> match int start5 = tags.start(5); int end5 = tags.end(5); assert start5 >= 0 : "Start is: " + start5 + ", " + curi; assert end5 >= 0 : "End is :" + end5 + ", " + curi; int start6 = tags.start(6); int end6 = tags.end(6); assert start6 >= 0 : "Start is: " + start6 + ", " + curi; assert end6 >= 0 : "End is :" + end6 + ", " + curi; String element = cs.subSequence(start6, end6).toString(); CharSequence attributes = cs.subSequence(start5, end5); processGeneralTag(curi, element, attributes); // remember FORM to help later extra processing if ("form".equalsIgnoreCase(element)) { curi.getDataList(A_FORM_OFFSETS).add((Integer) (start6 - 1)); } } else if (tags.start(1) > 0) { // <script> match int start = tags.start(1); int end = tags.end(1); assert start >= 0 : "Start is: " + start + ", " + curi; assert end >= 0 : "End is :" + end + ", " + curi; assert tags.end(2) >= 0 : "Tags.end(2) illegal " + tags.end(2) + ", " + curi; processScript(curi, cs.subSequence(start, end), tags.end(2) - start); } else if (tags.start(3) > 0) { // <style... 
match int start = tags.start(3); int end = tags.end(3); assert start >= 0 : "Start is: " + start + ", " + curi; assert end >= 0 : "End is :" + end + ", " + curi; assert tags.end(4) >= 0 : "Tags.end(4) illegal " + tags.end(4) + ", " + curi; processStyle(curi, cs.subSequence(start, end), tags.end(4) - start); } } TextUtils.recycleMatcher(tags); } static final String NON_HTML_PATH_EXTENSION = "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)" + "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)"; /** * Test whether this HTML is so unexpected (eg in place of a GIF URI) * that it shouldn't be scanned for links. * * @param curi CrawlURI to examine. * @return True if HTML is acceptable/expected here * @throws URIException */ protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException { String path = curi.getUURI().getPath(); if (path == null) { // no path extension, HTML is fine return true; } int dot = path.lastIndexOf('.'); if (dot < 0) { // no path extension, HTML is fine return true; } if (dot < (path.length() - 5)) { // extension too long to recognize, HTML is fine return true; } String ext = path.substring(dot + 1); return !TextUtils.matches(NON_HTML_PATH_EXTENSION, ext); } protected void processScript(CrawlURI curi, CharSequence sequence, int endOfOpenTag) { // first, get attributes of script-open tag // as per any other tag processGeneralTag(curi, sequence.subSequence(0, 6), sequence.subSequence(0, endOfOpenTag)); // then, apply best-effort string-analysis heuristics // against any code present (false positives are OK) processScriptCode(curi, sequence.subSequence(endOfOpenTag, sequence.length())); } /** * Process metadata tags. * @param curi CrawlURI we're processing. * @param cs Sequence from underlying ReplayCharSequence. This * is TRANSIENT data. Make a copy if you want the data to live outside * of this extractors' lifetime. * @return True robots exclusion metatag. 
*/ protected boolean processMeta(CrawlURI curi, CharSequence cs) { Matcher attr = TextUtils.getMatcher(eachAttributePattern, cs); String name = null; String httpEquiv = null; String content = null; while (attr.find()) { int valueGroup = (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16; CharSequence value = cs.subSequence(attr.start(valueGroup), attr.end(valueGroup)); value = TextUtils.unescapeHtml(value); if (attr.group(1).equalsIgnoreCase("name")) { name = value.toString(); } else if (attr.group(1).equalsIgnoreCase("http-equiv")) { httpEquiv = value.toString(); } else if (attr.group(1).equalsIgnoreCase("content")) { content = value.toString(); } // TODO: handle other stuff } TextUtils.recycleMatcher(attr); // Look for the 'robots' meta-tag if ("robots".equalsIgnoreCase(name) && content != null) { curi.getData().put(A_META_ROBOTS, content); RobotsPolicy policy = metadata.getRobotsPolicy(); String contentLower = content.toLowerCase(); if (policy.obeyMetaRobotsNofollow() && (contentLower.indexOf("nofollow") >= 0 || contentLower.indexOf("none") >= 0)) { // if 'nofollow' or 'none' is specified and the // honoring policy is not IGNORE or CUSTOM, end html extraction logger.fine("HTML extraction skipped due to robots meta-tag for: " + curi.toString()); return true; } } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) { int urlIndex = content.indexOf("=") + 1; if (urlIndex > 0) { String refreshUri = content.substring(urlIndex); try { int max = getExtractorParameters().getMaxOutlinks(); addRelativeToBase(curi, max, refreshUri, HTMLLinkContext.META, Hop.REFER); } catch (URIException e) { logUriError(e, curi.getUURI(), refreshUri); } } } else if (content != null) { //look for likely urls in 'content' attribute try { if (UriUtils.isVeryLikelyUri(content)) { int max = getExtractorParameters().getMaxOutlinks(); addRelativeToBase(curi, max, content, HTMLLinkContext.META, Hop.SPECULATIVE); } } catch (URIException e) { logUriError(e, curi.getUURI(), 
content); } } return false; } /** * Process style text. * @param curi CrawlURI we're processing. * @param sequence Sequence from underlying ReplayCharSequence. This * is TRANSIENT data. Make a copy if you want the data to live outside * of this extractors' lifetime. * @param endOfOpenTag */ protected void processStyle(CrawlURI curi, CharSequence sequence, int endOfOpenTag) { // First, get attributes of script-open tag as per any other tag. processGeneralTag(curi, sequence.subSequence(0, 6), sequence.subSequence(0, endOfOpenTag)); // then, parse for URIs numberOfLinksExtracted.addAndGet( ExtractorCSS.processStyleCode(this, curi, sequence.subSequence(endOfOpenTag, sequence.length()))); } /** * Create a suitable XPath-like context from an element name and optional * attribute name. * * @param element * @param attribute * @return CharSequence context */ public static CharSequence elementContext(CharSequence element, CharSequence attribute) { return attribute == null ? "" : element + "/@" + attribute; } }