// Java tutorial
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * SimpleHTMLExtractor.java
 * Created on Jun 5, 2003
 *
 * $Header$
 */
package com.cyberway.issue.extractor;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import com.cyberway.issue.crawler.extractor.Link;
import com.cyberway.issue.net.UURI;
import com.cyberway.issue.net.UURIFactory;
import com.cyberway.issue.util.DevUtils;
import com.cyberway.issue.util.TextUtils;

/**
 * Basic link-extraction, from an HTML content-body,
 * using regular expressions.
 *
 * <p>Tags are located with {@link #RELEVANT_TAG_EXTRACTOR}; the attributes of
 * each matched tag are then scanned with {@link #EACH_ATTRIBUTE_EXTRACTOR}.
 * Every discovered {@link Link} is appended to {@link #next}.
 *
 * ROUGH DRAFT IN PROGRESS / incomplete... untested...
 *
 * @author gojomo
 */
public class RegexpHTMLLinkExtractor extends CharSequenceLinkExtractor {
    private static final Logger logger =
        Logger.getLogger(RegexpHTMLLinkExtractor.class.getName());

    /** Whether a robots meta-tag with nofollow/none stops extraction. */
    boolean honorRobots = true;
    // The two flags below are not read anywhere in this class.
    boolean extractInlineCss = true;
    boolean extractInlineJs = true;

    /** Links found so far but not yet handed out. */
    protected LinkedList<Link> next = new LinkedList<Link>();

    /** Matcher over the source content; created lazily by findNextLink(). */
    protected Matcher tags;

    /**
     * Advances through the content until a tag yields at least one link.
     *
     * <p>Each capture group of {@link #RELEVANT_TAG_EXTRACTOR} sits after the
     * pattern's leading '&lt;', so a group that took part in the match has
     * a start offset of at least 1 and one that did not reports -1; the
     * {@code > 0} tests below rely on that.
     *
     * <p>If the current thread has been interrupted, the interrupt flag is
     * cleared and scanning stops.
     *
     * @return true if {@link #next} holds at least one link, false when no
     *         further relevant tags were found (or scanning was interrupted)
     * @see com.cyberway.issue.extractor.CharSequenceLinkExtractor#findNextLink()
     */
    protected boolean findNextLink() {
        if (tags == null) {
            tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, sourceContent);
        }
        while (tags.find()) {
            if (Thread.interrupted()) {
                // TODO: throw an exception, perhaps, rather than just
                // clear & break?
                break;
            }
            if (tags.start(8) > 0) {
                // comment match
                // for now do nothing
            } else if (tags.start(7) > 0) {
                // <meta> match
                processMeta(
                    sourceContent.subSequence(tags.start(5), tags.end(5)));
            } else if (tags.start(5) > 0) {
                // generic <whatever> match
                CharSequence element =
                    sourceContent.subSequence(tags.start(6), tags.end(6));
                CharSequence innards =
                    sourceContent.subSequence(tags.start(5), tags.end(5));
                processGeneralTag(element, innards);
            } else if (tags.start(1) > 0) {
                // <script> match
                int start = tags.start(1);
                processScript(
                    sourceContent.subSequence(start, tags.end(1)),
                    tags.end(2) - start);
            } else if (tags.start(3) > 0) {
                // <style... match
                int start = tags.start(3);
                processStyle(
                    sourceContent.subSequence(start, tags.end(3)),
                    tags.end(4) - start);
            }
            if (!next.isEmpty()) {
                // at least one link found
                return true;
            }
        }
        // no relevant tags found
        return false;
    }

    /**
     * Relevant tag extractor pattern.
     *
     * <p>
     * This pattern extracts either:
     * <li> (1) whole {@code <script>...</script>} or
     * <li> (2) {@code <style>...</style>} or
     * <li> (3) {@code <meta ...>} or
     * <li> (4) any other open-tag with at least one attribute
     * (eg matches "{@code <a href='boo'>}" but not "{@code </a>}" or
     * "{@code <br>}")
     * <p>
     * groups:
     * <li> 1: SCRIPT SRC=foo>boo</SCRIPT
     * <li> 2: just script open tag
     * <li> 3: STYLE TYPE=moo>zoo</STYLE
     * <li> 4: just style open tag
     * <li> 5: entire other tag, without '<' '>'
     * <li> 6: element
     * <li> 7: META
     * <li> 8: !-- comment --
     */
    static final String RELEVANT_TAG_EXTRACTOR =
        "(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>[^<]*+</style)|(((meta)|(?:\\w+))\\s+[^>]*+)|(!--.*?--))>";

    // this pattern extracts attributes from any open-tag innards
    // matched by the above. attributes known to be URIs of various
    // sorts are matched specially
    static final String EACH_ATTRIBUTE_EXTRACTOR =
        "(?is)\\s((href)|(action)|(on\\w*)"
        + "|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)"
        + "|(?:usemap)|(?:profile)|(?:datasrc)|(?:for))"
        + "|(codebase)|((?:classid)|(?:data))|(archive)|(code)"
        + "|(value)|([-\\w]+))"
        + "\\s*=\\s*"
        + "(?:(?:\"(.*?)(?:\"|$))"
        + "|(?:'(.*?)(?:'|$))"
        + "|(\\S+))";
    // groups:
    // 1: attribute name
    // 2: HREF - single URI relative to doc base, or occasionally javascript:
    // 3: ACTION - single URI relative to doc base, or occasionally javascript:
    // 4: ON[WHATEVER] - script handler
    // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE,DATASRC, or FOR
    //    single URI relative to doc base
    // 6: CODEBASE - a single URI relative to doc base, affecting other
    //    attributes
    // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)
    // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE
    //    (if supplied)
    // 9: CODE - a single URI relative to the CODEBASE (if specified).
    // 10: VALUE - often includes a uri path on forms
    // 11: any other attribute
    // 12: double-quote delimited attr value
    // 13: single-quote delimited attr value
    // 14: space-delimited attr value

    // much like the javascript likely-URI extractor, but
    // without requiring quotes -- this can indicate whether
    // an HTML tag attribute that isn't definitionally a
    // URI might be one anyway, as in form-tag VALUE attributes
    static final String LIKELY_URI_PATH =
        "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";

    // The HTML entity for an ampersand. It is assembled from two pieces so
    // that tools which HTML-unescape source text cannot collapse it into a
    // bare ampersand, which would turn every de-escaping call into a no-op.
    static final String ESCAPED_AMP = "&" + "amp;";
    static final String AMP = "&";
    static final String WHITESPACE = "\\s";
    static final String CLASSEXT = ".class";
    static final String APPLET = "applet";
    static final String BASE = "base";
    static final String LINK = "link";

    // Element keywords that open the script and style matches; their lengths
    // delimit the element name at the front of those matches.
    private static final String SCRIPT = "script";
    private static final String STYLE = "style";

    // Length of the "javascript:" scheme prefix stripped by processLink().
    private static final int JAVASCRIPT_PREFIX_LENGTH = "javascript:".length();

    /**
     * Scans the attributes of one open tag and queues every link found.
     *
     * <p>CODEBASE, CLASSID, DATA, ARCHIVE and CODE values are collected
     * first and resolved only after all attributes have been read, because
     * CODEBASE affects how the others are resolved.
     *
     * @param element the element name (eg "a", "link", "applet")
     * @param cs the tag innards to scan for attributes
     * @return true if this call added at least one link to {@link #next}
     */
    protected boolean processGeneralTag(CharSequence element, CharSequence cs) {
        Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
        String elementName = element.toString();

        // Just in case it's an OBJECT or APPLET tag
        String codebase = null;
        ArrayList<String> resources = null;

        // Size before processing, so added links can be detected afterwards.
        final int tally = next.size();

        while (attr.find()) {
            int valueGroup =
                (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14;
            CharSequence value =
                cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));

            if (attr.start(2) > -1) {
                // HREF
                CharSequence context =
                    Link.elementContext(element, attr.group(2));
                if (elementName.equalsIgnoreCase(LINK)) {
                    // <LINK> elements treated as embeds (css, ico, etc)
                    processEmbed(value, context);
                } else {
                    if (elementName.equalsIgnoreCase(BASE)) {
                        try {
                            base = UURIFactory.getInstance(value.toString());
                        } catch (URIException e) {
                            extractErrorListener.noteExtractError(
                                e, source, value);
                        }
                    }
                    // other HREFs treated as links
                    processLink(value, context);
                }
            } else if (attr.start(3) > -1) {
                // ACTION
                CharSequence context =
                    Link.elementContext(element, attr.group(3));
                processLink(value, context);
            } else if (attr.start(4) > -1) {
                // ON____
                processScriptCode(value); // TODO: context?
            } else if (attr.start(5) > -1) {
                // SRC etc.
                CharSequence context =
                    Link.elementContext(element, attr.group(5));
                processEmbed(value, context);
            } else if (attr.start(6) > -1) {
                // CODEBASE
                // TODO: more HTML deescaping?
                codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
                CharSequence context =
                    Link.elementContext(element, attr.group(6));
                processEmbed(codebase, context);
            } else if (attr.start(7) > -1) {
                // CLASSID, DATA
                if (resources == null) {
                    resources = new ArrayList<String>();
                }
                resources.add(value.toString());
            } else if (attr.start(8) > -1) {
                // ARCHIVE
                if (resources == null) {
                    resources = new ArrayList<String>();
                }
                String[] multi = TextUtils.split(WHITESPACE, value);
                for (int i = 0; i < multi.length; i++) {
                    resources.add(multi[i]);
                }
            } else if (attr.start(9) > -1) {
                // CODE
                if (resources == null) {
                    resources = new ArrayList<String>();
                }
                // If element is applet and code value does not end with
                // '.class' then append '.class' to the code value.
                String code = value.toString();
                if (elementName.equalsIgnoreCase(APPLET)
                        && !code.toLowerCase().endsWith(CLASSEXT)) {
                    resources.add(code + CLASSEXT);
                } else {
                    resources.add(code);
                }
            } else if (attr.start(10) > -1) {
                // VALUE
                if (TextUtils.matches(LIKELY_URI_PATH, value)) {
                    CharSequence context =
                        Link.elementContext(element, attr.group(10));
                    processLink(value, context);
                }
            } else if (attr.start(11) > -1) {
                // any other attribute
                // ignore for now
                // could probe for path- or script-looking strings, but
                // those should be vanishingly rare in other attributes,
                // and/or symptomatic of page bugs
            }
        }
        TextUtils.recycleMatcher(attr);

        // handle codebase/resources
        if (resources == null) {
            return next.size() > tally;
        }
        UURI codebaseURI = null;
        // Kept outside the loop so the failing resource can be logged.
        String res = null;
        try {
            if (codebase != null) {
                // TODO: Pass in the charset.
                codebaseURI = UURIFactory.getInstance(base, codebase);
            }
            for (String resource : resources) {
                // TODO: more HTML deescaping?
                res = TextUtils.replaceAll(ESCAPED_AMP, resource, AMP);
                if (codebaseURI != null) {
                    res = codebaseURI.resolve(res).toString();
                }
                processEmbed(res, element); // TODO: include attribute too
            }
        } catch (URIException e) {
            extractErrorListener.noteExtractError(e, source, codebase);
        } catch (IllegalArgumentException e) {
            DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n"
                + "codebase=" + codebase + " res=" + res + "\n"
                + DevUtils.extraInfo(), e);
        }
        return next.size() > tally;
    }

    /**
     * Hands script code to the JavaScript link extractor, which adds any
     * links it finds to {@link #next}.
     *
     * @param cs the script code to scan
     */
    protected void processScriptCode(CharSequence cs) {
        RegexpJSLinkExtractor.extract(cs, source, base, next,
            extractErrorListener);
    }

    static final String JAVASCRIPT = "(?i)^javascript:.*";

    /**
     * Handles a navigational attribute value: a "javascript:" value is
     * scanned as script code, anything else is queued as a navigation link.
     *
     * @param value the raw attribute value
     * @param context the element/attribute context recorded on the link
     */
    protected void processLink(CharSequence value, CharSequence context) {
        String link = TextUtils.replaceAll(ESCAPED_AMP, value, "&");
        if (TextUtils.matches(JAVASCRIPT, link)) {
            // Strip the scheme prefix; only the code after it is scanned.
            processScriptCode(
                value.subSequence(JAVASCRIPT_PREFIX_LENGTH, value.length()));
        } else {
            addLinkFromString(link, context, Link.NAVLINK_HOP);
        }
    }

    /**
     * Resolves a URI string against the current base and queues the
     * resulting link; a URI that cannot be built is reported to the error
     * listener instead.
     *
     * @param uri the URI string to resolve
     * @param context the element/attribute context recorded on the link
     * @param hopType the hop type recorded on the link
     */
    private void addLinkFromString(String uri, CharSequence context,
            char hopType) {
        try {
            Link link = new Link(source, UURIFactory.getInstance(base, uri),
                context, hopType);
            next.addLast(link);
        } catch (URIException e) {
            extractErrorListener.noteExtractError(e, source, uri);
        }
    }

    /**
     * Queues an embedded-resource link for the given attribute value.
     *
     * @param value the raw attribute value
     * @param context the element/attribute context recorded on the link
     * @return always 1
     */
    protected long processEmbed(CharSequence value, CharSequence context) {
        String embed = TextUtils.replaceAll(ESCAPED_AMP, value, "&");
        addLinkFromString(embed, context, Link.EMBED_HOP);
        return 1;
    }

    // Not referenced within this class.
    static final String NON_HTML_PATH_EXTENSION =
        "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"
        + "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";

    /**
     * Handles a whole script match: first the open tag's attributes, then
     * the code that follows it.
     *
     * @param sequence the match, starting at the "script" keyword
     * @param endOfOpenTag offset within sequence where the open tag ends
     */
    protected void processScript(CharSequence sequence, int endOfOpenTag) {
        // first, get attributes of script-open tag
        // as per any other tag
        processGeneralTag(sequence.subSequence(0, SCRIPT.length()),
            sequence.subSequence(0, endOfOpenTag));

        // then, apply best-effort string-analysis heuristics
        // against any code present (false positives are OK)
        processScriptCode(
            sequence.subSequence(endOfOpenTag, sequence.length()));
    }

    /**
     * Handles a meta tag: honors a robots nofollow/none directive and
     * queues the target of an http-equiv refresh.
     *
     * @param cs the tag innards to scan for attributes
     */
    protected void processMeta(CharSequence cs) {
        Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);

        String name = null;
        String httpEquiv = null;
        String content = null;

        while (attr.find()) {
            int valueGroup =
                (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14;
            CharSequence value =
                cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
            String attrName = attr.group(1);
            if (attrName.equalsIgnoreCase("name")) {
                name = value.toString();
            } else if (attrName.equalsIgnoreCase("http-equiv")) {
                httpEquiv = value.toString();
            } else if (attrName.equalsIgnoreCase("content")) {
                content = value.toString();
            }
            // TODO: handle other stuff
        }
        TextUtils.recycleMatcher(attr);

        // Look for the 'robots' meta-tag
        if ("robots".equalsIgnoreCase(name) && content != null) {
            if (getHonorRobots()) {
                String contentLower = content.toLowerCase();
                if (contentLower.indexOf("nofollow") >= 0
                        || contentLower.indexOf("none") >= 0) {
                    // if 'nofollow' or 'none' is specified and we
                    // are honoring robots, end html extraction
                    logger.fine("HTML extraction skipped due to robots meta-tag for: "
                        + source);
                    cancelFurtherExtraction();
                    return;
                }
            }
        } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
            // The target follows the first '='. Content without one (eg a
            // bare delay such as "5") names no target, so no link is made.
            int uriStart = content.indexOf("=") + 1;
            if (uriStart > 0) {
                String refreshUri = content.substring(uriStart);
                try {
                    Link refreshLink = new Link(source,
                        UURIFactory.getInstance(base, refreshUri),
                        Link.elementContext("meta", httpEquiv),
                        Link.REFER_HOP);
                    next.addLast(refreshLink);
                } catch (URIException e) {
                    extractErrorListener.noteExtractError(
                        e, source, refreshUri);
                }
            }
        }
    }

    /**
     * @return whether to honor internal robots directives (eg meta robots)
     */
    private boolean getHonorRobots() {
        return honorRobots;
    }

    /**
     * Ensure no further Links are extracted (by setting matcher up to fail)
     */
    private void cancelFurtherExtraction() {
        // java 1.5 only:
        // tags.region(tags.regionEnd(),tags.regionEnd());
        if (tags != null) {
            tags.reset("");
        }
    }

    /**
     * Handles a whole style match: first the open tag's attributes, then
     * the style content that follows it.
     *
     * @param sequence the match, starting at the "style" keyword
     * @param endOfOpenTag offset within sequence where the open tag ends
     */
    protected void processStyle(CharSequence sequence, int endOfOpenTag) {
        // First, get attributes of style-open tag as per any other tag.
        // The element name is exactly the keyword, so the character after
        // it must not be included.
        processGeneralTag(sequence.subSequence(0, STYLE.length()),
            sequence.subSequence(0, endOfOpenTag));

        // then, parse for URIs
        RegexpCSSLinkExtractor.extract(
            sequence.subSequence(endOfOpenTag, sequence.length()),
            source, base, next, extractErrorListener);
    }

    /**
     * Discard all state. Another setup() is required to use again.
     */
    public void reset() {
        super.reset();
        // The matcher exists only once findNextLink() has run.
        if (tags != null) {
            TextUtils.recycleMatcher(tags);
        }
        tags = null;
    }

    /**
     * @return a new extractor of this class
     */
    protected static CharSequenceLinkExtractor newDefaultInstance() {
        return new RegexpHTMLLinkExtractor();
    }
}