Java tutorial
package fr.eolya.utils.http;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.MasonTagTypes;
import net.htmlparser.jericho.MicrosoftConditionalCommentTagTypes;
import net.htmlparser.jericho.PHPTagTypes;
import net.htmlparser.jericho.Source;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

import com.sun.syndication.feed.synd.SyndEntryImpl;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;

/**
 * Static helpers for normalizing, comparing, encoding and resolving URLs, and for
 * extracting links, meta information and language/charset declarations from HTML
 * or feed content.
 */
public class HttpUtils {

    /** Matches {@code location.href='...'} assignments in inline scripts. */
    private static final Pattern LOCATION_HREF = Pattern.compile("location[.]href=['\"](.*)['\"]");

    /** Matches {@code location.replace('...')} calls in inline scripts. */
    private static final Pattern LOCATION_REPLACE = Pattern.compile("location[.]replace\\(['\"](.*)['\"]\\)");

    /** Matches {@code window.location='...'} assignments in inline scripts. */
    private static final Pattern WINDOW_LOCATION = Pattern.compile("window[.]location=['\"](.*)['\"]");

    /**
     * Applies a subset of the normalizations described at
     * http://en.wikipedia.org/wiki/URL_normalization. The trailing "/" is deliberately
     * kept and query parameters are not reordered.
     * <p>
     * Normalization is best-effort: if a step fails (for example the URL is not
     * absolute), the value obtained so far is returned.
     *
     * @param url          the url to normalize (must not be null)
     * @param preferedHost optional preferred host; when the url host differs from it only
     *                     by a leading "www.", the host is rewritten to this value
     * @return the normalized url
     */
    public static String urlNormalize(String url, String preferedHost) {
        String normalized = url.trim();
        try {
            // Remove a final "?" : http://www.example.com/display? -> http://www.example.com/display
            if (normalized.endsWith("?")) {
                normalized = normalized.substring(0, normalized.length() - 1);
            }

            // Fix "?&": drop the "&", or drop both characters when they end the url
            int index = normalized.indexOf("?&");
            if (index != -1) {
                if (index != normalized.length() - 2) {
                    normalized = normalized.substring(0, index + 1) + normalized.substring(index + 2);
                } else {
                    normalized = normalized.substring(0, normalized.length() - 2);
                }
            }

            // Decode HTML entities (e.g. "&amp;" -> "&")
            normalized = StringEscapeUtils.unescapeHtml4(normalized);

            normalized = normalized.replace(" ", "%20");
            normalized = normalized.replace("'", "%27");
            normalized = normalized.replace("%5f", "_");
            normalized = normalized.replace("%5F", "_");

            // Remove dot-segments.
            // http://www.example.com/../a/b/../c/./d.html => http://www.example.com/a/c/d.html
            URI uri = new URI(normalized);
            uri = uri.normalize();
            normalized = uri.toURL().toExternalForm();

            // Remove the dot-segments left at the beginning of the path
            // http://www.example.com/../a/d.html => http://www.example.com/a/d.html
            String path = new URL(normalized).getFile();
            StringBuilder leadingDots = new StringBuilder();
            while (path.startsWith("/../")) {
                path = path.substring(3);
                leadingDots.append("/..");
            }
            if (leadingDots.length() > 0) {
                index = normalized.indexOf(leadingDots.toString());
                normalized = normalized.substring(0, index) + normalized.substring(index + leadingDots.length());
            }

            String host = uri.getHost();

            // Remove the default port. The parsed port is compared (not a substring of the
            // url) so that e.g. ":8080" is never mistaken for ":80".
            int port = uri.getPort();
            String scheme = uri.getScheme();
            boolean defaultPort = ("http".equalsIgnoreCase(scheme) && port == 80)
                    || ("https".equalsIgnoreCase(scheme) && port == 443);
            if (host != null && defaultPort) {
                String authority = "//" + host + ":" + port;
                int at = normalized.indexOf(authority);
                if (at != -1) {
                    normalized = normalized.substring(0, at) + "//" + host
                            + normalized.substring(at + authority.length());
                }
            }

            // Translate to the preferred host (www.site.com vs site.com)
            if (host != null && preferedHost != null && !"".equals(preferedHost)) {
                if (host.equals("www." + preferedHost) || ("www." + host).equals(preferedHost)) {
                    normalized = normalized.replace("//" + host, "//" + preferedHost);
                }
            }

            // Remove the fragment.
            // http://www.example.com/bar.html#section1 => http://www.example.com/bar.html
            int fragment = normalized.indexOf("#");
            if (fragment != -1) {
                normalized = normalized.substring(0, fragment);
            }
            return normalized;
        } catch (Exception ignored) {
            // Best effort: keep whatever could be normalized before the failure.
        }
        return normalized;
    }

    /**
     * Removes query parameters (and ";jsessionid" path parameters) from a url.
     * Parameter names are matched literally and case-insensitively.
     *
     * @param url            the url to clean
     * @param paramsToRemove parameter names separated by "," or ";" (spaces are ignored),
     *                       or "*" to remove the whole query string
     * @return the cleaned url; {@code url} unchanged when there is nothing to remove;
     *         {@code null} when {@code url} is malformed; "" on unexpected failure
     */
    public static String urlRemoveParameters(String url, String paramsToRemove) {
        if (paramsToRemove == null || "".equals(paramsToRemove)) {
            return url;
        }

        try {
            URL u = new URL(url);
            if (u.getQuery() == null && u.getPath().indexOf(";jsessionid=") == -1) {
                return url;
            }
        } catch (MalformedURLException e1) {
            e1.printStackTrace();
            return null;
        }

        try {
            url = url.replace("?&", "?");

            if ("*".equals(paramsToRemove)) {
                // "*" is a keyword, never a parameter name: without a query there is nothing to strip.
                int offset = url.lastIndexOf("?");
                return offset != -1 ? url.substring(0, offset) : url;
            }

            String[] names = paramsToRemove.replaceAll(" ", "").replaceAll(";", ",").split(",");
            final int flags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

            String result = url;
            for (int i = 0; i < names.length; i++) {
                // Quote the name so that regex metacharacters in it are matched literally.
                String quoted = Pattern.quote(names[i]);
                Pattern followedByValue = Pattern.compile("[?&;]" + quoted + "[=&]", flags);
                Pattern atEnd = Pattern.compile("[?&;]" + quoted + "$", flags);
                boolean sessionId = "jsessionid".equalsIgnoreCase(names[i]);

                // Repeat until no occurrence is left (a parameter may appear several times).
                boolean found = true;
                while (found) {
                    found = false;
                    Matcher m = followedByValue.matcher(result);
                    if (m.find()) {
                        found = true;
                        result = cutParameter(result, m.start(), sessionId);
                    }
                    m = atEnd.matcher(result);
                    if (m.find()) {
                        found = true;
                        result = cutParameter(result, m.start(), sessionId);
                    }
                }
            }
            return result;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Cuts out of {@code url} the parameter whose leading separator is at {@code start}.
     * The parameter extends up to the next "&" (for a jsessionid: the next "?", else the
     * next "&"), or to the end of the url. A leading ";" separator is replaced by "?".
     */
    private static String cutParameter(String url, int start, boolean sessionId) {
        int stop;
        if (sessionId) {
            stop = url.indexOf("?", start + 1);
            if (stop == -1) {
                stop = url.indexOf("&", start + 1);
            }
        } else {
            stop = url.indexOf("&", start + 1);
        }
        if (stop == -1) {
            return url.substring(0, start);
        }
        String separator = url.substring(start, start + 1);
        if (";".equals(separator)) {
            separator = "?";
        }
        return url.substring(0, start) + separator + url.substring(stop + 1);
    }

    /**
     * Tells whether {@code urlHref} is on the same host as {@code urlReferer} or on one
     * of the host aliases.
     * <p>
     * An alias without "*" is compared as a host (ignoring a leading "www."). An alias
     * starting with "*" matches when the href (or its host) ends with the rest of the
     * alias; an alias ending with "*" matches when the href (or its host) starts with
     * the rest of the alias. The alias list is not modified.
     *
     * @param urlReferer  the reference url, may be null
     * @param urlHref     the url to test
     * @param hostAliases optional list of aliases, may be null
     * @return true if the href belongs to the referer host or to an alias
     */
    public static boolean urlBelongSameHost(String urlReferer, String urlHref, List<String> hostAliases) {
        if (urlReferer != null && urlBelongSameHost(urlReferer, urlHref)) {
            return true;
        }
        if (hostAliases == null) {
            return false;
        }

        String hrefHost = getUrlHost(urlHref).toLowerCase(Locale.ROOT);
        for (String rawAlias : hostAliases) {
            String alias = rawAlias.trim();
            int star = alias.indexOf("*");
            if (star == -1) {
                if (urlBelongSameHost(alias, urlHref)) {
                    return true;
                }
                continue;
            }

            String literal = alias.replace("*", "");
            String literalLower = literal.toLowerCase(Locale.ROOT);
            if (star == 0 && (urlHref.endsWith(literal) || hrefHost.endsWith(literalLower))) {
                return true;
            }
            if (star == alias.length() - 1 && (urlHref.startsWith(literal) || hrefHost.startsWith(literalLower))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether two urls are on the same host, ignoring case and a leading "www.".
     *
     * @param urlReferer the reference url (the scheme may be omitted)
     * @param urlHref    the url to test (the scheme may be omitted)
     * @return true if both hosts are considered identical
     */
    public static boolean urlBelongSameHost(String urlReferer, String urlHref) {
        return areSameHosts(getUrlHost(urlReferer), getUrlHost(urlHref));
    }

    /** Compares two host names ignoring case, surrounding blanks and a leading "www.". */
    private static boolean areSameHosts(String hostR, String hostH) {
        String host1 = hostR.toLowerCase(Locale.ROOT).trim();
        String host2 = hostH.toLowerCase(Locale.ROOT).trim();
        boolean www1 = host1.startsWith("www.");
        boolean www2 = host2.startsWith("www.");
        if (www1 && !www2) {
            host2 = "www." + host2;
        } else if (!www1 && www2) {
            host1 = "www." + host1;
        }
        return host1.equals(host2);
    }

    /**
     * Returns the host of a url ("http://" is assumed when no http/https scheme is
     * present), or "" when the url cannot be parsed.
     */
    private static String getUrlHost(String url) {
        try {
            if (!url.startsWith("http://") && !url.startsWith("https://")) {
                url = "http://" + url;
            }
            return new URL(url).getHost();
        } catch (Exception e) {
            return "";
        }
    }

    /**
     * Encodes the path and query of a url: the file part is first url-decoded (UTF-8),
     * then percent-encoded again, so already-encoded urls are not encoded twice.
     * A non-default port is preserved; the fragment and user-info are not.
     *
     * @param url url to be encoded
     * @return the encoded url in ASCII form; {@code url} unchanged when it has no path;
     *         "" when the url cannot be parsed or encoded
     */
    public static String urlEncode(String url) {
        try {
            URL u = new URL(url);
            String host = u.getHost();

            int indexFile = url.indexOf("/", url.indexOf(host));
            if (indexFile == -1) {
                return url;
            }

            String urlFile = URLDecoder.decode(u.getFile(), "UTF-8");
            String protocol = u.getProtocol();

            // Keep only non-default ports for http/https.
            int port = u.getPort();
            int explicitPort = -1;
            if (port != -1) {
                if ("http".equals(protocol) && port != 80) {
                    explicitPort = port;
                }
                if ("https".equals(protocol) && port != 443) {
                    explicitPort = port;
                }
            }

            // The port is passed separately: a host containing ':' would be handled as an
            // IPv6 literal by the URI constructors and wrapped in brackets.
            URI uri = new URI(protocol, null, host, explicitPort, urlFile, null, null);

            // The query was passed as part of the path, so its "?" has been escaped.
            return uri.toASCIIString().replaceAll("%3F", "?");
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Extracts meta information from an html page.
     * <ul>
     * <li>{@code <meta name="N" [scheme="S"] content="C">} gives key {@code meta_N[_S]}</li>
     * <li>{@code <meta http-equiv="E" content="C">} gives key {@code meta_equiv_E}</li>
     * <li>{@code <link rel="canonical" href="H">} gives key {@code meta_link_canonical}</li>
     * </ul>
     * Keys are lower-cased and "-" is replaced by "_". Metas with no or empty content
     * are skipped.
     *
     * @param rawPage the html source
     * @return the extracted values, never null
     * @throws IOException if the page cannot be parsed
     */
    public static HashMap<String, String> extractMetas(String rawPage) throws IOException {
        final HashMap<String, String> metas = new HashMap<String, String>();

        TagNode root = new HtmlCleaner().clean(rawPage);

        // <meta name="..." content="..." /> and <meta http-equiv="..." content="..." />
        TagNode[] nodes = root.getElementsByName("meta", true);
        for (int i = 0; i < nodes.length; i++) {
            String content = nodes[i].getAttributeByName("content");
            if (content == null || "".equals(content)) {
                continue;
            }

            String name = nodes[i].getAttributeByName("name");
            if (name != null) {
                String scheme = nodes[i].getAttributeByName("scheme");
                if (scheme != null) {
                    name += "_" + scheme;
                }
                metas.put("meta_" + name.toLowerCase(Locale.ROOT).replace('-', '_'), content);
            }

            String equiv = nodes[i].getAttributeByName("http-equiv");
            if (equiv != null) {
                metas.put("meta_equiv_" + equiv.toLowerCase(Locale.ROOT).replace('-', '_'), content);
            }
        }

        // <link rel="canonical" href="..." />
        nodes = root.getElementsByName("link", true);
        for (int i = 0; i < nodes.length; i++) {
            String href = nodes[i].getAttributeByName("href");
            String rel = nodes[i].getAttributeByName("rel");
            if (href != null && "canonical".equals(rel)) {
                metas.put("meta_link_canonical", href);
            }
        }
        return metas;
    }

    /**
     * Tells whether a link is relative to the directory of the current document, i.e.
     * it is not empty and starts with none of "http://", "https://", "?" or "/".
     *
     * @param urlHref the link to test (must not be null)
     * @return true if the link is a document-relative path
     */
    public static boolean isRelativeURL(String urlHref) {
        if (urlHref.equals("")) {
            return false;
        }
        boolean absolute = urlHref.startsWith("http://") || urlHref.startsWith("https://");
        boolean queryOnly = urlHref.startsWith("?");
        boolean rooted = urlHref.startsWith("/");
        return !(absolute || queryOnly || rooted);
    }

    /**
     * Resolves a link found in a page against the url of that page.
     *
     * @param urlReferer the absolute url of the page containing the link
     * @param urlHref    the link: absolute, protocol-relative ("//host/.."), query only
     *                   ("?.."), host-relative ("/..") or document-relative
     * @return the absolute url, or "" when the link is empty or cannot be resolved
     */
    public static String urlGetAbsoluteURL(String urlReferer, String urlHref) {
        try {
            if (urlHref.equals("")) {
                return "";
            }

            // Case 1 : already absolute
            if (urlHref.startsWith("http://") || urlHref.startsWith("https://")) {
                return urlHref;
            }

            URL referer = new URL(urlReferer);

            // Case 1.1 : protocol-relative "//host/path"
            if (urlHref.startsWith("//")) {
                return referer.getProtocol() + ":" + urlHref;
            }

            String origin = referer.getProtocol() + "://" + referer.getHost();
            if (referer.getPort() != -1) {
                origin = origin + ":" + referer.getPort();
            }

            // A non-empty URL path already starts with "/"
            String refererPath = referer.getPath();
            if ("".equals(refererPath)) {
                refererPath = "/";
            }

            // Case 2 : "?..." replaces the query of the referer
            if (urlHref.startsWith("?")) {
                return origin + refererPath + urlHref;
            }

            // Case 3 : "/path/file.html..."
            if (urlHref.startsWith("/")) {
                return origin + urlHref;
            }

            // Case 4 : "path/file.html..." is relative to the directory of the referer
            String directory = refererPath.substring(0, refererPath.lastIndexOf("/"));
            return origin + directory + "/" + urlHref;
        } catch (Exception ignored) {
            // Unresolvable link (null or malformed referer, ...): reported as "".
        }
        return "";
    }

    /**
     * Reduces a url path to its directory, with a trailing "/". A path containing a "."
     * after its first character is considered to name a file and is cut after its last "/".
     */
    private static String fixUpUrl(String url) {
        if ("".equals(url)) {
            return "/";
        }
        if (url.indexOf(".") > 0) {
            return url.substring(0, url.lastIndexOf("/") + 1);
        }
        return url.endsWith("/") ? url : url + "/";
    }

    /**
     * Tells whether the directory of {@code urlChild} is the directory of
     * {@code urlFather} or one of its sub-directories. Only paths are compared
     * (case-insensitively); hosts are ignored.
     *
     * @param urlChild  the candidate child url
     * @param urlFather the candidate parent url
     * @return true if the child path is located under the father path
     */
    public static boolean isChildOf(URL urlChild, URL urlFather) {
        String childPath = fixUpUrl(urlChild.getPath().toLowerCase(Locale.ROOT));
        String fatherPath = fixUpUrl(urlFather.getPath().toLowerCase(Locale.ROOT));
        return childPath.startsWith(fatherPath);
    }

    /**
     * Returns the language most often declared by an html page. The page is scanned
     * line by line for:
     * <ul>
     * <li>{@code <html xml:lang="..">} and {@code <html lang="..">}</li>
     * <li>{@code <meta http-equiv="content-language" content="..">}</li>
     * <li>{@code <meta name="language" content="..">} and
     *     {@code <meta name="content-language" content="..">}</li>
     * </ul>
     *
     * @param rawData the html source, may be null
     * @return the two-letter lower-case language code, or "" if none is declared
     */
    public static String getHtmlDeclaredLanguage(String rawData) {
        if (rawData == null || "".equals(rawData)) {
            return "";
        }

        Hashtable<String, Integer> langFreq = new Hashtable<String, Integer>();
        BufferedReader in = new BufferedReader(new StringReader(rawData));
        String line;
        try {
            while ((line = in.readLine()) != null) {
                line = line.toLowerCase(Locale.ROOT);
                boolean html = line.indexOf("<html") >= 0;
                boolean meta = line.indexOf("<meta") >= 0;
                boolean named = line.indexOf(" name") >= 0;
                boolean hasContent = line.indexOf(" content") >= 0;
                boolean contentLanguage = line.indexOf("content-language") >= 0;

                // <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="fr-fr">
                if (html && line.indexOf(" xml:lang") >= 0) {
                    countLanguage(langFreq, parseAttributeValue(line, "xml:lang="));
                }
                // <html lang="fr">
                if (html && line.indexOf(" lang") >= 0) {
                    countLanguage(langFreq, parseAttributeValue(line, "lang="));
                }
                // <meta http-equiv="content-language" content="fr-fr" />
                if (meta && line.indexOf(" http-equiv") >= 0 && contentLanguage) {
                    countLanguage(langFreq, parseAttributeValue(line, "content="));
                }
                // <meta name="language" content="fr-fr" />
                if (meta && named && line.indexOf("language") >= 0 && hasContent) {
                    countLanguage(langFreq, parseAttributeValue(line, "content="));
                }
                // <meta name="content-language" content="fr-fr" />
                if (meta && named && contentLanguage && hasContent) {
                    countLanguage(langFreq, parseAttributeValue(line, "content="));
                }
            }

            // Keep the most frequent candidate
            int max = 0;
            String best = "";
            for (Map.Entry<String, Integer> candidate : langFreq.entrySet()) {
                if (candidate.getValue() > max) {
                    max = candidate.getValue();
                    best = candidate.getKey();
                }
            }
            return best;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /** Counts one more declaration of the two-letter language prefix of {@code lang}, if it has one. */
    private static void countLanguage(Hashtable<String, Integer> langFreq, String lang) {
        if (lang == null || lang.length() < 2) {
            return;
        }
        String code = lang.substring(0, 2);
        Integer count = langFreq.get(code);
        langFreq.put(code, count == null ? 1 : count + 1);
    }

    /**
     * Parse the character encoding from the specified content type header.
     * If the content type is null, or there is no explicit character encoding,
     * <code>null</code> is returned.
     * <br />
     * This method was copied from org.apache.catalina.util.RequestUtil,
     * which is licensed under the Apache License, Version 2.0 (the "License").
     *
     * @param contentType a content type header
     * @return the declared encoding without quotes, or null
     */
    public static String parseCharacterEncoding(String contentType) {
        if (contentType == null) {
            return null;
        }

        String value;
        int start = contentType.indexOf("charset='");
        if (start < 0) {
            start = contentType.indexOf("charset=\"");
        }
        if (start >= 0) {
            // skip 'charset=' and the opening quote
            value = contentType.substring(start + 9);
        } else {
            start = contentType.indexOf("charset=");
            if (start < 0) {
                return null;
            }
            value = contentType.substring(start + 8);
        }
        return cutAtDelimiters(value);
    }

    /**
     * Extracts the value following {@code attName} in a line of markup.
     *
     * @param line    the line to parse, may be null
     * @param attName the attribute name including its "=" (e.g. "lang=")
     * @return the value without quotes, cut at the first ';', '"', ''', '/' or '&gt;';
     *         "" when nothing follows the attribute name; null when {@code line} is null
     *         or does not contain {@code attName}
     */
    public static String parseAttributeValue(String line, String attName) {
        if (line == null) {
            return null;
        }
        int start = line.indexOf(attName);
        if (start < 0) {
            return null;
        }

        String value = line.substring(start + attName.length()).trim();
        if (value.length() == 0) {
            // attribute name at the very end of the line
            return "";
        }
        if (value.charAt(0) == '"' || value.charAt(0) == '\'') {
            value = value.substring(1);
        }
        return cutAtDelimiters(value);
    }

    /** Cuts {@code value} at the first ';', '"', ''', '/' and '&gt;' (in that order) and trims it. */
    private static String cutAtDelimiters(String value) {
        final char[] delimiters = { ';', '"', '\'', '/', '>' };
        for (int i = 0; i < delimiters.length; i++) {
            int end = value.indexOf(delimiters[i]);
            if (end >= 0) {
                value = value.substring(0, end);
            }
        }
        return value.trim();
    }

    /**
     * Lower-cases an encoding name and rejects every "utf*" encoding other than "utf-8".
     *
     * @param encoding the encoding name (must not be null)
     * @return the lower-cased name, or "" for a utf encoding that is not utf-8
     */
    public static String filtreEncoding(String encoding) {
        encoding = encoding.toLowerCase(Locale.ROOT);
        if (encoding.startsWith("utf") && !"utf-8".equals(encoding)) {
            return "";
        }
        return encoding;
    }

    /**
     * Extracts the entry links of a feed, last entry first, without duplicates.
     *
     * @param rawPage the feed source
     * @return the links, or null when the feed cannot be parsed
     */
    public static List<String> extractLinksFromFeed(String rawPage) {
        final ArrayList<String> list = new ArrayList<String>();
        try {
            // NOTE(review): getBytes() uses the platform charset, which may not be the
            // encoding declared in the feed prolog - confirm against callers.
            XmlReader xmlReader = new XmlReader(new ByteArrayInputStream(rawPage.getBytes()));
            SyndFeed feed = new SyndFeedInput().build(xmlReader);

            Object[] entries = feed.getEntries().toArray();
            for (int k = entries.length - 1; k >= 0; k--) {
                SyndEntryImpl entry = (SyndEntryImpl) entries[k];
                String link = strLinkCleanup(entry.getLink());
                if (!list.contains(link)) {
                    list.add(link);
                }
            }
            return list;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Strips leading and trailing white space from a feed link and replaces the
     * U+0091..U+0094 quote characters by plain quotes. Returns "" for null.
     */
    static private String strLinkCleanup(String str) {
        if (str == null) {
            return "";
        }

        // line start and end
        str = str.replaceAll("^[\\n\\t\\s]*", "");
        str = str.replaceAll("[\\n\\t\\s]*$", "");

        // some unicode chars
        str = str.replaceAll("\\u0091", "'");
        str = str.replaceAll("\\u0092", "'");
        str = str.replaceAll("\\u0093", "\"");
        str = str.replaceAll("\\u0094", "\"");
        return str;
    }

    /**
     * Extracts the links of a page (see {@link #extractLinks(String, int)}) and makes
     * them absolute. Document-relative links are resolved against the
     * {@code <base href>} of the page when there is one, all other links against
     * {@code urlPage}.
     *
     * @param rawPage the html source
     * @param urlPage the url of the page
     * @param depth   the type of links to be extracted
     * @return the absolute urls; a link that cannot be resolved is returned as ""
     * @throws IOException if the page cannot be parsed
     */
    public static List<String> extractAbsoluteLinks(String rawPage, String urlPage, int depth) throws IOException {
        List<String> links = extractLinks(rawPage, depth);

        // The base href is looked up once, and only if a relative link needs it.
        String baseHref = null;
        boolean baseResolved = false;

        for (int i = 0; i < links.size(); i++) {
            try {
                String link = links.get(i).trim();
                boolean relative = isRelativeURL(link);
                if (relative && !baseResolved) {
                    baseHref = getBaseHref(rawPage);
                    baseResolved = true;
                }
                String referer = (relative && baseHref != null) ? baseHref : urlPage;
                links.set(i, urlGetAbsoluteURL(referer, link));
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return links;
    }

    /**
     * Returns the href of the first {@code <base>} element of a page.
     *
     * @param rawPage the html source, may be null
     * @return the base href, or null when the page declares none
     * @throws IOException if the page cannot be parsed
     */
    public static String getBaseHref(String rawPage) throws IOException {
        // Cheap test first: avoid parsing pages that cannot contain a <base> element
        if (rawPage == null || !StringUtils.containsIgnoreCase(rawPage, "<base")) {
            return null;
        }

        TagNode root = new HtmlCleaner().clean(rawPage);
        TagNode[] bases = root.getElementsByName("base", true);
        if (bases == null || bases.length == 0) {
            return null;
        }
        return bases[0].getAttributeByName("href");
    }

    /**
     * Extract link in html string according to depth parameter
     * if depth = 0 : extract only redirection or iframe or framset urls links
     * if depth = 1 : extract only standard urls links (&lt;a href='..'&gt;)
     * if depth = 2 : extract all links
     * <p>
     * {@code location.href='...'} script assignments are extracted whatever the depth.
     * Links are returned trimmed, in order of discovery, without duplicates; null and
     * empty values are never returned.
     *
     * @param rawPage the input html string
     * @param depth the type of links to be extracted
     * @return the extracted urls in a String List
     * @throws IOException
     */
    public static List<String> extractLinks(String rawPage, int depth) throws IOException {
        // Insertion-ordered set: same order as a list, without the linear duplicate check
        final Set<String> found = new LinkedHashSet<String>();
        final boolean standardLinks = depth == 1 || depth == 2;
        final boolean redirections = depth == 0 || depth == 2;

        TagNode root = new HtmlCleaner().clean(rawPage);

        if (standardLinks) {
            collectAttribute(root, "a", "href", found);
            collectAttribute(root, "area", "href", found);
        }

        if (redirections) {
            collectAttribute(root, "frame", "src", found);
            collectAttribute(root, "iframe", "src", found);

            // <meta http-equiv="refresh" content="0; url=...">
            TagNode[] metas = root.getElementsByName("meta", true);
            for (int i = 0; i < metas.length; i++) {
                String equiv = metas[i].getAttributeByName("http-equiv");
                if (equiv == null || !equiv.equalsIgnoreCase("refresh")) {
                    continue;
                }
                String content = metas[i].getAttributeByName("content");
                if (content != null && content.indexOf("=") > 0) {
                    addIfNotEmpty(found, content.substring(content.indexOf("=") + 1));
                }
            }

            // Look for embeded flash : <param name="movie" value="..."
            TagNode[] params = root.getElementsByName("param", true);
            for (int i = 0; i < params.length; i++) {
                if ("movie".equals(params[i].getAttributeByName("name"))) {
                    // a missing value attribute must not put a null in the result
                    addIfNotEmpty(found, params[i].getAttributeByName("value"));
                }
            }
        }

        // Second pass with the Jericho parser, for pages HtmlCleaner fails on
        MicrosoftConditionalCommentTagTypes.register();
        PHPTagTypes.register();
        PHPTagTypes.PHP_SHORT.deregister(); // otherwise PHP short tags override processing instructions
        MasonTagTypes.register();
        Source source = new Source(rawPage);
        source.fullSequentialParse();

        if (redirections) {
            for (Element frame : source.getAllElements(HTMLElementName.FRAME)) {
                addIfValid(found, frame.getAttributeValue("src"));
            }
        }
        if (standardLinks) {
            for (Element anchor : source.getAllElements(HTMLElementName.A)) {
                addIfValid(found, anchor.getAttributeValue("href"));
            }
        }

        // Script redirections
        collectScriptUrls(LOCATION_HREF, rawPage, found);
        if (redirections) {
            collectScriptUrls(LOCATION_REPLACE, rawPage, found);
            collectScriptUrls(WINDOW_LOCATION, rawPage, found);
        }

        return new ArrayList<String>(found);
    }

    /** Adds the valid, non-empty values of attribute {@code attribute} of every {@code tag} element. */
    private static void collectAttribute(TagNode root, String tag, String attribute, Set<String> sink) {
        TagNode[] nodes = root.getElementsByName(tag, true);
        for (int i = 0; i < nodes.length; i++) {
            addIfValid(sink, nodes[i].getAttributeByName(attribute));
        }
    }

    /** Adds the url captured by each match of {@code pattern}, cut at its first quote. */
    private static void collectScriptUrls(Pattern pattern, String rawPage, Set<String> sink) {
        Matcher matcher = pattern.matcher(rawPage);
        while (matcher.find()) {
            String url = matcher.group(1);
            // the capture is greedy: keep only what precedes the first closing quote
            if (url.indexOf("'") != -1) {
                url = url.substring(0, url.indexOf("'"));
            }
            if (url.indexOf('"') != -1) {
                url = url.substring(0, url.indexOf('"'));
            }
            addIfNotEmpty(sink, url);
        }
    }

    /** Adds the trimmed link when it is neither null nor empty and passes {@link #isValidUrl(String)}. */
    private static void addIfValid(Set<String> sink, String link) {
        if (link == null) {
            return;
        }
        String trimmed = link.trim();
        if (!"".equals(trimmed) && isValidUrl(trimmed)) {
            sink.add(trimmed);
        }
    }

    /** Adds the trimmed link when it is neither null nor empty (no validity check). */
    private static void addIfNotEmpty(Set<String> sink, String link) {
        if (link == null) {
            return;
        }
        String trimmed = link.trim();
        if (!"".equals(trimmed)) {
            sink.add(trimmed);
        }
    }

    /**
     * Tells whether a link is worth following: not a mailto:, javascript: or fragment
     * link, not starting with a backslash or a quote and, when it has an http or https
     * scheme, parseable as a URL. Other links (relative paths) are considered valid.
     */
    private static boolean isValidUrl(String url) {
        String temp = url.toLowerCase(Locale.ROOT);
        if (temp.startsWith("mailto:") || temp.startsWith("javascript:") || temp.startsWith("#")
                || temp.startsWith("\\") || temp.startsWith("'") || temp.startsWith("\"")) {
            return false;
        }

        // Only a real scheme is validated: a relative link such as "http-errors.html" is valid.
        if (temp.startsWith("http:") || temp.startsWith("https:")) {
            if (temp.startsWith("http:/") && !temp.startsWith("http://")) {
                return false;
            }
            try {
                new URL(temp);
                return true;
            } catch (Exception e) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the last segment of the path of a url.
     *
     * @param url the url
     * @return the file name (possibly ""), or "" when the url cannot be parsed
     */
    public static String urlGetFileName(String url) {
        try {
            String name = new URL(url).getPath();
            int slash = name.lastIndexOf("/");
            if (slash != -1) {
                name = name.substring(slash + 1);
            }
            return name;
        } catch (Exception ignored) {
            // unparseable url: no file name
        }
        return "";
    }

    /**
     * Inserts basic authentication credentials ("login:password@") right after the
     * scheme of an http or https url. Only the scheme at the beginning of the url is
     * considered, so urls embedded in the query string are left untouched. The
     * credentials are inserted as given (not percent-encoded).
     *
     * @param url      the url (must not be null)
     * @param login    the login
     * @param password the password
     * @return the url with credentials, or {@code url} unchanged when it does not start
     *         with "http://" or "https://"
     */
    public static String urlAddBasicAuthentication(String url, String login, String password) {
        final String[] schemes = { "http://", "https://" };
        for (int i = 0; i < schemes.length; i++) {
            if (url.startsWith(schemes[i])) {
                return schemes[i] + login + ":" + password + "@" + url.substring(schemes[i].length());
            }
        }
        return url;
    }
}