Java tutorial
/** * MultiProtocolURI * Copyright 2010 by Michael Peter Christen * First released 25.5.2010 at http://yacy.net * * $LastChangedDate$ * $LastChangedRevision$ * $LastChangedBy$ * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see <http://www.gnu.org/licenses/>. */ package net.yacy.cora.document.id; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; import java.io.UnsupportedEncodingException; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URLDecoder; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.BitSet; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HttpStatus; import jcifs.smb.SmbException; import jcifs.smb.SmbFile; import jcifs.smb.SmbFileInputStream; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.Punycode.PunycodeException; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.TimeoutRequest; import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.HTTPInputStream; import net.yacy.crawler.retrieval.Response; /** * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file * */ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolURL> { private static final long serialVersionUID = -1173233022912141884L; private static final long SMB_TIMEOUT = 5000; public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter private static final Pattern backPathPattern = Pattern .compile("(/[^/]+(?<!/\\.{1,2})/)[.]{2}(?=/|$)|/\\.(?=/)|/(?=/)"); private static final Pattern patternMail = Pattern.compile("^[a-z]+:.*?"); //private static final Pattern patternSpace = Pattern.compile("%20"); /** Register unreserved chars (never escaped in url) */ private final static BitSet UNRESERVED_RFC1738 = new BitSet(128); /** Register unreserved chars for path part (not escaped in path) */ private final static BitSet UNRESERVED_PATH = new BitSet(128); /** * Register regular expressions metacharacters used by the {@link Pattern} * class. * * @see <a href= * "https://docs.oracle.com/javase/tutorial/essential/regex/literals.html">Regular * expressions string literals documentation</a> */ private static final BitSet PATTERN_METACHARACTERS = new BitSet(128); static { // unreserved characters (chars not to escape in url) for (int i = 'A'; i <= 'Z'; i++) { // hialpha RFC1738 Section 5 UNRESERVED_RFC1738.set(i); } for (int i = 'a'; i <= 'z'; i++) { // lowalpha RFC1738 Section 5 UNRESERVED_RFC1738.set(i); } for (int i = '0'; i <= '9'; i++) { // digit RFC1738 Section 5 UNRESERVED_RFC1738.set(i); } // special char set RFC1738 Section 2.2 $-_.+!*'(), UNRESERVED_RFC1738.set('$'); // safe chars RFC1738 Section 5 UNRESERVED_RFC1738.set('-'); // & mark RFC2396 Section 2.2 UNRESERVED_RFC1738.set('_'); // & mark RFC2396 Section 2.2 UNRESERVED_RFC1738.set('.'); // & mark RFC2396 Section 2.2 UNRESERVED_RFC1738.set('+'); UNRESERVED_RFC1738.set('!'); // extra chars RFC1738 Section 5 & mark RFC2396 Section 2.2 UNRESERVED_RFC1738.set('*'); // & mark RFC2396 Section 2.2 UNRESERVED_RFC1738.set('\''); // & mark RFC2396 Section 2.2 UNRESERVED_RFC1738.set('('); // & mark RFC2396 Section 2.2 UNRESERVED_RFC1738.set(')'); // & mark RFC2396 Section 2.2 UNRESERVED_RFC1738.set(','); UNRESERVED_RFC1738.set('~'); // mark RFC2396 Section 2.2 // unreseved in URL path UNRESERVED_PATH.or(UNRESERVED_RFC1738); UNRESERVED_PATH.set('/'); // hpath segment separator RFC 1738 Section 5 UNRESERVED_PATH.set(';'); // hsegment param separator (FTP) UNRESERVED_PATH.set(':'); UNRESERVED_PATH.set('@'); UNRESERVED_PATH.set('&'); UNRESERVED_PATH.set('='); /* Pattern metacharacters : <([{\^-=$!|]})?*+.> */ PATTERN_METACHARACTERS.set('<'); PATTERN_METACHARACTERS.set('('); PATTERN_METACHARACTERS.set('['); PATTERN_METACHARACTERS.set('{'); PATTERN_METACHARACTERS.set('\\'); PATTERN_METACHARACTERS.set('^'); PATTERN_METACHARACTERS.set('-'); PATTERN_METACHARACTERS.set('='); PATTERN_METACHARACTERS.set('$'); PATTERN_METACHARACTERS.set('!'); PATTERN_METACHARACTERS.set('|'); PATTERN_METACHARACTERS.set(']'); PATTERN_METACHARACTERS.set('}'); PATTERN_METACHARACTERS.set(')'); PATTERN_METACHARACTERS.set('?'); PATTERN_METACHARACTERS.set('*'); PATTERN_METACHARACTERS.set('+'); PATTERN_METACHARACTERS.set('.'); PATTERN_METACHARACTERS.set('>'); } // session id handling private static final Object PRESENT = new Object(); private static final ConcurrentHashMap<String, Object> sessionIDnames = new ConcurrentHashMap<String, Object>(); public static final void initSessionIDNames(final Set<String> idNames) { for (String s : idNames) { if (s == null) continue; s = s.trim(); if (!s.isEmpty()) sessionIDnames.put(s, PRESENT); } } // class variables (the variable content is stored in encoded/escaped form) protected final String protocol, userInfo; protected String host, path, searchpart, anchor; protected int port; protected InetAddress hostAddress; protected ContentDomain contentDomain; /** * initialization of a MultiProtocolURI to produce poison pills for concurrent blocking queues */ protected MultiProtocolURL() { this.protocol = null; this.host = null; this.hostAddress = null; this.userInfo = null; this.path = null; this.searchpart = null; this.anchor = null; this.contentDomain = null; this.port = -1; } public MultiProtocolURL(final File file) throws MalformedURLException { this("file", "", -1, file.getAbsolutePath()); } protected MultiProtocolURL(final MultiProtocolURL url) { this.protocol = url.protocol; this.host = url.host; this.hostAddress = null; this.userInfo = url.userInfo; this.path = url.path; this.searchpart = url.searchpart; this.anchor = url.anchor; this.contentDomain = null; this.port = url.port; } /** * Create MultiProtocolURL * * decoding exception: if url string contains http url with char '%' the url string must be url encoded (percent-escaped) before * as internal encoding is skipped if url string contains '%'. * * @param url '%' char url encoded before * @throws MalformedURLException */ public MultiProtocolURL(String url) throws MalformedURLException { if (url == null) throw new MalformedURLException("url string is null"); this.hostAddress = null; this.contentDomain = null; // identify protocol url = url.trim(); if (url.startsWith("//")) { // patch for urls starting with "//" which can be found in the wild url = "http:" + url; } if (url.startsWith("\\\\")) { url = "smb://" + CommonPattern.BACKSLASH.matcher(url.substring(2)).replaceAll("/"); } if (url.length() > 1 && (url.charAt(1) == ':' && Character.isLetter(url.charAt(0)))) { // maybe a DOS drive path ( A: to z: ) url = "file://" + url; } if (url.length() > 0 && url.charAt(0) == '/') { // maybe a unix/linux absolute path url = "file://" + url; } int p = url.lastIndexOf("://", 5); // lastindexof to look only at the begin of url, up to "https://", if (p < 0) { if (url.length() > 7 && url.substring(0, 7).equalsIgnoreCase("mailto:")) { p = 6; } else { url = "http://" + url; p = 4; } } this.protocol = url.substring(0, p).toLowerCase(Locale.ROOT).trim().intern(); if (url.length() < p + 4) throw new MalformedURLException("URL not parseable: '" + url + "'"); if (!this.protocol.equals("file") && url.substring(p + 1, p + 3).equals("//")) { // identify host, userInfo and file for http and ftp protocol int q = url.indexOf('/', p + 3); if (q < 0) { // check for www.test.com?searchpart q = url.indexOf("?", p + 3); } else { // check that '/' was not in searchpart (example http://test.com?data=1/2/3) if (url.lastIndexOf("?", q) >= 0) { q = url.indexOf("?", p + 3); } } if (q < 0) { // check for www.test.com#fragment q = url.indexOf("#", p + 3); } int r; if (q < 0) { if ((r = url.indexOf('@', p + 3)) < 0) { this.host = url.substring(p + 3).intern(); this.userInfo = null; } else { this.host = url.substring(r + 1).intern(); this.userInfo = url.substring(p + 3, r); } this.path = "/"; } else { this.host = url.substring(p + 3, q).trim().intern(); if ((r = this.host.indexOf('@')) < 0) { this.userInfo = null; } else { this.userInfo = this.host.substring(0, r); this.host = this.host.substring(r + 1).intern(); } this.path = url.substring(q); // may result in "?searchpart" (resolveBackpath prepends a "/" ) } if (this.host.length() < 4 && !this.protocol.equals("file")) throw new MalformedURLException("host too short: '" + this.host + "', url = " + url); if (this.host.indexOf('&') >= 0) throw new MalformedURLException("invalid '&' in host"); this.path = resolveBackpath(this.path); // adds "/" if missing identPort(url, (isHTTP() ? 80 : (isHTTPS() ? 443 : (isFTP() ? 21 : (isSMB() ? 445 : -1))))); if (this.port < 0) { // none of known protocols (above) = unknown throw new MalformedURLException("unknown protocol: " + url); } identAnchor(); identSearchpart(); escape(); } else { url = UTF8.decodeURL(url); // normalization here // this is not a http or ftp url if (this.protocol.equals("mailto")) { // parse email url final int q = url.indexOf('@', p + 3); if (q < 0) { throw new MalformedURLException("wrong email address: " + url); } this.userInfo = url.substring(p + 1, q); this.host = url.substring(q + 1); this.path = ""; // TODO: quick fix, as not always checked for path != null this.port = -1; this.searchpart = null; this.anchor = null; } else if (this.protocol.equals("file")) { // parse file url (RFC 1738 file://host.domain/path file://localhost/path file:///path) // example unix file://localhost/etc/fstab // file:///etc/fstab // example windows file://localhost/c|/WINDOWS/clock.avi // file:///c|/WINDOWS/clock.avi // file://localhost/c:/WINDOWS/clock.avi // network file://hostname/path/to/the%20file.txt // local file:///c:/path/to/the%20file.txt String h = url.substring(p + 1); this.host = null; // host is ignored on file: protocol if (h.startsWith("///")) { //absolute local file path // no host given this.path = h.substring(2); // "/path" or "/c:/path" } else if (h.startsWith("//")) { // "//host/path" or "//host/c:/path" if (h.length() > 4 && h.charAt(3) == ':' && h.charAt(4) != '/' && h.charAt(4) != '\\') { // wrong windows path, after the doublepoint there should be a backslash. Let's add a slash, as it will be slash in the normal form h = h.substring(0, 4) + '/' + h.substring(4); } int q = h.indexOf('/', 2); if (q < 0 || h.length() > 3 && h.charAt(3) == ':') { // Missing root slash such as "path" or "c:/path" accepted, but the path attribute must by after all start with it this.path = "/" + h.substring(2); } else { this.host = h.substring(2, q); // TODO: handle "c:" ? if (this.host.equalsIgnoreCase(Domains.LOCALHOST)) this.host = null; this.path = h.substring(q); // "/path" } } else if (h.startsWith("/")) { // "/host/path" or "/host/c:/path" this.path = h; } this.userInfo = null; this.port = -1; this.searchpart = null; this.anchor = null; } else { throw new MalformedURLException("unknown protocol: " + url); } } // handle international domains if (!Punycode.isBasic(this.host)) try { this.host = toPunycode(this.host); } catch (final PunycodeException e) { } } public static String toPunycode(final String host) throws PunycodeException { final String[] domainParts = CommonPattern.DOT.split(host, 0); final StringBuilder buffer = new StringBuilder(80); // encode each domain-part separately for (int i = 0; i < domainParts.length; i++) { final String part = domainParts[i]; if (!Punycode.isBasic(part)) { buffer.append("xn--").append(Punycode.encode(part)); } else { buffer.append(part); } if (i != domainParts.length - 1) { buffer.append('.'); } } return buffer.toString(); } public static final boolean isHTTP(final String s) { return s.startsWith("http://"); } public static final boolean isHTTPS(final String s) { return s.startsWith("https://"); } public static final boolean isFTP(final String s) { return s.startsWith("ftp://"); } public static final boolean isFile(final String s) { return s.startsWith("file://"); } public static final boolean isSMB(final String s) { return s.startsWith("smb://") || s.startsWith("\\\\"); } public final boolean isHTTP() { return this.protocol.equals("http"); } public final boolean isHTTPS() { return this.protocol.equals("https"); } public final boolean isFTP() { return this.protocol.equals("ftp"); } public final boolean isFile() { return this.protocol.equals("file"); } public final boolean isSMB() { return this.protocol.equals("smb"); } /** * Get the content domain of a document according to the extension. * This can produce wrong results because the extension is a weak hint for the content domain. * If possible, use the mime type, call Classification.getContentDomainFromMime() * @return the content domain which classifies the content type */ public final ContentDomain getContentDomainFromExt() { if (this.contentDomain == null) { this.contentDomain = Classification.getContentDomainFromExt(getFileExtension(this.getFileName())); } return this.contentDomain; } /** * @deprecated not used (2016-07-20), doesn't handle all protocol cases. Use MultiprotocolURL(MultiProtocolURL, String) instead */ @Deprecated // not used 2016-07-20 public static MultiProtocolURL newURL(final String baseURL, String relPath) throws MalformedURLException { if (relPath.startsWith("//")) { // patch for urls starting with "//" which can be found in the wild relPath = "http:" + relPath; } if ((baseURL == null) || isHTTP(relPath) || isHTTPS(relPath) || isFTP(relPath) || isFile(relPath) || isSMB(relPath)/*|| relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()*/) { return new MultiProtocolURL(relPath); } return new MultiProtocolURL(new MultiProtocolURL(baseURL), relPath); } /** * @deprecated not used (2016-07-20), doesn't handle all protocol cases. Use MultiprotocolURL(MultiProtocolURL, String) instead */ @Deprecated // not used 2016-07-20 public static MultiProtocolURL newURL(final MultiProtocolURL baseURL, String relPath) throws MalformedURLException { if (relPath.startsWith("//")) { // patch for urls starting with "//" which can be found in the wild relPath = (baseURL == null) ? "http:" + relPath : baseURL.getProtocol() + ":" + relPath; } if ((baseURL == null) || isHTTP(relPath) || isHTTPS(relPath) || isFTP(relPath) || isFile(relPath) || isSMB(relPath)/*|| relPath.contains(":") && patternMail.matcher(relPath.toLowerCase()).find()*/) { return new MultiProtocolURL(relPath); } return new MultiProtocolURL(baseURL, relPath); } public MultiProtocolURL(final MultiProtocolURL baseURL, String relPath) throws MalformedURLException { if (baseURL == null) throw new MalformedURLException("base URL is null"); if (relPath == null) throw new MalformedURLException("relPath is null"); this.protocol = baseURL.protocol; this.host = baseURL.host; this.port = baseURL.port; this.userInfo = baseURL.userInfo; if (relPath.startsWith("//")) { // a "network-path reference" as defined in rfc2396 denotes // a relative path that uses the protocol from the base url relPath = baseURL.protocol + ":" + relPath; } if (relPath.toLowerCase(Locale.ROOT).startsWith("javascript:")) { this.path = baseURL.path; } else if (isHTTP(relPath) || isHTTPS(relPath) || isFTP(relPath) || isFile(relPath) || isSMB(relPath)) { this.path = baseURL.path; } else if (relPath.contains(":") && patternMail.matcher(relPath.toLowerCase(Locale.ROOT)).find()) { // discards also any unknown protocol from previous if throw new MalformedURLException("relative path malformed: " + relPath); } else if (relPath.length() > 0 && relPath.charAt(0) == '/') { this.path = relPath; } else if (baseURL.path.endsWith("/")) { /* According to RFC 3986 example in Appendix B. (https://tools.ietf.org/html/rfc3986) such an URL is valid : http://www.ics.uci.edu/pub/ietf/uri/#Related We also find similar usages in the 2016 URL living standard (https://url.spec.whatwg.org/), for example : https://url.spec.whatwg.org/#syntax-url-absolute-with-fragment java.lang.URL constructor also accepts this form.*/ if (relPath.startsWith("/")) this.path = baseURL.path + relPath.substring(1); else this.path = baseURL.path + relPath; } else { if (relPath.length() > 0 && (relPath.charAt(0) == '#' || relPath.charAt(0) == '?')) { this.path = baseURL.path + relPath; } else { final int q = baseURL.path.lastIndexOf('/'); if (q < 0) { this.path = relPath; } else { this.path = baseURL.path.substring(0, q + 1) + relPath; } } } this.searchpart = baseURL.searchpart; this.anchor = baseURL.anchor; this.path = resolveBackpath(this.path); identAnchor(); identSearchpart(); escape(); } /** * creates MultiProtocolURL * if path contains '?' search part is automatically created by splitting input into path and searchpart * dto for anchor's ('#') */ public MultiProtocolURL(final String protocol, String host, final int port, final String path) throws MalformedURLException { if (protocol == null) throw new MalformedURLException("protocol is null"); if (host.indexOf(':') >= 0 && host.charAt(0) != '[') host = '[' + host + ']'; // IPv6 host must be enclosed in square brackets this.protocol = protocol; this.host = host; this.port = port; this.path = path; this.searchpart = null; this.userInfo = null; this.anchor = null; identAnchor(); identSearchpart(); escape(); } /** * @param host the new host to apply to the copy * @return an exact copy of this URL instance but with a new host. The original instance remains unchanged. * @throws IllegalArgumentException when the host parameter is null or empty. */ public MultiProtocolURL ofNewHost(final String host) throws IllegalArgumentException { if (host == null || host.trim().isEmpty()) { throw new IllegalArgumentException("Host parameter must not be null"); } MultiProtocolURL copy = new MultiProtocolURL(this); if (host.indexOf(':') >= 0 && host.charAt(0) != '[') { copy.host = '[' + host + ']'; // IPv6 host must be enclosed in square brackets } else { copy.host = host; } if (!Punycode.isBasic(this.host)) try { this.host = toPunycode(this.host); } catch (final PunycodeException e) { ConcurrentLog.logException(e); } return copy; } /** * Resolve '..' segments in the path. * For standard pseudo algorithms, see : * <ul> * <li>https://tools.ietf.org/html/rfc3986#section-5.2.4</li> * <li>https://url.spec.whatwg.org/#path-state</li> * <li>https://www.w3.org/TR/url/#relative-path-state</li> * </ul> * @param path URL path part : must not be null * @return the path with '..' segments resolved */ private static final String resolveBackpath(final String path) { String p = path; if (p.isEmpty() || p.charAt(0) != '/') { p = "/" + p; } final Matcher qm = CommonPattern.QUESTION.matcher(p); // do not resolve backpaths in the post values final int end = qm.find() ? qm.start() : p.length(); final Matcher matcher = backPathPattern.matcher(p); while (matcher.find()) { if (matcher.start() > end) break; p = matcher.replaceAll(""); matcher.reset(p); } /* Let's remove any eventual remaining but inappropriate '..' segments at the beginning. * See https://tools.ietf.org/html/rfc3986#section-5.2.4 -> parts 2.C and 2.D */ while (p.startsWith("/../")) { p = p.substring(3); } if (p.equals("/..")) { p = "/"; } return p.equals("") ? "/" : p; } /** * Escapes the following parts of the url, this object already contains: * <ul> * <li>path: see {@link #escape(String)}</li> * <li>ref: same as above</li> * <li>quest: same as above without the ampersand ("&") and the equals symbol</li> * </ul> */ private void escape() { if (this.path != null && this.path.indexOf('%') == -1) { this.path = escapePath(this.path); } if (this.searchpart != null && this.searchpart.indexOf('%') == -1) escapeSearchpart(); if (this.anchor != null) this.anchor = escape(this.anchor).toString(); } /** * <p>Percent-encode/escape an URL path part according to the allowed characters * (see RFC3986, and formerly RFC1738 & RFC2396). Uses UTF-8 character codes for * non-ASCII.</p> * <p>Important : already percent-encoded characters are not re-encoded</p> * * @param pathToEscape the path part to escape. * @return an escaped path with only ASCII characters, or null when pathToEscape * is null. * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986 * percent-encoding section</a> * @see <z href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path * definition</a> */ public static String escapePath(final String pathToEscape) { return escapePath(pathToEscape, false); } /** * <p>Percent-encode/escape an URL path regular expression according to the allowed * characters in an URL path (see RFC3986) and in the {@link Pattern} regular * expressions. Uses UTF-8 character codes for non-ASCII.</p> * <p>Important : already percent-encoded characters are not re-encoded</p> * * @param pathPattern the URL path regular expression to escape. * @return an escaped path regular expression with only allowed ASCII * characters, or null when pathPattern is null. * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986 * percent-encoding section</a> * @see <z href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path * definition</a> */ public static String escapePathPattern(final String pathPattern) { return escapePath(pathPattern, true); } /** * <p> * Percent-encode/escape an URL path part according to the allowed characters * specified in RFC3986 (formerly RFC1738 and RFC2396). Uses UTF-8 character * codes for non-ASCII. * </p> * <p> * When isPattern is true, the string is processed as a regular expression, and * therefore meta-characters used by the {@link Pattern} class are not * percent-encoded. * </p> * * @param pathToEscape the path part to escape. * @param isPattern when true, regular meta-characters are not escaped * @return an escaped path regular expression with only allowed ASCII * characters, or null when pathPattern is null. * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986 * percent-encoding section</a> * @see <z href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path * definition</a> */ private static String escapePath(final String pathToEscape, final boolean isPattern) { if (pathToEscape == null) { return pathToEscape; } final StringBuilder ptmp = new StringBuilder(pathToEscape.length() + 10); boolean modified = false; final int len = pathToEscape.length(); int i = 0; while (i < len) { int ch = pathToEscape.charAt(i); if (ch == '%' && (i + 2) < len) { final char digit1 = pathToEscape.charAt(i + 1); final char digit2 = pathToEscape.charAt(i + 2); if (isHexDigit(digit1) && isHexDigit(digit2)) { /* Already percent-encoded character */ ptmp.append((char) ch); /* Normalize hexadecimal digits to upper case */ if (Character.isLowerCase(digit1) || Character.isLowerCase(digit2)) { modified = true; } ptmp.append(Character.toUpperCase(digit1)); ptmp.append(Character.toUpperCase(digit2)); i += 2; } else { /* Not a valid percent-encoded character : we encode it now */ ptmp.append(hex[ch]); modified = true; } } else if (isPattern && PATTERN_METACHARACTERS.get(ch)) { ptmp.append((char) ch); } else if (ch <= 0x7F) { if (UNRESERVED_PATH.get(ch)) { ptmp.append((char) ch); } else { ptmp.append(hex[ch]); modified = true; } } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF ptmp.append(hex[0xc0 | (ch >> 6)]); ptmp.append(hex[0x80 | (ch & 0x3F)]); modified = true; } else { // 0x7FF < ch <= 0xFFFF ptmp.append(hex[0xe0 | (ch >> 12)]); ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]); ptmp.append(hex[0x80 | (ch & 0x3F)]); modified = true; } i++; } if (modified) { return ptmp.toString(); } return pathToEscape; } /** * Decode UTF-8 percent-encoded characters eventually found in the given path. * <ul> * Differences with {@link URLDecoder#decode(String, String)} : * <li>the '+' character is not decoded to space character</li> * <li>no exception is thrown when invalid hexadecimal digits are found after a '%' character</li> * </ul> * * @param path an URL path eventually escaped * @return return the unescaped path or null when path is null. */ public static final String unescapePath(final String escaped) { if (escaped == null) { return escaped; } boolean modified = false; final int len = escaped.length(); final StringBuilder unescaped = new StringBuilder(len > 500 ? len / 2 : len); ByteBuffer utf8Bytes = null; int i = 0; while (i < len) { final char ch = escaped.charAt(i); if (ch == '%' && (i + 2) < len) { final char digit1 = escaped.charAt(i + 1); final char digit2 = escaped.charAt(i + 2); if (isHexDigit(digit1) && isHexDigit(digit2)) { if (utf8Bytes == null) { utf8Bytes = ByteBuffer.allocate((len - i) / 3); } /* Percent-encoded character UTF-8 byte */ int hexaValue = Integer.parseInt(escaped.substring(i + 1, i + 3), 16); utf8Bytes.put((byte) hexaValue); modified = true; i += 2; } else { /* Not a valid percent-encoded character : we append it as is */ unescaped.append(ch); } } else { if (utf8Bytes != null && utf8Bytes.position() > 0) { unescaped .append(new String(utf8Bytes.array(), 0, utf8Bytes.position(), StandardCharsets.UTF_8)); utf8Bytes.position(0); } unescaped.append(ch); } i++; } if (utf8Bytes != null && utf8Bytes.position() > 0) { unescaped.append(new String(utf8Bytes.array(), 0, utf8Bytes.position(), StandardCharsets.UTF_8)); } return modified ? unescaped.toString() : escaped; } /** * @param character a character to test * @return true when the character is a valid hexadecimal digit */ private static boolean isHexDigit(final int character) { return (character >= '0' && character <= '9') || (character >= 'a' && character <= 'f') || (character >= 'A' && character <= 'F'); } private void escapeSearchpart() { final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10); for (final Map.Entry<String, String> element : getAttributes().entrySet()) { qtmp.append('&'); qtmp.append(escape(element.getKey())); qtmp.append('='); qtmp.append(escape(element.getValue())); } this.searchpart = qtmp.substring((qtmp.length() > 0) ? 1 : 0); } private final static String[] hex = { "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", "%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27", "%28", "%29", "%2A", "%2B", "%2C", "%2D", "%2E", "%2F", "%30", "%31", "%32", "%33", "%34", "%35", "%36", "%37", "%38", "%39", "%3A", "%3B", "%3C", "%3D", "%3E", "%3F", "%40", "%41", "%42", "%43", "%44", "%45", "%46", "%47", "%48", "%49", "%4A", "%4B", "%4C", "%4D", "%4E", "%4F", "%50", "%51", "%52", "%53", "%54", "%55", "%56", "%57", "%58", "%59", "%5A", "%5B", "%5C", "%5D", "%5E", "%5F", "%60", "%61", "%62", "%63", "%64", "%65", "%66", "%67", "%68", "%69", "%6A", "%6B", "%6C", "%6D", "%6E", "%6F", "%70", "%71", "%72", "%73", "%74", "%75", "%76", "%77", "%78", "%79", "%7A", "%7B", "%7C", "%7D", "%7E", "%7F", "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF" }; /** * Encode a string to the "x-www-form-urlencoded" form, enhanced * with the UTF-8-in-URL proposal. This is what happens: * * <ul> * <li>The ASCII characters 'a' through 'z', 'A' through 'Z', * and '0' through '9' remain the same. * * <li>The unreserved characters & : - _ . ! ~ * ' ( ) ; , = remain the same. * see RFC 1738 2.2 and RFC 3986 2.2 * * <li>All other ASCII characters are converted into the * 3-character string "%xy", where xy is * the two-digit hexadecimal representation of the character * code * * <li>All non-ASCII characters are encoded in two steps: first * to a sequence of 2 or 3 bytes, using the UTF-8 algorithm; * secondly each of these bytes is encoded as "%xx". * </ul> * * @param s The string to be encoded * @return The encoded string */ // from: http://www.w3.org/International/URLUTF8Encoder.java public static StringBuilder escape(final String s) { final int len = s.length(); final StringBuilder sbuf = new StringBuilder(len + 10); for (int i = 0; i < len; i++) { final int ch = s.charAt(i); if (ch == ' ') { // space sbuf.append("%20"); } else if (ch == '%') { if (i < len - 2 && s.charAt(i + 1) >= '0' && s.charAt(i + 1) <= '9' && s.charAt(i + 2) >= '0' && s.charAt(i + 2) <= '9') { // TODO: actually 0..9 A..F a..f is allowed (or any of hex[] sequence) sbuf.append((char) ch); // lets consider this is used for encoding, leave it that way } else { sbuf.append("%25"); // '%' RFC 1738 2.2 unsafe char shall be encoded } } else if (ch == '&') { if (i < len - 6 && "amp;".equals(s.substring(i + 1, i + 5).toLowerCase(Locale.ROOT))) { sbuf.append((char) ch); // leave it that way, it is used the right way } else { sbuf.append("%26"); // this must be urlencoded } } else if (ch == '#') { // RFC 1738 2.2 unsafe char is _not_ encoded because it may already be used for encoding sbuf.append((char) ch); } else if (ch == '!' || ch == ':' // unreserved || ch == '-' || ch == '_' || ch == '.' || ch == '~' || ch == '*' || ch == '\'' || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == ';' || ch == ',' || ch == '=') { // RFC 1738 2.2 unsafe char (may be used unencoded) sbuf.append((char) ch); } else if ('0' <= ch && ch <= '9') { // '0'..'9' sbuf.append((char) ch); } else if (ch == '/') { // reserved, but may appear in post part where it should not be replaced sbuf.append((char) ch); } else if ('A' <= ch && ch <= 'Z') { // 'A'..'Z' sbuf.append((char) ch); } else if ('a' <= ch && ch <= 'z') { // 'a'..'z' sbuf.append((char) ch); } else if (ch <= 0x007f) { // other ASCII sbuf.append(hex[ch]); } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF sbuf.append(hex[0xc0 | (ch >> 6)]); sbuf.append(hex[0x80 | (ch & 0x3F)]); } else { // 0x7FF < ch <= 0xFFFF sbuf.append(hex[0xe0 | (ch >> 12)]); sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]); sbuf.append(hex[0x80 | (ch & 0x3F)]); } } return sbuf; } /** * Decodes a <code>application/x-www-form-urlencoded</code> string using UTF-8 encoding. * * @param s the string to decode * @return the newly decoded string, or the original string when it doesn't match the <code>application/x-www-form-urlencoded</code> format */ public static String unescape(final String s) { try { return URLDecoder.decode(s, StandardCharsets.UTF_8.name()); } catch (UnsupportedEncodingException e) { /* This should not happen */ ConcurrentLog.logException(e); return s; } catch (Exception e) { /* * URLDecode may throw an IllegalArgumentException (or any other * Exception in future implementations) when the string doesn't * match the application/x-www-form-urlencoded format: in that case * return the original string. * Example case : when the valid '%' character is used in a URL but without percent encoding purpose. */ return s; } } private void identPort(final String inputURL, final int dflt) throws MalformedURLException { // identify ref in file if (this.host == null) { this.port = dflt; return; } int pss = 0; int ip6 = this.host.indexOf('['); if (ip6 >= 0 && ((ip6 = this.host.indexOf("]", ip6)) > 0)) { pss = ip6 + 1; } final int r = this.host.indexOf(":", pss); if (r < 0) { this.port = dflt; } else { try { final String portStr = this.host.substring(r + 1); if (portStr.trim().length() > 0) this.port = Integer.parseInt(portStr); else this.port = dflt; this.host = this.host.substring(0, r); } catch (final NumberFormatException e) { throw new MalformedURLException( "wrong port in host fragment '" + this.host + "' of input url '" + inputURL + "'"); } } } private void identAnchor() { // identify ref in file final int r = this.path.indexOf('#'); if (r < 0) { this.anchor = null; } else { this.anchor = this.path.substring(r + 1); this.path = this.path.substring(0, r); } } private void identSearchpart() { // identify quest in file final int r = this.path.indexOf('?'); if (r < 0) { this.searchpart = null; } else { this.searchpart = this.path.substring(r + 1); // strip & /* Matcher matcher = CharacterCoding.ampPattern.matcher(this.searchpart); int from = 0; while (matcher.find(from)) { from = matcher.start() + 1; this.searchpart = matcher.replaceAll("&"); matcher.reset(this.searchpart); } */ this.path = this.path.substring(0, r); } } /** * get the hpath plus search field plus anchor. * see http://www.ietf.org/rfc/rfc1738.txt for naming. * if there is no search and no anchor the result is identical to getPath * this is defined according to http://docs.oracle.com/javase/1.4.2/docs/api/java/net/URL.html#getFile() * @return */ public String getFile() { return getFile(false, false); } /** * get the hpath plus search field plus anchor (if wanted) * see http://www.ietf.org/rfc/rfc1738.txt for naming. * if there is no search and no anchor the result is identical to getPath * this is defined according to http://docs.oracle.com/javase/1.4.2/docs/api/java/net/URL.html#getFile() * @param excludeAnchor * @param removeSessionID * @return */ public String getFile(final boolean excludeAnchor, final boolean removeSessionID) { if (this.searchpart == null) { if (excludeAnchor || this.anchor == null) return this.path; final StringBuilder sb = new StringBuilder(120); sb.append(this.path); sb.append('#'); sb.append(this.anchor); return sb.toString(); } String q = this.searchpart; if (removeSessionID) { for (final String sid : sessionIDnames.keySet()) { if (q.toLowerCase(Locale.ROOT).startsWith(sid.toLowerCase(Locale.ROOT) + "=")) { final int p = q.indexOf('&'); if (p < 0) { if (excludeAnchor || this.anchor == null) return this.path; final StringBuilder sb = new StringBuilder(120); sb.append(this.path); sb.append('#'); sb.append(this.anchor); return sb.toString(); } q = q.substring(p + 1); continue; } final int p = q.toLowerCase(Locale.ROOT).indexOf("&" + sid.toLowerCase(Locale.ROOT) + "=", 0); if (p < 0) continue; final int p1 = q.indexOf('&', p + 1); if (p1 < 0) { q = q.substring(0, p); } else { q = q.substring(0, p) + q.substring(p1); } } } final StringBuilder sb = new StringBuilder(120); sb.append(this.path); sb.append('?'); sb.append(q); if (excludeAnchor || this.anchor == null) return sb.toString(); sb.append('#'); sb.append(this.anchor); return sb.toString(); } public String getFileName() { // this is a method not defined in any sun api // it returns the last portion of a path without any reference final int p = this.path.lastIndexOf('/'); if (p < 0) return this.path; if (p == this.path.length() - 1) return ""; // no file name, this is a path to a directory return this.path.substring(p + 1); // the 'real' file name } /** * Get extension out of a filename in lowercase * cuts off query part * @param fileName * @return extension or "" */ public static String getFileExtension(final String fileName) { int p = fileName.lastIndexOf('.'); if (p < 0) return ""; final int q = fileName.lastIndexOf('?'); if (q < 0) { return fileName.substring(p + 1).toLowerCase(Locale.ROOT); } // check last dot in query part if (p > q) { p = fileName.lastIndexOf('.', q); if (p < 0) return ""; } return fileName.substring(p + 1, q).toLowerCase(Locale.ROOT); } /** * Get the path (including filename) * Path is never null * returns may range from empty string, just "/" to a full path * @return */ public String getPath() { return this.path; } /** * Get path elements (directories) as array * @return array with directory names or empty array */ public String[] getPaths() { String s = (this.path == null || this.path.length() < 1) ? "" : this.path.charAt(0) == '/' ? this.path.substring(1) : this.path; int p = s.lastIndexOf('/'); if (p < 0) return new String[0]; s = s.substring(0, p); // the paths do not contain the last part, which is considered as the getFileName() part. String[] paths = CommonPattern.SLASH.split(s); return paths; } /** * return the file object to a local file * this patches also 'strange' windows file paths (like /c|/tmp) * @return the file as absolute path */ public File getLocalFile() { // path always starts with '/' ( https://github.com/yacy/yacy_search_server/commit/1bb0b135ac5dab0adab423d89612f7b1e13f2e61 ) // e.g. /C:/tmp , charAt(1) == ':' never true, but keep it anyway char c = this.path.charAt(1); if (c == ':') return new File(this.path); if (c == '|') return new File(this.path.charAt(0) + ":" + this.path.substring(2)); if (this.path.length() > 1) { // prevent StringIndexOutOfBoundsException c = this.path.charAt(2); if (c == ':' || c == '|') return new File(this.path.charAt(1) + ":" + this.path.substring(3)); } return new File(this.path); } public String getAuthority() { return ((this.port >= 0) && (this.host != null)) ? this.host + ":" + this.port : ((this.host != null) ? this.host : ""); } /** * @return the host part of this URL, Punycode encoded for Internationalized Domain Names. Can be null, for example for file URLs such as "file:///path/file.ext" */ public String getHost() { return this.host; } public String getOrganization() { String dnc = Domains.getDNC(host); String subdomOrga = host.length() - dnc.length() <= 0 ? "" : host.substring(0, host.length() - dnc.length() - 1); int p = subdomOrga.lastIndexOf('.'); String orga = (p < 0) ? subdomOrga : subdomOrga.substring(p + 1); return orga; } /** * @return the top-level domain name part of this url host name, or the empty string. */ public String getTLD() { if (this.host == null) return ""; int p = this.host.lastIndexOf('.'); if (p < 0) return ""; return this.host.substring(p + 1); } public InetAddress getInetAddress() { if (this.hostAddress != null) return this.hostAddress; if (this.host == null) return null; // this may happen for file:// urls this.hostAddress = Domains.dnsResolve(this.host.toLowerCase(Locale.ROOT)); return this.hostAddress; } public int getPort() { return this.port; } public String getProtocol() { return this.protocol; } /** * @return this URL fragment or null if has no fragment * @see <a href="https://url.spec.whatwg.org/#concept-url-fragment">URL fragment concept at WHATWG</a> * @see <a href="https://tools.ietf.org/html/rfc3986#section-3.5">URL fragment section in RFC 3986</a> */ public String getRef() { return this.anchor; } public void removeRef() { this.anchor = null; } /** * the userInfo is the authentication part in front of the host; separated by '@' * @return a string like '<user>:<password>' or just '<user>' */ public String getUserInfo() { return this.userInfo; } public String getSearchpart() { return this.searchpart; } /** * Returns a search part parameter map key=value * in internal url encoded format * for unescaped return values * @see #getAttributes() * * @return key name value */ public Map<String, String> getSearchpartMap() { if (this.searchpart == null) return null; this.searchpart = this.searchpart.replaceAll("&", "&"); String[] parts = CommonPattern.AMP.split(this.searchpart); Map<String, String> map = new LinkedHashMap<String, String>(); for (String part : parts) { int p = part.indexOf('='); if (p > 0) map.put(part.substring(0, p), part.substring(p + 1)); else map.put(part, ""); } return map; } @Override public String toString() { return toNormalform(false); } /** * Tokenizes url as string (without the protocol). * For example "http://host.com/path/file.txt" returns "host com path file ext" * @return url tokens as one string */ public String toTokens() { return toTokens(unescape(this.urlstub(true, true))); } /** * create word tokens for parser. Find CamelCases and separate these words * resulting words are not ordered by appearance, but all in sequence * @return string with unique tokens */ public static String toTokens(final String s) { // remove all non-character & non-number final StringBuilder sb = new StringBuilder(s.length()); char c; for (int i = 0; i < s.length(); i++) { c = s.charAt(i); if (Character.isAlphabetic(c) || Character.isDigit(c)) sb.append(c); else sb.append(' '); } // split the string into tokens and add all camel-case splitting final String[] u = CommonPattern.SPACES.split(sb); final Set<String> token = new LinkedHashSet<String>(); for (final String r : u) token.add(r); for (final String r : u) token.addAll(parseCamelCase(r)); // construct a String again sb.setLength(0); for (final String v : token) if (v.length() >= 1) sb.append(v).append(' '); return sb.length() == 0 ? "" : sb.substring(0, sb.length() - 1); } public static enum CharType { low, high, number; } private static Set<String> parseCamelCase(String s) { final Set<String> token = new LinkedHashSet<String>(); if (s.isEmpty()) return token; int p = 0; CharType type = charType(s.charAt(0)), nct = type; while (p < s.length()) { // search for first appearance of an character that is a upper-case while (p < s.length() && (nct = charType(s.charAt(p))) == type) p++; if (p >= s.length()) { token.add(s); break; } if (nct == CharType.low) { type = CharType.low; p++; continue; } // the char type has changed token.add(s.substring(0, p)); s = s.substring(p); p = 0; type = nct; } token.add(s); return token; } /** * Evaluates url search part and returns attribute '=' value pairs * the returned values are in clear text (without urlencoding). * * To get the parameter map as (url-encoded key and values) * @see getSearchpartMap() * * @return map key=attribue name, value=string after '=' */ public Map<String, String> getAttributes() { Map<String, String> map = new LinkedHashMap<>(); if (this.searchpart == null) return map; final String[] questp = CommonPattern.AMP.split(this.searchpart, -1); for (final String element : questp) { int p = element.indexOf('='); if (p != -1) { map.put(unescape(element.substring(0, p)), unescape(element.substring(p + 1))); } else { if (!element.isEmpty()) map.put(unescape(element), ""); } } return map; } private static CharType charType(final char c) { if (Character.isLowerCase(c)) return CharType.low; if (Character.isDigit(c)) return CharType.number; return CharType.high; } public String toNormalform(final boolean excludeAnchor) { return toNormalform(excludeAnchor, false); } /** * Generates a normal form of the URL. * For file: url it normalizes also path delimiter to be '/' (replace possible Windows '\' * @param excludeAnchor * @param removeSessionID * @return */ public String toNormalform(final boolean excludeAnchor, final boolean removeSessionID) { boolean defaultPort = false; if (this.protocol.equals("mailto")) { return this.protocol + ":" + this.userInfo + "@" + this.host; } else if (isHTTP()) { if (this.port < 0 || this.port == 80) { defaultPort = true; } } else if (isHTTPS()) { if (this.port < 0 || this.port == 443) { defaultPort = true; } } else if (isFTP()) { if (this.port < 0 || this.port == 21) { defaultPort = true; } } else if (isSMB()) { if (this.port < 0 || this.port == 445) { defaultPort = true; } } else if (isFile()) { defaultPort = true; } String urlPath = this.getFile(excludeAnchor, removeSessionID); String h = getHost(); final StringBuilder u = new StringBuilder( 20 + (urlPath == null ? 0 : urlPath.length()) + ((h == null) ? 0 : h.length())); u.append(this.protocol); u.append("://"); if (h != null) { if (this.userInfo != null && !(this.isFTP() && this.userInfo.startsWith(FTPClient.ANONYMOUS))) { u.append(this.userInfo); u.append("@"); } u.append(h.toLowerCase(Locale.ROOT)); } if (!defaultPort) { u.append(":"); u.append(this.port); } if (isFile() && urlPath.indexOf('\\') >= 0) { // normalize windows backslash (important for hash computation) urlPath = urlPath.replace('\\', '/'); } u.append(urlPath); String result = u.toString(); return result; } /** * Generates a normal form of the url, without the protocol part, * except the skipped protocol part this is identical with toNormalform() * @see #toNormalform(boolean) * @param excludeAnchor, exclude anchor part * @param removeSessionID, exclude session id * @return example "www.host.com:8080/path/file.html" * @see #toNormalform(boolean, boolean) */ public String urlstub(final boolean excludeAnchor, final boolean removeSessionID) { // generates a normal form of the URL boolean defaultPort = false; if (this.protocol.equals("mailto")) { return this.userInfo + "@" + this.host; } else if (isHTTP()) { if (this.port < 0 || this.port == 80) { defaultPort = true; } } else if (isHTTPS()) { if (this.port < 0 || this.port == 443) { defaultPort = true; } } else if (isFTP()) { if (this.port < 0 || this.port == 21) { defaultPort = true; } } else if (isSMB()) { if (this.port < 0 || this.port == 445) { defaultPort = true; } } else if (isFile()) { defaultPort = true; } String urlPath = this.getFile(excludeAnchor, removeSessionID); String h = getHost(); final StringBuilder u = new StringBuilder(20 + urlPath.length() + ((h == null) ? 0 : h.length())); if (h != null) { if (this.userInfo != null && !(this.isFTP() && this.userInfo.startsWith(FTPClient.ANONYMOUS))) { u.append(this.userInfo); u.append("@"); } u.append(h.toLowerCase(Locale.ROOT)); } if (!defaultPort) { u.append(":"); u.append(this.port); } u.append(urlPath); String result = u.toString(); return result; } @Override public int hashCode() { return (this.protocol == null ? 0 : this.protocol.hashCode() >> 2) + (this.host == null ? 0 : this.host.hashCode() >> 2) + (this.userInfo == null ? 0 : this.userInfo.hashCode() >> 2) + (this.path == null ? 0 : this.path.hashCode() >> 2) + (this.searchpart == null ? 0 : this.searchpart.hashCode() >> 2) + this.port; //return this.toNormalform(true).hashCode(); } /* (non-Javadoc) * @see java.lang.Object#equals(java.lang.Object) */ @Override public boolean equals(final Object obj) { if (this == obj) return true; if (obj == null) return false; if (!(obj instanceof MultiProtocolURL)) return false; final MultiProtocolURL other = (MultiProtocolURL) obj; return ((this.protocol == null && other.protocol == null) || (this.protocol != null && other.protocol != null && this.protocol.equals(other.protocol))) && ((this.host == null && other.host == null) || (this.host != null && other.host != null && this.host.equals(other.host))) && ((this.userInfo == null && other.userInfo == null) || (this.userInfo != null && other.userInfo != null && this.userInfo.equals(other.userInfo))) && ((this.path == null && other.path == null) || (this.path != null && other.path != null && this.path.equals(other.path))) && ((this.searchpart == null && other.searchpart == null) || (this.searchpart != null && other.searchpart != null && this.searchpart.equals(other.searchpart))) && this.port == other.port; } @Override public int compareTo(final MultiProtocolURL h) { int c; if (this.protocol != null && h.protocol != null && (c = this.protocol.compareTo(h.protocol)) != 0) return c; if (this.host != null && h.host != null && (c = this.host.compareTo(h.host)) != 0) return c; if (this.userInfo != null && h.userInfo != null && (c = this.userInfo.compareTo(h.userInfo)) != 0) return c; if (this.path != null && h.path != null && (c = this.path.compareTo(h.path)) != 0) return c; if (this.searchpart != null && h.searchpart != null && (c = this.searchpart.compareTo(h.searchpart)) != 0) return c; return toNormalform(true).compareTo(h.toNormalform(true)); } public boolean isPOST() { return (this.searchpart != null) && (this.searchpart.length() > 0); } public static final boolean isCGI(final String extension) { return extension != null && extension.length() > 0 && "cgi.exe".indexOf(extension.toLowerCase(Locale.ROOT)) >= 0; } /** * @deprecated use a mimetype considering methode (e.g. Document.getContentDomain() == ContentDomain.IMAGE or else Classification.isImageExtension() ) */ @Deprecated public static final boolean isImage(final String extension) { return extension != null && extension.length() > 0 && Response.docTypeExt(extension.toLowerCase(Locale.ROOT)) == Response.DT_IMAGE; } public final boolean isIndividual() { final String q = unescape(this.path.toLowerCase(Locale.ROOT)); for (final String sid : sessionIDnames.keySet()) { if (q.startsWith(sid.toLowerCase(Locale.ROOT) + "=")) return true; final int p = q.indexOf("&" + sid.toLowerCase(Locale.ROOT) + "=", 0); if (p >= 0) return true; } int pos; return ((pos = q.indexOf("sid", 0)) > 0 && (q.charAt(--pos) == '?' || q.charAt(pos) == '&' || q.charAt(pos) == ';') && (pos += 5) < q.length() && (q.charAt(pos) != '&' && q.charAt(--pos) == '=')) || ((pos = q.indexOf("sessionid", 0)) > 0 && (pos += 10) < q.length() && (q.charAt(pos) != '&' && (q.charAt(--pos) == '=' || q.charAt(pos) == '/'))) || ((pos = q.indexOf("phpsessid", 0)) > 0 && (pos += 10) < q.length() && (q.charAt(pos) != '&' && (q.charAt(--pos) == '=' || q.charAt(pos) == '/'))); } // checks for local/global IP range and local IP public boolean isLocal() { return this.isFile() || this.isSMB() || Domains.isLocal(this.host, this.hostAddress); } // language calculation //modified by copperdust; Ukraine, 2012 public final String language() { String language = "en"; if (this.host == null) return language; final int pos = this.host.lastIndexOf('.'); String host_tld = this.host.substring(pos + 1).toLowerCase(Locale.ROOT); if (pos == 0) return language; int length = this.host.length() - pos - 1; switch (length) { case 2: char firstletter = host_tld.charAt(0); switch (firstletter) {//speed-up case 'a': if (host_tld.equals("au")) {//Australia /91,000,000 language = "en";//australian english; eng; eng; ause } else if (host_tld.equals("at")) {//Austria /23,000,000 language = "de";//german; ger (deu); deu } else if (host_tld.equals("ar")) {//Argentina /10,700,000 language = "es";//spanish } else if (host_tld.equals("ae")) {//United Arab Emirates /3,310,000 language = "ar";//arabic } else if (host_tld.equals("am")) {//Armenia /2,080,000 language = "hy";//armenian; arm (hye); hye } else if (host_tld.equals("ac")) {//Ascension Island /2,060,000 language = "en";//english } else if (host_tld.equals("az")) {//Azerbaijan /1,340,000 language = "az";//azerbaijani; aze; aze (azj, azb) } else if (host_tld.equals("ag")) {//Antigua and Barbuda /1,310,000 language = "en";//english } else if (host_tld.equals("as")) {//American Samoa /1,220,000 language = "en";//english } else if (host_tld.equals("al")) {//Albania /389,000 language = "sq";//albanian; alb (sqi); sqi } else if (host_tld.equals("ad")) {//Andorra /321,000 language = "ca";//catalan; cat } else if (host_tld.equals("ao")) {//Angola /153,000 language = "pt";//portuguese } else if (host_tld.equals("ai")) {//Anguilla /149,000 language = "en";//english } else if (host_tld.equals("af")) {//Afghanistan /101,000 language = "ps";//pashto; pus } else if (host_tld.equals("an")) {//Netherlands Antilles /78,100 language = "nl";//dutch } else if (host_tld.equals("aq")) {//Antarctica /36,000 language = "en";//can be any } else if (host_tld.equals("aw")) {//Aruba /34,400 language = "nl";//dutch } else if (host_tld.equals("ax")) {//Aland Islands /28 language = "sv";//swedish } break; case 'b': if (host_tld.equals("br")) {//Brazil /25,800,000 language = "pt";//portuguese } else if (host_tld.equals("be")) {//Belgium /25,100,000 language = "nl";//dutch } else if (host_tld.equals("bg")) {//Bulgaria /3,480,000 language = "bg";//bulgarian; bul } else if (host_tld.equals("bz")) {//Belize /2,790,000 language = "en";//english } else if (host_tld.equals("ba")) {//Bosnia and Herzegovina /2,760,000 language = "sh";//serbo-croatian } else if (host_tld.equals("by")) {//Belarus /2,540,000 language = "be";//belarusian; bel } else if (host_tld.equals("bo")) {//Bolivia /1,590,000 language = "es";//spanish; spa //language = "qu";//quechua; que //language = "ay";//aymara; aym (ayr) //und viele andere (indian) } else if (host_tld.equals("bd")) {//Bangladesh /342,000 language = "bn";//bengali; ben } else if (host_tld.equals("bw")) {//Botswana /244,000 //language = "en";//english language = "tn";//tswana; tsn } else if (host_tld.equals("bh")) {//Bahrain /241,000 language = "ar";//arabic } else if (host_tld.equals("bf")) {//Burkina Faso /239,000 language = "fr";//french } else if (host_tld.equals("bm")) {//Bermuda /238,000 language = "en";//english } else if (host_tld.equals("bn")) {//Brunei Darussalam /157,000 language = "ms";//malay; msa/mhp } else if (host_tld.equals("bb")) {//Barbados /131,000 language = "en";//english } else if (host_tld.equals("bt")) {//Bhutan /123,000 language = "dz";//dzongkha; dzo } else if (host_tld.equals("bi")) {//Burundi /60,600 language = "rn";//kirundi; run } else if (host_tld.equals("bs")) {//Bahamas /37,700 language = "en";//english } else if (host_tld.equals("bj")) {//Benin /36,200 language = "fr";//french; fra (fre); fra } else if (host_tld.equals("bv")) {//Bouvet Island /55 language = "no";//norwegian; nor (nob/nno) } break; case 'c': if (host_tld.equals("ca")) {//Canada /165,000,000 language = "en";//english //language = "fr";//french } else if (host_tld.equals("ch")) {//Switzerland /62,100,000 language = "de";//german; gsw } else if (host_tld.equals("cn")) {//People's Republic of China /26,700,000 language = "zh";//chinese; chi (zho); cmn - Mandarin (Modern Standard Mandarin) } else if (host_tld.equals("cz")) {//Czech Republic /18,800,000 language = "cs";//czech; cze (ces); ces } else if (host_tld.equals("cl")) {//Chile /18,500,000 language = "es";//spanish; spa } else if (host_tld.equals("co")) {//Colombia /4,270,000 language = "es";//spanish; spa } else if (host_tld.equals("cc")) {//Cocos (Keeling) Islands /4,050,000 language = "en";//english } else if (host_tld.equals("cr")) {//Costa Rica /2,060,000 language = "es";//spanish; spa } else if (host_tld.equals("cy")) {//Cyprus /2,500,000 language = "el";//greek; gre (ell); ell } else if (host_tld.equals("cu")) {//Cuba /2,040,000 language = "es";//spanish; spa } else if (host_tld.equals("cx")) {//Christmas Island /1,830,000 language = "en";//english } else if (host_tld.equals("cd")) {//Democratic Republic of the Congo /475,000 language = "fr";//french } else if (host_tld.equals("cg")) {//Republic of the Congo /193,000 language = "fr";//french } else if (host_tld.equals("cm")) {//Cameroon /119,000 //language = "fr";//french language = "en";//english } else if (host_tld.equals("ci")) {//Cote d'Ivoire /95,200 language = "fr";//french } else if (host_tld.equals("cv")) {//Cape Verde /81,900 language = "pt";//portuguese; por } else if (host_tld.equals("ck")) {//Cook Islands /43,300 language = "en";//english //language = "";//cook islands maori; rar (pnh, rkh) } else if (host_tld.equals("cf")) {//Central African Republic /703 language = "sg";//sango; sag; 92% could speak //language = "fr";//french; fra (fre); fra; 22,5% could speak, but maybe inet users prefer this } break; case 'd': if (host_tld.equals("dk")) {//Denmark /19,700,000 language = "da";//danish; dan } else if (host_tld.equals("do")) {//Dominican Republic /1,510,000 language = "es";//spanish; spa } else if (host_tld.equals("dz")) {//Algeria /326,000 language = "ar";//arabic; ara; arq } else if (host_tld.equals("dj")) {//Djibouti /150,000 language = "ar";//arabic; ara; 94% are muslims, so arabic is primary //language = "fr";//french; fre (fra); fra } else if (host_tld.equals("dm")) {//Dominica /30,100 language = "en";//english } break; case 'e': if (host_tld.equals("ee")) {//Estonia /6,790,000 language = "et";//estonian; est; est (ekk) } else if (host_tld.equals("eg")) {//Egypt /2,990,000 language = "ar";//modern standard arabic; ara; arb //language = "ar";//egyptian arabic; ara; arz } else if (host_tld.equals("ec")) {//Ecuador /2,580,000 language = "es";//spanish; spa } else if (host_tld.equals("et")) {//Ethiopia /142,000 language = "am";//amharic; amh } else if (host_tld.equals("eu")) {//European Union /45,100 language = "en";//english (what can be else) } else if (host_tld.equals("er")) {//Eritrea /15,800 language = "ti";//tigrinya; tir } break; case 'f': if (host_tld.equals("fr")) {//France /96,700,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("fi")) {//Finland /28,100,000 language = "fi";//finnish; fin (92%) } else if (host_tld.equals("fm")) {//Federated States of Micronesia /4,580,000 language = "en";//english //all native at regional level } else if (host_tld.equals("fo")) {//Faroe Islands /623,000 language = "fo";//faroese; fao } else if (host_tld.equals("fj")) {//Fiji /466,000 language = "fj";//fijian; fij //also english, fiji hindi etc } else if (host_tld.equals("fk")) {//Falkland Islands /10,500 language = "en";//english } break; case 'g': if (host_tld.equals("gr")) {//Greece /13,500,000 language = "el";//greek; gre (ell); ell } else if (host_tld.equals("ge")) {//Georgia /2,480,000 language = "ka";//georgian; geo (kat); kat } else if (host_tld.equals("gt")) {//Guatemala /904,000 language = "es";//spanish; spa } else if (host_tld.equals("gs")) {//South Georgia and the South Sandwich Islands /772,000 language = "en";//english } else if (host_tld.equals("gl")) {//Greenland /526,000 language = "kl";//greenlandic; kal } else if (host_tld.equals("gg")) {//Guernsey /322,000 language = "en";//english } else if (host_tld.equals("gi")) {//Gibraltar /193,000 language = "en";//english } else if (host_tld.equals("gh")) {//Ghana /107,000 language = "en";//english } else if (host_tld.equals("gy")) {//Guyana /68,700 language = "en";//english } else if (host_tld.equals("gm")) {//Gambia /59,300 language = "en";//english } else if (host_tld.equals("gn")) {//Guinea /18,700 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("ga")) {//Gabon /17,900 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("gd")) {//Grenada /13,600 language = "en";//english } else if (host_tld.equals("gu")) {//Guam /12,800 //language = "ch";//chamorro; cha (looks like young generation don't want to use) language = "en";//english } else if (host_tld.equals("gq")) {//Equatorial Guinea /1,450 language = "es";//spanish; spa } else if (host_tld.equals("gp")) {//Guadeloupe /980 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("gf")) {//French Guiana /926 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("gb")) {//United Kingdom of Great Britain and Northern Ireland (currently->uk) /186 language = "en";//english } else if (host_tld.equals("gw")) {//Guinea-Bissau /26 language = "pt";//portuguese; por } break; case 'h': if (host_tld.equals("hu")) {//Hungary /18,500,000 language = "hu";//hungarian; hun } else if (host_tld.equals("hk")) {//Hong Kong /9,510,000 language = "zh";//chinese; chi (zho, cmn) //also english } else if (host_tld.equals("hr")) {//Croatia /6,080,000 language = "hr";//croatian; hrv } else if (host_tld.equals("hn")) {//Honduras /628,000 language = "es";//spanish; spa } else if (host_tld.equals("hm")) {//Heard and McDonald Islands /194,000 language = "en";//english } else if (host_tld.equals("ht")) {//Haiti /17,700 language = "fr";//french; fre (fra); fra //language = "ht";//haitian creole; hat } break; case 'i': if (host_tld.equals("it")) {//Italy /55,200,000 language = "it";//italian; ita } else if (host_tld.equals("il")) {//Israel /17,800,000 language = "he";//hebrew; heb } else if (host_tld.equals("ie")) {//Republic of Ireland + Northern Ireland /17,000,000 language = "ga";//irish; gle //language = "en";//english } else if (host_tld.equals("in")) {//India /9,330,000 language = "hi";//hindi; hin } else if (language.equals("is")) {//Iceland /5,310,000 language = "is";//icelandic; ice (isl); isl } else if (host_tld.equals("ir")) {//Islamic Republic of Iran /2,940,000 language = "fa";//persian; per (fas); pes } else if (host_tld.equals("im")) {//Isle of Man /276,000 language = "en";//english //language = "gv";//manx; glv (was dead, currently only slogans etc basically) } else if (host_tld.equals("io")) {//British Indian Ocean Territory /108,000 language = "en";//english } else if (host_tld.equals("iq")) {//Iraq /133 language = "ar";//arabic; ara; acm //language = "ku";//kurdish; kur } break; case 'j': if (host_tld.equals("jp")) {//Japan /139,000,000 language = "ja";//japanese; jpn } else if (host_tld.equals("jo")) {//Jordan /601,000 language = "ar";//jordanian arabic; ara; ajp //language = "en";//english (businness) } else if (host_tld.equals("jm")) {//Jamaica /290,000 language = "en";//english } else if (host_tld.equals("je")) {//Jersey /202,000 language = "en";//english } break; case 'k': if (host_tld.equals("kr")) {//Republic of Korea /13,700,000 language = "ko";//korean; kor } else if (host_tld.equals("kz")) {//Kazakhstan /2,680,000 language = "kk";//kazakh; kaz //language = "ru";//russian; rus (de-facto is widely used than native language) } else if (host_tld.equals("kg")) {//Kyrgyzstan /1,440,000 language = "ky";//kyrgyz; kir //language = "ru";//russian; rus (perhaps this one here is widely used) } else if (host_tld.equals("ki")) {//Kiribati /427,000 //language = "";//kiribati; gil (this one must be used, but don't have ISO 639-1) (!) language = "en";//english //here also can be other languages: .de.ki = deutsch } else if (host_tld.equals("kw")) {//Kuwait /356,000 language = "ar";//arabic; ara } else if (host_tld.equals("ke")) {//Kenya /301,000 language = "sw";//swahili; swa; swh //language = "en";//english } else if (host_tld.equals("kh")) {//Cambodia /262,000 language = "km";//khmer; khm } else if (host_tld.equals("ky")) {//Cayman Islands /172,000 language = "en";//english } else if (host_tld.equals("kn")) {//Saint Kitts and Nevis /9,830 language = "en";//english } else if (host_tld.equals("km")) {//Comoros /533 //Comorian dialects ISO 639-3: zdj, wni, swb, wlc - must be used here language = "ar";//arabic; ara //language = "fr";//french; fre (fra); fra } else if (host_tld.equals("kp")) {//Democratic People's Republic of Korea /122 language = "ko";//korean; kor } break; case 'l': if (host_tld.equals("lv")) {//Latvia /6,970,000 language = "lv";//latvian; lav; lvs } else if (host_tld.equals("lt")) {//Lithuania /6,040,000 language = "lt";//lithuanian; lit } else if (host_tld.equals("lu")) {//Luxembourg /4,940,000 language = "lb";//luxembourgish; ltz (West Central German language familie; official 1984) //wide spoken, but not business or media //language = "fr";//french; fre (fra); fra (business) //language = "de";//german; ger (deu); ltz (media) } else if (host_tld.equals("li")) {//Liechtenstein /3,990,000 language = "de";//german; ger (deu); deu } else if (host_tld.equals("lb")) {//Lebanon /1,890,000 language = "ar";//arabic; ara } else if (host_tld.equals("lk")) {//Sri Lanka /1,770,000 language = "si";//sinhala; sin //language = "ta";//tamil; tam } else if (host_tld.equals("la")) {//Laos (Lao Peoples Democratic Republic) /932,000 language = "lo";//lao; lao } else if (host_tld.equals("ly")) {//Libya /388,000 language = "ar";//libyan arabic; ara; ayl } else if (host_tld.equals("lc")) {//Saint Lucia /86,400 language = "en";//english //language = "";//french creole; acf (ISO 639-3) //ISO 639-1 is missed + not official, but this is 95% speaking language - must be first (!) } else if (host_tld.equals("ls")) {//Lesotho /81,900 language = "st";//sotho; sot (97%) //language = "en";//english } else if (host_tld.equals("lr")) {//Liberia /588 language = "en";//english } break; case 'm': if (host_tld.equals("mx")) {//Mexico /13,700,000 language = "es";//spanish; spa } else if (host_tld.equals("my")) {//Malaysia /4,610,000 language = "en";//english (business) //language = "";//malaysian; zsm, zlm (maybe must be used here, but no ISO 639-1,2) } else if (host_tld.equals("md")) {//Moldova /3,230,000 language = "ro";//romanian; rum (ron); ron } else if (host_tld.equals("ma")) {//Morocco /3,030,000 language = "ar";//moroccan arabic; ara; ary //language = "fr";//french; fre (fra); fra //language = "";//amazigh (berber); ber; tzm (no ISO 639-1 code) } else if (host_tld.equals("mk")) {//Republic of Macedonia /2,980,000 language = "mk";//macedonian; mac (mkd); mkd } else if (host_tld.equals("ms")) {//Montserrat /2,160,000 language = "en";//english } else if (host_tld.equals("mt")) {//Malta /1,650,000 language = "mt";//maltese; mlt //100% speak Maltese, 88% English, 66% Italian //(but about 75-80% of sites have default english, support of maltese have ~50% of sites) } else if (host_tld.equals("mo")) {//Macau /1,310,000 language = "zh";//chinese; chi (zho); yue (cantonese) } else if (host_tld.equals("mn")) {//Mongolia /1,160,000 language = "mn";//Mongolian; mon; mon: khk } else if (host_tld.equals("mp")) {//Northern Mariana Islands /861,000 language = "en";//english //language = "ch";//chamorro; cha //language = "";//carolinian; ISO 639-3: cal (no ISO 639-1) } else if (host_tld.equals("mu")) {//Mauritius /651,000 language = "fr";//french; fre (fra); fra, mfe (predominant on media) //language = "en";//english (goverment) } else if (host_tld.equals("mm")) {//Myanmar /367,000 language = "my";//burmese; bur (mya); mya } else if (host_tld.equals("mc")) {//Monaco /307,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("me")) {//Montenegro /? language = "sh";//montenegrin (~serbo-croatian, near serbian); scr, scc; hbs (macrolanguage): srp (serbian) } else if (host_tld.equals("mz")) {//Mozambique /288,000 language = "pt";//portuguese; por //language = "";//makhuwa; vmw (ISO 639-3) } else if (host_tld.equals("mg")) {//Madagascar /255,000 language = "mg";//malagasy; mlg (mlg); mlg (macrolanguage): plt //language = "fr";//french; fre (fra); fra //malagasy is native language, but elite want to french } else if (host_tld.equals("mr")) {//Mauritania /210,000 language = "ar";//arabic; ara; mey //language = "fr";//french; fre (fra); fra } else if (host_tld.equals("mv")) {//Maldives /125,000 language = "dv";//dhivehi; div //English is used widely in commerce and increasingly in government schools. } else if (host_tld.equals("mw")) {//Malawi /87,000 //language = "ny";//chewa; nya language = "en";//english (founded sites in english only, include goverment) } else if (host_tld.equals("ml")) {//Mali /73,500 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("mq")) {//Martinique /19,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("mh")) {//Marshall Islands /53 language = "mh";//marshallese; mah //language = "en";//english } break; case 'n': if (host_tld.equals("no")) {//Norway /32,300,000 language = "no";//norwegian; nor (nob/nno) } else if (host_tld.equals("nz")) {//New Zealand /18,500,000 language = "en";//english //language = "mi";//maori; mao (mri); mri (4.2%) } else if (host_tld.equals("nu")) {//Niue /5,100,000 language = "en";//english //language = "";//niuean; niu (no ISO 639-1) (97.4% of native, but most are bilingual in English) } else if (host_tld.equals("ni")) {//Nicaragua /4,240,000 language = "es";//spanish; spa } else if (host_tld.equals("np")) {//Nepal /1,910,000 language = "ne";//nepali; nep } if (host_tld.equals("na")) {//Namibia /1,650,000 language = "af";//afrikaans; afr //language = "de";//German; ger (deu); deu //language = "ng";//ndonga (ovambo); kua (ndo); ndo //language = "en";//english //Official is English. //Northern majority of Namibians speak Oshiwambo as first language, //whereas the most widely understood and spoken Afrikaans. //Younger generation most widely understood English and Afrikaans. //Afrikaans is spoken by 60% of the WHITE community, German is spoken by 32%, //English is spoken by 7% and Portuguese by 1%. } else if (host_tld.equals("nr")) {//Nauru /466,000 //language = "na";//Nauruan; nau (50% - 66% at home) language = "en";//english (goverment + business, also .co.nr is free so here can be any) } else if (host_tld.equals("nc")) {//New Caledonia /265,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("ne")) {//Niger /151,000 language = "fr";//french; fre (fra); fra (official and elite) //language = "ha";//hausa; hau (50%) } else if (host_tld.equals("ng")) {//Nigeria /101,000 language = "en";//english } else if (host_tld.equals("nf")) {//Norfolk Island /54,900 language = "en";//english } break; case 'o': if (host_tld.equals("om")) {//Oman /204,000 language = "ar";//omani arabic; ara; acx //language = "en";//english (education and science is ar/en, but people speak mostly arabic) } break; case 'p': if (host_tld.equals("pl")) {//Poland /20,100,000 language = "pl";//polish; pol } else if (host_tld.equals("pt")) {//Portugal /9,100,000 language = "pt";//portuguese; por } else if (host_tld.equals("ph")) {//Philippines /4,080,000 language = "tl";//filipino; fil //language = "en";//english } else if (host_tld.equals("pk")) {//Pakistan /3,180,000 language = "ur";//urdu; urd (lingua franca and national language) //language = "en";//english (official language and used in business, government, and legal contracts) //language = "";//pakistani english;6:pake //(sase: South-Asian-English, engs: English Spoken) //language = "pa";//punjabi; pan //language = "ps";//pashto; pus; pst, pbt //language = "sd";//sindhi; snd //also Saraiki skr (no 1,2) and Balochi bal; bal (bgp, bgn, bcc) (no 1) } else if (host_tld.equals("pw")) {//Palau /3,010,000 language = "en";//english //language = "";//palauan; pau (no ISO 639-1) //language = "tl";//tagalog; tgl //language = "ja";//japanese; jpn } else if (host_tld.equals("pe")) {//Peru /2,740,000 language = "es";//spanish; spa (83.9%) //language = "qu";//quechua; que (13.2%) } else if (host_tld.equals("pr")) {//Puerto Rico /1,920,000 language = "es";//spanish; spa } else if (host_tld.equals("pa")) {//Panama /1,040,000 language = "es";//spanish; spa } else if (host_tld.equals("py")) {//Paraguay /962,000 language = "gn";//guarani; grn; gug (90%) //language = "es";//spanish; spa (87%) } else if (host_tld.equals("ps")) {//Palestinian territories /559,000 language = "ar";//palestinian arabic; ara; ajp } else if (host_tld.equals("pf")) {//French Polynesia /240,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("pg")) {//Papua New Guinea /211,000 language = "en";//english (also pidgin Tok Pisin) //language = "ho";//hiri motu; hmo } else if (host_tld.equals("pn")) {//Pitcairn Islands /80,900 language = "en";//english/pitkern (english creole); pih (ISO 639-3) //language = "en";//english (second language in schools) } else if (host_tld.equals("pm")) {//Saint-Pierre and Miquelon /184 language = "fr";//french; fre (fra); fra } break; case 'q': if (host_tld.equals("qa")) {//Qatar /259,000 language = "ar";//gulf arabic; ara; afb } break; case 'r': if (host_tld.equals("ru")) {//Russia /67,900,000 language = "ru";//russian; rus } else if (host_tld.equals("ro")) {//Romania /7,990,000 language = "ro";//daco-romanian; rum (ron); ron } else if (host_tld.equals("rs")) {//Serbia /? language = "sr";//serbian; srp } else if (host_tld.equals("re")) {//Reunion /146,000 language = "fr";//french; fre (fra); fra, rcf (Reunion Creole) } else if (host_tld.equals("rw")) {//Rwanda /131,000 language = "rw";//kinyarwanda; kin //language = "en";//english //language = "fr";//french; fre (fra); fra //language = "sw";//swahili; swa } break; case 's': if (host_tld.equals("se")) {//Sweden /39,000,000 language = "sv";//swedish; swe } else if (host_tld.equals("es")) {//Spain /31,000,000 language = "es";//spanish; spa } else if (host_tld.equals("sg")) {//Singapore /8,770,000 language = "zh";//singaporean mandarin (chinese); chi (zho); cmn (49.9%) //language = "en";//english (business, government and medium of instruction in schools) (32.3%) //language = "ms";//malay; may (msa); msa, zsm ("national language") (12.2%) //language = "ta";//tamil; tam } else if (host_tld.equals("sk")) {//Slovakia /8,040,000 language = "sk";//slovak; slo (slk); slk } else if (host_tld.equals("si")) {//Slovenia /4,420,000 language = "sl";//slovene; slv } else if (host_tld.equals("su")) {//Soviet Union /3,530,000 language = "ru";//russian; rus } else if (host_tld.equals("sa")) {//Saudi Arabia /2,770,000 language = "ar";//gulf arabic; ara; afb } else if (host_tld.equals("st")) {//Sao Tome and Principe /2,490,000 language = "pt";//portuguese; por (95%) //language = "pt";//forro (creole); por; cri (85%) //language = "pt";//angolar (creole); cpp; aoa (3%) //language = "fr";//french; fre (fra); fra (Francophonie -> learns in schools) } else if (host_tld.equals("sv")) {//El Salvador /1,320,000 language = "es";//spanish; spa //language = "";//nahuatl; nah; nlv and others (no ISO 639-1) //language = "";//mayan; myn (no ISO 639-1,3) //language = "";//q'eqchi'; kek (no ISO 639-1,2) } else if (host_tld.equals("sc")) {//Seychelles /949,000 language = "en";//english //language = "fr";//french; fre (fra); fra //language = "fr";//seychellois creole; fre (fra); crs } else if (host_tld.equals("sh")) {//Saint Helena /547,000 language = "en";//english } else if (host_tld.equals("sn")) {//Senegal /503,000 language = "wo";//wolof; wol (80%) //language = "fr";//french; fre (fra); fra //(understood ~15%-20% of all males and ~1%-2% of all women, but official) } else if (host_tld.equals("sr")) {//Suriname /242,000 language = "nl";//dutch; dut (nld); nld (education, government, business and the media) //language = "en";//sranan (suriname creole); srn; srn //language = "bh";//bhojpuri (Surinamese Hindi is a dialect of Bhojpuri); bho //language = "jv";//javanese; jvn } else if (host_tld.equals("sm")) {//San Marino /225,000 language = "it";//italian; ita } else if (host_tld.equals("sy")) {//Syria /115,000 language = "ar";//syrian arabic; ara; apc, ajp //language = "ku";//kurmanji (kurdish); kur; kmr } else if (host_tld.equals("sz")) {//Swaziland /81,500 language = "ss";//swazi; ssw //language = "en";//english } else if (host_tld.equals("sl")) {//Sierra Leone /13,800 language = "en";//Sierra Leone Krio (english); eng; kri (97% spoken) //language = "en";//english (official) } else if (host_tld.equals("sb")) {//Solomon Islands /11,800 language = "en";//Pijin (Solomons Pidgin or Neo-Solomonic); cpe; pis //language = "en";//english (12%) } else if (host_tld.equals("sd")) {//Sudan /11,700 language = "ar";//sudanese arabic; ara; apd //language = "en";//english //english and arabic promoted by goverment (english for education and official) } else if (host_tld.equals("so")) {//Somalia /512 language = "so";//somali; som //language = "ar";//hadhrami arabic; ara; ayh //language = "en";//english //language = "it";//italian; ita //language = "sw";//bravanese (swahili); swa; swh } else if (host_tld.equals("ss")) {//South Sudan /? language = "en";//english //language = "ar";//juba arabic; ara; pga //language = "";//dinka; din (no ISO 639-1) //English and Juba Arabic are the official languages, although Dinka is the most widely spoken } break; case 't': if (host_tld.equals("tw")) {//Republic of China (Taiwan) /14,000,000 language = "zh";//chinese; chi (zho); cmn - Mandarin (Modern Standard Mandarin) } else if (host_tld.equals("tr")) {//Turkey /8,310,000 language = "tr";//turkish; tur } else if (host_tld.equals("tv")) {//Tuvalu /7,170,000 //used for TV, domain currently operated by dotTV, a VeriSign company //the Tuvalu government owns twenty percent of the company //language = "";//tuvaluan; tvl (no ISO 639-1) (close to Maori(mi), Tahitian(ty), Samoan(sm), Tongan(to)) language = "en";//english } else if (host_tld.equals("th")) {//Thailand /6,470,000 language = "th";//thai; tha } else if (host_tld.equals("tc")) {//Turks and Caicos Islands /2,610,000 //language = "en";//english language = "en";//turks and caicos islands creole; eng; tch } else if (host_tld.equals("to")) {//Tonga /2,490,000 //Often used unofficially for Torrent, Toronto, or Tokyo language = "to";//tongan; ton //language = "en";//english } else if (host_tld.equals("tk")) {//Tokelau /2,170,000 //Also used as a free domain service to the public (so maybe english here) language = "to";//tokelauan; tvl/ton; tkl (no ISO 639-1,2) //to - has marked similarities to the Niuafo'ou language of Tonga //tvl - Tokelauan is a Polynesian language closely related to Tuvaluan //language = "en";//english (main language is Tokelauan, but English is also spoken) } else if (host_tld.equals("tt")) {//Trinidad and Tobago /1,170,000 language = "en";//trinidadian english (official) //language = "en";//trinidadian creole; eng; trf (main spoken) //language = "en";//tobagonian creole; eng; tgh (main spoken) } else if (host_tld.equals("tn")) {//Tunisia /1,060,000 language = "ar";//tunisian arabic; ara; aeb } else if (host_tld.equals("tf")) {//French Southern and Antarctic Lands /777,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("tz")) {//Tanzania /405,000 language = "sw";//swahili; swa; swh //language = "en";//english (Higher courts, higher education) } else if (host_tld.equals("tj")) {//Tajikistan /153,000 language = "tg";//tajik; tgk //language = "ru";//russian; rus (wide in businness) } else if (host_tld.equals("tp")) {//East Timor /151,000 language = "pt";//portuguese; por //language = "en";//english } else if (host_tld.equals("tm")) {//Turkmenistan /136,000 language = "tk";//turkmen; tuk } else if (host_tld.equals("tg")) {//Togo /36,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("tl")) {//East Timor (Timor-Leste) /18,100 //language = "";//tetum; tet (no ISO 639-1) language = "id";//indonesian; ind //language = "pt";//portuguese; por (5% literally, 25-50% listeners) //language = "en";//english } else if (host_tld.equals("td")) {//Chad /332 language = "ar";//chadian arabic; ara; shu //language = "ar";//arabic; ara //language = "fr";//french; fre (fra); fra } break; case 'u': if (host_tld.equals("uk")) {//United Kingdom of Great Britain and Northern Ireland /473,000,000 language = "en";//english } else if (host_tld.equals("us")) {//United States of America /68,300,000 language = "en";//english } else if (host_tld.equals("ua")) {//Ukraine /6,820,000 language = "uk";//ukrainian; ukr } else if (host_tld.equals("uz")) {//Uzbekistan /2,610,000 language = "uz";//uzbek; uzb //language = "ru";//russian; rus (14% native) } else if (host_tld.equals("uy")) {//Uruguay /2,020,000 language = "es";//spanish; spa //language = "en";//english } else if (host_tld.equals("ug")) {//Uganda /337,000 language = "sw";//swahili; swa; swc //language = "en";//english (also ugandan english) //language = "lg";//ganda; lug (not all territory) } break; case 'v': if (host_tld.equals("vu")) {//Vanuatu /5,050,000 language = "en";//english (education) //language = "bi";//bislama; bis (creole language, used as pidgin) //language = "fr";//french; fre (fra); fra (education) //many native languages, but no-one primary } else if (host_tld.equals("ve")) {//Venezuela /3,050,000 language = "es";//spanish; spa //language = "en";//english //language = "it";//italian; ita //also many indigenous languages } else if (host_tld.equals("vn")) {//Vietnam /2,490,000 language = "vi";//vietnamese; vie } else if (host_tld.equals("va")) {//Vatican City /852,000 language = "it";//italian; ita } else if (host_tld.equals("vg")) {//British Virgin Islands /882,000 language = "en";//english //language = "en";//virgin islands creole english; eng; vic } else if (host_tld.equals("vc")) {//Saint Vincent and the Grenadines /239,000 language = "en";//english //language = "en";//vincentiancreole; eng; svc (home and friends) //language = "bh";//bhojpuri; bho (east indian language) //native indians 2% and no data about their language } else if (host_tld.equals("vi")) {//United States Virgin Islands /202,000 language = "en";//english //language = "en";//virgin islands creole english; eng; vic //language = "es";//spanish; spa //language = "fr";//french; fre (fra); fra } break; case 'w': if (host_tld.equals("ws")) {//Samoa /3,000,000 language = "sm";//Samoan; smo (most people) //but maybe english from the world also (!) } else if (host_tld.equals("wf")) {//Wallis and Futuna /30 language = "fr";//french; fre (fra); fra //language = "";//wallisian; wls (no ISO 639-1,2) //language = "";//futunan; fud (no ISO 639-1,2) //could: wallisian+futunan=88.5%; french=78.2% //had no knowledge: wallisian|futunan=7.2%; french=17.3% (!) } break; case 'x': break; case 'y': if (host_tld.equals("yu")) {//Yugoslavia /3,270,000 language = "sh";//serbo-croatian; scr, scc; hbs (srp, hrv, bos) } else if (host_tld.equals("ye")) {//Yemen /93,800 language = "ar";//yemeni arabic; ara; ayh (hadhrami), ayn (aanaani), acq(ta'izzi-adeni) } else if (host_tld.equals("yt")) {//Mayotte /34 language = "fr";//french; fre (fra); fra (55% read/write) //language = "sw";//maore comorian; swa; swb (41% r/w) //language = "ar";//yemeni arabic; ara (33% r/w) } break; case 'z': if (host_tld.equals("za")) {//South Africa /16,400,000 //language = "zu";//zulu; zul (23.8%) //language = "xh";//xhosa; xho (17.6%) language = "af";//afrikaans; afr (13.3%) //language = "en";//english; (8.2%, but language of commerce and science) //need research (!) } else if (host_tld.equals("zw")) {//Zimbabwe /507,000 language = "sn";//shona; sna (70%) //language = "nd";//ndebele; nde (20%) //language = "en"//english (2.5%, but traditionally used for official business) } else if (host_tld.equals("zm")) {//Zambia /324,000 language = "en";//english (official business and is the medium of instruction in schools) //language = "ny";//chewa; nya } break; } break; case 3: if (host_tld.equals("cat")) {//Catalan linguistic and cultural community /22,479 language = "ca";//catalan; cat } break; case 8: if (host_tld.equals("xn--p1ai")) {//Russia/Cyrillic /67,900,000* language = "ru";//russian; rus } else if (host_tld.equals("xn--node")) {//Georgia/Georgian /2,480,000* language = "ka";//georgian; geo (kat); kat //Proposed } break; case 9: if (host_tld.equals("xn--j1amh")) {//Ukraine/Cyrillic /6,820,000* language = "uk";//ukrainian; ukr //Proposed } break; case 10: if (host_tld.equals("xn--fiqs8s")) {//China/Simplified Chinese /26,700,000* language = "zh";//chinese; chi (zho); cmn - Mandarin (Modern Standard Mandarin) } else if (host_tld.equals("xn--fiqz9s")) {//China/Traditional Chinese /26,700,000* language = "zh";//chinese; chi (zho); cmn - Mandarin (Modern Standard Mandarin) } else if (host_tld.equals("xn--o3cw4h")) {//Thailand/Thai script /6,470,000* language = "th";//thai; tha } else if (host_tld.equals("xn--wgbh1c")) {//Egypt/Arabic /2,990,000* language = "ar";//modern standard arabic; ara; arb } else if (host_tld.equals("xn--wgbl6a")) {//Qatar/Arabic /259,000* language = "ar";//gulf arabic; ara; afb } else if (host_tld.equals("xn--90a3ac")) {//Serbia/Cyrillic /? language = "sr";//serbian; srp } else if (host_tld.equals("xn--wgv71a")) {//Japan/Japanese /139,000,000* language = "ja";//japanese; jpn //Proposed } break; case 11: if (host_tld.equals("xn--kprw13d")) {//Taiwan/Simplified Chinese /14,000,000* language = "zh";//chinese; chi (zho); cmn - Mandarin (Modern Standard Mandarin) } else if (host_tld.equals("xn--kpry57d")) {//Taiwan/Simplified Chinese /14,000,000* language = "zh";//chinese; chi (zho); cmn - Mandarin (Modern Standard Mandarin) } else if (host_tld.equals("xn--j6w193g")) {//Hong Kong/Traditional Chinese /9,510,000* language = "zh";//chinese; chi (zho, cmn) } else if (host_tld.equals("xn--h2brj9c")) {//India/Devanagari /9,330,000* language = "hi";//hindi; hin } else if (host_tld.equals("xn--gecrj9c")) {//India/Gujarati /9,330,000* language = "gu";//gujarati; guj //also can be Kutchi and Hindi } else if (host_tld.equals("xn--s9brj9c")) {//India/Gurmukhi /9,330,000* language = "pa";//punjabi; pan } else if (host_tld.equals("xn--45brj9c")) {//India/Bengali /9,330,000* language = "bn";//bengali; ben } else if (host_tld.equals("xn--pgbs0dh")) {//Tunisia/Arabic /1,060,000* language = "ar";//tunisian arabic; ara; aeb } else if (host_tld.equals("xn--80ao21a")) {//Kazakhstan/Cyrillic /2,680,000* language = "kk";//kazakh; kaz //Proposed } break; case 12: if (host_tld.equals("xn--3e0b707e")) {//South Korea/Hangul /13,700,000* language = "ko";//korean; kor } else if (host_tld.equals("xn--mgbtf8fl")) {//Syria/Arabic /115,000* language = "ar";//syrian arabic; ara; apc, ajp } else if (host_tld.equals("xn--4dbrk0ce")) {//Israel/Hebrew /17,800,000* language = "he";//hebrew; heb //Proposed } else if (host_tld.equals("xn--mgb9awbf")) {//Oman/Arabic /204,000 language = "ar";//omani arabic; ara; acx //Proposed } else if (host_tld.equals("xn--mgb2ddes")) {//Yemen/Arabic /93,800* language = "ar";//yemeni arabic; ara; ayh (hadhrami), ayn (aanaani), acq(ta'izzi-adeni) //Proposed } break; case 13: if (host_tld.equals("xn--fpcrj9c3d")) {//India/Telugu /9,330,000* language = "te";//telugu; tel } else if (host_tld.equals("xn--yfro4i67o")) {//Singapore/Chinese /8,770,000* language = "zh";//singaporean mandarin (chinese); chi (zho); cmn } else if (host_tld.equals("xn--fzc2c9e2c")) {//Sri Lanka/Sinhala language /1,770,000* language = "si";//sinhala; sin } else if (host_tld.equals("xn--ygbi2ammx")) {//Palestinian Territory/Arabic /559,000* language = "ar";//palestinian arabic; ara; ajp } break; case 14: if (host_tld.equals("xn--mgbbh1a71e")) {//India/Urdu /9,330,000* language = "ur";//urdu; urd } else if (host_tld.equals("xn--mgbaam7a8h")) {//United Arab Emirates/Arabic /3,310,000* language = "ar";//arabic } else if (host_tld.equals("xn--mgbayh7gpa")) {//Jordan/Arabic /601,000* language = "ar";//jordanian arabic; ara; ajp } else if (host_tld.equals("xn--mgbx4cd0ab")) {//Malaysia/Arabic(Jawi alphabet?) /4,610,000* language = "ar";//arabic //Proposed (why not malay?) } else if (host_tld.equals("xn--54b7fta0cc")) {//Bangladesh/Bengali /342,000* language = "bn";//bengali; ben //Proposed } break; case 15: if (host_tld.equals("xn--mgbc0a9azcg")) {//Morocco/Arabic /3,030,000* language = "ar";//moroccan arabic; ara; ary } else if (host_tld.equals("xn--mgba3a4f16a")) {//Iran/Persian /2,940,000* language = "fa";//persian; per (fas); pes } else if (host_tld.equals("xn--lgbbat1ad8j")) {//Algeria/Arabic /326,000* language = "ar";//arabic; ara; arq } break; case 16: if (host_tld.equals("xn--xkc2al3hye2a")) {//Sri Lanka/Tamil /1,770,000* language = "ta";//tamil; tam } break; case 17: if (host_tld.equals("xn--xkc2dl3a5ee0h")) {//India/Tamil /9,330,000* language = "ta";//tamil; tam //Badaga (ISO 639-3:bfq), Irula (ISO 639-3:iru), Paniya (ISO 639-3:pcg) } else if (host_tld.equals("xn--mgberp4a5d4ar")) {//Saudi Arabia/Arabic /2,770,000* language = "ar";//gulf arabic; ara; afb } else if (host_tld.equals("xn--mgbai9azgqp6j")) {//Pakistan/Arabic /3,180,000* language = "ar";//arabic //Proposed (why not urdu?) //language = "ur";//urdu; urd (lingua franca and national language) } break; case 22: if (host_tld.equals("xn--clchc0ea0b2g2a9gcd")) {//Singapore/Tamil /8,770,000* language = "ta";//tamil; tam } //* - stats from ccTLD break; default: break; } //6: ISO 639-6 Part 6: Alpha-4 - most of small languages from ISO 639-3 not exists. //ISO 639-2 languages included, but not all. return language; } // The MultiProtocolURI may be used to integrate File- and SMB accessed into one object // some extraction methods that generate File/SmbFile objects from the MultiProtocolURI /** * create a standard java URL. * Please call isHTTP(), isHTTPS() and isFTP() before using this class */ public java.net.URL getURL() throws MalformedURLException { if (!(isHTTP() || isHTTPS() || isFTP())) throw new MalformedURLException(); return new java.net.URL(this.toNormalform(false)); } /** * create a standard java File. * Please call isFile() before using this class */ public java.io.File getFSFile() throws MalformedURLException { if (!isFile()) throw new MalformedURLException(); return new java.io.File(unescape(this.toNormalform(true)).substring("file://".length())); } /** * create a smb File * Please call isSMB() before using this class * @throws MalformedURLException */ public SmbFile getSmbFile() throws MalformedURLException { if (!isSMB()) throw new MalformedURLException(); final String url = unescape(this.toNormalform(true)); return new SmbFile(url); } // some methods that let the MultiProtocolURI look like a java.io.File object // to use these methods the object must be either of type isFile() or isSMB() public boolean exists() throws IOException { if (isFile()) return getFSFile().exists(); if (isSMB()) try { return TimeoutRequest.exists(getSmbFile(), SMB_TIMEOUT); } catch (final SmbException e) { throw new IOException( "SMB.exists SmbException (" + e.getMessage() + ") for " + toNormalform(false)); } catch (final MalformedURLException e) { throw new IOException( "SMB.exists MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false)); } return false; } public boolean canRead() throws IOException { if (isFile()) return getFSFile().canRead(); if (isSMB()) try { return TimeoutRequest.canRead(getSmbFile(), SMB_TIMEOUT); } catch (final SmbException e) { throw new IOException( "SMB.canRead SmbException (" + e.getMessage() + ") for " + toNormalform(false)); } catch (final MalformedURLException e) { throw new IOException( "SMB.canRead MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false)); } return false; } public boolean canWrite() throws IOException { if (isFile()) return getFSFile().canWrite(); if (isSMB()) try { return TimeoutRequest.canWrite(getSmbFile(), SMB_TIMEOUT); } catch (final SmbException e) { throw new IOException( "SMB.canWrite SmbException (" + e.getMessage() + ") for " + toNormalform(false)); } catch (final MalformedURLException e) { throw new IOException( "SMB.canWrite MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false)); } return false; } public boolean isHidden() throws IOException { if (isFile()) return getFSFile().isHidden(); if (isSMB()) try { return TimeoutRequest.isHidden(getSmbFile(), SMB_TIMEOUT); } catch (final SmbException e) { throw new IOException( "SMB.isHidden SmbException (" + e.getMessage() + ") for " + toNormalform(false)); } catch (final MalformedURLException e) { throw new IOException( "SMB.isHidden MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false)); } return false; } public boolean isDirectory() throws IOException { if (isFile()) return getFSFile().isDirectory(); if (isSMB()) try { return TimeoutRequest.isDirectory(getSmbFile(), SMB_TIMEOUT); } catch (final SmbException e) { throw new IOException( "SMB.isDirectory SmbException (" + e.getMessage() + ") for " + toNormalform(false)); } catch (final MalformedURLException e) { throw new IOException("SMB.isDirectory MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false)); } return false; } public long length() { if (isFile()) try { return getFSFile().length(); } catch (final Throwable e) { ConcurrentLog.logException(e); return -1; } if (isSMB()) try { return getSmbFile().length(); //return TimeoutRequest.length(getSmbFile(), SMB_TIMEOUT); // a timeout request is a bad idea, that will create a lot of concurrent threads during crawling } catch (final Throwable e) { ConcurrentLog.logException(e); return -1; } return -1; } public long lastModified() throws IOException { if (isFile()) return getFSFile().lastModified(); if (isSMB()) try { return getSmbFile().lastModified(); // return TimeoutRequest.lastModified(getSmbFile(), SMB_TIMEOUT); // a timeout request is a bad idea, that will create a lot of concurrent threads during crawling } catch (final SmbException e) { throw new IOException( "SMB.lastModified SmbException (" + e.getMessage() + ") for " + toNormalform(false)); } catch (final MalformedURLException e) { throw new IOException("SMB.lastModified MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false)); } return 0; } public String getName() throws IOException { if (isFile()) return getFSFile().getName(); if (isSMB()) try { return getSmbFile().getName(); } catch (final MalformedURLException e) { throw new IOException( "SMB.getName MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false)); } if (isFTP()) { return this.getFileName(); } return null; } /** * Get directory listing of file or smb url * respects the hidden attribute of a directory (return null if hidden) * * @return names of files and directories or null * @throws IOException */ public String[] list() throws IOException { if (isFile() && !isHidden()) return getFSFile().list(); if (isSMB()) try { final SmbFile sf = getSmbFile(); if (!sf.isDirectory() || sf.isHidden()) return null; try { return TimeoutRequest.list(sf, SMB_TIMEOUT); } catch (final SmbException e) { throw new IOException("SMB.list SmbException for " + sf.toString() + ": " + e.getMessage()); } } catch (final MalformedURLException e) { throw new IOException( "SMB.list MalformedURLException for " + toNormalform(false) + ": " + e.getMessage()); } return null; } /** * Open an input stream on the resource described by this URL. * <strong>Please don't forget to release resources by closing the returned stream.</strong> * @param agent user agent identifier to use when the protocul is HTTP * @return an open input stream * @throws IOException when the stream can not be opened */ public InputStream getInputStream(final ClientIdentification.Agent agent) throws IOException { if (isFile()) return new BufferedInputStream(new FileInputStream(getFSFile())); if (isSMB()) return new BufferedInputStream(new SmbFileInputStream(getSmbFile())); if (isFTP()) { final FTPClient client = new FTPClient(); client.open(this.host, this.port < 0 ? 21 : this.port); final byte[] b = client.get(this.path); client.CLOSE(); return new ByteArrayInputStream(b); } if (isHTTP() || isHTTPS()) { final HTTPClient client = new HTTPClient(agent); client.setHost(getHost()); client.GET(this, false); if (client.getStatusCode() != HttpStatus.SC_OK) { throw new IOException("Unable to open http stream on " + this.toString() + "\nServer returned status: " + client.getHttpResponse().getStatusLine()); } return new HTTPInputStream(client); } return null; } public byte[] get(final ClientIdentification.Agent agent, final String username, final String pass) throws IOException { if (isFile()) return read(new FileInputStream(getFSFile())); if (isSMB()) return read(new SmbFileInputStream(getSmbFile())); if (isFTP()) { final FTPClient client = new FTPClient(); client.open(this.host, this.port < 0 ? 21 : this.port); final byte[] b = client.get(this.path); client.CLOSE(); return b; } if (isHTTP() || isHTTPS()) { final HTTPClient client = new HTTPClient(agent); client.setHost(getHost()); return client.GETbytes(this, username, pass, false); } return null; } /** * Read fully the source, close it and return its content as a bytes array. * @param source the source to read * @return return the content of the source stream * @throws IOException when an erro occured */ public static byte[] read(final InputStream source) throws IOException { try { final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final byte[] buffer = new byte[2048]; int c; while ((c = source.read(buffer, 0, 2048)) > 0) baos.write(buffer, 0, c); baos.flush(); baos.close(); return baos.toByteArray(); } finally { try { source.close(); } catch (IOException ignored) { } } } public Locale getLocale() { if (this.hostAddress != null) { final Locale locale = Domains.getLocale(this.hostAddress); if (locale != null && locale.getCountry() != null && locale.getCountry().length() > 0) return locale; } /* if (this.hostAddress != null) { return Domains.getLocale(this.hostAddress); } */ return Domains.getLocale(this.host); } //--------------------- private static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"'; public static final Pattern splitpattern = Pattern.compile(splitrex); public static String[] urlComps(String normalizedURL) { final int p = normalizedURL.indexOf("//", 0); if (p > 0) normalizedURL = normalizedURL.substring(p + 2); // TODO lowering case in a locale sensitive manner makes sense here, but the used language locale should not dependant on the default system locale return splitpattern.split(normalizedURL.toLowerCase()); // word components of the url } public static void main(final String[] args) { final String[][] test = new String[][] { new String[] { null, "file://y:/" }, new String[] { null, "file://y:/yacy" }, new String[] { null, "file://y:/yacy/" }, new String[] { null, "file://y:" }, new String[] { null, "file://Z:admin\\home" }, // thats wrong but may appear new String[] { null, "file://Z:\\admin\\home" }, new String[] { null, "https://www.example.com/shoe/?p=2&ps=75#t={%22san_NaviPaging%22:2}" }, // ugly strange pagination link new String[] { null, "C:WINDOWS\\CMD0.EXE" }, new String[] { null, "file://C:WINDOWS\\CMD0.EXE" }, new String[] { null, "file:///bin/yacy2" }, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/' new String[] { null, "file:/bin/yacy1" }, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/' new String[] { null, "file:C:WINDOWS\\CMD.EXE" }, new String[] { null, "file:///C:WINDOWS\\CMD1.EXE" }, new String[] { null, "file:///C|WINDOWS\\CMD2.EXE" }, new String[] { null, "http://www.anomic.de/test/" }, new String[] { null, "http://www.anomic.de/" }, new String[] { null, "http://www.anomic.de" }, new String[] { null, "http://www.anomic.de/home/test?x=1#home" }, new String[] { null, "http://www.anomic.de/home/test?x=1" }, new String[] { null, "http://www.anomic.de/home/test#home" }, new String[] { null, "ftp://ftp.anomic.de/home/test#home" }, new String[] { null, "ftp://bob:builder@ftp.anomic.de/home/test.gif" }, new String[] { null, "http://www.anomic.de/home/../abc/" }, new String[] { null, "mailto:abcdefg@nomailnomail.com" }, new String[] { "http://www.anomic.de/home", "test" }, new String[] { "http://www.anomic.de/home", "test/" }, new String[] { "http://www.anomic.de/home/", "test" }, new String[] { "http://www.anomic.de/home/", "test/" }, new String[] { "http://www.anomic.de/home/index.html", "test.htm" }, new String[] { "http://www.anomic.de/home/index.html", "http://www.yacy.net/test" }, new String[] { "http://www.anomic.de/home/index.html", "ftp://ftp.yacy.net/test" }, new String[] { "http://www.anomic.de/home/index.html", "../test" }, new String[] { "http://www.anomic.de/home/index.html", "mailto:abcdefg@nomailnomail.com" }, new String[] { null, "news:de.test" }, new String[] { "http://www.anomic.de/home", "news:de.test" }, new String[] { null, "mailto:bob@web.com" }, new String[] { "http://www.anomic.de/home", "mailto:bob@web.com" }, new String[] { "http://www.anomic.de/home", "ftp://ftp.anomic.de/src" }, new String[] { null, "ftp://ftp.delegate.org/" }, new String[] { "http://www.anomic.de/home", "ftp://ftp.delegate.org/" }, new String[] { "http://www.anomic.de", "mailto:yacy@weltherrschaft.org" }, new String[] { "http://www.anomic.de", "javascipt:temp" }, new String[] { null, "http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history" }, new String[] { null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585" }, new String[] { null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585" }, new String[] { null, "http://www.scc.kit.edu/publikationen/80.php?PHPSESSID=5f3624d3e1c33d4c086ab600d4d5f5a1" }, new String[] { null, "smb://localhost/" }, new String[] { null, "smb://localhost/repository" }, // paths must end with '/' new String[] { null, "smb://localhost/repository/" }, new String[] { null, "\\\\localhost\\" }, // Windows-like notion of smb shares new String[] { null, "\\\\localhost\\repository" }, new String[] { null, "\\\\localhost\\repository\\" }, new String[] { null, "http://test.net/test1.htm?s=multiple&a=amp&b=in&c=url" }, new String[] { null, "http://test.net/test2.htm?s=multiple&amp;amp;amp;a=amp" }, new String[] { null, "http://validator.w3.org/check?uri=http://www.anomic.de/" } }; //MultiProtocolURI.initSessionIDNames(FileUtils.loadList(new File("defaults/sessionid.names"))); String environment, url; MultiProtocolURL aURL, aURL1; java.net.URL jURL; for (String[] element : test) { environment = element[0]; url = element[1]; try { aURL = MultiProtocolURL.newURL(environment, url); } catch (final MalformedURLException e) { e.printStackTrace(); aURL = null; } if (environment == null) { try { jURL = new java.net.URL(url); } catch (final MalformedURLException e) { jURL = null; } } else { try { jURL = new java.net.URL(new java.net.URL(environment), url); } catch (final MalformedURLException e) { jURL = null; } } // check equality to java.net.URL if (((aURL == null) && (jURL != null)) || ((aURL != null) && (jURL == null)) || ((aURL != null) && (jURL != null) && (!(jURL.toString().equals(aURL.toNormalform(false)))))) { System.out.println("Difference for environment=" + environment + ", url=" + url + ":"); System.out.println((jURL == null) ? "jURL rejected input" : "jURL=" + jURL.toString()); System.out.println((aURL == null) ? "aURL rejected input" : "aURL=" + aURL.toNormalform(false) + "; host=" + aURL.getHost() + "; path=" + aURL.getPath() + "; file=" + aURL.getFile()); } if (aURL != null && jURL != null && jURL.toString().equals(aURL.toNormalform(false))) { System.out.println("jURL == aURL=" + aURL.toNormalform(false) + "; host=" + aURL.getHost() + "; path=" + aURL.getPath() + "; file=" + aURL.getFile()); } // check stability: the normalform of the normalform must be equal to the normalform if (aURL != null) try { aURL1 = new MultiProtocolURL(aURL.toNormalform(false)); if (!(aURL1.toNormalform(false).equals(aURL.toNormalform(false)))) { System.out.println("no stability for url:"); System.out.println("aURL0=" + aURL.toNormalform(false)); System.out.println("aURL1=" + aURL1.toNormalform(false)); } } catch (final MalformedURLException e) { System.out.println("no stability for url:"); System.out.println("aURL0=" + aURL.toNormalform(false)); System.out.println("aURL1 cannot be computed:" + e.getMessage()); } } } }