List of usage examples for java.net.URL.getRef()
public String getRef()
From source file:org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer.java
public String normalize(String urlString, String scope) throws MalformedURLException { if ("".equals(urlString)) // permit empty return urlString; urlString = urlString.trim(); // remove extra spaces URL url = new URL(urlString); String protocol = url.getProtocol(); String host = url.getHost();// w w w. j a v a 2 s . c om int port = url.getPort(); String file = url.getFile(); boolean changed = false; if (!urlString.startsWith(protocol)) // protocol was lowercased changed = true; if ("http".equals(protocol) || "ftp".equals(protocol)) { if (host != null) { String newHost = host.toLowerCase(); // lowercase host if (!host.equals(newHost)) { host = newHost; changed = true; } } if (port == url.getDefaultPort()) { // uses default port port = -1; // so don't specify it changed = true; } if (file == null || "".equals(file)) { // add a slash file = "/"; changed = true; } if (url.getRef() != null) { // remove the ref changed = true; } // check for unnecessary use of "/../" String file2 = substituteUnnecessaryRelativePaths(file); if (!file.equals(file2)) { changed = true; file = file2; } } if (changed) urlString = new URL(protocol, host, port, file).toString(); return urlString; }
From source file:com.iflytek.spider.net.BasicURLNormalizer.java
public String normalize(String urlString) throws MalformedURLException { if ("".equals(urlString)) // permit empty return urlString; urlString = urlString.trim(); // remove extra spaces URL url = new URL(urlString); String protocol = url.getProtocol(); String host = url.getHost();//w ww .j a v a2s . com int port = url.getPort(); String file = url.getFile(); boolean changed = false; if (!urlString.startsWith(protocol)) // protocol was lowercased changed = true; if ("http".equals(protocol) || "ftp".equals(protocol)) { if (host != null) { String newHost = host.toLowerCase(); // lowercase host if (!host.equals(newHost)) { host = newHost; changed = true; } } if (port == url.getDefaultPort()) { // uses default port port = -1; // so don't specify it changed = true; } if (file == null || "".equals(file)) { // add a slash file = "/"; changed = true; } if (url.getRef() != null) { // remove the ref changed = true; } // check for unnecessary use of "/../" String file2 = substituteUnnecessaryRelativePaths(file); if (!file.equals(file2)) { changed = true; file = file2; } } if (changed) urlString = new URL(protocol, host, port, file).toString(); return urlString; }
From source file:com.digitalpebble.storm.crawler.filtering.basic.BasicURLNormalizer.java
/** * Basic filter to remove query parameters from urls so parameters that * don't change the content of the page can be removed. An example would be * a google analytics query parameter like "utm_campaign" which might have * several different values for a url that points to the same content. *///from w w w.ja v a 2s .com private String filterQueryElements(String urlToFilter) { try { // Handle illegal characters by making a url first // this will clean illegal characters like | URL url = new URL(urlToFilter); if (StringUtils.isEmpty(url.getQuery())) { return urlToFilter; } List<NameValuePair> pairs = new ArrayList<NameValuePair>(); URLEncodedUtils.parse(pairs, new Scanner(url.getQuery()), "UTF-8"); Iterator<NameValuePair> pairsIterator = pairs.iterator(); while (pairsIterator.hasNext()) { NameValuePair param = pairsIterator.next(); if (queryElementsToRemove.contains(param.getName())) { pairsIterator.remove(); } } StringBuilder newFile = new StringBuilder(); if (url.getPath() != null) { newFile.append(url.getPath()); } if (!pairs.isEmpty()) { Collections.sort(pairs, comp); String newQueryString = URLEncodedUtils.format(pairs, StandardCharsets.UTF_8); newFile.append('?').append(newQueryString); } if (url.getRef() != null) { newFile.append('#').append(url.getRef()); } return new URL(url.getProtocol(), url.getHost(), url.getPort(), newFile.toString()).toString(); } catch (MalformedURLException e) { LOG.warn("Invalid urlToFilter {}. {}", urlToFilter, e); return null; } }
From source file:org.codice.alliance.nsili.client.SampleNsiliClient.java
/**
 * Converts a URL string into a {@link URI} assembled from its individual components.
 *
 * <p>The string is parsed as a {@link URL} and its protocol, user info, host, port, path,
 * query and ref are handed to the seven-argument {@link URI} constructor.
 *
 * @param urlString the URL to convert
 * @return the URI built from the components of {@code urlString}
 * @throws URISyntaxException if the components do not form a valid URI
 * @throws MalformedURLException if {@code urlString} cannot be parsed as a URL
 */
private URI getEncodedUriFromString(String urlString) throws URISyntaxException, MalformedURLException {
    final URL parsed = new URL(urlString);

    final String scheme = parsed.getProtocol();
    final String userInfo = parsed.getUserInfo();
    final String host = parsed.getHost();
    final int port = parsed.getPort();
    final String path = parsed.getPath();
    final String query = parsed.getQuery();
    final String fragment = parsed.getRef();

    return new URI(scheme, userInfo, host, port, path, query, fragment);
}
From source file:org.eclipse.smila.connectivity.framework.crawler.web.net.BasicUrlNormalizer.java
/** * {@inheritDoc}/* w w w .ja v a2 s .c o m*/ */ public String normalize(String urlString) throws MalformedURLException { if ("".equals(urlString)) { return urlString; } // remove extra spaces urlString = urlString.trim(); final URL url = new URL(urlString); final String protocol = url.getProtocol(); String host = url.getHost(); int port = url.getPort(); String file = url.getFile(); boolean changed = false; if (!urlString.startsWith(protocol)) { changed = true; } if ("http".equals(protocol) || "ftp".equals(protocol)) { if (host != null) { final String newHost = host.toLowerCase(); // lower case host if (!host.equals(newHost)) { host = newHost; changed = true; } } if (port == url.getDefaultPort()) { // uses default port port = -1; // so don't specify it changed = true; } if (file == null || "".equals(file)) { // add a slash file = "/"; changed = true; } if (url.getRef() != null) { // remove the reference changed = true; } // check for unnecessary use of "/../" final String file2 = substituteUnnecessaryRelativePaths(file); if (!file.equals(file2)) { changed = true; file = file2; } } if (changed) { urlString = new URL(protocol, host, port, file).toString(); } return urlString; }
From source file:com.digitalpebble.stormcrawler.filtering.basic.BasicURLNormalizer.java
/** * Basic filter to remove query parameters from urls so parameters that * don't change the content of the page can be removed. An example would be * a google analytics query parameter like "utm_campaign" which might have * several different values for a url that points to the same content. *//*from w w w. j a v a 2s .co m*/ private String filterQueryElements(String urlToFilter) { try { // Handle illegal characters by making a url first // this will clean illegal characters like | URL url = new URL(urlToFilter); if (StringUtils.isEmpty(url.getQuery())) { return urlToFilter; } List<NameValuePair> pairs = new ArrayList<>(); URLEncodedUtils.parse(pairs, new Scanner(url.getQuery()), "UTF-8"); Iterator<NameValuePair> pairsIterator = pairs.iterator(); while (pairsIterator.hasNext()) { NameValuePair param = pairsIterator.next(); if (queryElementsToRemove.contains(param.getName())) { pairsIterator.remove(); } } StringBuilder newFile = new StringBuilder(); if (url.getPath() != null) { newFile.append(url.getPath()); } if (!pairs.isEmpty()) { Collections.sort(pairs, comp); String newQueryString = URLEncodedUtils.format(pairs, StandardCharsets.UTF_8); newFile.append('?').append(newQueryString); } if (url.getRef() != null) { newFile.append('#').append(url.getRef()); } return new URL(url.getProtocol(), url.getHost(), url.getPort(), newFile.toString()).toString(); } catch (MalformedURLException e) { LOG.warn("Invalid urlToFilter {}. {}", urlToFilter, e); return null; } }
From source file:com.xpn.xwiki.web.XWikiServletURLFactory.java
/** * Converts a URL to a relative URL if it's a XWiki URL (keeping only the path + query string + anchor) and leave * the URL unchanged if it's an external URL. * <p>/* www .j a va 2 s .co m*/ * An URL is considered to be external if its server component doesn't match the server of the current request URL. * This means that URLs are made relative with respect to the current request URL rather than the current wiki set * on the XWiki context. Let's take an example: * * <pre> * {@code * request URL: http://playground.xwiki.org/xwiki/bin/view/Sandbox/TestURL * current wiki: code (code.xwiki.org) * URL (1): http://code.xwiki.org/xwiki/bin/view/Main/WebHome * URL (2): http://playground.xwiki.org/xwiki/bin/view/Spage/Page * * The result will be: * (1) http://code.xwiki.org/xwiki/bin/view/Main/WebHome * (2) /xwiki/bin/view/Spage/Page * } * </pre> * * @param url the URL to convert * @return the converted URL as a string * @see com.xpn.xwiki.web.XWikiDefaultURLFactory#getURL(java.net.URL, com.xpn.xwiki.XWikiContext) */ @Override public String getURL(URL url, XWikiContext context) { try { if (url == null) { return ""; } String surl = url.toString(); if (!surl.startsWith(serverURL.toString())) { // External URL: leave it as is. return surl; } else { // Internal XWiki URL: convert to relative. StringBuffer sbuf = new StringBuffer(url.getPath()); String querystring = url.getQuery(); if (!StringUtils.isEmpty(querystring)) { sbuf.append("?"); sbuf.append(StringUtils.chomp(StringUtils.chomp(querystring, "&"), "&")); // sbuf.append(querystring.replaceAll("&","&")); } String anchor = url.getRef(); if (!StringUtils.isEmpty(anchor)) { sbuf.append("#"); sbuf.append(anchor); } return Util.escapeURL(sbuf.toString()); } } catch (Exception e) { e.printStackTrace(); return ""; } }
From source file:com.adito.core.CoreUtil.java
/** * @param redirect/*from www.jav a2s .c o m*/ * @return String */ static String processRefererString(String redirect) { try { URL u = new URL(redirect); String query = u.getQuery(); if (query != null && !query.equals("")) { StringBuffer nq = new StringBuffer(); StringTokenizer t = new StringTokenizer(query, "&"); String parm = null; while (t.hasMoreTokens()) { parm = t.nextToken(); if (!parm.startsWith("referer=") && !parm.startsWith("vpnMessage=") && !parm.startsWith("vpnError=")) { if (nq.length() > 0) { nq.append("&"); } nq.append(parm); } } query = nq.length() == 0 ? null : nq.toString(); } StringBuffer file = new StringBuffer(); if (u.getPath() != null) { file.append(u.getPath()); } if (query != null) { file.append("?"); file.append(query); } if (u.getRef() != null) { file.append("#"); file.append(u.getRef()); } u = new URL(u.getProtocol(), u.getHost(), u.getPort(), file.toString()); return u.toExternalForm(); } catch (MalformedURLException mrule) { int idx = redirect.indexOf("?"); if (idx != -1) { String query = redirect.substring(idx + 1); redirect = redirect.substring(0, idx); if (query.length() > 0) { StringBuffer nq = new StringBuffer(); StringTokenizer t = new StringTokenizer(query, "&"); String parm = null; while (t.hasMoreTokens()) { parm = t.nextToken(); if (!parm.startsWith("vpnMessage=") && !parm.startsWith("vpnError=")) { if (nq.length() > 0) { nq.append("&"); } nq.append(parm); } } query = nq.length() == 0 ? null : nq.toString(); if (query != null) { redirect = redirect + "?" + query; } } } return redirect; } }
From source file:com.connectsdk.service.DLNAService.java
/**
 * Encodes a media URL unless URL-decoding it shows that it already contains escapes.
 *
 * <p>The URL is decoded as UTF-8 and compared with the input. When decoding changes
 * nothing, the URL is split into its components, rebuilt through the multi-argument
 * {@link URI} constructor and returned in ASCII form. When decoding does change the
 * string, the input is returned untouched.
 *
 * @param mediaURL the URL to encode; may be {@code null}
 * @return the empty string for a {@code null} or empty input, otherwise the encoded or
 *         original URL as described above
 * @throws MalformedURLException if {@code mediaURL} cannot be parsed as a URL
 * @throws URISyntaxException if the URL components do not form a valid URI
 * @throws UnsupportedEncodingException declared for the {@code URLDecoder.decode} call
 */
String encodeURL(String mediaURL) throws MalformedURLException, URISyntaxException, UnsupportedEncodingException {
    if (mediaURL == null || mediaURL.isEmpty()) {
        return "";
    }

    String decoded = URLDecoder.decode(mediaURL, "UTF-8");
    if (!decoded.equals(mediaURL)) {
        // Decoding altered the string, so it is handed back without further encoding.
        return mediaURL;
    }

    URL parsed = new URL(mediaURL);
    URI encoded = new URI(
            parsed.getProtocol(),
            parsed.getUserInfo(),
            parsed.getHost(),
            parsed.getPort(),
            parsed.getPath(),
            parsed.getQuery(),
            parsed.getRef());
    return encoded.toASCIIString();
}