List of usage examples for java.net.URL.getDefaultPort()
public int getDefaultPort()
From source file:edu.uci.ics.crawler4j.robotstxt.RobotstxtServer.java
private HostDirectives fetchDirectives(URL url) { WebURL robotsTxtUrl = new WebURL(); String host = getHost(url);//www. j a v a 2s.com String port = (url.getPort() == url.getDefaultPort() || url.getPort() == -1) ? "" : ":" + url.getPort(); robotsTxtUrl.setURL("http://" + host + port + "/robots.txt"); HostDirectives directives = null; PageFetchResult fetchResult = null; try { fetchResult = pageFetcher.fetchHeader(robotsTxtUrl); if (fetchResult.getStatusCode() == HttpStatus.SC_OK) { Page page = new Page(robotsTxtUrl); fetchResult.fetchContent(page); if (Util.hasPlainTextContent(page.getContentType())) { try { String content; if (page.getContentCharset() == null) { content = new String(page.getContentData()); } else { content = new String(page.getContentData(), page.getContentCharset()); } directives = RobotstxtParser.parse(content, config.getUserAgentName()); } catch (Exception e) { e.printStackTrace(); } } } } finally { if (fetchResult != null) { fetchResult.discardContentIfNotConsumed(); } } if (directives == null) { // We still need to have this object to keep track of the time we // // fetched it directives = new HostDirectives(); } synchronized (host2directivesCache) { if (host2directivesCache.size() == config.getCacheSize()) { String minHost = null; long minAccessTime = Long.MAX_VALUE; for (Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) { if (entry.getValue().getLastAccessTime() < minAccessTime) { minAccessTime = entry.getValue().getLastAccessTime(); minHost = entry.getKey(); } } host2directivesCache.remove(minHost); } host2directivesCache.put(host, directives); } return directives; }
From source file:com.grendelscan.commons.http.CookieJar.java
/**
 * Returns the cookies that match the origin described by {@code url}.
 *
 * <p>The origin is made of the URL's host, its port (the protocol's default port
 * when the URL carries none), its path, and a secure flag that is set when the
 * protocol is "https" (case-insensitive). The lookup itself is delegated to the
 * {@code CookieOrigin} overload.
 *
 * @param url the URL to match cookies against
 * @return the result of the {@code CookieOrigin}-based lookup
 */
public List<Cookie> getMatchingCookies(final URL url) {
    final int explicitPort = url.getPort();
    final int effectivePort = explicitPort < 0 ? url.getDefaultPort() : explicitPort;
    final boolean secure = url.getProtocol().equalsIgnoreCase("https");
    return getMatchingCookies(new CookieOrigin(url.getHost(), effectivePort, url.getPath(), secure));
}
From source file:com.kagilum.plugins.icescrum.IceScrumSession.java
private void setAuthentication() throws MalformedURLException { int port;// www. java2 s . c o m URL url = new URL(settings.getUrl() + "/version/"); if (url.getPort() == -1) { port = url.getDefaultPort(); } else { port = url.getPort(); } client.getState().setCredentials(new AuthScope(url.getHost(), port), new UsernamePasswordCredentials(settings.getUsername(), settings.getPassword())); }
From source file:com.nanocrawler.robotstxt.RobotstxtServer.java
private HostDirectives fetchDirectives(URL url) { WebURL robotsTxtUrl = new WebURL(); String host = getHost(url);/*from www .j av a2 s.c om*/ String port = (url.getPort() == url.getDefaultPort() || url.getPort() == -1) ? "" : ":" + url.getPort(); robotsTxtUrl.setURL("http://" + host + port + "/robots.txt"); HostDirectives directives = null; PageFetchResult fetchResult = null; try { fetchResult = pageFetcher.fetchHeader(robotsTxtUrl); // TO_DO: Does this work on redirects e.g. http://news.ycombinator.com/robots.txt -> https://news.ycombinator.com/robots.txt if (fetchResult.getStatusCode() == HttpStatus.SC_OK) { Page page = new Page(robotsTxtUrl); fetchResult.fetchContent(page); if (ContentTypeUtil.hasPlainTextContent(page.getContentType())) { try { String content; if (page.getContentCharset() == null) { content = new String(page.getContentData()); } else { content = new String(page.getContentData(), page.getContentCharset()); } directives = RobotstxtParser.parse(content, config.getUserAgentName()); } catch (Exception e) { e.printStackTrace(); } } } } finally { if (fetchResult != null) { fetchResult.discardContentIfNotConsumed(); } } if (directives == null) { directives = new HostDirectives(); } synchronized (host2directivesCache) { if (host2directivesCache.size() == config.getCacheSize()) { String minHost = null; long minAccessTime = Long.MAX_VALUE; for (Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) { if (entry.getValue().getLastAccessTime() < minAccessTime) { minAccessTime = entry.getValue().getLastAccessTime(); minHost = entry.getKey(); } } host2directivesCache.remove(minHost); } host2directivesCache.put(host, directives); } return directives; }
From source file:frame.crawler4j.robotstxt.RobotstxtServer.java
private HostDirectives fetchDirectives(URL url) { WebURL robotsTxtUrl = new WebURL(); String host = getHost(url);//from ww w. j a va2s . c om String port = (url.getPort() == url.getDefaultPort() || url.getPort() == -1) ? "" : ":" + url.getPort(); robotsTxtUrl.setURL("http://" + host + port + "/robots.txt"); HostDirectives directives = null; PageFetchResult fetchResult = null; try { fetchResult = pageFetcher.fetchHeader(robotsTxtUrl); if (fetchResult.getStatusCode() == HttpStatus.SC_OK) { Page page = new Page(robotsTxtUrl); fetchResult.fetchContent(page); if (Util.hasPlainTextContent(page.getContentType())) { try { String content; if (page.getContentCharset() == null) { content = new String(page.getContentData()); } else { content = new String(page.getContentData(), page.getContentCharset()); } directives = RobotstxtParser.parse(content, config.getUserAgentName()); } catch (Exception e) { e.printStackTrace(); } } } } finally { if (fetchResult != null) { fetchResult.discardContentIfNotConsumed(); } } if (directives == null) { // We still need to have this object to keep track of the time we // fetched it directives = new HostDirectives(); } synchronized (host2directivesCache) { if (host2directivesCache.size() == config.getCacheSize()) { String minHost = null; long minAccessTime = Long.MAX_VALUE; for (Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) { if (entry.getValue().getLastAccessTime() < minAccessTime) { minAccessTime = entry.getValue().getLastAccessTime(); minHost = entry.getKey(); } } host2directivesCache.remove(minHost); } host2directivesCache.put(host, directives); } return directives; }
From source file:de.comlineag.snc.webcrawler.robotstxt.RobotstxtServer.java
private HostDirectives fetchDirectives(URL url) { WebURL robotsTxtUrl = new WebURL(); String host = getHost(url);//from w w w.j av a 2s. com String port = (url.getPort() == url.getDefaultPort() || url.getPort() == -1) ? "" : ":" + url.getPort(); robotsTxtUrl.setURL("http://" + host + port + "/robots.txt"); HostDirectives directives = null; PageFetchResult fetchResult = null; try { fetchResult = pageFetcher.fetchHeader(robotsTxtUrl); if (fetchResult.getStatusCode() == HttpStatus.SC_OK) { Page page = new Page(robotsTxtUrl); fetchResult.fetchContent(page); if (WebCrawlerUtil.hasPlainTextContent(page.getContentType())) { try { String content; if (page.getContentCharset() == null) { content = new String(page.getContentData()); } else { content = new String(page.getContentData(), page.getContentCharset()); } directives = RobotstxtParser.parse(content, config.getUserAgentName()); } catch (Exception e) { logger.error("Error occurred while fetching (robots) url: " + robotsTxtUrl.getURL(), e); } } } } finally { if (fetchResult != null) { fetchResult.discardContentIfNotConsumed(); } } if (directives == null) { // We still need to have this object to keep track of the time we // fetched it directives = new HostDirectives(); } synchronized (host2directivesCache) { if (host2directivesCache.size() == config.getCacheSize()) { String minHost = null; long minAccessTime = Long.MAX_VALUE; for (Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) { if (entry.getValue().getLastAccessTime() < minAccessTime) { minAccessTime = entry.getValue().getLastAccessTime(); minHost = entry.getKey(); } } host2directivesCache.remove(minHost); } host2directivesCache.put(host, directives); } return directives; }
From source file:org.nmdp.service.epitope.task.URLProcessor.java
public long getFtpLastModifiedTime(URL url) { FTPClient ftpClient = new FTPClient(); try {// w ww . ja v a 2s. c o m ftpClient.connect(url.getHost(), url.getPort() == -1 ? url.getDefaultPort() : url.getPort()); ftpClient.login("anonymous", "anonymous"); ftpClient.enterLocalPassiveMode(); String filePath = url.getPath(); String time = ftpClient.getModificationTime(filePath); //logger.debug("server replied: " + time); SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss"); String timePart = time.split(" ")[1]; Date modificationTime = dateFormat.parse(timePart); //logger.debug("parsed time: " + modificationTime); return modificationTime.getTime(); } catch (Exception e) { logger.error("failed to parse time for url: " + url, e); return 0; } finally { if (ftpClient.isConnected()) { try { ftpClient.disconnect(); } catch (IOException ex) { ex.printStackTrace(); } } } }
From source file:io.github.cidisk.indexcrawler.robotstxt.RobotstxtServer.java
/**
 * Fetches and parses robots.txt for the host of {@code url}, caches the resulting
 * directives in {@code host2directivesCache} and returns them.
 *
 * <p>Connection-level failures are treated as "no robots.txt" and are not logged;
 * other failures are logged. In every failure case a newly created
 * {@code HostDirectives} is cached and returned.
 *
 * @param url a URL on the host whose robots.txt should be fetched
 * @return the directives for the host; never {@code null}
 */
private HostDirectives fetchDirectives(URL url) {
    WebURL robotsTxtUrl = new WebURL();
    String host = getHost(url);
    String port = (url.getPort() == url.getDefaultPort() || url.getPort() == -1) ? "" : ":" + url.getPort();
    // Use the scheme of the crawled URL. With a hard-coded "http://" an https URL on
    // a custom port was looked up as http://host:port/robots.txt, and default-port
    // https hosts were queried over plain http.
    robotsTxtUrl.setURL(url.getProtocol() + "://" + host + port + "/robots.txt");

    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchPage(robotsTxtUrl);
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(robotsTxtUrl);
            fetchResult.fetchContent(page);
            if (Util.hasPlainTextContent(page.getContentType())) {
                String content;
                if (page.getContentCharset() == null) {
                    // NOTE(review): decodes with the platform default charset.
                    content = new String(page.getContentData());
                } else {
                    content = new String(page.getContentData(), page.getContentCharset());
                }
                directives = RobotstxtParser.parse(content, config.getUserAgentName());
            }
        }
    } catch (SocketException | UnknownHostException | SocketTimeoutException se) {
        // No logging here, as it just means that robots.txt doesn't exist on this
        // server, which is perfectly ok.
    } catch (PageBiggerThanMaxSizeException pbtms) {
        logger.error("Error occurred while fetching (robots) url: {}, {}", robotsTxtUrl.getURL(),
                pbtms.getMessage());
    } catch (Exception e) {
        logger.error("Error occurred while fetching (robots) url: " + robotsTxtUrl.getURL(), e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }

    if (directives == null) {
        // We still need to have this object to keep track of the time we fetched it.
        directives = new HostDirectives();
    }

    synchronized (host2directivesCache) {
        if (host2directivesCache.size() == config.getCacheSize()) {
            // Cache is full: evict the host that was accessed longest ago.
            String minHost = null;
            long minAccessTime = Long.MAX_VALUE;
            for (Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) {
                if (entry.getValue().getLastAccessTime() < minAccessTime) {
                    minAccessTime = entry.getValue().getLastAccessTime();
                    minHost = entry.getKey();
                }
            }
            host2directivesCache.remove(minHost);
        }
        host2directivesCache.put(host, directives);
    }
    return directives;
}
From source file:com.gargoylesoftware.htmlunit.CookieManager.java
/** * Gets the port of the URL./*from w w w. ja v a 2 s . co m*/ * This functionality is implemented here as protected method to allow subclass to change it * as workaround to <a href="http://code.google.com/p/googleappengine/issues/detail?id=4784"> * Google App Engine bug 4784</a>. * @param url the URL * @return the port use to connect the server */ protected int getPort(final URL url) { if (url.getPort() != -1) { return url.getPort(); } return url.getDefaultPort(); }
From source file:org.electrologic.convergence.server.NotaryBundleServlet.java
@Override protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { try {/*from w w w. j ava2 s. c o m*/ // get request URL, hostname and port and REST interface anddress URL requestUrl = new URL(req.getRequestURL().toString()); String host = requestUrl.getHost(); int port = requestUrl.getPort(); if (port == -1) { port = requestUrl.getDefaultPort(); } // create JSON object JSONObject result = new JSONObject(); result.put("version", 1); JSONObject hostElement = new JSONObject(); hostElement.put("host", host); hostElement.put("http_port", 80); // the FF addon seems to have a problem when this is not present hostElement.put("ssl_port", port); hostElement.put("certificate", pemCert); JSONArray hostList = new JSONArray(); hostList.put(hostElement); result.put("hosts", hostList); result.put("name", "Convergence J2EE Server"); result.put("bundle_location", requestUrl.toString()); String resultStr = result.toString(); resp.setCharacterEncoding("UTF-8"); resp.setContentType("application/json"); resp.getOutputStream().print(resultStr); } catch (JSONException ex) { String msg = "Failed to construct JSON result."; logger.error(msg, ex); throw new ServletException(msg, ex); } }