/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.fetcher;

import static org.archive.modules.CoreAttributeConstants.A_FTP_CONTROL_CONVERSATION;
import static org.archive.modules.CoreAttributeConstants.A_FTP_FETCH_STATUS;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URLEncoder;
import java.net.UnknownHostException;
import java.security.MessageDigest;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.net.SocketFactory;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPCommand;
import org.archive.io.RecordingInputStream;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;
import org.archive.net.ClientFTP;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.Recorder;

/**
 * Fetches documents and directory listings using FTP.  This class will also
 * try to extract FTP "links" from directory listings.  For this class to
 * archive a directory listing, the remote FTP server must support the NLST
 * command.  Most modern FTP servers should.
 *
 * @author pjack
 */
public class FetchFTP extends Processor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 1L;

    /** Logger for this class. */
    private static Logger logger = Logger.getLogger(FetchFTP.class.getName());

    /** Pattern for matching directory entries. */
    private static Pattern DIR = Pattern.compile("(.+)$", Pattern.MULTILINE);

    /**
     * The username to send to FTP servers.  By convention, the default value
     * of "anonymous" is used for publicly available FTP sites.
     */
    {
        setUsername("anonymous");
    }
    public String getUsername() {
        return (String) kp.get("username");
    }
    public void setUsername(String username) {
        kp.put("username", username);
    }

    /**
     * The password to send to FTP servers.  By convention, anonymous users
     * send their email address in this field.
     */
    {
        setPassword("password");
    }
    public String getPassword() {
        return (String) kp.get("password");
    }
    public void setPassword(String pw) {
        kp.put("password", pw);
    }
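    // The accessors above (and the similar ones below) keep their values in
    // the inherited keyed-properties map "kp", so FetchFTP is configured like
    // any other bean-style module.  A minimal configuration sketch,
    // illustrative only (the e-mail address is a placeholder, not a value
    // taken from this file):
    //
    //     FetchFTP fetchFtp = new FetchFTP();
    //     fetchFtp.setUsername("anonymous");
    //     fetchFtp.setPassword("crawler@example.com");
    //     fetchFtp.setExtractFromDirs(true);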
    /**
     * Set to true to extract further URIs from FTP directories.  Default is
     * true.
     */
    {
        setExtractFromDirs(true);
    }
    /**
     * Returns the <code>extractFromDirs</code> setting for this
     * <code>FetchFTP</code>.
     *
     * @return true if links should be extracted from fetched directory
     *   listings
     */
    public boolean getExtractFromDirs() {
        return (Boolean) kp.get("extractFromDirs");
    }
    public void setExtractFromDirs(boolean extractFromDirs) {
        kp.put("extractFromDirs", extractFromDirs);
    }

    /**
     * Set to true to extract the parent URI from all FTP URIs.  Default is
     * true.
     */
    {
        setExtractParent(true);
    }
    /**
     * Returns the <code>extractParent</code> setting for this
     * <code>FetchFTP</code>.
     *
     * @return true if the parent URI should be added as a link for each
     *   FTP URI
     */
    public boolean getExtractParent() {
        return (Boolean) kp.get("extractParent");
    }
    public void setExtractParent(boolean extractParent) {
        kp.put("extractParent", extractParent);
    }

    /**
     * Whether or not to perform an on-the-fly digest hash of retrieved
     * content-bodies.
     */
    {
        setDigestContent(true);
    }
    public boolean getDigestContent() {
        return (Boolean) kp.get("digestContent");
    }
    public void setDigestContent(boolean digest) {
        kp.put("digestContent", digest);
    }

    /**
     * Which algorithm (for example MD5 or SHA-1) to use to perform an
     * on-the-fly digest hash of retrieved content-bodies.
     */
    protected String digestAlgorithm = "sha1";
    public String getDigestAlgorithm() {
        return digestAlgorithm;
    }
    public void setDigestAlgorithm(String digestAlgorithm) {
        this.digestAlgorithm = digestAlgorithm;
    }

    /**
     * Maximum length in bytes to fetch.  Fetch is truncated at this length.
     * A value of 0 means no limit.
     */
    {
        setMaxLengthBytes(0L); // no limit
    }
    public long getMaxLengthBytes() {
        return (Long) kp.get("maxLengthBytes");
    }
    public void setMaxLengthBytes(long maxLengthBytes) {
        kp.put("maxLengthBytes", maxLengthBytes);
    }

    /**
     * The maximum KB/sec to use when fetching data from a server.  The
     * default of 0 means no maximum.
     */
    {
        setMaxFetchKBSec(0); // no limit
    }
    public int getMaxFetchKBSec() {
        return (Integer) kp.get("maxFetchKBSec");
    }
    public void setMaxFetchKBSec(int rate) {
        kp.put("maxFetchKBSec", rate);
    }

    /**
     * If the fetch is not completed in this number of seconds, give up (and
     * retry later).
     */
    {
        setTimeoutSeconds(20 * 60); // 20 minutes
    }
    public int getTimeoutSeconds() {
        return (Integer) kp.get("timeoutSeconds");
    }
    public void setTimeoutSeconds(int timeout) {
        kp.put("timeoutSeconds", timeout);
    }

    /**
     * If the socket is unresponsive for this number of milliseconds, give up.
     * Set to zero for no timeout (not recommended: it could hang a thread on
     * an unresponsive server).  This timeout is used for timing out socket
     * opens and for timing out each socket read.  Make sure this value is
     * well below <code>timeoutSeconds</code> (converted to milliseconds) so
     * that at least one retry read is possible.
     */
    {
        setSoTimeoutMs(20 * 1000); // 20 seconds
    }
    public int getSoTimeoutMs() {
        return (Integer) kp.get("soTimeoutMs");
    }
    public void setSoTimeoutMs(int timeout) {
        kp.put("soTimeoutMs", timeout);
    }
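    // With the defaults above, a fetch has no size cap and no bandwidth cap
    // (maxLengthBytes = 0, maxFetchKBSec = 0), an overall budget of
    // 20 * 60 = 1200 seconds, and a 20 * 1000 = 20000 ms limit on each socket
    // open and each socket read.  As the soTimeoutMs javadoc notes, keeping
    // soTimeoutMs well below the overall timeout leaves room for retry reads.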
    /**
     * Constructs a new <code>FetchFTP</code>.
     */
    public FetchFTP() {
    }

    @Override
    protected boolean shouldProcess(CrawlURI curi) {
        if (!curi.getUURI().getScheme().equals("ftp")) {
            return false;
        }
        return true;
    }

    /**
     * Processes the given URI.  If the given URI is not an FTP URI, then
     * this method does nothing.  Otherwise an attempt is made to connect
     * to the FTP server.
     *
     * <p>If the connection is successful, an attempt will be made to CD to
     * the path specified in the URI.  If the remote CD command succeeds,
     * then it is assumed that the URI represents a directory.  If the
     * CD command fails, then it is assumed that the URI represents
     * a file.
     *
     * <p>For directories, the directory listing will be fetched using
     * the FTP NLST command, and saved to the recorder.  If the
     * <code>extractFromDirs</code> attribute is set to true, then
     * the files in the fetched list will be added to the curi as
     * extracted FTP links.  (It was easier to do that here, rather
     * than writing a separate FTPExtractor.)
     *
     * <p>For files, the file will be fetched using the FTP RETR
     * command, and saved to the recorder.
     *
     * <p>Directory listings are transferred in ASCII mode and files in
     * binary mode.  Also, the local passive transfer mode is always used,
     * to play well with firewalls.
     *
     * @param curi  the curi to process
     * @throws InterruptedException  if the thread is interrupted during
     *   processing
     */
    @Override
    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        curi.setFetchBeginTime(System.currentTimeMillis());
        ClientFTP client = new ClientFTP();
        Recorder recorder = curi.getRecorder();

        try {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("attempting to fetch ftp uri: " + curi);
            }
            fetch(curi, client, recorder);
        } catch (IOException e) {
            if (logger.isLoggable(Level.INFO)) {
                logger.info(curi + ": " + e);
            }
            curi.getNonFatalFailures().add(e);
            curi.setFetchStatus(FetchStatusCodes.S_CONNECT_FAILED);
        } finally {
            disconnect(client);
            curi.setFetchCompletedTime(System.currentTimeMillis());
            curi.getData().put(A_FTP_CONTROL_CONVERSATION,
                    client.getControlConversation());
        }
    }

    /**
     * A {@link SocketFactory} much like the JDK's default socket factory,
     * except that the createSocket() methods that open connections support a
     * connect timeout.
     */
    public class SocketFactoryWithTimeout extends SocketFactory {
        protected int connectTimeoutMs = 0;

        public int getConnectTimeoutMs() {
            return connectTimeoutMs;
        }

        public void setConnectTimeoutMs(int connectTimeoutMs) {
            this.connectTimeoutMs = connectTimeoutMs;
        }

        public Socket createSocket() {
            return new Socket();
        }

        public Socket createSocket(String host, int port) throws IOException,
                UnknownHostException {
            Socket sock = createSocket();
            sock.connect(new InetSocketAddress(host, port), connectTimeoutMs);
            return sock;
        }

        public Socket createSocket(InetAddress host, int port)
                throws IOException {
            Socket sock = createSocket();
            sock.connect(new InetSocketAddress(host, port), connectTimeoutMs);
            return sock;
        }

        public Socket createSocket(String host, int port,
                InetAddress localHost, int localPort) throws IOException,
                UnknownHostException {
            Socket sock = createSocket();
            sock.bind(new InetSocketAddress(localHost, localPort));
            sock.connect(new InetSocketAddress(host, port), connectTimeoutMs);
            return sock;
        }

        public Socket createSocket(InetAddress address, int port,
                InetAddress localAddress, int localPort) throws IOException {
            Socket sock = createSocket();
            sock.bind(new InetSocketAddress(localAddress, localPort));
            sock.connect(new InetSocketAddress(address, port),
                    connectTimeoutMs);
            return sock;
        }
    }
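    // A single factory instance (the field below) is created lazily in
    // fetch() and handed to the commons-net client via setSocketFactory(),
    // so that sockets opened by the FTP library also honor the configured
    // connect timeout (soTimeoutMs).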
    protected SocketFactoryWithTimeout socketFactory;

    /**
     * Fetches a document from an FTP server.
     *
     * @param curi      the URI of the document to fetch
     * @param client    the FTPClient to use for the fetch
     * @param recorder  the recorder to preserve the document in
     * @throws IOException  if a network or protocol error occurs
     * @throws InterruptedException  if the thread is interrupted
     */
    private void fetch(CrawlURI curi, ClientFTP client, Recorder recorder)
            throws IOException, InterruptedException {
        // Connect to the FTP server.
        UURI uuri = curi.getUURI();
        int port = uuri.getPort();
        if (port == -1) {
            port = 21;
        }

        if (socketFactory == null) {
            socketFactory = new SocketFactoryWithTimeout();
        }
        socketFactory.setConnectTimeoutMs(getSoTimeoutMs());
        client.setSocketFactory(socketFactory);
        client.setConnectTimeout(getSoTimeoutMs());
        client.setDefaultTimeout(getSoTimeoutMs());
        client.setDataTimeout(getSoTimeoutMs());

        client.connect(uuri.getHost(), port);

        client.setSoTimeout(getSoTimeoutMs()); // must be after connect()

        // Authenticate.
        String[] auth = getAuth(curi);
        client.login(auth[0], auth[1]);

        // The given resource may or may not be a directory.
        // To figure out which is which, execute a CD command to
        // the UURI's path.  If CD works, it's a directory.
        boolean isDirectory = client.changeWorkingDirectory(uuri.getPath());

        // Get a data socket.  This will either be the result of a NLST
        // command for a directory, or a RETR command for a file.
        int command;
        String path;
        if (isDirectory) {
            curi.getAnnotations().add("ftpDirectoryList");
            command = FTPCommand.NLST;
            client.setFileType(FTP.ASCII_FILE_TYPE);
            path = ".";
        } else {
            command = FTPCommand.RETR;
            client.setFileType(FTP.BINARY_FILE_TYPE);
            path = uuri.getPath();
        }

        client.enterLocalPassiveMode();
        Socket socket = null;

        try {
            socket = client.openDataConnection(command, path);

            // if "227 Entering Passive Mode" these will get reset later
            curi.setFetchStatus(client.getReplyCode());
            curi.getData().put(A_FTP_FETCH_STATUS, client.getReplyStrings()[0]);
        } catch (IOException e) {
            // try it again, see AbstractFrontier.needsRetrying()
            curi.setFetchStatus(FetchStatusCodes.S_CONNECT_LOST);
        }

        // Save the streams in the CURI, where downstream processors
        // expect to find them.
        if (socket != null) {
            if (socket.getSoTimeout() != getSoTimeoutMs()) {
                logger.warning("data socket timeout " + socket.getSoTimeout()
                        + "ms is not expected value " + getSoTimeoutMs() + "ms");
            }
            // Shall we get a digest on the content downloaded?
            boolean digestContent = getDigestContent();
            String algorithm = null;
            if (digestContent) {
                algorithm = getDigestAlgorithm();
                recorder.getRecordedInput().setDigest(algorithm);
                recorder.getRecordedInput().startDigest();
            } else {
                // clear
                recorder.getRecordedInput().setDigest((MessageDigest) null);
            }

            try {
                saveToRecorder(curi, socket, recorder);
            } finally {
                recorder.close();
                client.closeDataConnection(); // does socket.close()
                curi.setContentSize(recorder.getRecordedInput().getSize());

                // "226 Transfer complete."
                client.getReply();
                curi.setFetchStatus(client.getReplyCode());
                curi.getData().put(A_FTP_FETCH_STATUS,
                        client.getReplyStrings()[0]);

                if (isDirectory) {
                    curi.setContentType("text/plain");
                } else {
                    curi.setContentType("application/octet-stream");
                }

                if (logger.isLoggable(Level.FINE)) {
                    logger.fine("read " + recorder.getRecordedInput().getSize()
                            + " bytes from ftp data socket");
                }

                if (digestContent) {
                    curi.setContentDigest(algorithm,
                            recorder.getRecordedInput().getDigestValue());
                }
            }
            if (isDirectory) {
                extract(curi, recorder);
            }
        } else {
            // no data - without this, content size is -1
            curi.setContentSize(0);
        }
        addParent(curi);
    }
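    // Note on status handling in fetch() above: the reply code recorded when
    // the data connection is opened (e.g. "227 Entering Passive Mode") is
    // provisional; after the transfer finishes and the data connection is
    // closed, the final control reply ("226 Transfer complete.") is read and
    // overwrites both the fetch status and A_FTP_FETCH_STATUS.  If the data
    // connection cannot be opened at all, the status becomes S_CONNECT_LOST
    // so the frontier can retry the URI later.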
    /**
     * Saves the given socket to the given recorder.
     *
     * @param curi      the curi that owns the recorder
     * @param socket    the socket whose streams to save
     * @param recorder  the recorder to save them to
     * @throws IOException  if a network or file error occurs
     * @throws InterruptedException  if the thread is interrupted
     */
    private void saveToRecorder(CrawlURI curi, Socket socket, Recorder recorder)
            throws IOException, InterruptedException {
        recorder.inputWrap(socket.getInputStream());
        recorder.outputWrap(socket.getOutputStream());
        recorder.markContentBegin();

        // Read the remote file/dir listing in its entirety.
        long softMax = 0;
        long hardMax = getMaxLengthBytes();
        long timeout = (long) getTimeoutSeconds() * 1000L;
        int maxRate = getMaxFetchKBSec();
        RecordingInputStream input = recorder.getRecordedInput();
        input.setLimits(hardMax, timeout, maxRate);
        input.readFullyOrUntil(softMax);
    }

    /**
     * Extracts FTP links in a directory listing.
     * The listing must already be saved to the given recorder.
     *
     * @param curi      the curi to save extracted links to
     * @param recorder  the recorder containing the directory listing
     */
    private void extract(CrawlURI curi, Recorder recorder) {
        if (!getExtractFromDirs()) {
            return;
        }

        ReplayCharSequence seq = null;
        try {
            seq = recorder.getContentReplayCharSequence();
            extract(curi, seq);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "IO error during extraction.", e);
        } catch (RuntimeException e) {
            logger.log(Level.SEVERE, "Error during extraction.", e);
        } finally {
            close(seq);
        }
    }

    /**
     * Extracts FTP links in a directory listing.
     *
     * @param curi  the curi to save extracted links to
     * @param dir   the directory listing to extract links from
     */
    private void extract(CrawlURI curi, ReplayCharSequence dir) {
        logger.log(Level.FINEST, "Extracting URIs from FTP directory.");
        Matcher matcher = DIR.matcher(dir);
        while (matcher.find()) {
            String file = matcher.group(1);
            addExtracted(curi, file);
        }
    }

    /**
     * Adds an extracted filename to the curi.  A new URI will be formed
     * by taking the given curi (which should represent the directory the
     * file lives in) and appending the file.
     *
     * @param curi  the curi to store the discovered link in
     * @param file  the filename of the discovered link
     */
    private void addExtracted(CrawlURI curi, String file) {
        try {
            file = URLEncoder.encode(file, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new AssertionError(e);
        }
        if (logger.isLoggable(Level.FINEST)) {
            logger.log(Level.FINEST, "Found " + file);
        }
        String base = curi.toString();
        if (base.endsWith("/")) {
            base = base.substring(0, base.length() - 1);
        }
        try {
            UURI n = UURIFactory.getInstance(base + "/" + file);
            CrawlURI link = curi.createCrawlURI(n, LinkContext.NAVLINK_MISC,
                    Hop.NAVLINK);
            curi.getOutLinks().add(link);
        } catch (URIException e) {
            logger.log(Level.WARNING, "URI error during extraction.", e);
        }
    }
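    // Worked example for extract()/addExtracted() above, using a hypothetical
    // listing (not taken from this file): if ftp://example.com/pub/ returns
    // an NLST listing with the lines "README" and "old data.txt", the DIR
    // pattern matches each line and the resulting outlinks are
    //
    //     ftp://example.com/pub/README
    //     ftp://example.com/pub/old+data.txt
    //
    // (URLEncoder.encode() form-encodes each name, so the space becomes "+".)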
    /**
     * Extracts the parent URI from the given curi, then adds that parent
     * URI as a discovered link to the curi.
     *
     * <p>If the <code>extractParent</code> attribute is false, then this
     * method does nothing.  Also, if the path of the given curi is
     * <code>/</code>, then this method does nothing.
     *
     * <p>Otherwise the parent is determined by eliminating the lowest part
     * of the URI's path.  E.g., the parent of <code>ftp://foo.com/one/two</code>
     * is <code>ftp://foo.com/one</code>.
     *
     * @param curi  the curi whose parent to add
     */
    private void addParent(CrawlURI curi) {
        if (!getExtractParent()) {
            return;
        }
        UURI uuri = curi.getUURI();
        try {
            if (uuri.getPath().equals("/")) {
                // There's no parent to add.
                return;
            }
            String scheme = uuri.getScheme();
            String auth = uuri.getEscapedAuthority();
            String path = uuri.getEscapedCurrentHierPath();
            UURI parent = UURIFactory.getInstance(scheme + "://" + auth + path);
            CrawlURI link = curi.createCrawlURI(parent, LinkContext.NAVLINK_MISC,
                    Hop.NAVLINK);
            curi.getOutLinks().add(link);
        } catch (URIException e) {
            logger.log(Level.WARNING, "URI error during extraction.", e);
        }
    }

    /**
     * Returns the username and password for the given URI.  This method
     * always returns an array of length 2.  The first element in the returned
     * array is the username for the URI, and the second element is the
     * password.
     *
     * <p>If the URI itself contains the username and password (i.e., it looks
     * like <code>ftp://username:password@host/path</code>) then that username
     * and password are returned.
     *
     * <p>Otherwise the settings system is probed for the <code>username</code>
     * and <code>password</code> attributes for this <code>FetchFTP</code>
     * and the given <code>curi</code> context.  The values of those
     * attributes are then returned.
     *
     * @param curi  the curi whose username and password to return
     * @return  an array containing the username and password
     */
    private String[] getAuth(CrawlURI curi) {
        String[] result = new String[2];
        UURI uuri = curi.getUURI();
        String userinfo;
        try {
            userinfo = uuri.getUserinfo();
        } catch (URIException e) {
            assert false;
            logger.finest("getUserinfo raised URIException.");
            userinfo = null;
        }
        if (userinfo != null) {
            int p = userinfo.indexOf(':');
            if (p > 0) {
                result[0] = userinfo.substring(0, p);
                result[1] = userinfo.substring(p + 1);
                return result;
            }
        }
        result[0] = getUsername();
        result[1] = getPassword();
        return result;
    }

    /**
     * Quietly closes the given sequence.
     * If an IOException is raised, this method logs it as a warning.
     *
     * @param seq  the sequence to close
     */
    private static void close(ReplayCharSequence seq) {
        if (seq == null) {
            return;
        }
        try {
            seq.close();
        } catch (IOException e) {
            logger.log(Level.WARNING, "IO error closing ReplayCharSequence.", e);
        }
    }

    /**
     * Quietly disconnects from the given FTP client.
     * If an IOException is raised, this method logs it as a warning.
     *
     * @param client  the client to disconnect
     */
    private static void disconnect(ClientFTP client) {
        if (client.isConnected()) try {
            client.logout();
        } catch (IOException e) {
        }

        if (client.isConnected()) try {
            client.disconnect();
        } catch (IOException e) {
            logger.warning("Could not disconnect from FTP client: " + e);
        }
    }
}
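// Downstream processors can read back what this fetcher recorded.  A hedged
// sketch (the local variable names are illustrative, not part of this file),
// mirroring the puts performed in innerProcess() and fetch() above:
//
//     String ftpStatus = (String) curi.getData().get(A_FTP_FETCH_STATUS);
//     Object conversation = curi.getData().get(A_FTP_CONTROL_CONVERSATION);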