// Java tutorial
/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules.writer; import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL; import static org.archive.modules.fetcher.FetchStatusCodes.S_DNS_SUCCESS; import static org.archive.modules.fetcher.FetchStatusCodes.S_WHOIS_SUCCESS; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_FETCH_HISTORY; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WRITE_TAG; import java.io.File; import java.io.IOException; import java.net.InetAddress; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Logger; import org.archive.checkpointing.Checkpoint; import org.archive.checkpointing.Checkpointable; import org.archive.io.WriterPool; import org.archive.io.WriterPoolMember; import org.archive.io.WriterPoolSettings; import org.archive.modules.CrawlMetadata; import org.archive.modules.CrawlURI; import org.archive.modules.ProcessResult; import org.archive.modules.Processor; import org.archive.modules.deciderules.recrawl.IdenticalDigestDecideRule; import org.archive.modules.net.CrawlHost; import org.archive.modules.net.ServerCache; import org.archive.spring.ConfigPath; import org.archive.util.FileUtils; 
import org.json.JSONException; import org.json.JSONObject; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.Lifecycle; /** * Abstract implementation of a file pool processor. * Subclass to implement for a particular {@link WriterPoolMember} instance. * @author Parker Thompson * @author stack */ public abstract class WriterPoolProcessor extends Processor implements Lifecycle, Checkpointable, WriterPoolSettings { @SuppressWarnings("unused") private static final long serialVersionUID = 1L; private static final Logger logger = Logger.getLogger(WriterPoolProcessor.class.getName()); /** * Whether to gzip-compress files when writing to disk; * by default true, meaning do-compress. */ protected boolean compress = true; @Override public boolean getCompress() { return compress; } public void setCompress(boolean compress) { this.compress = compress; } /** * File prefix. The text supplied here will be supplied to the naming * template (below) as the 'prefix' variable for possible interpolation. * In the default/recommended naming formula, the prefix will appear first. */ protected String prefix = WriterPoolMember.DEFAULT_PREFIX; @Override public String getPrefix() { return prefix; } public void setPrefix(String prefix) { this.prefix = prefix; } /** * Template from which a filename is interpolated. Expressions of the * form ${key} will be replaced by values from a local map of useful * values (including 'prefix', 'timestamp17', and 'serialno') or * global system properties (which includes the local hostname/port/pid). 
* * The default template is: * * "${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" * * The default template will generate unique names under reasonable * assumptions; be sure you know what you're doing before customizing, * as you could easily create filename collisions with a poorly-designed * filename template, and many downstream tools have historically assumed * that ARCs/WARCs are carefully named to preserve uniqueness. * */ protected String template = WriterPoolMember.DEFAULT_TEMPLATE; @Override public String getTemplate() { return template; } public void setTemplate(String template) { this.template = template; } /** * Max size of each file. */ protected long maxFileSizeBytes = getDefaultMaxFileSize(); protected abstract long getDefaultMaxFileSize(); @Override public long getMaxFileSizeBytes() { return maxFileSizeBytes; } public void setMaxFileSizeBytes(long maxFileSizeBytes) { this.maxFileSizeBytes = maxFileSizeBytes; } /** * Maximum active files in pool. This setting cannot be varied over the life * of a crawl. */ protected int poolMaxActive = WriterPool.DEFAULT_MAX_ACTIVE; public int getPoolMaxActive() { return poolMaxActive; } public void setPoolMaxActive(int poolMaxActive) { this.poolMaxActive = poolMaxActive; } /** * Maximum time to wait on idle writer before (possibly) creating an * additional instance. */ protected int maxWaitForIdleMs = WriterPool.DEFAULT_MAX_WAIT_FOR_IDLE; public int getMaxWaitForIdleMs() { return maxWaitForIdleMs; } public void setMaxWaitForIdleMs(int maxWaitForIdle) { this.maxWaitForIdleMs = maxWaitForIdle; } /** * Whether to skip the writing of a record when URI history information is * available and indicates the prior fetch had an identical content digest. * Note that subclass settings may provide more fine-grained control on * how identical digest content is handled; for those controls to have * effect, this setting must not be 'true' (causing content to be * skipped entirely). 
* Default is false. */ protected boolean skipIdenticalDigests = false; public boolean getSkipIdenticalDigests() { return skipIdenticalDigests; } public void setSkipIdenticalDigests(boolean skipIdenticalDigests) { this.skipIdenticalDigests = skipIdenticalDigests; } /** * CrawlURI annotation indicating no record was written. */ protected static final String ANNOTATION_UNWRITTEN = "unwritten"; /** * Total file bytes to write to disk. Once the size of all files on disk has * exceeded this limit, this processor will stop the crawler. A value of * zero means no upper limit. */ protected long maxTotalBytesToWrite = 0L; public long getMaxTotalBytesToWrite() { return maxTotalBytesToWrite; } public void setMaxTotalBytesToWrite(long maxTotalBytesToWrite) { this.maxTotalBytesToWrite = maxTotalBytesToWrite; } /** * Whether to flush to underlying file frequently (at least after each * record), or not. Default is true. */ protected boolean frequentFlushes = true; @Override public boolean getFrequentFlushes() { return frequentFlushes; } public void setFrequentFlushes(boolean frequentFlushes) { this.frequentFlushes = frequentFlushes; } /** * Size of buffer in front of disk-writing. Default is 256K. 
*/ protected int writeBufferSize = 256 * 1024; @Override public int getWriteBufferSize() { return writeBufferSize; } public void setWriteBufferSize(int writeBufferSize) { this.writeBufferSize = writeBufferSize; } public CrawlMetadata getMetadataProvider() { return (CrawlMetadata) kp.get("metadataProvider"); } @Autowired public void setMetadataProvider(CrawlMetadata provider) { kp.put("metadataProvider", provider); } transient protected ServerCache serverCache; public ServerCache getServerCache() { return this.serverCache; } @Autowired public void setServerCache(ServerCache serverCache) { this.serverCache = serverCache; } protected ConfigPath directory = new ConfigPath("writer base path", "${launchId}"); public ConfigPath getDirectory() { return directory; } public void setDirectory(ConfigPath directory) { this.directory = directory; } protected boolean startNewFilesOnCheckpoint = true; public boolean getStartNewFilesOnCheckpoint() { return startNewFilesOnCheckpoint; } /** * Whether to close output files and start new ones on checkpoint. True by * default. If false, merely flushes writers. */ public void setStartNewFilesOnCheckpoint(boolean startNewFilesOnCheckpoint) { this.startNewFilesOnCheckpoint = startNewFilesOnCheckpoint; } /** * Where to save files. Supply absolute or relative directory paths. * If relative, paths will be interpreted relative to the local * 'directory' property. order.disk-path setting. If more than one * path specified, we'll round-robin dropping files to each. This * setting is safe to change midcrawl (You can remove and add new * dirs as the crawler progresses). */ protected List<ConfigPath> storePaths = getDefaultStorePaths(); protected abstract List<ConfigPath> getDefaultStorePaths(); public List<ConfigPath> getStorePaths() { return storePaths; } public void setStorePaths(List<ConfigPath> paths) { this.storePaths = paths; } /** * Reference to pool. */ transient private WriterPool pool = null; /** * Total number of bytes written to disc. 
*/ private long totalBytesWritten = 0; private AtomicInteger serial = new AtomicInteger(); /** * @param name Name of this processor. * @param description Description for this processor. */ public WriterPoolProcessor() { super(); } @Override public synchronized void start() { if (isRunning()) { return; } super.start(); setupPool(serial); } @Override public void stop() { if (!isRunning()) { return; } super.stop(); // XXX happens at finish; move to teardown? this.pool.close(); } protected AtomicInteger getSerialNo() { return getPool().getSerialNo(); } /** * Set up pool of files. */ protected abstract void setupPool(final AtomicInteger serial); protected ProcessResult checkBytesWritten() { long max = getMaxTotalBytesToWrite(); if (max <= 0) { return ProcessResult.PROCEED; } if (max <= this.totalBytesWritten) { return ProcessResult.FINISH; // FIXME: Specify reason // controller.requestCrawlStop(CrawlStatus.FINISHED_WRITE_LIMIT); } return ProcessResult.PROCEED; } /** * Whether the given CrawlURI should be written to archive files. * Annotates CrawlURI with a reason for any negative answer. 
* * @param curi CrawlURI * @return true if URI should be written; false otherwise */ protected boolean shouldWrite(CrawlURI curi) { if (getSkipIdenticalDigests() && IdenticalDigestDecideRule.hasIdenticalDigest(curi)) { curi.getAnnotations().add(ANNOTATION_UNWRITTEN + ":identicalDigest"); return false; } boolean retVal; String scheme = curi.getUURI().getScheme().toLowerCase(); // TODO: possibly move this sort of isSuccess() test into CrawlURI if (scheme.equals("dns")) { retVal = curi.getFetchStatus() == S_DNS_SUCCESS; } else if (scheme.equals("whois")) { retVal = curi.getFetchStatus() == S_WHOIS_SUCCESS; } else if (scheme.equals("http") || scheme.equals("https")) { retVal = curi.getFetchStatus() > 0 && curi.getHttpMethod() != null; } else if (scheme.equals("ftp")) { retVal = curi.getFetchStatus() > 0; } else { logger.info("This writer does not write out scheme " + scheme + " content"); curi.getAnnotations().add(ANNOTATION_UNWRITTEN + ":scheme"); return false; } if (retVal == false) { // status not deserving writing curi.getAnnotations().add(ANNOTATION_UNWRITTEN + ":status"); return false; } return true; } /** * Return IP address of given URI suitable for recording (as in a * classic ARC 5-field header line). * * @param curi CrawlURI * @return String of IP address */ protected String getHostAddress(CrawlURI curi) { // special handling for DNS URIs: want address of DNS server if (curi.getUURI().getScheme().toLowerCase().equals("dns")) { return (String) curi.getData().get(A_DNS_SERVER_IP_LABEL); } // otherwise, host referenced in URI // TODO:FIXME: have fetcher insert exact IP contacted into curi, // use that rather than inferred by CrawlHost lookup CrawlHost h = getServerCache().getHostFor(curi.getUURI()); if (h == null) { throw new NullPointerException("Crawlhost is null for " + curi + " " + curi.getVia()); } InetAddress a = h.getIP(); if (a == null) { throw new NullPointerException("Address is null for " + curi + " " + curi.getVia() + ". 
Address " + ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP) ? "was never looked up." : (System.currentTimeMillis() - h.getIpFetched()) + " ms ago.")); } return h.getIP().getHostAddress(); } @Override public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException { if (getStartNewFilesOnCheckpoint()) { this.pool.close(); super.doCheckpoint(checkpointInProgress); setupPool(this.serial); } else { pool.flush(); super.doCheckpoint(checkpointInProgress); } } @Override protected JSONObject toCheckpointJson() throws JSONException { JSONObject json = super.toCheckpointJson(); json.put("serialNumber", getSerialNo().get()); json.put("poolStatus", pool.jsonStatus()); return json; } @Override protected void fromCheckpointJson(JSONObject json) throws JSONException { super.fromCheckpointJson(json); serial.set(json.getInt("serialNumber")); } protected WriterPool getPool() { return pool; } protected void setPool(WriterPool pool) { this.pool = pool; } protected long getTotalBytesWritten() { return totalBytesWritten; } protected void setTotalBytesWritten(long totalBytesWritten) { this.totalBytesWritten = totalBytesWritten; } @Override public abstract List<String> getMetadata(); @Override public List<File> calcOutputDirs() { List<ConfigPath> list = getStorePaths(); ArrayList<File> results = new ArrayList<File>(); for (ConfigPath path : list) { path.setBase(getDirectory()); File f = path.getFile(); if (!f.exists()) { try { FileUtils.ensureWriteableDirectory(f); } catch (Exception e) { e.printStackTrace(); continue; } } results.add(f); } return results; } @Override protected void innerProcess(CrawlURI puri) { throw new AssertionError(); } @Override protected abstract ProcessResult innerProcessResult(CrawlURI uri); @Override protected boolean shouldProcess(CrawlURI curi) { // If failure, or we haven't fetched the resource yet, return if (curi.getFetchStatus() <= 0) { return false; } // If no recorded content at all, don't write record. 
long recordLength = curi.getContentSize(); if (recordLength <= 0) { // getContentSize() should be > 0 if any material (even just // HTTP headers with zero-length body is available. return false; } return true; } /** * If this fetch is identical to the last written (archived) fetch, then * copy forward the writeTag. This method should generally be called when * writeTag is present from a previous identical fetch, even though this * particular fetch is not being written anywhere (not even a revisit * record). */ protected void copyForwardWriteTagIfDupe(CrawlURI curi) { if (IdenticalDigestDecideRule.hasIdenticalDigest(curi)) { @SuppressWarnings("unchecked") Map<String, Object>[] history = (Map<String, Object>[]) curi.getData().get(A_FETCH_HISTORY); if (history[1].containsKey(A_WRITE_TAG)) { history[0].put(A_WRITE_TAG, history[1].get(A_WRITE_TAG)); } } } @Override protected void innerRejectProcess(CrawlURI curi) throws InterruptedException { copyForwardWriteTagIfDupe(curi); } }