Java tutorial
/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules.writer; import static org.archive.format.warc.WARCConstants.FTP_CONTROL_CONVERSATION_MIMETYPE; import static org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO; import static org.archive.format.warc.WARCConstants.HEADER_KEY_ETAG; import static org.archive.format.warc.WARCConstants.HEADER_KEY_IP; import static org.archive.format.warc.WARCConstants.HEADER_KEY_LAST_MODIFIED; import static org.archive.format.warc.WARCConstants.HEADER_KEY_PAYLOAD_DIGEST; import static org.archive.format.warc.WARCConstants.HEADER_KEY_PROFILE; import static org.archive.format.warc.WARCConstants.HEADER_KEY_REFERS_TO; import static org.archive.format.warc.WARCConstants.HEADER_KEY_REFERS_TO_DATE; import static org.archive.format.warc.WARCConstants.HEADER_KEY_REFERS_TO_TARGET_URI; import static org.archive.format.warc.WARCConstants.HEADER_KEY_TRUNCATED; import static org.archive.format.warc.WARCConstants.HTTP_REQUEST_MIMETYPE; import static org.archive.format.warc.WARCConstants.HTTP_RESPONSE_MIMETYPE; import static org.archive.format.warc.WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_HEAD; import static org.archive.format.warc.WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_LENGTH; import static org.archive.format.warc.WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_TIME; import static org.archive.format.warc.WARCConstants.PROFILE_REVISIT_IDENTICAL_DIGEST; import static org.archive.format.warc.WARCConstants.PROFILE_REVISIT_NOT_MODIFIED; import static org.archive.format.warc.WARCConstants.TYPE; import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL; import static org.archive.modules.CoreAttributeConstants.A_FTP_CONTROL_CONVERSATION; import static org.archive.modules.CoreAttributeConstants.A_FTP_FETCH_STATUS; import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG; import static org.archive.modules.CoreAttributeConstants.A_WARC_RESPONSE_HEADERS; import static org.archive.modules.CoreAttributeConstants.HEADER_TRUNC; import static org.archive.modules.CoreAttributeConstants.LENGTH_TRUNC; import static org.archive.modules.CoreAttributeConstants.TIMER_TRUNC; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_CONTENT_DIGEST_COUNT; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ETAG_HEADER; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_FETCH_HISTORY; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_LAST_MODIFIED_HEADER; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_DATE; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_URL; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_FILENAME; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_FILE_OFFSET; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_RECORD_ID; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WRITE_TAG; import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.InetAddress; import java.net.URI; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.archive.format.warc.WARCConstants.WARCRecordType; import org.archive.io.ReplayInputStream; import org.archive.io.warc.WARCRecordInfo; import org.archive.io.warc.WARCWriter; import org.archive.io.warc.WARCWriterPool; import org.archive.io.warc.WARCWriterPoolSettings; import org.archive.modules.CoreAttributeConstants; import org.archive.modules.CrawlMetadata; import org.archive.modules.CrawlURI; import org.archive.modules.ProcessResult; import org.archive.modules.deciderules.recrawl.IdenticalDigestDecideRule; import org.archive.modules.extractor.Link; import org.archive.spring.ConfigPath; import org.archive.uid.RecordIDGenerator; import org.archive.uid.UUIDGenerator; import org.archive.util.ArchiveUtils; import org.archive.util.anvl.ANVLRecord; import org.json.JSONException; import org.json.JSONObject; /** * WARCWriterProcessor. * Intends to follow the WARC/1.0 specification. * * <p>TODO: Remove ANVLRecord. Rename NameValue or use RFC822 * (commons-httpclient?) or find something else. * * @contributor stack */ public class WARCWriterProcessor extends WriterPoolProcessor implements WARCWriterPoolSettings { @SuppressWarnings("unused") private static final long serialVersionUID = 6182850087635847443L; private static final Logger logger = Logger.getLogger(WARCWriterProcessor.class.getName()); private ConcurrentMap<String, ConcurrentMap<String, AtomicLong>> stats = new ConcurrentHashMap<String, ConcurrentMap<String, AtomicLong>>(); private AtomicLong urlsWritten = new AtomicLong(); public long getDefaultMaxFileSize() { return 1000000000L; // 1 SI giga-byte (10^9 bytes), per WARC appendix A } public List<ConfigPath> getDefaultStorePaths() { List<ConfigPath> paths = new ArrayList<ConfigPath>(); paths.add(new ConfigPath("warcs default store path", "warcs")); return paths; } /** * Whether to write 'request' type records. Default is true. */ { setWriteRequests(true); } public boolean getWriteRequests() { return (Boolean) kp.get("writeRequests"); } public void setWriteRequests(boolean writeRequests) { kp.put("writeRequests", writeRequests); } /** * Whether to write 'metadata' type records. Default is true. */ { setWriteMetadata(true); } public boolean getWriteMetadata() { return (Boolean) kp.get("writeMetadata"); } public void setWriteMetadata(boolean writeMetadata) { kp.put("writeMetadata", writeMetadata); } /** * Whether to write 'revisit' type records when a URI's history indicates * the previous fetch had an identical content digest. Default is true. * * Decision applies to either URI-based fetch history or URI-agnostic * content digest-based history. */ { setWriteRevisitForIdenticalDigests(true); } public boolean getWriteRevisitForIdenticalDigests() { return (Boolean) kp.get("writeRevisitForIdenticalDigests"); } public void setWriteRevisitForIdenticalDigests(boolean writeRevisits) { kp.put("writeRevisitForIdenticalDigests", writeRevisits); } /** * Whether to write 'revisit' type records when a 304-Not Modified response * is received. Default is true. */ { setWriteRevisitForNotModified(true); } public boolean getWriteRevisitForNotModified() { return (Boolean) kp.get("writeRevisitForNotModified"); } public void setWriteRevisitForNotModified(boolean writeRevisits) { kp.put("writeRevisitForNotModified", writeRevisits); } /** * Generator for record IDs */ protected RecordIDGenerator generator = new UUIDGenerator(); public RecordIDGenerator getRecordIDGenerator() { return generator; } public void setRecordIDGenerator(RecordIDGenerator generator) { this.generator = generator; } private transient List<String> cachedMetadata; public WARCWriterProcessor() { } @Override protected void setupPool(final AtomicInteger serialNo) { setPool(new WARCWriterPool(serialNo, this, getPoolMaxActive(), getMaxWaitForIdleMs())); } /** * Writes a CrawlURI and its associated data to store file. * * Currently this method understands the following uri types: dns, http, and * https. * * @param curi CrawlURI to process. * */ @Override protected ProcessResult innerProcessResult(CrawlURI puri) { CrawlURI curi = (CrawlURI) puri; String scheme = curi.getUURI().getScheme().toLowerCase(); try { if (shouldWrite(curi)) { return write(scheme, curi); } else { copyForwardWriteTagIfDupe(curi); } } catch (IOException e) { curi.getNonFatalFailures().add(e); logger.log(Level.SEVERE, "Failed write of Records: " + curi.toString(), e); } return ProcessResult.PROCEED; } protected ProcessResult write(final String lowerCaseScheme, final CrawlURI curi) throws IOException { WARCWriter writer = (WARCWriter) getPool().borrowFile(); long position = writer.getPosition(); try { // See if we need to open a new file because we've exceeded maxBytes. // Call to checkFileSize will open new file if we're at maximum for // current file. writer.checkSize(); if (writer.getPosition() != position) { // We just closed the file because it was larger than maxBytes. // Add to the totalBytesWritten the size of the first record // in the file, if any. setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - position)); position = writer.getPosition(); } // Reset writer temp stats so they reflect only this set of records. // They'll be added to totals below, in finally block, after records // have been written. writer.resetTmpStats(); writer.resetTmpRecordLog(); // Write a request, response, and metadata all in the one // 'transaction'. final URI baseid = getRecordID(); final String timestamp = ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); if (lowerCaseScheme.startsWith("http")) { writeHttpRecords(curi, writer, baseid, timestamp); } else if (lowerCaseScheme.equals("dns")) { writeDnsRecords(curi, writer, baseid, timestamp); } else if (lowerCaseScheme.equals("ftp")) { writeFtpRecords(writer, curi, baseid, timestamp); } else if (lowerCaseScheme.equals("whois")) { writeWhoisRecords(writer, curi, baseid, timestamp); } else { logger.warning("No handler for scheme " + lowerCaseScheme); } } catch (IOException e) { // Invalidate this file (It gets a '.invalid' suffix). getPool().invalidateFile(writer); // Set the writer to null otherwise the pool accounting // of how many active writers gets skewed if we subsequently // do a returnWriter call on this object in the finally block. writer = null; throw e; } finally { if (writer != null) { updateMetadataAfterWrite(curi, writer, position); getPool().returnFile(writer); } } return checkBytesWritten(); } protected void updateMetadataAfterWrite(final CrawlURI curi, WARCWriter writer, long startPosition) { if (WARCWriter.getStat(writer.getTmpStats(), WARCWriter.TOTALS, WARCWriter.NUM_RECORDS) > 0l) { addStats(writer.getTmpStats()); urlsWritten.incrementAndGet(); } if (logger.isLoggable(Level.FINE)) { logger.fine( "wrote " + WARCWriter.getStat(writer.getTmpStats(), WARCWriter.TOTALS, WARCWriter.SIZE_ON_DISK) + " bytes to " + writer.getFile().getName() + " for " + curi); } setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - startPosition)); curi.addExtraInfo("warcFilename", writer.getFilenameWithoutOccupiedSuffix()); curi.addExtraInfo("warcFileOffset", startPosition); // history for uri-based dedupe @SuppressWarnings("unchecked") Map<String, Object>[] history = (Map<String, Object>[]) curi.getData().get(A_FETCH_HISTORY); if (history != null && history[0] != null) { history[0].put(A_WRITE_TAG, writer.getFilenameWithoutOccupiedSuffix()); } // history for uri-agnostic, content digest based dedupe if (curi.getContentDigest() != null && curi.hasContentDigestHistory()) { for (WARCRecordInfo warcRecord : writer.getTmpRecordLog()) { if ((warcRecord.getType() == WARCRecordType.response || warcRecord.getType() == WARCRecordType.resource) && warcRecord.getContentStream() != null && warcRecord.getContentLength() > 0) { curi.getContentDigestHistory().put(A_ORIGINAL_URL, warcRecord.getUrl()); curi.getContentDigestHistory().put(A_WARC_RECORD_ID, warcRecord.getRecordId()); curi.getContentDigestHistory().put(A_WARC_FILENAME, warcRecord.getWARCFilename()); curi.getContentDigestHistory().put(A_WARC_FILE_OFFSET, warcRecord.getWARCFileOffset()); curi.getContentDigestHistory().put(A_ORIGINAL_DATE, warcRecord.getCreate14DigitDate()); curi.getContentDigestHistory().put(A_CONTENT_DIGEST_COUNT, 1); } else if (warcRecord.getType() == WARCRecordType.revisit && curi.getAnnotations().contains("warcRevisit:digest")) { Integer oldCount = (Integer) curi.getContentDigestHistory().get(A_CONTENT_DIGEST_COUNT); if (oldCount == null) { // shouldn't happen, log a warning? oldCount = 1; } curi.getContentDigestHistory().put(A_CONTENT_DIGEST_COUNT, oldCount + 1); } } } } protected void addStats(Map<String, Map<String, Long>> substats) { for (String key : substats.keySet()) { // intentionally redundant here -- if statement avoids creating // unused empty map every time; putIfAbsent() ensures thread safety if (stats.get(key) == null) { stats.putIfAbsent(key, new ConcurrentHashMap<String, AtomicLong>()); } for (String subkey : substats.get(key).keySet()) { AtomicLong oldValue = stats.get(key).get(subkey); if (oldValue == null) { oldValue = stats.get(key).putIfAbsent(subkey, new AtomicLong(substats.get(key).get(subkey))); } if (oldValue != null) { oldValue.addAndGet(substats.get(key).get(subkey)); } } } } protected void writeDnsRecords(final CrawlURI curi, WARCWriter w, final URI baseid, final String timestamp) throws IOException { WARCRecordInfo recordInfo = new WARCRecordInfo(); recordInfo.setType(WARCRecordType.response); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setMimetype(curi.getContentType()); recordInfo.setRecordId(baseid); recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize()); recordInfo.setEnforceLength(true); String ip = (String) curi.getData().get(A_DNS_SERVER_IP_LABEL); if (ip != null && ip.length() > 0) { recordInfo.addExtraHeader(HEADER_KEY_IP, ip); } ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream(); recordInfo.setContentStream(ris); try { w.writeRecord(recordInfo); } finally { IOUtils.closeQuietly(ris); } recordInfo.getRecordId(); } protected void writeWhoisRecords(WARCWriter w, CrawlURI curi, URI baseid, String timestamp) throws IOException { WARCRecordInfo recordInfo = new WARCRecordInfo(); recordInfo.setType(WARCRecordType.response); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setMimetype(curi.getContentType()); recordInfo.setRecordId(baseid); recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize()); recordInfo.setEnforceLength(true); Object whoisServerIP = curi.getData().get(CoreAttributeConstants.A_WHOIS_SERVER_IP); if (whoisServerIP != null) { recordInfo.addExtraHeader(HEADER_KEY_IP, whoisServerIP.toString()); } ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream(); recordInfo.setContentStream(ris); try { w.writeRecord(recordInfo); } finally { IOUtils.closeQuietly(ris); } recordInfo.getRecordId(); } protected void writeHttpRecords(final CrawlURI curi, WARCWriter w, final URI baseid, final String timestamp) throws IOException { // Add named fields for ip, checksum, and relate the metadata // and request to the resource field. // TODO: Use other than ANVL (or rename ANVL as NameValue or // use RFC822 (commons-httpclient?). ANVLRecord headers = new ANVLRecord(); if (curi.getContentDigest() != null) { headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST, curi.getContentDigestSchemeString()); } headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi)); URI rid; if (getWriteRevisitForIdenticalDigests() && curi.hasContentDigestHistory() && curi.getContentDigestHistory().get(A_ORIGINAL_URL) != null) { rid = writeRevisitUriAgnosticDigest(w, timestamp, HTTP_RESPONSE_MIMETYPE, baseid, curi, headers); } else if (IdenticalDigestDecideRule.hasIdenticalDigest(curi) && getWriteRevisitForIdenticalDigests()) { rid = writeRevisitDigest(w, timestamp, HTTP_RESPONSE_MIMETYPE, baseid, curi, headers); } else if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED && getWriteRevisitForNotModified()) { rid = writeRevisitNotModified(w, timestamp, baseid, curi, headers); } else { // Check for truncated annotation String value = null; Collection<String> anno = curi.getAnnotations(); if (anno.contains(TIMER_TRUNC)) { value = NAMED_FIELD_TRUNCATED_VALUE_TIME; } else if (anno.contains(LENGTH_TRUNC)) { value = NAMED_FIELD_TRUNCATED_VALUE_LENGTH; } else if (anno.contains(HEADER_TRUNC)) { value = NAMED_FIELD_TRUNCATED_VALUE_HEAD; } // TODO: Add annotation for TRUNCATED_VALUE_UNSPECIFIED if (value != null) { headers.addLabelValue(HEADER_KEY_TRUNCATED, value); } rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE, baseid, curi, headers); } headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>'); if (getWriteRequests()) { writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE, baseid, curi, headers); } if (getWriteMetadata()) { writeMetadata(w, timestamp, baseid, curi, headers); } } protected void writeFtpRecords(WARCWriter w, final CrawlURI curi, final URI baseid, final String timestamp) throws IOException { ANVLRecord headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi)); String controlConversation = curi.getData().get(A_FTP_CONTROL_CONVERSATION).toString(); URI rid = writeFtpControlConversation(w, timestamp, baseid, curi, headers, controlConversation); if (curi.getContentDigest() != null) { headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST, curi.getContentDigestSchemeString()); } if (curi.getRecorder() != null) { if (IdenticalDigestDecideRule.hasIdenticalDigest(curi) && getWriteRevisitForIdenticalDigests()) { rid = writeRevisitDigest(w, timestamp, null, baseid, curi, headers, 0); } else { headers = new ANVLRecord(); // Check for truncated annotation String value = null; Collection<String> anno = curi.getAnnotations(); if (anno.contains(TIMER_TRUNC)) { value = NAMED_FIELD_TRUNCATED_VALUE_TIME; } else if (anno.contains(LENGTH_TRUNC)) { value = NAMED_FIELD_TRUNCATED_VALUE_LENGTH; } else if (anno.contains(HEADER_TRUNC)) { value = NAMED_FIELD_TRUNCATED_VALUE_HEAD; } // TODO: Add annotation for TRUNCATED_VALUE_UNSPECIFIED if (value != null) { headers.addLabelValue(HEADER_KEY_TRUNCATED, value); } if (curi.getContentDigest() != null) { headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST, curi.getContentDigestSchemeString()); } headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>'); rid = writeResource(w, timestamp, curi.getContentType(), baseid, curi, headers); } } if (getWriteMetadata()) { headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>'); writeMetadata(w, timestamp, baseid, curi, headers); } } protected URI writeFtpControlConversation(WARCWriter w, String timestamp, URI baseid, CrawlURI curi, ANVLRecord headers, String controlConversation) throws IOException { WARCRecordInfo recordInfo = new WARCRecordInfo(); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setUrl(curi.toString()); recordInfo.setMimetype(FTP_CONTROL_CONVERSATION_MIMETYPE); recordInfo.setExtraHeaders(headers); recordInfo.setEnforceLength(true); recordInfo.setType(WARCRecordType.metadata); recordInfo.setRecordId(qualifyRecordID(baseid, TYPE, WARCRecordType.metadata.toString())); byte[] b = controlConversation.getBytes("UTF-8"); recordInfo.setContentStream(new ByteArrayInputStream(b)); recordInfo.setContentLength((long) b.length); w.writeRecord(recordInfo); return recordInfo.getRecordId(); } protected URI writeRequest(final WARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { WARCRecordInfo recordInfo = new WARCRecordInfo(); recordInfo.setType(WARCRecordType.request); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setMimetype(mimetype); recordInfo.setExtraHeaders(namedFields); recordInfo.setContentLength(curi.getRecorder().getRecordedOutput().getSize()); recordInfo.setEnforceLength(true); final URI uid = qualifyRecordID(baseid, TYPE, WARCRecordType.request.toString()); recordInfo.setRecordId(uid); ReplayInputStream ris = curi.getRecorder().getRecordedOutput().getReplayInputStream(); recordInfo.setContentStream(ris); try { w.writeRecord(recordInfo); } finally { IOUtils.closeQuietly(ris); } return recordInfo.getRecordId(); } protected URI writeResponse(final WARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord suppliedFields) throws IOException { ANVLRecord namedFields = suppliedFields; if (curi.getData().containsKey(A_WARC_RESPONSE_HEADERS)) { namedFields = namedFields.clone(); for (Object headerObj : curi.getDataList(A_WARC_RESPONSE_HEADERS)) { String[] kv = StringUtils.split(((String) headerObj), ":", 2); namedFields.addLabelValue(kv[0].trim(), kv[1].trim()); } } WARCRecordInfo recordInfo = new WARCRecordInfo(); recordInfo.setType(WARCRecordType.response); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setMimetype(mimetype); recordInfo.setRecordId(baseid); recordInfo.setExtraHeaders(namedFields); recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize()); recordInfo.setEnforceLength(true); ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream(); recordInfo.setContentStream(ris); try { w.writeRecord(recordInfo); } finally { IOUtils.closeQuietly(ris); } return recordInfo.getRecordId(); } protected URI writeResource(final WARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { WARCRecordInfo recordInfo = new WARCRecordInfo(); recordInfo.setType(WARCRecordType.resource); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setMimetype(mimetype); recordInfo.setRecordId(baseid); recordInfo.setExtraHeaders(namedFields); recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize()); recordInfo.setEnforceLength(true); ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream(); recordInfo.setContentStream(ris); try { w.writeRecord(recordInfo); } finally { IOUtils.closeQuietly(ris); } return recordInfo.getRecordId(); } protected URI writeRevisitDigest(final WARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { long revisedLength = curi.getRecorder().getRecordedInput().getContentBegin(); revisedLength = revisedLength > 0 ? revisedLength : curi.getRecorder().getRecordedInput().getSize(); return writeRevisitDigest(w, timestamp, mimetype, baseid, curi, namedFields, revisedLength); } protected URI writeRevisitDigest(final WARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields, long contentLength) throws IOException { WARCRecordInfo recordInfo = new WARCRecordInfo(); recordInfo.setType(WARCRecordType.revisit); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setMimetype(mimetype); recordInfo.setRecordId(baseid); recordInfo.setContentLength(contentLength); recordInfo.setEnforceLength(false); namedFields.addLabelValue(HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST); namedFields.addLabelValue(HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH); recordInfo.setExtraHeaders(namedFields); ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream(); recordInfo.setContentStream(ris); try { w.writeRecord(recordInfo); } finally { IOUtils.closeQuietly(ris); } curi.getAnnotations().add("warcRevisit:digest"); return recordInfo.getRecordId(); } protected URI writeRevisitUriAgnosticDigest(WARCWriter w, String timestamp, String mimetype, URI baseid, CrawlURI curi, ANVLRecord headers) throws IOException { WARCRecordInfo recordInfo = new WARCRecordInfo(); recordInfo.setType(WARCRecordType.revisit); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setMimetype(mimetype); recordInfo.setRecordId(baseid); recordInfo.setEnforceLength(false); long revisedLength = curi.getRecorder().getRecordedInput().getContentBegin(); revisedLength = revisedLength > 0 ? revisedLength : curi.getRecorder().getRecordedInput().getSize(); recordInfo.setContentLength(revisedLength); headers.addLabelValue(HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST); headers.addLabelValue(HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH); /* * ISO 28500 WARC ISO standard draft says: "The WARC-Refers-To field may * also be used to associate a record of type 'revisit' or 'conversion' * with the preceding record which helped determine the present record * content." */ headers.addLabelValue(HEADER_KEY_REFERS_TO, "<" + curi.getContentDigestHistory().get(A_WARC_RECORD_ID) + ">"); headers.addLabelValue(HEADER_KEY_REFERS_TO_TARGET_URI, curi.getContentDigestHistory().get(A_ORIGINAL_URL).toString()); headers.addLabelValue(HEADER_KEY_REFERS_TO_DATE, curi.getContentDigestHistory().get(A_ORIGINAL_DATE).toString()); recordInfo.setExtraHeaders(headers); ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream(); recordInfo.setContentStream(ris); try { w.writeRecord(recordInfo); } finally { IOUtils.closeQuietly(ris); } curi.getAnnotations().add("warcRevisit:digest"); return recordInfo.getRecordId(); } protected URI writeRevisitNotModified(final WARCWriter w, final String timestamp, final URI baseid, final CrawlURI puri, final ANVLRecord namedFields) throws IOException { CrawlURI curi = (CrawlURI) puri; WARCRecordInfo recordInfo = new WARCRecordInfo(); recordInfo.setType(WARCRecordType.revisit); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setMimetype(null); recordInfo.setRecordId(baseid); recordInfo.setContentLength((long) 0); recordInfo.setEnforceLength(false); namedFields.addLabelValue(HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED); // save just enough context to understand basis of not-modified recordInfo.setExtraHeaders(namedFields); if (curi.isHttpTransaction()) { HttpMethod method = curi.getHttpMethod(); saveHeader(A_ETAG_HEADER, method, namedFields, HEADER_KEY_ETAG); saveHeader(A_LAST_MODIFIED_HEADER, method, namedFields, HEADER_KEY_LAST_MODIFIED); } // truncate to zero-length (all necessary info is above) namedFields.addLabelValue(HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH); ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream(); recordInfo.setContentStream(ris); try { w.writeRecord(recordInfo); } finally { IOUtils.closeQuietly(ris); } curi.getAnnotations().add("warcRevisit:notModified"); return recordInfo.getRecordId(); } /** * Save a header from the given HTTP operation into the * provider headers under a new name * * @param origName header name to get if present * @param method http operation containing headers */ protected void saveHeader(String origName, HttpMethod method, ANVLRecord headers, String newName) { Header header = method.getResponseHeader(origName); if (header != null) { headers.addLabelValue(newName, header.getValue()); } } protected URI writeMetadata(final WARCWriter w, final String timestamp, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { WARCRecordInfo recordInfo = new WARCRecordInfo(); recordInfo.setType(WARCRecordType.metadata); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setMimetype(ANVLRecord.MIMETYPE); recordInfo.setExtraHeaders(namedFields); recordInfo.setEnforceLength(true); recordInfo.setRecordId(qualifyRecordID(baseid, TYPE, WARCRecordType.metadata.toString())); // Get some metadata from the curi. // TODO: Get all curi metadata. // TODO: Use other than ANVL (or rename ANVL as NameValue or use // RFC822 (commons-httpclient?). ANVLRecord r = new ANVLRecord(); if (curi.isSeed()) { r.addLabel("seed"); } else { if (curi.forceFetch()) { r.addLabel("force-fetch"); } if (StringUtils.isNotBlank(flattenVia(curi))) { r.addLabelValue("via", flattenVia(curi)); } if (StringUtils.isNotBlank(curi.getPathFromSeed())) { r.addLabelValue("hopsFromSeed", curi.getPathFromSeed()); } if (curi.containsDataKey(A_SOURCE_TAG)) { r.addLabelValue("sourceTag", (String) curi.getData().get(A_SOURCE_TAG)); } } long duration = curi.getFetchCompletedTime() - curi.getFetchBeginTime(); if (duration > -1) { r.addLabelValue("fetchTimeMs", Long.toString(duration)); } if (curi.getData().containsKey(A_FTP_FETCH_STATUS)) { r.addLabelValue("ftpFetchStatus", curi.getData().get(A_FTP_FETCH_STATUS).toString()); } if (curi.getRecorder() != null && curi.getRecorder().getCharset() != null) { r.addLabelValue("charsetForLinkExtraction", curi.getRecorder().getCharset().name()); } for (String annotation : curi.getAnnotations()) { if (annotation.startsWith("usingCharsetIn") || annotation.startsWith("inconsistentCharsetIn")) { String[] kv = annotation.split(":", 2); r.addLabelValue(kv[0], kv[1]); } } // Add outlinks though they are effectively useless without anchor text. Collection<Link> links = curi.getOutLinks(); if (links != null && links.size() > 0) { for (Link link : links) { r.addLabelValue("outlink", link.toString()); } } // TODO: Other curi fields to write to metadata. // // Credentials // // fetch-began-time: 1154569278774 // fetch-completed-time: 1154569281816 // // Annotations. byte[] b = r.getUTF8Bytes(); recordInfo.setContentStream(new ByteArrayInputStream(b)); recordInfo.setContentLength((long) b.length); w.writeRecord(recordInfo); return recordInfo.getRecordId(); } protected URI getRecordID() throws IOException { return generator.getRecordID(); } protected URI qualifyRecordID(final URI base, final String key, final String value) throws IOException { Map<String, String> qualifiers = new HashMap<String, String>(1); qualifiers.put(key, value); return generator.qualifyRecordID(base, qualifiers); } public List<String> getMetadata() { if (cachedMetadata != null) { return cachedMetadata; } ANVLRecord record = new ANVLRecord(); record.addLabelValue("software", "Heritrix/" + ArchiveUtils.VERSION + " http://crawler.archive.org"); try { InetAddress host = InetAddress.getLocalHost(); record.addLabelValue("ip", host.getHostAddress()); record.addLabelValue("hostname", host.getCanonicalHostName()); } catch (UnknownHostException e) { logger.log(Level.WARNING, "unable top obtain local crawl engine host", e); } // conforms to ISO 28500:2009 as of May 2009 // as described at http://bibnum.bnf.fr/WARC/ // latest draft as of November 2008 record.addLabelValue("format", "WARC File Format 1.0"); record.addLabelValue("conformsTo", "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"); // Get other values from metadata provider CrawlMetadata provider = getMetadataProvider(); addIfNotBlank(record, "operator", provider.getOperator()); addIfNotBlank(record, "publisher", provider.getOrganization()); addIfNotBlank(record, "audience", provider.getAudience()); addIfNotBlank(record, "isPartOf", provider.getJobName()); // TODO: make date match 'job creation date' as in Heritrix 1.x // until then, leave out (plenty of dates already in WARC // records // String rawDate = provider.getBeginDate(); // if(StringUtils.isNotBlank(rawDate)) { // Date date; // try { // date = ArchiveUtils.parse14DigitDate(rawDate); // addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date)); // } catch (ParseException e) { // logger.log(Level.WARNING,"obtaining warc created date",e); // } // } addIfNotBlank(record, "description", provider.getDescription()); addIfNotBlank(record, "robots", provider.getRobotsPolicyName().toLowerCase()); addIfNotBlank(record, "http-header-user-agent", provider.getUserAgent()); addIfNotBlank(record, "http-header-from", provider.getOperatorFrom()); // really ugly to return as List<String>, but changing would require // larger refactoring return Collections.singletonList(record.toString()); } protected void addIfNotBlank(ANVLRecord record, String label, String value) { if (StringUtils.isNotBlank(value)) { record.addLabelValue(label, value); } } @Override protected JSONObject toCheckpointJson() throws JSONException { JSONObject json = super.toCheckpointJson(); json.put("urlsWritten", urlsWritten); json.put("stats", stats); return json; } @Override protected void fromCheckpointJson(JSONObject json) throws JSONException { super.fromCheckpointJson(json); // conditionals below are for backward compatibility with old checkpoints if (json.has("urlsWritten")) { urlsWritten.set(json.getLong("urlsWritten")); } if (json.has("stats")) { HashMap<String, Map<String, Long>> cpStats = new HashMap<String, Map<String, Long>>(); JSONObject jsonStats = json.getJSONObject("stats"); if (JSONObject.getNames(jsonStats) != null) { for (String key1 : JSONObject.getNames(jsonStats)) { JSONObject jsonSubstats = jsonStats.getJSONObject(key1); if (!cpStats.containsKey(key1)) { cpStats.put(key1, new HashMap<String, Long>()); } Map<String, Long> substats = cpStats.get(key1); for (String key2 : JSONObject.getNames(jsonSubstats)) { long value = jsonSubstats.getLong(key2); substats.put(key2, value); } } addStats(cpStats); } } } @Override public String report() { // XXX note in report that stats include recovered checkpoint? logger.info("final stats: " + stats); StringBuilder buf = new StringBuilder(); buf.append("Processor: " + getClass().getName() + "\n"); buf.append(" Function: Writes WARCs\n"); buf.append(" Total CrawlURIs: " + urlsWritten + "\n"); buf.append(" Revisit records: " + WARCWriter.getStat(stats, WARCRecordType.revisit.toString(), WARCWriter.NUM_RECORDS) + "\n"); long bytes = WARCWriter.getStat(stats, WARCRecordType.response.toString(), WARCWriter.CONTENT_BYTES) + WARCWriter.getStat(stats, WARCRecordType.resource.toString(), WARCWriter.CONTENT_BYTES); buf.append(" Crawled content bytes (including http headers): " + bytes + " (" + ArchiveUtils.formatBytesForDisplay(bytes) + ")\n"); bytes = WARCWriter.getStat(stats, WARCWriter.TOTALS, WARCWriter.TOTAL_BYTES); buf.append(" Total uncompressed bytes (including all warc records): " + bytes + " (" + ArchiveUtils.formatBytesForDisplay(bytes) + ")\n"); buf.append(" Total size on disk (" + (getCompress() ? "compressed" : "uncompressed") + "): " + getTotalBytesWritten() + " (" + ArchiveUtils.formatBytesForDisplay(getTotalBytesWritten()) + ")\n"); return buf.toString(); } }