// NOTE(review): stray non-Java text ("Java tutorial") commented out — it broke compilation.
/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules.writer; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WRITE_TAG; import java.io.IOException; import java.io.InputStream; import java.net.InetAddress; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.lang.StringUtils; import org.archive.io.ArchiveFileConstants; import org.archive.io.ReplayInputStream; import org.archive.io.WriterPoolMember; import org.archive.io.arc.ARCWriter; import org.archive.io.arc.ARCWriterPool; import org.archive.modules.CrawlURI; import org.archive.modules.ProcessResult; import org.archive.spring.ConfigPath; import org.archive.util.ArchiveUtils; /** * Processor module for writing the results of successful fetches (and * perhaps someday, certain kinds of network failures) to the Internet Archive * ARC file format. * * Assumption is that there is only one of these ARCWriterProcessors per * Heritrix instance. 
* * @author Parker Thompson */ public class ARCWriterProcessor extends WriterPoolProcessor { final static private String METADATA_TEMPLATE = readMetadataTemplate(); @SuppressWarnings("unused") private static final long serialVersionUID = 3L; private static final Logger logger = Logger.getLogger(ARCWriterProcessor.class.getName()); public long getDefaultMaxFileSize() { return 100000000L; // 100 SI mega-bytes (10^8 bytes) } public List<ConfigPath> getDefaultStorePaths() { List<ConfigPath> paths = new ArrayList<ConfigPath>(); paths.add(new ConfigPath("arcs default store path", "arcs")); return paths; } private transient List<String> cachedMetadata; public ARCWriterProcessor() { } @Override protected void setupPool(AtomicInteger serialNo) { setPool(new ARCWriterPool(serialNo, this, getPoolMaxActive(), getMaxWaitForIdleMs())); } /** * Writes a CrawlURI and its associated data to store file. * * Currently this method understands the following uri types: dns, http, * and https. * * @param curi CrawlURI to process. */ protected ProcessResult innerProcessResult(CrawlURI puri) { CrawlURI curi = (CrawlURI) puri; long recordLength = getRecordedSize(curi); ReplayInputStream ris = null; try { if (shouldWrite(curi)) { ris = curi.getRecorder().getRecordedInput().getReplayInputStream(); return write(curi, recordLength, ris, getHostAddress(curi)); } else { logger.info("does not write " + curi.toString()); copyForwardWriteTagIfDupe(curi); } } catch (IOException e) { curi.getNonFatalFailures().add(e); logger.log(Level.SEVERE, "Failed write of Record: " + curi.toString(), e); } finally { IOUtils.closeQuietly(ris); } return ProcessResult.PROCEED; } protected ProcessResult write(CrawlURI curi, long recordLength, InputStream in, String ip) throws IOException { WriterPoolMember writer = getPool().borrowFile(); long position = writer.getPosition(); // See if we need to open a new file because we've exceeded maxBytes. 
// Call to checkFileSize will open new file if we're at maximum for // current file. writer.checkSize(); if (writer.getPosition() != position) { // We just closed the file because it was larger than maxBytes. // Add to the totalBytesWritten the size of the first record // in the file, if any. setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - position)); position = writer.getPosition(); } ARCWriter w = (ARCWriter) writer; try { if (in instanceof ReplayInputStream) { w.write(curi.toString(), curi.getContentType(), ip, curi.getFetchBeginTime(), recordLength, (ReplayInputStream) in); } else { w.write(curi.toString(), curi.getContentType(), ip, curi.getFetchBeginTime(), recordLength, in); } } catch (IOException e) { // Invalidate this file (It gets a '.invalid' suffix). getPool().invalidateFile(writer); // Set the writer to null otherwise the pool accounting // of how many active writers gets skewed if we subsequently // do a returnWriter call on this object in the finally block. 
writer = null; throw e; } finally { if (writer != null) { setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - position)); getPool().returnFile(writer); String filename = writer.getFile().getName(); if (filename.endsWith(ArchiveFileConstants.OCCUPIED_SUFFIX)) { filename = filename.substring(0, filename.length() - ArchiveFileConstants.OCCUPIED_SUFFIX.length()); } curi.addExtraInfo("arcFilename", filename); Map<String, Object>[] history = curi.getFetchHistory(); if (history != null && history[0] != null) { history[0].put(A_WRITE_TAG, filename); } } } return checkBytesWritten(); } public List<String> getMetadata() { if (METADATA_TEMPLATE == null) { return null; } if (cachedMetadata != null) { return cachedMetadata; } String meta = METADATA_TEMPLATE; meta = replace(meta, "${VERSION}", ArchiveUtils.VERSION); meta = replace(meta, "${HOST}", getHostName()); meta = replace(meta, "${IP}", getHostAddress()); if (meta != null) { meta = replace(meta, "${JOB_NAME}", getMetadataProvider().getJobName()); meta = replace(meta, "${DESCRIPTION}", getMetadataProvider().getDescription()); meta = replace(meta, "${OPERATOR}", getMetadataProvider().getOperator()); // TODO: fix this to match job-start-date (from UI or operator setting) // in the meantime, don't include a slightly-off date // meta = replace(meta, "${DATE}", GMT()); meta = replace(meta, "${USER_AGENT}", getMetadataProvider().getUserAgent()); meta = replace(meta, "${FROM}", getMetadataProvider().getOperatorFrom()); meta = replace(meta, "${ROBOTS}", getMetadataProvider().getRobotsPolicyName()); } this.cachedMetadata = Collections.singletonList(meta); return this.cachedMetadata; // ${VERSION} // ${HOST} // ${IP} // ${JOB_NAME} // ${DESCRIPTION} // ${OPERATOR} // ${DATE} // ${USER_AGENT} // ${FROM} // ${ROBOTS} } private static String replace(String meta, String find, String replace) { replace = StringUtils.defaultString(replace); replace = StringEscapeUtils.escapeXml(replace); return meta.replace(find, 
replace); } private static String getHostName() { try { return InetAddress.getLocalHost().getCanonicalHostName(); } catch (UnknownHostException e) { logger.log(Level.SEVERE, "Could not get local host name.", e); return "localhost"; } } private static String getHostAddress() { try { return InetAddress.getLocalHost().getHostAddress(); } catch (UnknownHostException e) { logger.log(Level.SEVERE, "Could not get local host address.", e); return "localhost"; } } private static String readMetadataTemplate() { InputStream input = ARCWriterProcessor.class.getResourceAsStream("arc_metadata_template.xml"); if (input == null) { logger.severe("No metadata template."); return null; } try { return IOUtils.toString(input); } catch (IOException e) { throw new IllegalStateException(e); } finally { IOUtils.closeQuietly(input); } } }