Java tutorial
/* Created on 2006-okt-03 * * Copyright (C) 2006 National Library of Sweden. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package org.archive.modules.writer; import static org.archive.modules.writer.Kw3Constants.ARCHIVE_TIME_KEY; import static org.archive.modules.writer.Kw3Constants.COLLECTION_KEY; import static org.archive.modules.writer.Kw3Constants.CONTENT_LENGTH_KEY; import static org.archive.modules.writer.Kw3Constants.CONTENT_MD5_KEY; import static org.archive.modules.writer.Kw3Constants.HARVESTER_KEY; import static org.archive.modules.writer.Kw3Constants.HEADER_LENGTH_KEY; import static org.archive.modules.writer.Kw3Constants.HEADER_MD5_KEY; import static org.archive.modules.writer.Kw3Constants.IP_ADDRESS_KEY; import static org.archive.modules.writer.Kw3Constants.STATUS_CODE_KEY; import static org.archive.modules.writer.Kw3Constants.URL_KEY; import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.net.InetAddress; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.logging.Level; import java.util.logging.Logger; import org.archive.io.ReplayInputStream; import org.archive.modules.CrawlURI; import org.archive.modules.Processor; import org.archive.modules.net.CrawlHost; import org.archive.modules.net.ServerCache; import org.archive.spring.ConfigPath; import org.archive.util.FileUtils; import org.springframework.beans.factory.annotation.Autowired; /** * Processor module that writes the results of successful fetches to * files on disk. These files are MIME-files of the type used by the * Swedish National Library's Kulturarw3 web harvesting [http://www.kb.se/kw3/]. * * Each URI gets written to its own file and has a path consisting of: * <ul> * <li> A dir named with the first two chars of the website's md5. </li> * <li> A dir named after the website. </li> * <li> 'current' - a dir indicating that this is the directory being written * to by the ongoing crawl. </li> * <li> A file on the format <md5 of url>.<fetchtime in seconds> </li> * </ul> * Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837' * * The MIME-file itself consists of three parts: * <ul> * <li> 1. ArchiveInfo - Metadata about the file and its content. </li> * <li> 2. Header - The HTTP response header. </li> * <li> 3. Content - The HTTP response content, plus content-type. </li> * </ul> * * @author oskar */ public class Kw3WriterProcessor extends Processor { @SuppressWarnings("unused") private static final long serialVersionUID = 3L; private static String COLON = ":"; private static String WS = " "; private static String LF = "\n"; /** * Logger. */ private static final Logger logger = Logger.getLogger(Kw3WriterProcessor.class.getName()); /** * Top-level directory for archive files. */ protected ConfigPath path = new ConfigPath("Kw3Writer subdirectory", "arcs"); public ConfigPath getPath() { return this.path; } public void setPath(ConfigPath s) { this.path = s; } /** * Max size for each file. */ protected long maxFileSizeBytes = 100000000L; public long getMaxFileSizeBytes() { return maxFileSizeBytes; } public void setMaxFileSizeBytes(long maxFileSizeBytes) { this.maxFileSizeBytes = maxFileSizeBytes; } /** * Should permissions be changed for the newly created dirs. */ protected boolean chmod = false; public boolean getChmod() { return chmod; } public void setChmod(boolean chmod) { this.chmod = chmod; } /** * What should the permissions be set to. Given as three octal digits, as to * the UNIX 'chmod' command. Ex. 777 for all permissions to everyone. */ protected String chmodValue = "777"; public String getChmodValue() { return this.chmodValue; } public void setChmodValue(String s) { this.chmodValue = s; } /** * Max size for each file.Key for the maximum ARC bytes to write attribute. */ public static final String ATTR_MAX_BYTES_WRITTEN = "total-bytes-to-write"; /** * Name of collection. */ protected String collection = "kw3"; public String getCollection() { return this.collection; } public void setCollection(String s) { this.collection = s; } /** * Name of the harvester that is used for the web harvesting. */ protected String harvester = "heritrix"; public String getHarvester() { return this.harvester; } public void setHarvester(String s) { this.harvester = s; } /** * The server cache to use. */ protected ServerCache serverCache; public ServerCache getServerCache() { return this.serverCache; } @Autowired public void setServerCache(ServerCache serverCache) { this.serverCache = serverCache; } private static String BOUNDARY_START = "KulturArw3_"; /** * Constructor. */ public Kw3WriterProcessor() { } protected boolean shouldProcess(CrawlURI curi) { // Only successful fetches are written. if (!isSuccess(curi)) { return false; } // Only http and https schemes are supported. String scheme = curi.getUURI().getScheme().toLowerCase(); if (!"http".equalsIgnoreCase(scheme) && !"https".equalsIgnoreCase(scheme)) { return false; } return true; } protected void innerProcess(CrawlURI curi) { // Write the MIME-file try { writeMimeFile(curi); } catch (IOException e) { logger.log(Level.WARNING, "i/o error", e); } } /** * The actual writing of the Kulturarw3 MIME-file. * * The MIME-file consists of three parts: * 1. ArchiveInfo - Metadata about the file and its content. * 2. Header - The HTTP response header. * 3. Content - The HTTP response content, plus content-type. * * For more on this format, see '?'. */ protected void writeMimeFile(CrawlURI curi) throws IOException { ReplayInputStream ris = null; OutputStream out = null; try { String boundary = BOUNDARY_START + stringToMD5(curi.toString()); ris = curi.getRecorder().getRecordedInput().getReplayInputStream(); out = initOutputStream(curi); // Part 1: Archive info writeArchiveInfoPart(boundary, curi, ris, out); // Part 2: Header info + HTTP header writeHeaderPart(boundary, ris, out); // Part 3: Content info + HTTP content writeContentPart(boundary, curi, ris, out); // And finally the terminator string String terminator = "\n--" + boundary + "--\n"; out.write(terminator.getBytes()); } finally { if (ris != null) ris.close(); if (out != null) out.close(); } } /** * Get the OutputStream for the file to write to. * * It has a path consisting of: * 1. A dir named with the first two chars of the website's md5. * 2. A dir named after the website. * 3. 'current' - a dir indicating that this is the directory being written * to by the ongoing crawl. * 4. A file on the format <md5 of url>.<fetchtime in seconds> * * Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837' */ protected OutputStream initOutputStream(CrawlURI curi) throws IOException { String uri = curi.toString(); int port = curi.getUURI().getPort(); String host = (port == 80 || port <= 0) ? curi.getUURI().getHost() : curi.getUURI().getHost() + ":" + port; long fetchTime = curi.getFetchBeginTime() / 1000; String md5 = stringToMD5(host); File dir = new File(getPath().getFile(), md5.substring(0, 2) + "/" + host + "/current"); if (!dir.exists()) { FileUtils.ensureWriteableDirectory(dir); if (this.chmod) chmods(dir, getPath().getFile()); } md5 = stringToMD5(uri); File arcFile = new File(dir, md5 + "." + fetchTime); return new FastBufferedOutputStream(new FileOutputStream(arcFile)); } protected void writeArchiveInfoPart(String boundary, CrawlURI curi, ReplayInputStream ris, OutputStream out) throws IOException { // Get things we need to write in this part String uri = curi.toString(); String ip = getHostAddress(curi); long headerLength = ris.getHeaderSize(); long contentLength = ris.getContentSize(); long archiveTime = System.currentTimeMillis() / 1000; // Fetchtime in seconds int statusCode = curi.getFetchStatus(); String headerMd5 = null; Object contentMd5 = null; // Get headerMd5 ByteArrayOutputStream baos = new ByteArrayOutputStream(); ris.readHeaderTo(baos); headerMd5 = stringToMD5(baos.toString()); // Get contentMd5 contentMd5 = curi.getContentDigest(); if (contentMd5 != null) contentMd5 = getHexString((byte[]) contentMd5); StringBuffer buffer = new StringBuffer(); buffer.append("MIME-version: 1.1" + LF); buffer.append("Content-Type: multipart/mixed; boundary=" + boundary + LF); buffer.append("HTTP-Part: ArchiveInfo" + LF); buffer.append(COLLECTION_KEY + COLON + WS + this.collection + LF); buffer.append(HARVESTER_KEY + COLON + WS + this.harvester + LF); buffer.append(URL_KEY + COLON + WS + uri + LF); buffer.append(IP_ADDRESS_KEY + COLON + WS + ip + LF); buffer.append(HEADER_LENGTH_KEY + COLON + WS + headerLength + LF); buffer.append(HEADER_MD5_KEY + COLON + WS + headerMd5 + LF); buffer.append(CONTENT_LENGTH_KEY + COLON + WS + contentLength + LF); buffer.append(CONTENT_MD5_KEY + COLON + WS + contentMd5 + LF); buffer.append(ARCHIVE_TIME_KEY + COLON + WS + archiveTime + LF); buffer.append(STATUS_CODE_KEY + COLON + WS + statusCode + LF + LF); out.write(buffer.toString().getBytes()); } protected void writeHeaderPart(String boundary, ReplayInputStream ris, OutputStream out) throws IOException { StringBuffer buffer = new StringBuffer(); buffer.append("--" + boundary + LF); buffer.append("Content-Type: text/plain; charset=\"US-ascii\"" + LF); buffer.append("HTTP-Part: Header" + LF + LF); out.write(buffer.toString().getBytes()); ris.readHeaderTo(out); } protected void writeContentPart(String boundary, CrawlURI curi, ReplayInputStream ris, OutputStream out) throws IOException { // Get things we need to write in this part String uri = curi.toString(); String contentType = curi.getContentType(); long contentLength = ris.getContentSize(); // Only write content if there is some if (contentLength == 0) return; StringBuffer buffer = new StringBuffer(); buffer.append("--" + boundary + LF); buffer.append("Content-Type: " + contentType + LF); buffer.append("HTTP-Part: Content" + LF + LF); out.write(buffer.toString().getBytes()); if (contentLength > getMaxFileSizeBytes()) { ris.readContentTo(out, getMaxFileSizeBytes()); logger.info(" Truncated url: " + uri + ", Size: " + contentLength + ", Content-type: " + contentType); } else { ris.readContentTo(out); } } // --- Private helper functions --- // /* * Get a MD5 checksum based on a String. */ private String stringToMD5(String str) { try { byte b[] = str.getBytes(); MessageDigest md = MessageDigest.getInstance("MD5"); md.update(b); byte[] digest = md.digest(); return getHexString(digest); } catch (NoSuchAlgorithmException e) { logger.log(Level.WARNING, "md5 error", e); } return null; } /* * Fast convert a byte array to a hex string with possible leading zero. */ private String getHexString(byte[] b) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < b.length; i++) { String tmp = Integer.toHexString(b[i] & 0xff); if (tmp.length() < 2) sb.append("0" + tmp); else sb.append(tmp); } return sb.toString(); } /* * Chmods for all newly created directories. */ private void chmods(File dir, File arcsDir) { String topdir = arcsDir.getAbsolutePath(); chmod(dir, this.chmodValue); File parent = dir.getParentFile(); while (!parent.getAbsolutePath().equalsIgnoreCase((topdir))) { chmod(parent, this.chmodValue); parent = parent.getParentFile(); } } /* * Chmod for a specific file or directory. */ private void chmod(File file, String permissions) { Process proc = null; try { proc = Runtime.getRuntime().exec("chmod " + permissions + " " + file.getAbsolutePath()); proc.waitFor(); proc.getInputStream().close(); proc.getOutputStream().close(); proc.getErrorStream().close(); } catch (IOException e) { logger.log(Level.WARNING, "chmod failed", e); } catch (InterruptedException e) { logger.log(Level.WARNING, "chmod failed", e); } } private String getHostAddress(CrawlURI curi) { CrawlHost h = serverCache.getHostFor(curi.getUURI()); if (h == null) { throw new NullPointerException("Crawlhost is null for " + curi + " " + curi.getVia()); } InetAddress a = h.getIP(); if (a == null) { throw new NullPointerException("Address is null for " + curi + " " + curi.getVia() + ". Address " + ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP) ? "was never looked up." : (System.currentTimeMillis() - h.getIpFetched()) + " ms ago.")); } return h.getIP().getHostAddress(); } }