Java tutorial
/* File: $Id$ * Revision: $Revision$ * Author: $Author$ * Date: $Date$ * * The Netarchive Suite - Software to harvest and preserve websites * Copyright 2004-2012 The Royal Danish Library, the Danish State and * University Library, the National Library of France and the Austrian * National Library. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ package dk.netarkivet.harvester.harvesting.controller; import dk.netarkivet.common.CommonSettings; import dk.netarkivet.common.exceptions.ArgumentNotValid; import dk.netarkivet.common.exceptions.IOFailure; import dk.netarkivet.common.utils.*; import dk.netarkivet.harvester.HarvesterSettings; import dk.netarkivet.harvester.harvesting.HeritrixFiles; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.archive.crawler.Heritrix; import java.io.*; import java.util.*; /** * Abstract base class for JMX-based Heritrix controllers. */ public abstract class AbstractJMXHeritrixController implements HeritrixController { /** The logger for this class. */ private static final Log log = LogFactory.getLog(AbstractJMXHeritrixController.class); /** File path Separator. Used to separate the jar-files in the classpath. */ private static final String FILE_PATH_SEPARATOR = ":"; /** * How long we're willing to wait for Heritrix to shutdown in a shutdown * hook. */ private static final long SHUTDOWN_HOOK_MAX_WAIT = 1000L; /** The various files used by Heritrix. */ private final HeritrixFiles files; /** * The threads used to collect process output. Only one thread used * presently. */ private Set<Thread> collectionThreads = new HashSet<Thread>(1); /** * The host name for this machine that matches what Heritrix uses in its * MBean names. */ private final String hostName; /** * The port to use for Heritrix JMX, as set in settings.xml. */ private final int jmxPort = Settings.getInt(HarvesterSettings.HERITRIX_JMX_PORT); /** * The port to use for Heritrix GUI, as set in settings.xml. */ private final int guiPort = Settings.getInt(HarvesterSettings.HERITRIX_GUI_PORT); /** * The shutdownHook that takes care of killing our process. This is removed * in cleanup() when the process is shut down. */ private Thread processKillerHook; /** * The one-shot Heritrix process created in the constructor. It will only * perform a single crawl before being shut down. */ private final Process heritrixProcess; /** * Create a BnfHeritrixController object. * * @param files * Files that are used to set up Heritrix. */ public AbstractJMXHeritrixController(HeritrixFiles files) { ArgumentNotValid.checkNotNull(files, "HeritrixFile files"); this.files = files; SystemUtils.checkPortNotUsed(guiPort); SystemUtils.checkPortNotUsed(jmxPort); hostName = SystemUtils.getLocalHostName(); try { log.info("Starting Heritrix for " + this); /* * To start Heritrix, we need to do the following (taken from the * Heritrix startup shell script): - set heritrix.home to base dir * of Heritrix stuff - set com.sun.management.jmxremote.port to JMX * port - set com.sun.management.jmxremote.ssl to false - set * com.sun.management.jmxremote.password.file to JMX password file - * set heritrix.out to heritrix_out.log - set * java.protocol.handler.pkgs=org.archive.net - send processOutput & * stderr into heritrix.out - let the Heritrix GUI-webserver listen * on all available network interfaces: This is done with argument * "--bind /" (default is 127.0.0.1) - listen on a specific port * using the port argument: --port <GUI port> * * We also need to output something like the following to * heritrix.out: `date Starting heritrix uname -a java -version * JAVA_OPTS ulimit -a */ File heritrixOutputFile = files.getHeritrixOutput(); StringBuilder settingProperty = new StringBuilder(); for (File file : Settings.getSettingsFiles()) { settingProperty.append(File.pathSeparator); String absolutePath = file.getAbsolutePath(); // check that the settings files not only exist but // are readable boolean readable = new File(absolutePath).canRead(); if (!readable) { final String errMsg = "The file '" + absolutePath + "' is missing. "; log.warn(errMsg); throw new IOFailure("Failed to read file '" + absolutePath + "'"); } settingProperty.append(absolutePath); } if (settingProperty.length() > 0) { // delete last path-separator settingProperty.deleteCharAt(0); } List<String> allOpts = new LinkedList<String>(); allOpts.add(new File(new File(System.getProperty("java.home"), "bin"), "java").getAbsolutePath()); allOpts.add("-Xmx" + Settings.get(HarvesterSettings.HERITRIX_HEAP_SIZE)); allOpts.add("-Dheritrix.home=" + files.getCrawlDir().getAbsolutePath()); String jvmOptsStr = Settings.get(HarvesterSettings.HERITRIX_JVM_OPTS); if ((jvmOptsStr != null) && (!jvmOptsStr.isEmpty())) { String[] add = jvmOptsStr.split(" "); allOpts.addAll(Arrays.asList(add)); } allOpts.add("-Dcom.sun.management.jmxremote.port=" + jmxPort); allOpts.add("-Dcom.sun.management.jmxremote.ssl=false"); // check that JMX password and access files are readable. // TODO This should probably be extracted to a method? File passwordFile = files.getJmxPasswordFile(); String pwAbsolutePath = passwordFile.getAbsolutePath(); if (!passwordFile.canRead()) { final String errMsg = "Failed to read the password file '" + pwAbsolutePath + "'. It is possibly missing."; log.warn(errMsg); throw new IOFailure(errMsg); } File accessFile = files.getJmxAccessFile(); String acAbsolutePath = accessFile.getAbsolutePath(); if (!accessFile.canRead()) { final String errMsg = "Failed to read the access file '" + acAbsolutePath + "'. It is possibly missing."; log.warn(errMsg); throw new IOFailure(errMsg); } allOpts.add("-Dcom.sun.management.jmxremote.password.file=" + new File(pwAbsolutePath)); allOpts.add("-Dcom.sun.management.jmxremote.access.file=" + new File(acAbsolutePath)); allOpts.add("-Dheritrix.out=" + heritrixOutputFile.getAbsolutePath()); allOpts.add("-Djava.protocol.handler.pkgs=org.archive.net"); allOpts.add("-Ddk.netarkivet.settings.file=" + settingProperty); allOpts.add(Heritrix.class.getName()); allOpts.add("--bind"); allOpts.add("/"); allOpts.add("--port=" + guiPort); allOpts.add("--admin=" + getHeritrixAdminName() + ":" + getHeritrixAdminPassword()); String[] args = allOpts.toArray(new String[allOpts.size()]); log.info("Starting Heritrix process with args" + Arrays.toString(args)); log.debug("The JMX timeout is set to " + TimeUtils.readableTimeInterval(JMXUtils.getJmxTimeout())); ProcessBuilder builder = new ProcessBuilder(args); updateEnvironment(builder.environment()); FileUtils.copyDirectory(new File("lib/heritrix"), files.getCrawlDir()); builder.directory(files.getCrawlDir()); builder.redirectErrorStream(true); writeSystemInfo(heritrixOutputFile, builder); FileUtils.appendToFile(heritrixOutputFile, "Working directory: " + files.getCrawlDir()); addProcessKillerHook(); heritrixProcess = builder.start(); ProcessUtils.writeProcessOutput(heritrixProcess.getInputStream(), heritrixOutputFile, collectionThreads); } catch (IOException e) { throw new IOFailure("Error starting Heritrix process", e); } } /** * @return the JMX port for communicating with Heritrix. */ protected int getJmxPort() { return jmxPort; } /** * @return the HTTP port used by the Heritrix GUI. */ protected int getGuiPort() { return guiPort; } /** * @return the Heritrix files wrapper. */ protected HeritrixFiles getHeritrixFiles() { return files; } /** * @return the host name */ protected String getHostName() { return hostName; } /** * Get the login name for accessing the Heritrix GUI. This name can be set * in the settings.xml file. * * @return Name to use for accessing Heritrix web GUI */ private String getHeritrixAdminName() { return Settings.get(HarvesterSettings.HERITRIX_ADMIN_NAME); } /** * Get the login password for accessing the Heritrix GUI. This password can * be set in the settings.xml file. * * @return Password to use for accessing the Heritrix GUI */ private String getHeritrixAdminPassword() { return Settings.get(HarvesterSettings.HERITRIX_ADMIN_PASSWORD); } /** * Change an environment to be suitable for running Heritrix. * * At the moment, this involves the following: * * Prepend the Jar files from the lib/heritrix/lib dir to the classpath. * Make sure the Heritrix jar file is at the front. * * @param environment * The environment from a process builder * @throws IOFailure * If a Heritrix jarfile is not found. */ private static void updateEnvironment(Map<String, String> environment) { List<String> classPathParts = SystemUtils.getCurrentClasspath(); File heritrixLibDir = new File("lib/heritrix/lib"); File[] jars = heritrixLibDir.listFiles(new FilenameFilter() { public boolean accept(File file, String string) { return string.endsWith(".jar"); } }); // Reverse sort the file list in order to add in alphabetical order // before the basic jars. Arrays.sort(jars, new Comparator<File>() { public int compare(File file, File file1) { return file1.compareTo(file); } }); String heritixJar = null; for (File lib : jars) { final String jarPath = new File(heritrixLibDir, lib.getName()).getAbsolutePath(); if (lib.getName().startsWith("heritrix-")) { // Heritrix should be at the very head, as it redefines some // of the functions in its dependencies (!). Thus, we have to // save it for later insertion at the head. heritixJar = jarPath; } else { classPathParts.add(0, jarPath); } } if (heritixJar != null) { classPathParts.add(0, heritixJar); } else { throw new IOFailure("Heritrix jar file not found"); } environment.put("CLASSPATH", StringUtils.conjoin(FILE_PATH_SEPARATOR, classPathParts)); } /** * Write various info on the system we're using into the given file. This * info will later get put into metadata for the crawl. * * @param outputFile * A file to write to. * @param builder * The ProcessBuilder being used to start the Heritrix process */ @SuppressWarnings("unchecked") private void writeSystemInfo(File outputFile, ProcessBuilder builder) { PrintWriter writer = null; try { writer = new PrintWriter(new FileWriter(outputFile)); writer.println("The Heritrix process is started in the following" + " environment\n (note that some entries will be" + " changed by the starting JVM):"); Map<String, String> env = builder.environment(); List<String> keyList = new ArrayList<String>(env.keySet()); Collections.sort(keyList); for (String key : keyList) { writer.println(key + "=" + env.get(key)); } writer.println("Process properties:"); Properties properties = System.getProperties(); keyList = new ArrayList<String>((Set) properties.keySet()); Collections.sort(keyList); for (String key : keyList) { writer.println(key + "=" + properties.get(key)); } } catch (IOException e) { log.warn("Error writing basic properties to output file.", e); } finally { if (writer != null) { writer.close(); } } } /** * Add a shutdown hook that kills the process we've created. Since this hook * will be run only in case of JVM shutdown, it cannot expect that the * standard logging framework is still usable, and therefore writes to * stdout instead. */ private void addProcessKillerHook() { // Make sure that the process gets killed at the very end, at least processKillerHook = new Thread() { public void run() { try { // Only non-blocking way to check for process liveness int exitValue = heritrixProcess.exitValue(); System.out.println("Heritrix process of " + this + " exited with exit code " + exitValue); } catch (IllegalThreadStateException e) { // Process is still alive, kill it. System.out.println("Killing process of " + this); heritrixProcess.destroy(); final Integer exitValue = ProcessUtils.waitFor(heritrixProcess, SHUTDOWN_HOOK_MAX_WAIT); if (exitValue != null) { System.out.println("Process of " + this + " returned exit code " + exitValue); } else { System.out.println("Process of " + this + " never exited!"); } } } }; Runtime.getRuntime().addShutdownHook(processKillerHook); } /** * Get a string that describes the current controller in terms of job ID, * harvest ID, and crawldir. * * @return A human-readable string describing this controller. */ @Override public String toString() { if (heritrixProcess != null) { return "job " + files.getJobID() + " of harvest " + files.getHarvestID() + " in " + files.getCrawlDir() + " running process " + heritrixProcess; } else { return "job " + files.getJobID() + " of harvest " + files.getHarvestID() + " in " + files.getCrawlDir(); } } /** * Return true if the Heritrix process has exited, logging the exit value if * so. * * @return True if the process has exited. */ protected boolean processHasExited() { // First check if the process has exited already try { int exitValue = heritrixProcess.exitValue(); log.info("Process of " + this + " returned exit code " + exitValue); return true; } catch (IllegalThreadStateException e) { // Not exited yet, that's fine } return false; } /** * Waits for the Heritrix process to exit. */ protected void waitForHeritrixProcessExit() { final long maxWait = Settings.getLong(CommonSettings.PROCESS_TIMEOUT); final int maxJmxRetries = JMXUtils.getMaxTries(); Integer exitValue = ProcessUtils.waitFor(heritrixProcess, maxWait); if (exitValue != null) { log.info("Heritrix process of " + this + " exited with exit code " + exitValue); } else { log.warn("Heritrix process of " + this + " not dead after " + maxWait + " millis, killing it"); heritrixProcess.destroy(); exitValue = ProcessUtils.waitFor(heritrixProcess, maxWait); if (exitValue != null) { log.info("Heritrix process of " + this + " exited with exit code " + exitValue); } else { // If it's not dead now, there's little we can do. log.fatal( "Heritrix process of " + this + " not dead after destroy. " + "Exiting harvest controller. " + "Make sure you kill the runaway Heritrix " + "before you restart."); NotificationsFactory.getInstance().notify( "Heritrix process of " + this + " not dead after destroy. " + "Exiting harvest controller. " + "Make sure you kill the runaway Heritrix " + "before you restart.", NotificationType.ERROR); System.exit(1); } } Runtime.getRuntime().removeShutdownHook(processKillerHook); // Wait until all collection threads are dead or until we have // tried JMXUtils.MAX_TRIES times. int attempt = 0; do { boolean anyAlive = false; for (Thread t : collectionThreads) { if (t.isAlive()) { anyAlive = true; } } if (!anyAlive) { break; } TimeUtils.exponentialBackoffSleep(attempt); } while (attempt++ < maxJmxRetries); } /** * Return a human-readable description of the job. This will only be visible * in the Heritrix GUI. * * @return String containing various information grabbed from HeritrixFiles. */ protected String getJobDescription() { String dedupPart = (files.getIndexDir() != null) ? "with the deduplication index stored in '" + files.getIndexDir().getAbsolutePath() + "'" : "with deduplication disabled"; return "Job " + files.getJobID() + " for harvest " + files.getHarvestID() + " performed in " + files.getCrawlDir() + dedupPart + " and " + FileUtils.countLines(files.getSeedsTxtFile()) + " seeds"; } public HeritrixFiles getFiles() { return this.files; } }