Java tutorial
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.selftest;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.math.BigInteger;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Logger;

import org.apache.commons.io.FileUtils;
import org.archive.crawler.Heritrix;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.TmpDirTestCase;
import org.mortbay.jetty.Handler;
import org.mortbay.jetty.Server;
import org.mortbay.jetty.bio.SocketConnector;
import org.mortbay.jetty.handler.DefaultHandler;
import org.mortbay.jetty.handler.HandlerList;
import org.mortbay.jetty.handler.ResourceHandler;

/**
 * Base class for 'self tests', integration tests formatted as unit
 * tests, where the crawler launches an entire crawl exercising multiple
 * features against a test harness website.
 *
 * <p>Each run copies the test's profile into a temporary directory, starts
 * a local Jetty server on 127.0.0.1:7777 serving the test's
 * <code>htdocs</code> directory, launches Heritrix on the job
 * <code>selftest-job</code>, waits for the crawl, and then runs the common
 * and subclass-specific verifications.
 *
 * @contributor pjack
 * @contributor gojomo
 */
public abstract class SelfTestBase extends TmpDirTestCase {

    private static final Logger LOGGER =
        Logger.getLogger(SelfTestBase.class.getName());

    /** Crawler instance under test; null until {@link #startHeritrix}. */
    protected Heritrix heritrix;

    /** Local content server; null until {@link #startHttpServer}. */
    protected Server httpServer;

    /**
     * Prepares the temporary crawl directory, starts the HTTP server and
     * Heritrix, and blocks until the crawl is reported finished.
     *
     * @throws Exception if the test data is missing or any setup step fails
     */
    protected void open() throws Exception {
        // We expect to be run from the project directory.
        // (Both eclipse and maven run junit tests from there).
        String name = getSelfTestName();

        // Make sure the project directory contains a selftest profile
        // and content for the self test.
        File src = getTestDataDir();
        if (!src.exists()) {
            throw new Exception("No selftest directory for " + name);
        }

        // Create temporary directories for Heritrix to run in.
        File tmpDir = new File(getTmpDir(), "selftest");
        File tmpTestDir = new File(tmpDir, name);

        // If we have an old job lying around from a previous run, delete it.
        File tmpJobs = new File(tmpTestDir, "jobs");
        if (tmpJobs.exists()) {
            FileUtils.deleteDirectory(tmpJobs);
        }

        // Copy the selftest's profile in the project directory to the
        // default profile in the temporary Heritrix directory.
        File tmpDefProfile = new File(tmpJobs, "selftest-job");
        File profileTemplate = new File(src, "profile");
        if (profileTemplate.exists()) {
            org.apache.commons.io.FileUtils.copyDirectory(
                    profileTemplate, tmpDefProfile);
        } else {
            org.archive.util.FileUtils.ensureWriteableDirectory(tmpDefProfile);
        }

        // Start up a Jetty that serves the selftest's content directory.
        startHttpServer();

        // Copy configuration for eg Logging over
        File tmpConfDir = new File(tmpTestDir, "conf");
        org.archive.util.FileUtils.ensureWriteableDirectory(tmpConfDir);
        File srcConf = new File(src.getParentFile(), "conf");
        FileUtils.copyDirectory(srcConf, tmpConfDir);

        // Fill in the placeholders of the shared crawler-beans template and
        // write the result into the job directory.
        String crawlerBeansText = FileUtils.readFileToString(
                new File(srcConf, "selftest-crawler-beans.cxml"));
        crawlerBeansText = changeGlobalConfig(crawlerBeansText);
        File crawlerBeans =
            new File(tmpDefProfile, "selftest-crawler-beans.cxml");
        FileWriter fw = new FileWriter(crawlerBeans);
        try {
            fw.write(crawlerBeansText);
        } finally {
            // Close even when the write fails so the handle is not leaked.
            fw.close();
        }

        startHeritrix(tmpTestDir.getAbsolutePath());

        waitForCrawlFinish();
    }

    /**
     * Substitutes the placeholders in the crawler-beans template text.
     * Subclasses may override to replace placeholders themselves first;
     * anything still present is given the defaults below.
     *
     * @param config template text containing <code>@@...@@</code> markers
     * @return the text with all known markers replaced
     */
    protected String changeGlobalConfig(String config) {
        config = config.replace("@@URL_VALUE@@",
                "http://crawler.archive.org/selftestcrawl");
        // if not already changed, used default self-test start URL
        config = config.replace("@@SEEDS_VALUE@@", getSeedsString());
        // if not already replaced, remove other placeholder
        config = config.replace("@@MORE_PROPERTIES@@", "");
        return config;
    }

    /**
     * Get seeds for this test. Should be in form that can be
     * spliced into a Java properties-format string (any internal
     * lineends escaped with '\').
     *
     * @return String seeds to use
     */
    protected String getSeedsString() {
        // default barring overrides
        return "http://127.0.0.1:7777/index.html";
    }

    /**
     * Shuts down the HTTP server and then Heritrix.
     *
     * @throws Exception if either shutdown step fails
     */
    protected void close() throws Exception {
        try {
            stopHttpServer();
        } finally {
            // Always attempt to stop the crawler, even if an overridden
            // stopHttpServer() throws.
            stopHeritrix();
        }
    }

    /**
     * The single JUnit entry point: runs the crawl, then the common and
     * test-specific verifications, and always attempts cleanup.
     *
     * @throws Exception any failure from setup or verification
     */
    public void testSomething() throws Exception {
        try {
            boolean fail = false;
            try {
                open();
                verifyCommon();
                verify();
            } finally {
                try {
                    close();
                } catch (Exception e) {
                    // Report cleanup problems without masking an exception
                    // already propagating from the try body.
                    e.printStackTrace();
                    fail = true;
                }
            }
            assertFalse(fail);
        } catch (Exception e) {
            // Print before rethrowing so the trace is visible on the console
            // (presumably because the maven test output hides it).
            e.printStackTrace();
            throw e;
        }
    }

    /**
     * Test-specific checks, run after {@link #verifyCommon()}.
     *
     * @throws Exception on verification failure
     */
    protected abstract void verify() throws Exception;

    /**
     * Stops the local HTTP server if one was started. Failures are printed
     * and otherwise ignored (best effort).
     *
     * @throws Exception declared for overriding subclasses
     */
    protected void stopHttpServer() throws Exception {
        if (httpServer == null) {
            // open() failed before the server was created; nothing to stop.
            return;
        }
        try {
            httpServer.stop();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Starts a Jetty server bound to 127.0.0.1:7777 that serves the
     * directory returned by {@link #getSrcHtdocs()}.
     *
     * @throws Exception if the server cannot be started
     */
    protected void startHttpServer() throws Exception {
        Server server = new Server();
        SocketConnector sc = new SocketConnector();
        sc.setHost("127.0.0.1");
        sc.setPort(7777);
        server.addConnector(sc);
        ResourceHandler rhandler = new ResourceHandler();
        rhandler.setResourceBase(getSrcHtdocs().getAbsolutePath());

        HandlerList handlers = new HandlerList();
        handlers.setHandlers(new Handler[] { rhandler, new DefaultHandler() });
        server.setHandler(handlers);

        // Publish the reference before start() so stopHttpServer() can
        // clean up even if start() fails part-way.
        this.httpServer = server;
        server.start();
    }

    /**
     * Creates a Heritrix instance rooted at <code>path + "/jobs"</code>
     * with a random password, applies {@link #configureHeritrix()}, and
     * requests launch of the <code>selftest-job</code> job.
     *
     * @param path absolute path of the temporary directory for this test
     * @throws Exception if Heritrix fails to start or launch the job
     */
    protected void startHeritrix(String path) throws Exception {
        // abs() because the random bytes may decode to a negative number.
        String authPassword =
            (new BigInteger(SecureRandom.getSeed(16))).abs().toString(16);
        // presumably "-j" is the jobs directory and "-a" the auth password;
        // verify against Heritrix's command-line options
        String[] args = { "-j", path + "/jobs", "-a", authPassword };
        heritrix = new Heritrix();
        heritrix.instanceMain(args);

        configureHeritrix();

        heritrix.getEngine().requestLaunch("selftest-job");
    }

    /**
     * Hook for subclasses to adjust Heritrix after it has started but
     * before the job is launched.
     *
     * @throws Exception on configuration failure
     */
    protected void configureHeritrix() throws Exception {
        // by default do nothing
    }

    /**
     * Shuts down the Heritrix engine and its web component, if Heritrix
     * was started.
     *
     * @throws Exception if shutdown fails
     */
    protected void stopHeritrix() throws Exception {
        if (heritrix == null) {
            // open() failed before Heritrix was created; nothing to stop.
            return;
        }
        heritrix.getEngine().shutdown();
        heritrix.getComponent().stop();
    }

    /**
     * Blocks until the engine reports no running jobs.
     *
     * @throws Exception if waiting fails
     */
    protected void waitForCrawlFinish() throws Exception {
        // NOTE(review): 0 presumably means "no timeout" -- confirm against
        // Engine.waitForNoRunningJobs.
        heritrix.getEngine().waitForNoRunningJobs(0);
    }

    /**
     * @return the <code>htdocs</code> directory inside this test's data
     *         directory
     */
    protected File getSrcHtdocs() {
        return new File(getTestDataDir(), "htdocs");
    }

    /**
     * Locates <code>testdata/selftest/&lt;TestName&gt;</code>, looking for
     * <code>testdata</code> first in the working directory and then under
     * <code>engine</code>.
     *
     * @return the existing test data directory for this test
     * @throws IllegalStateException if no suitable directory exists
     */
    protected File getTestDataDir() {
        File r = new File("testdata");
        if (!r.exists()) {
            r = new File("engine");
            r = new File(r, "testdata");
            if (!r.exists()) {
                throw new IllegalStateException("Can't find selftest testdata "
                        + "(tried testdata and engine/testdata)");
            }
        }
        r = new File(r, "selftest");
        r = new File(r, getSelfTestName());
        if (!r.exists()) {
            throw new IllegalStateException("No testdata directory: "
                    + r.getAbsolutePath());
        }
        return r;
    }

    /**
     * @return <code>&lt;tmp&gt;/selftest/&lt;TestName&gt;</code>, the
     *         directory Heritrix runs in for this test
     */
    protected File getCrawlDir() {
        File tmp = getTmpDir();
        File selftest = new File(tmp, "selftest");
        File crawl = new File(selftest, getSelfTestName());
        return crawl;
    }

    /**
     * @return the <code>jobs/selftest-job</code> directory under
     *         {@link #getCrawlDir()}
     */
    protected File getJobDir() {
        File crawl = getCrawlDir();
        File jobs = new File(crawl, "jobs");
        File theJob = new File(jobs, "selftest-job");
        return theJob;
    }

    /** @return the <code>arcs</code> directory of the job */
    protected File getArcDir() {
        return new File(getJobDir(), "arcs");
    }

    /** @return the <code>logs</code> directory of the job */
    protected File getLogsDir() {
        return new File(getJobDir(), "logs");
    }

    /**
     * @return the simple (unqualified) name of the concrete test class,
     *         used as the per-test directory name
     */
    private String getSelfTestName() {
        String full = getClass().getName();
        // lastIndexOf returns -1 for a class in the default package, in
        // which case substring(0) yields the whole name.
        int i = full.lastIndexOf('.');
        return full.substring(i + 1);
    }

    /**
     * Checks that no file in the arcs directory still carries the
     * <code>.open</code> suffix.
     *
     * @throws IllegalStateException if the directory is missing, cannot be
     *         listed, or contains an unclosed arc file
     */
    protected void verifyArcsClosed() {
        File arcsDir = getArcDir();
        if (!arcsDir.exists()) {
            throw new IllegalStateException("Missing arc dir "
                    + arcsDir.getAbsolutePath());
        }

        File[] files = arcsDir.listFiles();
        if (files == null) {
            // listFiles() returns null for a non-directory or on I/O error.
            throw new IllegalStateException("Could not list arc dir "
                    + arcsDir.getAbsolutePath());
        }
        for (File f : files) {
            String fn = f.getName();
            if (fn.endsWith(".open")) {
                throw new IllegalStateException(
                        "Arc file not closed at end of crawl: "
                        + f.getAbsolutePath());
            }
        }
    }

    /**
     * Checks that the named file in the logs directory has zero length.
     * A file that does not exist also has length zero and so passes.
     *
     * @param logFileName file name relative to {@link #getLogsDir()}
     * @throws IllegalStateException if the log has any content
     */
    protected void verifyLogFileEmpty(String logFileName) {
        File logsDir = getLogsDir();
        File log = new File(logsDir, logFileName);
        if (log.length() != 0) {
            throw new IllegalStateException("Log " + logFileName
                    + " isn't empty.");
        }
    }

    /**
     * Checks applied to every self test: the error logs are empty, the
     * progress statistics look complete, and all arcs are closed.
     *
     * @throws Exception on verification failure
     */
    protected void verifyCommon() throws Exception {
        verifyLogFileEmpty("uri-errors.log");
        verifyLogFileEmpty("runtime-errors.log");
        verifyLogFileEmpty("local-errors.log");
        verifyProgressStatistics();
        verifyArcsClosed();
    }

    /**
     * Fails the test unless <code>progress-statistics.log</code> contains
     * the preparing, running and finished state lines and the legend.
     *
     * @throws IOException if the log cannot be read
     */
    protected void verifyProgressStatistics() throws IOException {
        File logs = new File(getJobDir(), "logs");
        File statsFile = new File(logs, "progress-statistics.log");
        String stats = FileUtils.readFileToString(statsFile);
        if (!stats.contains("CRAWL RUNNING - Preparing")) {
            fail("progress-statistics.log has no Prepared line.");
        }
        if (!stats.contains("CRAWL RUNNING - Running")) {
            fail("progress-statistics.log has no Running line.");
        }
        if (!stats.contains("CRAWL ENDING - Finished")) {
            fail("progress-statistics.log has missing/wrong Finished line.");
        }
        if (!stats.contains("doc/s(avg)")) {
            fail("progress-statistics.log has no legend.");
        }
    }

    /**
     * Collects the record headers of every file in the arcs directory.
     *
     * @return headers from all arc files; empty if the directory cannot be
     *         listed
     * @throws IOException if an arc file cannot be read
     * @throws IllegalStateException if the arcs directory does not exist
     */
    protected List<ArchiveRecordHeader> headersInArcs() throws IOException {
        List<ArchiveRecordHeader> result = new ArrayList<ArchiveRecordHeader>();
        File arcsDir = getArcDir();
        if (!arcsDir.exists()) {
            throw new IllegalStateException("Missing arc dir "
                    + arcsDir.getAbsolutePath());
        }
        File[] files = arcsDir.listFiles();
        if (files == null) {
            return Collections.emptyList();
        }
        for (File f : files) {
            // NOTE(review): the reader returned by ARCReaderFactory.get(f)
            // is never closed here -- confirm whether validate() releases
            // the file, otherwise close it explicitly.
            result.addAll(ARCReaderFactory.get(f).validate());
        }
        return result;
    }

    /**
     * Returns the URL paths, with any leading '/' removed, of all
     * <code>http:</code> records found in the arcs.
     *
     * @return set of captured paths
     * @throws IOException if the arcs cannot be read
     */
    protected Set<String> filesInArcs() throws IOException {
        List<ArchiveRecordHeader> headers = headersInArcs();
        HashSet<String> result = new HashSet<String>();
        for (ArchiveRecordHeader arh : headers) {
            // ignore 'filedesc:' record
            if (arh.getUrl().startsWith("filedesc:")) {
                continue;
            }
            UURI uuri = UURIFactory.getInstance(arh.getUrl());
            String path = uuri.getPath();
            if (path.startsWith("/")) {
                path = path.substring(1);
            }
            if (arh.getUrl().startsWith("http:")) {
                result.add(path);
            }
        }
        LOGGER.finest(result.toString());
        return result;
    }
}