dk.netarkivet.externalsoftware.HeritrixTests.java Source code

Here is the source code for dk.netarkivet.externalsoftware.HeritrixTests.java

Source

/*
 * #%L
 * Netarchivesuite - deploy - test
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.externalsoftware;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import is.hi.bok.deduplicator.DeDuplicator;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCRecord;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.dom4j.io.XMLWriter;
import org.dom4j.util.XMLErrorHandler;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.SAXException;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.ExceptionUtils;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.XmlUtils;
import dk.netarkivet.common.utils.cdx.CDXUtils;
import dk.netarkivet.harvester.datamodel.H1HeritrixTemplate;
import dk.netarkivet.harvester.harvesting.HeritrixFiles;
import dk.netarkivet.harvester.harvesting.HeritrixLauncher;
import dk.netarkivet.harvester.harvesting.HeritrixLauncherFactory;
import dk.netarkivet.harvester.harvesting.JobInfo;
import dk.netarkivet.harvester.harvesting.controller.AbstractJMXHeritrixController;
import dk.netarkivet.harvester.harvesting.report.AbstractHarvestReport;
import dk.netarkivet.harvester.harvesting.report.DomainStatsReport;
import dk.netarkivet.harvester.harvesting.report.HarvestReport;
import dk.netarkivet.harvester.harvesting.report.HarvestReportGenerator;
import dk.netarkivet.harvester.harvesting.report.LegacyHarvestReport;
import dk.netarkivet.harvester.indexserver.LuceneUtils;
import dk.netarkivet.testutils.FileAsserts;
import dk.netarkivet.testutils.ReflectUtils;
import dk.netarkivet.testutils.StringAsserts;
import dk.netarkivet.testutils.TestFileUtils;
import dk.netarkivet.testutils.preconfigured.MoveTestFiles;

//import dk.netarkivet.harvester.harvesting.JobInfoTestImpl;

/**
 * Tests various aspects of launching Heritrix and Heritrix' capabilities. Note that some of these tests require much
 * heap space, so the JVM parameter -Xmx512M may be required.
 * <p>
 * Note: after upgrading to Heritrix 1.14.3, the unit test testBug820(), which tested whether it is still necessary
 * to use FixedUURI, no longer works.
 */
@Ignore // Can't find test resources
public class HeritrixTests {

    protected final static String WRITE_PROCESSORS_XPATH = "/crawl-order/controller/map[@name='write-processors']";
    protected final static String DEDUPLICATOR_XPATH = WRITE_PROCESSORS_XPATH + "/newObject[@name='DeDuplicator']";

    protected final static String DEDUPLICATOR_INDEX_LOCATION_XPATH = DEDUPLICATOR_XPATH
            + "/string[@name='index-location']";
    protected final static String DEDUPLICATOR_MATCHING_METHOD_XPATH = DEDUPLICATOR_XPATH
            + "/string[@name='matching-method']";

    protected final static String DEDUPLICATOR_ORIGIN_HANDLING_XPATH = DEDUPLICATOR_XPATH
            + "/string[@name='origin-handling']";

    private HeritrixLauncher hl;
    private MoveTestFiles mtf;

    public HeritrixTests() {
        mtf = new MoveTestFiles(TestInfo.ORIGINALS_DIR, TestInfo.WORKING_DIR);
    }

    @Before
    public void setUp() throws Exception {
        FileUtils.removeRecursively(TestInfo.WORKING_DIR);

        // Busy-wait about 1.5 seconds so that a previously launched Heritrix instance
        // (see tearDown()) has time to release its files before the working dir is repopulated.
        long endtime = System.currentTimeMillis() + 1500;
        while (System.currentTimeMillis() < endtime) {
            // deliberate busy-wait
        }
        mtf.setUp();
        // TestInfo.WORKING_DIR.mkdirs();

        // Check that dk.netarkivet.harvester.harvesting.DomainnameQueueAssignmentPolicy
        // is recognized as a valid QueueAssignmentPolicy.

        // This setting must be set directly on the JVM command line, e.g. with
        // -Dorg.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy=
        // org.archive.crawler.frontier.HostnameQueueAssignmentPolicy,org.archive.crawler.frontier.IPQueueAssignmentPolicy,
        // org.archive.crawler.frontier.BucketQueueAssignmentPolicy,
        // org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy,
        // dk.netarkivet.harvester.harvesting.DomainnameQueueAssignmentPolicy

        if (!System.getProperties()
                .containsKey("org.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy")) {
            fail("org.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy is not defined!!");
        }
    }
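
    // A minimal sketch (not part of the original setup) of how the policy list could be
    // supplied programmatically instead of via -D on the JVM command line:
    // System.setProperty("org.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy",
    //         "org.archive.crawler.frontier.HostnameQueueAssignmentPolicy,"
    //                 + "dk.netarkivet.harvester.harvesting.DomainnameQueueAssignmentPolicy");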

    @After
    public void tearDown() {
        // it takes a little while for heritrix to close completely (including all threads)
        // so we have to wait a little while here !
        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
            // okay
        }
        mtf.tearDown();
        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
            // okay
        }
    }

    /**
     * Centralized place for tests to construct a HeritrixLauncher.
     * <ul>
     * <li>Constructs the given crawlDir.</li>
     * <li>Copies the given order.xml to the proper place in the given crawlDir.</li>
     * <li>Copies the given seeds.txt to the proper place in the given crawlDir.</li>
     * <li>Copies the given indexDir to the proper place in the given crawlDir (index).</li>
     * <li>Constructs a HeritrixLauncher and returns it.</li>
     * </ul>
     * Uses the values of JMX_PASSWORD_FILE and JMX_ACCESS_FILE read from the settings.
     *
     * @param origOrderXml the given order.xml
     * @param origSeedsFile the given seeds file
     * @param origIndexDir the given index directory
     * @return a HeritrixLauncher object
     * @throws IOException
     */
    private HeritrixLauncher getHeritrixLauncher(File origOrderXml, File origSeedsFile, File origIndexDir)
            throws IOException {

        return getHeritrixLauncher(origOrderXml, origSeedsFile, origIndexDir,
                new File(Settings.get(CommonSettings.JMX_PASSWORD_FILE)),
                new File(Settings.get(CommonSettings.JMX_ACCESS_FILE)));
    }

    /**
     * Centralized place for tests to construct a HeritrixLauncher.
     * <ul>
     * <li>Constructs the given crawlDir.</li>
     * <li>Copies the given order.xml to the proper place in the given crawlDir.</li>
     * <li>Copies the given seeds.txt to the proper place in the given crawlDir.</li>
     * <li>Copies the given indexDir to the proper place in the given crawlDir (index).</li>
     * <li>Constructs a HeritrixLauncher and returns it.</li>
     * </ul>
     *
     * @param origOrderXml the given order.xml
     * @param origSeedsFile the given seeds file
     * @param origIndexDir the given index directory
     * @param jmxPasswordFile The jmx password file to be used by Heritrix
     * @param jmxAccessFile The jmx access file to be used by Heritrix
     * @return a HeritrixLauncher object
     */
    private HeritrixLauncher getHeritrixLauncher(File origOrderXml, File origSeedsFile, File origIndexDir,
            File jmxPasswordFile, File jmxAccessFile) {

        if (!origOrderXml.exists()) {
            fail("order-File does not exist: " + origOrderXml.getAbsolutePath());
        }
        if (!origSeedsFile.exists()) {
            fail("seeds-File does not exist: " + origSeedsFile.getAbsolutePath());
        }
        if (!origIndexDir.exists()) {
            fail("Index dir does not exist: " + origIndexDir.getAbsolutePath());
        }
        if (!origIndexDir.isDirectory()) {
            fail("Index dir is not a directory: " + origIndexDir.getAbsolutePath());
        }

        // File origSeeds = TestInfo.SEEDS_FILE;
        File crawlDir = TestInfo.HERITRIX_TEMP_DIR;
        crawlDir.mkdirs();
        File orderXml = new File(crawlDir, "order.xml");

        File seedsTxt = new File(crawlDir, "seeds.txt");

        FileUtils.copyFile(origOrderXml, orderXml);
        if (!orderXml.exists()) {
            fail("order-File does not exist: " + orderXml.getAbsolutePath());
        }
        FileUtils.copyFile(origSeedsFile, seedsTxt);
        HeritrixFiles files = new HeritrixFiles(crawlDir, new JobInfoTestImpl(TestInfo.JOBID, TestInfo.HARVESTID),
                jmxPasswordFile, jmxAccessFile);
        /*
         * File tempDir = mtf.newTmpDir(); LuceneUtils.makeDummyIndex(tempDir);
         */
        files.setIndexDir(origIndexDir);

        // TestFileUtils.copyDirectoryNonCVS(origIndexDir, indexDir);

        return HeritrixLauncherFactory.getInstance(files);
    }

    /**
     * Run heritrix with the given order, seeds file and index.
     *
     * @param order An order.xml file as per Heritrix specs
     * @param seeds A file with seeds, one per line
     * @param index the lucene-index
     * @throws IOException
     */
    protected void runHeritrix(File order, File seeds, File index) throws IOException {

        hl = getHeritrixLauncher(order, seeds, index);
        hl.doCrawl();
    }

    /**
     * Check that IOFailure is thrown by the JMXHeritrixController if the JMXPasswordFile does not exist / is hidden /
     * unreadable / impossible to open for other reasons.
     */
    @Ignore
    @Test
    public void testIOFailureThrown() throws IOException {
        // Here it would make sense to get all the settings files and perform the check
        // for all of them, but it seems that the Settings are not initialised in
        // setUp. Therefore the test is made only for jmxremote.password. It would be good
        // to find a way to do the test for all the files.
        File passwordFile = new File(TestInfo.WORKING_DIR, "quickstart.jmxremote.password");
        FileUtils.remove(passwordFile);
        File tempDir = mtf.newTmpDir();
        hl = getHeritrixLauncher(TestInfo.DEFAULT_ORDERXML_FILE, TestInfo.SEEDS_FILE, tempDir, passwordFile,
                new File(Settings.get(CommonSettings.JMX_ACCESS_FILE)));
        try {
            // invoke JMXHeritrixController
            hl.doCrawl();
            // if the exception is not thrown
            fail("An IOFailure should have been thrown when launching " + "with a non existing file ("
                    + passwordFile.getAbsolutePath() + ")");
        } catch (IOFailure iof) {
            assertTrue("Wrong type of IOFailure thrown: " + iof, iof.getMessage().contains("is possibly missing"));
            // ok, the right exception was thrown
        } catch (Exception ex) {
            // a different exception than IOFailure was thrown but the
            // proper IOFailure may be the cause of this exception
            // ex.printStackTrace();

            // System.out.println("ex.getCause().getMessage():" + ex.getCause().getMessage());
            if (ex.getCause() == null || !ex.getCause().getMessage()
                    .contains("Failed to read the password file '" + passwordFile.getAbsolutePath() + "'")) {
                ex.printStackTrace();
                fail("An exception different from IOFailure has been thrown "
                        + "when launching with a non existing file (" + passwordFile.getAbsolutePath() + ")"
                        + ExceptionUtils.getStackTrace(ex));
            }
        }
    }

    /**
     * Check that all urls in the given array are listed in the crawl log. Calls fail() at the first url that is not
     * found or if the crawl log is not found.
     *
     * @param urls An array of url strings
     * @throws IOException If TestInfo.HERITRIX_CRAWL_LOG_FILE is not found or is unreadable
     */
    protected void assertAllUrlsInCrawlLog(String[] urls) throws IOException {
        String crawlLog = "";
        crawlLog = FileUtils.readFile(TestInfo.HERITRIX_CRAWL_LOG_FILE);

        for (String s : urls) {
            if (crawlLog.indexOf(s) == -1) {
                System.out.println("Crawl log: ");
                System.out.println(crawlLog);
                fail("URL " + s + " not found in crawl log");
            }
        }
    }

    /**
     * Check that no urls in the given array are listed in the crawl log. Calls fail() at the first url that is found or
     * if the crawl log is not found.
     *
     * @param urls An array of url strings
     * @throws IOException If TestInfo.HERITRIX_CRAWL_LOG_FILE is not found or is unreadable
     */
    protected void assertNoUrlsInCrawlLog(String[] urls) throws IOException {
        String crawlLog = "";
        crawlLog = FileUtils.readFile(TestInfo.HERITRIX_CRAWL_LOG_FILE);

        for (String s : urls) {
            if (crawlLog.indexOf(s) != -1) {
                System.out.println("Crawl log: ");
                System.out.println(crawlLog);
                fail("URL " + s + " found in crawl log at " + crawlLog.indexOf(s));
            }
        }
    }

    /**
     * Test that the launcher handles an empty order file correctly.
     */
    @Test
    public void testStartEmptyFile() throws IOException {
        File tempDir = mtf.newTmpDir();
        LuceneUtils.makeDummyIndex(tempDir);
        hl = getHeritrixLauncher(TestInfo.EMPTY_ORDER_FILE, TestInfo.SEEDS_FILE, tempDir);

        try {
            hl.doCrawl();
            fail("An exception should have been caught when launching with an empty order.xml file !");
        } catch (IOFailure e) {
            // Expected case
        }
    }

    /**
     * Test that the launcher actually launches Heritrix and generates at least one arcfile.
     */
    @Test
    public void testLaunch() throws IOException {
        validateOrder(TestInfo.ORDER_FILE);
        File tempDir = mtf.newTmpDir();
        LuceneUtils.makeDummyIndex(tempDir);
        runHeritrix(TestInfo.ORDER_FILE, TestInfo.SEEDS_FILE, tempDir);

        String progressLog = "";
        progressLog = FileUtils.readFile(TestInfo.HERITRIX_PROGRESS_LOG_FILE);

        // test that crawl.log has registered a known URL
        assertAllUrlsInCrawlLog(new String[] { TestInfo.SEARCH_FOR_THIS_URL });

        // test that progress-statistics.log has reported CRAWL ENDED
        StringAsserts.assertStringContains("progress-statistics.log should have reported that the crawl is ended",
                "CRAWL ENDED", progressLog);

        // test that the heritrix-temp-dir contains at least one arcfile
        File[] files = TestInfo.HERITRIX_ARCS_DIR.listFiles(FileUtils.ARCS_FILTER);
        if (files == null || files.length == 0) {
            fail("Directory '" + TestInfo.HERITRIX_ARCS_DIR.getAbsolutePath() + "' contains no arcfiles !");
        }
    }

    /**
     * Test that the launcher actually launches Heritrix and fetches at least 50 objects from different hosts on tv2.dk
     * (sporten.tv2.dk, nyheder.tv2.dk, ...) and netarkivet.dk, verified by parsing the hosts-report.txt. This number
     * includes the dns-lookups for each host in these domains.
     */
    @Test
    public void testLaunchWithMaxObjectsPrDomain() throws IOException {
        validateOrder(TestInfo.ORDER_FILE_MAX_OBJECTS);
        File tempDir = mtf.newTmpDir();
        LuceneUtils.makeDummyIndex(tempDir);
        runHeritrix(TestInfo.ORDER_FILE_MAX_OBJECTS, TestInfo.SEEDS_FILE_MAX_OBJECTS, tempDir);

        HeritrixFiles hFiles = HeritrixFiles.getH1HeritrixFilesWithDefaultJmxFiles(TestInfo.HERITRIX_TEMP_DIR,
                new JobInfoTestImpl(0L, 0L));

        File hostReportFile = new File(TestInfo.HERITRIX_TEMP_DIR, "logs/crawl.log");

        HarvestReportGenerator hrg = new HarvestReportGenerator(hFiles);
        DomainStatsReport dsr = new DomainStatsReport(hrg.getDomainStatsMap(), hrg.getDefaultStopReason());
        HarvestReport hhr = new LegacyHarvestReport(dsr);

        Long tv2_objects = hhr.getObjectCount("tv2.dk");
        Long netarkivet_objects = hhr.getObjectCount("netarkivet.dk");
        // int netarkivetHosts = GetHostsForDomain(hostReportFile, "netarkivet.dk");
        assertTrue("Number of objects from tv2.dk should be at least 50, NOT: " + tv2_objects.longValue()
                + "\nNumbers generated from the following host-report.txt: " + FileUtils.readFile(hostReportFile)
                + "\n", tv2_objects.longValue() >= 50);
        assertTrue("Number of objects from netarkivet.dk should be at least 50, NOT: "
                + netarkivet_objects.longValue() + "\nNumbers generated from the following host-report.txt: "
                + FileUtils.readFile(hostReportFile) + "\n", netarkivet_objects >= 50);
    }

    /**
     * Test that the main method works and generates output from known working crawl.
     */
    @Test
    public void testLaunchMain() throws IOException {
        validateOrder(TestInfo.ORDER_FILE);
        File tempDir = mtf.newTmpDir();
        LuceneUtils.makeDummyIndex(tempDir);
        hl = getHeritrixLauncher(TestInfo.ORDER_FILE, TestInfo.SEEDS_FILE, tempDir);
        hl.doCrawl();

        String progressLog = "";
        String crawlLog = "";
        progressLog = FileUtils.readFile(TestInfo.HERITRIX_PROGRESS_LOG_FILE);
        crawlLog = FileUtils.readFile(TestInfo.HERITRIX_CRAWL_LOG_FILE);

        // test that crawl.log has registered a known URL
        StringAsserts.assertStringContains(
                "crawl.log should have registered the URL: " + TestInfo.SEARCH_FOR_THIS_URL,
                TestInfo.SEARCH_FOR_THIS_URL, crawlLog);

        // test that progress-statistics.log has reported CRAWL ENDED
        StringAsserts.assertStringContains("progress-statistics.log should have reported that the crawl is ended",
                "CRAWL ENDED", progressLog);

        // test that the heritrix-temp-dir contains at least one arcfile
        assertTrue("Directory '" + TestInfo.HERITRIX_ARCS_DIR.getAbsolutePath() + "' contains no arcfiles !",
                TestInfo.HERITRIX_ARCS_DIR.listFiles(FileUtils.ARCS_FILTER).length >= 1);

    }

    /**
     * Test that Heritrix can use a URL seed list to define a harvest. This tests requirement #1.
     */
    @Test
    public void testUrlSeedList() throws IOException {
        validateOrder(TestInfo.ORDER_FILE);
        File tempDir = mtf.newTmpDir();
        LuceneUtils.makeDummyIndex(tempDir);
        runHeritrix(TestInfo.ORDER_FILE, TestInfo.SEEDS_FILE2, tempDir);

        assertAllUrlsInCrawlLog(new String[] { "http://netarkivet.dk/kildetekster/JavaArcUtils-0.3.tar.gz",
                "http://netarkivet.dk/website/press/Bryllup-20040706.pdf",
                "http://netarkivet.dk/proj/pilot_juli2001.pdf" });

        // Check that the unintended ones didn't get caught:
        // URLs that have links to the specified files.
        assertNoUrlsInCrawlLog(new String[] { "http://netarkivet.dk/website/sources/index-da.htm",
                "http://netarkivet.dk/website/sources/index-en.htm",
                "http://netarkivet.dk/website/press/index-da.htm",
                "http://netarkivet.dk/website/press/index-en.htm", "http://netarkivet.dk/pilot-index-da.htm",
                "http://netarkivet.dk/pilot-index-en.htm" });
    }

    /**
     * Test that Heritrix can limit the number of objects harvested per domain. This tests requirement #7.
     */
    @Test
    public void testRestrictNumObjectsPrDomain() throws IOException {
        validateOrder(TestInfo.MAX_OBJECTS_ORDER_FILE);
        File tempDir = mtf.newTmpDir();
        LuceneUtils.makeDummyIndex(tempDir);
        runHeritrix(TestInfo.MAX_OBJECTS_ORDER_FILE, TestInfo.SEEDS_FILE, tempDir);

        int num_harvested = 0;
        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(TestInfo.HERITRIX_CRAWL_LOG_FILE));
            while (in.readLine() != null) {
                num_harvested++;
            }

            // we must harvest at max MAX_OBJECTS + 1 (the harvester sometimes stops at MAX_OBJECTS + 1)
            assertTrue("Number of objects harvested is " + num_harvested + ".  Exceeds " + TestInfo.MAX_OBJECTS,
                    num_harvested < TestInfo.MAX_OBJECTS + 2);
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Test that Heritrix can limit the number of objects per harvest. This tests requirement #7.
     */
    @Test
    public void testRestrictNumObjectsHarvested() throws IOException {
        validateOrder(TestInfo.MAX_OBJECTS_ORDER_FILE);
        File tempDir = mtf.newTmpDir();
        LuceneUtils.makeDummyIndex(tempDir);
        runHeritrix(TestInfo.MAX_OBJECTS_ORDER_FILE, TestInfo.SEEDS_FILE, tempDir);

        String progressStatistics = "";
        progressStatistics = FileUtils.readFile(TestInfo.HERITRIX_PROGRESS_LOG_FILE);
        StringAsserts.assertStringContains("Must end by hitting max number of objects",
                "CRAWL ENDED - Finished - Maximum number of documents limit hit", progressStatistics);

        int num_harvested = 0;
        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(TestInfo.HERITRIX_CRAWL_LOG_FILE));
            while (in.readLine() != null) {
                num_harvested++;
            }
            // we must harvest at max MAX_OBJECTS + 1
            // (the harvester sometimes stops at MAX_OBJECTS + 1)
            assertTrue("Number of objects harvested(" + num_harvested + ") should be less than "
                    + (TestInfo.MAX_OBJECTS + 2), num_harvested < TestInfo.MAX_OBJECTS + 2);
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Test that Heritrix can handle cookies - setting and changing them. This tests requirement #28.
     */
    @Test
    public void testCookiesSupport() throws IOException {
        validateOrder(TestInfo.COOKIES_ORDER_FILE);
        File tempDir = mtf.newTmpDir();
        LuceneUtils.makeDummyIndex(tempDir);
        runHeritrix(TestInfo.COOKIES_ORDER_FILE, TestInfo.COOKIE_SEEDS_FILE, tempDir);

        // test that the heritrix-temp-dir contains exactly one arcfile
        File[] files = TestInfo.HERITRIX_ARCS_DIR.listFiles(FileUtils.ARCS_FILTER);
        assertNotNull("Files array should be non-null", files);
        assertEquals("Should be exactly one file in " + TestInfo.HERITRIX_ARCS_DIR.getAbsolutePath(), 1,
                files.length);
        File first_arcfile = files[0];
        assertNotNull("Should be ARC files in " + TestInfo.HERITRIX_ARCS_DIR.getAbsolutePath(), first_arcfile);
        String arcfile = FileUtils.readFile(files[0]);
        // Testing that cookie1 exists because cookie0 will be there
        // with 404.
        StringAsserts.assertStringContains("Must find the web pages",
                "http://netarkivet.dk/website/testsite/cookie1.php", arcfile);
        StringAsserts.assertStringContains("No cookie1 must be set before", "cookie0.php: cookie1 is not set",
                arcfile);
        StringAsserts.assertStringContains("No cookie2 must be set before", "cookie0.php: cookie2 is not set",
                arcfile);
        StringAsserts.assertStringContains("No cookie3 must be set before", "cookie0.php: cookie3 is not set",
                arcfile);
        StringAsserts.assertStringContains("Cookie-setting page must be found", "This sets cookie testcookie1",
                arcfile);
        StringAsserts.assertStringContains("No cookie1 must be set on the page that sets the cookie",
                "cookie1.php: cookie1 is not set", arcfile);
        StringAsserts.assertStringContains("No cookie2 must be set on the page that sets the cookie",
                "cookie1.php: cookie2 is not set", arcfile);
        StringAsserts.assertStringContains("No cookie3 must be set on the page that sets the cookie",
                "cookie1.php: cookie3 is not set", arcfile);
        StringAsserts.assertStringContains("Cookie1 must be found after setting",
                "cookie2.php: cookie1 value is test1", arcfile);
        StringAsserts.assertStringContains("Cookie2 must not be found after setting",
                "cookie2.php: cookie2 is not set", arcfile);
        StringAsserts.assertStringContains("Cookie3 must be found after setting",
                "cookie2.php: cookie3 value is test3", arcfile);
        StringAsserts.assertStringContains("Cookie-changing page must be found", "This changes cookie testcookie1",
                arcfile);
        StringAsserts.assertStringContains("Cookie1 must be changed after changing",
                "cookie5.php: cookie1 value is test2", arcfile);
        StringAsserts.assertStringContains("Cookie2 must not be found after changing cookie1",
                "cookie5.php: cookie2 is not set", arcfile);
        StringAsserts.assertStringContains("Cookie3 must not be changed after changing",
                "cookie5.php: cookie3 value is test3", arcfile);
    }

    /**
     * Test that Heritrix can use a regular expression to limit a harvest. Tests with the regular expression
     * .*(ArcUtils\.[0-9]\.[0-1]|-da.htm).*, which takes the Danish index pages and two of the three ArcUtil sources.
     * <p>
     * This tests requirement #29.
     */
    @Test
    public void testUrlExpressionRestriction() throws IOException {
        validateOrder(TestInfo.RESTRICTED_URL_ORDER_FILE);
        File tempDir = mtf.newTmpDir();
        LuceneUtils.makeDummyIndex(tempDir);
        runHeritrix(TestInfo.RESTRICTED_URL_ORDER_FILE, TestInfo.SEEDS_FILE, tempDir);

        // Check that we got a bunch of the expected ones
        assertAllUrlsInCrawlLog(new String[] { "http://netarkivet.dk/index-da.php",
                "http://netarkivet.dk/kildetekster/index-da.php",
                "http://netarkivet.dk/kildetekster/ProxyViewer-0.1.tar.gz", });
        // Check that the unintended ones didn't get caught:
        // English index files, 0.2 tarball, gifs, javascript
        assertNoUrlsInCrawlLog(new String[] { "http://netarkivet.dk/index-en.php",
                "http://netarkivet.dk/kildetekster/index-en.php",
                "http://netarkivet.dk/kildetekster/JavaArcUtils-0.3.tar.gz",
                "http://netarkivet.dk/netarkivet_alm/billeder/spacer.gif" });
    }

    /**
     * Test that the Maxbytes feature is handled correctly by the current harvester. Sets the maxbytes limit to 500000
     * bytes.
     */
    @Test
    public void testMaxBytes() throws IOException, IOFailure, DocumentException {
        File maxbytesOrderFile = new File(TestInfo.WORKING_DIR, "maxBytesOrderxml.xml");
        FileUtils.copyFile(TestInfo.DEFAULT_ORDERXML_FILE, maxbytesOrderFile);
        FileUtils.copyFile(TestInfo.HERITRIX_SETTINGS_SCHEMA_FILE,
                new File(TestInfo.WORKING_DIR, TestInfo.HERITRIX_SETTINGS_SCHEMA_FILE.getName()));
        Document orderDocument = XmlUtils.getXmlDoc(maxbytesOrderFile);
        // Not sure what the bytelimit should be to be consistent with what we presently expect Heritrix
        // to do
        long byteLimit = 500000;
        String xpath = H1HeritrixTemplate.GROUP_MAX_ALL_KB_XPATH;
        Node groupMaxSuccessKbNode = orderDocument.selectSingleNode(xpath);
        if (groupMaxSuccessKbNode != null) {
            // Divide by 1024 since Heritrix uses KB rather than bytes,
            // and add 1 to avoid too low a limit due to rounding.
            groupMaxSuccessKbNode.setText(Long.toString((byteLimit / 1024) + 1));
        } else {
            fail("QuotaEnforcer node not found in order.xml");
        }
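        // Worked example (illustrative): with byteLimit = 500000, the node is set to
        // 500000 / 1024 + 1 = 489 (KB), so the enforced limit is 489 * 1024 = 500736 bytes,
        // just above byteLimit.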
        OutputStream os = new FileOutputStream(maxbytesOrderFile);
        XMLWriter writer = new XMLWriter(os);
        writer.write(orderDocument);
        // Close the writer so the modified order is flushed to disk before it is validated and used.
        writer.close();

        validateOrder(maxbytesOrderFile);
        File tempDir = mtf.newTmpDir();
        LuceneUtils.makeDummyIndex(tempDir);
        runHeritrix(maxbytesOrderFile, TestInfo.SEEDS_DEFAULT, tempDir);
        // File hostReportFile = new File(TestInfo.HERITRIX_TEMP_DIR, "logs/crawl.log");
        HeritrixFiles hFiles = HeritrixFiles.getH1HeritrixFilesWithDefaultJmxFiles(TestInfo.HERITRIX_TEMP_DIR,
                new JobInfoTestImpl(0L, 0L));

        HarvestReportGenerator hrg = new HarvestReportGenerator(hFiles);
        DomainStatsReport dsr = new DomainStatsReport(hrg.getDomainStatsMap(), hrg.getDefaultStopReason());

        AbstractHarvestReport hhr = new LegacyHarvestReport(dsr);
        Long netarkivet_bytes = hhr.getByteCount("netarkivet.dk");
        long lastNetarkivetBytes = getLastFetchedBytesForDomain("netarkivet.dk");
        // System.out.println("last netarkivet bytes: " + lastNetarkivetBytes);
        // System.out.println(FileUtils.readFile(hostReportFile));
        if (!(netarkivet_bytes.longValue() - lastNetarkivetBytes < byteLimit)) {
            fail("byteLimit (" + byteLimit + ") exceeded: harvested " + netarkivet_bytes.longValue()
                    + " bytes from netarkivet.dk");
        }
    }

    /**
     * Tests whether org.archive.io.RecoverableIOException from the ARCReader can be serialized (bug 755).
     */
    @Test
    public void testArcReaderBug755() throws IOException {
        try {
            throw new org.archive.io.RecoverableIOException("Forced exception: Hit EOF before header EOL");
        } catch (Exception e) {
            // Serialize exception
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            ObjectOutputStream ous = new ObjectOutputStream(baos);
            ous.writeObject(e);
            ous.close();
            baos.close();
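            // A full round-trip check (beyond what bug 755 required) could also deserialize
            // the exception again, e.g.:
            // new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray())).readObject();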
        }
    }

    private long getLastFetchedBytesForDomain(String domainName) {
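        // Heritrix 1 crawl.log lines consist of whitespace-separated fields; an illustrative
        // line (not taken from an actual run) looks like:
        //   2005-05-06T11:47:26.550Z 200 29764 http://netarkivet.dk/ L http://netarkivet.dk/ text/html ...
        // Index 1 is the fetch status code, index 2 the byte count, index 3 the URL.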

        List<String> crawlLogLines = FileUtils.readListFromFile(TestInfo.HERITRIX_CRAWL_LOG_FILE);
        for (int i = crawlLogLines.size() - 1; i > -1; i--) {
            String[] lineparts = crawlLogLines.get(i).split(" ");
            // Remove superfluous spaces between parts to allow proper parsing of crawl-log
            StringBuffer sb = new StringBuffer();
            for (String linepart : lineparts) {
                if (!linepart.trim().equals("")) {
                    sb.append(linepart.trim() + " ");
                }
            }
            lineparts = (sb.toString().trim()).split(" ");
            // System.out.println("line " + i + " contains " + lineparts.length +" elements");
            // System.out.println("line " + i + ": " + crawlLogLines.get(i));
            if (lineparts[1].equals("-5003")) {
                // Ignore these lines; These urls are skipped as the maxbytes limit has been passed
            } else {
                int bytesIndex = 2;
                int urlIndex = 3;
                try {
                    URL url = new URL(lineparts[urlIndex]);
                    // System.out.println("URL (" + lineparts[urlIndex] + ") has domain: "
                    // + Domain.domainNameFromHostname(url.getHost()));
                    if (DomainUtils.domainNameFromHostname(url.getHost()).equals(domainName)) {
                        return Long.parseLong(lineparts[bytesIndex]);
                    }
                } catch (MalformedURLException e) {
                    System.out.println("line caused an MalformedURLException: " + crawlLogLines.get(i));
                    e.printStackTrace();
                }
            }

        }
        return 0L;
    }

    /**
     * Test that our default order.xml can be validated against the latest heritrix_settings.xsd. The latest version:
     * http://cvs.sourceforge.net/viewcvs.py/archive-crawler/ArchiveOpenCrawler/src/webapps/admin/ This file is also
     * found in the src distributions in the src/webapps/admin/ directory. Note: this heritrix_settings.xsd now needs
     * to be in the same directory as the order file.
     */
    @Test
    public void testIfDefaultOrderXmlIsStillValid() {
        File order = TestInfo.DEFAULT_ORDERXML_FILE;
        validateOrder(order);
    }

    /**
     * Test that we can use the DeDuplicator write-processor.
     */
    @Test
    public void testDeduplicatorOrderXml() throws Exception {
        validateOrder(TestInfo.DEDUPLICATOR_ORDERXML_FILE);
        Document d = XmlUtils.getXmlDoc(TestInfo.DEDUPLICATOR_ORDERXML_FILE);

        // XPaths needed to insert/set up the DeDuplicator are defined as constants on this class.

        // Sample DeDuplicator block taken from a faulty releasetest trial order.xml:
        // <map name="write-processors">
        // <newObject name="DeDuplicator" class="is.hi.bok.deduplicator.DeDuplicator">
        // <boolean name="enabled">true</boolean>
        // <map name="filters"/>
        // <string name="index-location">/home/netarkiv/KFC/harvester_8081/2_1153919296552/index</string>
        // <string name="matching-method">By URL</string>
        // <boolean name="try-equivalent">true</boolean>
        // <boolean name="change-content-size">false</boolean>
        // <string name="mime-filter">^text/.*</string>
        // <string name="filter-mode">Blacklist</string>
        // <string name="analysis-mode">Timestamp</string>
        // <string name="log-level">SEVERE</string>
        // <string name="origin"/>
        // <string name="origin-handling">Use index information</string>
        // <boolean name="stats-per-host">true</boolean>
        // </newObject>

        // Set originHandling. Check that originHandling is one of AVAILABLE_ORIGIN_HANDLING
        // If not, set to default DEFAULT_ORIGIN_HANDLING, and log a warning.
        checkAndSetOrderXMLNode(d, DEDUPLICATOR_ORIGIN_HANDLING_XPATH, DeDuplicator.ATTR_ORIGIN_HANDLING,
                "Use index information", DeDuplicator.AVAILABLE_ORIGIN_HANDLING,
                DeDuplicator.DEFAULT_ORIGIN_HANDLING);
        Node indexLocationNode = d.selectSingleNode(DEDUPLICATOR_INDEX_LOCATION_XPATH);
        if (indexLocationNode != null) {
            // Point the DeDuplicator at the index directory inside the Heritrix temp dir.
            indexLocationNode.setText(new File(TestInfo.HERITRIX_TEMP_DIR, "index").getAbsolutePath());
        } else {
            fail("IndexLocation node not found in order.xml");
        }
        File modifiedOrderFile = mtf.newTmpFile();

        OutputStream os = new FileOutputStream(modifiedOrderFile);
        XMLWriter writer = new XMLWriter(os);
        writer.write(d);
        writer.close();
        // Now the modified order.xml is in modifiedOrderFile

        File indexDir = mtf.newTmpDir();
        File scratchpadDir = mtf.newTmpDir();

        // Sort crawl-log:

        // File orgCrawlog = new File(TestInfo.HERITRIX_TEMP_DIR, "logs/crawl.log");
        File orgCrawlog = new File(TestInfo.TEST_LAUNCH_HARVEST_DIR, "logs/crawl.log");
        assertTrue("File does not exist", orgCrawlog.exists());
        File sortedCrawlLog = new File(scratchpadDir, "sorted-crawl.log");
        FileUtils.sortCrawlLog(orgCrawlog, sortedCrawlLog);

        // File arcsDir = new File(TestInfo.HERITRIX_TEMP_DIR, "arcs");
        File arcsDir = new File(TestInfo.TEST_LAUNCH_HARVEST_DIR, "arcs");

        // Get CDXReader of the cdx for the previous crawl.
        // Note that this may break if the arcs dir has more than one file.
        LuceneUtils.generateIndex(sortedCrawlLog,
                getCDXReaderForArc(arcsDir.listFiles(TestFileUtils.NON_CVS_DIRS_FILTER)[0]), indexDir);

        FileUtils.removeRecursively(TestInfo.HERITRIX_TEMP_DIR);
        runHeritrix(modifiedOrderFile, TestInfo.SEEDS_FILE, indexDir);
        FileAsserts.assertFileMatches("Must have done some dedup", "Duplicates found:  [^0]",
                new File(TestInfo.HERITRIX_TEMP_DIR, "processors-report.txt"));
    }

    /**
     * Test that we can harvest from FTP sites using the FTP processor. Downloads at most 25 files from klid.dk using
     * the seed: ftp://ftp.klid.dk/OpenOffice/haandbog
     */
    @Test
    public void testFtpHarvesting() throws Exception {
        validateOrder(TestInfo.FTPHARVESTING_ORDERXML_FILE);
        File tempDir = mtf.newTmpDir();
        LuceneUtils.makeDummyIndex(tempDir);
        runHeritrix(TestInfo.FTPHARVESTING_ORDERXML_FILE, TestInfo.FTP_HARVESTING_SEEDLIST_FILE, tempDir);

        // test that the heritrix-temp-dir contains exactly one arcfile
        File[] files = TestInfo.HERITRIX_ARCS_DIR.listFiles(FileUtils.ARCS_FILTER);
        assertNotNull("Files array should be non-null", files);
        assertEquals("Should be exactly one file in " + TestInfo.HERITRIX_ARCS_DIR.getAbsolutePath(), 1,
                files.length);
        File first_arcfile = files[0];
        assertNotNull("Should be ARC files in " + TestInfo.HERITRIX_ARCS_DIR.getAbsolutePath(), first_arcfile);
        ArchiveReader reader = ArchiveReaderFactory.get(files[0]);
        Iterator<ArchiveRecord> i = reader.iterator();
        Set<String> urlSet = new HashSet<String>();
        while (i.hasNext()) {
            ArchiveRecord o = i.next();
            if (o instanceof ARCRecord) {
                ARCRecord a = (ARCRecord) o;
                urlSet.add(a.getMetaData().getUrl());
            } else {
                fail("ARCrecords expected, not objects of class" + o.getClass().getName());
            }
        }
        assertTrue("Should have harvested more than 10 objects but only harvested " + urlSet.size(),
                urlSet.size() > 10);
        String searchString = "ftp://ftp.klid.dk/OpenOffice/haandbog/Haandbog-2-2.pdf";
        if (!urlSet.contains(searchString)) {
            fail("Expected to harvest '" + searchString + "' but we only harvested : "
                    + StringUtils.conjoin(",", urlSet));
        }
    }

    // /////////////////////////////////////////////////////////////////
    // ////////// Helper methods //////////////////////////////////////
    // ////////////////////////////////////////////////////////////////
    private void validateOrder(File anOrderFile) {
        SAXReader reader = new SAXReader();
        reader.setValidation(true);
        try {
            reader.setProperty("http://apache.org/xml/properties/schema/external-noNamespaceSchemaLocation",
                    // TestInfo.HERITRIX_SETTINGS_SCHEMA_FILE.getAbsolutePath());
                    "heritrix_settings.xsd");
            reader.setFeature("http://apache.org/xml/features/validation/schema", true);
            // add error handler which turns any errors into XML
            XMLErrorHandler errorHandler = new XMLErrorHandler();
            reader.setErrorHandler(errorHandler);

            // parse the document
            Document document = reader.read(anOrderFile);
            if (!errorHandler.getErrors().asXML().contentEquals("<errors/>")) {
                fail(anOrderFile.getAbsolutePath() + " is invalid according to schema: "
                        + errorHandler.getErrors().asXML());
            }

            // Find all classes in the order.xml, and try to load/instantiate these classes
            // TODO Try to instantiate all classes in the given xml.
            iterateChildren(document.getRootElement());

        } catch (SAXException e) {
            fail("SaxException thrown" + e);
        } catch (DocumentException e1) {
            fail("DocumentException thrown" + e1);
        }
    }

    /**
     * Iterate over all children of an XML element.
     *
     * @param anElement the element whose children are iterated over
     */
    private void iterateChildren(Element anElement) {
        Iterator<Element> elementIterator = anElement.elementIterator();
        while (elementIterator.hasNext()) {
            Element element = elementIterator.next();

            if (element.attribute("class") != null) {
                // System.out.println("This element (" + element.getName() +") contains a class-name");
                // System.out.println("Parent = " + element.getPath());
                if (!validClass(element.attribute("class").getText())) {
                    fail("Class not valid: " + element.attribute("class").getText());
                }
            }
            iterateChildren(element);
        }
    }

    /**
     * Check if the class exists and can be loaded. TODO: try to instantiate the class as well.
     *
     * @param className a name for a class
     * @return true, if the class exists and can be loaded
     */
    private boolean validClass(String className) {
        URLClassLoader loader = URLClassLoader.newInstance(classPathAsURLS());
        try {
            loader.loadClass(className);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            return false;
        }

        return true;
    }

    /**
     * Get the classpath as a list of URLs, as calculated by the private updateEnvironment() method of
     * AbstractJMXHeritrixController.
     *
     * @return The list of classpath URLs.
     */
    private URL[] classPathAsURLS() {
        URL[] urls = null;
        try {
            Method method = ReflectUtils.getPrivateMethod(AbstractJMXHeritrixController.class, "updateEnvironment",
                    Map.class);
            Map<String, String> environment = new HashMap<String, String>(System.getenv());
            method.invoke(null, environment);
            String[] urlStrings = environment.get("CLASSPATH").split(":");
            urls = new URL[urlStrings.length];
            for (int i = 0; i < urlStrings.length; i++) {
                urls[i] = new URL("file:" + urlStrings[i]);
            }
        } catch (NoSuchMethodException e) {
            e.printStackTrace(System.err);
            fail("Exception " + e.getMessage());
        } catch (IllegalAccessException e) {
            e.printStackTrace(System.err);
            fail("Exception " + e.getMessage());
        } catch (InvocationTargetException e) {
            e.printStackTrace(System.err);
            fail("Exception " + e.getMessage());
        } catch (MalformedURLException e) {
            e.printStackTrace(System.err);
            fail("Exception " + e.getMessage());
        }
        return urls;
    }

    /**
     * Create a BufferedReader over a sorted CDX file generated from the given ARC file.
     */
    private BufferedReader getCDXReaderForArc(File arcfile) throws Exception {
        File cdxfile = mtf.newTmpFile();
        OutputStream cdxstream = null;
        try {
            cdxstream = new FileOutputStream(cdxfile);
            CDXUtils.writeCDXInfo(arcfile, cdxstream);
        } finally {
            if (cdxstream != null) {
                cdxstream.close();
            }
        }
        File readable = File.createTempFile("sorted", "cdx", TestInfo.WORKING_DIR);
        FileUtils.sortCDX(cdxfile, readable);
        BufferedReader cr = new BufferedReader(new FileReader(readable));
        return cr;
    }

    /**
     * Utility method to validate and update an XML node given a set of legal values and a default value. It prints a
     * warning if the given settingValue is illegal.
     */
    private void checkAndSetOrderXMLNode(Document doc, String xpath, String settingName, String settingValue,
            String[] legalValues, String defaultValue) {
        boolean settingOK = false;
        for (String possibleValue : legalValues) {
            if (settingValue.equals(possibleValue)) {
                settingOK = true;
            }
        }
        if (!settingOK) {
            System.out.println(String.format("Unrecognized %s value given: %s. Replaced by default %s value: %s",
                    settingName, settingValue, settingName, defaultValue));
            settingValue = defaultValue;
        }
        setOrderXMLNode(doc, xpath, settingValue);
    }
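
    // Example use (a sketch mirroring the call in testDeduplicatorOrderXml above; "bogus-value"
    // is a hypothetical illegal value that would be replaced by the DeDuplicator default):
    // checkAndSetOrderXMLNode(d, DEDUPLICATOR_ORIGIN_HANDLING_XPATH, DeDuplicator.ATTR_ORIGIN_HANDLING,
    //         "bogus-value", DeDuplicator.AVAILABLE_ORIGIN_HANDLING, DeDuplicator.DEFAULT_ORIGIN_HANDLING);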

    /**
     * Set an XML node defined by the given XPath to the given value.
     *
     * @param doc the Document, which is being modified
     * @param xpath the given XPath
     * @param value the given value
     */
    private void setOrderXMLNode(Document doc, String xpath, String value) {
        Node xpath_node = doc.selectSingleNode(xpath);
        if (xpath_node == null) {
            throw new IOFailure("Element '" + xpath + "' could not be found in this order-file!");
        }
        xpath_node.setText(value);
    }
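
    // Example (hypothetical XPath, for illustration only):
    // setOrderXMLNode(doc, "/crawl-order/controller/string[@name='disk-path']", "/tmp/crawl");
    // An IOFailure is thrown if the XPath matches nothing in the order file.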

    class JobInfoTestImpl implements JobInfo {

        private Long jobId;
        private Long harvestId;

        public JobInfoTestImpl(Long jobId, Long harvestId) {
            this.jobId = jobId;
            this.harvestId = harvestId;
        }

        @Override
        public Long getJobID() {
            return jobId;
        }

        @Override
        public Long getOrigHarvestDefinitionID() {
            return this.harvestId;
        }

        @Override
        public String getHarvestFilenamePrefix() {
            return jobId + "-" + harvestId;
        }

    }
}