com.cyberway.issue.crawler.settings.XMLSettingsHandlerTest.java Source code

Java tutorial

Introduction

Here is the source code for com.cyberway.issue.crawler.settings.XMLSettingsHandlerTest.java

Source

/*
 * XMLSettingsHandlerTest
 *
 * $Id: XMLSettingsHandlerTest.java 3704 2005-07-18 17:30:21Z stack-sf $
 *
 * Created on Jan 28, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify it under the
 * terms of the GNU Lesser Public License as published by the Free Software
 * Foundation; either version 2.1 of the License, or any later version.
 *
 * Heritrix is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License along with
 * Heritrix; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place, Suite 330, Boston, MA 02111-1307 USA
 */
package com.cyberway.issue.crawler.settings;

import java.io.File;
import java.io.IOException;
import java.text.ParseException;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import com.cyberway.issue.crawler.datamodel.CrawlOrder;
import com.cyberway.issue.crawler.datamodel.CrawlURI;
import com.cyberway.issue.crawler.framework.CrawlScope;
import com.cyberway.issue.crawler.scope.ClassicScope;
import com.cyberway.issue.crawler.settings.refinements.Criteria;
import com.cyberway.issue.crawler.settings.refinements.PortnumberCriteria;
import com.cyberway.issue.crawler.settings.refinements.Refinement;
import com.cyberway.issue.crawler.settings.refinements.RegularExpressionCriteria;
import com.cyberway.issue.crawler.settings.refinements.TimespanCriteria;
import com.cyberway.issue.net.UURIFactory;

/**
 * Tests for {@link XMLSettingsHandler}: writing, re-reading, copying and
 * deleting settings files, plus round-tripping refinements.
 *
 * The fixture accessors used below (<code>getSettingsHandler()</code>,
 * <code>getGlobalSettings()</code>, <code>getPerHostSettings()</code>,
 * <code>getOrderFile()</code>, <code>getTmpDir()</code>,
 * <code>getMatchDomainURI()</code>, <code>getMatchHostURI()</code>) are not
 * declared in this class; they are inherited from the test base class.
 *
 * @author John Erik Halse
 */
public class XMLSettingsHandlerTest extends SettingsFrameworkTestCase {

    /**
     * Delegates fixture creation to the superclass without adding anything.
     *
     * @see TestCase#setUp()
     */
    protected void setUp() throws Exception {
        super.setUp();
    }

    /**
     * Delegates fixture cleanup to the superclass without adding anything.
     *
     * @see TestCase#tearDown()
     */
    protected void tearDown() throws Exception {
        super.tearDown();
    }

    /**
     * Test for void writeSettingsObject(CrawlerSettings).
     *
     * Writes the global order, overrides two attributes for the per host
     * settings, writes those as well, and then checks through a freshly
     * constructed handler that both overrides can be read back.
     */
    public void testWriteSettingsObjectCrawlerSettings()
            throws AttributeNotFoundException, InvalidAttributeValueException, MBeanException, ReflectionException {

        // Values the per host overrides are expected to carry after the round trip.
        final Integer expectedHops = new Integer(500);
        final String expectedFrom = "newfrom";

        // Write the global crawl order, with a ClassicScope set on it.
        final CrawlerSettings globalSettings = getGlobalSettings();
        final XMLSettingsHandler writer = getSettingsHandler();
        writer.registerValueErrorHandler(this);
        writer.getOrder().setAttribute(new ClassicScope());
        writer.writeSettingsObject(globalSettings);
        assertTrue("Order file was not written", getOrderFile().exists());

        // Fetch the scope module; it receives the first override.
        final ComplexType scopeModule = globalSettings.getModule(CrawlScope.ATTR_NAME);
        assertNotNull("Could not get module scope", scopeModule);

        // Override ATTR_MAX_LINK_HOPS on the scope and ATTR_FROM on the
        // http headers, both in the context of the per host settings.
        final CrawlerSettings hostSettings = getPerHostSettings();
        final Attribute hopsOverride = new Attribute(ClassicScope.ATTR_MAX_LINK_HOPS, expectedHops);
        scopeModule.setAttribute(hostSettings, hopsOverride);
        final CrawlOrder crawlOrder = writer.getOrder();
        final ComplexType headers = (ComplexType) crawlOrder.getAttribute(CrawlOrder.ATTR_HTTP_HEADERS);
        final Attribute fromOverride = new Attribute(CrawlOrder.ATTR_FROM, expectedFrom);
        headers.setAttribute(hostSettings, fromOverride);

        // Write the per host settings and make sure a file now exists for them.
        writer.writeSettingsObject(hostSettings);
        assertTrue("Per host file was not written", writer.settingsToFilename(hostSettings).exists());

        // Use a separate handler built from the order file to verify that
        // the changes were written to disk.
        final XMLSettingsHandler reader = new XMLSettingsHandler(getOrderFile());
        reader.initialize();

        // Re-read the per host settings by scope.
        final CrawlerSettings reloadedHost = reader.getSettingsObject(hostSettings.getScope());
        assertNotNull("Per host scope could not be read", reloadedHost);

        // The hops override must be visible through the re-read scope module.
        final ComplexType reloadedScope = reader.getModule(CrawlScope.ATTR_NAME);
        assertNotNull(reloadedScope);
        assertEquals(expectedHops,
                (Integer) reloadedScope.getAttribute(reloadedHost, ClassicScope.ATTR_MAX_LINK_HOPS));

        // The From override must be visible through the re-read http headers.
        final CrawlOrder reloadedOrder = reader.getOrder();
        final ComplexType reloadedHeaders = (ComplexType) reloadedOrder.getAttribute(reloadedHost,
                CrawlOrder.ATTR_HTTP_HEADERS);
        assertNotNull(reloadedHeaders);
        assertEquals(expectedFrom,
                (String) reloadedHeaders.getAttribute(reloadedHost, CrawlOrder.ATTR_FROM));
    }

    /**
     * Test the copying of the entire settings directory.
     *
     * After the copy, the handler is expected to resolve the per host
     * settings to a file whose absolute path contains the new directory name.
     *
     * @throws IOException
     */
    public void testCopySettings() throws IOException {
        final String copiedSettingsDir = "SETTINGS_new_per_host_settings";

        // Write both the global and the per host settings files.
        final XMLSettingsHandler handler = getSettingsHandler();
        handler.writeSettingsObject(getGlobalSettings());
        handler.writeSettingsObject(getPerHostSettings());

        // Copy everything to a new order file and settings directory.
        final File copiedOrderFile = new File(getTmpDir(), "SETTINGS_new_order.xml");
        handler.copySettings(copiedOrderFile, copiedSettingsDir);

        // Check that the new files were created.
        assertTrue("Order file was not written", copiedOrderFile.exists());

        final File relocatedPerHostFile = handler.settingsToFilename(getPerHostSettings());
        final String relocatedPath = relocatedPerHostFile.getAbsolutePath();
        final String directoryPattern = ".*" + copiedSettingsDir + ".*";
        assertTrue("New settings dir not set", relocatedPath.matches(directoryPattern));
        assertTrue("Per host file was not written", handler.settingsToFilename(getPerHostSettings()).exists());
    }

    /**
     * Checks that <code>getSettings("localhost.localdomain")</code> returns a
     * non-null object that is the very same instance as the one returned by
     * <code>getSettingsObject(null)</code>.
     */
    public void testGetSettings() {
        final XMLSettingsHandler handler = getSettingsHandler();
        final CrawlerSettings globalSettings = handler.getSettingsObject(null);
        final CrawlerSettings resolved = handler.getSettings("localhost.localdomain");

        assertNotNull("Didn't get any file", resolved);
        assertSame("Did not get same file", globalSettings, resolved);
    }

    /**
     * Checks lookup and creation of settings objects: the null scope yields
     * an object, an unknown scope yields null until it has been created with
     * <code>getOrCreateSettingsObject</code>, and is found afterwards.
     */
    public void testGetSettingsObject() {
        final String host = "audio.archive.org";
        final XMLSettingsHandler handler = getSettingsHandler();

        final CrawlerSettings globalSettings = handler.getSettingsObject(null);
        assertNotNull("Couldn't get orderfile", globalSettings);

        final CrawlerSettings beforeCreate = handler.getSettingsObject(host);
        assertNull("Got nonexisting per host file", beforeCreate);

        assertNotNull("Couldn't create per host file", handler.getOrCreateSettingsObject(host));

        final CrawlerSettings afterCreate = handler.getSettingsObject(host);
        assertNotNull("Couldn't get per host file", afterCreate);
    }

    /**
     * Checks that the file backing the per host settings exists after a
     * write and no longer exists after <code>deleteSettingsObject</code>.
     */
    public void testDeleteSettingsObject() {
        final XMLSettingsHandler handler = getSettingsHandler();
        final File perHostFile = handler.settingsToFilename(getPerHostSettings());

        handler.writeSettingsObject(getPerHostSettings());
        final boolean presentAfterWrite = perHostFile.exists();
        assertTrue("Per host file was not written", presentAfterWrite);

        handler.deleteSettingsObject(getPerHostSettings());
        final boolean presentAfterDelete = perHostFile.exists();
        assertFalse("Per host file was not deleted", presentAfterDelete);
    }

    /**
     * Writes one refinement on the global settings and one on the per host
     * settings, each overriding <code>CrawlOrder.ATTR_FROM</code>, then
     * re-reads everything with a new handler and checks which From value is
     * seen for three different URIs.
     */
    public void testReadWriteRefinements() throws ParseException, InvalidAttributeValueException,
            AttributeNotFoundException, MBeanException, ReflectionException, URIException {
        // From values assigned to the two refinements.
        final String globalRefinedFrom = "refined@global.address";
        final String hostRefinedFrom = "refined@per.address";

        final XMLSettingsHandler writer = getSettingsHandler();
        final CrawlerSettings globalSettings = getGlobalSettings();
        final CrawlerSettings hostSettings = getPerHostSettings();
        final CrawlOrder crawlOrder = writer.getOrder();
        final ComplexType headers = (ComplexType) crawlOrder.getAttribute(CrawlOrder.ATTR_HTTP_HEADERS);

        // The From value before any refinement has been applied.
        final String unrefinedFrom = (String) headers.getAttribute(CrawlOrder.ATTR_FROM);

        // Global refinement "test" with a timespan and a regular expression criteria.
        final Refinement globalRefinement = new Refinement(globalSettings, "test", "Refinement test");
        final Criteria timespan = new TimespanCriteria("2300", "2300");
        globalRefinement.addCriteria(timespan);
        final Criteria hostPattern = new RegularExpressionCriteria(".*www.*");
        globalRefinement.addCriteria(hostPattern);
        writer.writeSettingsObject(globalSettings);

        // Override From inside the global refinement and write it.
        final CrawlerSettings globalRefinementSettings = globalRefinement.getSettings();
        final Attribute globalFromOverride = new Attribute(CrawlOrder.ATTR_FROM, globalRefinedFrom);
        headers.setAttribute(globalRefinementSettings, globalFromOverride);
        writer.writeSettingsObject(globalRefinementSettings);

        // Per host refinement "test2" with a port number criteria.
        final Refinement hostRefinement = new Refinement(hostSettings, "test2", "Refinement test2");
        final Criteria port = new PortnumberCriteria("10");
        hostRefinement.addCriteria(port);
        writer.writeSettingsObject(hostSettings);

        // Override From inside the per host refinement and write it.
        final CrawlerSettings hostRefinementSettings = hostRefinement.getSettings();
        final Attribute hostFromOverride = new Attribute(CrawlOrder.ATTR_FROM, hostRefinedFrom);
        headers.setAttribute(hostRefinementSettings, hostFromOverride);
        writer.writeSettingsObject(hostRefinementSettings);

        // Use a separate handler built from the order file to verify that
        // the changes were written to disk.
        final XMLSettingsHandler reader = new XMLSettingsHandler(getOrderFile());
        reader.initialize();
        final CrawlerSettings reloadedGlobal = reader.getSettingsObject(null);
        assertNotNull("Global scope could not be read", reloadedGlobal);
        final CrawlerSettings reloadedHost = reader.getSettingsObject(hostSettings.getScope());
        assertNotNull("Per host scope could not be read", reloadedHost);

        final CrawlOrder reloadedOrder = reader.getOrder();
        final ComplexType reloadedHeaders = (ComplexType) reloadedOrder.getAttribute(CrawlOrder.ATTR_HTTP_HEADERS);
        assertNotNull(reloadedHeaders);

        // Resolve From for all three URIs first; the comparisons follow afterwards.
        final String fromForDomainUri = (String) reloadedHeaders.getAttribute(CrawlOrder.ATTR_FROM,
                getMatchDomainURI());
        final String fromForHostUri = (String) reloadedHeaders.getAttribute(CrawlOrder.ATTR_FROM,
                getMatchHostURI());
        final CrawlURI hostAndPortUri = new CrawlURI(
                UURIFactory.getInstance("http://www.archive.org:10/index.html"));
        final String fromForHostAndPortUri = (String) reloadedHeaders.getAttribute(CrawlOrder.ATTR_FROM,
                hostAndPortUri);

        // Expected: unrefined value, global refinement value, per host refinement value.
        assertEquals(unrefinedFrom, fromForDomainUri);
        assertEquals(globalRefinedFrom, fromForHostUri);
        assertEquals(hostRefinedFrom, fromForHostAndPortUri);
    }
}