com.cyberway.issue.crawler.scope.SeedCachingScopeTest.java Source code

Java tutorial

Introduction

Here is the source code for com.cyberway.issue.crawler.scope.SeedCachingScopeTest.java

Source

package com.cyberway.issue.crawler.scope;

/* SeedCachingScopeTest
*
* $Id: SeedCachingScopeTest.java 4651 2006-09-25 18:31:13Z paul_jack $
*
* Created on Mar 30, 2005
*
* Copyright (C) 2005 Internet Archive.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.httpclient.URIException;
import com.cyberway.issue.crawler.datamodel.CrawlURI;
import com.cyberway.issue.net.UURI;
import com.cyberway.issue.net.UURIFactory;
import com.cyberway.issue.util.TmpDirTestCase;

/**
* Test {@link SeedCachingScope}.
* @author stack gojomo
* @version $Revision: 4651 $, $Date: 2006-09-25 18:31:13 +0000 (Mon, 25 Sep 2006) $
*/
public class SeedCachingScopeTest extends TmpDirTestCase {
    /**
     * Constrained SeedCachingScope subclass for testing: it hands back the
     * seeds file it was constructed with from {@link #getSeedfile()}.
     *
     * <p>Declared as a static nested class so that instances do not carry an
     * implicit reference to the enclosing test case.
     *
     * @author gojomo
     */
    private static class UnitTestSeedCachingScope extends SeedCachingScope {

        private static final long serialVersionUID = -1651873833038665447L;

        /** Seeds file returned by {@link #getSeedfile()}. */
        private final File seedsfile;

        /**
         * @param seedsfile file returned from {@link #getSeedfile()}.
         */
        public UnitTestSeedCachingScope(File seedsfile) {
            super("test");
            this.seedsfile = seedsfile;
        }

        /**
         * NOTE(review): presumably overrides the superclass accessor so the
         * scope reads seeds from the test's own file -- confirm against
         * SeedCachingScope.
         *
         * @return the seeds file passed to the constructor.
         */
        public File getSeedfile() {
            return seedsfile;
        }
    }

    /**
     * Expected seeds, rebuilt by {@link #setUp()} before every test and
     * ordered by {@link #CMP}.
     */
    private Set<UURI> seeds = null;

    /**
     * Comparator for treeset of uuris.
     *
     * <p>Orders by the string form of each UURI. A null sorts before any
     * non-null value and two nulls compare equal. The result is always
     * exactly -1, 0 or 1.
     */
    private static final Comparator<UURI> CMP = new Comparator<UURI>() {
        public int compare(UURI o1, UURI o2) {
            if (o1 == null) {
                // Equal only when BOTH sides are null.
                return (o2 == null) ? 0 : -1;
            }
            if (o2 == null) {
                return 1;
            }
            int order = o1.toString().compareTo(o2.toString());
            // Normalise String.compareTo's arbitrary magnitude to a sign.
            return (order < 0) ? -1 : ((order > 0) ? 1 : 0);
        }
    };

    /**
     * Seed file reference.
     */
    private File seedsfile;

    /**
     * Builds the expected seed set and writes the same URIs, one per line,
     * to a seeds file in the test's temporary directory.
     *
     * @see com.cyberway.issue.util.TmpDirTestCase#setUp()
     * @throws Exception if the superclass set-up fails, a URI cannot be
     * parsed, or the seeds file cannot be written.
     */
    protected void setUp() throws Exception {
        super.setUp();

        String[] uris = { "mailto:www.google.com", "http://www.port.com:80/etc/motd2",
                "http://a:b@userinfo.com/etc/motd2", "news:www.google.com", "http://www.google.com",
                "https://www.google.com", "gopher://www.google.com", "news://www.google.com",
                "rss://www.google.com", "telnet://www.google.com", "ftp://myname@example.com/etc/motd",
                "ftp://example.com/etc/motd2" };

        // First collect the seeds into the expected treeset.
        this.seeds = new TreeSet<UURI>(CMP);
        for (String uri : uris) {
            this.seeds.add(UURIFactory.getInstance(uri));
        }

        // Write a seeds file w/ our list of seeds.
        this.seedsfile = new File(getTmpDir(), SeedCachingScopeTest.class.getName() + ".seedfile");
        PrintWriter writer = new PrintWriter(new FileWriter(this.seedsfile));
        try {
            for (String uri : uris) {
                writer.println(uri);
            }
            // PrintWriter swallows IOExceptions; surface a failed write here
            // rather than letting tests run against a truncated seeds file.
            if (writer.checkError()) {
                throw new IOException("Failed writing seeds file " + this.seedsfile);
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Removes the seeds file written by {@link #setUp()} and then runs the
     * superclass tear-down.
     *
     * @see com.cyberway.issue.util.TmpDirTestCase#tearDown()
     * @throws Exception if the superclass tear-down fails.
     */
    protected void tearDown() throws Exception {
        try {
            if (this.seedsfile != null && this.seedsfile.exists()) {
                this.seedsfile.delete();
            }
        } finally {
            // Always let the superclass clean up, whatever happened above.
            super.tearDown();
        }
    }

    /**
     * Checks that the scope iterates exactly the seeds in the seeds file, and
     * that a seed added through {@code addSeed} is iterated as well.
     *
     * @throws URIException if the extra seed URI cannot be parsed.
     */
    public void testGeneral() throws URIException {
        // First make sure that I can get the seed set from seed file.
        SeedCachingScope sl = checkContent(this.seeds);
        // Now do add and see if get set matches seed file content.
        final CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://one.two.three"));
        sl.addSeed(curi);
        Set<UURI> expected = new TreeSet<UURI>(CMP);
        expected.addAll(this.seeds);
        expected.add(curi.getUURI());
        checkContent(sl, expected);
    }

    /**
     * Appends a scheme-less entry to the seeds file and checks that the scope
     * iterates a seed whose host equals that entry.
     *
     * @throws IOException if the seeds file cannot be appended to, or a host
     * cannot be read from an iterated seed.
     */
    public void testNoScheme() throws IOException {
        final String NOSCHEME = "x.y.z";
        FileWriter fw = new FileWriter(this.seedsfile, true);
        try {
            // Write to new (last) line the URL.
            fw.write("\n");
            fw.write(NOSCHEME);
        } finally {
            // close() flushes; doing it in finally avoids leaking the handle.
            fw.close();
        }
        boolean found = false;
        SeedCachingScope sl = new UnitTestSeedCachingScope(seedsfile);
        for (Iterator<?> i = sl.seedsIterator(); i.hasNext();) {
            UURI uuri = (UURI) i.next();
            String host = uuri.getHost();
            // Seeds without a host cannot match.
            if (host != null && host.equals(NOSCHEME)) {
                found = true;
                break;
            }
        }
        assertTrue("Did not find " + NOSCHEME, found);
    }

    /**
     * Checks a freshly created scope, backed by the test seeds file, against
     * the expected seeds.
     *
     * @param seedSet expected seeds.
     * @return the scope that was created and checked.
     */
    private SeedCachingScope checkContent(Set<UURI> seedSet) {
        return checkContent(null, seedSet);
    }

    /**
     * Asserts that every seed iterated by the scope is in {@code seedSet} and
     * that the number of iterated seeds equals the size of {@code seedSet}.
     *
     * @param sl scope to check; if null, a new scope backed by the test seeds
     * file is created.
     * @param seedSet expected seeds.
     * @return the scope that was checked.
     */
    private SeedCachingScope checkContent(SeedCachingScope sl, Set<UURI> seedSet) {
        if (sl == null) {
            sl = new UnitTestSeedCachingScope(this.seedsfile);
        }
        int count = 0;
        for (Iterator<?> i = sl.seedsIterator(); i.hasNext();) {
            count++;
            UURI uuri = (UURI) i.next();
            assertTrue("Does not contain: " + uuri.toString(), seedSet.contains(uuri));
        }
        assertTrue("Different sizes: " + count + ", " + seedSet.size(), count == seedSet.size());
        return sl;
    }
}