org.archive.modules.seeds.TextSeedModule.java Source code

Introduction

Here is the source code for org.archive.modules.seeds.TextSeedModule.java
Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.seeds;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.util.Iterator;
import java.util.concurrent.CountDownLatch;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.io.ReadSource;
import org.archive.modules.CrawlURI;
import org.archive.modules.SchedulingConstants;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.spring.WriteTarget;
import org.archive.util.ArchiveUtils;
import org.archive.util.DevUtils;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexLineIterator;
import org.springframework.beans.factory.annotation.Required;

/**
 * Module that announces a list of seeds from a text source (such
 * as a ConfigFile or ConfigString), and provides a mechanism for
 * adding seeds after a crawl has begun.
 *
 * @contributor gojomo
 */
public class TextSeedModule extends SeedModule implements ReadSource {
    private static final long serialVersionUID = 3L;

    private static final Logger logger = Logger.getLogger(TextSeedModule.class.getName());

    /**
     * Text from which to extract seeds
     */
    protected ReadSource textSource = null;

    public ReadSource getTextSource() {
        return textSource;
    }

    @Required
    public void setTextSource(ReadSource seedsSource) {
        this.textSource = seedsSource;
    }

    /**
     * Number of lines of seeds-source to read on initial load before proceeding
     * with crawl. Default is -1, meaning all. Any other value will cause that
     * number of lines to be loaded before fetching begins, while all extra
     * lines continue to be processed in the background. Generally, this should
     * only be changed when working with very large seed lists, and scopes that
     * do *not* depend on reading all seeds. 
     */
    protected int blockAwaitingSeedLines = -1;

    public int getBlockAwaitingSeedLines() {
        return blockAwaitingSeedLines;
    }

    public void setBlockAwaitingSeedLines(int blockAwaitingSeedLines) {
        this.blockAwaitingSeedLines = blockAwaitingSeedLines;
    }

    public TextSeedModule() {
    }

    /**
     * Announce all seeds from configured source to SeedListeners 
     * (including nonseed lines mixed in). 
     * @see org.archive.modules.seeds.SeedModule#announceSeeds()
     */
    public void announceSeeds() {
        if (getBlockAwaitingSeedLines() > -1) {
            final CountDownLatch latch = new CountDownLatch(getBlockAwaitingSeedLines());
            new Thread() {
                @Override
                public void run() {
                    announceSeeds(latch);
                    while (latch.getCount() > 0) {
                        latch.countDown();
                    }
                }
            }.start();
            try {
                latch.await();
            } catch (InterruptedException e) {
                // do nothing
            }
        } else {
            announceSeeds(null);
        }
    }

    protected void announceSeeds(CountDownLatch latchOrNull) {
        BufferedReader reader = new BufferedReader(textSource.obtainReader());
        try {
            announceSeedsFromReader(reader, latchOrNull);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Announce all seeds (and nonseed possible-directive lines) from
     * the given Reader
     * @param reader source of seed/directive lines
     * @param latchOrNull if non-null, sent countDown after each line, allowing 
     * another thread to proceed after a configurable number of lines processed
     */
    protected void announceSeedsFromReader(BufferedReader reader, CountDownLatch latchOrNull) {
        String s;
        Iterator<String> iter = new RegexLineIterator(new LineReadingIterator(reader),
                RegexLineIterator.COMMENT_LINE, RegexLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
                RegexLineIterator.ENTRY);

        int count = 0;
        while (iter.hasNext()) {
            s = (String) iter.next();
            if (Character.isLetterOrDigit(s.charAt(0))) {
                // consider a likely URI
                seedLine(s);
                count++;
                if (count % 20000 == 0) {
                    System.runFinalization();
                }
            } else {
                // report just in case it's a useful directive
                nonseedLine(s);
            }
            if (latchOrNull != null) {
                latchOrNull.countDown();
            }
        }
        publishConcludedSeedBatch();
    }

    /**
     * Handle a read line that is probably a seed.
     * 
     * @param uri String seed-containing line
     */
    protected void seedLine(String uri) {
        String originalUri = uri;
        if (!uri.matches("[a-zA-Z][\\w+\\-]+:.*")) { // Rfc2396 s3.1 scheme,
                                                     // minus '.'
                                                     // Does not begin with scheme, so try http://
            uri = "http://" + uri;
        }
        try {
            UURI uuri = UURIFactory.getInstance(uri);
            CrawlURI curi = new CrawlURI(uuri);
            curi.setSeed(true);
            curi.setSchedulingDirective(SchedulingConstants.MEDIUM);
            if (getSourceTagSeeds()) {
                curi.setSourceTag(originalUri);
            }
            publishAddedSeed(curi);
        } catch (URIException e) {
            // try as nonseed line as fallback
            nonseedLine(uri);
        }
    }

    /**
     * Handle a read line that is not a seed, but may still have
     * meaning to seed-consumers (such as scoping beans). 
     * 
     * @param uri String seed-containing line
     */
    protected void nonseedLine(String line) {
        publishNonSeedLine(line);
    }

    /**
     * Treat the given file as a source of additional seeds,
     * announcing to SeedListeners.
     * 
     * @see org.archive.modules.seeds.SeedModule#actOn(java.io.File)
     */
    public void actOn(File f) {
        BufferedReader reader = null;
        try {
            reader = ArchiveUtils.getBufferedReader(f);
            announceSeedsFromReader(reader, null);
        } catch (IOException ioe) {
            logger.log(Level.SEVERE, "problem reading seed file " + f, ioe);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Add a new seed to scope. By default, simply appends
     * to seeds file, though subclasses may handle differently.
     *
     * <p>This method is *not* sufficient to get the new seed 
     * scheduled in the Frontier for crawling -- it only 
     * affects the Scope's seed record (and decisions which
     * flow from seeds). 
     *
     * @param curi CandidateUri to add
     * @return true if successful, false if add failed for any reason
     */
    @Override
    public synchronized void addSeed(final CrawlURI curi) {
        if (!(textSource instanceof WriteTarget)) {
            // TODO: do something else to log seed update
            logger.warning("nowhere to log added seed: " + curi);
        } else {
            // TODO: determine if this modification to seeds file means
            // TextSeedModule should (again) be Checkpointable
            try {
                Writer fw = ((WriteTarget) textSource).obtainWriter(true);
                // Write to new (last) line the URL.
                fw.write("\n");
                fw.write("# Heritrix added seed "
                        + ((curi.getVia() != null) ? "redirect from " + curi.getVia() : "(JMX)") + ".\n");
                fw.write(curi.toString());
                fw.flush();
                fw.close();
            } catch (IOException e) {
                DevUtils.warnHandle(e, "problem writing new seed");
            }
        }
        publishAddedSeed(curi);
    }

    public Reader obtainReader() {
        return textSource.obtainReader();
    }
}