org.asqatasun.crawler.processor.module.AsqatasunTextSeedModule.java Source code

Java tutorial

Introduction

Here is the source code for org.asqatasun.crawler.processor.module.AsqatasunTextSeedModule.java

Source

/*
 * Asqatasun - Automated webpage assessment
 * Copyright (C) 2008-2015  Asqatasun.org
 *
 * This file is part of Asqatasun.
 *
 * Asqatasun is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Contact us by mail: asqatasun AT asqatasun DOT org
 */
package org.asqatasun.crawler.processor.module;

import java.io.*;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.io.ReadSource;
import org.archive.modules.CrawlURI;
import org.archive.modules.SchedulingConstants;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.spring.WriteTarget;
import org.archive.util.DevUtils;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexLineIterator;
import org.springframework.beans.factory.annotation.Required;

/**
 * Seed module that reads seeds, one entry per line, from a configured
 * text {@link ReadSource} and announces them through the publish
 * methods inherited from {@link AsqatasunSeedModule}.
 *
 * <p>The module is itself a {@link ReadSource}: {@link #obtainReader()}
 * simply delegates to the configured text source.
 *
 * @author jkowalczyk
 */
public class AsqatasunTextSeedModule extends AsqatasunSeedModule implements ReadSource {

    private static final long serialVersionUID = 3L;

    private static final Logger logger = Logger.getLogger(AsqatasunTextSeedModule.class.getName());

    /**
     * Matches a line that already starts with a URI scheme
     * (RFC 2396 section 3.1 scheme, minus '.'). Compiled once because
     * {@link #seedLine(String)} runs for every candidate seed line.
     */
    private static final Pattern SCHEME_PREFIX = Pattern.compile("[a-zA-Z][\\w+\\-]+:.*");

    /**
     * Text from which to extract seeds
     */
    protected ReadSource textSource = null;

    /**
     * @return the source the seed text is read from
     */
    public ReadSource getTextSource() {
        return textSource;
    }

    /**
     * Sets the source the seed text is read from.
     *
     * @param seedsSource source of the seed text
     */
    @Required
    public void setTextSource(ReadSource seedsSource) {
        this.textSource = seedsSource;
    }

    public AsqatasunTextSeedModule() {
    }

    /**
     * Announce all seeds from configured source to SeedListeners
     * (including nonseed lines mixed in). The reader obtained from the
     * text source is always closed before returning.
     *
     * @see org.archive.modules.seeds.SeedModule#announceSeeds()
     */
    @Override
    public void announceSeeds() {
        BufferedReader reader = new BufferedReader(textSource.obtainReader());
        try {
            announceSeedsFromReader(reader);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Announce all seeds (and nonseed possible-directive lines) from
     * the given Reader, then signal that the batch is complete.
     *
     * <p>The reader is not closed here; that is left to the caller.
     *
     * @param reader source of the lines to announce
     */
    protected void announceSeedsFromReader(BufferedReader reader) {
        // Line filtering/extraction is delegated to RegexLineIterator;
        // presumably it drops comment-only lines and strips trailing
        // comments -- see the RegexLineIterator constants to confirm.
        Iterator<String> iter = new RegexLineIterator(new LineReadingIterator(reader),
                RegexLineIterator.COMMENT_LINE, RegexLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
                RegexLineIterator.ENTRY);

        while (iter.hasNext()) {
            String entry = iter.next();
            if (Character.isLetterOrDigit(entry.charAt(0))) {
                // consider a likely URI
                seedLine(entry);
            } else {
                // report just in case it's a useful directive
                nonseedLine(entry);
            }
        }
        publishConcludedSeedBatch();
    }

    /**
     * Handle a read line that is probably a seed.
     *
     * <p>A line that does not start with a URI scheme gets
     * {@code http://} prepended. If the result cannot be parsed as a
     * URI, it is handed to {@link #nonseedLine(String)} instead.
     *
     * @param uri String seed-containing line
     */
    protected void seedLine(String uri) {
        if (!SCHEME_PREFIX.matcher(uri).matches()) {
            // Does not begin with scheme, so try http://
            uri = "http://" + uri;
        }
        try {
            UURI uuri = UURIFactory.getInstance(uri);
            CrawlURI curi = new CrawlURI(uuri);
            curi.setSeed(true);
            curi.setSchedulingDirective(SchedulingConstants.MEDIUM);
            if (getSourceTagSeeds()) {
                curi.setSourceTag(curi.toString());
            }
            publishAddedSeed(curi);
        } catch (URIException e) {
            // try as nonseed line as fallback
            // NOTE(review): this passes the possibly "http://"-prefixed
            // text rather than the line as read -- confirm this is intended.
            nonseedLine(uri);
        }
    }

    /**
     * Handle a read line that is not a seed, but may still have
     * meaning to seed-consumers (such as scoping beans).
     *
     * @param line String non-seed line
     */
    protected void nonseedLine(String line) {
        publishNonSeedLine(line);
    }

    /**
     * Treat the given file as a source of additional seeds,
     * announcing to SeedListeners. A missing file is logged at
     * SEVERE level and otherwise ignored.
     *
     * @param f file to read additional seed lines from
     * @see org.archive.modules.seeds.SeedModule#actOn(java.io.File)
     */
    @Override
    public void actOn(File f) {
        BufferedReader reader = null;
        try {
            // NOTE(review): FileReader decodes with the platform default
            // charset -- confirm this matches the encoding of seed files.
            reader = new BufferedReader(new FileReader(f));
            announceSeedsFromReader(reader);
        } catch (FileNotFoundException fnf) {
            logger.log(Level.SEVERE, "seed file source not found", fnf);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Add a new seed to scope. By default, simply appends
     * to seeds file, though subclasses may handle differently.
     *
     * <p>This method is *not* sufficient to get the new seed
     * scheduled in the Frontier for crawling -- it only
     * affects the Scope's seed record (and decisions which
     * flow from seeds).
     *
     * <p>When the text source is not a {@link WriteTarget} the seed
     * cannot be persisted and a warning is logged instead. In every
     * case the seed is published to listeners afterwards.
     *
     * @param curi CandidateUri to add
     */
    @Override
    public synchronized void addSeed(final CrawlURI curi) {
        if (!(textSource instanceof WriteTarget)) {
            // TODO: do something else to log seed update
            logger.log(Level.WARNING, "nowhere to log added seed: {0}", curi);
        } else {
            // TODO: determine if this modification to seeds file means
            // TextSeedModule should (again) be Checkpointable
            Writer fw = null;
            try {
                fw = ((WriteTarget) textSource).obtainWriter(true);
                // Write to new (last) line the URL.
                fw.write("\n");
                fw.write("# Heritrix added seed "
                        + ((curi.getVia() != null) ? "redirect from " + curi.getVia() : "(JMX)") + ".\n");
                fw.write(curi.toString());
                fw.flush();
                // Explicit close so a failure while closing is still reported.
                fw.close();
            } catch (IOException e) {
                DevUtils.warnHandle(e, "problem writing new seed");
            } finally {
                // Guarantees the writer is released when a write or flush
                // above failed before the explicit close was reached.
                IOUtils.closeQuietly(fw);
            }
        }
        publishAddedSeed(curi);
    }

    /**
     * @return a reader obtained from the configured text source
     */
    @Override
    public Reader obtainReader() {
        return textSource.obtainReader();
    }

}