Java tutorial: inside Norconex's AbstractCollector
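This tutorial walks through the full source of com.norconex.collector.core.AbstractCollector, the base class behind Norconex collectors. The class wires one or more crawlers into a JEF JobSuite, exposes start() and stop() lifecycle methods, and logs the versions of the modules involved. The complete, annotated listing follows, and a hedged usage sketch comes after it.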
/* Copyright 2014 Norconex Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.norconex.collector.core;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import com.norconex.collector.core.crawler.ICrawler;
import com.norconex.collector.core.crawler.ICrawlerConfig;
import com.norconex.committer.core.ICommitter;
import com.norconex.importer.Importer;
import com.norconex.jef4.job.IJob;
import com.norconex.jef4.job.group.AsyncJobGroup;
import com.norconex.jef4.log.FileLogManager;
import com.norconex.jef4.status.FileJobStatusStore;
import com.norconex.jef4.status.IJobStatus;
import com.norconex.jef4.status.JobState;
import com.norconex.jef4.suite.AbstractSuiteLifeCycleListener;
import com.norconex.jef4.suite.JobSuite;
import com.norconex.jef4.suite.JobSuiteConfig;

/**
 * Base implementation of a Collector.
 * Instances of this class can hold several crawlers, running at once.
 * This is convenient when there are configuration settings to be shared
 * amongst crawlers. When you have many crawler jobs defined that have
 * nothing in common, it may be best to configure and run them separately,
 * to facilitate troubleshooting. There is no hard rule for this;
 * experimentation will help you decide.
 * @author Pascal Essiembre
 */
@SuppressWarnings("nls")
public abstract class AbstractCollector implements ICollector {

    private static final Logger LOG =
            LogManager.getLogger(AbstractCollector.class);

    private AbstractCollectorConfig collectorConfig;
    private ICrawler[] crawlers;
    private JobSuite jobSuite;

    /**
     * Creates and configures a Collector with the provided
     * configuration.
     * @param collectorConfig Collector configuration
     */
    public AbstractCollector(AbstractCollectorConfig collectorConfig) {
        //TODO clone config so modifications no longer apply.
        if (collectorConfig == null) {
            throw new IllegalArgumentException(
                    "Collector Configuration cannot be null.");
        }
        this.collectorConfig = collectorConfig;

        ICrawlerConfig[] crawlerConfigs =
                this.collectorConfig.getCrawlerConfigs();
        if (crawlerConfigs != null) {
            ICrawler[] newCrawlers = new ICrawler[crawlerConfigs.length];
            for (int i = 0; i < crawlerConfigs.length; i++) {
                ICrawlerConfig crawlerConfig = crawlerConfigs[i];
                newCrawlers[i] = createCrawler(crawlerConfig);
            }
            this.crawlers = newCrawlers;
        } else {
            this.crawlers = new ICrawler[] {};
        }
    }

    /**
     * Gets the job suite.
     * @return the jobSuite
     */
    @Override
    public JobSuite getJobSuite() {
        return jobSuite;
    }
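    // The lifecycle methods below drive execution. start() validates the
    // configuration, builds a JEF JobSuite from the configured crawlers,
    // and runs it (optionally resuming an aborted run); stop() asks a
    // running suite to shut down gracefully.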
    /**
     * Starts all crawlers defined in the configuration.
     * @param resumeNonCompleted whether to resume where the previous
     *        crawler aborted (if applicable)
     */
    @Override
    public void start(boolean resumeNonCompleted) {
        //TODO move this code to a config validator class?
        if (StringUtils.isBlank(getCollectorConfig().getId())) {
            throw new CollectorException("Collector must be given "
                    + "a unique identifier (id).");
        }
        if (jobSuite != null) {
            throw new CollectorException(
                    "Collector is already running. Wait for it to complete "
                    + "before starting the same instance again, or stop "
                    + "the currently running instance first.");
        }
        jobSuite = createJobSuite();
        try {
            jobSuite.execute(resumeNonCompleted);
        } finally {
            jobSuite = null;
        }
    }

    /**
     * Stops a running instance of this Collector.
     */
    @Override
    public void stop() {
        if (jobSuite == null) {
            jobSuite = createJobSuite();
        }
        IJobStatus status = jobSuite.getStatus();
        if (status == null
                || !status.isState(JobState.RUNNING, JobState.UNKNOWN)) {
            // Guard against a null status here instead of dereferencing
            // jobSuite.getStatus() again, which would throw a
            // NullPointerException when no status exists.
            throw new CollectorException(
                    "This collector cannot be stopped since it is NOT "
                    + "running. Current state: "
                    + (status == null ? "UNKNOWN" : status.getState()));
        } else if (LOG.isDebugEnabled()) {
            LOG.debug("Suite state: " + status.getState());
        }
        try {
            LOG.info("Making a stop request...");
            jobSuite.stop();
            LOG.info("Stop request made.");
            LOG.info("PLEASE NOTE: To ensure a clean stop, "
                    + "crawlers may wait until they are done with documents "
                    + "currently being processed. If an urgent stop is "
                    + "required or you do not want to wait, manually kill "
                    + "the process.");
            //TODO wait for stop confirmation before setting to null?
            jobSuite = null;
        } catch (IOException e) {
            throw new CollectorException(
                    "Could not stop collector: " + getId(), e);
        }
    }

    @Override
    public JobSuite createJobSuite() {
        ICrawler[] crawlers = getCrawlers();

        IJob rootJob = null;
        if (crawlers.length > 1) {
            rootJob = new AsyncJobGroup(getId(), crawlers);
        } else if (crawlers.length == 1) {
            rootJob = crawlers[0];
        }

        JobSuiteConfig suiteConfig = new JobSuiteConfig();

        //TODO have a base workdir, which is used to figure out where to put
        // everything (log, progress), and make log and progress overwritable.
        ICollectorConfig collectorConfig = getCollectorConfig();
        suiteConfig.setLogManager(
                new FileLogManager(collectorConfig.getLogsDir()));
        suiteConfig.setJobStatusStore(
                new FileJobStatusStore(collectorConfig.getProgressDir()));
        suiteConfig.setWorkdir(collectorConfig.getProgressDir());
        suiteConfig.setSuiteLifeCycleListeners(
                new AbstractSuiteLifeCycleListener() {
            @Override
            public void suiteStarted(JobSuite suite) {
                printReleaseVersion();
            }
        });
        JobSuite suite = new JobSuite(rootJob, suiteConfig);
        LOG.info("Suite of " + crawlers.length + " crawler jobs created.");
        return suite;
    }

    /**
     * Creates a new crawler instance.
     * @param config crawler configuration
     * @return new crawler
     */
    protected abstract ICrawler createCrawler(ICrawlerConfig config);

    /**
     * Gets the collector configuration.
     * @return the collectorConfig
     */
    @Override
    public AbstractCollectorConfig getCollectorConfig() {
        return collectorConfig;
    }

    @Override
    public String getId() {
        return collectorConfig.getId();
    }

    /**
     * Sets the crawlers on this collector, replacing any existing ones.
     * @param crawlers crawlers to set
     */
    public void setCrawlers(ICrawler[] crawlers) {
        this.crawlers = Arrays.copyOf(crawlers, crawlers.length);
    }
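    // Both setCrawlers() and getCrawlers() copy the array with
    // Arrays.copyOf() so callers cannot mutate the collector's internal
    // crawler list through a shared array reference.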
    /**
     * Gets all crawler instances in this collector.
     * @return crawlers
     */
    public ICrawler[] getCrawlers() {
        return Arrays.copyOf(crawlers, crawlers.length);
    }

    private void printReleaseVersion() {
        printReleaseVersion("Collector", getClass().getPackage());
        printReleaseVersion(
                "Collector Core", AbstractCollector.class.getPackage());
        printReleaseVersion("Importer", Importer.class.getPackage());
        printReleaseVersion("JEF", IJob.class.getPackage());

        //--- Committers ---
        printReleaseVersion("Committer Core", ICommitter.class.getPackage());
        Set<ICommitter> committers = new HashSet<>();
        for (ICrawler crawler : getCrawlers()) {
            ICommitter committer = crawler.getCrawlerConfig().getCommitter();
            if (committer != null) {
                Package committerPackage = committer.getClass().getPackage();
                if (committerPackage != null
                        && !committerPackage.getName().startsWith(
                                "com.norconex.committer.core")) {
                    committers.add(committer);
                }
            }
        }
        for (ICommitter c : committers) {
            printReleaseVersion(
                    c.getClass().getSimpleName(), c.getClass().getPackage());
        }
    }

    private void printReleaseVersion(String moduleName, Package p) {
        String version = p.getImplementationVersion();
        if (StringUtils.isBlank(version)) {
            // No version is likely due to using an unpacked or modified
            // jar, or the jar not being packaged with version
            // information.
            LOG.info("Version: \"" + moduleName + "\" version is undefined.");
            return;
        }
        LOG.info("Version: " + p.getImplementationTitle() + " "
                + p.getImplementationVersion()
                + " (" + p.getImplementationVendor() + ")");
    }
}
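To use AbstractCollector, you subclass it and implement createCrawler() so each crawler configuration becomes a concrete crawler. Below is a minimal sketch of what such a subclass and its entry point could look like. It is not part of the Norconex code base: MyCollectorConfig (an AbstractCollectorConfig subclass) and MyCrawler (an ICrawler implementation) are hypothetical stand-ins, and the setId/setLogsDir/setProgressDir setters are assumed to exist on the configuration class to match the getters used in createJobSuite() above.

// A minimal sketch, assuming hypothetical MyCollectorConfig and
// MyCrawler classes; in real projects a concrete pair such as
// HttpCollector/HttpCollectorConfig plays this role.
package com.norconex.collector.example;

import com.norconex.collector.core.AbstractCollector;
import com.norconex.collector.core.crawler.ICrawler;
import com.norconex.collector.core.crawler.ICrawlerConfig;

public class MyCollector extends AbstractCollector {

    public MyCollector(MyCollectorConfig config) {
        super(config);
    }

    // The only abstract method: turn each crawler configuration
    // into a concrete crawler instance.
    @Override
    protected ICrawler createCrawler(ICrawlerConfig config) {
        return new MyCrawler(config); // hypothetical ICrawler implementation
    }

    public static void main(String[] args) {
        MyCollectorConfig config = new MyCollectorConfig();
        config.setId("my-collector");        // required: unique identifier
        config.setLogsDir("./logs");         // consumed by FileLogManager
        config.setProgressDir("./progress"); // consumed by FileJobStatusStore
        // config.setCrawlerConfigs(...);    // one entry per crawler to run

        MyCollector collector = new MyCollector(config);
        collector.start(false); // false = fresh start, true = resume
    }
}

Note that start() throws a CollectorException if the id is blank or if the same instance is already running, and with more than one crawler configuration the crawlers run concurrently under an AsyncJobGroup, exactly as createJobSuite() above sets up.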