org.archive.crawler.framework.Engine.java Source code

Introduction

Here is the source code for org.archive.crawler.framework.Engine.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.crawler.framework;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.archive.util.ArchiveUtils;

/**
 * Implementation for Engine. Jobs and profiles are stored in a
 * directory called the jobsDir. Each job is contained in a subdirectory
 * of the jobsDir.
 * 
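 * <p>A minimal construction sketch (hypothetical path; the directory is
 * created if it does not exist):
 * <pre>{@code
 * Engine engine = new Engine(new File("/var/heritrix/jobs"));
 * Map<String, CrawlJob> jobs = engine.getJobConfigs();
 * }</pre>
 * 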
 * @contributor pjack
 * @contributor gojomo
 */
public class Engine {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 4L;

    final public static String LOGS_DIR_NAME = "logs subdirectory";
    final public static String REPORTS_DIR_NAME = "reports subdirectory";

    final private static Logger LOGGER = Logger.getLogger(Engine.class.getName());

    /** directory where job directories are expected */
    protected File jobsDir;
    /** map of job short names -> CrawlJob instances */
    protected HashMap<String, CrawlJob> jobConfigs = new HashMap<String, CrawlJob>();

    protected String profileCxmlPath = "/org/archive/crawler/restlet/profile-crawler-beans.cxml";

    public Engine(File jobsDir) {
        this.jobsDir = jobsDir;

        try {
            org.archive.util.FileUtils.ensureWriteableDirectory(jobsDir);
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }

        findJobConfigs();
        // TODO: cleanup any cruft from improperly ended jobs 
    }

    /**
 * Find all job configurations in the usual places: subdirectories of the
 * jobs directory containing a '.cxml' file, and directories referenced by
 * '.jobpath' files (previously added by the user) found in the jobs directory.
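 *
 * <p>For example (hypothetical layout), both of the following are discovered:
 * <pre>
 * jobs/myjob/crawler-beans.cxml   (job in a subdirectory of jobsDir)
 * jobs/external.jobpath           (names an external job directory)
 * </pre>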
     */
    public void findJobConfigs() {
        // TODO: allow other places/paths to be scanned/added as well?

        // remove crawljobs whose directories have disappeared
        // TODO: try a more delicate cleanup; eg: if appCtx exists?
        for (String jobName : jobConfigs.keySet().toArray(new String[0])) {
            CrawlJob cj = jobConfigs.get(jobName);
            if (!cj.getJobDir().exists()) {
                jobConfigs.remove(jobName);
            }
        }

        // just in case...
        if (!jobsDir.exists()) {
            LOGGER.log(Level.SEVERE, "jobsDir has disappeared: " + jobsDir.toString());
            return;
        }

        // discover any new job directories
        for (File candidateFile : jobsDir.listFiles()) {
            File jobFile = candidateFile;
            if (candidateFile.getName().endsWith(".jobpath")) {
                // convert .jobpaths to the referenced external directory
                jobFile = getJobDirectoryFrom(candidateFile);
                if (jobFile == null) {
                    // unreadable .jobpath; error already logged
                    continue;
                }
            }
            if (jobConfigs.containsKey(jobFile.getName())) {
                continue;
            }
            if (!addJobDirectory(jobFile)) {
                LOGGER.log(Level.WARNING,
                        "invalid job directory: " + jobFile + " where job expected from: " + candidateFile);
            }
        }
    }

    /**
     * Return the job directory File read from the supplied ".jobpath" file,
     * or null on any error. 
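 * A '.jobpath' file is expected to contain a single filesystem path,
 * e.g. (hypothetical):
 * <pre>
 * /data/external-jobs/myjob
 * </pre>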
     */
    protected File getJobDirectoryFrom(File jobPathFile) {
        try {
            return new File(FileUtils.readFileToString(jobPathFile).trim());
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "bad .jobpath: " + jobPathFile, e);
            return null;
        }
    }

    /**
 * Adds a job directory to the Engine's known jobConfigs, if not already
 * present.
     * 
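 * <pre>{@code
 * // hypothetical external directory containing a *.cxml file
 * boolean ok = engine.addJobDirectory(new File("/data/external-jobs/myjob"));
 * }</pre>
 * 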
     * @param dir directory to be added
     * @return true if directory successfully added, false for any failure
     */
    public boolean addJobDirectory(File dir) {
        if (dir == null) {
            return false;
        }
        File[] candidateConfigs = dir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.endsWith(".cxml");
            }
        });
        if (candidateConfigs == null || candidateConfigs.length == 0) {
            // no CXML file found!
            return false;
        }
        if (jobConfigs.containsKey(dir.getName())) {
            // same-name job already exists
            return false;
        }
        for (File cxml : candidateConfigs) {
            try {
                CrawlJob cj = new CrawlJob(cxml);
                if (!cj.getJobDir().getParentFile().equals(getJobsDir())) {
                    writeJobPathFile(cj);
                }
                jobConfigs.put(cj.getShortName(), cj);
                LOGGER.log(Level.INFO, "added crawl job: " + cj.getShortName());
                return true;
            } catch (IOException ioe) {
                LOGGER.log(Level.SEVERE, "unable to add job directory: " + dir, ioe);
            } catch (IllegalArgumentException iae) {
                LOGGER.log(Level.SEVERE, "bad cxml: " + cxml, iae);
            }
        }
        // path rejected for some reason
        return false;
    }

    public Map<String, CrawlJob> getJobConfigs() {
        return jobConfigs;
    }

    /**
     * Copy a job to a new location, possibly making a job
     * a profile or a profile a runnable job. 
     * 
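 * <pre>{@code
 * // hypothetical: clone an existing job into a new profile directory
 * engine.copy(engine.getJob("myjob"), new File(engine.getJobsDir(), "myprofile"), true);
 * }</pre>
 * 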
     * @param orig CrawlJob representing source
     * @param destDir File location destination
     * @param asProfile true if destination should become a profile
     * @throws IOException 
     */
    public synchronized void copy(CrawlJob orig, File destDir, boolean asProfile) throws IOException {
        org.archive.util.FileUtils.ensureWriteableDirectory(destDir);
        if (destDir.list().length > 0) {
            throw new IOException("destination dir not empty");
        }
        File srcDir = orig.getPrimaryConfig().getParentFile();

        // FIXME: Add option for only copying history DB
        // FIXME: Don't hardcode these names
        // FIXME: (?) copy any referenced file (ConfigFile/ConfigPath),
        // even outside the job directory? 

        // copy all simple files except the 'job.log' and its '.lck' (if any)
        FileUtils.copyDirectory(srcDir, destDir, FileFilterUtils.andFileFilter(FileFilterUtils.fileFileFilter(),
                FileFilterUtils.notFileFilter(FileFilterUtils.prefixFileFilter("job.log"))));

        // ...and all contents of 'resources' subdir...
        File srcResources = new File(srcDir, "resources");
        if (srcResources.isDirectory()) {
            FileUtils.copyDirectory(srcResources, new File(destDir, "resources"));
        }

        File newPrimaryConfig = new File(destDir, orig.getPrimaryConfig().getName());
        if (asProfile) {
            if (!orig.isProfile()) {
                // rename cxml to have 'profile-' prefix
                FileUtils.moveFile(newPrimaryConfig, new File(destDir, "profile-" + newPrimaryConfig.getName()));
            }
        } else {
            if (orig.isProfile()) {
                // rename cxml to remove 'profile-' prefix
                FileUtils.moveFile(newPrimaryConfig, new File(destDir, newPrimaryConfig.getName().substring(8)));
            }
        }
        findJobConfigs();
    }

    /**
     * Copy a job to a new location, possibly making a job
     * a profile or a profile a runnable job. 
     * 
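 * <pre>{@code
 * // hypothetical: a relative destination is resolved under jobsDir
 * engine.copy(engine.getJob("myjob"), "myjob-copy", false);
 * }</pre>
 * 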
     * @param cj CrawlJob representing source
     * @param copyTo String location destination; interpreted relative to jobsDir
     * @param asProfile true if destination should become a profile
     * @throws IOException 
     */
    public void copy(CrawlJob cj, String copyTo, boolean asProfile) throws IOException {
        File dest = new File(copyTo);
        if (!dest.isAbsolute()) {
            dest = new File(jobsDir, copyTo);
        }
        copy(cj, dest, asProfile);
    }

    public String getHeritrixVersion() {
        return ArchiveUtils.VERSION;
    }

    public synchronized void deleteJob(CrawlJob job) throws IOException {
        FileUtils.deleteDirectory(job.getJobDir());
    }

    public void requestLaunch(String shortName) {
        jobConfigs.get(shortName).launch();
    }

    public CrawlJob getJob(String shortName) {
        if (!jobConfigs.containsKey(shortName)) {
            // try a rescan if not already present
            findJobConfigs();
        }
        return jobConfigs.get(shortName);
    }

    public File getJobsDir() {
        return jobsDir;
    }

    public Map<String, Object> heapReportData() {
        Map<String, Object> map = new LinkedHashMap<String, Object>();
        map.put("usedBytes", Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory());
        map.put("totalBytes", Runtime.getRuntime().totalMemory());
        map.put("maxBytes", Runtime.getRuntime().maxMemory());
        return map;
    }

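    /**
     * One-line heap summary; for example (illustrative values):
     * {@code 51200 KiB used; 131072 KiB current heap; 524288 KiB max heap}
     */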
    public String heapReport() {
        long totalMemory = Runtime.getRuntime().totalMemory();
        long freeMemory = Runtime.getRuntime().freeMemory();
        long maxMemory = Runtime.getRuntime().maxMemory();
        StringBuilder sb = new StringBuilder(64);
        sb.append((totalMemory - freeMemory) / 1024).append(" KiB used; ").append(totalMemory / 1024)
                .append(" KiB current heap; ").append(maxMemory / 1024).append(" KiB max heap");
        return sb.toString();
    }

    public void shutdown() {
        // TODO stop everything
        for (CrawlJob job : jobConfigs.values()) {
            if (job.isRunning()) {
                job.terminate();
            }
        }
        waitForNoRunningJobs(0);
    }

    /**
 * Wait for all jobs to be in a non-running state, or until the timeout
 * (given in ms) elapses. Use '0' for no timeout (wait as long as
 * necessary).
     * 
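 * <pre>{@code
 * // hypothetical: give jobs 30s to finish terminating
 * if (engine.waitForNoRunningJobs(30000)) {
 *     // timeout elapsed; a job may still be running
 * }
 * }</pre>
 * 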
     * @param timeout
     * @return true if timeout occurred and a job is (possibly) still running
     */
    public boolean waitForNoRunningJobs(long timeout) {
        long startTime = System.currentTimeMillis();
        // wait for all jobs to not be running
        outer: while (true) {
            // timeout elapsed with a job (possibly) still running?
            if (timeout > 0 && (startTime + timeout) < System.currentTimeMillis()) {
                return true;
            }
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                break;
            }
            for (CrawlJob job : jobConfigs.values()) {
                if (job.isRunning()) {
                    continue outer;
                }
            }
            break;
        }
        try {
            // wait an extra second for good measure
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            // ignore
        }
        return false;
    }

    /**
     * @return InputStream resource from defined profile CXML path
     */
    protected InputStream getProfileCxmlResource() {
        return getClass().getResourceAsStream(profileCxmlPath);
    }

    /**
 * Create a new job directory and copy the profile CXML into it as a
 * non-profile CXML ('crawler-beans.cxml').
 * @param newJobDir new job directory
 * @return true if the job was created, false on any IO failure
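 * 
 * <pre>{@code
 * // hypothetical: create a job folder under the engine's jobsDir
 * engine.createNewJobWithDefaults(new File(engine.getJobsDir(), "newjob"));
 * }</pre>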
     */
    public boolean createNewJobWithDefaults(File newJobDir) {
        try {
            // get crawler-beans template into string
            InputStream inStream = getProfileCxmlResource();
            String defaultCxmlStr = IOUtils.toString(inStream);
            inStream.close();

            // write default crawler-beans string to new job dir
            org.archive.util.FileUtils.ensureWriteableDirectory(newJobDir);
            File newJobCxml = new File(newJobDir, "crawler-beans.cxml");
            FileUtils.writeStringToFile(newJobCxml, defaultCxmlStr);

            return true;

        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "failed to create new job: " + newJobDir.getAbsolutePath(), e);
            return false;
        }
    }

    /**
     * Writes a .jobpath file for the new CrawlJob, whose directory is
     * outside the main Engine jobs directory. 
     * 
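 * For example (hypothetical), a job whose directory is
 * {@code /data/external-jobs/myjob} yields a {@code myjob.jobpath} file
 * containing that absolute path:
 * <pre>
 * /data/external-jobs/myjob
 * </pre>
 * 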
     * @param job CrawlJob whose main directory the .jobpath should point to
     * @throws IOException for any IO error
     */
    public void writeJobPathFile(CrawlJob job) throws IOException {
        String jobpathFileName = job.getShortName() + ".jobpath";
        File jobpathFile = new File(jobsDir, jobpathFileName);
        FileUtils.writeStringToFile(jobpathFile, job.getJobDir().getAbsolutePath() + "\n");
        LOGGER.log(Level.INFO, "wrote jobpath file: " + jobpathFileName);
    }
}
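
Example

A minimal end-to-end sketch of driving this Engine (the jobs directory path and job name here are hypothetical; the CrawlJob methods are used as they appear in the source above):

import java.io.File;

import org.archive.crawler.framework.CrawlJob;
import org.archive.crawler.framework.Engine;

public class EngineDemo {
    public static void main(String[] args) {
        // hypothetical jobs directory; the Engine creates it if absent
        Engine engine = new Engine(new File("/var/heritrix/jobs"));

        // create a job from the bundled profile template
        File newJobDir = new File(engine.getJobsDir(), "demo-job");
        if (engine.createNewJobWithDefaults(newJobDir)) {
            // getJob() rescans the jobs directory if the name is unknown
            CrawlJob job = engine.getJob("demo-job");
            if (job != null) {
                job.launch();
            }
        }

        System.out.println(engine.heapReport());

        // terminate any running jobs and wait for them to stop
        engine.shutdown();
    }
}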