org.archive.crawler.framework.ActionDirectory.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.crawler.framework.ActionDirectory.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.crawler.framework;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.script.Bindings;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.lang.StringUtils;
import org.archive.modules.seeds.SeedModule;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;
import org.archive.util.FilesystemLinkMaker;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.context.Lifecycle;

/**
 * Directory watched for new files. Depending on their extension, will
 * process with regard to current crawl, and rename with a datestamp 
 * into the 'done' directory. 
 * 
 * Currently supports:
 *  - .seeds(.gz)
 *      add each URI found in file as a new seed (to be crawled
 *      if not already; to affect scope if appropriate).
 *  - (.s).recover(.gz)
 *      treat as traditional recovery log: consider all 'Fs'-tagged lines 
 *      included, then try-rescheduling all 'F+'-tagged lines. (If ".s." 
 *      present, try scoping URIs before including/scheduling.)
 *  - (.s).include(.gz) 
 *      add each URI found in a recover-log like file (regardless of its
 *      tagging) to the frontier's alreadyIncluded filter, preventing them
 *      from being recrawled. ('.s.' indicates to apply scoping.)
 *  - (.s).schedule(.gz)
 *      add each URI found in a recover-log like file (regardless of its
 *      tagging) to the frontier's queues. ('.s.' indicates to apply 
 *      scoping.)
 *  - (.s).force(.gz)
 *      as .schedule, but with the frontier import's 'force' flag set
 *      (presumably schedules even already-included URIs; confirm against
 *      Frontier#importRecoverFormat). ('.s.' indicates to apply scoping.)
 *  - any other extension for which a javax.script engine is registered
 *      (eg .js .rb .bsh): execute the file as a script; see
 *      {@link #tryAsScript(File, String)}.
 *      
 * Future support planned:
 *  - .robots: invalidate robots ASAP
 *  - (?) .block: block-all on named site(s)
 *  -  .overlay: add new overlay settings
 * 
 * @contributor gojomo
 */
public class ActionDirectory implements ApplicationContextAware, Lifecycle, Runnable {
    private static final Logger LOGGER = Logger.getLogger(ActionDirectory.class.getName());

    /**
     * Upper bound on attempts to move a processed action file into the
     * 'done' directory, so a persistent failure cannot wedge the scan thread.
     */
    private static final int MAX_MOVE_ATTEMPTS = 3;

    /** single-threaded scheduler running the periodic scans; null until {@link #start()} */
    protected ScheduledExecutorService executor;

    /** how long after crawl start to first scan action directory */
    protected int initialDelaySeconds = 10;

    public int getInitialDelaySeconds() {
        return initialDelaySeconds;
    }

    public void setInitialDelaySeconds(int initialDelay) {
        this.initialDelaySeconds = initialDelay;
    }

    /** delay between scans of actionDirectory for new files */
    protected int delaySeconds = 30;

    public int getDelaySeconds() {
        return delaySeconds;
    }

    public void setDelaySeconds(int delay) {
        this.delaySeconds = delay;
    }

    /** directory scanned for incoming action files */
    protected ConfigPath actionDir = new ConfigPath("ActionDirectory source directory", "action");

    public ConfigPath getActionDir() {
        return actionDir;
    }

    public void setActionDir(ConfigPath actionDir) {
        this.actionDir = actionDir;
    }

    /** directory into which processed action files are moved */
    protected ConfigPath doneDir = new ConfigPath("ActionDirectory done directory", "${launchId}/actions-done");

    public ConfigPath getDoneDir() {
        return doneDir;
    }

    public void setDoneDir(ConfigPath doneDir) {
        this.doneDir = doneDir;
    }

    /** application context, exposed to action scripts as 'appCtx' */
    protected ApplicationContext appCtx;

    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        this.appCtx = applicationContext;
    }

    /** autowired seed module receiving '.seeds' action files */
    protected SeedModule seeds;

    public SeedModule getSeeds() {
        return this.seeds;
    }

    @Autowired
    public void setSeeds(SeedModule seeds) {
        this.seeds = seeds;
    }

    /** autowired frontier for actions */
    protected Frontier frontier;

    public Frontier getFrontier() {
        return this.frontier;
    }

    @Autowired
    public void setFrontier(Frontier frontier) {
        this.frontier = frontier;
    }

    /**
     * @return true if the scan executor has been created and not yet shut down
     */
    public boolean isRunning() {
        return executor != null && !executor.isShutdown();
    }

    /**
     * Ensure the action and done directories exist and are writeable, then
     * begin periodic scanning on a background thread. Does nothing if 
     * already running.
     * 
     * @throws IllegalStateException if either directory cannot be prepared
     */
    public void start() {
        if (isRunning()) {
            return;
        }
        try {
            // create directories
            org.archive.util.FileUtils.ensureWriteableDirectory(getActionDir().getFile());
            org.archive.util.FileUtils.ensureWriteableDirectory(getDoneDir().getFile());
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
        // start background executor
        executor = Executors.newSingleThreadScheduledExecutor();
        executor.scheduleWithFixedDelay(this, getInitialDelaySeconds(), getDelaySeconds(), TimeUnit.SECONDS);
    }

    /**
     * Shut down the scan executor and wait for any in-progress scan to 
     * finish. Does nothing if {@link #start()} was never called. If the 
     * waiting thread is interrupted, waiting ends and the thread's 
     * interrupt status is restored.
     */
    public void stop() {
        if (executor == null) {
            // never started; nothing to shut down
            return;
        }
        executor.shutdown();
        try {
            while (!executor.awaitTermination(10, TimeUnit.SECONDS)) {
                // keep waiting for the current scan to complete
            }
        } catch (InterruptedException e) {
            // stop waiting, but let callers observe the interruption
            Thread.currentThread().interrupt();
        }
    }

    /** 
     * Action taken at scheduled intervals. Runtime exceptions are logged
     * rather than propagated, because an exception escaping a task run via
     * scheduleWithFixedDelay suppresses all subsequent executions.
     * 
     * @see java.lang.Runnable#run()
     */
    public void run() {
        try {
            scanActionDirectory();
        } catch (RuntimeException e) {
            LOGGER.log(Level.SEVERE, "action directory scan failed; will retry at next interval", e);
        }
    }

    /**
     * Find any new files in the 'action' directory; process each in
     * order. A failure on one file is logged and does not prevent the
     * remaining files from being processed.
     */
    protected void scanActionDirectory() {
        File dir = actionDir.getFile();
        File[] files = dir.listFiles((FileFilter) FileFilterUtils.fileFileFilter());
        if (files == null) {
            // listFiles returns null if dir is not a directory or an I/O error occurs
            LOGGER.warning("unable to list action directory: " + dir);
            return;
        }
        Arrays.sort(files);
        for (File f : files) {
            try {
                actOn(f);
            } catch (Throwable e) {
                LOGGER.log(Level.SEVERE, "unhandled exception from actifile: " + f, e);
            }
        }
    }

    /**
     * Process an individual action file found, dispatching on its name
     * (ignoring any trailing '.gz'), then move it to the 'done' directory.
     * 
     * @param actionFile File to process
     */
    protected void actOn(File actionFile) {
        LOGGER.info("processing action file: " + actionFile);
        String filename = actionFile.getName();
        boolean isGzip = filename.endsWith(".gz");
        String corename = isGzip ? filename.substring(0, filename.length() - 3) : filename;
        String timestamp = ArchiveUtils.get17DigitDate();

        if (corename.endsWith(".seeds")) {
            // import seeds
            getSeeds().actOn(actionFile);
        } else if (corename.endsWith(".recover")) {
            // apply recovery-log
            boolean alsoScope = corename.endsWith(".s.recover");
            // consider-included all successes and explicit-includes...
            if (importFromActionFile(actionFile, alsoScope, true, false, "F[si] ")) {
                // then retry all adds (skipped if the first pass failed)
                importFromActionFile(actionFile, alsoScope, false, false, "F\\+ ");
            }
        } else if (corename.endsWith(".include")) {
            // consider-included-only (do not schedule)
            importFromActionFile(actionFile, corename.endsWith(".s.include"), true, false, ".*");
        } else if (corename.endsWith(".schedule")) {
            // schedule to queues
            importFromActionFile(actionFile, corename.endsWith(".s.schedule"), false, false, ".*");
        } else if (corename.endsWith(".force")) {
            // schedule to queues, with the import's force flag set
            importFromActionFile(actionFile, corename.endsWith(".s.force"), false, true, ".*");
            //        } else if (filename.endsWith(".robots")) {
            //            // force refresh of robots
            //            // TODO
        } else if (!tryAsScript(actionFile, timestamp)) {
            LOGGER.warning("action file ignored: " + actionFile);
        }

        moveToDone(actionFile, timestamp);
    }

    /**
     * Hand a recover-log style file to the frontier, logging rather than
     * propagating any IOException. Arguments after the file are passed 
     * through unchanged to Frontier#importRecoverFormat.
     * 
     * @param actionFile file to import
     * @param alsoScope whether the file name carried the '.s.' scoping marker
     * @param includeOnly true to only consider URIs included (not schedule)
     * @param force true only for '.force' files
     * @param acceptTags tag pattern selecting which lines to import 
     * (appears to be a regex; confirm against Frontier)
     * @return true if the import completed without an IOException
     */
    private boolean importFromActionFile(File actionFile, boolean alsoScope, boolean includeOnly, boolean force,
            String acceptTags) {
        try {
            getFrontier().importRecoverFormat(actionFile, alsoScope, includeOnly, force, acceptTags);
            return true;
        } catch (IOException ioe) {
            LOGGER.log(Level.SEVERE, "problem with action file: " + actionFile, ioe);
            return false;
        }
    }

    /**
     * Move a processed action file to the 'done' directory with a 
     * timestamp prefix, then try to symlink to it from 'done' under the
     * action directory (unless that is the done directory itself). 
     * 
     * The move is attempted at most {@link #MAX_MOVE_ATTEMPTS} times; an 
     * unbounded retry would spin forever (and block {@link #stop()}) when 
     * the failure is persistent, such as an already-existing destination.
     * A file that cannot be moved stays in place and will be seen again 
     * by the next scan.
     * 
     * @param actionFile processed file to move; ignored if it no longer exists
     * @param timestamp prefix for the done file name
     */
    private void moveToDone(File actionFile, String timestamp) {
        File doneFile = new File(doneDir.getFile(), timestamp + "." + actionFile.getName());
        for (int attempt = 0; attempt < MAX_MOVE_ATTEMPTS && actionFile.exists(); attempt++) {
            try {
                FileUtils.moveFile(actionFile, doneFile);

                // attempt to symlink from action/done/ to done file
                File actionDoneDirFile = new File(actionDir.getFile(), "done");
                if (!actionDoneDirFile.equals(doneDir.getFile())) {
                    actionDoneDirFile.mkdirs();
                    File doneSymlinkFile = new File(actionDoneDirFile, doneFile.getName());
                    boolean success = FilesystemLinkMaker.makeSymbolicLink(doneFile.getPath(),
                            doneSymlinkFile.getPath());
                    if (!success) {
                        LOGGER.warning("failed to create symlink from " + doneSymlinkFile + " to " + doneFile);
                    }
                }
            } catch (IOException e) {
                LOGGER.log(Level.SEVERE, "unable to move " + actionFile, e);
            }
        }
        if (actionFile.exists()) {
            LOGGER.severe("giving up moving " + actionFile + " to " + doneFile + " after " + MAX_MOVE_ATTEMPTS
                    + " attempts; it will be processed again at next scan");
        }
    }

    /** shared ScriptEngineManager */
    protected static ScriptEngineManager MANAGER = new ScriptEngineManager();

    /**
     * Try the actionFile as a script, deducing the proper scripting
     * language from its file extension. Return true if evaluation was
     * tried with a known script engine. 
     * 
     * Provides 'appCtx' and 'rawOut' to script for accessing crawl
     * and outputting text. Any non-blank text written to 'rawOut' is 
     * saved to a '.out' file in the done directory, named to pair with
     * the done script file. If an exception occurs, its toString() is 
     * written to a similarly-named '.exception' file in the done directory. 
     * 
     * @param actionFile file to try
     * @param timestamp timestamp correlating out/exception files with done script
     * @return true if engine evaluation began (even if an error occurred);
     * false if the name has no extension or no engine handles it
     */
    protected boolean tryAsScript(File actionFile, String timestamp) {
        int i = actionFile.getName().lastIndexOf(".");
        if (i < 0) {
            return false;
        }

        // deduce language/engine from extension
        String extension = actionFile.getName().substring(i + 1);
        ScriptEngine engine = MANAGER.getEngineByExtension(extension);
        if (engine == null) {
            return false;
        }

        // prepare engine
        StringWriter rawString = new StringWriter();
        PrintWriter rawOut = new PrintWriter(rawString);
        Exception ex = null;
        Bindings bindings = new BeanLookupBindings(appCtx);
        bindings.put("rawOut", rawOut);
        bindings.put("appCtx", appCtx);

        // evaluate and record any exception
        try {
            String script = FileUtils.readFileToString(actionFile);
            engine.eval(script, bindings);
        } catch (IOException e) {
            ex = e;
        } catch (ScriptException e) {
            ex = e;
        } catch (RuntimeException e) {
            ex = e;
        } finally {
            // the script could create an object that persists and retains a reference to the Bindings
            bindings.put("rawOut", null);
            bindings.put("appCtx", null);
        }

        // report output/exception to files paired with script in done dir
        rawOut.flush();
        String allOut = rawString.toString();
        if (StringUtils.isNotBlank(allOut)) {
            File outFile = new File(doneDir.getFile(), timestamp + "." + actionFile.getName() + ".out");
            try {
                FileUtils.writeStringToFile(outFile, allOut);
            } catch (IOException ioe) {
                LOGGER.log(Level.SEVERE, "problem during action file: " + actionFile, ioe);
            }
        }
        if (ex != null) {
            File exFile = new File(doneDir.getFile(), timestamp + "." + actionFile.getName() + ".exception");
            try {
                FileUtils.writeStringToFile(exFile, ex.toString());
            } catch (IOException ioe) {
                LOGGER.log(Level.SEVERE, "problem during action file: " + actionFile, ioe);
            }
        }

        return true;
    }

}