org.archive.crawler.settings.XMLSettingsHandler.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.crawler.settings.XMLSettingsHandler.java

Source

/* XMLSettingsHandler
 *
 * $Id: XMLSettingsHandler.java 6325 2009-05-28 01:24:48Z gojomo $
 *
 * Created on Dec 18, 2003
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.settings;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanAttributeInfo;
import javax.management.MBeanException;
import javax.management.MBeanInfo;
import javax.management.ReflectionException;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.io.IOUtils;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;

/** A SettingsHandler which uses XML files as persistent storage.
 *
 * @author John Erik Halse
 */
public class XMLSettingsHandler extends SettingsHandler {
    private static Logger logger = Logger.getLogger("org.archive.crawler.settings.XMLSettingsHandler");

    // XML element name constants
    protected static final String XML_SCHEMA = "heritrix_settings.xsd";
    protected static final String XML_ROOT_ORDER = "crawl-order";
    protected static final String XML_ROOT_HOST_SETTINGS = "crawl-settings";
    protected static final String XML_ROOT_REFINEMENT = "crawl-refinement";
    protected static final String XML_ELEMENT_CONTROLLER = "controller";
    protected static final String XML_ELEMENT_META = "meta";
    protected static final String XML_ELEMENT_NAME = "name";
    protected static final String XML_ELEMENT_DESCRIPTION = "description";
    protected static final String XML_ELEMENT_OPERATOR = "operator";
    protected static final String XML_ELEMENT_ORGANIZATION = "organization";
    protected static final String XML_ELEMENT_AUDIENCE = "audience";
    protected static final String XML_ELEMENT_DATE = "date";
    protected static final String XML_ELEMENT_REFINEMENTLIST = "refinement-list";
    protected static final String XML_ELEMENT_REFINEMENT = "refinement";
    protected static final String XML_ELEMENT_REFERENCE = "reference";
    protected static final String XML_ELEMENT_LIMITS = "limits";
    protected static final String XML_ELEMENT_TIMESPAN = "timespan";
    protected static final String XML_ELEMENT_PORTNUMBER = "portnumber";
    protected static final String XML_ELEMENT_URIMATCHES = "uri-matches";
    protected static final String XML_ELEMENT_CONTENTMATCHES = "content-type-matches";
    protected static final String XML_ELEMENT_OBJECT = "object";
    protected static final String XML_ELEMENT_NEW_OBJECT = "newObject";
    protected static final String XML_ATTRIBUTE_NAME = "name";
    protected static final String XML_ATTRIBUTE_CLASS = "class";
    protected static final String XML_ATTRIBUTE_FROM = "from";
    protected static final String XML_ATTRIBUTE_TO = "to";

    private File orderFile;
    private final static String settingsFilename = "settings";
    private final static String settingsFilenameSuffix = "xml";
    private final static String REFINEMENT_DIR = "_refinements";

    /** Create a new XMLSettingsHandler object.
     *
     * @param orderFile where the order file is located.
     * @throws InvalidAttributeValueException
     */
    public XMLSettingsHandler(File orderFile) throws InvalidAttributeValueException {
        super();
        this.orderFile = orderFile.getAbsoluteFile();
    }

    /** Initialize the SettingsHandler.
     *
     * This method builds the settings data structure and initializes it with
     * settings from the order file given to the constructor.
     */
    public void initialize() {
        super.initialize();
    }

    /** 
     * Initialize the SettingsHandler from a source.
     *
     * This method builds the settings data structure and initializes it with
     * settings from the order file given as a parameter. The intended use is
     * to create a new order file based on a default (template) order file.
     *
     * @param source the order file to initialize from.
     */
    public void initialize(File source) {
        File tmpOrderFile = orderFile;
        orderFile = source.getAbsoluteFile();
        this.initialize();
        orderFile = tmpOrderFile;
    }

    private File getSettingsDirectory() {
        String settingsDirectoryName = null;
        try {
            settingsDirectoryName = (String) getOrder().getAttribute(CrawlOrder.ATTR_SETTINGS_DIRECTORY);
        } catch (AttributeNotFoundException e) {
            e.printStackTrace();
        } catch (MBeanException e) {
            e.printStackTrace();
        } catch (ReflectionException e) {
            e.printStackTrace();
        }

        return getPathRelativeToWorkingDirectory(settingsDirectoryName);
    }

    /** Resolves the filename for a settings object into a file path.
     *
     * It will also create the directory structure leading to this file
     * if it doesn't exist.
     *
     * @param settings the settings object to get file path for.
     * @return the file path for this settings object.
     */
    protected final File settingsToFilename(CrawlerSettings settings) {
        File file;

        if (settings.getScope() == null || settings.getScope().equals("")) {
            if (settings.isRefinement()) {
                file = new File(getSettingsDirectory(), File.separatorChar + REFINEMENT_DIR + File.separatorChar
                        + settings.getName() + '.' + settingsFilenameSuffix);
            } else {
                file = orderFile;
            }
        } else {
            String elements[] = settings.getScope().split("\\.");
            if (elements.length == 0) {
                return orderFile;
            }

            StringBuffer path = new StringBuffer();
            for (int i = elements.length - 1; i > 0; i--) {
                path.append(elements[i]);
                path.append(File.separatorChar);
            }
            path.append(elements[0]);

            if (settings.isRefinement()) {
                file = new File(getSettingsDirectory(), path.toString() + File.separatorChar + REFINEMENT_DIR
                        + File.separatorChar + settings.getName() + '.' + settingsFilenameSuffix);
            } else {
                file = new File(getSettingsDirectory(),
                        path.toString() + File.separatorChar + settingsFilename + "." + settingsFilenameSuffix);
            }
        }
        return file;
    }

    public final void writeSettingsObject(CrawlerSettings settings) {
        File filename = settingsToFilename(settings);
        writeSettingsObject(settings, filename);
    }

    /** Write a CrawlerSettings object to a specified file.
     *
     * This method is similar to {@link #writeSettingsObject(CrawlerSettings)}
     * except that it uses the submitted File object instead of trying to
     * resolve where the file should be written.
     *
     * @param settings the settings object to be serialized.
     * @param filename the file to which the settings object should be written.
     */
    public final void writeSettingsObject(CrawlerSettings settings, File filename) {

        logger.fine("Writing " + filename.getAbsolutePath());
        filename.getParentFile().mkdirs();

        FileOutputStream fos = null;
        try {
            long lastSaved = 0L;
            File backup = null;
            if (getOrder().getController() != null && filename.exists()) {
                // The crawler is running and file exists - make backup first.
                String name = filename.getName();
                lastSaved = settings.getLastSavedTime().getTime();
                name = name.substring(0, name.lastIndexOf('.')) + '_' + ArchiveUtils.get14DigitDate(lastSaved) + "."
                        + settingsFilenameSuffix;
                backup = new File(filename.getParentFile(), name);
                FileUtils.copyFiles(filename, backup);
            }

            fos = new FileOutputStream(filename);
            StreamResult result = new StreamResult(new BufferedOutputStream(fos));
            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            Source source = new CrawlSettingsSAXSource(settings);
            transformer.transform(source, result);

            // Hack to get rid of unnesessary backupfiles.
            // What happens is that the WUI often saves settings files
            // several times during a settings change. This code removes the
            // last backup file if its no more than 2 minutes old.
            if (lastSaved > (System.currentTimeMillis() - 2 * 60 * 1000)) {
                backup.delete();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            IOUtils.closeQuietly(fos);
        }
    }

    /** Read the CrawlerSettings object from a specific file.
     *
     * @param settings the settings object to be updated with data from the
     *                 persistent storage.
     * @param f the file to read from.
     * @return the updated settings object or null if there was no data for this
     *         in the persistent storage.
     */
    protected final CrawlerSettings readSettingsObject(CrawlerSettings settings, File f) {
        CrawlerSettings result = null;
        try {
            InputStream is = null;
            if (!f.exists()) {
                // Perhaps the file we're looking for is on the CLASSPATH.
                // DON'T look on the CLASSPATH for 'settings.xml' files.  The
                // look for 'settings.xml' files happens frequently. Not looking
                // on classpath for 'settings.xml' is an optimization based on
                // ASSUMPTION that there will never be a 'settings.xml' saved
                // on classpath.
                if (!f.getName().startsWith(settingsFilename)) {
                    is = XMLSettingsHandler.class.getResourceAsStream(toResourcePath(f));
                }
            } else {
                is = new FileInputStream(f);
            }
            if (is != null) {
                XMLReader parser = SAXParserFactory.newInstance().newSAXParser().getXMLReader();
                InputStream file = new BufferedInputStream(is);
                parser.setContentHandler(new CrawlSettingsSAXHandler(settings));
                InputSource source = new InputSource(file);
                source.setSystemId(f.toURL().toExternalForm());
                parser.parse(source);
                result = settings;
            }
        } catch (SAXParseException e) {
            logger.warning(e.getMessage() + " in '" + e.getSystemId() + "', line: " + e.getLineNumber()
                    + ", column: " + e.getColumnNumber());
        } catch (SAXException e) {
            logger.warning(e.getMessage() + ": " + e.getException().getMessage());
        } catch (ParserConfigurationException e) {
            logger.warning(e.getMessage() + ": " + e.getCause().getMessage());
        } catch (FactoryConfigurationError e) {
            logger.warning(e.getMessage() + ": " + e.getException().getMessage());
        } catch (IOException e) {
            logger.warning("Could not access file '" + f.getAbsolutePath() + "': " + e.getMessage());
        }
        return result;
    }

    /**
     * Convert a File to a path that might be resolved from classpath/JAR
     * resource sources. Such paths use linux-like path-separators. 
     * 
     * @param f File 
     * @return path, shorn of any Windows-specific drive identifiers
     */
    public static String toResourcePath(File f) {
        String path = f.toURI().getPath();
        if (path.matches("^/[A-Z]:/.*")) {
            // remove Windows drive-prefix, if any
            path = path.substring(3);
        }
        return path;
    }

    protected final CrawlerSettings readSettingsObject(CrawlerSettings settings) {
        File filename = settingsToFilename(settings);
        return readSettingsObject(settings, filename);
    }

    /** Get the <code>File</code> object pointing to the order file.
     *
     * @return File object for the order file.
     */
    public File getOrderFile() {
        return orderFile;
    }

    /** Creates a replica of the settings file structure in another directory
     * (fully recursive, includes all per host settings). The SettingsHandler
     * will then refer to the new files.
     *
     * Observe that this method should only be called after the SettingsHandler
     * has been initialized.
     *
     * @param newOrderFileName where the new order file should be saved.
     * @param newSettingsDirectory the top level directory of the per host/domain
     *                          settings files.
     * @throws IOException
     */
    public void copySettings(File newOrderFileName, String newSettingsDirectory) throws IOException {
        File oldSettingsDirectory = getSettingsDirectory();

        // Write new orderfile and point the settingshandler to it
        orderFile = newOrderFileName;
        try {
            getOrder().setAttribute(new Attribute(CrawlOrder.ATTR_SETTINGS_DIRECTORY, newSettingsDirectory));
        } catch (Exception e) {
            throw new IOException("Could not update settings with new location: " + e.getMessage());
        }
        writeSettingsObject(getSettingsObject(null));

        File newDir = getPathRelativeToWorkingDirectory(newSettingsDirectory);

        // Copy the per host files if src and dest directories are different.
        if (oldSettingsDirectory.compareTo(newDir) != 0) {
            FileUtils.copyFiles(oldSettingsDirectory, newDir);
        }
    }

    /**
     * Transforms a relative path so that it is relative to the location of the
     * order file. If an absolute path is given, it will be returned unchanged.<p>
     * The location of it's order file is always considered as the 'working'
     * directory for any given settings.
     * @param path A relative path to a file (or directory)
     * @return The same path modified so that it is relative to the file level
     *         location of the order file for the settings handler.
     */
    public File getPathRelativeToWorkingDirectory(String path) {
        File f = new File(path);
        // If path is not absolute, set f's directory
        // relative to the path of the order file
        if (!f.isAbsolute()) {
            f = new File(this.getOrderFile().getParent(), path);
        }
        return f;
    }

    public Collection getDomainOverrides(String rootDomain) {
        File settingsDir = getSettingsDirectory();

        //Find the right start directory.
        ArrayList<String> domains = new ArrayList<String>();
        //First we deconstruct the rootDomain string
        while (rootDomain != null && rootDomain.length() > 0) {
            if (rootDomain.indexOf('.') < 0) {
                // Last level.
                domains.add(rootDomain);
                break; //We're done.
            } else {
                // Got more then one level left.
                domains.add(rootDomain.substring(0, rootDomain.indexOf('.')));
                // Strip down rootDomain.
                rootDomain = rootDomain.substring(rootDomain.indexOf('.') + 1);
            }
        }
        //Build up a proper path
        //Since the domains are right to left, we start at the end of the array.
        StringBuffer subDir = new StringBuffer();
        for (int i = (domains.size() - 1); i >= 0; i--) {
            subDir.append(File.separator + domains.get(i));
        }
        //Then we move to the approprite directory.
        settingsDir = new File(settingsDir.getPath() + subDir);
        TreeSet<String> confirmedSubDomains = new TreeSet<String>();
        if (settingsDir.exists()) {
            // Found our place! Search through it's subdirs.
            File[] possibleSubDomains = settingsDir.listFiles();
            for (int i = 0; i < possibleSubDomains.length; i++) {
                if (possibleSubDomains[i].isDirectory() && isOverride(possibleSubDomains[i])) {
                    // Found one!
                    confirmedSubDomains.add(possibleSubDomains[i].getName());
                }
            }
        }
        return confirmedSubDomains;
    }

    /**
     * Checks if a file is a a 'per host' override or if it's a directory if it
     * or it's subdirectories  contains a 'per host' override file.
     * @param f The file or directory to check
     * @return True if the file is an override or it's a directory that contains
     *         such a file.
     */
    private boolean isOverride(File f) {
        if (f.isDirectory()) {
            // Have a directory, check it's contents.
            File[] subs = f.listFiles();
            for (int i = 0; i < subs.length; i++) {
                if (isOverride(subs[i])) {
                    // Found one. Can stop looking.
                    return true;
                }
            }
        } else if (f.getName().equals(settingsFilename + "." + settingsFilenameSuffix)) {
            // This is an override file (or sure looks like one in any case).
            return true;
        }
        // Didn't find an override.
        return false;
    }

    /** Delete a settings object from persistent storage.
     *
     * Deletes the file represented by the submitted settings object. All empty
     * directories that are parents to the files path are also deleted.
     *
     * @param settings the settings object to delete.
     */
    public void deleteSettingsObject(CrawlerSettings settings) {
        super.deleteSettingsObject(settings);
        File settingsDirectory = getSettingsDirectory();
        File settingsFile = settingsToFilename(settings);

        if (!settingsFile.delete()) {
            throw new RuntimeException("Could not delete: " + settingsFile);
        }
        settingsFile = settingsFile.getParentFile();
        while (settingsFile.isDirectory() && settingsFile.list().length == 0
                && !settingsFile.equals(settingsDirectory)) {
            if (!settingsFile.delete()) {
                logger.warning("Could not delete: " + settingsFile);
            }
            settingsFile = settingsFile.getParentFile();
        }
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.settings.SettingsHandler#getListOfAllFiles()
     */
    public List<String> getListOfAllFiles() {
        ArrayList<String> list = new ArrayList<String>();
        // Add CrawlOrder.
        list.add(getOrderFile().getAbsolutePath());
        // Iterate through the entire override hierarchy
        if (getSettingsDirectory().exists()) {
            recursiveFindFiles(getSettingsDirectory(), list);
        }
        // Get files used by settings modules.
        recursiveFindSecondaryFiles(getOrder(), list);
        return list;
    }

    /**
     * Add any files being used by any of the Modules making up the settings to
     * the list.
     *
     * @param mbean A ModuleType to interrogate for files. Any child modules
     *           will be recursively interrogated.
     * @param list The list to add found files to.
     */
    private void recursiveFindSecondaryFiles(ComplexType mbean, ArrayList<String> list) {
        MBeanInfo info = mbean.getMBeanInfo();
        MBeanAttributeInfo[] a = info.getAttributes();
        // Interrogate the current module
        if (mbean instanceof ModuleType) {
            ((ModuleType) mbean).listUsedFiles(list);
        }

        // Recursively interrogate all sub modules that are of ModuleType
        for (int n = 0; n < a.length; n++) {
            if (a[n] == null) {
                // Error null attribute.
            } else {
                ModuleAttributeInfo att = (ModuleAttributeInfo) a[n];
                Object currentAttribute;
                try {
                    currentAttribute = mbean.getAttribute(att.getName());
                    if (currentAttribute instanceof ComplexType) {
                        recursiveFindSecondaryFiles((ComplexType) currentAttribute, list);
                    }
                } catch (AttributeNotFoundException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                } catch (MBeanException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                } catch (ReflectionException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Starting at the specific directory this method will iterate through all
     * sub directories and add each file (as absolute name, with path as a
     * string) to the provided ArrayList. Any file found under the settings
     * directory with the proper suffix will be considered valid and added to
     * the list.
     * @param dir Starting directory
     * @param list The list to add to
     */
    private void recursiveFindFiles(File dir, ArrayList<String> list) {
        File[] subs = dir.listFiles();
        if (subs != null) {
            for (int i = 0; i < subs.length; i++) {
                if (subs[i].isDirectory()) {
                    recursiveFindFiles(subs[i], list);
                } else {
                    if (subs[i].getName().endsWith(settingsFilenameSuffix)) {
                        // Add it to list
                        list.add(subs[i].getAbsolutePath());
                    }
                }
            }
        }
    }
}