org.archive.crawler.migrate.MigrateH1to3Tool.java Source code

Introduction

Here is the source code for org.archive.crawler.migrate.MigrateH1to3Tool.java, a command-line utility from the Heritrix web crawler (crawler.archive.org) that converts a Heritrix 1.x order.xml into a basic Heritrix 3.x job directory.
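
The tool is a standalone main() class, so it can be run directly from the
command line. A minimal sketch of an invocation (the classpath and the paths
shown here are assumptions; adjust them for your Heritrix 3 installation):

    java -cp 'heritrix-3/lib/*' org.archive.crawler.migrate.MigrateH1to3Tool \
        /crawls/myjob/order.xml /crawls/myjob-h3

The first argument is an existing H1 order.xml file; the second is the
directory where the new H3 job (a crawler-beans.cxml plus a copied seeds.txt)
will be created.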

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.crawler.migrate;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Utility class which takes an H1 order.xml and creates a similar
 * H3 job directory, with as many simple settings converted over
 * (as top-of-crawler-beans overrides) as possible at this time.
 * 
 * (Future versions will handle more complicated H1 settings
 * customizations, such as per-host overrides or choices of 
 * alternate implementing classes for Scope, Processors, etc.)
 * 
 * @contributor siznax
 * @contributor gojomo
 */
public class MigrateH1to3Tool {

    protected Document sourceOrderXmlDom;

    protected static DocumentBuilder DOCUMENT_BUILDER;

    static {
        try {
            DOCUMENT_BUILDER = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            // fail fast rather than leaving DOCUMENT_BUILDER null and
            // deferring a NullPointerException to first use
            throw new RuntimeException(e);
        }
    }

    public static void main(String[] args) throws Exception {
        new MigrateH1to3Tool().instanceMain(args);
    }

    public void instanceMain(String[] args) throws Exception {

        if (args.length != 2) {
            printHelp();
            return;
        }

        String sourceOrderXmlFileArg = args[0];
        String destinationH3JobDirArg = args[1];

        File sourceOrderXmlFile = new File(sourceOrderXmlFileArg);
        if (!sourceOrderXmlFile.isFile()) {
            System.err.println("ERROR sourceOrderXmlFileArg is not a file: " + sourceOrderXmlFileArg);
            System.exit(1);
        }
        File destinationH3JobDir = new File(destinationH3JobDirArg);

        org.archive.util.FileUtils.ensureWriteableDirectory(destinationH3JobDir);

        System.out.println("H1 source: " + sourceOrderXmlFile.getAbsolutePath());
        System.out.println("H3 destination: " + destinationH3JobDir.getAbsolutePath());

        System.out.print("Migrating settings...");

        InputStream inStream = getClass()
                .getResourceAsStream("/org/archive/crawler/migrate/migrate-template-crawler-beans.cxml");
        String template = IOUtils.toString(inStream, "UTF-8");
        inStream.close();

        Map<String, String> migrateH1toH3Map = getMigrateMap();

        try {
            sourceOrderXmlDom = DOCUMENT_BUILDER.parse(sourceOrderXmlFile);
        } catch (SAXException e) {
            System.err.println("ERROR caught exception parsing input file: " + e.getMessage() + "\n");
            e.printStackTrace();
        }

        Map<String, String> h1simpleSettings = flattenH1Order(sourceOrderXmlDom);

        List<String> notApplicable = new ArrayList<String>();
        List<String> needsAttention = new ArrayList<String>();
        int migrated = 0;
        StringBuilder sb = new StringBuilder();
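        // Each value in the migrate map is either a direct H3 bean path, or
        // carries a one-character prefix signaling special handling:
        //   $  no H3 equivalent (reported below as not-applicable)
        //   *  needs custom translation (currently only the user-agent template)
        //   ^  value is uppercased to match H3's enum-style constants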
        for (String key : h1simpleSettings.keySet()) {
            String beanPath = migrateH1toH3Map.get(key);
            String value = h1simpleSettings.get(key);
            System.out.print(".");
            if (beanPath == null) {
                // no equivalence rule
                needsAttention.add(key + " " + value);
                continue;
            }
            if (beanPath.startsWith("$")) {
                // rule indicates not-available/not-applicable
                notApplicable.add(key + " " + value);
                continue;
            }
            if (beanPath.startsWith("*")) {
                // TODO: needs special handling
                if (beanPath.equals("*metadata.userAgentTemplate")) {
                    splitH1userAgent(value, sb);
                    migrated += 2;
                } else {
                    needsAttention.add(key + " " + value);
                }
                continue;
            }
            if (beanPath.startsWith("^")) {
                // uppercase to new enum-style
                value = value.toUpperCase();
                beanPath = beanPath.substring(1);
            }
            sb.append(beanPath).append("=").append(value).append("\n");
            migrated++;
        }

        System.out.println();
        System.out.println();

        // patch all overrides derived from H1 into H3 template
        String beansCxml = template.replace("###MIGRATE_OVERRIDES###", sb.toString());

        File targetBeansXmlFile = new File(destinationH3JobDir, "crawler-beans.cxml");
        FileUtils.writeStringToFile(targetBeansXmlFile, beansCxml, "UTF-8");

        File sourceSeedsTxtFile = new File(sourceOrderXmlFile.getParentFile(), "seeds.txt");
        File destinationSeedsTxtFile = new File(destinationH3JobDir, "seeds.txt");

        if (!sourceSeedsTxtFile.isFile()) {
            System.err.println("ERROR sourceSeedsTxtFile not found: " + sourceSeedsTxtFile);
            System.exit(1);
        }

        FileUtils.copyFile(sourceSeedsTxtFile, destinationSeedsTxtFile);

        System.out.println(notApplicable.size() + " settings skipped as not-applicable");
        System.out.println("These are probably harmless, but if the following settings were");
        System.out.println("important to your crawl process, investigate other options.");
        listProblems(notApplicable);
        System.out.println();
        System.out.println(needsAttention.size() + " settings may need attention");
        System.out.println("Please review your original crawl and the created H3 job, for each");
        System.out.println("of the following, and manually update as needed.");
        listProblems(needsAttention);
        System.out.println();
        System.out.println(migrated + " H1 settings successfully migrated to H3 configuration");
        System.out.println();
        System.out.println("Review your converted crawler-beans.cxml at:");
        System.out.println(targetBeansXmlFile.getAbsolutePath());

    }

    protected void listProblems(List<String> problems) {
        for (String problem : problems) {
            System.out.println(" " + problem);
        }
    }

    protected void printHelp() {
        System.out.println("Usage: takes two arguments. The first is the path to a "
                + "Heritrix 1.X order.xml; the second is the path for a new "
                + "Heritrix 3.X job directory. Generates a basic H3 job "
                + "with as many of the H1 settings replicated as currently possible.");
    }

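    /**
     * Split an H1 user-agent value into two H3 overrides: the operator
     * contact URL, and a user-agent template with that URL replaced by the
     * placeholder @OPERATOR_CONTACT_URL@. Illustrative example (values
     * hypothetical): the H1 value
     *
     *   Mozilla/5.0 (compatible; heritrix/1.14.4 +http://example.com/contact)
     *
     * becomes the two override lines
     *
     *   metadata.operatorContactUrl=http://example.com/contact
     *   metadata.userAgentTemplate=Mozilla/5.0 (compatible; heritrix/1.14.4 +@OPERATOR_CONTACT_URL@)
     */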
    protected void splitH1userAgent(String userAgent, StringBuilder sb) {
        String originalUrl = userAgent.replaceAll("^.*?\\+(http://[^)]*).*$", "$1");
        String newTemplate = userAgent.replace(originalUrl, "@OPERATOR_CONTACT_URL@");
        // TODO: catch, change outdated version info? 
        sb.append("metadata.operatorContactUrl=").append(originalUrl).append("\n")
                .append("metadata.userAgentTemplate=").append(newTemplate).append("\n");
    }

    protected Map<String, String> getMigrateMap() throws IOException {
        Map<String, String> map = new HashMap<String, String>();
        InputStream inStream = getClass().getResourceAsStream("/org/archive/crawler/migrate/H1toH3.map");
        LineIterator iter = IOUtils.lineIterator(inStream, "UTF-8");
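        // Each line of the map file is "<h3-bean-path>|<h1-pseudo-xpath>";
        // the map is keyed by the H1 path (field 1), yielding the H3 bean
        // path (field 0).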
        while (iter.hasNext()) {
            String[] fields = iter.nextLine().split("\\|");
            if (fields.length < 2) {
                continue; // tolerate blank or malformed lines in the map file
            }
            map.put(fields[1], fields[0]);
        }
        inStream.close();
        return map;
    }

    /**
     * Given a Document, return a Map of all non-blank simple text 
     * nodes, keyed by the pseudo-XPath to their parent element. 
     * 
     * @param h1order Document to extract the Map from
     * @return Map of XPath-like String keys to their non-blank text content
     * @throws XPathExpressionException
     */
    public static Map<String, String> flattenH1Order(Document h1order) throws XPathExpressionException {
        Map<String, String> flattened = new LinkedHashMap<String, String>();
        XPathExpression xpath = XPathFactory.newInstance().newXPath().compile("//text()");
        NodeList nodes = (NodeList) xpath.evaluate(h1order, XPathConstants.NODESET);
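        // Illustrative example (element and value hypothetical): an H1 setting
        //   <crawl-order><controller><string name="max-toe-threads">50</string>
        // flattens to the entry
        //   //controller/*[@name="max-toe-threads"] -> 50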
        for (int i = 0; i < nodes.getLength(); i++) {
            Node node = nodes.item(i);
            if (StringUtils.isNotBlank(node.getTextContent())) {
                String pseudoXPath = getPseudoXpath(node.getParentNode());
                pseudoXPath = pseudoXPath.replaceFirst("/crawl-order", "/");
                flattened.put(pseudoXPath, node.getTextContent());
            }
        }

        return flattened;
    }

    /**
     * Given a node, give back an XPath-like string that addresses it. 
     * (For our constrained order.xml files, it is a valid and unique
     * XPath, but the simple approach used here might not generate 
     * unique XPaths on all XML.)
     * 
     * @param node node to get pseudo-XPath for
     * @return String pseudo-XPath
     */
    protected static String getPseudoXpath(Node node) {
        String pseudoXpath = "";
        Node currentNode = node;
        while (currentNode.getParentNode() != null) {
            String thisSegment = currentNode.getNodeName();
            if (currentNode.getAttributes().getNamedItem("name") != null) {
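                // concatenating the Attr node relies on its toString()
                // rendering as name="value" (true of the JDK's default DOM)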
                thisSegment = "*[@" + currentNode.getAttributes().getNamedItem("name") + "]";
            }
            pseudoXpath = "/" + thisSegment + pseudoXpath;
            currentNode = currentNode.getParentNode();
        }
        return pseudoXpath;
    }
}
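
As a sketch of what the tool writes (values hypothetical), the overrides
patched into crawler-beans.cxml in place of the ###MIGRATE_OVERRIDES### token
are simple key=value lines, one per migrated setting, for example:

    metadata.operatorContactUrl=http://example.com/contact
    metadata.userAgentTemplate=Mozilla/5.0 (compatible; heritrix +@OPERATOR_CONTACT_URL@)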