org.onehippo.cms7.brokenlinks.BrokenLinksCheckingJob.java Source code

Introduction

Here is the source code for org.onehippo.cms7.brokenlinks.BrokenLinksCheckingJob.java
Source

/**
 * Copyright 2013-2013 Hippo B.V. (http://www.onehippo.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onehippo.cms7.brokenlinks;

import java.rmi.RemoteException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.jcr.ItemNotFoundException;
import javax.jcr.Node;
import javax.jcr.NodeIterator;
import javax.jcr.PathNotFoundException;
import javax.jcr.RepositoryException;
import javax.jcr.Session;
import javax.jcr.query.Query;
import javax.jcr.query.QueryResult;

import org.apache.commons.lang.StringUtils;
import org.hippoecm.repository.api.Document;
import org.hippoecm.repository.api.HippoNodeType;
import org.hippoecm.repository.api.HippoWorkspace;
import org.hippoecm.repository.api.Workflow;
import org.hippoecm.repository.api.WorkflowException;
import org.hippoecm.repository.api.WorkflowManager;
import org.onehippo.repository.scheduling.RepositoryJob;
import org.onehippo.repository.scheduling.RepositoryJobExecutionContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * BrokenLinksCheckingJob
 * <P>
 * This job implementation is instantiated by Hippo Repository Scheduler Service
 * based on the job information provided by the broken links checker daemon module.
 * </P>
 * <P>
 * A run queries all <code>hippostd:html</code> nodes, keeps the ones whose path starts with the configured
 * start path, extracts the links from their <code>hippostd:content</code> property, lets a {@link LinkChecker}
 * test the collected links and finally hands the broken links, grouped per handle, to the
 * <code>brokenlinks</code> workflow of the document below that handle.
 * </P>
 */
public class BrokenLinksCheckingJob implements RepositoryJob {

    private static final Logger log = LoggerFactory.getLogger(BrokenLinksCheckingJob.class);

    /**
     * Runs one broken links check on a session created through
     * {@link RepositoryJobExecutionContext#createSystemSession()}. All attributes of the context are copied
     * into the configuration of the check. The session is logged out again, also when the check fails.
     *
     * @param context the execution context that provides the session and the job attributes
     * @throws RepositoryException if the session cannot be set up or the check fails with a repository error
     */
    @Override
    public void execute(RepositoryJobExecutionContext context) throws RepositoryException {
        log.info("BrokenLinksCheckingJob begins ...");
        final long start = System.currentTimeMillis();

        Session session = null;

        try {
            session = context.createSystemSession();
            session.refresh(false);

            final Map<String, String> params = new HashMap<String, String>();
            for (String attrName : context.getAttributeNames()) {
                params.put(attrName, context.getAttribute(attrName));
            }

            checkBrokenLinks(session, new CheckExternalBrokenLinksConfig(params));
        } finally {
            if (session != null) {
                session.logout();
            }

            log.info("BrokenLinksCheckingJob ends, spending {} seconds.",
                    (System.currentTimeMillis() - start) / 1000.0);
        }
    }

    /**
     * Collects the links of all matching <code>hippostd:html</code> nodes, tests them and reports the broken
     * ones per handle.
     *
     * @param session the session used for querying and for looking up the handle nodes
     * @param config the configuration of this check (start path, exclude patterns, link checker settings)
     * @throws RepositoryException if the workflow manager cannot be obtained or the query fails
     */
    private void checkBrokenLinks(final Session session, final CheckExternalBrokenLinksConfig config)
            throws RepositoryException {
        final WorkflowManager workflowManager = ((HippoWorkspace) session.getWorkspace()).getWorkflowManager();

        final LinkChecker linkChecker = new LinkChecker(config, session);

        // the key of the links map is the uuid of the handle node
        final Map<String, Set<Link>> linksByHandleUUID = new HashMap<String, Set<Link>>();
        // the unique links: keyed by URL for http(s) links, by "<source node identifier>/<url>" for all others
        final Map<String, Link> linksByURL = new HashMap<String, Link>();

        try {
            log.info("Checking broken external links, configuration: {}", config);
            collectLinks(session, config, linksByHandleUUID, linksByURL);

            log.info("Starting scanning for external links that are broken");
            final long start = System.currentTimeMillis();

            // Now first check all external links whether they are available : The linkChecker runs multi-threaded thus
            // to utilize the multi-threading best, it is best to scan all Links combined, not just the ones for a single handle
            linkChecker.run(linksByURL.values());

            final long checkingTook = System.currentTimeMillis() - start;
            log.info("Finished testing availability of all URLs. Tested '{}' URLs in {} seconds.",
                    linksByURL.size(), checkingTook / 1000.0);
        } finally {
            // NOTE(review): presumably releases whatever the checker holds; done in finally so that a failing
            // query or run does not skip it - confirm shutdown() is safe when run() was never reached
            linkChecker.shutdown();
        }

        reportBrokenLinks(session, workflowManager, linksByHandleUUID);
    }

    /**
     * Scans all <code>hippostd:html</code> nodes whose path starts with the configured start path and registers
     * every link that is not excluded, both per handle and in the map of unique links. Nodes that cause a
     * {@link RepositoryException} or for which link extraction throws an {@link IllegalStateException}
     * are logged and skipped.
     *
     * @param session the session to run the query with
     * @param config provides the start path and the URL exclude patterns
     * @param linksByHandleUUID filled with the links found per handle, keyed by the identifier of the handle
     * @param linksByURL filled with the unique links that have to be tested
     * @throws RepositoryException if the query cannot be created or executed
     */
    private void collectLinks(final Session session, final CheckExternalBrokenLinksConfig config,
            final Map<String, Set<Link>> linksByHandleUUID, final Map<String, Link> linksByURL)
            throws RepositoryException {
        // For the xpath query below, do not include a path constraint to begin with, like
        // /jcr:root/content/documents as this results in much less efficient queries
        final String xpath = "//element(*,hippostd:html)";
        final QueryResult result = session.getWorkspace().getQueryManager().createQuery(xpath, Query.XPATH).execute();
        final NodeIterator hippostdHtmlNodes = result.getNodes();

        final long start = System.currentTimeMillis();
        int count = 0;
        int totalLinksCount = 0;

        while (hippostdHtmlNodes.hasNext()) {
            try {
                final Node hippostdHtml = hippostdHtmlNodes.nextNode();

                if (!hippostdHtml.getPath().startsWith(config.getStartPath())) {
                    // skip paths that do not start with the path we want to scan below
                    continue;
                }

                // we need to group the links per handle because all hippostd:content
                // fields below a handle store their broken links directly below the handle
                final Node handleNode = getHandleNode(hippostdHtml);

                if (handleNode == null) {
                    // could not find handle for hippostd:html node. Skip it
                    continue;
                }

                final String handleUUID = handleNode.getIdentifier();
                final String sourceNodeIdentifier = hippostdHtml.getIdentifier();
                // hippostd:content is a mandatory property so no need to check for existence
                final String content = hippostdHtml.getProperty("hippostd:content").getString();

                try {
                    Set<Link> linksForHandle = linksByHandleUUID.get(handleUUID);
                    count++;

                    final List<String> links = PlainTextLinksExtractor.getLinks(content);
                    totalLinksCount += links.size();

                    for (String url : links) {
                        if (isExcludedURL(config, url)) {
                            log.info("The URL is excluded while broken links checking in '{}': {}",
                                    hippostdHtml.getPath(), url);
                            continue;
                        }

                        if (linksForHandle == null) {
                            linksForHandle = new HashSet<>();
                            linksByHandleUUID.put(handleUUID, linksForHandle);
                        }

                        // http(s) links are shared between all nodes, every other link is only unique within
                        // its source node. Use the same key for lookup and for storing, otherwise a repeated
                        // non-http(s) link is never found again and replaces the entry that is already registered.
                        final String key = isHttpUrl(url) ? url : sourceNodeIdentifier + "/" + url;

                        Link link = linksByURL.get(key);
                        if (link == null) {
                            link = new Link(url, sourceNodeIdentifier);
                            linksByURL.put(key, link);
                        }

                        log.debug("Adding to test for handle with '{}' the url '{}'", handleUUID, url);
                        linksForHandle.add(link);
                    }
                } catch (IllegalStateException e) {
                    log.warn("Unable to get link from hippostd:html for node '{}' : {}",
                            hippostdHtml.getPath(), e.toString());
                }
            } catch (RepositoryException e) {
                log.warn(
                        "RepositoryException for hippostd:html node from search result. Skip and continue with next : {}",
                        e.toString());
            }
        }

        final long scanningTook = System.currentTimeMillis() - start;
        log.info("Finished scanning all hippostd:html nodes for external links in {} seconds.",
                scanningTook / 1000.0);
        log.info("In total {}  hippostd:html nodes were scanned.", count);
        log.info("In total {} handles have links", linksByHandleUUID.size());
        log.info("In total there are {} unique links", linksByURL.size());
        log.info("In total there were {} links scanned", totalLinksCount);
    }

    /**
     * Passes the broken links of every handle to the <code>brokenlinks</code> workflow of the document below
     * that handle. Handles without broken links are only processed when they carry the broken links mixin.
     * Failures for a single handle are logged and do not stop the processing of the remaining handles.
     *
     * @param session the session used to look up the handle nodes
     * @param workflowManager the workflow manager used to obtain the <code>brokenlinks</code> workflow
     * @param linksByHandleUUID the tested links per handle, keyed by the identifier of the handle
     */
    private void reportBrokenLinks(final Session session, final WorkflowManager workflowManager,
            final Map<String, Set<Link>> linksByHandleUUID) {
        for (Map.Entry<String, Set<Link>> entry : linksByHandleUUID.entrySet()) {
            // the key in the Map contains the handleUUID
            final String handleUUID = entry.getKey();

            // all links belong to one document, so we can safely collect and process them at once:
            final Collection<Link> brokenLinks = new ArrayList<Link>();
            for (Link link : entry.getValue()) {
                if (link.isBroken()) {
                    brokenLinks.add(link);
                }
            }

            try {
                final Node handleNode = session.getNodeByIdentifier(handleUUID);
                if (brokenLinks.isEmpty() && !handleNode.isNodeType(NodeType.BROKENLINKS_MIXIN)) {
                    // nothing to report and no mixin present on the handle
                    continue;
                }

                // need to get the document below the handle to be able to get the workflow
                Node doc;
                try {
                    doc = handleNode.getNode(handleNode.getName());
                } catch (PathNotFoundException e) {
                    log.warn("could not find document below handle '{}'. SKip", handleNode.getPath());
                    continue;
                }

                try {
                    final Workflow reportWorkflow = workflowManager.getWorkflow("brokenlinks", new Document(doc));
                    if (reportWorkflow instanceof ReportBrokenLinksWorkflow) {
                        ((ReportBrokenLinksWorkflow) reportWorkflow).reportBrokenLinks(brokenLinks);
                    }
                } catch (WorkflowException e) {
                    warnForHandle(
                            "WorkflowException exception while trying to write link report to handle with uuid",
                            handleUUID, e);
                } catch (RepositoryException e) {
                    warnForHandle("Repository exception while trying to write link report to handle with uuid",
                            handleUUID, e);
                } catch (RemoteException e) {
                    warnForHandle(
                            "RemoteException exception while trying to write link report to handle with uuid",
                            handleUUID, e);
                }
            } catch (ItemNotFoundException e) {
                warnForHandle(
                        "ItemNotFoundException exception while trying to create link report to handle with uuid",
                        handleUUID, e);
            } catch (RepositoryException e) {
                warnForHandle(
                        "RepositoryException exception while trying to create link report to handle with uuid",
                        handleUUID, e);
            }
        }
    }

    /**
     * Logs a warning about a failure for one handle: with the full stack trace when debug logging is enabled,
     * otherwise only with the string representation of the exception.
     *
     * @param message the message up to, but excluding, the quoted handle identifier; must not contain
     *                <code>{}</code> placeholders
     * @param handleUUID the identifier of the handle the failure belongs to
     * @param e the exception that was caught
     */
    private static void warnForHandle(final String message, final String handleUUID, final Exception e) {
        if (log.isDebugEnabled()) {
            log.warn(message + " '" + handleUUID + "'", e);
        } else {
            log.warn(message + " '{}' : {}", handleUUID, e.toString());
        }
    }

    /**
     * @return <code>true</code> if <code>url</code> starts, ignoring case, with <code>http:</code> or
     *         <code>https:</code>
     */
    private static boolean isHttpUrl(final String url) {
        return StringUtils.startsWithIgnoreCase(url, "http:")
                || StringUtils.startsWithIgnoreCase(url, "https:");
    }

    /**
     * Walks up from <code>node</code> until an ancestor of type hippo:handle is found. The node itself
     * is not tested, only its ancestors up to and including the root node.
     *
     * @return the first ancestor node of <code>node</code> of type hippo:handle and <code>null</code> if no such ancestor exists
     */
    private Node getHandleNode(final Node node) throws RepositoryException {
        Node current = node;
        // depth 0 is the root node, which has no parent to continue with
        while (current.getDepth() > 0) {
            current = current.getParent();
            if (current.isNodeType(HippoNodeType.NT_HANDLE)) {
                return current;
            }
        }
        return null;
    }

    /**
     * @return <code>true</code> if <code>url</code> completely matches at least one of the URL exclude
     *         patterns of <code>config</code>
     */
    private boolean isExcludedURL(final CheckExternalBrokenLinksConfig config, final String url) {
        for (Pattern excludePattern : config.getUrlExcludePatterns()) {
            final Matcher m = excludePattern.matcher(url);

            if (m.matches()) {
                return true;
            }
        }

        return false;
    }

}