Java tutorial
/* * Copyright 2011-2013 Hippo B.V. (http://www.onehippo.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.onehippo.cms7.brokenlinks; import java.io.IOException; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.net.URI; import java.util.Calendar; import java.util.NoSuchElementException; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.jcr.ItemNotFoundException; import javax.jcr.Node; import javax.jcr.RepositoryException; import javax.jcr.Session; import org.apache.commons.lang.StringUtils; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpHead; import org.apache.http.client.methods.HttpRequestBase; import org.apache.http.client.params.ClientPNames; import org.apache.http.conn.ClientConnectionManager; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.PoolingClientConnectionManager; import org.apache.http.params.CoreConnectionPNames; import org.apache.http.params.HttpParams; import org.apache.http.params.SyncBasicHttpParams; import org.apache.http.protocol.BasicHttpContext; import org.apache.http.protocol.HttpContext; import org.hippoecm.repository.api.HippoNodeType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * NOTE: Do not re-use a {@link LinkChecker} instance after calling {@link #run(Iterable)} as after the call to * {@link #run(Iterable)} is finished connection resources are released and it can not be reused again. * To run another check create a new instance of {@link LinkChecker} and then call {@link #run(Iterable)} again */ public class LinkChecker { private static Logger log = LoggerFactory.getLogger(LinkChecker.class); private static final Pattern URL_SCHEME_PATTERN = Pattern.compile("^([A-Za-z]+):.*$"); // refresh session after checking 50 internal links private static final int REFRESH_SESSION_INTERVAL = 50; private static final String BROKEN_REFERENCE_MESSAGE = "Broken reference"; private final Session session; private final HttpClient httpClient; private final int nrOfThreads; public LinkChecker(CheckExternalBrokenLinksConfig config, Session session) { this.session = session; ClientConnectionManager connManager = new PoolingClientConnectionManager(); HttpParams params = new SyncBasicHttpParams(); params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout()); params.setBooleanParameter(CoreConnectionPNames.STALE_CONNECTION_CHECK, false); params.setBooleanParameter(ClientPNames.HANDLE_AUTHENTICATION, false); HttpClient client = null; try { final String httpClientClassName = config.getHttpClientClassName(); Class<? extends HttpClient> clientClass = (Class<? extends HttpClient>) Class .forName(httpClientClassName); final Constructor<? extends HttpClient> constructor = clientClass .getConstructor(ClientConnectionManager.class, HttpParams.class); client = constructor.newInstance(connManager, params); } catch (ClassNotFoundException e) { log.error("Could not find configured http client class", e); } catch (NoSuchMethodException e) { log.error("Could not find constructor of signature <init>(ClientConnectionmanager, HttpParams)", e); } catch (InvocationTargetException e) { log.error("Could not invoke constructor of httpClient", e); } catch (InstantiationException e) { log.error("Could not instantiate http client", e); } catch (IllegalAccessException e) { log.error("Not allowed to access http client constructor", e); } if (client == null) { client = new DefaultHttpClient(connManager, params); } httpClient = client; nrOfThreads = config.getNrOfHttpThreads(); // authentication preemptive true // allow circular redirects true } /** * Note that this method does not always return the same Link instances as in the <code>links</code> argument * because if the url of a Link was already scanned before, we replace the Link object with an already scanned * Link object * @param links */ public void run(final Iterable<Link> links) { runCheckerThreads(links); } public void shutdown() { // see http://hc.apache.org/httpcomponents-client-ga/tutorial/html/connmgmt.html#d5e635 httpClient.getConnectionManager().shutdown(); } private void runCheckerThreads(final Iterable<Link> links) { ConcurrentLinkedQueue<Link> queue = new ConcurrentLinkedQueue<Link>(); for (Link link : links) { queue.add(link); } final int threadCount = Math.min(queue.size(), nrOfThreads); final AtomicInteger internalLinksChecked = new AtomicInteger(); Thread[] threads = new Thread[threadCount]; for (int i = 0; i < threadCount; i++) { threads[i] = new LinkCheckerRunner(queue, internalLinksChecked); threads[i].setUncaughtExceptionHandler(new LogUncaughtExceptionHandler(log)); } for (int i = 0; i < threadCount; i++) { threads[i].start(); } try { for (int i = 0; i < threadCount; i++) { threads[i].join(); } } catch (InterruptedException ex) { // aborted } try { session.refresh(false); } catch (RepositoryException e) { log.warn("Failed to clear the session.", e); } } private class LinkCheckerRunner extends Thread { private final ConcurrentLinkedQueue<Link> queue; private final AtomicInteger internalLinksChecked; public LinkCheckerRunner(final ConcurrentLinkedQueue<Link> queue, final AtomicInteger internalLinksChecked) { this.queue = queue; this.internalLinksChecked = internalLinksChecked; } @Override public void run() { try { while (true) { // Get the next item to process, throws a NoSuchElementException when we're done Link link = queue.remove(); String url = link.getUrl(); if (StringUtils.isNotBlank(url)) { Matcher schemedUrlMatcher = URL_SCHEME_PATTERN.matcher(url); if (schemedUrlMatcher.matches()) { final String scheme = StringUtils.lowerCase(schemedUrlMatcher.group(1)); if (StringUtils.equals("http", scheme) || StringUtils.equals("https", scheme)) { checkExternalHttpLink(link); } else { log.debug("LinkChecker doesn't check non http(s) urls: '{}'.", url); } } else { checkInternalLink(link); } } } } catch (NoSuchElementException ex) { // Deliberate ignore, end of run } } private void checkInternalLink(Link link) { String url = link.getUrl(); if (StringUtils.contains(url, "/")) { log.debug("Not a CMS internal link which cannot have a '/': {}", url); return; } if (StringUtils.isEmpty(link.getSourceNodeIdentifier())) { log.debug("Unable to check internal link. The link is unaware of source node identifier: {}", url); return; } if (session == null) { log.warn("Session is not given to LinkChecker!"); return; } synchronized (session) { try { Node sourceNode = session.getNodeByIdentifier(link.getSourceNodeIdentifier()); Node linkedNode = findLinkedNode(sourceNode, url); if (linkedNode == null) { link.setBroken(true); link.setBrokenSince(Calendar.getInstance()); link.setResultCode(Link.ERROR_CODE); link.setResultMessage(BROKEN_REFERENCE_MESSAGE); } else if (linkedNode.isNodeType(HippoNodeType.NT_HANDLE)) { // when there's no variant node under the handle (e.g, in the attic), it's broken link as well. if (!linkedNode.hasNode(linkedNode.getName())) { link.setBroken(true); link.setBrokenSince(Calendar.getInstance()); link.setResultCode(Link.ERROR_CODE); link.setResultMessage(BROKEN_REFERENCE_MESSAGE); } } if (internalLinksChecked.incrementAndGet() % REFRESH_SESSION_INTERVAL == 0) { session.refresh(false); } } catch (RepositoryException e) { log.warn("Failed to find the source node.", e); } } } private Node findLinkedNode(final Node sourceNode, String linkName) { try { if (!sourceNode.hasNode(linkName)) { log.debug("The source node doesn't have the link node named '{}'.", linkName); return null; } Node linkNode = sourceNode.getNode(linkName); if (!linkNode.hasProperty(HippoNodeType.HIPPO_DOCBASE)) { log.debug("The link node doesn't have the '{}' property.", HippoNodeType.HIPPO_DOCBASE); return null; } String docbase = linkNode.getProperty(HippoNodeType.HIPPO_DOCBASE).getString(); if (StringUtils.isBlank(docbase)) { log.debug("The link node has a blank '{}' property.", HippoNodeType.HIPPO_DOCBASE); return null; } return session.getNodeByIdentifier(docbase); } catch (ItemNotFoundException e) { log.debug("The linked node is not found.", e); } catch (RepositoryException e) { log.warn("Failed to find linked node.", e); } return null; } private void checkExternalHttpLink(Link link) { String url = StringUtils.trim(link.getUrl()); final HttpContext httpContext = new BasicHttpContext(); HttpRequestBase httpRequest = null; try { URI linkUri = LinkURIUtils.createHttpURIFromString(url); httpRequest = new HttpHead(linkUri); HttpResponse httpResponse = httpClient.execute(httpRequest, httpContext); int headResultCode = httpResponse.getStatusLine().getStatusCode(); httpRequest.reset(); if (headResultCode == HttpStatus.SC_METHOD_NOT_ALLOWED) { httpRequest = new HttpGet(url); httpResponse = httpClient.execute(httpRequest, httpContext); headResultCode = httpResponse.getStatusLine().getStatusCode(); httpRequest.reset(); } if (headResultCode == HttpStatus.SC_MOVED_PERMANENTLY || headResultCode >= HttpStatus.SC_BAD_REQUEST) { link.setBroken(true); link.setBrokenSince(Calendar.getInstance()); link.setResultCode(headResultCode); } } catch (IOException ioException) { link.setBroken(true); link.setBrokenSince(Calendar.getInstance()); link.setResultCode(Link.EXCEPTION_CODE); link.setResultMessage(ioException.getClass().getCanonicalName()); } catch (IllegalArgumentException ex) { link.setBroken(true); link.setBrokenSince(Calendar.getInstance()); link.setResultCode(Link.EXCEPTION_CODE); link.setResultMessage(ex.getClass().getCanonicalName()); } finally { if ((httpRequest != null) && (!httpRequest.isAborted())) { httpRequest.reset(); } } } } static class LogUncaughtExceptionHandler implements Thread.UncaughtExceptionHandler { private Logger exceptionLog; LogUncaughtExceptionHandler(Logger exceptionLog) { this.exceptionLog = exceptionLog; } @Override public void uncaughtException(final Thread t, final Throwable e) { exceptionLog.error(e.getClass().getName() + ": " + e.getMessage(), e); t.getThreadGroup().uncaughtException(t, e); } } }