org.commoncrawl.service.listcrawler.CrawlListsUI.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.service.listcrawler.CrawlListsUI.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.service.listcrawler;

import java.io.IOException;
import java.io.PrintWriter;
import java.security.InvalidKeyException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.concurrent.Semaphore;

import javax.crypto.BadPaddingException;
import javax.crypto.Cipher;
import javax.crypto.IllegalBlockSizeException;
import javax.crypto.NoSuchPaddingException;
import javax.crypto.spec.SecretKeySpec;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.server.AsyncWebServerRequest;
import org.commoncrawl.server.CommonCrawlServer;
import org.commoncrawl.service.listcrawler.CrawlListDatabaseRecord;
import org.commoncrawl.service.listcrawler.CrawlListDomainItem;
import org.commoncrawl.service.listcrawler.CrawlListMetadata;
import org.commoncrawl.service.listcrawler.CrawlList.QueueState;
import org.commoncrawl.util.CCStringUtils;

import com.google.gson.stream.JsonWriter;

@SuppressWarnings("serial")
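/**
 * Servlet that backs the crawl lists UI. It answers GET requests selected by
 * the <code>reqType</code> parameter (listLists, subDomainCount,
 * subDomainList, listDetails, domainDetails) on behalf of the customer
 * identified by the encrypted <code>customerId</code> parameter.
 *
 * @author rana
 */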
public class CrawlListsUI extends HttpServlet {

    /** Logger shared by the servlet and its asynchronous request handlers. */
    public static final Log LOG = LogFactory.getLog(CrawlListsUI.class);

    // Prefix that a decrypted user key must start with to be accepted by
    // decryptUserKey; the text after the prefix is returned as the customer id.
    static String salt = "#$@!1Z";
    // 16-byte key handed to SecretKeySpec(..., "AES") in decryptUserKey.
    // NOTE(review): the key material is hard-coded in source and both fields are
    // non-final and package-visible -- consider loading them from configuration.
    static byte secretKey[] = { (byte) 0xcd, (byte) 0xe7, (byte) 0xe9, (byte) 0x9d, (byte) 0xb4, (byte) 0x84,
            (byte) 0xc5, 0x2f, 0x49, (byte) 0xee, 0x16, (byte) 0xb1, 0x12, (byte) 0xa6, (byte) 0xef, (byte) 0xb7 };

    /**
     * Decrypts a hex-encoded, AES-encrypted user key and strips the expected
     * salt prefix from the plain text.
     *
     * @param userKey hex string holding the encrypted key; may be {@code null}
     *                (for example when the request parameter is absent)
     * @return the text that follows the salt, or {@code null} when the input is
     *         {@code null}, is not a whole number of hex bytes, cannot be
     *         decrypted, or does not start with the salt
     * @throws IllegalStateException if the runtime cannot supply the AES cipher
     *                               or rejects the built-in key
     */
    public static String decryptUserKey(String userKey) {
        // A missing parameter or a half byte can never be a valid key.
        if (userKey == null || userKey.length() % 2 != 0) {
            return null;
        }
        byte[] encrypted = hexStringToByteArray(userKey);
        if (encrypted == null) {
            return null;
        }
        try {
            Cipher cipher = Cipher.getInstance("AES");
            cipher.init(Cipher.DECRYPT_MODE, new SecretKeySpec(secretKey, "AES"));
            // NOTE(review): decoded with the platform default charset, exactly as
            // before -- confirm which charset the encrypting side uses.
            String decrypted = new String(cipher.doFinal(encrypted));
            if (decrypted.startsWith(salt)) {
                return decrypted.substring(salt.length());
            }
            return null;
        } catch (IllegalBlockSizeException e) {
            // Caller-supplied data of the wrong size: simply not a valid key.
            return null;
        } catch (BadPaddingException e) {
            // Caller-supplied data that does not decrypt cleanly: not a valid key.
            return null;
        } catch (NoSuchAlgorithmException e) {
            // The remaining failures are platform/configuration problems, not bad input.
            throw new IllegalStateException("AES cipher is not available", e);
        } catch (NoSuchPaddingException e) {
            throw new IllegalStateException("AES cipher padding is not available", e);
        } catch (InvalidKeyException e) {
            throw new IllegalStateException("Built-in AES key was rejected", e);
        }
    }

    /**
     * Converts a string of hexadecimal digit pairs into the bytes they encode.
     *
     * @param s the hex string (upper or lower case digits); may be {@code null}
     * @return the decoded bytes (an empty array for an empty string), or
     *         {@code null} when {@code s} is {@code null}, has an odd length,
     *         or contains a character that is not a hexadecimal digit
     */
    public static byte[] hexStringToByteArray(String s) {
        if (s == null || s.length() % 2 != 0) {
            return null;
        }
        int len = s.length();
        byte[] data = new byte[len / 2];
        for (int i = 0; i < len; i += 2) {
            int high = Character.digit(s.charAt(i), 16);
            int low = Character.digit(s.charAt(i + 1), 16);
            // Character.digit reports -1 for anything that is not a base-16 digit.
            if (high < 0 || low < 0) {
                return null;
            }
            data[i / 2] = (byte) ((high << 4) | low);
        }
        return data;
    }

    /**
     * Mutable status holder passed to the request handlers: they overwrite
     * {@code _resultCode} on success and {@code doGet} inspects it afterwards.
     */
    public static class HttpResult {
        /** HTTP status to report; starts out as {@code SC_OK}. */
        public int _resultCode;
        /** Optional description sent with an error status; starts out empty. */
        public String _resultDesc;

        public HttpResult() {
            _resultCode = HttpServletResponse.SC_OK;
            _resultDesc = "";
        }
    }

    /**
     * Routes a crawl-list query to the handler selected by the
     * {@code reqType} parameter.
     *
     * <p>The {@code listId}, {@code reqType} and {@code customerId} parameters
     * are required for every request type; {@code subDomainList} additionally
     * needs {@code offset} and {@code count}, and {@code domainDetails} needs
     * {@code domainId}. Any missing, malformed or undecryptable parameter, or
     * an unknown request type, results in {@code SC_BAD_REQUEST}.
     *
     * @param req  the incoming request
     * @param resp the response the selected handler writes to
     * @throws ServletException declared by the servlet contract
     * @throws IOException      if a handler or the error reply fails to write
     */
    @Override
    protected void doGet(final HttpServletRequest req, final HttpServletResponse resp)
            throws ServletException, IOException {

        final HttpResult result = new HttpResult();
        // Handlers flip this to SC_OK once they have produced a reply.
        result._resultCode = HttpServletResponse.SC_BAD_REQUEST;

        resp.setContentType("application/json");

        final String reqType = req.getParameter("reqType");
        // Unparsable numbers are treated like missing parameters (400, not 500).
        final Long parsedListId = parseLongOrNull(req.getParameter("listId"));
        final String encCustomerId = req.getParameter("customerId");

        String customerId = null;
        if (reqType != null && parsedListId != null && encCustomerId != null) {
            customerId = decryptUserKey(encCustomerId);
        }

        if (customerId != null) {
            final long listId = parsedListId.longValue();

            if (reqType.equals("listLists")) {
                getListsForCustomer(customerId, resp, result);
            } else if (reqType.equals("subDomainCount")) {
                getSubDomainCount(customerId, listId, resp, result);
            } else if (reqType.equals("subDomainList")) {
                final Integer offset = parseIntOrNull(req.getParameter("offset"));
                final Integer count = parseIntOrNull(req.getParameter("count"));
                if (offset != null && count != null) {
                    getDomainListForListId(customerId, listId, offset.intValue(), count.intValue(), resp,
                            result);
                }
            } else if (reqType.equals("listDetails")) {
                getListDetails(customerId, listId, resp, result);
            } else if (reqType.equals("domainDetails")) {
                final String domainId = req.getParameter("domainId");
                if (domainId != null) {
                    getDomainDetailForDomain(customerId, listId, domainId, resp, result);
                }
            }
        }

        // A handler may already have answered (e.g. with SC_FORBIDDEN); calling
        // sendError on a committed response would throw IllegalStateException.
        if (result._resultCode != HttpServletResponse.SC_OK && !resp.isCommitted()) {
            resp.sendError(result._resultCode, result._resultDesc);
        }
    }

    /** Parses a decimal long, returning {@code null} for null or malformed text. */
    private static Long parseLongOrNull(String text) {
        if (text == null) {
            return null;
        }
        try {
            return Long.valueOf(text);
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /** Parses a decimal int, returning {@code null} for null or malformed text. */
    private static Integer parseIntOrNull(String text) {
        if (text == null) {
            return null;
        }
        try {
            return Integer.valueOf(text);
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Condenses list (or sub-domain) metadata into a summary item.
     *
     * <p>"Crawled" is the sum of the http 200/403/404/500/other counters and
     * "error" is the sum of the timeout, IO-exception, DNS and other-error
     * counters. The in-cache count is always reported as zero.
     *
     * @param metadata the counters to summarize
     * @return a freshly populated summary item
     */
    private static CrawlListDomainItem buildListSummary(CrawlListMetadata metadata) {

        int okResponses = 0;
        okResponses += metadata.getHttp200Count();

        int robotsBlocked = 0;
        robotsBlocked += metadata.getRobotsExcludedCount();

        int failures = metadata.getTimeoutErrorCount() + metadata.getIOExceptionCount();
        failures += metadata.getDNSErrorCount();
        failures += metadata.getOtherErrorCount();

        int crawled = 0;
        crawled += metadata.getHttp200Count();
        crawled += metadata.getHttp403Count() + metadata.getHttp404Count() + metadata.getHttp500Count()
                + metadata.getHttpOtherCount();

        final CrawlListDomainItem summary = new CrawlListDomainItem();
        summary.setUrlCount(metadata.getUrlCount());
        summary.setUrlsCrawled(crawled);
        summary.setHttp200Count(okResponses);
        summary.setInCacheItemsCount(0);
        summary.setRobotsExcludedCount(robotsBlocked);
        summary.setErrorCount(failures);
        summary.setQueuedCount(metadata.getQueuedItemCount());
        return summary;
    }

    /**
     * Writes a JSON breakdown of crawl results for one domain of a list.
     *
     * <p>The reply has the form <code>{"items":[[label,count],...]}</code>.
     * Each category is emitted only when its count is non-zero, and the
     * "remaining" entry only when it is positive. Nothing is written, and
     * {@code result} is left untouched, when the list is missing or not
     * loaded, or when no metadata exists for the domain. If the list does not
     * belong to the customer, {@code SC_FORBIDDEN} is sent directly.
     *
     * @param customerId decrypted customer id used for the ownership check
     * @param listId     id of the crawl list
     * @param domainName domain whose metadata is looked up
     * @param resp       response to write to
     * @param result     set to {@code SC_OK} after the JSON has been written
     * @throws IOException if obtaining the writer or dispatching the request fails
     */
    public static void getDomainDetailForDomain(final String customerId, final long listId, final String domainName,
            final HttpServletResponse resp, final HttpResult result) throws IOException {

        final CommonCrawlServer server = CommonCrawlServer.getServerSingleton();

        server.dispatchAsyncWebRequest(new AsyncWebServerRequest("", resp.getWriter()) {

            @Override
            public boolean handleRequest(Semaphore completionSemaphore) throws IOException {

                ProxyServer proxyServer = (ProxyServer) server;

                if (!proxyServer.doesListBelongToCustomer(listId, customerId)) {
                    resp.sendError(HttpServletResponse.SC_FORBIDDEN);
                    return false;
                }

                CrawlList list = proxyServer.getCrawlHistoryManager().getList(listId);
                if (list == null || !list.isListLoaded()) {
                    return false;
                }

                CrawlListMetadata metadata = list.getSubDomainMetadataByDomain(domainName);
                if (metadata == null) {
                    return false;
                }

                CrawlListDomainItem item = buildListSummary(metadata);
                JsonWriter jsonWriter = new JsonWriter(resp.getWriter());

                try {
                    jsonWriter.beginObject();
                    jsonWriter.name("items");
                    jsonWriter.beginArray();

                    // Status counters include the hits reached through redirects.
                    int http403Count = metadata.getHttp403Count() + metadata.getRedirectHttp403Count();
                    int http404Count = metadata.getHttp404Count() + metadata.getRedirectHttp404Count();
                    int http500Count = metadata.getHttp500Count() + metadata.getRedirectHttp500Count();
                    int httpOtherCount = metadata.getHttpOtherCount() + metadata.getRedirectHttpOtherCount();
                    int timeoutErrorCount = metadata.getTimeoutErrorCount()
                            + metadata.getRedirectTimeoutErrorCount();
                    int ioexceptionErrorCount = metadata.getIOExceptionCount()
                            + metadata.getRedirectIOExceptionCount();

                    if (item.getHttp200Count() != 0) {
                        writeLabelledCount(jsonWriter, "http200", item.getHttp200Count());
                    }
                    if (http403Count != 0) {
                        writeLabelledCount(jsonWriter, "http403", http403Count);
                    }
                    if (http404Count != 0) {
                        writeLabelledCount(jsonWriter, "http404", http404Count);
                    }
                    if (http500Count != 0) {
                        writeLabelledCount(jsonWriter, "http500", http500Count);
                    }
                    if (httpOtherCount != 0) {
                        writeLabelledCount(jsonWriter, "httpOther", httpOtherCount);
                    }
                    if (item.getInCacheItemsCount() != 0) {
                        writeLabelledCount(jsonWriter, "inCache", item.getInCacheItemsCount());
                    }
                    if (item.getRobotsExcludedCount() != 0) {
                        writeLabelledCount(jsonWriter, "robotsExcluded", item.getRobotsExcludedCount());
                    }
                    if (timeoutErrorCount != 0) {
                        writeLabelledCount(jsonWriter, "timeouts", timeoutErrorCount);
                    }
                    if (ioexceptionErrorCount != 0) {
                        writeLabelledCount(jsonWriter, "exceptions", ioexceptionErrorCount);
                    }

                    // Whatever is not accounted for by the categories above is
                    // reported as still remaining.
                    int remainingItems = metadata.getUrlCount();
                    remainingItems -= item.getHttp200Count();
                    remainingItems -= http403Count;
                    remainingItems -= http404Count;
                    remainingItems -= http500Count;
                    remainingItems -= httpOtherCount;
                    remainingItems -= item.getRobotsExcludedCount();
                    remainingItems -= (timeoutErrorCount + ioexceptionErrorCount);
                    if (remainingItems > 0) {
                        writeLabelledCount(jsonWriter, "remaining", remainingItems);
                    }

                    jsonWriter.endArray();
                    jsonWriter.endObject();

                    result._resultCode = HttpServletResponse.SC_OK;

                } catch (IOException e) {
                    // Already the declared type; no need to wrap it again.
                    throw e;
                } catch (Exception e) {
                    throw new IOException(e);
                }
                return false;
            }
        });
    }

    /** Emits one <code>[label, count]</code> pair as a two-element JSON array. */
    private static void writeLabelledCount(JsonWriter jsonWriter, String label, long count) throws IOException {
        jsonWriter.beginArray().value(label).value(count).endArray();
    }

    /**
     * Writes one JSON summary row per crawl list owned by the customer.
     *
     * <p>The lists are ordered by descending list id and the rows are written
     * from a freshly started worker thread, which releases the completion
     * semaphore when it finishes. The reply has the form
     * <code>{"items":[[...],...]}</code>. When the customer has no lists
     * nothing is written and {@code result} is left untouched.
     *
     * @param customerId decrypted customer id whose lists are reported
     * @param resp       response to write to
     * @param result     set to {@code SC_OK} once all rows have been written
     * @throws IOException if obtaining the writer or dispatching the request fails
     */
    private static void getListsForCustomer(final String customerId, final HttpServletResponse resp,
            final HttpResult result) throws IOException {

        final CommonCrawlServer server = CommonCrawlServer.getServerSingleton();

        server.dispatchAsyncWebRequest(new AsyncWebServerRequest("", resp.getWriter()) {

            @Override
            public boolean handleRequest(final Semaphore completionSemaphore) throws IOException {

                final ProxyServer proxyServer = (ProxyServer) server;

                LOG.info("Getting List for Customer:" + customerId);
                final Collection<CrawlListDatabaseRecord> recordSet = proxyServer
                        .getListInfoForCustomerId(customerId).values();

                final ArrayList<CrawlListDatabaseRecord> sortedSet = new ArrayList<CrawlListDatabaseRecord>(
                        recordSet);

                // Highest list id first. Equal ids compare as 0 so that the
                // Comparator contract holds.
                Collections.sort(sortedSet, new Comparator<CrawlListDatabaseRecord>() {

                    @Override
                    public int compare(CrawlListDatabaseRecord o1, CrawlListDatabaseRecord o2) {
                        long id1 = o1.getListId();
                        long id2 = o2.getListId();
                        if (id1 > id2) {
                            return -1;
                        }
                        return (id1 < id2) ? 1 : 0;
                    }

                });

                LOG.info("Found:" + sortedSet.size() + " Lists for Customer:" + customerId);

                // NOTE(review): a customer without lists gets no body and the
                // caller's default status -- confirm that is intended.
                if (sortedSet.isEmpty()) {
                    return false;
                }

                Thread thread = new Thread(new Runnable() {

                    @Override
                    public void run() {
                        LOG.info("Running Worker Thread");
                        try {
                            PrintWriter writer = resp.getWriter();

                            JsonWriter jsonWriter = new JsonWriter(writer);

                            jsonWriter.beginObject();
                            jsonWriter.name("items");
                            jsonWriter.beginArray();

                            for (CrawlListDatabaseRecord listRecord : sortedSet) {

                                CrawlList list = proxyServer.getCrawlHistoryManager()
                                        .getList(listRecord.getListId());
                                if (list == null) {
                                    LOG.error("DID NOT Find List Object for List:" + listRecord.getListId()
                                            + " Name:" + listRecord.getListName() + " FileName:"
                                            + listRecord.getSourceFileName() + " TempFile:"
                                            + listRecord.getTempFileName());
                                } else {
                                    writeListRow(jsonWriter, list, listRecord);
                                }
                            }

                            jsonWriter.endArray();
                            jsonWriter.endObject();

                            LOG.info("Done");

                            result._resultCode = HttpServletResponse.SC_OK;

                        } catch (Exception e) {
                            LOG.error(CCStringUtils.stringifyException(e));
                        } finally {
                            // Normal completion is not an error condition.
                            LOG.info("DONE");
                            completionSemaphore.release();
                        }
                    }

                });

                LOG.info("Spawning Worker Thread");
                thread.start();

                // The worker thread releases the semaphore when it is finished.
                return true;
            }

        });

    }

    /**
     * Writes the row for a single list; the layout depends on its load state.
     * Lists in any other load state produce no row.
     */
    private static void writeListRow(JsonWriter jsonWriter, CrawlList list, CrawlListDatabaseRecord listRecord)
            throws IOException {

        if (list.isListLoaded()) {
            String queueState;
            if (list.getQueuedState() == QueueState.QUEUEING) {
                queueState = "Q";
            } else if (list.getQueuedState() == QueueState.QUEUED) {
                queueState = "L";
            } else if (list.getQueuedState() == QueueState.ERROR) {
                // NOTE(review): ERROR maps to the same code as QUEUED -- confirm.
                queueState = "L";
            } else {
                queueState = "?";
            }

            CrawlListDomainItem summary = buildListSummary(list.getMetadata());
            summary.setListId(list.getListId());
            summary.setListName(listRecord.getListName());

            jsonWriter.beginArray();
            jsonWriter.value(summary.getListId());
            jsonWriter.value(queueState);
            jsonWriter.value(summary.getListName());
            jsonWriter.value(list.getSubDomainItemCount());
            jsonWriter.value(summary.getUrlCount());
            jsonWriter.value(summary.getUrlsCrawled());
            jsonWriter.value(summary.getHttp200Count());
            jsonWriter.value(summary.getRobotsExcludedCount());
            jsonWriter.value(summary.getErrorCount());
            jsonWriter.value(summary.getQueuedCount());
            jsonWriter.endArray();
        } else if (list.getLoadState() == CrawlList.LoadState.QUEUED_FOR_LOADING) {
            jsonWriter.beginArray();
            jsonWriter.value(list.getListId());
            jsonWriter.value("W");
            jsonWriter.value("<B>Queued:</B>" + listRecord.getListName());
            writeZeroCounters(jsonWriter);
            jsonWriter.endArray();
        } else if (list.getLoadState() == CrawlList.LoadState.REALLY_LOADING) {
            // NOTE(review): this row has no queue-state column, so it is one
            // value shorter than the rows above -- kept as is, confirm with the client.
            jsonWriter.beginArray();
            jsonWriter.value(list.getListId());
            jsonWriter.value("<B>Loading:</B>" + listRecord.getListName());
            writeZeroCounters(jsonWriter);
            jsonWriter.endArray();
        } else if (list.getLoadState() == CrawlList.LoadState.ERROR) {
            // NOTE(review): also one value shorter than a loaded row -- kept as is.
            jsonWriter.beginArray();
            jsonWriter.value(list.getListId());
            jsonWriter.value("ERR");
            writeZeroCounters(jsonWriter);
            jsonWriter.endArray();
        }
    }

    /** Writes the seven zero-valued counter columns used for lists that are not loaded. */
    private static void writeZeroCounters(JsonWriter jsonWriter) throws IOException {
        for (int i = 0; i < 7; i++) {
            jsonWriter.value(0);
        }
    }

    /**
     * Writes the summary counters of a loaded list as a JSON object with the
     * members total, crawled, http200, inCache, robotsExcluded, error and
     * queued.
     *
     * <p>The object is produced with {@link JsonWriter}, so keys are quoted
     * and members are properly separated. Nothing is written, and
     * {@code result} is left untouched, when the list is missing or not
     * loaded. If the list does not belong to the customer,
     * {@code SC_FORBIDDEN} is sent directly.
     *
     * @param customerId decrypted customer id used for the ownership check
     * @param listId     id of the crawl list
     * @param resp       response to write to
     * @param result     set to {@code SC_OK} after the JSON has been written
     * @throws IOException if obtaining the writer or dispatching the request fails
     */
    public static void getListDetails(final String customerId, final long listId, final HttpServletResponse resp,
            final HttpResult result) throws IOException {
        final CommonCrawlServer server = CommonCrawlServer.getServerSingleton();

        server.dispatchAsyncWebRequest(new AsyncWebServerRequest("", resp.getWriter()) {

            @Override
            public boolean handleRequest(Semaphore completionSemaphore) throws IOException {

                ProxyServer proxyServer = (ProxyServer) server;

                if (!proxyServer.doesListBelongToCustomer(listId, customerId)) {
                    resp.sendError(HttpServletResponse.SC_FORBIDDEN);
                    return false;
                }

                CrawlList list = proxyServer.getCrawlHistoryManager().getList(listId);

                if (list != null && list.isListLoaded()) {

                    CrawlListDomainItem item = buildListSummary(list.getMetadata());

                    JsonWriter jsonWriter = new JsonWriter(resp.getWriter());

                    jsonWriter.beginObject();
                    jsonWriter.name("total").value(item.getUrlCount());
                    jsonWriter.name("crawled").value(item.getUrlsCrawled());
                    jsonWriter.name("http200").value(item.getHttp200Count());
                    jsonWriter.name("inCache").value(item.getInCacheItemsCount());
                    jsonWriter.name("robotsExcluded").value(item.getRobotsExcludedCount());
                    jsonWriter.name("error").value(item.getErrorCount());
                    jsonWriter.name("queued").value(item.getQueuedCount());
                    jsonWriter.endObject();

                    result._resultCode = HttpServletResponse.SC_OK;
                }
                return false;
            }
        });
    }

    /**
     * Writes the number of sub-domain items of a loaded list as the JSON
     * object <code>{"itemCount":n}</code>.
     *
     * <p>The object is produced with {@link JsonWriter}, so the key is quoted.
     * Nothing is written, and {@code result} is left untouched, when the list
     * is missing or not loaded. If the list does not belong to the customer,
     * {@code SC_FORBIDDEN} is sent directly.
     *
     * @param customerId decrypted customer id used for the ownership check
     * @param listId     id of the crawl list
     * @param resp       response to write to
     * @param result     set to {@code SC_OK} after the JSON has been written
     * @throws IOException if obtaining the writer or dispatching the request fails
     */
    public static void getSubDomainCount(final String customerId, final long listId, final HttpServletResponse resp,
            final HttpResult result) throws IOException {

        final CommonCrawlServer server = CommonCrawlServer.getServerSingleton();

        server.dispatchAsyncWebRequest(new AsyncWebServerRequest("", resp.getWriter()) {

            @Override
            public boolean handleRequest(Semaphore completionSemaphore) throws IOException {

                ProxyServer proxyServer = (ProxyServer) server;

                if (!proxyServer.doesListBelongToCustomer(listId, customerId)) {
                    resp.sendError(HttpServletResponse.SC_FORBIDDEN);
                    return false;
                }

                CrawlList list = proxyServer.getCrawlHistoryManager().getList(listId);
                if (list != null && list.isListLoaded()) {
                    JsonWriter jsonWriter = new JsonWriter(resp.getWriter());
                    jsonWriter.beginObject();
                    jsonWriter.name("itemCount").value(list.getSubDomainItemCount());
                    jsonWriter.endObject();

                    result._resultCode = HttpServletResponse.SC_OK;
                }

                return false;
            }
        });

    }

    /**
     * Writes one page of the sub-domain list of a crawl list as JSON.
     *
     * <p>The reply has the form
     * <code>{"items":[[domain,urlCount,urlsCrawled,queuedCount,hashCode],...],"remainingItems":n}</code>,
     * where {@code remainingItems} is the list's total url count minus the url
     * counts of the rows on this page. When the list is missing or not loaded
     * the text "Crawl List NULL!!" is written instead and {@code result} is
     * left untouched. If the list does not belong to the customer,
     * {@code SC_FORBIDDEN} is sent directly.
     *
     * @param customerId decrypted customer id used for the ownership check
     * @param listId     id of the crawl list
     * @param offset     passed through to {@code getSubDomainList}
     * @param count      passed through to {@code getSubDomainList}
     * @param resp       response to write to
     * @param result     set to {@code SC_OK} after the JSON has been written
     * @throws IOException if obtaining the writer or dispatching the request fails
     */
    public static void getDomainListForListId(final String customerId, final long listId, final int offset,
            final int count, final HttpServletResponse resp, final HttpResult result) throws IOException {

        final CommonCrawlServer server = CommonCrawlServer.getServerSingleton();

        server.dispatchAsyncWebRequest(new AsyncWebServerRequest("", resp.getWriter()) {

            @Override
            public boolean handleRequest(Semaphore completionSemaphore) throws IOException {

                final ProxyServer proxy = (ProxyServer) server;

                // Ownership check comes first.
                if (!proxy.doesListBelongToCustomer(listId, customerId)) {
                    resp.sendError(HttpServletResponse.SC_FORBIDDEN);
                    return false;
                }

                final CrawlList crawlList = proxy.getCrawlHistoryManager().getList(listId);
                if (crawlList == null || !crawlList.isListLoaded()) {
                    resp.getWriter().print("Crawl List NULL!!");
                    return false;
                }

                final PrintWriter out = resp.getWriter();
                final JsonWriter json = new JsonWriter(out);

                try {
                    json.beginObject();
                    json.name("items");
                    json.beginArray();

                    // Running total of the url counts listed on this page.
                    int listedUrls = 0;
                    for (CrawlListDomainItem domain : crawlList.getSubDomainList(offset, count)) {
                        json.beginArray();
                        json.value(domain.getDomainName());
                        json.value(domain.getUrlCount());
                        json.value(domain.getUrlsCrawled());
                        json.value(domain.getQueuedCount());
                        json.value(domain.getHashCode());
                        json.endArray();
                        listedUrls += domain.getUrlCount();
                    }

                    json.endArray();
                    json.name("remainingItems");
                    json.value(crawlList.getMetadata().getUrlCount() - listedUrls);
                    json.endObject();

                    result._resultCode = HttpServletResponse.SC_OK;
                } catch (Exception e) {
                    throw new IOException(e);
                }
                return false;
            }
        });
    }

}