/**
 * villemos solutions [space^] (http://www.villemos.com)
 * Probe. Send. Act. Emergent solution.
 * Copyright 2011 Gert Villemos
 * All Rights Reserved.
 *
 * Released under the Apache license, version 2.0 (do what ever
 * you want, just dont claim ownership).
 *
 * NOTICE: All information contained herein is, and remains
 * the property of villemos solutions, and its suppliers
 * if any. The intellectual and technical concepts contained
 * herein are proprietary to villemos solutions
 * and its suppliers and may be covered by European and Foreign Patents,
 * patents in process, and are protected by trade secret or copyright law.
 *
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from villemos solutions.
 *
 * And it wouldn't be nice either.
 */
package com.villemos.ispace.ktree;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;

import org.apache.camel.CamelContext;
import org.apache.camel.Endpoint;
import org.apache.camel.Exchange;
import org.apache.camel.impl.DefaultExchange;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;

import com.thoughtworks.xstream.XStream;
import com.villemos.ispace.httpcrawler.HttpAccessor;
import com.villemos.ispace.httpcrawler.HttpClientConfigurer;
import com.villemos.ispace.httpcrawler.HttpCrawlerConsumer;
import com.villemos.ispace.ktree.folder.Folder;
import com.villemos.ispace.ktree.folder.Item;
import com.villemos.ispace.ktree.folder.Items;
import com.villemos.ispace.ktree.folder.Result;
import com.villemos.ispace.ktree.metadata.MetadataField;
import com.villemos.ispace.ktree.metadata.MetadataItem;
import com.villemos.ispace.ktree.session.Session;

/**
 * Specialized crawler for crawling the Ktree website.
 *
 * The crawler accesses an initial folder, then iterates through all folders it finds,
 * collecting folder and document information as it goes.
 *
 * The end result is a Map containing a set of lists describing the found data.
 */
public class KtreeAccessor extends HttpAccessor {

    private static final Log LOG = LogFactory.getLog(KtreeAccessor.class);

    /** Collection of folders to be crawled. Changes dynamically as the crawler runs. */
    protected Map<String, String> uncrawledFolders = new HashMap<String, String>();

    /** Folders that have already been crawled. */
    protected List<String> crawledFolders = new ArrayList<String>();

    /** The data to be forwarded in the route. 'documents' is a list of all documents found,
     * 'folders' is a list of all folders found. */
    protected List<Item> documents = new ArrayList<Item>();

    protected List<Folder> folders = new ArrayList<Folder>();

    /** Statistics. */
    protected long entriesProcessed = 0;

    /** Number of documents found and added to the result so far. */
    protected long processed = 0;

    //protected KtreeCrawlerEndpoint endpoint = null;

    protected XStream xstream = new XStream();
    {
        xstream.processAnnotations(Session.class);
        xstream.processAnnotations(Folder.class);
        xstream.processAnnotations(Result.class);
        xstream.processAnnotations(Items.class);
        xstream.processAnnotations(Item.class);
        xstream.processAnnotations(MetadataField.class);
        xstream.processAnnotations(MetadataItem.class);
    }

    protected Session session = null;

    protected String baseRestUrl = "";

    protected CamelContext camelContext = null;

    public class StringComperatorIgnoreCase implements Comparator<String> {
        public int compare(String o1, String o2) {
            return o1.compareToIgnoreCase(o2);
        }
    }

    public KtreeAccessor(Endpoint endpoint, Object object, CamelContext camelContext) {
        super(endpoint);
        this.camelContext = camelContext;
    }

    public KtreeAccessor(Endpoint endpoint, HttpCrawlerConsumer consumer) {
        super(endpoint, consumer);
    }

    public void doPoll() throws Exception {
        sendStatusMessage("Starting crawl of Ktree.");

        /** Initialize. */
        if (((KtreeCrawlerEndpoint) endpoint).isAppendMode() == false) {
            LOG.info("Append mode DISABLED. Clearing previously crawled data.");
            uncrawledFolders = new HashMap<String, String>();
            crawledFolders = new ArrayList<String>();
            documents = new ArrayList<Item>();
            entriesProcessed = 0;
            processed = 0;
        }
        else {
            LOG.info("Append mode ENABLED. Will add new data to the results of the previous crawl.");
        }

        this.poll();

        sendStatusMessage("Crawl of Ktree completed.");
    }

    /** Set up the initial login request. This method is called by the main method in the
     * iSpace HTTP crawler to create the form login. The framework will thereafter call
     * 'processSite'.
     *
     * The authentication is managed by the iSpace framework.
     */
    @Override
    protected HttpUriRequest createInitialRequest(URI uri) {
        baseRestUrl = uri + "/ktwebservice/KTWebService.php?";
        sendStatusMessage("Accessing Ktree site '" + uri + "'.");
        return new HttpGet(baseRestUrl + "method=login&password=" + ((KtreeCrawlerEndpoint) endpoint).getPassword() + "&username=" + ((KtreeCrawlerEndpoint) endpoint).getUser());
    }

    /** Method called by the iSpace httpcrawler when the initial request has been performed
     * and succeeded. The page corresponds to the first page AFTER the login. */
    @Override
    protected void processSite(URI uri, HttpResponse response) throws IOException {
        /** Read the complete page. */
        String page = HttpClientConfigurer.readFully(response.getEntity().getContent());

        /** Iteratively crawl the site. */
        processPage(page);
    }

    protected KtreeCrawlerEndpoint getKtreeEndpoint() {
        return ((KtreeCrawlerEndpoint) getEndpoint());
    }

    public Map<String, List> getResults() {
        TreeMap<String, List> results = new TreeMap<String, List>(new StringComperatorIgnoreCase());

        if (getKtreeEndpoint().isDocuments()) {
            results.put(getKtreeEndpoint().getDocumentTabName(), documents);
        }
        if (getKtreeEndpoint().isFolders()) {
            results.put(getKtreeEndpoint().getFolderTabName(), folders);
        }

        return results;
    }

    /**
     * Method to iteratively crawl the Ktree site.
     *
     * The method first queues the configured root folders, then in a loop collects and
     * processes all found subfolders.
     *
     * @param firstPage
     */
    protected void processPage(String firstPage) {

        /** Get the session ID. */
        session = new Session();
        xstream.fromXML(firstPage, session);

        /** Entry is the dashboard. Get the relevant folder. */
        try {
            /** Add root folders to the list of uncrawled folders. */
            for (String folder : getKtreeEndpoint().getInitialFolders()) {
                LOG.info("Queuing root folder '" + folder + "' for processing.");
                uncrawledFolders.put(folder, "/");
            }

            /** Iteratively process all folders. */
            while (uncrawledFolders.size() > 0 && (documents.size() < getKtreeEndpoint().getMaxNumberOfDocuments() || getKtreeEndpoint().getMaxNumberOfDocuments() == -1)) {

                Iterator<Entry<String, String>> it = uncrawledFolders.entrySet().iterator();
                Entry<String, String> entry = it.next();
                String folderId = entry.getKey();
                String path = entry.getValue();
                it.remove();

                LOG.debug("Processing folder '" + folderId + "' with path '" + path + "'.");

                /** Register this folder as crawled. We do this before we crawl, because whether or not
                 * the crawl of the page succeeds, we should not crawl the page again. */
                crawledFolders.add(folderId);

                processFolder(folderId, path);

                sendStatusMessage("Folders crawled: " + crawledFolders.size() + ", Folders pending: " + uncrawledFolders.size() + ", Found documents: " + processed);
                LOG.info("Folders crawled: " + crawledFolders.size() + ", Folders pending: " + uncrawledFolders.size() + ", Processed documents: " + processed + ". Processing folder ID '" + folderId + "'.");

                entriesProcessed++;
            }

            LOG.info("Crawled '" + crawledFolders.size() + "' folders.");
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Process the page of a folder.
     *
     * @param folderId
     * @param path
     * @throws ClientProtocolException
     * @throws IOException
     */
    public void processFolder(String folderId, String path) throws ClientProtocolException, IOException {
        /** Get the page. */
        HttpGet get = new HttpGet(baseRestUrl + "method=get_folder_contents&session_id=" + session.results + "&folder_id=" + folderId);
        HttpResponse response = client.execute(get);
        String page = HttpClientConfigurer.readFully(response.getEntity().getContent());

        /** Detect folders and documents. */
        detectUrls(folderId, page, path);
    }

    protected void detectUrls(String url, String page, String path) {

        /** Read the folder. Bare ampersands are escaped so that the response parses as XML. */
        Folder folder = new Folder();
        try {
            xstream.fromXML(page.replaceAll("&", "&amp;"), folder);
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        folders.add(folder);

        String ignorePattern = getKtreeEndpoint().getIgnorePattern();

        /** Iterate through the items and find all folders. */
        if (folder.results != null && folder.results.items != null && folder.results.items.items != null) {
            for (Item item : folder.results.items.items) {

                if (folder.results.full_path.equals("/")) {
                    item.absoluteFilename = folder.results.full_path + item.filename;
                }
                else {
                    item.absoluteFilename = "/" + folder.results.full_path + "/" + item.filename;
                }

                /** If this is a folder... */
                if (item.item_type.equals("F")) {

                    /** Check if we should ignore the folder. */
                    if (ignorePattern != null) {
                        if (item.absoluteFilename.matches(ignorePattern) == true) {
                            LOG.info("Ignoring folder '" + item.absoluteFilename + "'.");
                            continue;
                        }
                    }

                    if (crawledFolders.contains(item.id) == false) {
                        uncrawledFolders.put(item.id, item.absoluteFilename + "/");
                        LOG.debug("Queuing detected subfolder '" + item.id + "' in path '" + item.absoluteFilename + "' for processing.");
                    }
                    else {
                        LOG.warn("Duplicate folder ID '" + path + "/" + item.id + "'.");
                    }
                }
                else if (getKtreeEndpoint().isDocuments() == true) {

                    /** Get the metadata for this entry. */
                    MetadataItem metadata = new MetadataItem();
                    try {
                        HttpGet get = new HttpGet(baseRestUrl + "method=get_document_metadata&session_id=" + session.results + "&document_id=" + item.id);
                        HttpResponse response = client.execute(get);
                        String page2 = HttpClientConfigurer.readFully(response.getEntity().getContent());

                        /** TODO Do this properly. XStream cannot handle the usage of 'item' in two
                         * places in the response XML schema. Therefore we manually extract the
                         * 'fields' section. */
                        int startIndex = page2.indexOf("<fields>");
                        int endIndex = page2.indexOf("</fields>");
                        if (startIndex != -1) {
                            String substring = "<response>" + page2.substring(startIndex, endIndex + 9) + "</response>";
                            xstream.fromXML(substring.replaceAll("&", "&amp;"), metadata);
                        }
                        else {
                            LOG.warn("Failed to retrieve metadata for document '" + item.id + "'.");
                        }
                    }
                    catch (Exception e) {
                        e.printStackTrace();
                    }

                    /** Set additional item information. */
                    item.inPath = folder.results.full_path;
                    item.inParentFolderId = folder.results.folder_id;
                    item.inParentFolder = folder.results.folder_name;

                    /** Set the metadata. Guard against a failed metadata retrieval. */
                    item.metadata = new HashMap<String, Object>();
                    if (metadata.fields != null) {
                        for (MetadataField field : metadata.fields) {
                            item.metadata.put(field.name, field.value);
                        }
                    }

                    /** Add the document item to the documents found. */
                    documents.add(item);
                    sendStatusMessage("Added document '" + item.absoluteFilename + "'.");
                    LOG.info("Added document '" + item.absoluteFilename + "'.");

                    processed++;
                }
            }
        }
    }

    protected void sendStatusMessage(String message) {
        if (camelContext != null && ((KtreeCrawlerEndpoint) endpoint).isSendStatus() == true) {
            Exchange exchange = new DefaultExchange(camelContext);
            exchange.getIn().setBody(message);
            camelContext.createProducerTemplate().send("direct:status", exchange);
        }
    }
}
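
The accessor relies on XStream annotation processing to turn the KTWebService XML responses into its Session, Folder and Item model objects. Those model classes are not part of this listing, so the following is only a minimal sketch of the mechanism using a simplified, hypothetical response class; the element and field names are illustrative and do not reflect the real KTree schema.

import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.annotations.XStreamAlias;

public class XStreamMappingSketch {

    /** Hypothetical stand-in for the real Session/Folder model classes. */
    @XStreamAlias("response")
    public static class ResponseSketch {
        public String status_code;
        public String message;
    }

    public static void main(String[] args) {
        XStream xstream = new XStream();

        // Register the annotated class, as the accessor does for Session, Folder, Item etc.
        // in its instance initializer block.
        xstream.processAnnotations(ResponseSketch.class);

        String xml = "<response><status_code>0</status_code><message>ok</message></response>";

        // fromXML(String, Object) populates an existing instance; the accessor uses the same
        // call to fill its Session and Folder objects from the web service responses.
        ResponseSketch response = new ResponseSketch();
        xstream.fromXML(xml, response);

        System.out.println(response.message);   // prints "ok"
    }
}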
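
KtreeAccessor is driven by the iSpace HTTP crawler framework through a KtreeCrawlerEndpoint rather than being instantiated directly, and it publishes its status messages to 'direct:status'. A sketch of how such a route might be wired is shown below; the 'ktree:' URI scheme and the option names are assumptions derived from the endpoint getters used above (getUser, getPassword, isDocuments, isFolders), not a documented API.

import org.apache.camel.builder.RouteBuilder;
import org.apache.camel.impl.DefaultCamelContext;

public class KtreeRouteSketch {

    public static void main(String[] args) throws Exception {
        DefaultCamelContext context = new DefaultCamelContext();

        context.addRoutes(new RouteBuilder() {
            @Override
            public void configure() {
                // Status messages sent by KtreeAccessor.sendStatusMessage(...) arrive here.
                from("direct:status").to("log:ktree.status");

                // Hypothetical consumer endpoint; the actual URI scheme and option names are
                // defined by KtreeCrawlerEndpoint, which is not shown in this listing.
                from("ktree://ktree.example.com?user=crawler&password=secret&documents=true&folders=true")
                    .to("log:ktree.results");
            }
        });

        context.start();
        Thread.sleep(60000);   // let the crawler poll for a while
        context.stop();
    }
}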