org.paxle.indexer.impl.IndexerWorker.java Source code

Java tutorial

Introduction

Here is the source code for org.paxle.indexer.impl.IndexerWorker.java

Source

/**
 * This file is part of the Paxle project.
 * Visit http://www.paxle.net for more information.
 * Copyright 2007-2010 the original author or authors.
 *
 * Licensed under the terms of the Common Public License 1.0 ("CPL 1.0").
 * Any use, reproduction or distribution of this program constitutes the recipient's acceptance of this agreement.
 * The full license text is available under http://www.opensource.org/licenses/cpl1.0.txt
 * or in the file LICENSE.txt in the root directory of the Paxle distribution.
 *
 * Unless required by applicable law or agreed to in writing, this software is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

package org.paxle.indexer.impl;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.paxle.core.doc.ICommand;
import org.paxle.core.doc.ICrawlerDocument;
import org.paxle.core.doc.IDocumentFactory;
import org.paxle.core.doc.IIndexerDocument;
import org.paxle.core.doc.IParserDocument;
import org.paxle.core.threading.AWorker;

/**
 * Worker that turns the parser-document of an {@link ICommand} (and all of its nested
 * sub-documents) into {@link IIndexerDocument}s and attaches them to the command.
 *
 * <p>The command's result is set to {@link ICommand.Result#Passed} on success and to
 * {@link ICommand.Result#Failure} if no usable indexer-document could be generated or an
 * unexpected error occurred. Commands that already failed in an earlier stage are skipped
 * with a warning and their result is left untouched.</p>
 */
public class IndexerWorker extends AWorker<ICommand> {

    private final Log logger = LogFactory.getLog(IndexerWorker.class);

    /** Factory used to create the empty {@link IIndexerDocument} instances that are filled here. */
    private final IDocumentFactory idocFactory;

    /**
     * @param idocFactory factory used to create new {@link IIndexerDocument}s
     */
    public IndexerWorker(IDocumentFactory idocFactory) {
        this.idocFactory = idocFactory;
    }

    /**
     * A parser sub-document that still has to be processed, together with its
     * "/"-separated path of sub-document names below the main document.
     */
    private static final class SubDocEntry {
        final String key;
        final IParserDocument pdoc;

        SubDocEntry(final String key, final IParserDocument pdoc) {
            this.key = key;
            this.pdoc = pdoc;
        }
    }

    /**
     * Generates the indexer-documents for the given command.
     *
     * <p>All generated indexer-documents are added to the command in any case (also on errors),
     * and a summary line is logged when processing has finished.</p>
     *
     * @param command the command to process
     */
    @Override
    protected void execute(ICommand command) {
        final long start = System.currentTimeMillis();

        IIndexerDocument indexerDoc = null;
        ArrayList<IIndexerDocument> indexerSubDocs = null;
        try {
            /* ================================================================
             * Input Parameter Check
             * ================================================================ */
            final String errorMsg = checkInput(command);
            if (errorMsg != null) {
                this.logger.warn(errorMsg);
                return;
            }

            final ICrawlerDocument cdoc = command.getCrawlerDocument();
            final IParserDocument pdoc = command.getParserDocument();

            /* ================================================================
             * Generate Indexer Document
             * ================================================================ */

            // generate the "main" indexer document from the "main" parser document including the
            // data from the command object
            if ((pdoc.getFlags() & IParserDocument.FLAG_NOINDEX) == 0) {
                this.logger.debug(String.format("Indexing of URL '%s' (%s) ...", command.getLocation(),
                        cdoc.getMimeType()));

                indexerDoc = this.generateIIndexerDoc(command.getLocation(), cdoc.getCrawlerDate(), null, pdoc);
            } else {
                this.logger.info(String.format("Indexing of URL '%s' (%s) ommitted due to 'noindex'-flag",
                        command.getLocation(), cdoc.getMimeType()));

                // don't exit here already, we still have to process the sub-parser-docs
            }

            // generate indexer docs from all parser-sub-documents; the list is assigned before it is
            // filled so that already generated sub-docs still reach the command if traversal fails
            indexerSubDocs = new ArrayList<IIndexerDocument>();
            this.collectSubDocs(command, indexerSubDocs);

            /* ================================================================
             * Process indexer response
             * ================================================================ */

            /* There may be the case, that - i.e. by a document's and its parser's restriction - the main
             * document, from which the sub-docs are retrieved, may not be indexed, but links, and therefore
             * sub-docs, may be followed.
             * In this case we simply omit the main document. If the document has no children, then the
             * main document is the only thing we need to check for correctness. */
            if (indexerSubDocs.isEmpty()) {
                if (indexerDoc == null) {
                    command.setResult(ICommand.Result.Failure, "Indexer returned no indexer-document.");
                    return;
                } else if (indexerDoc.getStatus() != IIndexerDocument.Status.OK) {
                    command.setResult(ICommand.Result.Failure,
                            String.format("Indexer-document status is '%s'.", indexerDoc.getStatus()));
                    return;
                }
            }

            // indexerDoc is null if the main document carries the noindex-flag but sub-documents were
            // indexed; there is no main indexer-document to annotate in that case
            if (indexerDoc != null) {
                // XXX: what to take if both (pdoc and cdoc) contain a different value for last mod?
                if (cdoc.getLastModDate() != null) {
                    indexerDoc.set(IIndexerDocument.LAST_MODIFIED, cdoc.getLastModDate());
                }
                indexerDoc.set(IIndexerDocument.SIZE, Long.valueOf(cdoc.getSize()));
            }

            // setting command status to passed
            command.setResult(ICommand.Result.Passed);

        } catch (Throwable e) {
            // setting command status
            command.setResult(ICommand.Result.Failure, String.format("Unexpected '%s' while indexing resource. %s",
                    e.getClass().getName(), e.getMessage()));

            // log error
            this.logger.warn(String.format("Unexpected '%s' while indexing resource '%s'.", e.getClass().getName(),
                    command.getLocation()), e);
        } finally {
            /* Add indexer-docs to command-object.
             *
             * This must be done even in error situations to
             * - allow filters to correct the error (if possible)
             * - to report the error back properly (e.g. to store it into db
             *   or send it back to a remote peer).
             */
            if (indexerDoc != null) {
                command.addIndexerDocument(indexerDoc);
            }

            if (indexerSubDocs != null) {
                // get all indexer-sub-docs and add them to the command
                for (IIndexerDocument indexerSubDoc : indexerSubDocs) {
                    // XXX: do sub-docs need a size-field, too?
                    command.addIndexerDocument(indexerSubDoc);
                }
            }

            this.logCompletion(command, indexerDoc, start);
        }
    }

    /**
     * Checks whether the command is in a state that allows indexing.
     *
     * @param command the command to check
     * @return a message describing why the resource will not be indexed, or <code>null</code>
     *         if the command, its crawler-document and its parser-document are all present and OK
     */
    private static String checkInput(final ICommand command) {
        if (command.getResult() != ICommand.Result.Passed) {
            return String.format("Won't index resource '%s'. Command status is: '%s' (%s)",
                    command.getLocation(), command.getResult(), command.getResultText());
        }

        final ICrawlerDocument cdoc = command.getCrawlerDocument();
        if (cdoc == null) {
            return String.format("Won't index resource '%s'. Crawler-document is null",
                    command.getLocation());
        }
        if (cdoc.getStatus() != ICrawlerDocument.Status.OK) {
            return String.format("Won't index resource '%s'. Crawler-document status is: '%s' (%s)",
                    command.getLocation(), cdoc.getStatus(), cdoc.getStatusText());
        }

        final IParserDocument pdoc = command.getParserDocument();
        if (pdoc == null) {
            return String.format("Won't index resource '%s'. Parser-document is null",
                    command.getLocation());
        }
        if (pdoc.getStatus() != IParserDocument.Status.OK) {
            // report the parser-document's own status here, not the crawler-document's
            return String.format("Won't index resource '%s'. Parser-document status is: '%s' (%s)",
                    command.getLocation(), pdoc.getStatus(), pdoc.getStatusText());
        }

        return null;
    }

    /**
     * Traverses the tree of parser sub-documents breadth-first and generates an indexer-document
     * for every sub-document that is not flagged with {@link IParserDocument#FLAG_NOINDEX}.
     * Children of a sub-document are visited even if the sub-document itself is not indexed.
     *
     * @param command the command whose parser-document's sub-documents are processed
     * @param target  list receiving the generated indexer-documents
     * @throws IOException propagated from {@link #generateIIndexerDoc(URI, Date, String, IParserDocument)}
     */
    private void collectSubDocs(final ICommand command, final Collection<IIndexerDocument> target)
            throws IOException {
        final Queue<SubDocEntry> queue = new LinkedList<SubDocEntry>();
        for (final Map.Entry<String, IParserDocument> child : command.getParserDocument().getSubDocs().entrySet()) {
            queue.add(new SubDocEntry(child.getKey(), child.getValue()));
        }

        while (!queue.isEmpty()) {
            final SubDocEntry current = queue.remove();
            if ((current.pdoc.getFlags() & IParserDocument.FLAG_NOINDEX) == 0) {
                target.add(this.generateIIndexerDoc(command.getLocation(),
                        command.getCrawlerDocument().getCrawlerDate(), current.key, current.pdoc));
            }

            // nested sub-documents get a path-like name: "<parent>/<child>"
            for (final Map.Entry<String, IParserDocument> child : current.pdoc.getSubDocs().entrySet()) {
                queue.add(new SubDocEntry(current.key + "/" + child.getKey(), child.getValue()));
            }
        }
    }

    /**
     * Logs the summary line(s) for a processed command. If debug-logging is enabled the status of
     * the crawler-, parser- and indexer-document is logged, otherwise (if info-logging is enabled)
     * only the indexer-document status. Both variants are written with level info.
     *
     * @param command    the processed command
     * @param indexerDoc the main indexer-document, may be <code>null</code>
     * @param start      timestamp (ms) at which processing of the command started
     */
    private void logCompletion(final ICommand command, final IIndexerDocument indexerDoc, final long start) {
        final String indexerStatus = statusName((indexerDoc == null) ? null : indexerDoc.getStatus());
        final String indexerText = (indexerDoc == null) ? "" : textOrEmpty(indexerDoc.getStatusText());

        if (logger.isDebugEnabled()) {
            final ICrawlerDocument crawlerDoc = command.getCrawlerDocument();
            final IParserDocument parserDoc = command.getParserDocument();

            this.logger.info(String.format(
                    "Finished indexing of resource '%s' in %d ms.\r\n" + "\tCrawler-Status: '%s' %s\r\n"
                            + "\tParser-Status:  '%s' %s\r\n" + "\tIndexer-Status: '%s' %s",
                    command.getLocation(), Long.valueOf(System.currentTimeMillis() - start),
                    statusName((crawlerDoc == null) ? null : crawlerDoc.getStatus()),
                    (crawlerDoc == null) ? "" : textOrEmpty(crawlerDoc.getStatusText()),
                    statusName((parserDoc == null) ? null : parserDoc.getStatus()),
                    (parserDoc == null) ? "" : textOrEmpty(parserDoc.getStatusText()),
                    indexerStatus, indexerText));
        } else if (logger.isInfoEnabled()) {
            this.logger.info(String.format(
                    "Finished indexing of resource '%s' in %d ms.\r\n" + "\tIndexer-Status: '%s' %s",
                    command.getLocation(), Long.valueOf(System.currentTimeMillis() - start),
                    indexerStatus, indexerText));
        }
    }

    /**
     * @param status a status value, may be <code>null</code>
     * @return the string form of the status, or <code>"unknown"</code> if it is <code>null</code>;
     *         null-safe because it is used while logging from a <code>finally</code> block
     */
    private static String statusName(final Object status) {
        return (status == null) ? "unknown" : status.toString();
    }

    /**
     * @param text a status text, may be <code>null</code>
     * @return the given text, or the empty string if it is <code>null</code>
     */
    private static String textOrEmpty(final String text) {
        return (text == null) ? "" : text;
    }

    /**
     * Creates a new indexer-document and fills it with the data of the given parser-document.
     *
     * <p>Errors that occur while copying the fields are logged and reported via the status of the
     * returned document ({@link IIndexerDocument.Status#IOError} for {@link IOException}s,
     * {@link IIndexerDocument.Status#IndexerError} otherwise); on success the status is
     * {@link IIndexerDocument.Status#OK}.</p>
     *
     * @param location    the location of the main resource
     * @param lastCrawled the date the resource was crawled; if <code>null</code> the current time is used
     * @param name        the internal name of the sub-document, or <code>null</code> for the main document
     * @param pdoc        the parser-document to read the data from
     * @return the generated indexer-document, never <code>null</code>
     * @throws IOException declared for callers; NOTE(review): presumably raised by the document
     *         factory, which is invoked outside the internal try-block - confirm
     */
    private IIndexerDocument generateIIndexerDoc(final URI location, final Date lastCrawled, final String name,
            final IParserDocument pdoc) throws IOException {
        final IIndexerDocument idoc = this.idocFactory.createDocument(IIndexerDocument.class);
        try {
            final Collection<String> kw = pdoc.getKeywords();
            final Set<String> lng = pdoc.getLanguages();

            final String protocol = location.getScheme();
            final File textFile = pdoc.getTextFile();

            // optional fields are only set if the parser-document provides a value for them
            if (pdoc.getAuthor() != null) {
                idoc.set(IIndexerDocument.AUTHOR, pdoc.getAuthor());
            }
            if (name != null) {
                idoc.set(IIndexerDocument.INTERNAL_NAME, name);
            }
            if (kw != null && kw.size() > 0) {
                idoc.set(IIndexerDocument.KEYWORDS, kw.toArray(new String[kw.size()]));
            }
            if (lng != null && lng.size() > 0) {
                idoc.set(IIndexerDocument.LANGUAGES, lng.toArray(new String[lng.size()]));
            }
            idoc.set(IIndexerDocument.LAST_CRAWLED,
                    (lastCrawled == null) ? new Date(System.currentTimeMillis()) : lastCrawled);
            if (pdoc.getLastChanged() != null) {
                idoc.set(IIndexerDocument.LAST_MODIFIED, pdoc.getLastChanged());
            }
            idoc.set(IIndexerDocument.LOCATION, location.toString());
            idoc.set(IIndexerDocument.MIME_TYPE, pdoc.getMimeType());
            if (protocol != null) {
                idoc.set(IIndexerDocument.PROTOCOL, protocol);
            }
            if (pdoc.getSummary() != null) {
                idoc.set(IIndexerDocument.SUMMARY, pdoc.getSummary());
            }
            if (pdoc.getTitle() != null) {
                idoc.set(IIndexerDocument.TITLE, pdoc.getTitle());
            }
            if (textFile != null) {
                idoc.set(IIndexerDocument.TEXT, textFile);
            }
            // TODO: IIndexerDocument.TOPICS

            idoc.setStatus(IIndexerDocument.Status.OK);
        } catch (Exception e) {
            this.logger.warn(
                    "Unable to index the sub-document '" + name + "' of '" + location + "': " + e.getMessage(), e);
            idoc.setStatus((e instanceof IOException) ? IIndexerDocument.Status.IOError
                    : IIndexerDocument.Status.IndexerError, e.getMessage());
        }
        return idoc;
    }
}