org.eclipse.smila.connectivity.framework.crawler.web.WebCrawler.java Source code

Java tutorial

Introduction

Here is the source code for org.eclipse.smila.connectivity.framework.crawler.web.WebCrawler.java

Source

/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andrey Basalaev (brox IT Solutions GmbH) - initial creator, Ivan Churkin (brox IT Solutions GmbH)
 * Sebastian Voigt (brox IT Solutions GmbH)
 **********************************************************************************************************************/
package org.eclipse.smila.connectivity.framework.crawler.web;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.connectivity.ConnectivityId;
import org.eclipse.smila.connectivity.framework.AbstractCrawler;
import org.eclipse.smila.connectivity.framework.CrawlerCriticalException;
import org.eclipse.smila.connectivity.framework.CrawlerException;
import org.eclipse.smila.connectivity.framework.DataReference;
import org.eclipse.smila.connectivity.framework.crawler.web.messages.Attribute;
import org.eclipse.smila.connectivity.framework.crawler.web.messages.FieldAttributeType;
import org.eclipse.smila.connectivity.framework.crawler.web.messages.MetaReturnType;
import org.eclipse.smila.connectivity.framework.crawler.web.messages.MetaType;
import org.eclipse.smila.connectivity.framework.crawler.web.messages.Process;
import org.eclipse.smila.connectivity.framework.crawler.web.messages.WebSite;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.ParserManager;
import org.eclipse.smila.connectivity.framework.performancecounters.CrawlerPerformanceCounterHelper;
import org.eclipse.smila.connectivity.framework.schema.config.DataSourceConnectionConfig;
import org.eclipse.smila.connectivity.framework.schema.config.DataSourceConnectionConfig.Attributes;
import org.eclipse.smila.connectivity.framework.schema.config.interfaces.IAttribute;
import org.eclipse.smila.connectivity.framework.util.DataReferenceFactory;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.DataFactoryCreator;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.utils.collections.NameValuePair;
import org.eclipse.smila.utils.workspace.WorkspaceHelper;

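/**
 * Crawler implementation for web sites. A background producer thread walks the configured web sites and queues one
 * record per fetched document; the framework pulls these records in batches via {@link #getNext()} and completes them
 * on demand through the callback methods ({@link #getMetadata(ConnectivityId)} and friends).
 */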
public class WebCrawler extends AbstractCrawler {

    /**
     * Counter name "bytes". Not referenced in this class; presumably a performance counter updated while fetching
     * pages - TODO confirm against the code that uses it.
     */
    public static final String POC_BYTES = "bytes";

    /**
     * Counter name "pages". Not referenced in this class; presumably a performance counter updated while fetching
     * pages - TODO confirm against the code that uses it.
     */
    public static final String POC_PAGES = "pages";

    /**
     * Name of the performance counter that the producer thread increments whenever it hits an exception.
     */
    public static final String POC_PRODUCER_EXCEPTIONS = "producerExceptions";

    /**
     * Counter name "averageHttpFetchTime". Not referenced in this class. (The identifier contains a typo, "AVEREGE",
     * which is kept because the constant is public.)
     */
    public static final String POC_AVEREGE_TIME_TO_FETCH = "averageHttpFetchTime";

    /**
     * Bundle name passed to the workspace helper when the working directory of this crawler is created.
     */
    private static final String BUNDLE_NAME = "org.eclipse.smila.connectivity.framework.crawler.web";

    /**
     * Charset name used to encode string attachments and as fallback when a document declares no charset.
     */
    private static final String UTF_8 = "utf-8";

    /**
     * Separator between metadata name and value, eg. Server: example.com
     */
    private static final char METADATA_SEPARATOR = ':';

    /**
     * Maximum time in milliseconds that {@link #getNext()} waits for a record when polling the queue.
     */
    private static final int QUEUE_POLL_WAITING = 300;

    /**
     * Sleep interval in milliseconds between two checks of the queue while the producer is still running.
     */
    private static final int HAS_NEXT_WAITING = 50;

    /**
     * Capacity of the record queue between producer thread and consumer.
     */
    private static final int CAPACITY = 100;

    /**
     * Maximum number of data references returned by a single {@link #getNext()} call.
     */
    private static final int STEP = 10;

    /**
     * Logger of this crawler.
     */
    private final Log _log = LogFactory.getLog(WebCrawler.class);

    /**
     * Bounded queue through which the producer thread hands records to {@link #getNext()}. Volatile because
     * {@link #close()} resets the reference while the producer thread may still read it.
     */
    private volatile ArrayBlockingQueue<Record> _queue;

    /**
     * Id of the data source being crawled, taken from the configuration.
     */
    private String _dataSourceID;

    /**
     * The attributes configured for the data source.
     */
    private Attribute[] _attributes;

    /**
     * The producer thread that crawls the web sites.
     */
    private CrawlingProducerThread _crawlThread;

    /**
     * Iterator over the configured web sites.
     */
    private Iterator<WebSite> _webSites;

    /**
     * Iterator over the documents of the web site that is currently crawled.
     */
    private WebSiteIterator _webSiteIterator;

    /**
     * Whether the crawler is currently initialized; guarded by {@link #_openedMonitor}.
     */
    private boolean _opened;

    /**
     * Asks the producer thread to stop. Written by the closing thread and read by the producer thread, hence
     * volatile.
     */
    private volatile boolean _forceClosing;

    /**
     * Whether the producer thread is still running. Written by the producer thread and polled by the consumer, hence
     * volatile.
     */
    private volatile boolean _producerRunning;

    /**
     * Monitor guarding {@link #_opened}.
     */
    private final Object _openedMonitor = new Object();

    /**
     * Factory for records and metadata values.
     */
    private final DataFactory _factory = DataFactoryCreator.createDefaultFactory();

    /**
     * Canonical path of the working directory that holds the serialized index documents.
     */
    private String _workspace;

    /**
     * Critical error recorded while switching web sites (possibly on the producer thread); rethrown to the consumer,
     * hence volatile.
     */
    private volatile CrawlerCriticalException _criticalException;

    /**
     * Helper giving access to the performance counters of this crawl run.
     */
    private CrawlerPerformanceCounterHelper<WebCrawlerPerformanceAgent> _performanceCounters;

    /**
     * Regex pattern for extracting charset information from the Content-Type header.
     */
    private final Pattern _contentTypePattern = Pattern
            .compile("^CONTENT-TYPE\\s*:\\s*(?:.|\\s)*CHARSET\\s*=\\s*([\\w-]*)", Pattern.CASE_INSENSITIVE);

    /**
     * Regex pattern for extracting mime type information from the Content-Type header.
     */
    private final Pattern _mimeTypePattern = Pattern
            .compile("^CONTENT-TYPE\\s*:\\s*([^\\s;]+)(\\s*;?.*)$", Pattern.CASE_INSENSITIVE);

    /**
     * Webcrawler parsers manager.
     */
    private ParserManager _parserManager;

    /**
     * Map containing Records with properties that are only required for creating DataReference.
     */
    private HashMap<ConnectivityId, Record> _dataReferenceRecords = new HashMap<ConnectivityId, Record>();

    /**
     * Map containing records with all properties.
     */
    private HashMap<ConnectivityId, Record> _records = new HashMap<ConnectivityId, Record>();

    /**
     * Creates an uninitialized crawler; the crawl state is set up by
     * {@link #initialize(DataSourceConnectionConfig)}.
     */
    public WebCrawler() {
        // the logger checks the debug level itself, so no explicit guard is needed for a constant message
        _log.debug("Creating WebCrawler instance");
    }

    /*
     * ******************** implementation of interface Crawler ********************
     */

    /**
     * Marks the crawler as opened, resets the per-run state from the given configuration, prepares the working
     * directory and the first web site, and starts the producer thread.
     *
     * @param config
     *          the data source configuration to crawl
     * @throws CrawlerException
     *           declared by the interface
     * @throws CrawlerCriticalException
     *           if the crawler is already opened or the workspace cannot be initialized
     */
    @Override
    public void initialize(final DataSourceConnectionConfig config)
            throws CrawlerException, CrawlerCriticalException {
        _log.info("Initializing WebCrawler...");
        synchronized (_openedMonitor) {
            if (_opened) {
                throw new CrawlerCriticalException(
                        "Crawler is busy (it should not happen because new instances are created by ComponentFactories)");
            }
            _opened = true;
        }

        // fresh bookkeeping for this crawl run
        _dataReferenceRecords = new HashMap<ConnectivityId, Record>();
        _records = new HashMap<ConnectivityId, Record>();
        _performanceCounters = new CrawlerPerformanceCounterHelper<WebCrawlerPerformanceAgent>(config, hashCode(),
                WebCrawlerPerformanceAgent.class);
        _queue = new ArrayBlockingQueue<Record>(CAPACITY);
        _forceClosing = false;
        _producerRunning = true;

        // configuration
        _dataSourceID = config.getDataSourceID();
        final List<IAttribute> configuredAttributes = config.getAttributes().getAttribute();
        _attributes = configuredAttributes.toArray(new Attribute[configuredAttributes.size()]);
        _webSites = ((Process) config.getProcess()).getWebSite().iterator();

        // the workspace must exist before the producer starts serializing documents into it
        initDataFolder();
        initializeNextSite();

        final CrawlingProducerThread producer = new CrawlingProducerThread();
        _crawlThread = producer;
        producer.start();

        _log.debug("WebCrawler indexer started");
    }

    /**
     * Returns the next batch of at most {@link #STEP} data references, or {@code null} when the producer has
     * finished and the queue is empty. Each returned reference is remembered together with its record so that the
     * callback methods can complete the record later.
     *
     * @return the next data references or {@code null} if there are none left
     * @throws CrawlerException
     *           declared by the interface
     * @throws CrawlerCriticalException
     *           if the producer recorded a critical error
     */
    @Override
    public DataReference[] getNext() throws CrawlerException, CrawlerCriticalException {
        rethrowProducerExceptions();
        while (hasNext()) {
            rethrowProducerExceptions();
            try {
                final Record head = _queue.poll(QUEUE_POLL_WAITING, TimeUnit.MILLISECONDS);
                if (head == null) {
                    // nothing arrived within the timeout; ask hasNext() again
                    continue;
                }
                final List<Record> batch = new ArrayList<Record>();
                batch.add(head);
                _queue.drainTo(batch, STEP - 1);

                final DataReference[] references = new DataReference[batch.size()];
                int position = 0;
                for (final Record record : batch) {
                    final AnyMap idAttributes = _factory.createAnyMap();
                    final AnyMap hashAttributes = _factory.createAnyMap();
                    getIdAndHashAttributes(record.getMetadata(), idAttributes, hashAttributes);
                    references[position] = DataReferenceFactory.getInstance().createDataReference(this,
                            _dataSourceID, idAttributes, hashAttributes, getHashAttachments(record));
                    _dataReferenceRecords.put(references[position].getId(), record);
                    position++;
                }
                return references;
            } catch (final InterruptedException e) {
                // interruption is ignored here; the loop simply asks hasNext() again
            }
        }
        return null;
    }

    /**
     * Stops the producer thread and releases all state of the current crawl run, including the working directory.
     * <p>
     * The producer is stopped <em>before</em> any shared state is released: it is asked to stop via the force-closing
     * flag and interrupted, so that it also wakes up when it is blocked while putting a record into a full queue
     * (nobody drains the queue any more once the crawler is being closed). Calling this method more than once is
     * harmless.
     *
     * @throws CrawlerException
     *           declared by the interface
     */
    @Override
    public void close() throws CrawlerException {
        synchronized (_openedMonitor) {
            _opened = false;
            _log.info("Closing WebCrawler...");

            // 1. stop the producer while queue and counters are still in place
            _forceClosing = true;
            final CrawlingProducerThread crawlThread = _crawlThread;
            if (crawlThread != null) {
                crawlThread.interrupt();
                try {
                    crawlThread.join();
                } catch (final InterruptedException e) {
                    // keep the interruption visible to the caller, but still release the resources below
                    Thread.currentThread().interrupt();
                }
                _crawlThread = null;
            }

            // 2. release the state; the maps may already be null if close() was called before
            _queue = null;
            if (_records != null) {
                _records.clear();
                _records = null;
            }
            if (_dataReferenceRecords != null) {
                _dataReferenceRecords.clear();
                _dataReferenceRecords = null;
            }
            _performanceCounters = null;
            _dataSourceID = null;
            _attributes = null;
            _criticalException = null;

            // 3. remove the serialized documents
            if (_workspace != null) {
                FileUtils.deleteQuietly(new File(_workspace));
            }
        }
    }

    /*
     * ******************** implementation of interface CrawlerCallback ********************
     */

    /**
     * Returns the metadata of the completed record that belongs to the given id.
     *
     * @param id
     *          the id of a data reference returned by {@link #getNext()}
     * @return the metadata of the record
     * @throws CrawlerException
     *           wraps any exception raised while completing the record
     * @throws CrawlerCriticalException
     *           declared by the interface
     */
    @Override
    public AnyMap getMetadata(final ConnectivityId id) throws CrawlerException, CrawlerCriticalException {
        final Record completed;
        try {
            completed = getRecord(id);
        } catch (final Exception cause) {
            throw new CrawlerException(cause);
        }
        return completed.getMetadata();
    }

    /**
     * Returns one attachment of the completed record that belongs to the given id.
     *
     * @param id
     *          the id of a data reference returned by {@link #getNext()}
     * @param name
     *          the attachment name
     * @return the attachment bytes as stored in the record
     * @throws CrawlerException
     *           wraps any exception raised while completing the record
     * @throws CrawlerCriticalException
     *           declared by the interface
     */
    @Override
    public byte[] getAttachment(final ConnectivityId id, final String name)
            throws CrawlerException, CrawlerCriticalException {
        final Record completed;
        try {
            completed = getRecord(id);
        } catch (final Exception cause) {
            throw new CrawlerException(cause);
        }
        return completed.getAttachment(name);
    }

    /**
     * Returns the names of all attachments of the completed record that belongs to the given id.
     *
     * @param id
     *          the id of a data reference returned by {@link #getNext()}
     * @return the attachment names, possibly empty
     * @throws CrawlerException
     *           wraps any exception raised while completing the record
     * @throws CrawlerCriticalException
     *           declared by the interface
     */
    @Override
    public String[] getAttachmentNames(final ConnectivityId id) throws CrawlerException, CrawlerCriticalException {
        final Record completed;
        try {
            completed = getRecord(id);
        } catch (final Exception cause) {
            throw new CrawlerException(cause);
        }
        final ArrayList<String> collected = new ArrayList<String>();
        for (final Iterator<String> names = completed.getAttachmentNames(); names.hasNext();) {
            collected.add(names.next());
        }
        return collected.toArray(new String[collected.size()]);
    }

    /**
     * Forgets the record that belongs to the given id, whether it has already been completed or not.
     *
     * @param id
     *          the id of the record to drop
     */
    @Override
    public void dispose(final ConnectivityId id) {
        // the two maps are independent, so the order of removal does not matter
        _records.remove(id);
        _dataReferenceRecords.remove(id);
    }

    /* ******************** private stuff ******************** */

    /**
     * Copies the values of all configured key attributes into {@code idAttributes} and of all configured hash
     * attributes into {@code hashAttributes}. Attachment attributes are skipped; an attribute that is both key and
     * hash attribute is treated as key attribute only.
     *
     * @param metadata
     *          metadata to read the values from
     * @param idAttributes
     *          map receiving the key attribute values
     * @param hashAttributes
     *          map receiving the hash attribute values
     */
    private void getIdAndHashAttributes(final AnyMap metadata, final AnyMap idAttributes,
            final AnyMap hashAttributes) {
        for (int i = 0; i < _attributes.length; i++) {
            final Attribute current = _attributes[i];
            if (current.isAttachment()) {
                continue;
            }
            final AnyMap target;
            if (current.isKeyAttribute()) {
                target = idAttributes;
            } else if (current.isHashAttribute()) {
                target = hashAttributes;
            } else {
                continue;
            }
            target.put(current.getName(), metadata.get(current.getName()));
        }
    }

    /**
     * Collects all attachments of the given record into a map from attachment name to content.
     *
     * @param record
     *          the record to read the attachments from
     * @return the attachments by name, or {@code null} if the record has no attachments
     */
    private Map<String, byte[]> getHashAttachments(final Record record) {
        if (!record.hasAttachments()) {
            return null;
        }
        final Map<String, byte[]> attachments = new HashMap<String, byte[]>();
        final Iterator<String> names = record.getAttachmentNames();
        while (names.hasNext()) {
            final String name = names.next();
            attachments.put(name, record.getAttachment(name));
        }
        return attachments;
    }

    /**
     * Reads the value of the given attribute from the index document and stores it in the record: as attachment
     * (bytes) for attachment attributes, otherwise as metadata value. Nothing is stored if the document yields no
     * value.
     *
     * @param record
     *          the record to fill
     * @param indexDocument
     *          the document to read the value from
     * @param attribute
     *          the attribute to set
     * @throws UnsupportedEncodingException
     *           if the content cannot be decoded or encoded
     */
    private void setAttribute(final Record record, final IndexDocument indexDocument, final Attribute attribute)
            throws UnsupportedEncodingException {
        final String name = attribute.getName();
        final AnyMap metadata = record.getMetadata();
        final boolean asAttachment = attribute.isAttachment();

        // attachments keep raw bytes, metadata values need the content decoded to a string
        final Object value = readAttribute(indexDocument, attribute, !asAttachment);
        if (value == null) {
            return;
        }

        if (asAttachment) {
            // TODO serialization to byte[] for other types of attachments.
            if (value instanceof byte[]) {
                record.setAttachment(name, (byte[]) value);
            } else if (value instanceof String) {
                record.setAttachment(name, ((String) value).getBytes(UTF_8));
            } else {
                throw new RuntimeException("Unknown attachment type!");
            }
            return;
        }

        final Any converted;
        if (value instanceof NameValuePair[]) {
            // name/value pairs become a map of string values
            final NameValuePair[] pairs = (NameValuePair[]) value;
            final AnyMap map = _factory.createAnyMap();
            for (int i = 0; i < pairs.length; i++) {
                map.put(pairs[i].getName(), _factory.createStringValue(pairs[i].getValue()));
            }
            converted = map;
        } else if (value instanceof Object[]) {
            // any other array becomes a sequence
            final Object[] elements = (Object[]) value;
            final AnySeq sequence = _factory.createAnySeq();
            for (int i = 0; i < elements.length; i++) {
                sequence.add(_factory.parseFromObject(elements[i]));
            }
            converted = sequence;
        } else {
            converted = _factory.parseFromObject(value);
        }
        metadata.put(name, converted);
    }

    /**
     * Switches to the next configured web site, if any. If the new site yields no document at all, the crawl is
     * aborted: the force-closing flag is set and a critical exception is recorded for the consumer.
     *
     * @return true if a next site exists and has at least one document
     * @throws CrawlerCriticalException
     *           declared for callers; the failure itself is recorded, not thrown
     */
    private boolean initializeNextSite() throws CrawlerCriticalException {
        _webSiteIterator = null;
        if (!_webSites.hasNext()) {
            return false;
        }
        final WebSite nextSite = _webSites.next();
        _webSiteIterator = new WebSiteIterator(nextSite, _parserManager, _performanceCounters);
        if (_webSiteIterator.hasNext()) {
            return true;
        }
        _forceClosing = true;
        _criticalException = new CrawlerCriticalException(
                "Unable to connect to web site specified in project " + nextSite.getProjectName());
        return false;
    }

    /**
     * Throws the critical exception recorded during crawling, if there is one.
     *
     * @throws CrawlerCriticalException
     *           the recorded exception
     */
    private void rethrowProducerExceptions() throws CrawlerCriticalException {
        final CrawlerCriticalException recorded = _criticalException;
        if (recorded == null) {
            return;
        }
        throw recorded;
    }

    /**
     * Waits until the queue contains a record or the producer has stopped, then reports whether a record is
     * available.
     *
     * @return true if the queue is not empty
     * @throws CrawlerCriticalException
     *           if the producer recorded a critical error
     */
    private boolean hasNext() throws CrawlerCriticalException {
        for (;;) {
            if (!_producerRunning || !_queue.isEmpty()) {
                break;
            }
            try {
                Thread.sleep(HAS_NEXT_WAITING);
            } catch (final InterruptedException e) {
                // interruption is ignored here; the loop condition is simply evaluated again
            }
        }
        rethrowProducerExceptions();
        final boolean queueEmpty = _queue.isEmpty();
        return !queueEmpty;
    }

    /**
     * Returns the completed record for the given id. On first access the record created for the data reference is
     * filled with all attributes that are neither key nor hash attributes, read from the serialized index document,
     * and is moved to the map of completed records.
     *
     * @param id
     *          the id of a data reference returned by {@link #getNext()}
     * @return the completed record
     * @throws IOException
     *           if an attribute value cannot be decoded
     * @throws CrawlerException
     *           if no record is known for the id, the record has no URL to locate its serialized document, or the
     *           document cannot be deserialized
     */
    private Record getRecord(final ConnectivityId id) throws IOException, CrawlerException {
        final Record completed = _records.get(id);
        if (completed != null) {
            return completed;
        }

        final Record record = _dataReferenceRecords.get(id);
        if (record == null) {
            // fail with a meaningful message instead of a NullPointerException
            throw new CrawlerException("No record available for id " + id
                    + ": it was not returned by getNext() or has already been disposed");
        }

        final AnyMap metadata = record.getMetadata();
        IndexDocument indexDocument = null;
        for (final Attribute attribute : _attributes) {
            if (!(attribute.isHashAttribute() || attribute.isKeyAttribute())) {
                if (indexDocument == null) {
                    // the serialized document is loaded lazily, only if at least one attribute needs it
                    final String url = metadata.getStringValue(FieldAttributeType.URL.value());
                    if (url == null) {
                        throw new CrawlerException(
                                "Record for id " + id + " has no URL, unable to locate its cached document");
                    }
                    indexDocument = deserializeIndexDocument(DigestUtils.md5Hex(url));
                }
                setAttribute(record, indexDocument, attribute);
            }
        }

        if (_log.isDebugEnabled()) {
            _log.debug("Created record for url: " + metadata.getStringValue(FieldAttributeType.URL.value()));
        }

        _records.put(id, record);
        _dataReferenceRecords.remove(id);
        return record;
    }

    /**
     * Creates the reduced record used for building a data reference: it contains only the key attributes, the hash
     * attributes and the attribute mapped to the document URL.
     *
     * @param indexDocument
     *          the document to read the values from
     * @return the new record
     * @throws IOException
     *           if an attribute value cannot be decoded
     */
    private Record createDataReferenceRecord(final IndexDocument indexDocument) throws IOException {
        final Record reduced = _factory.createRecord();
        for (int i = 0; i < _attributes.length; i++) {
            final Attribute candidate = _attributes[i];
            final boolean needed;
            if (candidate.isKeyAttribute() || candidate.isHashAttribute()) {
                needed = true;
            } else {
                // the URL is required later to find the serialized document again
                needed = candidate.getFieldAttribute() != null
                        && candidate.getFieldAttribute().equals(FieldAttributeType.URL);
            }
            if (needed) {
                setAttribute(reduced, indexDocument, candidate);
            }
        }
        return reduced;
    }

    /**
     * Reads the value of the given attribute from the index document. Field attributes map to URL, content, title
     * or mime type of the document; meta attributes map to (optionally filtered) HTML metadata and/or response
     * headers.
     *
     * @param indexDocument
     *          the index document
     * @param attribute
     *          the attribute
     * @param forceByteToString
     *          if true the content is returned as decoded string, otherwise as utf-8 bytes
     * @return the attribute value, may be null
     * @throws UnsupportedEncodingException
     *           if the charset declared by the document is not supported
     * @throws IllegalArgumentException
     *           if the attribute is neither a known field attribute nor a known meta attribute
     */
    private Serializable readAttribute(final IndexDocument indexDocument, final Attribute attribute,
            final boolean forceByteToString) throws UnsupportedEncodingException {
        if (attribute.getFieldAttribute() != null) {
            switch (attribute.getFieldAttribute()) {
            case URL:
                return indexDocument.getUrl();
            case CONTENT:
                return readContent(indexDocument, forceByteToString);
            case TITLE:
                return indexDocument.getTitle();
            case MIME_TYPE:
                return indexDocument.extractFromResponseHeaders(_mimeTypePattern, 1);
            default:
                throw new IllegalArgumentException("Unknown field attribute type " + attribute.getFieldAttribute());
            }
        }
        if (attribute.getMetaAttribute() != null) {
            return readMetaAttribute(indexDocument, attribute);
        }
        throw new IllegalArgumentException("Unknown attribute " + attribute.getName());
    }

    /**
     * Returns the document content, either decoded to a string or re-encoded as utf-8 bytes. The charset is taken
     * from the Content-Type response header; utf-8 is assumed if the header declares none (or an empty one).
     */
    private Serializable readContent(final IndexDocument indexDocument, final boolean forceByteToString)
            throws UnsupportedEncodingException {
        String charsetName = indexDocument.extractFromResponseHeaders(_contentTypePattern, 1);
        if (charsetName == null || charsetName.length() == 0) {
            charsetName = UTF_8;
        }
        if (forceByteToString) {
            return new String(indexDocument.getContent(), charsetName);
        }
        if (UTF_8.equalsIgnoreCase(charsetName)) {
            return indexDocument.getContent();
        }
        // re-encode to utf-8; an unsupported charset propagates with the offending charset name as message
        return new String(indexDocument.getContent(), charsetName).getBytes(UTF_8);
    }

    /**
     * Returns the metadata entries selected by the meta attribute, converted to the configured return type. The
     * lists obtained from the index document are never modified.
     */
    private Serializable readMetaAttribute(final IndexDocument indexDocument, final Attribute attribute) {
        final MetaType metaType = attribute.getMetaAttribute().getType();
        final List<String> metaNames = attribute.getMetaAttribute().getMetaName();
        final List<String> metaData;
        switch (metaType) {
        case META_DATA:
            metaData = getFilteredMetadataList(indexDocument.getHtmlMetaData(), metaNames);
            break;
        case RESPONSE_HEADER:
            metaData = getFilteredMetadataList(indexDocument.getResponseHeaders(), metaNames);
            break;
        case META_DATA_WITH_RESPONSE_HEADER_FALL_BACK:
            metaData = getFilteredMetadataList(indexDocument.getMetaDataWithResponseHeaderFallBack(), metaNames);
            break;
        default:
            throw new IllegalArgumentException("Unknown meta attribute type " + metaType);
        }

        final MetaReturnType returnType = attribute.getMetaAttribute().getReturnType();
        switch (returnType) {
        case META_DATA_STRING:
            return metaData.toArray();
        case META_DATA_VALUE:
            // collect into a new list: metaData may be the document's own list when no filter is configured
            final List<String> values = new ArrayList<String>(metaData.size());
            for (final String entry : metaData) {
                values.add(entry.substring(entry.indexOf(METADATA_SEPARATOR) + 1).trim());
            }
            return values.toArray();
        case META_DATA_M_OBJECT:
            final List<NameValuePair> pairs = new ArrayList<NameValuePair>(metaData.size());
            for (final String entry : metaData) {
                final int separatorIndex = entry.indexOf(METADATA_SEPARATOR);
                if (separatorIndex < 0) {
                    // entries without separator have no name; the filtering code ignores them as well
                    continue;
                }
                pairs.add(new NameValuePair(entry.substring(0, separatorIndex).trim(),
                        entry.substring(separatorIndex + 1).trim()));
            }
            return pairs.toArray(new NameValuePair[pairs.size()]);
        default:
            throw new IllegalArgumentException("Unknown meta attribute return type " + returnType);
        }
    }

    /**
     * Returns the entries of {@code list} whose name (the trimmed text before the first separator) equals one of
     * the given filter names. Entries without a name are dropped. If no filters are given, the list itself is
     * returned unchanged.
     *
     * @param list
     *          metadata entries of the form "name: value"
     * @param filters
     *          the metadata names to keep
     * @return the filtered metadata list
     */
    private List<String> getFilteredMetadataList(final List<String> list, final List<String> filters) {
        if (filters.isEmpty()) {
            return list;
        }
        final List<String> kept = new ArrayList<String>();
        for (final String entry : list) {
            final int separatorIndex = entry.indexOf(METADATA_SEPARATOR);
            if (separatorIndex <= 0) {
                continue;
            }
            final String entryName = entry.substring(0, separatorIndex).trim();
            // every matching filter adds the entry, so a filter name listed twice adds it twice
            for (final String wanted : filters) {
                if (entryName.equals(wanted)) {
                    kept.add(entry);
                }
            }
        }
        return kept;
    }

    /**
     * Serializes the index document into the workspace so that the record can be completed later.
     *
     * @param filename
     *          name of the file inside the workspace
     * @param indexDocument
     *          the index document
     * @throws CrawlerException
     *           if the document cannot be written completely
     */
    private void serializeIndexDocument(final String filename, final IndexDocument indexDocument)
            throws CrawlerException {
        if (_log.isDebugEnabled()) {
            _log.debug("Serializing document " + filename);
        }
        // the file stream is tracked separately: it must be closed even if creating the object stream fails
        FileOutputStream fileStream = null;
        ObjectOutputStream objstream = null;
        try {
            fileStream = new FileOutputStream(new File(_workspace, filename));
            objstream = new ObjectOutputStream(fileStream);
            objstream.writeObject(indexDocument);
            // close explicitly so that a failing flush is reported instead of leaving a truncated file unnoticed
            objstream.close();
        } catch (final Throwable e) {
            throw new CrawlerException("Unable to serialize index document", e);
        } finally {
            IOUtils.closeQuietly(objstream);
            IOUtils.closeQuietly(fileStream);
        }
    }

    /**
     * Reads an index document that was serialized into the workspace before.
     *
     * @param filename
     *          name of the file inside the workspace
     * @return the index document
     * @throws CrawlerException
     *           if the file does not exist or cannot be read
     */
    private IndexDocument deserializeIndexDocument(final String filename) throws CrawlerException {
        if (_log.isDebugEnabled()) {
            _log.debug("Deserializing document " + filename);
        }
        final File file = new File(_workspace, filename);
        if (!file.exists()) {
            throw new CrawlerException(
                    String.format("Unable to find file %s for deserializing cached document", file.getPath()));
        }
        // the file stream is tracked separately: it must be closed even if creating the object stream fails,
        // e.g. because of a corrupt stream header
        FileInputStream fileStream = null;
        ObjectInputStream objstream = null;
        try {
            fileStream = new FileInputStream(file);
            objstream = new ObjectInputStream(fileStream);
            return (IndexDocument) objstream.readObject();
        } catch (final Throwable e) {
            throw new CrawlerException("Unable to deserialize index document from " + file.getPath(), e);
        } finally {
            IOUtils.closeQuietly(objstream);
            IOUtils.closeQuietly(fileStream);
        }
    }

    /**
     * Creates the working directory of this crawler instance (named after its hash code) and remembers its
     * canonical path as workspace.
     *
     * @throws CrawlerCriticalException
     *           if the working directory cannot be set up
     */
    private void initDataFolder() throws CrawlerCriticalException {
        final String instanceFolder = Integer.toString(hashCode());
        try {
            final File workingDir = WorkspaceHelper.createWorkingDir(BUNDLE_NAME, instanceFolder);
            workingDir.mkdir();
            _workspace = workingDir.getCanonicalPath();
        } catch (final IOException cause) {
            throw new CrawlerCriticalException("Unable to initialize workspace", cause);
        }
    }

    /**
     * To be used by Declarative Services. Binds the ParserManager service.
     *
     * @param parserManager
     *          the ParserManager service to use
     */
    public void setParserManager(final ParserManager parserManager) {
        _parserManager = parserManager;
        // the logger checks the debug level itself
        _log.debug("ParserManager is bound");
    }

    /**
     * To be used by Declarative Services. Unbinds the ParserManager service; the reference is only cleared if the
     * given service is the one currently bound.
     *
     * @param parserManager
     *          the ParserManager service to remove
     */
    public void unsetParserManager(final ParserManager parserManager) {
        final boolean isCurrent = _parserManager == parserManager;
        if (isCurrent) {
            _parserManager = null;
        }
        // logged in any case, also if another service instance stays bound
        _log.debug("ParserManager is unbound");
    }

    /**
     * The Class CrawlThread.
     */
    private class CrawlingProducerThread extends Thread {

        /**
         * {@inheritDoc}
         * 
         * @see java.lang.Thread#run()
         */
        @Override
        public void run() {
            try {
                while (!_forceClosing && hasNextDoc()) {
                    final IndexDocument document = _webSiteIterator.next();
                    boolean waiting = true;
                    Record record = null;
                    while (waiting) {
                        try {
                            if (record == null) {
                                record = createDataReferenceRecord(document);
                            }

                            // serialize index document to have possibility to
                            // get attachments later
                            if (_forceClosing) {
                                break;
                            }
                            serializeIndexDocument(DigestUtils.md5Hex(document.getUrl()), document);

                            if (_forceClosing) {
                                break;
                            }
                            final ArrayBlockingQueue<Record> queue = _queue;
                            if (queue != null) {
                                _queue.put(record);
                            }
                            waiting = false;
                        } catch (final InterruptedException e) {
                            ;// nothing
                        } catch (final IOException e) {
                            _log.error("", e);
                            _performanceCounters.increment(POC_PRODUCER_EXCEPTIONS);
                            _performanceCounters.addException(e);
                        }
                    }
                }
            } catch (final Throwable t) {
                _log.error("Producer error", t);
                if (_performanceCounters != null) {
                    _performanceCounters.increment(POC_PRODUCER_EXCEPTIONS);
                    _performanceCounters.addException(t);
                }
            } finally {
                _webSiteIterator = null;
                _producerRunning = false;
                if (_forceClosing) {
                    _log.info("Producer finished by forcing close procedure");
                } else {
                    _log.info("Producer finished!");
                }
            }
        }

        /**
         * Checks for next.
         * 
         * @return true, if successful
         * 
         * @throws CrawlerCriticalException
         *           the crawler critical exception
         */
        private boolean hasNextDoc() throws CrawlerCriticalException {
            if (_webSiteIterator.hasNext()) {
                return true;
            } else if (_webSites.hasNext()) {
                return initializeNextSite();
            }
            return false;
        }

    }

}