org.eclipse.smila.connectivity.framework.compound.zip.ZipCompoundCrawler.java Source code

Java tutorial

Introduction

Here is the source code for org.eclipse.smila.connectivity.framework.compound.zip.ZipCompoundCrawler.java

Source

/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Daniel Stucky (empolis GmbH) - initial API and implementation
 **********************************************************************************************************************/
package org.eclipse.smila.connectivity.framework.compound.zip;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.connectivity.ConnectivityId;
import org.eclipse.smila.connectivity.framework.CrawlerCallback;
import org.eclipse.smila.connectivity.framework.CrawlerCriticalException;
import org.eclipse.smila.connectivity.framework.CrawlerException;
import org.eclipse.smila.connectivity.framework.DataReference;
import org.eclipse.smila.connectivity.framework.compound.AbstractCompoundCrawler;
import org.eclipse.smila.connectivity.framework.schema.config.CompoundHandling;
import org.eclipse.smila.connectivity.framework.schema.config.DataSourceConnectionConfig;
import org.eclipse.smila.connectivity.framework.schema.config.ElementAttributeType;
import org.eclipse.smila.connectivity.framework.util.ConnectivityHashFactory;
import org.eclipse.smila.connectivity.framework.util.DataReferenceFactory;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.utils.file.EncodingHelper;
import org.eclipse.smila.utils.workspace.WorkspaceHelper;

/**
 * The Interface CompoundHandler.
 */
public class ZipCompoundCrawler extends AbstractCompoundCrawler {

    /** The compund system metadata element. */
    String COMPOUND_METADATA_ELEMENT = "_compounds";

    /**
     * The Constant BUNDLE_ID.
     */
    private static final String BUNDLE_ID = "org.eclipse.smila.connectivity.framework.compound.zip";

    /**
     * The Constant QUEUE_POLL_WAITING.
     */
    private static final int QUEUE_POLL_WAITING = 300;

    /**
     * The Constant HAS_NEXT_WAITING.
     */
    private static final int HAS_NEXT_WAITING = 50;

    /**
     * The Constant CAPACITY.
     */
    private static final int CAPACITY = 100;

    /**
     * The Constant STEP.
     */
    private static final int STEP = 10;

    /**
     * Reference to the zip file.
     */
    private ZipFile _zipFile;

    /**
     * The _queue.
     */
    private ArrayBlockingQueue<DataReference> _queue;

    /**
     * The _crawl thread.
     */
    private ZipEntryProducerThread _producerThread;

    /**
     * The Id to ZipEntry mapping.
     */
    private Map<ConnectivityId, ZipEntry> _entryMap;

    /**
     * The _attributes.
     */
    private CompoundHandling.CompoundAttribute[] _compoundAttributes;

    /**
     * The _attachment names.
     */
    private String[] _attachmentNames;

    /**
     * Flag if initialize() was called successfully.
     */
    private boolean _initialized;

    /**
     * The LOG.
     */
    private final Log _log = LogFactory.getLog(this.getClass());

    /**
     * 
     * @param crawlerId
     */
    public ZipCompoundCrawler() {
        super();
    }

    /** {@inheritDoc} */
    @Override
    public void initialize(final DataSourceConnectionConfig config)
            throws CrawlerException, CrawlerCriticalException {
        if (config == null) {
            throw new CrawlerCriticalException("parameter config is null");
        }

        final Record record = getCompoundRecord();
        if (record == null) {
            throw new CrawlerCriticalException("the compound record was not set");
        }

        try {
            // get configured compound attributes and attachments
            final CompoundHandling.CompoundAttributes attributes = config.getCompoundHandling()
                    .getCompoundAttributes();
            final List<CompoundHandling.CompoundAttribute> attrs = attributes.getCompoundAttributes();
            _compoundAttributes = attrs.toArray(new CompoundHandling.CompoundAttribute[attrs.size()]);
            final List<String> attachmentsNames = new ArrayList<String>();
            for (final CompoundHandling.CompoundAttribute a : _compoundAttributes) {
                if (a.isAttachment()) {
                    attachmentsNames.add(a.getName());
                }
            }
            _attachmentNames = attachmentsNames.toArray(new String[attachmentsNames.size()]);

            // get the content of the compound object
            final String contentAttachmentName = config.getCompoundHandling().getContentAttachment();
            byte[] content = record.getAttachment(contentAttachmentName);
            if (content == null) {
                content = new byte[0];
            }
            final File workingDir = WorkspaceHelper.createWorkingDir(BUNDLE_ID);
            final File file = new File(workingDir,
                    createTempFileNape(new ConnectivityId(record.getSource(), record.getId())));
            IOUtils.copy(new ByteArrayInputStream(content), new FileOutputStream(file));
            _zipFile = new ZipFile(file);

            _queue = new ArrayBlockingQueue<DataReference>(CAPACITY);
            _entryMap = new HashMap<ConnectivityId, ZipEntry>();
            _producerThread = new ZipEntryProducerThread(this);
            _producerThread.start();
            _initialized = true;
        } catch (Throwable e) {
            final String msg = "Error during initialization";
            if (_log.isErrorEnabled()) {
                _log.error(msg, e);
            }
            try {
                close();
            } catch (Exception ex) {
                if (_log.isErrorEnabled()) {
                    _log.error("Error during close in initialization", ex);
                }
            }
            throw new CrawlerCriticalException(msg, e);
        }
    }

    /**
     * creates a temporary file name from a given id.
     * 
     * @param id
     *          the id
     * @return a filename for the id
     */
    private String createTempFileNape(ConnectivityId id) {
        return id.getIdHash();
    }

    /** {@inheritDoc} */
    @Override
    public DataReference[] getNext() throws CrawlerException, CrawlerCriticalException {
        if (!_initialized) {
            throw new CrawlerCriticalException("ZipCompoundCrawler was not initialized");
        }
        while (hasNext()) {
            final List<DataReference> refList = new ArrayList<DataReference>();
            try {
                final DataReference ref = _queue.poll(QUEUE_POLL_WAITING, TimeUnit.MILLISECONDS);
                if (ref != null) {
                    refList.add(ref);
                    final int size = _queue.drainTo(refList, STEP - 1);
                    return refList.toArray(new DataReference[size + 1]);
                }
            } catch (final InterruptedException e) {
                if (_log.isTraceEnabled()) {
                    _log.trace("InterruptedException in getNext(): ", e);
                }
            }
        }
        return null;
    }

    /** {@inheritDoc} */
    @Override
    public void close() throws CrawlerException {
        _initialized = false;
        if (_zipFile != null) {
            try {
                _zipFile.close();
            } catch (IOException e) {
                final String msg = "Could not close temporary zip file " + _zipFile.getName();
                if (_log.isErrorEnabled()) {
                    _log.error(msg, e);
                }
            }

            final File file = new File(_zipFile.getName());
            file.delete();
            _zipFile = null;
        }
    }

    /**
     * {@inheritDoc}
     * 
     * @see CrawlerCallback#getMObject(Id)
     */
    @Override
    public AnyMap getMetadata(ConnectivityId id) throws CrawlerException, CrawlerCriticalException {
        if (!_initialized) {
            throw new CrawlerCriticalException("ZipCompoundCrawler was not initialized");
        }

        final ZipEntry zipEntry = _entryMap.get(id);
        if (zipEntry == null) {
            throw new CrawlerException("Could not find ZipEntry for id " + id);
        }

        final AnyMap metadataObject = getCompoundRecord().getFactory().createAnyMap();
        for (final CompoundHandling.CompoundAttribute attribute : _compoundAttributes) {
            if (!attribute.isAttachment()) {
                final Object value = readAttribute(zipEntry, attribute, true);
                // special handling for compound path:
                if (attribute.getElementAttribute() == ElementAttributeType.PATH) {
                    final AnySeq parentCompounds = getCompoundRecord().getMetadata()
                            .getSeq(COMPOUND_METADATA_ELEMENT);
                    if (getCompoundRecord() != null
                            && getCompoundRecord().getMetadata().getStringValue(attribute.getName()) != null) {
                        final AnySeq newCompounds;
                        if (parentCompounds == null) {
                            newCompounds = getCompoundRecord().getFactory().createAnySeq();
                        } else {
                            newCompounds = parentCompounds.getFactory().cloneAnySeq(parentCompounds);
                        }
                        newCompounds.add(getCompoundRecord().getFactory().createStringValue(
                                getCompoundRecord().getMetadata().getStringValue(attribute.getName())));
                        metadataObject.put(COMPOUND_METADATA_ELEMENT, newCompounds);
                    }
                }
                if (value != null) {
                    try {
                        metadataObject.put(attribute.getName(),
                                getCompoundRecord().getFactory().parseFromObject(value));
                    } catch (final Throwable e) {
                        throw new CrawlerException(e);
                    }
                } // if
            } // if
        } // for
        return metadataObject;
    }

    /**
     * {@inheritDoc}
     * 
     * @see CrawlerCallback#getAttachmentNames(Id)
     */
    @Override
    public String[] getAttachmentNames(final ConnectivityId id) throws CrawlerException, CrawlerCriticalException {
        if (!_initialized) {
            throw new CrawlerCriticalException("ZipCompoundCrawler was not initialized");
        }

        return _attachmentNames;
    }

    /**
     * {@inheritDoc}
     * 
     * @see CrawlerCallback#getAttachment(Id, String)
     */
    @Override
    public byte[] getAttachment(final ConnectivityId id, final String name)
            throws CrawlerException, CrawlerCriticalException {
        if (!_initialized) {
            throw new CrawlerCriticalException("ZipCompoundCrawler was not initialized");
        }

        final ZipEntry zipEntry = _entryMap.get(id);
        if (zipEntry == null) {
            throw new CrawlerException("Could not find ZipEntry for id " + id);
        }

        // find attribute
        for (final CompoundHandling.CompoundAttribute attribute : _compoundAttributes) {
            if (attribute.getName().equals(name)) {
                return readAttachment(zipEntry, attribute);
            }
        }
        throw new CrawlerException(String.format("Unable to find attachment definition for [%s]", name));
    }

    /**
     * {@inheritDoc}
     * 
     * @see CrawlerCallback#dispose(Id)
     */
    @Override
    public void dispose(final ConnectivityId id) {
        _entryMap.remove(id);
    }

    /**
     * Checks for next.
     * 
     * @return true, if successful
     */
    private boolean hasNext() {
        while (_producerThread != null && _queue.isEmpty()) {
            try {
                Thread.sleep(HAS_NEXT_WAITING);
            } catch (final InterruptedException e) {
                if (_log.isTraceEnabled()) {
                    _log.trace("InterruptedException in hasNext(): ", e);
                }
            }
        }
        return !_queue.isEmpty();
    }

    /**
     * Reads the content of the given ZipEntry.
     * 
     * @param zipEntry
     *          the ZipEntry
     * @return a byte[]
     * @throws CrawlerException
     *           if any error occurs
     */
    private byte[] readZipEntryContent(final ZipEntry zipEntry) throws CrawlerException {
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        InputStream in = null;
        try {
            in = _zipFile.getInputStream(zipEntry);
            IOUtils.copy(in, out);
        } catch (IOException e) {
            final String msg = "Error reading content of ZipEntry '" + zipEntry.getName() + "' of record id "
                    + getCompoundRecord().getId();
            if (_log.isErrorEnabled()) {
                _log.error(msg, e);
            }
            throw new CrawlerException(msg, e);
        } catch (Throwable e) {
            // this may occur if a ZipEntry containing Umlauts in the filename is read. Depending on the zip tool, the
            // characters may not be encoded in utf-8, but ZipEntry expects names to be encoded in utf-8
            final String msg = "Error reading content of ZipEntry '" + zipEntry.getName() + "' of record id "
                    + getCompoundRecord().getId();
            if (_log.isErrorEnabled()) {
                _log.error(msg, e);
            }
            throw new CrawlerException(msg, e);
        } finally {
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(out);
        }
        return out.toByteArray();
    }

    /**
     * Read attribute value.
     * 
     * @param zipEntry
     *          the ZipEntry
     * @param attribute
     *          the attribute
     * @param forceByteToString
     *          the force byte to string
     * 
     * @return the object
     * 
     * @throws CrawlerException
     *           the crawler exception
     */
    private Serializable readAttribute(final ZipEntry zipEntry, final CompoundHandling.CompoundAttribute attribute,
            final boolean forceByteToString) throws CrawlerException {
        switch (attribute.getElementAttribute()) {
        case NAME:
            return FilenameUtils.getName(zipEntry.getName());
        case FILE_EXTENSION:
            return FilenameUtils.getExtension(zipEntry.getName());
        case PATH:
            return zipEntry.getName();
        case LAST_MODIFIED_DATE:
            return new Date(zipEntry.getTime());
        case SIZE:
            return new Long(zipEntry.getSize());
        case CONTENT:
            try {
                final byte[] bytes = readZipEntryContent(zipEntry);
                if (forceByteToString) {
                    final String encoding = EncodingHelper.getEncoding(bytes);
                    if (encoding != null) {
                        return IOUtils.toString(new ByteArrayInputStream(bytes), encoding);
                    } else {
                        return IOUtils.toString(new ByteArrayInputStream(bytes));
                    }
                } else {
                    return bytes;
                }
            } catch (final IOException e) {
                throw new CrawlerException(e);
            }
        default:
            throw new RuntimeException(
                    "Unknown compound element attributes type " + attribute.getElementAttribute());
        }
    }

    /**
     * Read attachment.
     * 
     * @param zipEntry
     *          the ZipEntry
     * @param attribute
     *          the attribute
     * 
     * @return the byte[]
     * 
     * @throws CrawlerException
     *           the crawler exception
     */
    private byte[] readAttachment(final ZipEntry zipEntry, final CompoundHandling.CompoundAttribute attribute)
            throws CrawlerException {
        final Serializable value = readAttribute(zipEntry, attribute, false);
        if (value != null) {
            if (value instanceof String) {
                try {
                    return ((String) value).getBytes("utf-8");
                } catch (final UnsupportedEncodingException e) {
                    throw new CrawlerException(e);
                }
            } else if (value instanceof byte[]) {
                return (byte[]) value;
            } // TODO serialization to byte[] for other types of attachments.
        }
        return null;
    }

    /**
     * Worker thread that fills the internal Queue with DataReference objects. It iterates over the _zipFile and creates a
     * DataReference for each ZipEntry, ignoring directory entries.
     */
    private class ZipEntryProducerThread extends Thread {

        /**
         * The _crawlerCallback.
         */
        private final CrawlerCallback _crawlerCallback;

        /**
         * Instantiates a new crawling producer thread.
         * 
         * @param crawlerCallback
         *          the CrawlerCallback
         */
        public ZipEntryProducerThread(final CrawlerCallback crawlerCallback) {
            super();
            _crawlerCallback = crawlerCallback;
        }

        /**
         * {@inheritDoc}
         * 
         * @see java.lang.Thread#run()
         */
        @Override
        public void run() {
            try {
                if (_zipFile != null) {
                    final Enumeration<? extends ZipEntry> entries = _zipFile.entries();
                    if (entries != null) {
                        while (entries.hasMoreElements()) {
                            final ZipEntry entry = entries.nextElement();
                            if (entry != null && !entry.isDirectory()) {
                                // creation of Id and Hash are NOT configurable
                                final ConnectivityId parentId = new ConnectivityId(getCompoundRecord().getSource(),
                                        getCompoundRecord().getId());
                                final ConnectivityId id = new ConnectivityId(parentId, entry.getName());
                                final String hash = ConnectivityHashFactory.getInstance()
                                        .createHash(Long.toString(entry.getTime()));
                                final DataReference dataRef = DataReferenceFactory.getInstance()
                                        .createDataReference(_crawlerCallback, id, hash);
                                boolean added = false;
                                while (!added) {
                                    added = _queue.add(dataRef);
                                    _entryMap.put(id, entry);
                                }
                            } // if
                        } // while
                    } // if
                } // if
            } catch (final Throwable ex) {
                if (_log.isErrorEnabled()) {
                    _log.error("Producer error", ex);
                }
            } finally {
                _producerThread = null;
                if (_log.isInfoEnabled()) {
                    _log.info("ZipEntry producer thread finished!");
                }
            }
        }
    }

}