Java tutorial: the SMILA FileSystemCrawler

This walkthrough looks at org.eclipse.smila.connectivity.framework.crawler.filesystem.FileSystemCrawler, the Eclipse SMILA crawler component that walks a file system, applies the configured include/exclude filters, and delivers file records to the connectivity framework. The complete source of the class follows.
/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved.
 * This program and the accompanying materials are made available under the terms of the
 * Eclipse Public License v1.0 which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *   Ivan Churkin (brox IT Solutions GmbH) - initial creator
 *   Sebastian Voigt (Brox IT Solutions GmbH) - initial creator
 ******************************************************************************/
package org.eclipse.smila.connectivity.framework.crawler.filesystem;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOCase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.connectivity.ConnectivityId;
import org.eclipse.smila.connectivity.framework.AbstractCrawler;
import org.eclipse.smila.connectivity.framework.CrawlerCallback;
import org.eclipse.smila.connectivity.framework.CrawlerCriticalException;
import org.eclipse.smila.connectivity.framework.CrawlerException;
import org.eclipse.smila.connectivity.framework.DataReference;
import org.eclipse.smila.connectivity.framework.crawler.filesystem.messages.Attribute;
import org.eclipse.smila.connectivity.framework.crawler.filesystem.messages.Process;
import org.eclipse.smila.connectivity.framework.crawler.filesystem.messages.Process.Filter;
import org.eclipse.smila.connectivity.framework.crawler.filesystem.messages.Process.Filter.Exclude;
import org.eclipse.smila.connectivity.framework.crawler.filesystem.messages.Process.Filter.Include;
import org.eclipse.smila.connectivity.framework.performancecounters.CrawlerPerformanceCounterHelper;
import org.eclipse.smila.connectivity.framework.schema.config.DataSourceConnectionConfig;
import org.eclipse.smila.connectivity.framework.schema.config.DataSourceConnectionConfig.Attributes;
import org.eclipse.smila.connectivity.framework.schema.config.interfaces.IAttribute;
import org.eclipse.smila.connectivity.framework.schema.config.interfaces.IProcess;
import org.eclipse.smila.connectivity.framework.util.DataReferenceFactory;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.DataFactoryCreator;
import org.eclipse.smila.utils.file.EncodingHelper;

/**
 * Crawler that walks a file system and delivers file records to the SMILA connectivity framework.
 */
public class FileSystemCrawler extends AbstractCrawler {

  /** Performance counter name for crawled folders. */
  private static final String POC_FOLDERS = "folders";

  /** Performance counter name for crawled files. */
  private static final String POC_FILES = "files";

  /** Performance counter name for producer exceptions. */
  private static final String POC_PRODUCER_EXCEPTIONS = "producerExceptions";

  /** Poll timeout (in milliseconds) used when reading from the queue in getNext(). */
  private static final int QUEUE_POLL_WAITING = 300;

  /** Sleep interval (in milliseconds) while waiting for the producer in hasNext(). */
  private static final int HAS_NEXT_WAITING = 50;

  /** Capacity of the internal record queue. */
  private static final int CAPACITY = 100;

  /** Maximum number of data references returned by one call to getNext(). */
  private static final int STEP = 10;

  /** The logger. */
  private final Log _log = LogFactory.getLog(FileSystemCrawler.class);

  /** Queue of data references produced by the crawl thread and consumed via getNext(). */
  private ArrayBlockingQueue<DataReference> _queue;

  /** The producer thread that walks the file system. */
  private CrawlingProducerThread _crawlThread;

  /** Flag indicating whether the producer thread is still running. */
  private boolean _isProducerRunning = true;

  /** Flag indicating whether the crawler is currently opened. */
  private boolean _opened;

  /** Monitor guarding the opened flag. */
  private final Object _openedMonitor = new Object();

  /** Flag used to force the producer thread to stop. */
  private boolean _forceClosing;

  /** The data factory. */
  private final DataFactory _factory = DataFactoryCreator.createDefaultFactory();

  /** The configured attributes. */
  private Attribute[] _attributes;

  /** The names of all attributes configured as attachments. */
  private String[] _attachmentNames;

  /** Maps (source id + record id) to the corresponding file. */
  private Map<ConnectivityId, File> _idToPath;

  /** The performance counter helper. */
  private CrawlerPerformanceCounterHelper<FileSystemCrawlerPerformanceAgent> _performanceCounters;

  /** Instantiates a new file system crawler. */
  public FileSystemCrawler() {
    super();
    if (_log.isDebugEnabled()) {
      _log.debug("Creating FileSystemCrawler instance");
    }
  }

  /** {@inheritDoc} */
  @Override
  public void initialize(final DataSourceConnectionConfig config) throws CrawlerException, CrawlerCriticalException {
    _log.info("Initializing FileSystemCrawler...");
    synchronized (_openedMonitor) {
      if (_opened) {
        throw new CrawlerCriticalException(
          "Crawler is busy (it should not happen because new instances are created by ComponentFactories)");
      }
      checkFolders(config);
      _opened = true;
    }
    _forceClosing = false;
    _isProducerRunning = true;
    _queue = new ArrayBlockingQueue<DataReference>(CAPACITY);
    _idToPath = new HashMap<ConnectivityId, File>();
    final Attributes attributes = config.getAttributes();
    final List<IAttribute> attrs = attributes.getAttribute();
    _performanceCounters = new CrawlerPerformanceCounterHelper<FileSystemCrawlerPerformanceAgent>(config,
      hashCode(), FileSystemCrawlerPerformanceAgent.class);
    _attributes = attrs.toArray(new Attribute[attrs.size()]);
    final List<String> attachmentsNames = new ArrayList<String>();
    for (final Attribute a : _attributes) {
      if (a.isAttachment()) {
        attachmentsNames.add(a.getName());
      }
    }
    _attachmentNames = attachmentsNames.toArray(new String[attachmentsNames.size()]);
    _crawlThread = new CrawlingProducerThread(this, config);
    _crawlThread.start();
  }

  /** {@inheritDoc} */
  @Override
  public DataReference[] getNext() throws CrawlerException, CrawlerCriticalException {
    while (hasNext()) {
      final List<DataReference> refList = new ArrayList<DataReference>();
      try {
        final DataReference ref = _queue.poll(QUEUE_POLL_WAITING, TimeUnit.MILLISECONDS);
        if (ref != null) {
          refList.add(ref);
          final int size = _queue.drainTo(refList, STEP - 1);
          return refList.toArray(new DataReference[size + 1]);
        }
      } catch (final InterruptedException e) {
        ; // nothing
      }
    }
    return null;
  }

  /** {@inheritDoc} */
  @Override
  public void dispose(final ConnectivityId id) {
    synchronized (_idToPath) {
      _idToPath.remove(id);
    }
  }

  /** {@inheritDoc} */
  @Override
  public byte[] getAttachment(final ConnectivityId id, final String name) throws CrawlerException,
    CrawlerCriticalException {
    final File file = getFileById(id);
    // find the attribute definition matching the requested attachment name
    for (final Attribute attribute : _attributes) {
      if (attribute.getName().equals(name)) {
        return readAttachment(file, attribute);
      }
    }
    throw new CrawlerCriticalException(String.format("Unable to find attachment definition for [%s]", name));
  }

  /**
   * Gets the file registered for the given id.
   *
   * @param id the id
   * @return the file
   * @throws CrawlerException if no existing file is registered for the id
   */
  private File getFileById(final ConnectivityId id) throws CrawlerException {
    File file;
    synchronized (_idToPath) {
      file = _idToPath.get(id);
    }
    ensureFileExists(id, file);
    return file;
  }

  /** {@inheritDoc} */
  @Override
  public String[] getAttachmentNames(final ConnectivityId id) throws CrawlerException, CrawlerCriticalException {
    return _attachmentNames;
  }

  /** {@inheritDoc} */
  @Override
  public AnyMap getMetadata(final ConnectivityId id) throws CrawlerException, CrawlerCriticalException {
    final File file = getFileById(id);
    final AnyMap metadata = _factory.createAnyMap();
    for (final Attribute attribute : _attributes) {
      if (!attribute.isAttachment()) {
        final Object value = readAttribute(file, attribute, true);
        if (value != null) {
          try {
            metadata.put(attribute.getName(), _factory.parseFromObject(value));
          } catch (final Throwable e) {
            throw new CrawlerException(e);
          }
        }
      }
    }
    return metadata;
  }

  /**
   * Ensures that a file was registered for the given id and still exists.
   *
   * @param id the id
   * @param file the file
   * @throws CrawlerException if the file is unknown or does not exist
   */
  private void ensureFileExists(final ConnectivityId id, final File file) throws CrawlerException {
    if (file == null) {
      throw new CrawlerException(String.format("Unable to find file for id [%s].", id));
    }
    if (!file.exists()) {
      throw new CrawlerException(String.format("Unable to find file [%s]", file.getPath()));
    }
  }

  /**
   * Checks that all configured base folders exist; throws a critical exception otherwise.
   *
   * @param config the configuration
   * @throws CrawlerCriticalException if a configured folder does not exist
   */
  private void checkFolders(final DataSourceConnectionConfig config) throws CrawlerCriticalException {
    final Process process = (Process) config.getProcess();
    final int processingLength = process.getBaseDirAndFilter().size();
    int i = 0;
    while (i < processingLength) {
      final String path = (String) process.getBaseDirAndFilter().get(i++);
      // skip the filter element that follows each base directory
      i++;
      final File file = new File(path);
      if (!file.exists() || !file.isDirectory()) {
        throw new CrawlerCriticalException(String.format("Folder \"%s\" is not found", path));
      }
    }
  }

  /** {@inheritDoc} */
  @Override
  public void close() throws CrawlerException {
    synchronized (_openedMonitor) {
      _opened = false;
      _log.info("Closing FileSystemCrawler...");
      _forceClosing = true;
      _isProducerRunning = false;
      _crawlThread = null;
      _queue = null;
      _idToPath = null;
      _attachmentNames = null;
      _performanceCounters = null;
    }
  }

  /**
   * Checks whether more records can be expected.
   *
   * @return true if the queue still contains data references
   */
  private boolean hasNext() {
    while (_isProducerRunning && _queue.isEmpty()) {
      try {
        Thread.sleep(HAS_NEXT_WAITING);
      } catch (final InterruptedException e) {
        ; // nothing
      }
    }
    return !_queue.isEmpty();
  }

  /**
   * Reads an attribute value from the given file.
   *
   * @param file the file
   * @param attribute the attribute definition
   * @param forceByteToString whether file content should be converted to a string
   * @return the attribute value
   * @throws CrawlerException if reading the file fails
   */
  private Serializable readAttribute(final File file, final Attribute attribute, final boolean forceByteToString)
    throws CrawlerException {
    switch (attribute.getFileAttributes()) {
      case NAME:
        return file.getName();
      case FILE_EXTENSION:
        return FilenameUtils.getExtension(file.getName());
      case PATH:
        return file.getAbsolutePath();
      case LAST_MODIFIED_DATE:
        return new Date(file.lastModified());
      case SIZE:
        return Long.valueOf(file.length());
      case CONTENT:
        try {
          final byte[] bytes = FileUtils.readFileToByteArray(file);
          if (forceByteToString) {
            try {
              return EncodingHelper.convertToString(bytes);
            } catch (final Exception e) {
              throw new CrawlerException("Error decoding content from file " + file.getAbsolutePath(), e);
            }
          } else {
            return bytes;
          }
        } catch (final IOException e) {
          throw new CrawlerException("Error reading attribute from file " + file.getAbsolutePath(), e);
        }
      default:
        throw new RuntimeException("Unknown file attributes type " + attribute.getFileAttributes());
    }
  }

  /**
   * Reads an attachment value from the given file.
   *
   * @param file the file
   * @param attribute the attribute definition
   * @return the attachment as byte[]
   * @throws CrawlerException if reading the file fails
   */
  private byte[] readAttachment(final File file, final Attribute attribute) throws CrawlerException {
    final Serializable value = readAttribute(file, attribute, false);
    if (value != null) {
      if (value instanceof String) {
        try {
          return ((String) value).getBytes("utf-8");
        } catch (final UnsupportedEncodingException e) {
          throw new CrawlerException(e);
        }
      } else if (value instanceof byte[]) {
        return (byte[]) value;
      }
      // TODO serialization to byte[] for other types of attachments.
    }
    return null;
  }

  /** The producer thread that walks the configured directories and fills the queue. */
  private class CrawlingProducerThread extends Thread {

    /** The crawler callback. */
    private final CrawlerCallback _crawlerCallback;

    /** The data source id. */
    private final String _dataSourceID;

    /** The process configuration. */
    private final Process _process;

    /** The number of base-dir/filter entries in the process configuration. */
    private final int _processingLength;

    /**
     * Instantiates a new crawling producer thread.
     *
     * @param crawlerCallback the crawler callback
     * @param configuration the data source configuration
     */
    public CrawlingProducerThread(final CrawlerCallback crawlerCallback,
      final DataSourceConnectionConfig configuration) {
      super();
      final IProcess process = configuration.getProcess();
      _crawlerCallback = crawlerCallback;
      _dataSourceID = configuration.getDataSourceID();
      _process = (Process) process;
      _processingLength = _process.getBaseDirAndFilter().size();
    }

    /** {@inheritDoc} */
    @Override
    public void run() {
      try {
        try {
          int i = 0;
          while (i < _processingLength) {
            final String path = (String) _process.getBaseDirAndFilter().get(i++);
            final Filter filter = (Filter) _process.getBaseDirAndFilter().get(i++);
            final File file = new File(path);
            if (!file.exists() || !file.isDirectory()) {
              _log.error("Folder " + path + " is not found");
              continue;
            }
            processFolder(file, filter);
          }
        } catch (final Throwable ex) {
          if (_performanceCounters != null) {
            _performanceCounters.addException(ex);
          }
          _log.error("Producer error", ex);
        } finally {
          _isProducerRunning = false;
          if (_forceClosing) {
            _log.info("Producer finished by forcing close procedure");
          } else {
            _log.info("Producer finished!");
          }
        }
      } catch (final RuntimeException ex) {
        ex.printStackTrace();
      }
    }

    /**
     * Processes a single base folder with the given filter.
     *
     * @param dir the base directory
     * @param filter the filter
     * @throws CrawlerException if crawling fails
     */
    private void processFolder(final File dir, final Filter filter) throws CrawlerException {
      if (_forceClosing) {
        return;
      }
      final CrawlerFileFilter fileFilter = new CrawlerFileFilter(filter);
      treeWalk(dir, fileFilter, filter.isRecursive());
    }

    /**
     * Walks the directory tree and puts a data reference for each accepted file into the queue.
     *
     * @param dir the directory
     * @param fileFilter the file filter
     * @param isRecursive whether subdirectories are crawled
     * @throws CrawlerException if crawling fails
     */
    private void treeWalk(final File dir, final CrawlerFileFilter fileFilter, final boolean isRecursive)
      throws CrawlerException {
      if (_forceClosing) {
        return;
      }
      final File[] entries = dir.listFiles(fileFilter);
      if (entries == null) {
        _log.warn("Unknown IO error while listing directory " + dir + ", skipping.");
      } else {
        for (int i = 0; i < entries.length; i++) {
          final File file = entries[i];
          if (file.isFile()) {
            boolean waiting = true;
            DataReference reference = null;
            while (waiting) {
              try {
                if (reference == null) {
                  reference = initializeDataReference(file);
                }
                synchronized (_idToPath) {
                  _idToPath.put(reference.getId(), file);
                }
                _queue.put(reference);
                waiting = false;
                _performanceCounters.increment(POC_FILES);
              } catch (final Throwable e) {
                _performanceCounters.increment(POC_PRODUCER_EXCEPTIONS);
                _performanceCounters.addException(e);
                _log.error("", e);
              }
            }
          } else if (isRecursive && file.isDirectory()) {
            treeWalk(file, fileFilter, true);
          } else {
            _log.warn("Path " + file + " is neither file nor directory, skipping.");
          }
        }
      }
      _performanceCounters.increment(POC_FOLDERS);
    }

    /**
     * Initializes a data reference for the given file.
     *
     * @param file the file
     * @return the data reference
     * @throws CrawlerException if reading id or hash attributes fails
     */
    private DataReference initializeDataReference(final File file) throws CrawlerException {
      final AnyMap idAttributes = _factory.createAnyMap();
      final AnyMap hashAttributes = _factory.createAnyMap();
      final Map<String, byte[]> hashAttachments = new HashMap<String, byte[]>();
      readIdAndHashAttributesAndAttachments(file, idAttributes, hashAttributes, hashAttachments);
      return DataReferenceFactory.getInstance().createDataReference(_crawlerCallback, _dataSourceID, idAttributes,
        hashAttributes, hashAttachments);
    }

    /**
     * Reads the id and hash attributes and attachments of the given file.
     *
     * @param file the file
     * @param idAttributes the map receiving id attributes
     * @param hashAttributes the map receiving hash attributes
     * @param hashAttachments the map receiving hash attachments
     * @throws CrawlerException if reading the file fails
     */
    private void readIdAndHashAttributesAndAttachments(final File file, final AnyMap idAttributes,
      final AnyMap hashAttributes, final Map<String, byte[]> hashAttachments) throws CrawlerException {
      for (final Attribute attributeDef : _attributes) {
        if (attributeDef.isKeyAttribute() || attributeDef.isHashAttribute()) {
          if (attributeDef.isAttachment()) {
            final byte[] value = readAttachment(file, attributeDef);
            hashAttachments.put(attributeDef.getName(), value);
          } else {
            final Object value = readAttribute(file, attributeDef, true);
            if (value != null) {
              final Any attribute = _factory.parseFromObject(value);
              final String name = attributeDef.getName();
              if (attributeDef.isKeyAttribute()) {
                idAttributes.put(name, attribute);
              }
              if (attributeDef.isHashAttribute()) {
                hashAttributes.put(name, attribute);
              }
            }
          }
        }
      }
    }

    /** File filter that applies the configured include/exclude patterns and date ranges. */
    private class CrawlerFileFilter implements FileFilter {

      /** The filter configuration. */
      private final Filter _filter;

      /** The case sensitivity used for wildcard matching. */
      private final IOCase _case;

      /**
       * Instantiates a new crawler file filter.
       *
       * @param filter the filter configuration
       */
      public CrawlerFileFilter(final Filter filter) {
        _filter = filter;
        if (filter.isCaseSensitive()) {
          _case = IOCase.SENSITIVE;
        } else {
          _case = IOCase.INSENSITIVE;
        }
      }

      /** {@inheritDoc} */
      @Override
      public boolean accept(final File file) {
        if (file.isDirectory()) {
          return true;
        }
        // process includes; if no includes are defined, the file is accepted
        if (_filter.getInclude() != null && _filter.getInclude().size() > 0) {
          final long dateLong = file.lastModified();
          boolean acceptedByInclude = false;
          for (final Include include : _filter.getInclude()) {
            if (include.getDateFrom() != null) {
              if (dateLong < include.getDateFrom().getTime()) {
                continue;
              }
            }
            if (include.getDateTo() != null) {
              if (dateLong > include.getDateTo().getTime()) {
                continue;
              }
            }
            if (FilenameUtils.wildcardMatch(file.getName(), include.getName(), _case)) {
              acceptedByInclude = true;
              break;
            }
          }
          if (!acceptedByInclude) {
            return false;
          }
        }
        // process excludes
        for (final Exclude exclude : _filter.getExclude()) {
          if (FilenameUtils.wildcardMatch(file.getName(), exclude.getName(), _case)) {
            return false;
          }
        }
        return true;
      }
    }
  }
}
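In a full SMILA installation the crawler is driven by the connectivity framework rather than called directly, but the public methods shown above (initialize, getNext, getMetadata, dispose, close) can be exercised on their own. The following is a minimal consumer sketch, assuming a DataSourceConnectionConfig instance is obtained elsewhere; how such a configuration is loaded is outside the scope of this class, so it is simply passed in as a parameter here.

package org.eclipse.smila.connectivity.framework.crawler.filesystem;

import org.eclipse.smila.connectivity.framework.DataReference;
import org.eclipse.smila.connectivity.framework.schema.config.DataSourceConnectionConfig;
import org.eclipse.smila.datamodel.AnyMap;

/** Sketch only: drives a FileSystemCrawler directly instead of through the framework. */
public final class FileSystemCrawlerUsageSketch {

  private FileSystemCrawlerUsageSketch() {
  }

  public static void consume(final DataSourceConnectionConfig config) throws Exception {
    final FileSystemCrawler crawler = new FileSystemCrawler();
    crawler.initialize(config); // starts the producer thread that walks the configured folders
    try {
      DataReference[] batch;
      // getNext() returns null once the producer has finished and the queue is drained
      while ((batch = crawler.getNext()) != null) {
        for (final DataReference reference : batch) {
          final AnyMap metadata = crawler.getMetadata(reference.getId());
          System.out.println(metadata);
          crawler.dispose(reference.getId()); // release the id-to-file mapping
        }
      }
    } finally {
      crawler.close();
    }
  }
}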
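getNext() batches its output: it blocks briefly on poll(), then drains up to STEP - 1 additional references from the ArrayBlockingQueue, so consumers receive small arrays instead of single records. The standalone sketch below reproduces that poll-then-drainTo pattern with plain strings; the class and values are illustrative only and not part of SMILA.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;

/** Sketch of the batching pattern used in FileSystemCrawler.getNext(). */
public final class BatchingQueueDemo {

  public static void main(final String[] args) throws InterruptedException {
    final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
    final int total = 25;
    final int step = 10; // corresponds to the STEP constant

    // Producer: fills the queue, like CrawlingProducerThread does with DataReferences.
    final Thread producer = new Thread(new Runnable() {
      @Override
      public void run() {
        for (int i = 0; i < total; i++) {
          try {
            queue.put("record-" + i);
          } catch (final InterruptedException e) {
            Thread.currentThread().interrupt();
            return;
          }
        }
      }
    });
    producer.start();

    // Consumer: poll one element with a timeout, then drain up to step - 1 more.
    int received = 0;
    while (received < total) {
      final List<String> batch = new ArrayList<String>();
      final String first = queue.poll(300, TimeUnit.MILLISECONDS);
      if (first == null) {
        continue; // nothing arrived yet, try again
      }
      batch.add(first);
      queue.drainTo(batch, step - 1);
      received += batch.size();
      System.out.println("got batch of " + batch.size() + ": " + batch);
    }
    producer.join();
  }
}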
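CrawlerFileFilter.accept() combines optional date bounds with Commons IO wildcard matching, where case sensitivity comes from the filter configuration. The small sketch below shows the same FilenameUtils.wildcardMatch call in isolation; the *.txt and backup* patterns are made-up examples, not values from a real SMILA configuration.

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOCase;

/** Sketch of the wildcard matching used by CrawlerFileFilter. */
public final class WildcardFilterDemo {

  public static void main(final String[] args) {
    // INSENSITIVE corresponds to a filter configured as not case-sensitive.
    final IOCase caseSensitivity = IOCase.INSENSITIVE;

    // Include-style check: accept only .txt files (example pattern).
    System.out.println(FilenameUtils.wildcardMatch("Readme.TXT", "*.txt", caseSensitivity)); // true

    // Exclude-style check: reject anything starting with "backup" (example pattern).
    System.out.println(FilenameUtils.wildcardMatch("backup-2011.txt", "backup*", caseSensitivity)); // true -> excluded
    System.out.println(FilenameUtils.wildcardMatch("notes.txt", "backup*", caseSensitivity)); // false -> kept
  }
}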