Java tutorial
/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors: Michael Breidenband (brox IT Solutions GmbH) - initial creator
 **********************************************************************************************************************/
package org.eclipse.smila.connectivity.framework.crawler.jdbc;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.sql.Blob;
import java.sql.Clob;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.connectivity.ConnectivityId;
import org.eclipse.smila.connectivity.framework.AbstractCrawler;
import org.eclipse.smila.connectivity.framework.Crawler;
import org.eclipse.smila.connectivity.framework.CrawlerCriticalException;
import org.eclipse.smila.connectivity.framework.CrawlerException;
import org.eclipse.smila.connectivity.framework.DataReference;
import org.eclipse.smila.connectivity.framework.crawler.jdbc.messages.Attribute;
import org.eclipse.smila.connectivity.framework.crawler.jdbc.messages.Process;
import org.eclipse.smila.connectivity.framework.crawler.jdbc.messages.Process.Database;
import org.eclipse.smila.connectivity.framework.crawler.jdbc.messages.Process.Selections;
import org.eclipse.smila.connectivity.framework.crawler.jdbc.messages.Process.Selections.Grouping;
import org.eclipse.smila.connectivity.framework.crawler.jdbc.util.GroupingRange;
import org.eclipse.smila.connectivity.framework.crawler.jdbc.util.PreparedStatementTypedParameter;
import org.eclipse.smila.connectivity.framework.performancecounters.CrawlerPerformanceCounterHelper;
import org.eclipse.smila.connectivity.framework.schema.config.DataSourceConnectionConfig;
import org.eclipse.smila.connectivity.framework.schema.config.DataSourceConnectionConfig.Attributes;
import org.eclipse.smila.connectivity.framework.schema.config.interfaces.IAttribute;
import org.eclipse.smila.connectivity.framework.util.DataReferenceFactory;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.DataFactoryCreator;
import org.eclipse.smila.datamodel.InvalidValueTypeException;
import org.eclipse.smila.datamodel.Record;

/**
 * The Class JdbcCrawler. Instances of this class can be used to crawl JDBC datasources. Clients can access the
 * retrieved data via the methods defined in the {@link Crawler}-Interface.
 *
 * The {@link DataSourceConnectionConfig} passed in the initialize()-Method must comply with the Schema defined in
 * JdbcDataSourceConnectionConfigSchema.xsd.
 */
public class JdbcCrawler extends AbstractCrawler {

  /**
   * Name used for the {@link CrawlerPerformanceCounters} property logging the number of retrieved database rows.
   */
  public static final String POC_DATA_ROWS_RETRIEVED = "databaseRows";

  /**
   * Name used for the {@link CrawlerPerformanceCounters} property logging the number of occurred
   * {@link CrawlerException}s.
   */
  private static final String POC_CRAWLING_EXCEPTIONS = "producerExceptions";

  /**
   * Name used for the {@link CrawlerPerformanceCounters} property logging the number of occurred
   * {@link CrawlerCriticalException}s.
   */
  private static final String POC_CRITICAL_CRAWLING_EXCEPTIONS = "producerCriticalExceptions";

  /**
   * Name used for the {@link CrawlerPerformanceCounters} property logging the number of created {@link DataReference}s.
   */
  private static final String POC_DATA_REFS_CREATED = "dataRefsCreated";

  /**
   * Name used for the {@link CrawlerPerformanceCounters} property logging the number of created {@link Record}s.
   */
  private static final String POC_RECORDS_CREATED = "recordsCreated";

  /**
   * Name used for the {@link CrawlerPerformanceCounters} property logging the number of {@link DataReference}s
   * retrieved from the Crawler by clients.
   */
  private static final String POC_DATA_REFS_RETRIEVED_BY_CLIENT = "dataRefsRetrievedByClient";

  /** Capacity of the internal queue. */
  private static final int INTERNAL_QUEUE_CAPACITY = 12000;

  /** If the internal record cache grows above this limit warnings are issued to the logger. */
  private static final int RECORD_CACHE_WARNING_THRESHOLD = 4 * INTERNAL_QUEUE_CAPACITY;

  /** Timeout in ms used when polling the queue for MObjects in {@link #getNextDeltaIndexingData()}. */
  private static final long QUEUE_POLL_WAITING = 300;

  /**
   * Time in ms to wait for the {@link #_producerThread} to terminate when the {@link #close()}-method is called.
   */
  private static final long WAIT_FOR_CRAWLING_THREAD_TERMINATION_WHEN_FORCECLOSING = 5000;

  /** Max number of {@link MObject}s returned by {@link #getNextDeltaIndexingData()}. */
  private static final int MAX_QUEUE_SIZE = 20;

  /** Thread-Timeout in ms used in {@link #hasNextItemInQueue()}. */
  private static final int HAS_NEXT_ITEM_THREAD_WAIT = 50;

  /** The Log object for logging exceptions and messages. */
  private final Log _log = LogFactory.getLog(JdbcCrawler.class);

  /** The Monitor object for synchronizing when opening and closing the Crawler. */
  private final Object _openedMonitor = new Object();

  /** This flag reflects whether the Crawler is in "opened"-State. */
  private boolean _opened;

  /** The {@link Process} object of the {@link DataSourceConnectionConfig}. */
  private Process _process;

  /** the data source id. */
  private String _dataSourceID;

  // /**
  // * This {@link List} is temporarily filled with {@link MObject}s from the {@link #_internalQueue} when
  // * {@link #getNextDeltaIndexingData()} is called.
  // */
  // private final ArrayList<MObject> _tempList = new ArrayList<MObject>();

  /**
   * Pre-Caches the Record-object for each MObject in the current DeltaIndexing block. Is filled when
   * {@link #getNextDeltaIndexingData()} is called. The key is the offset of the corresponding {@link MObject} in the
   * {@link #_tempList}. The {@link List} gets cleared after {@link #getNextDeltaIndexingData()} is called, so it should
   * never have a size larger than {@link #MAX_QUEUE_SIZE}.
   */
  private HashMap<ConnectivityId, Record> _recordCache;

  // /**
  // * Pre-caches the data retrieved from the JDBC-{@link ResultSet} for later creation of corresponding
  // * {@link Record}-Objects. Associates the {@link MObject} created by the {@link CrawlingProducerThread} with
  // * the data of the current row in the JDBC-{@link ResultSet}. The {@link Map} gets cleared after
  // * {@link #getNextDeltaIndexingData()} is called, so it should never have a size larger than
  // * {@link #DELTA_INDEXING_SIZE}.
  // */
  // private HashMap<MObject, Object[]> _dataCache;

  /**
   * The internal queue, which is filled with {@link MObject}s by the {@link CrawlingProducerThread}. When it grows to a
   * size of {@link #INTERNAL_QUEUE_CAPACITY} its put()-Method blocks, so the ProducerThread blocks until the queue is
   * drained a bit by the client calling {@link #getNextDeltaIndexingData()}.
   *
   * @see ArrayBlockingQueue
   */
  private ArrayBlockingQueue<DataReference> _internalQueue;

  /**
   * This Map contains the mapping between the attribute names in the {@link DataSourceConnectionConfig} and the
   * corresponding column indexes of the {@link #_retrievalResultSet}.
   */
  private HashMap<String, Integer> _attributeMapping;

  /**
   * The JDBC connection to the database.
   *
   * @see Connection
   */
  private Connection _connection;

  /** The {@link PreparedStatement} used for querying the JDBC datasource. */
  private PreparedStatement _retrievalStatement;

  /** The {@link ResultSet} returned by the {@link #_retrievalStatement}. */
  private ResultSet _retrievalResultSet;

  /** The {@link ResultSetMetaData} associated with the {@link #_retrievalResultSet}. */
  private ResultSetMetaData _retrievalResultSetMetaData;

  /**
   * The {@link RecordFactory}-instance used for creating the {@link Record}s. In the case of {@link JdbcCrawler}
   * obtained by {@link RecordFactory#DEFAULT_INSTANCE}.
   */
  private final DataFactory _dataFactory = DataFactoryCreator.createDefaultFactory();

  /** The attributes to be retrieved from the datasource as defined in the {@link DataSourceConnectionConfig}. */
  private Attribute[] _attributes;

  /** Flag indicating if the {@link CrawlingProducerThread} is currently active. */
  private boolean _isProducerRunning;

  /** The {@link CrawlingProducerThread}-instance which populates the {@link #_internalQueue}. */
  private CrawlingProducerThread _producerThread;

  /**
   * Class member storing the last {@link CrawlerCriticalException} encountered in the {@link CrawlingProducerThread}.
   */
  private CrawlerCriticalException _producerException;

  /** Flag indicating whether the {@link #close()}-Method has been called by the client. */
  private boolean _forceClosing;

  /** An {@link ArrayList} containing the {@link GroupingRange}s which were determined. */
  private ArrayList<GroupingRange> _groupingRanges;

  /** The {@link Iterator} associated with the {@link #_groupingRanges}-List. */
  private Iterator<GroupingRange> _groupingRangesIterator;

  /** The {@link GroupingRange} currently used in the {@link #_retrievalStatement}. */
  private GroupingRange _currentGroupingRange;

  /** the {@link CrawlerPerformanceCounters} used by the {@link JdbcCrawler}. */
  private CrawlerPerformanceCounterHelper<JdbcCrawlerPerformanceAgent> _performanceCounters;

  /**
   * Standard constructor of {@link JdbcCrawler}.
   */
  public JdbcCrawler() {
    super();
    if (_log.isDebugEnabled()) {
      _log.debug("Creating new JdbcCrawler instance");
    }
  }

  /**
   * This method should be called by clients after completing their work with the {@link JdbcCrawler} so that it can
   * release its JDBC- and other resources.
   * If the method is called before the {@link CrawlingProducerThread} terminates, the thread will terminate the next
   * time it evaluates the {@link #_forceClosing}-flag, which is set by the {@link #close()}-Method.
   */
  @Override
  public void close() {
    synchronized (_openedMonitor) {
      _forceClosing = true;
      _opened = false;
      _log.info("Closing JdbcCrawler...");
      try {
        _producerThread.join(WAIT_FOR_CRAWLING_THREAD_TERMINATION_WHEN_FORCECLOSING);
      } catch (final InterruptedException exception) {
        if (_log.isTraceEnabled()) {
          _log.trace("Encountered InterruptedException while waiting for the ProducerThread to die.", exception);
        }
      }
      _isProducerRunning = false;
      _producerThread = null;
    }
    try {
      if (_retrievalResultSet != null) {
        _retrievalResultSet.close();
        _retrievalResultSet = null;
      }
    } catch (final SQLException e) {
      if (_log.isErrorEnabled()) {
        _log.error(e.getMessage(), e);
      }
    }
    try {
      if (_retrievalStatement != null) {
        _retrievalStatement.close();
        _retrievalStatement = null;
      }
    } catch (final SQLException e) {
      if (_log.isErrorEnabled()) {
        _log.error(e.getMessage(), e);
      }
    }
    try {
      if (_connection != null) {
        _connection.close();
        _connection = null;
      }
    } catch (final SQLException e) {
      if (_log.isErrorEnabled()) {
        _log.error(e.getMessage(), e);
      }
    }
    _dataSourceID = null;
  }

  /**
   * Creates an {@link MObject} for the passed data.
   *
   * @param data
   *          An {@link Object[]} which constitutes a database row retrieved via the {@link #_retrievalResultSet}.
   *
   * @return the {@link MObject} created.
   * @throws CrawlerCriticalException
   *           If {@code data} was {@code null} or one of the attributes defined in {@link #_attributes} could not be
   *           retrieved from {@code data}
   */
  private DataReference createDataReference(final Object[] data) throws CrawlerCriticalException {
    DataReference dataRef = null;
    final Record record = createRecord(data);
    final AnyMap idAttributes = getIdAttributes(record);
    final AnyMap hashAttributes = getHashAttributes(record);
    dataRef =
      DataReferenceFactory.getInstance().createDataReference(this, _dataSourceID, idAttributes, hashAttributes);
    _recordCache.put(dataRef.getId(), record);
    if (_recordCache.size() > RECORD_CACHE_WARNING_THRESHOLD) {
      _performanceCounters.increment(POC_RECORDS_CREATED);
    }
    return dataRef;
  }

  /**
   * @param record
   *          the record to examine
   * @return a map with the hash attributes
   */
  private AnyMap getHashAttributes(final Record record) {
    final AnyMap hashAttributes = _dataFactory.createAnyMap();
    for (Entry<String, Any> entry : record.getMetadata().entrySet()) {
      final String attributeName = entry.getKey();
      for (final Attribute processingAttrib : _attributes) {
        if (processingAttrib.getName().equals(attributeName) && processingAttrib.isHashAttribute()) {
          hashAttributes.put(attributeName, entry.getValue());
        }
      }
    }
    return hashAttributes;
  }

  /**
   * @param record
   *          the record to examine
   * @return a map with the id attributes
   */
  private AnyMap getIdAttributes(final Record record) {
    final AnyMap idAttributes = _dataFactory.createAnyMap();
    for (Entry<String, Any> entry : record.getMetadata().entrySet()) {
      final String attributeName = entry.getKey();
      for (final Attribute processingAttrib : _attributes) {
        if (processingAttrib.getName().equals(attributeName) && processingAttrib.isKeyAttribute()) {
          idAttributes.put(attributeName, entry.getValue());
        }
      }
    }
    return idAttributes;
  }

  /**
   * Creates a {@link Record}-object for the passed data and metadata.
   *
   * @param data
   *          The data to be used for the {@link Record} - an {@link Object[]} constituting a database row.
   * @return The created {@link Record}-object.
   * @throws CrawlerCriticalException
   *           If any of the specified {@link #_attributes} could not be extracted from the {@code data}.
   */
  private Record createRecord(final Object[] data) throws CrawlerCriticalException {
    final Record record = _dataFactory.createRecord();
    final AnyMap metaData = record.getMetadata();
    for (final Attribute attribute : _attributes) {
      if (attribute.isAttachment()) {
        // set Attachment attributes as Attachments to the record
        final Object attachmentValue = readAttribute(data, attribute);
        if (attachmentValue != null) {
          if (attachmentValue instanceof String) {
            try {
              record.setAttachment(attribute.getName(), ((String) attachmentValue).getBytes("utf-8"));
            } catch (final UnsupportedEncodingException exception) {
              _log.warn("UTF-8 Encoding is not supported by this VM. (Very unlikely...)", exception);
            }
          } else if (attachmentValue instanceof byte[]) {
            record.setAttachment(attribute.getName(), (byte[]) attachmentValue);
          } else if (attachmentValue instanceof Blob) {
            final Blob blob = (Blob) attachmentValue;
            byte[] byteValue = null;
            try {
              byteValue = IOUtils.toByteArray(blob.getBinaryStream());
            } catch (final IOException exception) {
              _log.error("Encountered IOException when getting byte[]-Value of BLOB-Stream for attribute ["
                + attribute.getName() + "]. Assigning null-Value.", exception);
              byteValue = null;
            } catch (final SQLException exception) {
              _log.error("Encountered SQLException when retrieving BLOB-Stream for attribute ["
                + attribute.getName() + "]. Assigning null-Value.", exception);
              byteValue = null;
            }
            record.setAttachment(attribute.getName(), byteValue);
          } else if (attachmentValue instanceof Clob) {
            final Clob clob = (Clob) attachmentValue;
            byte[] byteValue = null;
            try {
              byteValue = IOUtils.toByteArray(clob.getAsciiStream());
            } catch (final IOException exception) {
              _log.error("Encountered IOException when getting byte[]-Value of CLOB-Stream for attribute ["
                + attribute.getName() + "]. Assigning null-Value.", exception);
              byteValue = null;
            } catch (final SQLException exception) {
              _log.error("Encountered SQLException when retrieving CLOB-Stream for attribute ["
                + attribute.getName() + "]. Assigning null-Value.", exception);
              byteValue = null;
            }
            record.setAttachment(attribute.getName(), byteValue);
          } else {
            throw new IllegalArgumentException(
              "Unsupported Attachment type [" + attachmentValue.getClass().getName() + "]");
          }
        }
        // else: attribute is NOT an attachment ...
      } else {
        final Object value = readAttribute(data, attribute);
        if (value != null) {
          if (value instanceof Object[]) {
            try {
              final AnySeq anySeq = record.getFactory().createAnySeq();
              for (Object object : (Object[]) value) {
                anySeq.add(record.getFactory().parseFromObject(object));
              }
              metaData.put(attribute.getName(), anySeq);
            } catch (final InvalidValueTypeException exception) {
              _log.error("Could not set value of attribute [" + attribute.getName() + "] as LiteralArrayAttribute.",
                exception);
            }
          } else {
            try {
              metaData.put(attribute.getName(), record.getFactory().parseFromObject(value));
            } catch (final InvalidValueTypeException exception) {
              _log.error("Could not set value of attribute [" + attribute.getName() + "] as SimpleLiteralAttribute.",
                exception);
            }
          }
        }
      }
    }
    return record;
  }

  /**
   * @return The Crawlers ProducerThread.
   * @see CrawlingProducerThread
   */
  public Thread getProducerThread() {
    return _producerThread;
  }

  /**
   * This method evaluates if there is any more data to be fetched from the data source.
   * <p>
   * This is the case if any of the following is true
   * </p>
   * <ul>
   * <li>the {@link #_retrievalResultSet} has at least one more row to fetch</li>
   * <li>there is at least one more {@link GroupingRange} in {@link #_groupingRanges} to be processed</li>
   * </ul>
   *
   * @return True if there is at least one more database row to be processed, false otherwise.
   * @throws CrawlerCriticalException
   *           If an {@link SQLException} is encountered when checking the {@link #_retrievalResultSet} for more rows.
   */
  private boolean hasNext() throws CrawlerCriticalException {
    try {
      if (_retrievalResultSet == null) {
        populateRetrievalResultSet();
      }
      if (_retrievalResultSet.next()) {
        _retrievalResultSet.previous();
        return true;
      } else if (_groupingRangesIterator != null && _groupingRangesIterator.hasNext()) {
        _currentGroupingRange = _groupingRangesIterator.next();
        populateRetrievalResultSet();
        while (_retrievalResultSet.next()) {
          _retrievalResultSet.previous();
          return true;
        }
      }
    } catch (final SQLException e) {
      throw new CrawlerCriticalException("Encountered SQLException in hasNext()-Procedure", e);
    }
    return false;
  }

  /**
   * Checks whether there is more data to be returned for a call to {@link #getNextDeltaIndexingData()}.
   *
   * @return Boolean flag: {@code true} if the {@link #_internalQueue} is not empty, {@code false} otherwise.
   */
  private boolean hasNextItemInQueue() {
    while (_isProducerRunning && _internalQueue.isEmpty()) {
      try {
        Thread.sleep(HAS_NEXT_ITEM_THREAD_WAIT);
      } catch (final InterruptedException e) {
        _log.trace("Got interrupted while waiting for queue to fill up in hasNextItemInQueue()-Procedure");
      }
    }
    return !_internalQueue.isEmpty();
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void initialize(final DataSourceConnectionConfig config) throws CrawlerException, CrawlerCriticalException {
    if (_log.isDebugEnabled()) {
      _log.debug("Initializing JdbcCrawler...");
    }
    synchronized (_openedMonitor) {
      if (_opened) {
        throw new CrawlerCriticalException("Crawler is already busy. This should not be the case when initializing.");
      }
      _opened = true;
      _forceClosing = false;
    }
    _performanceCounters =
      new CrawlerPerformanceCounterHelper<JdbcCrawlerPerformanceAgent>(config, hashCode(),
        JdbcCrawlerPerformanceAgent.class);
    _isProducerRunning = true;
    _internalQueue = new ArrayBlockingQueue<DataReference>(INTERNAL_QUEUE_CAPACITY);
    _dataSourceID = config.getDataSourceID();
    final Attributes attributes = config.getAttributes();
    final List<IAttribute> attributeList = attributes.getAttribute();
    _attributes = attributeList.toArray(new Attribute[attributeList.size()]);
    _process = (Process) config.getProcess();
    _recordCache = new HashMap<ConnectivityId, Record>();
    _producerThread = new CrawlingProducerThread();
    _producerThread.start();
  }

  /**
   * This method populates the {@link #_attributeMapping} {@link HashMap}, mapping attribute names to column indexes in
   * the {@link #_retrievalResultSet}. It uses the columnNames provided by the {@link ResultSetMetaData}-object
   * {@link #_retrievalResultSetMetaData}
   *
   * @throws SQLException
   *           If any of the operations on {@link #_retrievalResultSetMetaData} failed for whatever reason.
   */
  private void populateAttributeMapping() throws SQLException {
    _attributeMapping = new HashMap<String, Integer>();
    for (final Attribute attribute : _attributes) {
      for (int i = 1; i <= _retrievalResultSetMetaData.getColumnCount(); i++) {
        if (_retrievalResultSetMetaData.getColumnName(i).trim().equalsIgnoreCase(attribute.getColumnName().trim())) {
          _attributeMapping.put(attribute.getName(), i);
          _log.debug("Mapping dataset column with name [" + _retrievalResultSetMetaData.getColumnName(i)
            + "] to Attribute with name [" + attribute.getName() + "] which declares SQL-Type ["
            + attribute.getSqlType() + "] and selects column [" + attribute.getColumnName() + "]");
          break;
        }
      }
    }
    if (_attributeMapping.size() != _retrievalResultSetMetaData.getColumnCount()
      || _attributeMapping.size() != _attributes.length) {
      _log.warn("Only " + _attributeMapping.size() + " Attribute-Mappings could be found. "
        + _retrievalResultSetMetaData.getColumnCount() + " Resultset-Columns and "
        + (_attributes.length - _attributeMapping.size())
        + " Schema-Attributes remain unmapped. Check name and type conformance");
    }
  }

  /**
   * This method populates the {@link #_retrievalResultSet} using the {@link PreparedStatement}
   * {@link #_retrievalStatement}. If groupings are enabled the respective parameters of the statement are set to the
   * values of {@link #_currentGroupingRange} before statement execution. After executing the statement the member
   * {@link #_retrievalResultSetMetaData} is set.
   *
   * @throws SQLException
   *           If any of the JDBC operations fail for whatever reason.
   */
  private void populateRetrievalResultSet() throws SQLException {
    // free resources to avoid oom-exceptions
    if (_retrievalResultSet != null) {
      _log.info("Closing Retrieval Resultset for re-population");
      _retrievalResultSet.close();
      _retrievalResultSet = null;
    }
    if (_groupingRanges != null && _groupingRanges.size() > 0) {
      // if groupings are enabled we need to set the retrieval parameters in the statement
      // insert "start"-values
      final PreparedStatementTypedParameter[] startValues = _currentGroupingRange.getStartValues();
      for (int i = 0; i < startValues.length; i++) {
        _log.trace("Setting Start-Value [" + startValues[i].toString() + "] as statement parameter");
        startValues[i].applyToPreparedStatement(_retrievalStatement);
      }
      // insert "end"-values
      final PreparedStatementTypedParameter[] endValues = _currentGroupingRange.getEndValues();
      for (int i = 0; i < endValues.length; i++) {
        _log.trace("Setting End-Value [" + endValues[i].toString() + "] as statement parameter");
        endValues[i].applyToPreparedStatement(_retrievalStatement);
      }
    }
    // execute statement and assign resultset to member variable
    _log.trace("Executing retrieval statement");
    final ResultSet resultSet = _retrievalStatement.executeQuery();
    _retrievalResultSet = resultSet;
    _retrievalResultSetMetaData = _retrievalResultSet.getMetaData();
    if (_attributeMapping == null) {
      populateAttributeMapping();
    }
  }

  /**
   * Sets up the JDBC-{@link Connection} to the datasource as specified in the {@link Database}-attribute in the
   * {@link DataSourceConnectionConfig} and sets the {@link #_connection} property accordingly.
   *
   * @throws CrawlerCriticalException
   *           If any of the following conditions arises:
   *           <ul>
   *           <li>The Driver class cannot be found</li>
   *           <li>The Driver class cannot be instantiated</li>
   *           <li>A {@link SQLException} occurs when creating the {@link Connection}</li>
   *           </ul>
   */
  private void prepareConnection() throws CrawlerCriticalException {
    final Database database = _process.getDatabase();
    final String driverName = database.getJdbcDriver();
    try {
      Class.forName(database.getJdbcDriver()).newInstance();
      if (_log.isDebugEnabled()) {
        _log.debug("Loaded JDBC driver [" + driverName + "]");
      }
    } catch (final ClassNotFoundException e) {
      final String errorMessage = "Unable to load jdbc driver [" + driverName + "]";
      throw new CrawlerCriticalException(errorMessage, e);
    } catch (final InstantiationException e) {
      final String errorMessage = "Unable to load jdbc driver [" + driverName + "]";
      throw new CrawlerCriticalException(errorMessage, e);
    } catch (final IllegalAccessException e) {
      final String errorMessage = "Unable to load jdbc driver [" + driverName + "]";
      throw new CrawlerCriticalException(errorMessage, e);
    }
    try {
      if (_log.isInfoEnabled()) {
        _log.info("Connecting to database [" + database.getConnection() + "]");
      }
      _connection =
        DriverManager.getConnection(database.getConnection(), database.getUser(), database.getPassword());
    } catch (final SQLException e) {
      final String errorMessage = "Failed to connect to database [" + database.getConnection() + "]";
      throw new CrawlerCriticalException(errorMessage, e);
    }
  }

  /**
   * Populates the {@link #_groupingRanges}-{@link ArrayList} according to the configuration specified in the
   * {@link Grouping}-attribute of the {@link DataSourceConnectionConfig}. The SQL-statements needed for this are
   * executed via a local {@link Statement}-object, just as the data is retrieved via a local {@link ResultSet}-object.
   *
   * @throws CrawlerCriticalException
   *           If any of the following conditions occur:
   *           <ul>
   *           <li>Any of the columns used for grouping has a data type which is not supported: !(Number||String)</li>
   *           <li>A SQLException is raised while retrieving the grouping data from the database</li>
   *           </ul>
   */
  private void prepareGrouping() throws CrawlerCriticalException {
    final Grouping grouping = _process.getSelections().getGrouping();
    BigInteger stepping = BigInteger.ONE;
    ResultSet groupingResultSet = null;
    ResultSetMetaData groupingMetaData = null;
    if (grouping != null) {
      _groupingRanges = new ArrayList<GroupingRange>();
      final String groupingSQL = grouping.getSQL();
      stepping = grouping.getStepping();
      Statement groupingStatement = null;
      try {
        groupingStatement =
          _connection.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_READ_ONLY);
        _log.debug("Executing SQL for grouping preparation: [" + groupingSQL + "]");
        groupingResultSet = groupingStatement.executeQuery(groupingSQL);
        groupingMetaData = groupingResultSet.getMetaData();
        _log.debug("Retrieved groupingResultSet with [" + groupingMetaData.getColumnCount() + "] columns");
        for (int i = 1; i <= groupingMetaData.getColumnCount(); i++) {
          Class<?> columnClass = null;
          try {
            columnClass = Class.forName(groupingMetaData.getColumnClassName(i));
          } catch (final ClassNotFoundException e) {
            _log.error("This should never happen: the class[" + groupingMetaData.getColumnClassName(i)
              + "] for the column " + i + " in the grouping result set could not be resolved");
          }
          if (Number.class.isAssignableFrom(columnClass)) {
            _log.debug("RowNr " + i + " of the grouping result set is of type [" + columnClass.getName()
              + "], which is derived from [Number]: fine for use in a grouping");
            continue;
          } else if (String.class.equals(columnClass)) {
            _log.debug("RowNr " + i + " of the grouping result set is of type [String]: fine for use in a grouping");
          } else {
            throw new CrawlerCriticalException("RowNr " + i + " of the grouping result set is of type ["
              + columnClass.getName() + "]: NOT supported as a grouping field");
          }
        }
        int groupingRecords = 0;
        PreparedStatementTypedParameter[] startValues = null;
        PreparedStatementTypedParameter[] endValues = null;
        final PreparedStatementTypedParameter[] finalValues =
          new PreparedStatementTypedParameter[groupingMetaData.getColumnCount()];
        while (groupingResultSet.next()) {
          if (groupingRecords == 0) {
            startValues = new PreparedStatementTypedParameter[groupingMetaData.getColumnCount()];
            for (int i = 1; i <= groupingMetaData.getColumnCount(); i++) {
              startValues[i - 1] =
                new PreparedStatementTypedParameter(groupingResultSet.getObject(i), (i * 2) - 1,
                  groupingMetaData.getColumnType(i));
            }
          }
          groupingRecords++;
          if (groupingRecords == stepping.intValue()) {
            endValues = new PreparedStatementTypedParameter[groupingMetaData.getColumnCount()];
            for (int i = 1; i <= groupingMetaData.getColumnCount(); i++) {
              endValues[i - 1] =
                new PreparedStatementTypedParameter(groupingResultSet.getObject(i), i * 2,
                  groupingMetaData.getColumnType(i));
            }
            final GroupingRange groupingRange = new GroupingRange(startValues, endValues);
            _groupingRanges.add(groupingRange);
            if (_log.isTraceEnabled()) {
              _log.trace("Added GroupingRange: [" + groupingRange.toString() + "] to _groupingRanges");
            }
            groupingRecords = 0;
            continue;
          }
          for (int i = 1; i <= groupingMetaData.getColumnCount(); i++) {
            finalValues[i - 1] =
              new PreparedStatementTypedParameter(groupingResultSet.getObject(i), i * 2,
                groupingMetaData.getColumnType(i));
          }
        }
        if (groupingRecords != 0 && stepping.intValue() != 1) {
          final GroupingRange finalgroupingRange = new GroupingRange(startValues, finalValues);
          _groupingRanges.add(finalgroupingRange);
          _log.debug("Added final GroupingRange [" + finalgroupingRange.toString() + "] to _groupingRanges");
        }
      } catch (final SQLException e1) {
        throw new CrawlerCriticalException("Encountered SQLException while preparing Groupings");
      } finally {
        try {
          if (groupingStatement != null) {
            groupingStatement.close();
          }
        } catch (final SQLException e) {
          _log.error("Could not close Grouping statement");
        }
        try {
          groupingResultSet.close();
          _log.debug("Closed Grouping Resultset");
        } catch (final SQLException e) {
          _log.error("Could not close Resultset for Grouping statement");
        }
      }
    }
    // set current grouping to first grouping in list (if list is not empty)
    _groupingRangesIterator = _groupingRanges.iterator();
    if (_groupingRangesIterator.hasNext()) {
      _currentGroupingRange = _groupingRangesIterator.next();
    }
    _log.debug(String.format("Prepared %d grouping ranges based on specified stepping of %d",
      _groupingRanges.size(), stepping.intValue()));
  }

  /**
   * This method is called during initialization and assembles the {@link PreparedStatement}-member
   * {@link #_retrievalStatement} used for data retrieval according to the configuration in the {@link Selections}
   * -attribute of the {@link DataSourceConnectionConfig}. If grouping is enabled in the
   * {@link DataSourceConnectionConfig} the {@link #prepareGrouping()}-method is called.
   *
   * @throws CrawlerCriticalException
   *           If the {@link PreparedStatement} could not be created on the {@link #_connection}
   */
  private void prepareRetrievalStatement() throws CrawlerCriticalException {
    String retrievalSql = _process.getSelections().getSQL();
    retrievalSql = retrievalSql.trim();
    if (_process.getSelections().getGrouping() != null) {
      prepareGrouping();
      _log.debug("Transforming SQL passed from index: [" + retrievalSql + "]");
      final Pattern groupingPlaceholderPattern = Pattern.compile("%\\d\\d(min|max)");
      final Matcher matcher = groupingPlaceholderPattern.matcher(retrievalSql);
      final String transformedSQL = matcher.replaceAll("?");
      _log.debug("Using transformed SQL for PreparedStatement: [" + transformedSQL + "]");
      retrievalSql = transformedSQL;
    }
    try {
      _retrievalStatement =
        _connection.prepareStatement(retrievalSql, ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_READ_ONLY);
    } catch (final SQLException e) {
      throw new CrawlerCriticalException("Failed to create statement on database connection", e);
    }
  }

  /**
   * Reads the specified attribute's value from the database row passed as {@link Object[]}. Uses the
   * {@link #_attributeMapping} {@link Map} to select the right index in {@code data}.
   *
   * @param data
   *          A database row.
   * @param attribute
   *          The attribute whose value is to be determined.
   * @return the attribute value as {@link Object}.
   * @throws CrawlerCriticalException
   *           If any of the following conditions occur:
   *           <ul>
   *           <li>The parameter {@code data} was {@code null}.</li>
   *           <li>No mapping for the attribute could be found</li>
   *           </ul>
   */
  private Object readAttribute(final Object[] data, final Attribute attribute) throws CrawlerCriticalException {
    if (data == null) {
      throw new CrawlerCriticalException("Could not extract required attribute [" + attribute.getName()
The data Object to read it from was null"); } int index = -1; try { index = _attributeMapping.get(attribute.getName()); return data[index - 1]; } catch (final ArrayIndexOutOfBoundsException e) { throw new CrawlerCriticalException( "Could not extract required attribute [" + attribute.getName() + "]"); } } /** * This method is called by the {@code public}-methods of {@link JdbcCrawler} prior to any other activity to ensure * that critical exceptions that were caused in the Producer-Thread and stored in the class member * {@link #_producerException} are delegated to the client. * * @throws CrawlerCriticalException * If a critical exception was stored in {@link #_producerException} it gets thrown here. */ private void rethrowProducerExceptions() throws CrawlerCriticalException { if (_producerException != null) { if (_log.isDebugEnabled()) { _log.debug("Rethrowing Producer Exceptions"); } throw _producerException; } } /** * Inner class of {@link JdbcCrawler} subclassing the {@link Thread}-class. It is instantiated in * {@link JdbcCrawler#initialize(DataSourceConnectionConfig)} and handles the actual crawling process so crawling can * happen asynchronously. * * @author mbreidenband * */ private class CrawlingProducerThread extends Thread { /** * Crawls the JDBC-datasource. {@inheritDoc} * * @see java.lang.Thread#run() */ @Override public void run() { try { try { prepareConnection(); } catch (final CrawlerCriticalException e) { _producerException = e; _performanceCounters.increment(POC_CRITICAL_CRAWLING_EXCEPTIONS); _log.error("Encountered critical Exception in prepareConnection() procedure", e); } try { prepareRetrievalStatement(); } catch (final CrawlerCriticalException e) { _producerException = e; _performanceCounters.increment(POC_CRITICAL_CRAWLING_EXCEPTIONS); _log.error("Encountered critical Exception in prepareRetievalStatement() procedure", e); } while (!_forceClosing && hasNext()) { final Object[] values = new Object[_retrievalResultSetMetaData.getColumnCount()]; _retrievalResultSet.next(); for (int i = 1; i <= values.length; i++) { values[i - 1] = _retrievalResultSet.getObject(i); } _performanceCounters.increment(POC_DATA_ROWS_RETRIEVED); boolean waiting = true; DataReference dataRef = null; while (waiting) { if (dataRef == null) { try { dataRef = createDataReference(values); _performanceCounters.increment(POC_DATA_REFS_CREATED); } catch (final InvalidValueTypeException e) { _performanceCounters.increment(POC_CRAWLING_EXCEPTIONS); _log.error("", e); } try { if (_log.isTraceEnabled()) { _log.trace("Putting DataReference [" + dataRef + "] in internal queue"); } synchronized (_openedMonitor) { _internalQueue.put(dataRef); } } catch (final InterruptedException e) { if (_log.isTraceEnabled()) { _log.trace("Got interrupted..."); } } } waiting = false; } } } catch (final CrawlerCriticalException e) { _producerException = e; _performanceCounters.increment(POC_CRITICAL_CRAWLING_EXCEPTIONS); _log.error("Encountered critical Exception in Producer-Thread", e); } catch (final SQLException e) { _producerException = new CrawlerCriticalException("Encountered SQLException in Producer-Thread", e); _performanceCounters.increment(POC_CRITICAL_CRAWLING_EXCEPTIONS); _log.error("Encountered SQLException in Producer-Thread", e); } catch (final RuntimeException e) { _producerException = new CrawlerCriticalException("Encountered RuntimeException in Producer-Thread", e); _performanceCounters.increment(POC_CRITICAL_CRAWLING_EXCEPTIONS); _log.error("Encountered RuntimeException in ProducerThread", e); } finally { 
        _isProducerRunning = false;
        if (_forceClosing) {
          _log.info("DbCrawling was terminated by close()-Procedure");
        } else if (_producerException != null) {
          _log.info("DbCrawling terminated with Exception");
        } else {
          _log.info("DbCrawling terminated normally");
        }
      }
    }
  }

  /**
   * {@inheritDoc}
   *
   * @see org.eclipse.smila.connectivity.framework.Crawler#getNext()
   */
  @Override
  public DataReference[] getNext() throws CrawlerException, CrawlerCriticalException {
    rethrowProducerExceptions();
    while (hasNextItemInQueue()) {
      try {
        final DataReference dataRef = _internalQueue.poll(QUEUE_POLL_WAITING, TimeUnit.MILLISECONDS);
        if (dataRef != null) {
          synchronized (_openedMonitor) {
            final List<DataReference> tempList = new ArrayList<DataReference>();
            tempList.add(dataRef);
            final int size = _internalQueue.drainTo(tempList, MAX_QUEUE_SIZE - 1);
            _performanceCounters.incrementBy(POC_DATA_REFS_RETRIEVED_BY_CLIENT, size + 1);
            return tempList.toArray(new DataReference[size + 1]);
          }
        }
      } catch (final InterruptedException e) {
        ; // nothing
      } catch (final Throwable t) {
        _log.error("Error occurred in getNext()", t);
        throw new CrawlerCriticalException(t);
      }
    }
    return null;
  }

  /**
   * {@inheritDoc}
   *
   * @see org.eclipse.smila.connectivity.framework.CrawlerCallback#dispose(org.eclipse.smila.datamodel.id.Id)
   */
  @Override
  public void dispose(final ConnectivityId id) {
    _recordCache.remove(id);
  }

  /**
   * {@inheritDoc}
   *
   * @see org.eclipse.smila.connectivity.framework.CrawlerCallback#getAttachment(org.eclipse.smila.datamodel.id.Id,
   *      java.lang.String)
   */
  @Override
  public byte[] getAttachment(final ConnectivityId id, final String name) throws CrawlerException,
    CrawlerCriticalException {
    final Record record = _recordCache.get(id);
    if (record == null) {
      throw new CrawlerException("The requested record with id [" + id + "] was not found in the Crawler's cache");
    }
    return record.getAttachment(name);
  }

  /**
   * {@inheritDoc}
   *
   * @see org.eclipse.smila.connectivity.framework.CrawlerCallback#getAttachmentNames(org.eclipse.smila.datamodel.id.Id)
   */
  @Override
  public String[] getAttachmentNames(final ConnectivityId id) throws CrawlerException, CrawlerCriticalException {
    final Record record = _recordCache.get(id);
    if (record == null) {
      throw new CrawlerException("The requested record with id [" + id + "] was not found in the Crawler's cache");
    }
    final ArrayList<String> names = new ArrayList<String>();
    final Iterator<String> it = record.getAttachmentNames();
    while (it.hasNext()) {
      names.add(it.next());
    }
    return names.toArray(new String[] {});
  }

  /**
   * {@inheritDoc}
   *
   * @see org.eclipse.smila.connectivity.framework.CrawlerCallback#getMObject(org.eclipse.smila.datamodel.id.Id)
   */
  @Override
  public AnyMap getMetadata(final ConnectivityId id) throws CrawlerException, CrawlerCriticalException {
    final Record record = _recordCache.get(id);
    if (record == null) {
      throw new CrawlerException("The requested record with id [" + id + "] was not found in the Crawler's cache");
    }
    return record.getMetadata();
  }
}
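In SMILA the crawler above is normally driven by the framework itself, not by user code, but a minimal sketch of the call sequence may help when reading the class. The sketch below only illustrates the life cycle of the methods shown above (initialize(), repeated getNext() until it returns null, getMetadata()/dispose() per reference, close()). The loadConfig() helper is hypothetical: how a DataSourceConnectionConfig complying with JdbcDataSourceConnectionConfigSchema.xsd is obtained is not covered by this tutorial.

package org.eclipse.smila.connectivity.framework.crawler.jdbc;

import org.eclipse.smila.connectivity.framework.DataReference;
import org.eclipse.smila.connectivity.framework.schema.config.DataSourceConnectionConfig;
import org.eclipse.smila.datamodel.AnyMap;

/** Minimal usage sketch; not part of the crawler implementation. */
public final class JdbcCrawlerUsageSketch {

  public static void main(final String[] args) throws Exception {
    final DataSourceConnectionConfig config = loadConfig(); // hypothetical helper, see note above
    final JdbcCrawler crawler = new JdbcCrawler();
    try {
      // starts the CrawlingProducerThread, which fills the internal queue asynchronously
      crawler.initialize(config);
      DataReference[] refs;
      // getNext() returns up to MAX_QUEUE_SIZE references per call and null once the producer is done
      while ((refs = crawler.getNext()) != null) {
        for (final DataReference ref : refs) {
          // read the metadata the crawler cached for this reference ...
          final AnyMap metadata = crawler.getMetadata(ref.getId());
          System.out.println(metadata);
          // ... and release the cached record afterwards
          crawler.dispose(ref.getId());
        }
      }
    } finally {
      // releases the JDBC resources and stops the producer thread
      crawler.close();
    }
  }

  private static DataSourceConnectionConfig loadConfig() {
    // placeholder: loading a DataSourceConnectionConfig is outside the scope of this tutorial
    throw new UnsupportedOperationException("config loading not shown in this tutorial");
  }
}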