Java tutorial: FusionDocumentWriter (HBase Indexer to Lucidworks Fusion)

The class below writes updates from the HBase Indexer to Lucidworks Fusion: new documents are flattened (parent/child documents are denormalized) and posted to a Fusion indexing pipeline, atomic updates are routed directly to a Solr proxy, and deletes remove both the parent rows and their flattened child documents.
/*
 * Copyright 2013 NGDATA nv
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.ngdata.hbaseindexer.indexer;

import com.yammer.metrics.Metrics;
import com.yammer.metrics.core.Meter;
import com.yammer.metrics.core.MetricName;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.LBHttpSolrClient;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.util.DateUtil;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.TimeUnit;

/**
 * Writes updates (new documents and deletes) to Fusion.
 */
public class FusionDocumentWriter implements SolrInputDocumentWriter {

  private static final Log log = LogFactory.getLog(FusionDocumentWriter.class);

  public static MetricName metricName(Class<?> producerClass, String metric, String indexerName) {
    return new MetricName("Lucidworks", producerClass.getSimpleName(), metric, indexerName);
  }

  private Meter atomicUpdatesReceivedMeter;
  private Meter fusionAddMeter;
  private Meter fusionAddErrorMeter;
  private Meter fusionDocsReceivedMeter;
  private Meter solrAtomicUpdatesMeter;
  private Meter solrAtomicUpdatesErrorMeter;
  private Meter fusionDocsProcessedMeter;
  private Meter indexDeleteMeter;

  protected FusionPipelineClient pipelineClient;
  private String solrProxies;
  protected SolrClient solrProxy;
  private final String deleteByQueryAppendString = "|||*";
  private String strIndexName;

  public FusionDocumentWriter(String indexName, Map<String, String> connectionParams) {

    String fusionEndpoint = connectionParams.get("fusion.pipeline");
    if (fusionEndpoint == null)
      throw new IllegalStateException("The 'fusion.pipeline' parameter is required when using Lucidworks Fusion!");

    String fusionSolrProxy = connectionParams.get("fusion.solrproxy");
    if (fusionSolrProxy == null)
      throw new IllegalStateException("The 'fusion.solrproxy' parameter is required when using Lucidworks Fusion!");

    String fusionUser = connectionParams.get("fusion.user");
    String fusionPass = connectionParams.get("fusion.pass");
    String fusionRealm = connectionParams.get("fusion.realm");

    log.info("Connecting to Fusion pipeline " + fusionEndpoint + " as " + fusionUser + ", realm=" + fusionRealm);
    try {
      pipelineClient = new FusionPipelineClient(fusionEndpoint, fusionUser, fusionPass, fusionRealm);
    } catch (Exception exc) {
      log.error("Failed to create FusionPipelineClient for " + fusionEndpoint + " due to: " + exc);
      if (exc instanceof RuntimeException) {
        throw (RuntimeException) exc;
      } else {
        throw new RuntimeException(exc);
      }
    }

    try {
      log.info("method:FusionDocumentWriter: Create Solr proxy next; fusionSolrProxy:[" + fusionSolrProxy +
          "], indexName:[" + indexName + "].");
      solrProxy = new LBHttpSolrClient(pipelineClient.getHttpClient(), fusionSolrProxy.split(","));
    } catch (Exception exc) {
      log.error("Failed to create LBHttpSolrClient for " + fusionSolrProxy + " due to: " + exc);
      if (exc instanceof RuntimeException) {
        throw (RuntimeException) exc;
      } else {
        throw new RuntimeException(exc);
      }
    }
    solrProxies = fusionSolrProxy; // just used for logging below

    fusionAddMeter = Metrics.newMeter(metricName(getClass(), "Docs sent to Fusion", indexName),
        "Documents sent to Fusion", TimeUnit.SECONDS);
    fusionAddErrorMeter = Metrics.newMeter(metricName(getClass(), "Failed Fusion Docs", indexName),
        "Failed docs sent to Fusion", TimeUnit.SECONDS);
    fusionDocsReceivedMeter = Metrics.newMeter(metricName(getClass(), "Fusion Docs Received", indexName),
        "Docs received (to be processed for Fusion)", TimeUnit.SECONDS);
    fusionDocsProcessedMeter = Metrics.newMeter(metricName(getClass(), "Fusion Docs Flattened", indexName),
        "Processed docs to send to Fusion (flattened parent / child docs)", TimeUnit.SECONDS);
    atomicUpdatesReceivedMeter = Metrics.newMeter(metricName(getClass(), "Atomic Updates Received", indexName),
        "Atomic updates received (before processing)", TimeUnit.SECONDS);
    solrAtomicUpdatesMeter = Metrics.newMeter(metricName(getClass(), "Atomic Updates Sent", indexName),
        "Atomic updates sent to Solr", TimeUnit.SECONDS);
    solrAtomicUpdatesErrorMeter = Metrics.newMeter(metricName(getClass(), "Failed Atomic Updates", indexName),
        "Failed atomic updates due to Solr errors", TimeUnit.SECONDS);
    indexDeleteMeter = Metrics.newMeter(metricName(getClass(), "Index deletes", indexName),
        "Documents deleted from Solr index", TimeUnit.SECONDS);

    strIndexName = indexName;

    log.info("Fusion document writer initialized successfully for Fusion end point:[" + fusionEndpoint + "]");
  }
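
  /*
   * Configuration sketch (illustrative only; the URLs, credentials and indexer name below are
   * hypothetical). The constructor above reads its settings from the indexer connection parameters:
   *
   *   Map<String, String> connectionParams = new HashMap<String, String>();
   *   connectionParams.put("fusion.pipeline", "http://localhost:8764/api/apollo/index-pipelines/my-pipeline/collections/my-collection/index");
   *   connectionParams.put("fusion.solrproxy", "http://localhost:8764/api/apollo/solr/my-collection");
   *   connectionParams.put("fusion.user", "admin");
   *   connectionParams.put("fusion.pass", "password123");
   *   connectionParams.put("fusion.realm", "native");
   *   FusionDocumentWriter writer = new FusionDocumentWriter("my-indexer", connectionParams);
   *
   * Only fusion.pipeline and fusion.solrproxy are mandatory; fusion.solrproxy may be a
   * comma-separated list of proxy URLs, which is split and load-balanced by LBHttpSolrClient.
   */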

  public void add(int shard, Map<String, SolrInputDocument> inputDocumentMap) throws SolrServerException, IOException {

    // shs: Processing added to handle atomic updates to documents that were being passed in via the inputDocumentMap.
    List<SolrInputDocument> inputDocuments;
    try {
      inputDocuments = addAtomicUpdateDocuments(inputDocumentMap.values());
      // Catch the exception so that any remaining documents in the collection will be able to be processed and
      // submitted to the Fusion indexing pipeline.
    } catch (Exception exc) {
      log.error("Failed to process atomic updates due to: " + exc, exc);
      throw new RuntimeException(exc);
    }

    // shs: Only process the inputDocuments collection if there are still documents remaining in the collection after
    // the atomic updates have been processed.
    if (inputDocuments != null && !inputDocuments.isEmpty()) {
      fusionDocsReceivedMeter.mark(inputDocuments.size());

      List<Map<String, Object>> fusionDocs;
      try {
        fusionDocs = toJsonDocs(null, inputDocuments, 0);
      } catch (Exception exc) {
        throw new RuntimeException(exc);
      }

      int numFusionDocsRcvd = fusionDocs != null ? fusionDocs.size() : 0;
      if (numFusionDocsRcvd > 0) {
        fusionDocsProcessedMeter.mark(numFusionDocsRcvd);
        try {
          pipelineClient.postBatchToPipeline(fusionDocs);
          fusionAddMeter.mark(numFusionDocsRcvd);
        } catch (Exception e) {
          log.warn("FusionPipelineClient failed to process batch of " + numFusionDocsRcvd +
              " docs due to: " + e + "; will re-try each doc individually");
          retryFusionAddsIndividually(fusionDocs);
        }
      }
    }
  }

  /**
   * shs: New method added for handling Atomic Updates.
   * @param inputDocuments The collection of incoming documents, some of which may be atomic update documents.
   * @return A list containing only the documents that are NOT atomic updates; the atomic update documents are
   *         submitted directly to Solr and are not included. The null value will be returned if all documents
   *         from the collection of SolrInputDocuments were atomic updates.
   * @throws SolrServerException
   * @throws IOException
   */
  protected List<SolrInputDocument> addAtomicUpdateDocuments(Collection<SolrInputDocument> inputDocuments)
      throws SolrServerException, IOException {

    // Visit each document in the collection and determine if it is a document that is an atomic update request. If it
    // is, then add it to atomicUpdateDocuments; otherwise keep it in the 'retain' list.
    Collection<SolrInputDocument> atomicUpdateDocuments = new ArrayList<SolrInputDocument>();
    List<SolrInputDocument> retain = null; // docs that aren't atomic updates
    Iterator<SolrInputDocument> docIterator = inputDocuments.iterator();
    while (docIterator.hasNext()) {
      boolean documentIsAtomic = false;
      SolrInputDocument doc = docIterator.next();
      for (SolrInputField solrInputField : doc.values()) {
        Object val = solrInputField.getValue();
        // If the type of the field just retrieved from the document is a Map object, then this could be an atomic
        // update document.
        if (val instanceof Map) {
          int entryCount = 0; // Used only for log messages. If the log message below is removed, this may also be deleted.
          for (Map.Entry<String, Object> entry : ((Map<String, Object>) val).entrySet()) {
            String key = entry.getKey();
            if (key.equals("add") || key.equals("set") || key.equals("remove") ||
                key.equals("removeregex") || key.equals("inc")) {
              // keep track of the time we saw this doc on the hbase side
              Map<String, String> atomicUpdateMap = new HashMap<String, String>();
              atomicUpdateMap.put("set", DateUtil.getThreadLocalDateFormat().format(new Date()));
              doc.addField("_hbasets_tdt", atomicUpdateMap);
              // The atomic update documents should be added to the atomicUpdateDocuments collection...
              atomicUpdateDocuments.add(doc);
              documentIsAtomic = true;
              break; // from the entry for loop
            }
            entryCount++;
          }
        }
        // The document was determined to be an atomic update document, no need to visit the rest of the fields, so
        // break from the solrInputField loop.
        if (documentIsAtomic) break;
      } // end for

      if (!documentIsAtomic) {
        if (retain == null) retain = new ArrayList<SolrInputDocument>();
        retain.add(doc);
      }
    } // end while more docs

    // The following processing is necessary IFF there are documents in the atomicUpdateDocuments collection, which
    // means that we have documents needing an atomic update.
    if (atomicUpdateDocuments != null && !atomicUpdateDocuments.isEmpty()) {
      // For Fusion document submission, the SolrInputDocument must be converted to JSON; however, since these
      // documents are going directly to Solr and are NOT being submitted to a Fusion indexing pipeline, they do not
      // need to be converted to JSON.
      atomicUpdatesReceivedMeter.mark(atomicUpdateDocuments.size());
      try {
        // Submit atomic update documents to Solr at this point.
        solrProxy.add(atomicUpdateDocuments, 500);
        solrAtomicUpdatesMeter.mark(atomicUpdateDocuments.size());
      } catch (Exception e) {
        log.warn("Solr failed to process batch of " + atomicUpdateDocuments.size() +
            " atomic updates due to: " + e + "; will re-try each doc individually");
        retrySolrAtomicUpdatesIndividually(atomicUpdateDocuments);
      }
    }

    // Finally, return any documents remaining for processing through the Fusion indexing pipeline.
    return retain;
  }
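
  /*
   * Illustrative example (field names and values are hypothetical): addAtomicUpdateDocuments() treats a
   * document as an atomic update when any field value is a Map keyed by one of Solr's atomic update
   * modifiers ("add", "set", "remove", "removeregex", "inc"). A document built like this would be routed
   * straight to the Solr proxy rather than to the Fusion pipeline:
   *
   *   SolrInputDocument doc = new SolrInputDocument();
   *   doc.addField("id", "row-123");
   *   Map<String, Object> priceUpdate = new HashMap<String, Object>();
   *   priceUpdate.put("set", 9.99);           // atomic "set" of the field value
   *   doc.addField("price_f", priceUpdate);   // the Map value marks this as an atomic update
   */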

  /**
   * shs: This method was modified to add the parent and docCount parameters. These were added to support the recursion
   * needed to flatten (aka denormalize) the parent/child document to any nesting level the input document may have
   * been nested to.
   * @param parentSolrDoc This is the parent document of the list of childSolrDocs. It is in SolrInputDocument format.
   *                      This parameter will be 'null' when first called. When this method is called recursively,
   *                      it will not be null.
   * @param childSolrDocs This is the list of documents that are the children (nested) documents of parentSolrDoc.
   * @param docCount      This is the number of child documents for a parent. It is used as the child ID.
   * @return The list of documents to be sent to Fusion for indexing, formatted as JSON documents.
   * @throws Exception
   */
  protected List<Map<String, Object>> toJsonDocs(SolrInputDocument parentSolrDoc,
                                                 Collection<SolrInputDocument> childSolrDocs,
                                                 int docCount) throws Exception {
    boolean isDebugEnabled = log.isDebugEnabled();
    if (isDebugEnabled) {
      log.debug("Method:toJsonDocs - Processing SolrInputDocuments: parent:[" +
          (parentSolrDoc == null ? "null" : parentSolrDoc.toString()) + "] with " +
          childSolrDocs.size() + " child documents.");
    }
    List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(childSolrDocs.size());
    for (SolrInputDocument childSolrDoc : childSolrDocs) {
      if (isDebugEnabled) {
        log.debug("Method:toJsonDocs - Processing SolrInputDocuments: parent:[" +
            (parentSolrDoc == null ? "null" : parentSolrDoc.toString()) + "]; child:[" +
            childSolrDoc.toString() + "]");
      }
      list.addAll(toJson(parentSolrDoc, childSolrDoc, docCount));
    }
    return list;
  }

  /**
   * shs: This method was modified from its original to add the input parameters 'parent' and 'docCount'. This was done
   * to enable recursion to be used to find all parent/child relationships to any level.
   * @param parent   The parent document for the child document being passed in. Parent may be null if the child being
   *                 passed in is a member of the initial documents submitted.
   * @param child    This is the child document. This document will be examined to determine if there are nested
   *                 child documents in it.
   * @param docCount This is the number of child documents for a parent. It is used as the child ID.
   * @return The list of JSON formatted documents, denormalized (aka flattened) and ready to be sent to the
   *         Fusion indexing pipeline endpoint.
   * @throws Exception
   */
  protected List<Map<String, Object>> toJson(SolrInputDocument parent, SolrInputDocument child, int docCount)
      throws Exception {
    List<Map<String, Object>> docs = new ArrayList<Map<String, Object>>();
    if (child != null) {
      List<SolrInputDocument> childDocs = child.getChildDocuments();
      if (childDocs != null && !childDocs.isEmpty()) {
        // Recursive call back to 'toJsonDocs()'
        docs.addAll(toJsonDocs(child, childDocs, docCount++));
      } else {
        // I'm not certain the increment should be on the docCount here...
        docs.add(doc2json(parent, child, docCount++));
      }
    }
    return docs;
  }

  /**
   * shs: This method was modified from its original to add the input parameters 'parent' and 'docCount'. This was done
   * to enable recursion to be used to find all parent/child relationships to any level. The method will merge the
   * fields in the parent document into the child document and will then convert that merged document into JSON
   * format and return that JSON document to the caller.
   * @param parent   The parent document for the child document being passed in. Parent may be null if the child being
   *                 passed in is a member of the initial documents submitted.
   * @param child    This is the child document. It will have the parent's fields merged into it.
   * @param docCount This is a count of the number of documents that have been added in this processing.
   * @return The merged parent and child documents as a JSON formatted document, in a format acceptable to
   *         Fusion.
   */
  protected Map<String, Object> doc2json(SolrInputDocument parent, SolrInputDocument child, int docCount) {
    Map<String, Object> json = new HashMap<String, Object>();

    if (child != null) {
      String docId = (String) child.getFieldValue("id");
      if (docId == null) {
        if (parent != null) {
          String parentId = (String) parent.getFieldValue("id");
          docId = parentId + "-" + docCount;
        }
        if (docId == null)
          throw new IllegalStateException("Couldn't resolve the id for document: " + child);
      }
      json.put("id", docId);

      List fields = new ArrayList();
      if (parent != null) {
        if (log.isDebugEnabled())
          log.debug("Method:doc2json - Merging parent and child docs, parent:[" + parent.toString() +
              "]; child[" + child.toString() + "].");
        // have a parent doc ... flatten by adding all parent doc fields to the child with prefix _p_
        for (String f : parent.getFieldNames()) {
          if ("id".equals(f)) {
            fields.add(mapField("_p_id", null /* field name prefix */, parent.getField("id").getFirstValue()));
          } else {
            appendField(parent, f, "_p_", fields);
          }
        }
      }

      for (String f : child.getFieldNames()) {
        if (!"id".equals(f)) { // id already added
          appendField(child, f, null, fields);
        }
      }

      // keep track of the time we saw this doc on the hbase side
      String tdt = DateUtil.getThreadLocalDateFormat().format(new Date());
      fields.add(mapField("_hbasets_tdt", null, tdt));
      if (log.isDebugEnabled())
        log.debug(strIndexName + " Reconcile id = " + docId + " and timestamp = " + tdt);

      json.put("fields", fields);
    } else {
      log.warn("method:doc2json - Input parameter 'child' was null.");
    }

    return json;
  }
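
  /*
   * Illustrative sketch of the output produced by doc2json()/mapField() above (ids, field names and
   * values are hypothetical). A child document with a parent is flattened into a single pipeline
   * document: parent fields are copied in with a "_p_" prefix, the parent id becomes "_p_id", and an
   * "_hbasets_tdt" timestamp field is appended:
   *
   *   {
   *     "id": "row-123-0",
   *     "fields": [
   *       { "name": "_p_id",        "value": "row-123" },
   *       { "name": "_p_title_s",   "value": "parent title" },
   *       { "name": "child_type_s", "value": "comment" },
   *       { "name": "_hbasets_tdt", "value": "2015-06-01T12:00:00.000Z" }
   *     ]
   *   }
   */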

  protected void appendField(SolrInputDocument doc, String f, String pfx, List fields) {
    SolrInputField field = doc.getField(f);
    int vc = field.getValueCount();
    if (vc <= 0)
      return; // no values to add for this field

    if (vc == 1) {
      Map<String, Object> fieldMap = mapField(f, pfx, field.getFirstValue());
      if (fieldMap != null)
        fields.add(fieldMap);
    } else {
      for (Object val : field.getValues()) {
        Map<String, Object> fieldMap = mapField(f, pfx, val);
        if (fieldMap != null)
          fields.add(fieldMap);
      }
    }
  }

  protected Map<String, Object> mapField(String f, String pfx, Object val) {
    Map<String, Object> fieldMap = new HashMap<String, Object>(10);
    String fieldName = (pfx != null) ? pfx + f : f;
    fieldMap.put("name", fieldName);
    fieldMap.put("value", val);
    return fieldMap;
  }

  private void retryFusionAddsIndividually(List<Map<String, Object>> docs) throws SolrServerException, IOException {
    for (Map<String, Object> next : docs) {
      try {
        pipelineClient.postBatchToPipeline(Collections.singletonList(next));
        fusionAddMeter.mark();
      } catch (Exception e) {
        log.error("Failed to index document [" + next.get("id") + "] due to: " + e + "; doc: " + next);
        fusionAddErrorMeter.mark();
      }
    }
  }

  /**
   * shs: This method was added to handle documents that are submitted as atomic updates. It is the final option to
   * get documents into the index if the previous indexing attempt failed when sending the entire collection of
   * documents to the Solr proxy. In this case, each document in the collection of documents will be submitted to
   * the Solr proxy, one document at a time.
   * @param docs The SolrInputDocuments to be added, one at a time.
   * @throws SolrServerException
   * @throws IOException
   */
  private void retrySolrAtomicUpdatesIndividually(Collection<SolrInputDocument> docs)
      throws SolrServerException, IOException {
    for (SolrInputDocument nextDoc : docs) {
      try {
        solrProxy.add(nextDoc);
        solrAtomicUpdatesMeter.mark();
      } catch (Exception e) {
        log.error("Failed to index atomic update document [" + nextDoc.get("id") + "] due to: " + e +
            "; doc: " + nextDoc + "; solrProxy:[" + solrProxy.toString() + "]");
        solrAtomicUpdatesErrorMeter.mark();
      }
    }
  }

  public void deleteById(int shard, List<String> idsToDelete) throws SolrServerException, IOException {
    int len = 15;
    String listLogInfo = (idsToDelete.size() > len)
        ? (idsToDelete.subList(0, len).toString() + " + " + (idsToDelete.size() - len) + " more ...")
        : idsToDelete.toString();
    log.info("Sending a deleteById '" + listLogInfo + "' to Solr(s) at: " + solrProxies);

    boolean deleteByIdsSucceeded = false;
    try {
      solrProxy.deleteById(idsToDelete, 500);
      indexDeleteMeter.mark(idsToDelete.size());
      deleteByIdsSucceeded = true;
      // This statement was inserted for Zendesk ticket 4186. If the delete by IDs succeeds above, we also need to
      // ensure that all child documents whose id contains the id (HBase row ID) followed immediately by the
      // deleteByQueryAppendString, followed by any additional characters, are also deleted from the index.
      deleteByQuery(idsToDelete, "id", deleteByQueryAppendString);
    } catch (Exception e) {
      log.error("Delete docs by " + (deleteByIdsSucceeded ? "query" : "id") + " failed due to: " + e +
          "; ids: " + idsToDelete +
          (deleteByIdsSucceeded ? " appended with '" + deleteByQueryAppendString + "'" : "") +
          ". Retry deleting individually by id.");
      retryDeletesIndividually(idsToDelete, deleteByIdsSucceeded);
    }
  }
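
  /*
   * Illustrative note (assumption based on the code above, not a guarantee about index contents):
   * deleteById() follows each successful delete-by-id with a delete-by-query so that flattened child
   * documents are removed as well. With deleteByQueryAppendString = "|||*", deleting HBase row
   * "row-123" also issues roughly:
   *
   *   solrProxy.deleteByQuery("id:row-123|||*", 500);
   *
   * which matches child documents whose ids begin with "row-123|||".
   */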

  /**
   * If deleting the documents using a list of document IDs fails, then retry the deletes while iterating through the
   * list of document IDs to be deleted. As the result of changes for ticket 4186, a delete by ID will also result in
   * a delete by query attempt, where the id is appended with a specific string. In the situation where there were
   * nested (parent/child) documents submitted for indexing, the documents were denormalized and, when this happens,
   * the document ID of the denormalized child documents contains the ID of the parent (the HBase row ID) followed by
   * a specific pattern of characters followed by an additional string that uniquely identifies the child document.
   * The delete by query will take the parent ID (HBase row ID) and append to it the specific pattern of characters
   * followed by the wild card character ('*'). This will result in all child documents of the parent being deleted
   * from the index.
   *
   * @param idsToDelete             The IDs (HBase row IDs) of the documents to be deleted
   * @param retryDeletesByQueryOnly If the delete by IDs succeeded and only the delete by queries failed, this will be
   *                                'true' to indicate that only the delete by query should be retried. If the value is
   *                                'false', the delete by id failed, meaning that the delete by query was never
   *                                attempted; thus, both the delete by id AND the delete by query must be attempted.
   * @throws SolrServerException
   * @throws IOException
   */
  private void retryDeletesIndividually(List<String> idsToDelete, boolean retryDeletesByQueryOnly)
      throws SolrServerException, IOException {
    for (String idToDelete : idsToDelete) {
      if (!retryDeletesByQueryOnly) {
        try {
          solrProxy.deleteById(idToDelete, 500);
          indexDeleteMeter.mark();
        } catch (SolrException e) {
          log.error("Failed to delete document with ID " + idToDelete + " due to: " + e +
              ". Retry deleting by query as '" + idToDelete + deleteByQueryAppendString + "'");
        }
      }
      // Try to delete the document by query so as to remove all child documents from the index.
      try {
        deleteByQuery(idToDelete + deleteByQueryAppendString);
      } catch (SolrException e) {
        log.error("Failed to delete document by query inside method 'retryDeletesIndividually' with ID " +
            idToDelete + deleteByQueryAppendString + " due to: " + e + ".");
      }
    }
  }

  public void deleteByQuery(String deleteQuery) throws SolrServerException, IOException {
    log.info("Sending a deleteByQuery '" + deleteQuery + "' to Solr(s) at: " + solrProxies);
    try {
      solrProxy.deleteByQuery(deleteQuery, 500);
    } catch (Exception e) {
      log.error("Failed to execute deleteByQuery(String deleteQuery): " + deleteQuery + " due to: " + e);
    }
  }

//  private void deleteByQuery(String idToDelete, String queryFieldName, String deleteQueryAppendStr)
//      throws SolrServerException, IOException {
//    try {
//      deleteByQuery(queryFieldName + ":" + idToDelete + deleteQueryAppendStr);
//    } catch (Exception e) {
//      log.error("Failed to execute deleteByQuery(String idToDelete, String deleteQueryAppendStr): " +
//          idToDelete + deleteQueryAppendStr + " due to: " + e);
//    }
//  }

  private void deleteByQuery(List<String> idsToDelete, String queryFieldName, String deleteQueryAppendStr)
      throws SolrServerException, IOException {
    for (String idToDelete : idsToDelete) {
      try {
        deleteByQuery(queryFieldName + ":" + idToDelete + deleteQueryAppendStr);
      } catch (Exception e) {
        log.error("Failed to execute deleteByQuery(List<String> idsToDelete, String deleteQueryAppendStr): " +
            idToDelete + deleteQueryAppendStr + " due to: " + e);
      }
    }
  }

  public void close() {
    log.info("shutting down pipeline client and solr proxy");
    try {
      pipelineClient.shutdown();
    } catch (Exception ignore) {
    }
    try {
      solrProxy.shutdown();
    } catch (Exception ignore) {
    }
  }
}
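
Finally, a minimal usage sketch. This is not part of the original class: it assumes the class above and its FusionPipelineClient dependency are on the classpath and that a Fusion installation is reachable; the URLs, credentials, indexer name, ids and field names are all placeholders.

import com.ngdata.hbaseindexer.indexer.FusionDocumentWriter;

import org.apache.solr.common.SolrInputDocument;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

public class FusionDocumentWriterExample {
  public static void main(String[] args) throws Exception {
    // Connection parameters read by the FusionDocumentWriter constructor (placeholder values).
    Map<String, String> connectionParams = new HashMap<String, String>();
    connectionParams.put("fusion.pipeline", "http://localhost:8764/api/apollo/index-pipelines/my-pipeline/collections/my-collection/index");
    connectionParams.put("fusion.solrproxy", "http://localhost:8764/api/apollo/solr/my-collection");
    connectionParams.put("fusion.user", "admin");
    connectionParams.put("fusion.pass", "password123");
    connectionParams.put("fusion.realm", "native");

    FusionDocumentWriter writer = new FusionDocumentWriter("my-indexer", connectionParams);
    try {
      // Index one document through the Fusion pipeline (shard number and field name are placeholders).
      SolrInputDocument doc = new SolrInputDocument();
      doc.addField("id", "row-123");
      doc.addField("title_s", "hello from the HBase indexer");
      writer.add(0, Collections.singletonMap("row-123", doc));

      // Delete it again; deleteById also removes any flattened child documents (see deleteById above).
      writer.deleteById(0, Collections.singletonList("row-123"));
    } finally {
      // Shut down the pipeline client and the Solr proxy.
      writer.close();
    }
  }
}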