// Java tutorial
package org.apache.solr.update;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// NOTE(review): the import list was damaged in this copy of the file. The
// java.io / java.net / commons-io / SolrIndexSearcher imports below were
// reconstructed from the names the class uses; two further import lines could
// not be recovered (presumably unused by the live code) -- confirm upstream.
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;

import monty.solr.jni.MontySolrVM;
import monty.solr.jni.PythonCall;
import monty.solr.jni.PythonMessage;

import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.handler.dataimport.DataImportHandler;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.WebUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This handler keeps the Solr index in sync with the Invenio database.
 * Basically, on every invocation it calls Invenio to retrieve the set
 * of added/updated/deleted document recids.
 *
 * Note from the author: I don't like my code at all, it should be simpler
 *
 * When we have these ids, we call the respective handlers and pass them
 * the recids. The handlers are typically {@link DataImportHandler}
 * instances, therefore the processing is sequential. While one import is
 * running, consecutive requests to this handler respond with
 * importStatus <b>busy</b>.
 *
 * <p>
 * Request parameters:
 * <p>
 * - <b>last_recid</b>: the recid of the reference record, it will be the
 *    orientation point to find all newer changed/added/deleted recs
 *
 *    If last_recid == -1, we start from the first document
 * <p>
 * - <b>generate</b>: boolean parameter which means empty lucene documents
 *    should be generated in the range <b>{last_recid, max_recid}</b>
 *
 * - <b>max_recid</b>: integer, marks the end of the interval, must be
 *    supplied when using generate
 *
 *    If <b>generate</b> is false, then we will try to retrieve recids
 *    from invenio and start the indexing/updates
 * <p>
 * - <b>inveniourl</b> : complete url to the Invenio search (we'll append query
 *    parameters, eg. inveniourl?p=recid:x->y)
 * <p>
 * - <b>updateurl</b> : complete url to the Solr update handler (this handler
 *    should fetch <b>updated</b> source documents and index them)
 * <p>
 * - <b>importurl</b> : complete url to the Solr update handler (this handler
 *    should fetch <b>new</b> source documents and index them)
 * <p>
 * - <b>deleteurl</b> : complete url to the Solr update handler (this handler
 *    should remove <b>deleted</b> documents from Solr index)
 *
 * <p>
 * Example configuration:
 * <pre>
 *  last_recid: 90
 *  inveniourl: http://invenio-server/search
 *  updateurl: http://localhost:8983/solr/update-dataimport?command=full-import&dirs=/proj/fulltext/extracted
 *  importurl: http://localhost:8983/solr/import-dataimport?command=full-import&arg1=val1&arg2=val2
 *  deleteurl: http://localhost:8983/solr/delete-dataimport?command=full-import
 *  maximport: 200
 *
 *  using modification date of the recid 90 we discover...
 *
 *  updated records: 53, 54, 55, 100
 *  added records: 101,103
 *  deleted records: 91,92,93,102
 *
 *  ...which results in 3 requests (newline breaks added for readability):
 *
 *  1. http://localhost:8983/solr/update-dataimport?command=full-import&dirs=/proj/fulltext/extracted
 *     &url=http://invenio-server/search?p=recid:53->55 OR recid:100&rg=200&of=xm
 *
 *  2. http://localhost:8983/solr/import-dataimport?command=full-import&arg1=val1&arg2=val2
 *     &url=http://invenio-server/search?p=recid:101 OR recid:103&rg=200&of=xm
 *
 *  3. http://localhost:8983/solr/delete-dataimport?command=full-import
 *     &url=http://invenio-server/search?p=recid:91-93 OR recid:102&rg=200&of=xm
 * </pre>
 *
 * NOTE: the url parameter <b>url</b> is url-encoded (it is here in plain form for readability)
 *
 * <p>
 * Also, if you want to try the update handler manually, you must encode the parameters, eg:
 *
 * <code>
 * http://localhost:8983/solr/invenio_updater?last_recid=100&index=true
 *   &inveniourl=http%3A%2F%2Finvenio-server%2Fsearch
 *   &importurl=http%3A%2F%2Flocalhost%3A8983%2Fsolr%2Fwaiting-dataimport%3Fcommand%3Dfull-import%26dirs%3D%2Fproj%2Fadsx%2Ffulltext%2Fextracted
 * </code>
 */
public class InvenioKeepRecidUpdated extends RequestHandlerBase implements PythonCall {

  public static final Logger log = LoggerFactory.getLogger(InvenioKeepRecidUpdated.class);

  /** Magic handler "url" meaning: do not call a handler, index/delete bare recids directly. */
  private static final String BLANK_RECORDS = "blankrecords";

  /** How many times one remote url is re-requested while it keeps answering "busy". */
  private static final int MAX_BUSY_RETRIES = 20;

  /** Pause between two requests to a remote url that answered "busy". */
  private static final long BUSY_RETRY_DELAY_MS = 200L;

  // Number of imports in flight. An AtomicInteger (rather than a volatile int
  // with ++/--) so that "is it free? then take it" is one atomic step.
  private final AtomicInteger busyCounter = new AtomicInteger(0);

  // Read by request threads, may be flipped by another thread through the setter.
  private volatile boolean asynchronous = true;
  private volatile String workerMessage = "";
  private volatile String tokenMessage = "";

  // Path of the properties file holding the last seen recid/mod_date; the same
  // string is also used as the key under which the Properties are stored in the
  // request context.
  static String IKRU_PROPERTIES = "";

  static final String LAST_RECID = "last_recid"; // name of the param from url and also what is passed to python
  static final String LAST_UPDATE = "mod_date"; // name of the param from url and also what is passed to python

  static final String ADDED = "ADDED"; // the data structure returned from python is keyed by these
  static final String UPDATED = "UPDATED";
  static final String DELETED = "DELETED";

  static final String PARAM_INVENIO = "inveniourl"; // url params that influence processing
  static final String PARAM_IMPORT = "importurl";
  static final String PARAM_UPDATE = "updateurl";
  static final String PARAM_DELETE = "deleteurl";
  static final String PARAM_MAXIMPORT = "maximport";
  static final String PARAM_BATCHSIZE = "batchsize";
  static final String PARAM_COMMIT = "commit";
  static final String PARAM_MAX_RECID = "max_recid";
  static final String PARAM_TOKEN = "idtoken";

  private String pythonFunctionName = "get_recids_changes";
  private int max_maximport = 20000;
  private int max_batchsize = 500000;

  /**
   * Reads the optional settings <code>max_maximport</code>, <code>max_batchsize</code>,
   * <code>pythonFunctionName</code> and <code>propertiesFile</code> from the
   * <code>defaults</code> section of the handler configuration.
   *
   * @param args handler configuration as passed by Solr
   */
  @Override
  @SuppressWarnings("rawtypes")
  public void init(NamedList args) {
    super.init(args);
    if (args == null) {
      return;
    }
    Object defaults = args.get("defaults");
    if (!(defaults instanceof NamedList)) {
      return;
    }
    NamedList defs = (NamedList) defaults;

    max_maximport = intSetting(defs, "max_maximport", max_maximport);
    max_batchsize = intSetting(defs, "max_batchsize", max_batchsize);

    Object functionName = defs.get("pythonFunctionName");
    if (functionName != null) {
      pythonFunctionName = functionName.toString();
    }
    Object propertiesFile = defs.get("propertiesFile");
    if (propertiesFile != null) {
      IKRU_PROPERTIES = propertiesFile.toString();
    }
  }

  /**
   * Returns the integer stored under <code>name</code>, or <code>fallback</code>
   * when absent. Accepts both string and numeric configuration values (the
   * original cast to String failed for numeric ones).
   */
  @SuppressWarnings("rawtypes")
  private static int intSetting(NamedList defs, String name, int fallback) {
    Object value = defs.get(name);
    return value == null ? fallback : Integer.parseInt(value.toString().trim());
  }

  /**
   * Entry point: discovers what changed and runs (or schedules) the import.
   * Responds with importStatus=busy, without doing anything, when another
   * import is still in progress.
   *
   * @param req the update request, see the class description for its parameters
   * @param rsp receives importStatus, workerMessage, QTime and the id token
   * @throws IOException when the properties file cannot be read/written or a
   *         handler url cannot be fetched
   * @throws InterruptedException when interrupted while waiting for a busy remote handler
   */
  @Override
  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
      throws IOException, InterruptedException {

    // Atomically claim the handler; a separate isBusy()/setBusy(true) pair would
    // let two concurrent requests both see "idle" and both start an import.
    if (!busyCounter.compareAndSet(0, 1)) {
      rsp.add("message", "Import is already running, please retry later...");
      rsp.add("importStatus", "busy");
      rsp.add("workerMessage", getWorkerMessage());
      rsp.add(PARAM_TOKEN, getToken());
      return;
    }

    long start = System.currentTimeMillis();
    boolean handedOffToWorker = false;
    try {
      SolrParams params = req.getParams();
      Properties prop = loadProperties(params);
      setToken(prop.getProperty(PARAM_TOKEN));

      Map<String, Object> dictData = retrieveRecids(prop, req, rsp);
      if (dictData == null) {
        return; // nothing changed; retrieveRecids already filled the response
      }

      req.getContext().put(IKRU_PROPERTIES, prop);

      if (isAsynchronous()) {
        runAsynchronously(dictData, req);
        handedOffToWorker = true; // from now on the worker thread releases the busy flag
      } else {
        runSynchronously(dictData, req);
      }
    } finally {
      // Whatever went wrong above (I/O error, bad parameter, ...), never leave
      // the handler marked busy unless a worker thread now owns the flag.
      if (!handedOffToWorker) {
        setBusy(false);
      }
    }

    long end = System.currentTimeMillis();

    rsp.add("importStatus", isBusy() ? "busy" : "idle");
    rsp.add("workerMessage", getWorkerMessage());
    rsp.add("QTime", end - start);

    setWorkerMessage("Last import finished in: " + (end - start));
    rsp.add(PARAM_TOKEN, getToken());
    setToken("");
  }

  private void setToken(String string) {
    tokenMessage = string;
  }

  private String getToken() {
    return tokenMessage;
  }

  /**
   * Discovers what was changed in the Invenio DB.
   *
   * @param prop state produced by {@link #loadProperties(SolrParams)}; may hold
   *        last_recid and/or mod_date
   * @param req the current request (its <code>generate</code>, <code>max_recid</code>
   *        and <code>batchsize</code> parameters are read)
   * @param rsp receives a message when there is nothing to do
   * @return a map with the keys <code>dictData</code> (ADDED/UPDATED/DELETED recid
   *         arrays), last_recid and mod_date; or <code>null</code> when Invenio
   *         reported no changes
   * @throws SolrException (BAD_REQUEST) when <code>generate</code> is requested
   *         without a usable last_recid/max_recid
   */
  @SuppressWarnings("unchecked")
  protected Map<String, Object> retrieveRecids(Properties prop, SolrQueryRequest req,
      SolrQueryResponse rsp) {

    Map<String, Object> retData = new HashMap<String, Object>();
    SolrParams params = req.getParams();

    Integer lastRecid = null;
    String lastUpdate = null;
    if (prop.containsKey(LAST_RECID)) {
      lastRecid = Integer.valueOf(prop.getProperty(LAST_RECID));
    }
    if (prop.containsKey(LAST_UPDATE)) {
      lastUpdate = prop.getProperty(LAST_UPDATE);
    }

    Map<String, int[]> dictData;

    // we'll generate empty records (good just to have a mapping between invenio
    // and lucene docids; necessary for search operations)
    if (params.getBool("generate", false)) {
      if (lastRecid == null) {
        // previously an unboxing NullPointerException
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
            "The last_recid parameter is required when generate=true");
      }
      int maxRecid = params.getInt(PARAM_MAX_RECID, 0);
      if (maxRecid == 0 || maxRecid < lastRecid) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
            "The max_recid parameter missing!");
      }

      // recids lastRecid+1 .. maxRecid (inclusive)
      int[] generated = new int[maxRecid - lastRecid];
      for (int i = 0; i < generated.length; i++) {
        generated[i] = lastRecid + 1 + i;
      }
      dictData = new HashMap<String, int[]>();
      dictData.put(ADDED, generated);
      retData.put(LAST_UPDATE, null);
      retData.put(LAST_RECID, maxRecid);
    } else {
      // get recids from Invenio {'ADDED': int, 'UPDATED': int, 'DELETED': int }
      Integer maxRecords = effectiveBatchSize(prop, params);

      PythonMessage message = MontySolrVM.INSTANCE.createMessage(pythonFunctionName)
          .setSender(this.getClass().getSimpleName())
          .setParam("max_records", maxRecords)
          .setParam("request", req)
          .setParam("response", rsp);

      if (lastRecid != null) {
        message.setParam(LAST_RECID, lastRecid);
      }
      if (lastUpdate != null) {
        message.setParam(LAST_UPDATE, lastUpdate);
      }
      if (lastRecid == null && lastUpdate == null) {
        message.setParam(LAST_UPDATE, getLastIndexUpdate(req));
      }

      log.info("Retrieving changed recs: max_records=" + maxRecords
          + " last_recid=" + lastRecid + " last_update=" + lastUpdate);

      MontySolrVM.INSTANCE.sendMessage(message);

      Object results = message.getResults();
      if (results == null) {
        rsp.add("message", "No new/updated/deleted records inside Invenio.");
        rsp.add("importStatus", "idle");
        return null;
      }
      dictData = (Map<String, int[]>) results;
      retData.put(LAST_UPDATE, (String) message.getParam(LAST_UPDATE));
      retData.put(LAST_RECID, (Integer) message.getParam(LAST_RECID));

      log.info("Retrieved: last_update=" + retData.get(LAST_UPDATE)
          + " last_recid=" + retData.get(LAST_RECID));
    }
    retData.put("dictData", dictData);
    return retData;
  }

  /**
   * Batch size to request from Invenio, never above <code>max_batchsize</code>.
   * Prefers the value already capped by {@link #loadProperties(SolrParams)};
   * otherwise caps the raw request parameter. May be <code>null</code> when the
   * request carries no batchsize.
   */
  private Integer effectiveBatchSize(Properties prop, SolrParams params) {
    Object capped = prop.get(PARAM_BATCHSIZE);
    if (capped instanceof Integer) {
      return (Integer) capped;
    }
    Integer requested = params.getInt(PARAM_BATCHSIZE);
    if (requested != null && requested > max_batchsize) {
      return max_batchsize;
    }
    return requested;
  }

  /**
   * Number of recids per generated query, never above <code>max_maximport</code>.
   */
  private Integer effectiveMaxImport(SolrParams params) {
    return Math.min(params.getInt(PARAM_MAXIMPORT, max_maximport), max_maximport);
  }

  /**
   * Returns the mandatory <code>inveniourl</code> parameter.
   *
   * @throws SolrException (BAD_REQUEST) when it is missing; previously this
   *         surfaced as a NullPointerException while building the urls
   */
  private static String requireInvenioUrl(SolrParams params) {
    String inveniourl = params.get(PARAM_INVENIO, null);
    if (inveniourl == null) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "The " + PARAM_INVENIO + " parameter missing!");
    }
    return inveniourl;
  }

  /**
   * Formats the time the current searcher was opened; used as the reference
   * modification date when neither last_recid nor mod_date is known.
   */
  private String getLastIndexUpdate(SolrQueryRequest req) {
    SolrIndexSearcher searcher = req.getSearcher();
    // Invenio uses mod_date.strftime(format="%Y-%m-%d %H:%M:%S") -> '2013-11-29 16:40:33'
    // NOTE(review): the pattern below additionally emits a fraction (".S") -- confirm
    // that the python side accepts it.
    SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S");
    Date date = new Date(searcher.getOpenTime());
    return df.format(date);
  }

  public void setPythonFunctionName(String name) {
    pythonFunctionName = name;
  }

  public String getPythonFunctionName() {
    return pythonFunctionName;
  }

  /**
   * Runs {@link #runSynchronously(Map, SolrQueryRequest)} on a new thread using a
   * private copy of the request. The worker releases the busy flag when done.
   */
  private void runAsynchronously(Map<String, Object> dictData, SolrQueryRequest req) {

    final Map<String, Object> dataToProcess = dictData;
    final SolrQueryRequest localReq = new LocalSolrQueryRequest(req.getCore(), req.getParams());
    localReq.getContext().put(IKRU_PROPERTIES, req.getContext().get(IKRU_PROPERTIES));

    Thread worker = new Thread(new Runnable() {
      public void run() {
        try {
          runSynchronously(dataToProcess, localReq);
        } catch (IOException e) {
          log.error("Import failed: " + e.getLocalizedMessage(), e);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt(); // keep the interrupt visible to the thread owner
          log.error("Import interrupted: " + e.getLocalizedMessage(), e);
        } catch (RuntimeException e) {
          log.error("Import failed: " + e.getLocalizedMessage(), e);
        } finally {
          setBusy(false);
          localReq.close();
        }
      }
    });

    boolean started = false;
    try {
      worker.start();
      started = true;
    } finally {
      if (!started) {
        localReq.close(); // the worker will never run its own cleanup
      }
    }
  }

  public void setAsynchronous(boolean val) {
    asynchronous = val;
  }

  public boolean isAsynchronous() {
    return asynchronous;
  }

  private void setBusy(boolean b) {
    if (b) {
      busyCounter.incrementAndGet();
    } else {
      busyCounter.decrementAndGet();
    }
  }

  /**
   * @return true while an import is in progress
   * @throws IllegalStateException when the busy flag was released more often than acquired
   */
  public boolean isBusy() {
    int inFlight = busyCounter.get();
    if (inFlight < 0) {
      throw new IllegalStateException("Huh, 2+2 is not 4?! Should never happen.");
    }
    return inFlight > 0;
  }

  public void setWorkerMessage(String msg) {
    // NOTE(review): the statement preceding the assignment was lost in this copy
    // of the file; presumably it logged the message -- confirm.
    log.info(msg);
    workerMessage = msg;
  }

  public String getWorkerMessage() {
    return workerMessage;
  }

  /**
   * The main method/logic: processes added, updated and deleted recids (in that
   * order), persists the new last_recid/mod_date and optionally commits.
   */
  private void runSynchronously(Map<String, Object> data, SolrQueryRequest req)
      throws MalformedURLException, IOException, InterruptedException {

    log.info("=============================================================================");
    // NOTE(review): three statements between these two banner lines were lost in
    // this copy of the file (presumably log.info calls describing the request);
    // restore them from the upstream source.
    log.info("=============================================================================");

    SolrParams params = req.getParams();
    SolrCore core = req.getCore();

    String importurl = params.get(PARAM_IMPORT, null);
    String updateurl = params.get(PARAM_UPDATE, null);
    String deleteurl = params.get(PARAM_DELETE, null);

    @SuppressWarnings("unchecked")
    Map<String, int[]> dictData = (Map<String, int[]>) data.get("dictData");
    Properties prop = (Properties) req.getContext().get(IKRU_PROPERTIES);

    int[] added = dictData.get(ADDED);
    if (added != null && added.length > 0) {
      setWorkerMessage("Phase 1/3. Adding records: " + added.length);
      if (importurl != null) {
        if (importurl.equals(BLANK_RECORDS)) {
          runProcessingAdded(added, req);
        } else {
          runProcessing(core, importurl, added, req);
        }
      }
    }

    int[] updated = dictData.get(UPDATED);
    if (updated != null && updated.length > 0) {
      setWorkerMessage("Phase 2/3. Updating records: " + updated.length);
      if (updateurl != null) {
        if (updateurl.equals(BLANK_RECORDS)) {
          runProcessingUpdated(updated, req);
        } else {
          runProcessing(core, updateurl, updated, req);
        }
      }
    }

    int[] deleted = dictData.get(DELETED);
    if (deleted != null && deleted.length > 0) {
      setWorkerMessage("Phase 3/3. deleting records: " + deleted.length);
      if (deleteurl != null) {
        if (deleteurl.equals(BLANK_RECORDS)) {
          runProcessingDeleted(deleted, req);
        } else {
          runProcessing(core, deleteurl, deleted, req);
        }
      }
    }

    // Save the state into the properties. Properties is a Hashtable and rejects
    // null values (generate mode has no mod_date), so unknown values are removed
    // instead of stored.
    putOrRemove(prop, LAST_UPDATE, data.get(LAST_UPDATE));
    putOrRemove(prop, LAST_RECID, data.get(LAST_RECID));
    prop.remove(PARAM_BATCHSIZE);
    prop.remove(PARAM_MAXIMPORT);
    prop.remove(PARAM_TOKEN);
    saveProperties(prop);

    if (params.getBool(PARAM_COMMIT, false)) {
      setWorkerMessage("Phase 3/3. Writing index...");
      CommitUpdateCommand updateCmd = new CommitUpdateCommand(req, false);
      req.getCore().getUpdateHandler().commit(updateCmd);
    }
  }

  /** Stores <code>value</code> as a string, or removes the key when the value is null. */
  private static void putOrRemove(Properties prop, String key, Object value) {
    if (value == null) {
      prop.remove(key);
    } else {
      prop.put(key, String.valueOf(value));
    }
  }

  /**
   * Sends the recids to the handler identified by <code>handlerUrl</code>. A url
   * without a host that names a handler registered in this core is executed
   * in-process; anything else is fetched over HTTP.
   */
  private void runProcessing(SolrCore core, String handlerUrl, int[] recids, SolrQueryRequest req)
      throws MalformedURLException, IOException, InterruptedException {

    try {
      URI u = new URI(handlerUrl);
      String host = u.getHost();
      if (host == null || host.isEmpty()) {
        SolrRequestHandler handler = findLocalHandler(core, u.getPath());
        if (handler != null) {
          Map<String, List<String>> handlerParams = WebUtils.parseQuery(u.getQuery());
          HashMap<String, String[]> hParams = new HashMap<String, String[]>();
          for (Map.Entry<String, List<String>> entry : handlerParams.entrySet()) {
            List<String> values = entry.getValue();
            hParams.put(entry.getKey(), values.toArray(new String[values.size()]));
          }
          runProcessingInternally(handler, recids, req, hParams);
          return;
        }
      }
    } catch (URISyntaxException e) {
      log.warn("Cannot parse the handler url, it will be fetched as is: " + handlerUrl, e);
    }

    runProcessingUpload(handlerUrl, recids, req);
  }

  /** Looks the handler up by path, with and without a leading slash; null when unknown. */
  private static SolrRequestHandler findLocalHandler(SolrCore core, String path) {
    if (path == null) {
      return null;
    }
    SolrRequestHandler handler = core.getRequestHandler(path);
    if (handler == null && !path.startsWith("/")) {
      handler = core.getRequestHandler("/" + path);
    }
    return handler;
  }

  public File getPropertyFile() {
    return new File(IKRU_PROPERTIES);
  }

  /**
   * Builds the state for one run from the properties file and the request.
   * last_recid/mod_date given in the url win over the stored ones; without url
   * values the stored mod_date is preferred over the stored last_recid.
   * batchsize and maximport are capped by the configured maxima.
   */
  private Properties loadProperties(SolrParams params) throws FileNotFoundException, IOException {
    Properties prop = new Properties();

    File f = getPropertyFile();
    if (f.exists()) {
      FileInputStream input = new FileInputStream(f);
      try {
        prop.load(input);
      } finally {
        input.close();
      }
    }

    String prop_recid = null;
    if (prop.containsKey(LAST_RECID)) {
      prop_recid = (String) prop.remove(LAST_RECID);
    }

    String prop_mod_date = null;
    if (prop.containsKey(LAST_UPDATE)) {
      prop_mod_date = (String) prop.remove(LAST_UPDATE);
    }

    boolean userParam = false;

    // parameters in url have always precedence (if both set
    // it is up to the python to figure out who has precedence)
    if (params.getInt(LAST_RECID) != null) {
      prop.put(LAST_RECID, params.get(LAST_RECID));
      userParam = true;
    }
    if (params.get(LAST_UPDATE, null) != null) {
      prop.put(LAST_UPDATE, params.get(LAST_UPDATE));
      userParam = true;
    }

    if (!userParam) {
      // when no user params were supplied, prefer the mod_date over recid
      if (prop_mod_date != null) {
        prop.put(LAST_UPDATE, prop_mod_date);
      } else if (prop_recid != null) {
        prop.put(LAST_RECID, prop_recid);
      }
    }

    // The two limits are stored as Integer objects (kept for compatibility with
    // readers of the request context); they are removed again before the
    // properties are written to disk.
    if (params.get(PARAM_BATCHSIZE, null) != null) {
      int bs = params.getInt(PARAM_BATCHSIZE);
      prop.put(PARAM_BATCHSIZE, Math.min(bs, max_batchsize));
    }

    if (params.get(PARAM_MAXIMPORT, null) != null) {
      int mi = params.getInt(PARAM_MAXIMPORT);
      prop.put(PARAM_MAXIMPORT, Math.min(mi, max_maximport));
    }

    prop.put(PARAM_TOKEN, params.get(PARAM_TOKEN, ""));

    return prop;
  }

  /** Writes the properties to {@link #getPropertyFile()}. */
  private void saveProperties(Properties prop) throws IOException {
    File f = getPropertyFile();
    FileOutputStream out = new FileOutputStream(f);
    try {
      prop.store(out, null);
    } finally {
      out.close();
    }
  }

  /**
   * When method=blankrecords: creates empty documents holding only the unique key.
   */
  protected void runProcessingAdded(int[] recids, SolrQueryRequest req) throws IOException {
    indexBlankDocuments(recids, req);
  }

  /**
   * When method=blankrecords: re-adds empty documents holding only the unique key
   * (same operation as {@link #runProcessingAdded(int[], SolrQueryRequest)}).
   */
  protected void runProcessingUpdated(int[] recids, SolrQueryRequest req) throws IOException {
    indexBlankDocuments(recids, req);
  }

  /** Adds one document per recid, each with the unique key field as its only field. */
  private static void indexBlankDocuments(int[] recids, SolrQueryRequest req) throws IOException {
    IndexSchema schema = req.getSchema();
    UpdateHandler updateHandler = req.getCore().getUpdateHandler();
    String uniqField = schema.getUniqueKeyField().getName();

    AddUpdateCommand addCmd = new AddUpdateCommand(req);
    for (int recid : recids) {
      addCmd.clear();
      addCmd.solrDoc = new SolrInputDocument();
      addCmd.solrDoc.addField(uniqField, recid);
      updateHandler.addDoc(addCmd);
    }
  }

  /**
   * When method=blankrecords: deletes the documents whose id equals one of the recids.
   */
  protected void runProcessingDeleted(int[] recids, SolrQueryRequest req) throws IOException {
    UpdateHandler updateHandler = req.getCore().getUpdateHandler();

    DeleteUpdateCommand delCmd = new DeleteUpdateCommand(req);
    for (int recid : recids) {
      delCmd.clear();
      // NOTE(review): the assignment target was lost in this copy of the file;
      // "delCmd.id" is the presumed original -- confirm.
      delCmd.id = Integer.toString(recid);
      updateHandler.delete(delCmd);
    }
  }

  /**
   * Internally calling the dataimport handler: executes <code>handler</code> once
   * per generated recid query, passing the Invenio url as the <code>url</code> parameter.
   *
   * @param hParams parameters parsed from the handler url; <code>url</code> is overwritten
   */
  protected void runProcessingInternally(SolrRequestHandler handler, int[] recids,
      SolrQueryRequest req, HashMap<String, String[]> hParams)
      throws IOException, InterruptedException {

    SolrParams params = req.getParams();
    Integer maximport = effectiveMaxImport(params);
    String inveniourl = requireInvenioUrl(params);

    List<String> queryParts = getQueryIds(maximport, recids);

    int started = 0;
    for (String queryPart : queryParts) {
      started++;
      hParams.put("url", new String[] { getInternalURL(inveniourl, queryPart, maximport) });

      LocalSolrQueryRequest localReq = new LocalSolrQueryRequest(req.getCore(), hParams);
      SolrQueryResponse rsp = new SolrQueryResponse();
      try {
        req.getCore().execute(handler, localReq, rsp);
      } finally {
        localReq.close();
      }

      if (queryParts.size() > 1) {
        log.warn("Warning, we have started the importer, but it runs in parallel!");
        log.warn("And we will initiate another: " + (queryParts.size() - started));
      }
    }
  }

  /**
   * Builds one url per generated recid query and fetches them one after another.
   */
  protected void runProcessingUpload(String handlerUrl, int[] recids, SolrQueryRequest req)
      throws MalformedURLException, IOException, InterruptedException {

    SolrParams params = req.getParams();
    Integer maximport = effectiveMaxImport(params);
    String inveniourl = requireInvenioUrl(params);

    List<String> urlsToFetch = new ArrayList<String>();
    for (String queryPart : getQueryIds(maximport, recids)) {
      urlsToFetch.add(getFetchURL(handlerUrl, inveniourl, queryPart, maximport));
    }

    runUpload(urlsToFetch);
  }

  /**
   * Calling external urls: fetches every url, in order, removing each from the
   * list as it is taken. While the response of a url contains "busy" the same url
   * is requested again after a short pause.
   *
   * @param urlsToFetch urls to request; emptied by this method
   * @throws SolrException (SERVICE_UNAVAILABLE) when one url keeps answering busy
   */
  protected void runUpload(List<String> urlsToFetch)
      throws MalformedURLException, IOException, InterruptedException {
    while (!urlsToFetch.isEmpty()) {
      String url = urlsToFetch.remove(0);

      // The original never re-read the response inside the wait loop (it spun
      // forever on the first "busy") and counted urls instead of retries.
      int retries = 0;
      while (fetch(url).contains("busy")) {
        retries++;
        if (retries > MAX_BUSY_RETRIES) {
          throw new SolrException(SolrException.ErrorCode.SERVICE_UNAVAILABLE,
              "The remote url is constantly busy: " + url);
        }
        Thread.sleep(BUSY_RETRY_DELAY_MS);
      }
    }
  }

  /** Downloads the body of <code>url</code> as a string and closes the connection stream. */
  private static String fetch(String url) throws IOException {
    InputStream in = new URL(url).openStream();
    try {
      return IOUtils.toString(in);
    } finally {
      IOUtils.closeQuietly(in);
    }
  }

  /**
   * Builds the url of a remote import handler; the complete Invenio search url
   * (itself carrying the url-encoded query) is passed url-encoded in the
   * <code>url</code> parameter.
   */
  protected String getFetchURL(String importurl, String inveniourl, String queryPart,
      Integer maximport) throws UnsupportedEncodingException {
    String sign = importurl.contains("?") ? "&" : "?";
    String sign2 = inveniourl.contains("?") ? "&" : "?";
    String invenioQuery = inveniourl + sign2 + "p=" + URLEncoder.encode(queryPart, "UTF-8")
        + "&rg=" + maximport + "&of=xm";
    return importurl + sign + "url=" + URLEncoder.encode(invenioQuery, "UTF-8");
  }

  /**
   * Builds the Invenio search url for one recid query:
   * <code>sourceUrl?p=&lt;encoded query&gt;&amp;rg=&lt;maximport&gt;&amp;of=xm</code>.
   */
  public static String getInternalURL(String sourceUrl, String queryPart, Integer maximport)
      throws UnsupportedEncodingException {
    String sign = sourceUrl.contains("?") ? "&" : "?";
    return sourceUrl + sign + "p=" + URLEncoder.encode(queryPart, "UTF-8")
        + "&rg=" + maximport + "&of=xm";
  }

  /**
   * Splits an array of ints into queries such as
   * "recid:4->15 OR recid:78 OR recid:80->82". Consecutive recids are collapsed
   * into ranges and a new query is started once <code>maxspan</code> recids were
   * put into the current one.
   *
   * @param maxspan maximum number of recids covered by one query
   * @param recids the ids, in any order; the array is not modified
   * @return the queries, covering the recids in ascending order
   */
  public static List<String> getQueryIds(int maxspan, int[] recids) {
    // Work on a copy: the original sorted the caller's array as a side effect.
    int[] sorted = recids.clone();
    Arrays.sort(sorted);

    List<String> ret = new ArrayList<String>();

    int i = 0;
    while (i < sorted.length) {
      int delta = 1;
      // An explicit flag instead of "last id > 0", so that a range ending in a
      // non-positive id is not dropped.
      boolean rangeOpen = false;
      int rangeEnd = 0;

      StringBuilder query = new StringBuilder();
      query.append("recid:").append(sorted[i]);

      for (i++; i < sorted.length; i++) {
        if (delta >= maxspan) {
          break;
        }
        if (sorted[i] - 1 == sorted[i - 1]) {
          rangeEnd = sorted[i];
          rangeOpen = true;
          delta += 1;
          continue;
        }
        if (rangeOpen) {
          query.append("->").append(rangeEnd);
          rangeOpen = false;
        }
        query.append(" OR recid:").append(sorted[i]);
        delta += 1;
      }

      if (rangeOpen) {
        query.append("->").append(rangeEnd);
      }

      ret.add(query.toString());
    }
    return ret;
  }

  // ////////////////////// SolrInfoMBeans methods //////////////////////

  public String getVersion() {
    return "";
  }

  public String getDescription() {
    return "Updates the Invenio recid with the missing/new docs (if any)";
  }

  public String getSourceId() {
    return "";
  }

  public String getSource() {
    return "";
  }
}