Java tutorial: CustomHadoopTaskLauncher (Infinit.e custom Hadoop job launcher, package com.ikanow.infinit.e.processing.custom.launcher)
/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.processing.custom.launcher;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.io.Writer;
import java.lang.reflect.Method;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.log4j.AppenderSkeleton;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.spi.LoggingEvent;
import org.bson.types.ObjectId;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import com.ikanow.infinit.e.data_model.custom.ICustomInfiniteInternalEngine;
import com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat;
import com.ikanow.infinit.e.data_model.custom.InfiniteMongoSplitter;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbUtil;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.processing.custom.output.CustomOutputManager;
import com.ikanow.infinit.e.processing.custom.utils.AuthUtils;
import com.ikanow.infinit.e.processing.custom.utils.HadoopUtils;
import com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchHadoopUtils;
import com.ikanow.infinit.e.processing.custom.utils.InfiniteHadoopUtils;
import com.ikanow.infinit.e.processing.custom.utils.PropertiesManager;

import com.mongodb.BasicDBObject;

public class CustomHadoopTaskLauncher extends AppenderSkeleton {

    private static Logger _logger = Logger.getLogger(CustomHadoopTaskLauncher.class);
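    // Launcher state (descriptive comment added for readability): general data_model properties vs. the
    // custom-processing properties, plus local/test mode flags - test mode is switched on whenever a
    // debug record limit is supplied to the constructor.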
    private com.ikanow.infinit.e.data_model.utils.PropertiesManager prop_general =
            new com.ikanow.infinit.e.data_model.utils.PropertiesManager();
    private PropertiesManager props_custom = null;
    private boolean bLocalMode;
    private Integer nDebugLimit;
    private boolean bTestMode = false;

    public CustomHadoopTaskLauncher(boolean bLocalMode_, Integer nDebugLimit_, PropertiesManager prop_custom_) {
        bLocalMode = bLocalMode_;
        nDebugLimit = nDebugLimit_;
        props_custom = prop_custom_;
        if (null != nDebugLimit) {
            bTestMode = true;
        }
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    public String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation)
            throws IOException, SAXException, ParserConfigurationException {
        StringWriter xml = new StringWriter();
        String outputCollection = job.outputCollectionTemp; // (non-append mode)
        if ((null != job.appendResults) && job.appendResults)
            outputCollection = job.outputCollection; // (append mode, write directly in....)
        else if (null != job.incrementalMode)
            job.incrementalMode = false; // (not allowed to be in incremental mode and not update mode)

        createConfigXML(xml, job.jobtitle, job.inputCollection,
                InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS),
                job.isCustomTable, job.getOutputDatabase(), job._id.toString(), outputCollection, job.mapper,
                job.reducer, job.combiner,
                InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY),
                job.communityIds, job.outputKey, job.outputValue, job.arguments, job.incrementalMode,
                job.submitterID, job.selfMerge, job.outputCollection, job.appendResults);

        ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader();
        URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
                savedClassLoader);
        Thread.currentThread().setContextClassLoader(child);

        // Check version: for now, any infinit.e.data_model with an VersionTest class is acceptable
        boolean dataModelLoaded = true;
        try {
            URLClassLoader versionTest = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
                    null);
            try {
                Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, versionTest);
            } catch (ClassNotFoundException e2) {
                //(this is fine, will use the cached version)
                dataModelLoaded = false;
            }
            if (dataModelLoaded)
                Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoVersionTest", true, versionTest);
        } catch (ClassNotFoundException e1) {
            throw new RuntimeException(
                    "This JAR is compiled with too old a version of the data-model, please recompile with Jan 2014 (rc2) onwards");
        }

        // Now load the XML into a configuration object:
        Configuration config = new Configuration();
        // Add the client configuration overrides:
        if (!bLocalMode) {
            String hadoopConfigPath = props_custom.getHadoopConfigPath() + "/hadoop/";
            config.addResource(new Path(hadoopConfigPath + "core-site.xml"));
            config.addResource(new Path(hadoopConfigPath + "mapred-site.xml"));
            config.addResource(new Path(hadoopConfigPath + "hadoop-site.xml"));
        } //TESTED
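        // (Added note) Copy every <property> element from the generated XML into the Hadoop Configuration -
        // the XML uses the same <name>/<value> layout as a standard Hadoop *-site.xml file.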
        try {
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
            Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes()));
            NodeList nList = doc.getElementsByTagName("property");
            for (int temp = 0; temp < nList.getLength(); temp++) {
                Node nNode = nList.item(temp);
                if (nNode.getNodeType() == Node.ELEMENT_NODE) {
                    Element eElement = (Element) nNode;
                    String name = getTagValue("name", eElement);
                    String value = getTagValue("value", eElement);
                    if ((null != name) && (null != value)) {
                        config.set(name, value);
                    }
                }
            }
        } catch (Exception e) {
            throw new IOException(e.getMessage());
        }

        // Some other config defaults:
        // (not sure if these are actually applied, or derived from the defaults - for some reason they don't appear in CDH's client config)
        config.set("mapred.map.tasks.speculative.execution", "false");
        config.set("mapred.reduce.tasks.speculative.execution", "false");
        // (default security is ignored here, have it set via HADOOP_TASKTRACKER_CONF in cloudera)

        // Now run the JAR file
        try {
            BasicDBObject advancedConfigurationDbo = null;
            try {
                advancedConfigurationDbo = (null != job.query)
                        ? ((BasicDBObject) com.mongodb.util.JSON.parse(job.query))
                        : (new BasicDBObject());
            } catch (Exception e) {
                advancedConfigurationDbo = new BasicDBObject();
            }
            boolean esMode = advancedConfigurationDbo.containsField("qt") && !job.isCustomTable;
            if (esMode && !job.inputCollection.equals("doc_metadata.metadata")) {
                throw new RuntimeException(
                        "Infinit.e Queries are only supported on doc_metadata - use MongoDB queries instead.");
            }

            config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)

            if (bLocalMode) { // local job tracker and FS mode
                config.set("mapred.job.tracker", "local");
                config.set("fs.default.name", "local");
            }
            else {
                if (bTestMode) { // run job tracker locally but FS mode remotely
                    config.set("mapred.job.tracker", "local");
                }
                else { // normal job tracker
                    String trackerUrl = HadoopUtils.getXMLProperty(
                            props_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker");
                    config.set("mapred.job.tracker", trackerUrl);
                }
                String fsUrl = HadoopUtils.getXMLProperty(
                        props_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name");
                config.set("fs.default.name", fsUrl);
            }

            if (!dataModelLoaded && !(bTestMode || bLocalMode)) {
                // If running distributed and no data model loaded then add ourselves
                Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
                        "infinit.e.data_model.jar", config);
                DistributedCache.addFileToClassPath(jarToCache, config);
                jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
                        "infinit.e.processing.custom.library.jar", config);
                DistributedCache.addFileToClassPath(jarToCache, config);
            } //TESTED

            // Debug scripts (only if they exist), and only in non local/test mode
            if (!bLocalMode && !bTestMode) {
                try {
                    Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
                            "custom_map_error_handler.sh", config);
                    config.set("mapred.map.task.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
                    config.set("mapreduce.map.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
                    DistributedCache.createSymlink(config);
                    DistributedCache.addCacheFile(scriptToCache.toUri(), config);
                } catch (Exception e) {
                } // just carry on

                try {
                    Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
                            "custom_reduce_error_handler.sh", config);
                    config.set("mapred.reduce.task.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
                    config.set("mapreduce.reduce.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
                    DistributedCache.createSymlink(config);
                    DistributedCache.addCacheFile(scriptToCache.toUri(), config);
                } catch (Exception e) {
                } // just carry on
            } //TODO (???): TOTEST
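            // (Added note) The "2 things" referred to below are: (1) the pre-task hook for "internal engine"
            // mappers, and (2) extra classpath/config entries for the special input collections.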
            // (need to do these 2 things here before the job is created, at which point the config class has been copied across)
            //1)
            Class<?> mapperClazz = Class.forName(job.mapper, true, child);
            if (ICustomInfiniteInternalEngine.class.isAssignableFrom(mapperClazz)) {
                // Special case: internal custom engine, so gets an additional integration hook
                ICustomInfiniteInternalEngine preActivities = (ICustomInfiniteInternalEngine) mapperClazz
                        .newInstance();
                preActivities.preTaskActivities(job._id, job.communityIds, config, !(bTestMode || bLocalMode));
            } //TESTED

            //2)
            if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {
                // Need to download the GridFSZip file
                try {
                    Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/unbundled/",
                            "GridFSZipFile.jar", config);
                    DistributedCache.addFileToClassPath(jarToCache, config);
                } catch (Throwable t) {
                } // (this is fine, will already be on the classpath .. otherwise lots of other stuff will be failing all over the place!)
            }

            if (job.inputCollection.equals("records")) {
                InfiniteElasticsearchHadoopUtils.handleElasticsearchInput(job, config, advancedConfigurationDbo);
                //(won't run under 0.19 so running with "records" should cause all sorts of exceptions)
            } //TESTED (by hand)

            if (bTestMode || bLocalMode) {
                // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place
                config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
            }

            // Manually specified caches
            List<URL> localJarCaches = InfiniteHadoopUtils.handleCacheList(advancedConfigurationDbo.get("$caches"),
                    job, config, props_custom);

            Job hj = new Job(config); // (NOTE: from here, changes to config are ignored)
            try {
                if (null != localJarCaches) {
                    if (bLocalMode || bTestMode) {
                        Method method = URLClassLoader.class.getDeclaredMethod("addURL", new Class[] { URL.class });
                        method.setAccessible(true);
                        method.invoke(child, localJarCaches.toArray());
                    } //TOTEST (tested logically)
                }

                Class<?> classToLoad = Class.forName(job.mapper, true, child);
                hj.setJarByClass(classToLoad);

                if (job.inputCollection.equalsIgnoreCase("filesystem")) {
                    String inputPath = null;
                    try {
                        inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
                        if (!inputPath.endsWith("/")) {
                            inputPath = inputPath + "/";
                        }
                    } catch (Exception e) {
                    }
                    if (null == inputPath) {
                        throw new RuntimeException("Must specify 'file.url' if reading from filesystem.");
                    }
                    inputPath = InfiniteHadoopUtils.authenticateInputDirectory(job, inputPath);

                    InfiniteFileInputFormat.addInputPath(hj, new Path(inputPath + "*/*")); // (that extra bit makes it recursive)
                    InfiniteFileInputFormat.setMaxInputSplitSize(hj, 33554432); // (32MB)
                    InfiniteFileInputFormat.setInfiniteInputPathFilter(hj, config);
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                            "com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat", true, child));
                }
                else if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {
                    String[] oidStrs = null;
                    try {
                        String inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
                        Pattern oidExtractor = Pattern.compile("inf://share/([^/]+)");
                        Matcher m = oidExtractor.matcher(inputPath);
                        if (m.find()) {
                            oidStrs = m.group(1).split("\\s*,\\s*");
                        }
                        else {
                            throw new RuntimeException(
                                    "file.url must be in format inf://share/<oid-list>/<string>: " + inputPath);
                        }
                        InfiniteHadoopUtils.authenticateShareList(job, oidStrs);
                    } catch (Exception e) {
                        throw new RuntimeException(
                                "Authentication error: " + e.getMessage() + ": " + advancedConfigurationDbo, e);
                    }

                    hj.getConfiguration().setStrings("mapred.input.dir", oidStrs);
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                            "com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat", true, child));
                }
                else if (job.inputCollection.equals("records")) {
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class
                            .forName("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat", true, child));
                }
                else {
                    if (esMode) {
                        hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                                "com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat",
                                true, child));
                    }
                    else {
                        hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                                "com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child));
                    }
                }

                if ((null != job.exportToHdfs) && job.exportToHdfs) {
                    //TODO (INF-2469): Also, if the output key is BSON then also run as text (but output as JSON?)
                    Path outPath = InfiniteHadoopUtils.ensureOutputDirectory(job, props_custom);

                    if ((null != job.outputKey) && (null != job.outputValue)
                            && job.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text")
                            && job.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) {
                        // (slight hack before I sort out the horrendous job class - if key/val both text and exporting to HDFS then output as Text)
                        hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
                                .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", true, child));
                        TextOutputFormat.setOutputPath(hj, outPath);
                    } //TESTED
                    else {
                        hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
                                "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child));
                        SequenceFileOutputFormat.setOutputPath(hj, outPath);
                    } //TESTED
                }
                else { // normal case, stays in MongoDB
                    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
                            "com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat", true, child));
                }

                hj.setMapperClass((Class<? extends Mapper>) mapperClazz);
                String mapperOutputKeyOverride = advancedConfigurationDbo.getString("$mapper_key_class", null);
                if (null != mapperOutputKeyOverride) {
                    hj.setMapOutputKeyClass(Class.forName(mapperOutputKeyOverride));
                } //TESTED

                String mapperOutputValueOverride = advancedConfigurationDbo.getString("$mapper_value_class", null);
                if (null != mapperOutputValueOverride) {
                    hj.setMapOutputValueClass(Class.forName(mapperOutputValueOverride));
                } //TESTED

                if ((null != job.reducer) && !job.reducer.startsWith("#") && !job.reducer.equalsIgnoreCase("null")
                        && !job.reducer.equalsIgnoreCase("none")) {
                    hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child));
                    // Variable reducers:
                    if (null != job.query) {
                        try {
                            hj.setNumReduceTasks(advancedConfigurationDbo.getInt("$reducers", 1));
                        } catch (Exception e) {
                            try {
                                // (just check it's not a string that is a valid int)
                                hj.setNumReduceTasks(
                                        Integer.parseInt(advancedConfigurationDbo.getString("$reducers", "1")));
                            } catch (Exception e2) {
                            }
                        }
                    } //TESTED
                }
                else {
                    hj.setNumReduceTasks(0);
                }

                if ((null != job.combiner) && !job.combiner.startsWith("#") && !job.combiner.equalsIgnoreCase("null")
                        && !job.combiner.equalsIgnoreCase("none")) {
                    hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child));
                }
                hj.setOutputKeyClass(Class.forName(job.outputKey, true, child));
                hj.setOutputValueClass(Class.forName(job.outputValue, true, child));

                hj.setJobName(job.jobtitle);
                currJobName = job.jobtitle;
            } catch (Error e) { // (messing about with class loaders = lots of chances for errors!)
                throw new RuntimeException(e.getMessage(), e);
            }
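            // (Added note) In local/test mode the launcher registers itself as a log4j appender (see append()
            // below) so errors from the LocalJobRunner can be captured and returned to the caller; otherwise
            // the job is submitted to the cluster and only its job id is returned.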
            if (bTestMode || bLocalMode) {
                hj.submit();
                currThreadId = null;
                Logger.getRootLogger().addAppender(this);
                currLocalJobId = hj.getJobID().toString();
                currLocalJobErrs.setLength(0);
                while (!hj.isComplete()) {
                    Thread.sleep(1000);
                }
                Logger.getRootLogger().removeAppender(this);
                if (hj.isSuccessful()) {
                    if (this.currLocalJobErrs.length() > 0) {
                        return "local_done: " + this.currLocalJobErrs.toString();
                    }
                    else {
                        return "local_done";
                    }
                }
                else {
                    return "Error: " + this.currLocalJobErrs.toString();
                }
            }
            else {
                hj.submit();
                String jobId = hj.getJobID().toString();
                return jobId;
            }
        } catch (Exception e) {
            e.printStackTrace();
            Thread.currentThread().setContextClassLoader(savedClassLoader);
            return "Error: " + InfiniteHadoopUtils.createExceptionMessage(e);
        } finally {
            Thread.currentThread().setContextClassLoader(savedClassLoader);
        }
    }

    public String runHadoopJob_commandLine(CustomMapReduceJobPojo job, String jar) {
        String jobid = null;
        try {
            job.tempConfigXMLLocation = createConfigXML_commandLine(job.jobtitle, job.inputCollection,
                    job._id.toString(), job.tempConfigXMLLocation, job.mapper, job.reducer, job.combiner,
                    InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY),
                    job.communityIds, job.isCustomTable, job.getOutputDatabase(), job.outputKey, job.outputValue,
                    job.outputCollectionTemp, job.arguments, job.incrementalMode, job.submitterID, job.selfMerge,
                    job.outputCollection, job.appendResults);

            Runtime rt = Runtime.getRuntime();
            String[] commands = new String[] { "hadoop", "--config", props_custom.getHadoopConfigPath() + "/hadoop",
                    "jar", jar, "-conf", job.tempConfigXMLLocation };
            String command = "";
            for (String s : commands)
                command += s + " ";
            Process pr = rt.exec(command);

            //Once we start running the command attach to stderr to
            //receive the output to parse out the jobid
            InputStream in = pr.getErrorStream();
            InputStreamReader is = new InputStreamReader(in);
            BufferedReader br = new BufferedReader(is);
            StringBuilder output = new StringBuilder();
            String line = null;
            long startTime = new Date().getTime();
            boolean bGotJobId = false;

            //while we haven't found the id, there are still lines to read, and it hasn't been more than 60 seconds
            while (!bGotJobId && (line = br.readLine()) != null
                    && (new Date().getTime() - startTime) < InfiniteHadoopUtils.SECONDS_60) {
                output.append(line);
                int getJobIdIndex = -1;
                String searchstring = "INFO mapred.JobClient: Running job: ";
                if ((getJobIdIndex = line.indexOf(searchstring)) >= 0) {
                    // Get JobId and trim() it (obviously trivial)
                    jobid = line.substring(getJobIdIndex + searchstring.length()).trim();
                    bGotJobId = true;
                }
            }

            //60 seconds passed and we never found the id
            if (!bGotJobId) {
                _logger.info("job_start_timeout_error_title=" + job.jobtitle + " job_start_timeout_error_id="
                        + job._id.toString() + " job_start_timeout_error_message=" + output.toString());
                //if we never found the id mark it as errored out
                return "Error:\n" + output.toString();
            }
        } catch (Exception ex) {
            //had an error running command
            //probably log error to the job so we stop trying to run it
            _logger.info("job_start_timeout_error_title=" + job.jobtitle + " job_start_timeout_error_id="
                    + job._id.toString() + " job_start_timeout_error_message="
                    + InfiniteHadoopUtils.createExceptionMessage(ex));
            jobid = "Error:\n" + ex.getMessage(); // (means this gets displayed)
        }
        return jobid;
    }
    ////////////////////////////////////////////////////////////////////////
    ////////////////////////////////////////////////////////////////////////

    // UTILS

    /**
     * Create the xml file that will configure the mongo commands and
     * write that to the server
     *
     * @param input
     * @param output
     * @throws IOException
     */
    private String createConfigXML_commandLine(String title, String input, String output, String configLocation,
            String mapper, String reducer, String combiner, String query, List<ObjectId> communityIds,
            boolean isCustomTable, String outputDatabase, String outputKey, String outputValue,
            String tempOutputCollection, String arguments, Boolean incrementalMode, ObjectId userId,
            Boolean selfMerge, String originalOutputCollection, Boolean appendResults) throws IOException {
        if (configLocation == null)
            configLocation = InfiniteHadoopUtils.assignNewConfigLocation(props_custom);

        File configFile = new File(configLocation);
        FileWriter fstream = new FileWriter(configFile);
        BufferedWriter out = new BufferedWriter(fstream);
        createConfigXML(out, title, input,
                InfiniteHadoopUtils.getQueryOrProcessing(query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS),
                isCustomTable, outputDatabase, output, tempOutputCollection, mapper, reducer, combiner, query,
                communityIds, outputKey, outputValue, arguments, incrementalMode, userId, selfMerge,
                originalOutputCollection, appendResults);
        fstream.close();
        return configLocation;
    }
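    // (Added note) createConfigXML writes a Hadoop-style XML configuration combining the mongo-hadoop connector
    // settings (input/output URIs, mapper/reducer/combiner classes, limits, splits) with Infinit.e-specific
    // properties. It also interprets "$"-prefixed control fields embedded in the job's query ($limit, $splits,
    // $docsPerSplit, $srctags, $caches, $reducers, $mapper_key_class, $mapper_value_class, $tmin, $tmax) and
    // strips them out before the query itself is serialized.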
    private void createConfigXML(Writer out, String title, String input, String fields, boolean isCustomTable,
            String outputDatabase, String output, String tempOutputCollection, String mapper, String reducer,
            String combiner, String query, List<ObjectId> communityIds, String outputKey, String outputValue,
            String arguments, Boolean incrementalMode, ObjectId userId, Boolean selfMerge,
            String originalOutputCollection, Boolean appendResults) throws IOException {
        String dbserver = prop_general.getDatabaseServer();
        output = outputDatabase + "." + tempOutputCollection;

        boolean isAdmin = AuthUtils.isAdmin(userId);

        int nSplits = 8;
        int nDocsPerSplit = 12500;

        //add communities to query if this is not a custom table
        BasicDBObject oldQueryObj = null;
        BasicDBObject srcTags = null;
        // Start with the old query:
        if (query.startsWith("{")) {
            oldQueryObj = (BasicDBObject) com.mongodb.util.JSON.parse(query);
        }
        else {
            oldQueryObj = new BasicDBObject();
        }
        boolean elasticsearchQuery = oldQueryObj.containsField("qt") && !isCustomTable;

        int nLimit = 0;
        if (oldQueryObj.containsField("$limit")) {
            nLimit = oldQueryObj.getInt("$limit");
            oldQueryObj.remove("$limit");
        }
        if (oldQueryObj.containsField("$splits")) {
            nSplits = oldQueryObj.getInt("$splits");
            oldQueryObj.remove("$splits");
        }
        if (oldQueryObj.containsField("$srctags")) {
            srcTags = new BasicDBObject(SourcePojo.tags_, oldQueryObj.get("$srctags"));
            oldQueryObj.remove("$srctags");
        }
        if (bLocalMode) {
            // If in local mode, then set this to a large number so we always run inside our limit/split version
            // (since for some reason MongoInputFormat seems to fail on large collections)
            nSplits = InfiniteMongoSplitter.MAX_SPLITS;
        }
        if (oldQueryObj.containsField("$docsPerSplit")) {
            nDocsPerSplit = oldQueryObj.getInt("$docsPerSplit");
            oldQueryObj.remove("$docsPerSplit");
        }
        oldQueryObj.remove("$fields");
        oldQueryObj.remove("$output");
        oldQueryObj.remove("$reducers");
        String mapperKeyClass = oldQueryObj.getString("$mapper_key_class", "");
        String mapperValueClass = oldQueryObj.getString("$mapper_value_class", "");
        oldQueryObj.remove("$mapper_key_class");
        oldQueryObj.remove("$mapper_value_class");
        String cacheList = null;
        Object cacheObj = oldQueryObj.get("$caches");
        if (null != cacheObj) {
            cacheList = cacheObj.toString(); // (either array of strings, or single string)
            if (!cacheList.startsWith("[")) {
                cacheList = "[" + cacheList + "]"; // ("must" now be valid array)
            }
            oldQueryObj.remove("$caches");
        } //TESTED

        if (null != nDebugLimit) { // (debug mode override)
            nLimit = nDebugLimit;
        }
        boolean tmpIncMode = (null != incrementalMode) && incrementalMode;

        Date fromOverride = null;
        Date toOverride = null;
        Object fromOverrideObj = oldQueryObj.remove("$tmin");
        Object toOverrideObj = oldQueryObj.remove("$tmax");
        if (null != fromOverrideObj) {
            fromOverride = InfiniteHadoopUtils.dateStringFromObject(fromOverrideObj, true);
        }
        if (null != toOverrideObj) {
            toOverride = InfiniteHadoopUtils.dateStringFromObject(toOverrideObj, false);
        }

        if (!isCustomTable) {
            if (elasticsearchQuery) {
                oldQueryObj.put("communityIds", communityIds);
                //tmin/tmax not supported - already have that capability as part of the query
            }
            else {
                if (input.equals("feature.temporal")) {
                    if ((null != fromOverride) || (null != toOverride)) {
                        oldQueryObj.put("value.maxTime",
                                InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, true));
                    } //TESTED
                    oldQueryObj.put("_id.c", new BasicDBObject(DbManager.in_, communityIds));
                }
                else {
                    oldQueryObj.put(DocumentPojo.communityId_, new BasicDBObject(DbManager.in_, communityIds));
                    if ((null != fromOverride) || (null != toOverride)) {
                        oldQueryObj.put("_id",
                                InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, false));
                    } //TESTED
                    if (input.equals("doc_metadata.metadata")) {
                        oldQueryObj.put(DocumentPojo.index_, new BasicDBObject(DbManager.ne_, "?DEL?")); // (ensures not soft-deleted)
                    }
                }
            }
        }
        else {
            if ((null != fromOverride) || (null != toOverride)) {
                oldQueryObj.put("_id", InfiniteHadoopUtils.createDateRange(fromOverride, toOverride, false));
            } //TESTED
            //get the custom table (and database)
            input = CustomOutputManager.getCustomDbAndCollection(input);
        }
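        // (Added note) At this point the user query has been rewritten (community filter, date ranges,
        // soft-delete filter) and all "$" control fields stripped; everything below serializes the derived
        // settings as Hadoop <property> elements.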
        query = oldQueryObj.toString();

        if (arguments == null)
            arguments = "";

        // Generic configuration
        out.write("<?xml version=\"1.0\"?>\n<configuration>");

        // Mongo specific configuration
        out.write("\n\t<property><!-- name of job shown in jobtracker --><name>mongo.job.name</name><value>" + title
                + "</value></property>"
                + "\n\t<property><!-- run the job verbosely ? --><name>mongo.job.verbose</name><value>true</value></property>"
                + "\n\t<property><!-- Run the job in the foreground and wait for response, or background it? --><name>mongo.job.background</name><value>false</value></property>"
                + "\n\t<property><!-- If you are reading from mongo, the URI --><name>mongo.input.uri</name><value>mongodb://"
                + dbserver + "/" + input + "</value></property>"
                + "\n\t<property><!-- If you are writing to mongo, the URI --><name>mongo.output.uri</name><value>mongodb://"
                + dbserver + "/" + output + "</value> </property>"
                + "\n\t<property><!-- The query, in JSON, to execute [OPTIONAL] --><name>mongo.input.query</name><value>"
                + StringEscapeUtils.escapeXml(query) + "</value></property>"
                + "\n\t<property><!-- The fields, in JSON, to read [OPTIONAL] --><name>mongo.input.fields</name><value>"
                + ((fields == null) ? ("") : fields) + "</value></property>"
                + "\n\t<property><!-- A JSON sort specification for read [OPTIONAL] --><name>mongo.input.sort</name><value></value></property>"
                + "\n\t<property><!-- The number of documents to limit to for read [OPTIONAL] --><name>mongo.input.limit</name><value>"
                + nLimit + "</value><!-- 0 == no limit --></property>"
                + "\n\t<property><!-- The number of documents to skip in read [OPTIONAL] --><!-- TODO - Are we running limit() or skip() first? --><name>mongo.input.skip</name><value>0</value> <!-- 0 == no skip --></property>"
                + "\n\t<property><!-- Class for the mapper --><name>mongo.job.mapper</name><value>" + mapper
                + "</value></property>"
                + "\n\t<property><!-- Reducer class --><name>mongo.job.reducer</name><value>" + reducer
                + "</value></property>"
                + "\n\t<property><!-- InputFormat Class --><name>mongo.job.input.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat</value></property>"
                + "\n\t<property><!-- OutputFormat Class --><name>mongo.job.output.format</name><value>com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat</value></property>"
                + "\n\t<property><!-- Output key class for the output format --><name>mongo.job.output.key</name><value>"
                + outputKey + "</value></property>"
                + "\n\t<property><!-- Output value class for the output format --><name>mongo.job.output.value</name><value>"
                + outputValue + "</value></property>"
                + "\n\t<property><!-- Output key class for the mapper [optional] --><name>mongo.job.mapper.output.key</name><value>"
                + mapperKeyClass + "</value></property>"
                + "\n\t<property><!-- Output value class for the mapper [optional] --><name>mongo.job.mapper.output.value</name><value>"
                + mapperValueClass + "</value></property>"
                + "\n\t<property><!-- Class for the combiner [optional] --><name>mongo.job.combiner</name><value>"
                + combiner + "</value></property>"
                + "\n\t<property><!-- Partitioner class [optional] --><name>mongo.job.partitioner</name><value></value></property>"
                + "\n\t<property><!-- Sort Comparator class [optional] --><name>mongo.job.sort_comparator</name><value></value></property>"
                + "\n\t<property><!-- Split Size [optional] --><name>mongo.input.split_size</name><value>32</value></property>");
        // Infinit.e specific configuration
        out.write("\n\t<property><!-- User Arguments [optional] --><name>infinit.e.userid</name><value>"
                + StringEscapeUtils.escapeXml(userId.toString()) + "</value></property>"
                + "\n\t<property><!-- User Arguments [optional] --><name>arguments</name><value>"
                + StringEscapeUtils.escapeXml(arguments) + "</value></property>"
                + "\n\t<property><!-- Maximum number of splits [optional] --><name>max.splits</name><value>" + nSplits
                + "</value></property>"
                + "\n\t<property><!-- Maximum number of docs per split [optional] --><name>max.docs.per.split</name><value>"
                + nDocsPerSplit + "</value></property>"
                + "\n\t<property><!-- Infinit.e incremental mode [optional] --><name>update.incremental</name><value>"
                + tmpIncMode + "</value></property>"
                + "\n\t<property><!-- Infinit.e quick admin check [optional] --><name>infinit.e.is.admin</name><value>"
                + isAdmin + "</value></property>"
                + "\n\t<property><!-- Infinit.e userid [optional] --><name>infinit.e.userid</name><value>" + userId
                + "</value></property>");

        if (null != cacheList) {
            out.write("\n\t<property><!-- Infinit.e cache list [optional] --><name>infinit.e.cache.list</name><value>"
                    + cacheList + "</value></property>");
        } //TESTED
        if (null != srcTags) {
            out.write("\n\t<property><!-- Infinit.e src tags filter [optional] --><name>infinit.e.source.tags.filter</name><value>"
                    + srcTags.toString() + "</value></property>");
        }

        if (null != selfMerge && selfMerge && originalOutputCollection != null) {
            originalOutputCollection = "mongodb://" + dbserver + "/" + outputDatabase + "." + originalOutputCollection;
            out.write("\n\t<property><!-- This jobs output collection for passing into the mapper along with input collection [optional] --><name>infinit.e.selfMerge</name><value>"
                    + originalOutputCollection + "</value></property>");
        }

        // Closing thoughts:
        out.write("\n</configuration>");

        out.flush();
        out.close();
    }

    ////////////////////////////////////////////////////////////////////////
    ////////////////////////////////////////////////////////////////////////

    // REALLY LOW LEVEL UTILS

    private static String getTagValue(String sTag, Element eElement) {
        NodeList nlList = eElement.getElementsByTagName(sTag).item(0).getChildNodes();
        Node nValue = (Node) nlList.item(0);
        if (null != nValue) {
            return nValue.getNodeValue();
        }
        else {
            return null;
        }
    }

    ////////////////////////////////////////////////////////////////////////
    ////////////////////////////////////////////////////////////////////////

    // LOGGING INTERFACE

    private String currLocalJobId = null;
    private String currSanityCheck = null;
    private String currThreadId = null;
    private String currJobName = null;
    private StringBuffer currLocalJobErrs = new StringBuffer();

    @Override
    public void close() {
    }

    @Override
    public boolean requiresLayout() {
        return false;
    }
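    // (Added note) append() only matters in local/test mode, when this launcher is registered on the root
    // logger: it uses the job name and thread name to pick out messages from the job's own mapper/reducer
    // thread and from the LocalJobRunner, and accumulates errors (all messages in test mode) into
    // currLocalJobErrs for the caller of runHadoopJob.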
    @Override
    protected void append(LoggingEvent arg0) {
        // Get current thread id (need to check these even if we have them so we don't log them multiple times)
        if (arg0.getLoggerName().equals("com.ikanow.infinit.e.data_model.custom.InfiniteFileInputReader")) {
            if ((null != currJobName) && arg0.getRenderedMessage().startsWith(currJobName + ":")) {
                if (null == currThreadId) {
                    // this is one of the first message that is printed out so get the thread...
                    currThreadId = arg0.getThreadName();
                    currSanityCheck = "Task:attempt_" + currLocalJobId.substring(4) + "_";
                }
                return; // (don't log this)
            }
        } //TESTED
        else if (arg0.getLoggerName()
                .equals("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat$InfiniteEsRecordReader")) {
            if ((null != currJobName) && arg0.getRenderedMessage().startsWith(currJobName + ":")) {
                if (null == currThreadId) {
                    // this is one of the first message that is printed out so get the thread...
                    currThreadId = arg0.getThreadName();
                    currSanityCheck = "Task:attempt_" + currLocalJobId.substring(4) + "_";
                }
                return; // (don't log this)
            }
        } //TESTED
        else if (arg0.getLoggerName().equals("com.ikanow.infinit.e.data_model.custom.InfiniteMongoRecordReader")) {
            if ((null != currJobName) && arg0.getRenderedMessage().startsWith(currJobName + ":")) {
                if (null == currThreadId) {
                    // this is one of the first message that is printed out so get the thread...
                    currThreadId = arg0.getThreadName();
                    currSanityCheck = "Task:attempt_" + currLocalJobId.substring(4) + "_";
                }
                return; // (don't log this)
            }
        } //TESTED

        if ((null != currThreadId) && arg0.getLoggerName().equals("org.apache.hadoop.mapred.Task")) {
            if (arg0.getRenderedMessage().startsWith(currSanityCheck)) {
                // This is to check we didn't accidentally get someone else's messages
                if (!currThreadId.equals(arg0.getThreadName())) {
                    _logger.error("Drop all logging: thread mismatch for " + currLocalJobId);
                    currLocalJobErrs.setLength(0);
                    currThreadId = "ZXCVB";
                }
            }
        } //TESTED
        else if (arg0.getLoggerName().equals("org.apache.hadoop.mapred.LocalJobRunner")) {
            if (arg0.getMessage().toString().equals(currLocalJobId)) {
                String[] exceptionInfo = arg0.getThrowableStrRep();
                if (null != exceptionInfo) {
                    currLocalJobErrs.append("Uncaught Exception in local job.\n");
                    for (String errLine : exceptionInfo) {
                        if (errLine.startsWith(" at org.apache.hadoop")) {
                            break;
                        }
                        currLocalJobErrs.append(errLine).append("\n");
                    }
                }
            }
        } //TESTED (uncaught exception)
        else if (!arg0.getLoggerName().startsWith("org.apache.hadoop")
                && !arg0.getLoggerName().startsWith("com.mongodb.hadoop.")) {
            if (arg0.getThreadName().equals(currThreadId)) {
                if ((arg0.getLevel() == Level.ERROR) || bTestMode) {
                    currLocalJobErrs.append('[').append(arg0.getLevel()).append("] ").append(arg0.getLoggerName())
                            .append(":").append(arg0.getLocationInformation().getLineNumber()).append(" ")
                            .append(arg0.getMessage()).append("\n");
                    String[] exceptionInfo = arg0.getThrowableStrRep();
                    if (null != exceptionInfo) {
                        for (String errLine : exceptionInfo) {
                            if (errLine.startsWith(" at org.apache.hadoop")) {
                                break;
                            }
                            currLocalJobErrs.append(errLine).append("\n");
                        }
                    } //(end if exception information present)
                } //(end if error or in test mode)
            } //(end if this is my thread)
        } //TESTED
    }
}
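For reference, here is a minimal sketch of how this launcher might be driven. It is not part of the original file: the example package, mapper/reducer class names, jar path, and job field values are all illustrative, and it assumes CustomMapReduceJobPojo exposes the fields used above and that PropertiesManager has a usable default constructor (as it is only passed through in the listing).

// Hypothetical driver sketch - everything named "example"/"Example" is an assumption, not project code.
package com.example;

import java.util.Arrays;

import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo;
import com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher;
import com.ikanow.infinit.e.processing.custom.utils.PropertiesManager;

public class ExampleLauncherDriver {
    public static void main(String[] args) throws Exception {
        CustomMapReduceJobPojo job = new CustomMapReduceJobPojo();
        job._id = new ObjectId();                             // normally assigned when the job is stored
        job.submitterID = new ObjectId();                     // written out as the infinit.e.userid property
        job.communityIds = Arrays.asList(new ObjectId());     // communities the query is restricted to
        job.jobtitle = "example_word_count";                  // shown in the job tracker
        job.inputCollection = "doc_metadata.metadata";        // read Infinit.e document metadata
        job.mapper = "com.example.WordCountMapper";           // classes inside the uploaded job jar (illustrative)
        job.reducer = "com.example.WordCountReducer";
        job.combiner = "none";
        job.outputKey = "org.apache.hadoop.io.Text";
        job.outputValue = "org.apache.hadoop.io.IntWritable";
        job.query = "{ \"$limit\": 1000, \"$splits\": 4 }";   // "$" control fields are stripped by createConfigXML

        // Local mode with a non-null debug limit, so bTestMode is enabled and errors are captured via the
        // log4j appender and returned in the result string.
        CustomHadoopTaskLauncher launcher = new CustomHadoopTaskLauncher(true, 100, new PropertiesManager());
        String result = launcher.runHadoopJob(job, "/tmp/example-custom-job.jar");
        System.out.println(result); // "local_done[: <errors>]" locally, or the Hadoop job id on a cluster
    }
}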