List of usage examples for org.apache.hadoop.conf Configuration setBoolean
public void setBoolean(String name, boolean value)
Sets the value of the name property to a boolean.
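Before the per-project examples, a minimal, self-contained sketch of the call itself (the property name "example.feature.enabled" is made up purely for illustration):

import org.apache.hadoop.conf.Configuration;

public class SetBooleanExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store a boolean under a property name (hypothetical key).
        conf.setBoolean("example.feature.enabled", true);

        // Read it back; the second argument is the default returned
        // when the property is not set.
        boolean enabled = conf.getBoolean("example.feature.enabled", false);
        System.out.println("example.feature.enabled = " + enabled);
    }
}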
From source file: com.google.cloud.hadoop.fs.gcs.ListHelperGoogleHadoopFileSystem.java
License: Open Source License
/**
 * Factory method for constructing and initializing an instance of
 * ListHelperGoogleHadoopFileSystem which is ready to list/get FileStatus entries corresponding
 * to {@code fileInfos}.
 */
public static GoogleHadoopFileSystem createInstance(GoogleCloudStorageFileSystem gcsfs,
        Collection<FileInfo> fileInfos) throws IOException {
    Preconditions.checkState(!fileInfos.isEmpty(),
            "Cannot construct ListHelperGoogleHadoopFileSystem with empty fileInfos list!");
    List<GoogleCloudStorageItemInfo> infos = new ArrayList<>();
    URI rootUri = null;
    Set<URI> providedPaths = new HashSet<>();
    for (FileInfo info : fileInfos) {
        infos.add(info.getItemInfo());
        providedPaths.add(info.getPath());
        if (rootUri == null) {
            // Set the root URI to the first path in the collection.
            rootUri = info.getPath();
        }
    }

    // The flow for populating this doesn't bother to populate metadata entries for parent
    // directories but we know the parent directories are expected to exist, so we'll just
    // populate the missing entries explicitly here. Necessary for getFileStatus(parentOfInfo)
    // to work when using an instance of this class.
    for (FileInfo info : fileInfos) {
        URI parentPath = gcsfs.getParentPath(info.getPath());
        while (parentPath != null && !parentPath.equals(GoogleCloudStorageFileSystem.GCS_ROOT)) {
            if (!providedPaths.contains(parentPath)) {
                LOG.debug("Adding fake entry for missing parent path '{}'", parentPath);
                GoogleCloudStorageItemInfo fakeInfo = new GoogleCloudStorageItemInfo(
                        gcsfs.getPathCodec().validatePathAndGetId(parentPath, true), 0, 0, null, null);
                infos.add(fakeInfo);
                providedPaths.add(parentPath);
            }
            parentPath = gcsfs.getParentPath(parentPath);
        }
    }

    // Add in placeholder bucket info, since the bucket info won't be relevant for our listObject
    // operations.
    String tempBucket = rootUri.getAuthority();
    infos.add(new GoogleCloudStorageItemInfo(new StorageResourceId(tempBucket), 0, 0, "", ""));

    MetadataReadOnlyGoogleCloudStorage tempGcs = new MetadataReadOnlyGoogleCloudStorage(infos);
    GoogleCloudStorageFileSystem tempGcsFs = new GoogleCloudStorageFileSystem(tempGcs, gcsfs.getOptions());
    GoogleHadoopFileSystem tempGhfs = new ListHelperGoogleHadoopFileSystem(tempGcsFs);

    Configuration tempConfig = new Configuration();
    tempConfig.set(GoogleHadoopFileSystemBase.GCS_SYSTEM_BUCKET_KEY, tempBucket);
    tempConfig.setBoolean(GoogleHadoopFileSystemBase.GCS_CREATE_SYSTEM_BUCKET_KEY, false);
    tempConfig.set(GoogleHadoopFileSystemBase.GCS_WORKING_DIRECTORY_KEY, "/");
    // Set initSuperclass == false to avoid screwing up FileSystem statistics.
    tempGhfs.initialize(rootUri, tempConfig, false);
    return tempGhfs;
}
From source file: com.google.mr4c.hadoop.yarn.YarnTestBinding.java
License: Open Source License
private void startMrCluster() throws IOException {
    Configuration conf = new JobConf();
    FileSystem.setDefaultUri(conf, HadoopTestUtils.getTestDFS().getUri());
    conf.setBoolean(YarnConfiguration.YARN_MINICLUSTER_FIXED_PORTS, true);
    conf.setBoolean(JHAdminConfig.MR_HISTORY_MINICLUSTER_FIXED_PORTS, true);
    String addr = MiniYARNCluster.getHostname() + ":0";
    conf.set(YarnConfiguration.RM_ADDRESS, addr);
    conf.set(JHAdminConfig.MR_HISTORY_ADDRESS, addr);
    m_mrCluster = MiniMRClientClusterFactory.create(HadoopTestUtils.class, "MR4CTests",
            1, // num node managers
            conf);

    // make sure startup is finished
    for (int i = 0; i < 60; i++) {
        String newAddr = m_mrCluster.getConfig().get(YarnConfiguration.RM_ADDRESS);
        if (newAddr.equals(addr)) {
            s_log.warn("MiniYARNCluster startup not complete");
            try {
                Thread.sleep(1000);
            } catch (InterruptedException ie) {
                throw new IOException(ie);
            }
        } else {
            s_log.info("MiniYARNCluster now available at {}", newAddr);
            return;
        }
    }
    throw new IOException("MiniYARNCluster taking too long to startup");
}
From source file: com.gsinnovations.howdah.AbstractJob.java
License: Apache License
protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
        Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
        Class<? extends OutputFormat> outputFormat) throws IOException {

    Job job = new Job(new Configuration(getConf()));
    Configuration jobConf = job.getConfiguration();

    if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);
    } else {
        job.setJarByClass(reducer);
    }

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    job.setMapOutputKeyClass(mapperKey);
    job.setMapOutputValueClass(mapperValue);

    jobConf.setBoolean("mapred.compress.map.output", true);

    job.setReducerClass(reducer);
    job.setOutputKeyClass(reducerKey);
    job.setOutputValueClass(reducerValue);

    job.setJobName(getCustomJobName(job, mapper, reducer));

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
}
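A note on the property name used above: in Hadoop 2.x and later, mapred.compress.map.output is deprecated in favor of mapreduce.map.output.compress (the old key still resolves through Hadoop's deprecation mapping). A minimal sketch of the equivalent setting with the newer key, shown on a standalone Configuration for illustration:

import org.apache.hadoop.conf.Configuration;

public class MapOutputCompression {
    public static void main(String[] args) {
        Configuration jobConf = new Configuration();
        // Newer property name; Hadoop maps the deprecated
        // "mapred.compress.map.output" onto this key.
        jobConf.setBoolean("mapreduce.map.output.compress", true);
        System.out.println(jobConf.getBoolean("mapreduce.map.output.compress", false));
    }
}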
From source file: com.hadoop.mapreduce.TestLzoLazyLoading.java
License: Open Source License
private void runWordCount(Configuration cf, boolean compressIn, boolean compressOut)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration thisConf = new Configuration(cf);
    if (compressIn) {
        thisConf.setBoolean("mapred.compression.lzo.test.codec-checked-after-map", true);
    }
    if (compressOut) {
        thisConf.setBoolean("mapred.compression.lzo.test.codec-checked-after-reduce", true);
    }
    Path pathIn = new Path(TEST_ROOT_DIR + "/in");
    Path pathOut = new Path(TEST_ROOT_DIR + "/out");
    localFs.delete(pathIn, true);
    localFs.delete(pathOut, true);
    writeFile(makeFileName("in/part1", compressIn), "this is a test\nof word count test\ntest\n");
    writeFile(makeFileName("in/part2", compressIn), "more test");
    Job job = new Job(thisConf, "word count");
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    if (compressOut) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, LzoCodec.class);
    }
    FileInputFormat.addInputPath(job, pathIn);
    FileOutputFormat.setOutputPath(job, pathOut);
    job.submit();
    assertEquals("IsLzoChecked (client)?", compressIn, LzoCodec.isNativeLzoChecked());
    assertTrue(job.waitForCompletion(false));
    String result = readFile(makeFileName("out/part-r-00000", compressOut));
    System.out.println(result);
    assertEquals("a\t1\ncount\t1\nis\t1\nmore\t1\nof\t1\ntest\t4\nthis\t1\nword\t1\n", result);
}
From source file: com.hortonworks.minicluster.MiniHadoopCluster.java
License: Apache License
@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.setBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, true);
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
            new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
            String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
            ShuffleHandler.class, Service.class);
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);
    this.addService(new ResourceManagerWrapper());
    for (int index = 0; index < this.nodeManagers.length; index++) {
        this.nodeManagers[index] = new ShortCircuitedNodeManager();
        this.addService(new NodeManagerWrapper(index));
    }
    super.serviceInit(conf instanceof YarnConfiguration ? conf : new YarnConfiguration(conf));
}
From source file: com.huayu.metis.flume.sink.hdfs.BucketWriter.java
License: Apache License
/**
 * open() is called by append()
 *
 * @throws java.io.IOException
 * @throws InterruptedException
 */
private void open() throws IOException, InterruptedException {
    if ((filePath == null) || (writer == null)) {
        throw new IOException("Invalid file settings");
    }

    final Configuration config = new Configuration();
    // disable FileSystem JVM shutdown hook
    config.setBoolean("fs.automatic.close", false);

    // Hadoop is not thread safe when doing certain RPC operations,
    // including getFileSystem(), when running under Kerberos.
    // open() must be called by one thread at a time in the JVM.
    // NOTE: tried synchronizing on the underlying Kerberos principal previously
    // which caused deadlocks. See FLUME-1231.
    synchronized (staticLock) {
        checkAndThrowInterruptedException();
        try {
            long counter = fileExtensionCounter.incrementAndGet();
            String fullFileName = fileName + "." + counter + "." + fileSuffix;
            bucketPath = filePath + "/" + fullFileName + inUseSuffix;
            targetPath = filePath + "/" + fullFileName;
            LOG.debug("[hy]Creating BuckPath:" + bucketPath + ", TargetPath:" + targetPath);
            callWithTimeout(new CallRunner<Void>() {
                @Override
                public Void call() throws Exception {
                    // Need to get reference to FS using above config before underlying
                    // writer does in order to avoid shutdown hook & IllegalStateExceptions
                    fileSystem = new Path(bucketPath).getFileSystem(config);
                    writer.open(bucketPath);
                    return null;
                }
            });
        } catch (Exception ex) {
            sinkCounter.incrementConnectionFailedCount();
            if (ex instanceof IOException) {
                throw (IOException) ex;
            } else {
                throw Throwables.propagate(ex);
            }
        }
    }
    sinkCounter.incrementConnectionCreatedCount();
    resetCounters();

    // if time-based rolling is enabled, schedule the roll
    if (rollInterval > 0) {
        Callable<Void> action = new Callable<Void>() {
            public Void call() throws Exception {
                LOG.debug("Rolling file ({}): Roll scheduled after {} sec elapsed.", bucketPath, rollInterval);
                try {
                    // Roll the file and remove reference from sfWriters map.
                    close(true);
                } catch (Throwable t) {
                    LOG.error("Unexpected error", t);
                }
                return null;
            }
        };
        timedRollFuture = timedRollerPool.schedule(action, rollInterval, TimeUnit.SECONDS);
    }
    isOpen = true;
}
From source file: com.idvp.platform.hdfs.BucketWriter.java
License: Apache License
/**
 * open() is called by append()
 *
 * @throws IOException
 * @throws InterruptedException
 */
private void open() throws IOException, InterruptedException {
    if ((filePath == null) || (writer == null)) {
        throw new IOException("Invalid file settings");
    }

    final Configuration config = new Configuration();
    // disable FileSystem JVM shutdown hook
    config.setBoolean("fs.automatic.close", false);

    // Hadoop is not thread safe when doing certain RPC operations,
    // including getFileSystem(), when running under Kerberos.
    // open() must be called by one thread at a time in the JVM.
    // NOTE: tried synchronizing on the underlying Kerberos principal previously
    // which caused deadlocks. See FLUME-1231.
    synchronized (staticLock) {
        checkAndThrowInterruptedException();
        try {
            long counter = fileExtensionCounter.incrementAndGet();
            String fullFileName = fileName + "." + counter;
            if (fileSuffix != null && fileSuffix.length() > 0) {
                fullFileName += fileSuffix;
            }
            bucketPath = filePath + "/" + inUsePrefix + fullFileName + inUseSuffix;
            targetPath = filePath + "/" + fullFileName;
            LOG.info("Creating " + bucketPath);
            callWithTimeout((CallRunner<Void>) () -> {
                // Need to get reference to FS using above config before underlying
                // writer does in order to avoid shutdown hook & IllegalStateExceptions
                if (!mockFsInjected) {
                    fileSystem = new Path(bucketPath).getFileSystem(config);
                }
                writer.open(bucketPath);
                return null;
            });
        } catch (Exception ex) {
            if (ex instanceof IOException) {
                throw (IOException) ex;
            } else {
                throw Throwables.propagate(ex);
            }
        }
    }
    isClosedMethod = getRefIsClosed();
    resetCounters();

    // if time-based rolling is enabled, schedule the roll
    if (rollInterval > 0) {
        Callable<Void> action = new Callable<Void>() {
            public Void call() throws Exception {
                LOG.debug("Rolling file ({}): Roll scheduled after {} sec elapsed.", bucketPath, rollInterval);
                try {
                    // Roll the file and remove reference from sfWriters map.
                    close(true);
                } catch (Throwable t) {
                    LOG.error("Unexpected error", t);
                }
                return null;
            }
        };
        timedRollFuture = timedRollerPool.schedule(action, rollInterval, TimeUnit.SECONDS);
    }
    isOpen = true;
}
From source file: com.ikanow.aleph2.analytics.hadoop.assets.VerySimpleLocalExample.java
License: Apache License
@SuppressWarnings({ "deprecation", "unchecked", "rawtypes" })
@Test
public void test_localHadoopLaunch()
        throws IOException, IllegalStateException, ClassNotFoundException, InterruptedException {

    // 0) Setup the temp dir
    final String temp_dir = System.getProperty("java.io.tmpdir") + File.separator;
    //final Path tmp_path = FileContext.getLocalFSFileContext().makeQualified(new Path(temp_dir));
    final Path tmp_path2 = FileContext.getLocalFSFileContext()
            .makeQualified(new Path(temp_dir + "/tmp_output"));
    try {
        FileContext.getLocalFSFileContext().delete(tmp_path2, true);
    } catch (Exception e) {
    } // (just doesn't exist yet)

    // 1) Setup config with local mode
    final Configuration config = new Configuration();
    config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)
    config.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");
    config.set("mapred.job.tracker", "local");
    config.set("fs.defaultFS", "local");
    config.unset("mapreduce.framework.name");
    // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place
    config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");

    // 2) Build job and do more setup using the Job API
    //TODO: not sure why this is deprecated, it doesn't seem to be in v1? We do need to move to JobConf at some point, but I ran into some
    // issues when trying to do everything I needed to for V1, so seems expedient to start here and migrate away
    final Job hj = new Job(config); // (NOTE: from here, changes to config are ignored)

    // Input format:
    //TODO: fails because of guava issue, looks like we'll need to move to 2.7 and check it works with 2.5.x server?
    //TextInputFormat.addInputPath(hj, tmp_path);
    //hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName("org.apache.hadoop.mapreduce.lib.input.TextInputFormat"));
    hj.setInputFormatClass(TestInputFormat.class);

    // Output format:
    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
            .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat"));
    TextOutputFormat.setOutputPath(hj, tmp_path2);

    // Mapper etc (combiner/reducer are similar)
    hj.setMapperClass(TestMapper.class);
    hj.setOutputKeyClass(Text.class);
    hj.setOutputValueClass(Text.class);
    hj.setNumReduceTasks(0); // (disable reducer for now)

    hj.setJar("test");

    try {
        hj.submit();
    } catch (UnsatisfiedLinkError e) {
        throw new RuntimeException(
                "This is a windows/hadoop compatibility problem - adding the hadoop-commons in the misc_test_assets subdirectory to the top of the classpath should resolve it (and does in V1), though I haven't yet made that work with Aleph2",
                e);
    }
    //hj.getJobID().toString();
    while (!hj.isComplete()) {
        Thread.sleep(1000);
    }
    assertTrue("Finished successfully", hj.isSuccessful());
}
From source file: com.ikanow.infinit.e.core.mapreduce.HadoopJobRunner.java
License: Open Source License
@SuppressWarnings({ "unchecked", "rawtypes" })
private String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation)
        throws IOException, SAXException, ParserConfigurationException {
    StringWriter xml = new StringWriter();
    createConfigXML(xml, job.jobtitle, job.inputCollection,
            getQueryOrProcessing(job.query, QuerySpec.INPUTFIELDS), job.isCustomTable, job.getOutputDatabase(),
            job._id.toString(), job.outputCollectionTemp, job.mapper, job.reducer, job.combiner,
            getQueryOrProcessing(job.query, QuerySpec.QUERY), job.communityIds, job.outputKey, job.outputValue,
            job.arguments);

    ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader();
    URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
            savedClassLoader);
    Thread.currentThread().setContextClassLoader(child);

    // Now load the XML into a configuration object:
    Configuration config = new Configuration();

    try {
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes()));
        NodeList nList = doc.getElementsByTagName("property");

        for (int temp = 0; temp < nList.getLength(); temp++) {
            Node nNode = nList.item(temp);
            if (nNode.getNodeType() == Node.ELEMENT_NODE) {
                Element eElement = (Element) nNode;
                String name = getTagValue("name", eElement);
                String value = getTagValue("value", eElement);
                if ((null != name) && (null != value)) {
                    config.set(name, value);
                }
            }
        }
    } catch (Exception e) {
        throw new IOException(e.getMessage());
    }

    // Now run the JAR file
    try {
        config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)

        if (bLocalMode) {
            config.set("mapred.job.tracker", "local");
            config.set("fs.default.name", "local");
        } else {
            String trackerUrl = HadoopUtils.getXMLProperty(
                    prop_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker");
            String fsUrl = HadoopUtils.getXMLProperty(
                    prop_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name");
            config.set("mapred.job.tracker", trackerUrl);
            config.set("fs.default.name", fsUrl);
        }

        Job hj = new Job(config);

        Class<?> classToLoad = Class.forName(job.mapper, true, child);
        hj.setJarByClass(classToLoad);
        hj.setInputFormatClass((Class<? extends InputFormat>) Class
                .forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child));
        if ((null != job.exportToHdfs) && job.exportToHdfs) {
            hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
                    .forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child));
            Path outPath = this.ensureOutputDirectory(job);
            SequenceFileOutputFormat.setOutputPath(hj, outPath);
        } else { // normal case, stays in MongoDB
            hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
                    .forName("com.mongodb.hadoop.MongoOutputFormat", true, child));
        }
        hj.setMapperClass((Class<? extends Mapper>) Class.forName(job.mapper, true, child));
        if ((null != job.reducer) && !job.reducer.equalsIgnoreCase("null")
                && !job.reducer.equalsIgnoreCase("none")) {
            hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child));
        } else {
            hj.setNumReduceTasks(0);
        }
        if ((null != job.combiner) && !job.combiner.equalsIgnoreCase("null")
                && !job.combiner.equalsIgnoreCase("none")) {
            hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child));
        }
        hj.setOutputKeyClass(Class.forName(job.outputKey, true, child));
        hj.setOutputValueClass(Class.forName(job.outputValue, true, child));

        hj.setJobName(job.jobtitle);

        if (bLocalMode) {
            hj.waitForCompletion(false);
            return "local_done";
        } else {
            hj.submit();
            String jobId = hj.getJobID().toString();
            return jobId;
        }
    } catch (Exception e) {
        e.printStackTrace();
        Thread.currentThread().setContextClassLoader(savedClassLoader);
        return "Error: " + HarvestExceptionUtils.createExceptionMessage(e);
    } finally {
        Thread.currentThread().setContextClassLoader(savedClassLoader);
    }
}
From source file: com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java
License: Open Source License
@SuppressWarnings({ "unchecked", "rawtypes" })
public String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation)
        throws IOException, SAXException, ParserConfigurationException {
    StringWriter xml = new StringWriter();
    String outputCollection = job.outputCollectionTemp; // (non-append mode)
    if ((null != job.appendResults) && job.appendResults)
        outputCollection = job.outputCollection; // (append mode, write directly in....)
    else if (null != job.incrementalMode)
        job.incrementalMode = false; // (not allowed to be in incremental mode and not update mode)

    createConfigXML(xml, job.jobtitle, job.inputCollection,
            InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS),
            job.isCustomTable, job.getOutputDatabase(), job._id.toString(), outputCollection, job.mapper,
            job.reducer, job.combiner,
            InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY),
            job.communityIds, job.outputKey, job.outputValue, job.arguments, job.incrementalMode,
            job.submitterID, job.selfMerge, job.outputCollection, job.appendResults);

    ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader();
    URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
            savedClassLoader);
    Thread.currentThread().setContextClassLoader(child);

    // Check version: for now, any infinit.e.data_model with an VersionTest class is acceptable
    boolean dataModelLoaded = true;
    try {
        URLClassLoader versionTest = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
                null);
        try {
            Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, versionTest);
        } catch (ClassNotFoundException e2) {
            //(this is fine, will use the cached version)
            dataModelLoaded = false;
        }
        if (dataModelLoaded)
            Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoVersionTest", true, versionTest);
    } catch (ClassNotFoundException e1) {
        throw new RuntimeException(
                "This JAR is compiled with too old a version of the data-model, please recompile with Jan 2014 (rc2) onwards");
    }

    // Now load the XML into a configuration object:
    Configuration config = new Configuration();
    // Add the client configuration overrides:
    if (!bLocalMode) {
        String hadoopConfigPath = props_custom.getHadoopConfigPath() + "/hadoop/";
        config.addResource(new Path(hadoopConfigPath + "core-site.xml"));
        config.addResource(new Path(hadoopConfigPath + "mapred-site.xml"));
        config.addResource(new Path(hadoopConfigPath + "hadoop-site.xml"));
    } //TESTED

    try {
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes()));
        NodeList nList = doc.getElementsByTagName("property");

        for (int temp = 0; temp < nList.getLength(); temp++) {
            Node nNode = nList.item(temp);
            if (nNode.getNodeType() == Node.ELEMENT_NODE) {
                Element eElement = (Element) nNode;
                String name = getTagValue("name", eElement);
                String value = getTagValue("value", eElement);
                if ((null != name) && (null != value)) {
                    config.set(name, value);
                }
            }
        }
    } catch (Exception e) {
        throw new IOException(e.getMessage());
    }

    // Some other config defaults:
    // (not sure if these are actually applied, or derived from the defaults - for some reason they don't appear in CDH's client config)
    config.set("mapred.map.tasks.speculative.execution", "false");
    config.set("mapred.reduce.tasks.speculative.execution", "false");
    // (default security is ignored here, have it set via HADOOP_TASKTRACKER_CONF in cloudera)

    // Now run the JAR file
    try {
        BasicDBObject advancedConfigurationDbo = null;
        try {
            advancedConfigurationDbo = (null != job.query)
                    ? ((BasicDBObject) com.mongodb.util.JSON.parse(job.query))
                    : (new BasicDBObject());
        } catch (Exception e) {
            advancedConfigurationDbo = new BasicDBObject();
        }
        boolean esMode = advancedConfigurationDbo.containsField("qt") && !job.isCustomTable;
        if (esMode && !job.inputCollection.equals("doc_metadata.metadata")) {
            throw new RuntimeException(
                    "Infinit.e Queries are only supported on doc_metadata - use MongoDB queries instead.");
        }

        config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)

        if (bLocalMode) { // local job tracker and FS mode
            config.set("mapred.job.tracker", "local");
            config.set("fs.default.name", "local");
        } else {
            if (bTestMode) { // run job tracker locally but FS mode remotely
                config.set("mapred.job.tracker", "local");
            } else { // normal job tracker
                String trackerUrl = HadoopUtils.getXMLProperty(
                        props_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker");
                config.set("mapred.job.tracker", trackerUrl);
            }
            String fsUrl = HadoopUtils.getXMLProperty(
                    props_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name");
            config.set("fs.default.name", fsUrl);
        }
        if (!dataModelLoaded && !(bTestMode || bLocalMode)) {
            // If running distributed and no data model loaded then add ourselves
            Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
                    "infinit.e.data_model.jar", config);
            DistributedCache.addFileToClassPath(jarToCache, config);
            jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
                    "infinit.e.processing.custom.library.jar", config);
            DistributedCache.addFileToClassPath(jarToCache, config);
        } //TESTED

        // Debug scripts (only if they exist), and only in non local/test mode
        if (!bLocalMode && !bTestMode) {
            try {
                Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
                        "custom_map_error_handler.sh", config);
                config.set("mapred.map.task.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
                config.set("mapreduce.map.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
                DistributedCache.createSymlink(config);
                DistributedCache.addCacheFile(scriptToCache.toUri(), config);
            } catch (Exception e) {
            } // just carry on

            try {
                Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
                        "custom_reduce_error_handler.sh", config);
                config.set("mapred.reduce.task.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
                config.set("mapreduce.reduce.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
                DistributedCache.createSymlink(config);
                DistributedCache.addCacheFile(scriptToCache.toUri(), config);
            } catch (Exception e) {
            } // just carry on
        } //TODO (???): TOTEST

        // (need to do these 2 things here before the job is created, at which point the config class has been copied across)
        //1)
        Class<?> mapperClazz = Class.forName(job.mapper, true, child);
        if (ICustomInfiniteInternalEngine.class.isAssignableFrom(mapperClazz)) {
            // Special case: internal custom engine, so gets an additional integration hook
            ICustomInfiniteInternalEngine preActivities = (ICustomInfiniteInternalEngine) mapperClazz
                    .newInstance();
            preActivities.preTaskActivities(job._id, job.communityIds, config, !(bTestMode || bLocalMode));
        } //TESTED

        //2)
        if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {
            // Need to download the GridFSZip file
            try {
                Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/unbundled/",
                        "GridFSZipFile.jar", config);
                DistributedCache.addFileToClassPath(jarToCache, config);
            } catch (Throwable t) {
            } // (this is fine, will already be on the classpath .. otherwise lots of other stuff will be failing all over the place!)
        }

        if (job.inputCollection.equals("records")) {
            InfiniteElasticsearchHadoopUtils.handleElasticsearchInput(job, config, advancedConfigurationDbo);
            //(won't run under 0.19 so running with "records" should cause all sorts of exceptions)
        } //TESTED (by hand)

        if (bTestMode || bLocalMode) {
            // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place
            config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
        }

        // Manually specified caches
        List<URL> localJarCaches = InfiniteHadoopUtils.handleCacheList(advancedConfigurationDbo.get("$caches"),
                job, config, props_custom);

        Job hj = new Job(config); // (NOTE: from here, changes to config are ignored)
        try {
            if (null != localJarCaches) {
                if (bLocalMode || bTestMode) {
                    Method method = URLClassLoader.class.getDeclaredMethod("addURL", new Class[] { URL.class });
                    method.setAccessible(true);
                    method.invoke(child, localJarCaches.toArray());
                } //TOTEST (tested logically)
            }
            Class<?> classToLoad = Class.forName(job.mapper, true, child);
            hj.setJarByClass(classToLoad);

            if (job.inputCollection.equalsIgnoreCase("filesystem")) {
                String inputPath = null;
                try {
                    inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
                    if (!inputPath.endsWith("/")) {
                        inputPath = inputPath + "/";
                    }
                } catch (Exception e) {
                }
                if (null == inputPath) {
                    throw new RuntimeException("Must specify 'file.url' if reading from filesystem.");
                }
                inputPath = InfiniteHadoopUtils.authenticateInputDirectory(job, inputPath);

                InfiniteFileInputFormat.addInputPath(hj, new Path(inputPath + "*/*")); // (that extra bit makes it recursive)
                InfiniteFileInputFormat.setMaxInputSplitSize(hj, 33554432); // (32MB)
                InfiniteFileInputFormat.setInfiniteInputPathFilter(hj, config);
                hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat", true, child));
            } else if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {
                String[] oidStrs = null;
                try {
                    String inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
                    Pattern oidExtractor = Pattern.compile("inf://share/([^/]+)");
                    Matcher m = oidExtractor.matcher(inputPath);
                    if (m.find()) {
                        oidStrs = m.group(1).split("\\s*,\\s*");
                    } else {
                        throw new RuntimeException(
                                "file.url must be in format inf://share/<oid-list>/<string>: " + inputPath);
                    }
                    InfiniteHadoopUtils.authenticateShareList(job, oidStrs);
                } catch (Exception e) {
                    throw new RuntimeException(
                            "Authentication error: " + e.getMessage() + ": " + advancedConfigurationDbo, e);
                }
                hj.getConfiguration().setStrings("mapred.input.dir", oidStrs);
                hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat", true, child));
            } else if (job.inputCollection.equals("records")) {
                hj.setInputFormatClass((Class<? extends InputFormat>) Class
                        .forName("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat", true, child));
            } else {
                if (esMode) {
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                            "com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat",
                            true, child));
                } else {
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                            "com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child));
                }
            }
            if ((null != job.exportToHdfs) && job.exportToHdfs) {
                //TODO (INF-2469): Also, if the output key is BSON then also run as text (but output as JSON?)
                Path outPath = InfiniteHadoopUtils.ensureOutputDirectory(job, props_custom);

                if ((null != job.outputKey) && (null != job.outputValue)
                        && job.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text")
                        && job.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) {
                    // (slight hack before I sort out the horrendous job class - if key/val both text and exporting to HDFS then output as Text)
                    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
                            .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", true, child));
                    TextOutputFormat.setOutputPath(hj, outPath);
                } //TESTED
                else {
                    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
                            "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child));
                    SequenceFileOutputFormat.setOutputPath(hj, outPath);
                } //TESTED
            } else { // normal case, stays in MongoDB
                hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat", true, child));
            }
            hj.setMapperClass((Class<? extends Mapper>) mapperClazz);
            String mapperOutputKeyOverride = advancedConfigurationDbo.getString("$mapper_key_class", null);
            if (null != mapperOutputKeyOverride) {
                hj.setMapOutputKeyClass(Class.forName(mapperOutputKeyOverride));
            } //TESTED

            String mapperOutputValueOverride = advancedConfigurationDbo.getString("$mapper_value_class", null);
            if (null != mapperOutputValueOverride) {
                hj.setMapOutputValueClass(Class.forName(mapperOutputValueOverride));
            } //TESTED

            if ((null != job.reducer) && !job.reducer.startsWith("#") && !job.reducer.equalsIgnoreCase("null")
                    && !job.reducer.equalsIgnoreCase("none")) {
                hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child));
                // Variable reducers:
                if (null != job.query) {
                    try {
                        hj.setNumReduceTasks(advancedConfigurationDbo.getInt("$reducers", 1));
                    } catch (Exception e) {
                        try {
                            // (just check it's not a string that is a valid int)
                            hj.setNumReduceTasks(
                                    Integer.parseInt(advancedConfigurationDbo.getString("$reducers", "1")));
                        } catch (Exception e2) {
                        }
                    }
                } //TESTED
            } else {
                hj.setNumReduceTasks(0);
            }
            if ((null != job.combiner) && !job.combiner.startsWith("#")
                    && !job.combiner.equalsIgnoreCase("null") && !job.combiner.equalsIgnoreCase("none")) {
                hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child));
            }
            hj.setOutputKeyClass(Class.forName(job.outputKey, true, child));
            hj.setOutputValueClass(Class.forName(job.outputValue, true, child));

            hj.setJobName(job.jobtitle);
            currJobName = job.jobtitle;
        } catch (Error e) { // (messing about with class loaders = lots of chances for errors!)
            throw new RuntimeException(e.getMessage(), e);
        }

        if (bTestMode || bLocalMode) {
            hj.submit();
            currThreadId = null;
            Logger.getRootLogger().addAppender(this);
            currLocalJobId = hj.getJobID().toString();
            currLocalJobErrs.setLength(0);
            while (!hj.isComplete()) {
                Thread.sleep(1000);
            }
            Logger.getRootLogger().removeAppender(this);
            if (hj.isSuccessful()) {
                if (this.currLocalJobErrs.length() > 0) {
                    return "local_done: " + this.currLocalJobErrs.toString();
                } else {
                    return "local_done";
                }
            } else {
                return "Error: " + this.currLocalJobErrs.toString();
            }
        } else {
            hj.submit();
            String jobId = hj.getJobID().toString();
            return jobId;
        }
    } catch (Exception e) {
        e.printStackTrace();
        Thread.currentThread().setContextClassLoader(savedClassLoader);
        return "Error: " + InfiniteHadoopUtils.createExceptionMessage(e);
    } finally {
        Thread.currentThread().setContextClassLoader(savedClassLoader);
    }
}