List of usage examples for org.apache.hadoop.conf.Configuration.setLong

public void setLong(String name, long value)
Set the value of the name property to a long.
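Before the project-specific examples below, a minimal self-contained sketch of the basic pattern, assuming only hadoop-common on the classpath: setLong stores the value as a string under the given key, and the matching getLong(name, defaultValue) parses it back, falling back to the default when the key is absent. The property name here is illustrative, not taken from any of the projects below.

import org.apache.hadoop.conf.Configuration;

public class SetLongSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // setLong stores the value as a string under the given key.
        conf.setLong("example.max.split.size", 128L * 1024 * 1024);

        // getLong parses it back, returning the supplied default if the key is absent.
        long maxSplit = conf.getLong("example.max.split.size", 64L * 1024 * 1024);
        System.out.println("example.max.split.size = " + maxSplit);
    }
}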
From source file: org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil.java
License: Apache License
public static void setScrutinyOutputMax(Configuration configuration, long outputMaxRows) {
    Preconditions.checkNotNull(configuration);
    configuration.setLong(SCRUTINY_OUTPUT_MAX, outputMaxRows);
}
From source file: org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil.java
License: Apache License
public static void setScrutinyExecuteTimestamp(Configuration configuration, long ts) {
    Preconditions.checkNotNull(configuration);
    configuration.setLong(SCRUTINY_EXECUTE_TIMESTAMP, ts);
}
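A hedged usage sketch for the setter above: how a caller might stamp the current wall-clock time into the configuration before launching a scrutiny job. The use of System.currentTimeMillis() and the surrounding class are illustrative assumptions, not Phoenix's actual call site.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil;

public class ScrutinyTimestampExample {
    public static void main(String[] args) {
        Configuration conf = HBaseConfiguration.create();
        // Illustrative only: record when this scrutiny run was started.
        PhoenixConfigurationUtil.setScrutinyExecuteTimestamp(conf, System.currentTimeMillis());
    }
}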
From source file: org.apache.phoenix.query.BaseTest.java
License: Apache License
private static void setDefaultTestConfig(Configuration conf, ReadOnlyProps overrideProps) {
    ConfigUtil.setReplicationConfigIfAbsent(conf);
    QueryServices services = new PhoenixTestDriver().getQueryServices();
    for (Entry<String, String> entry : services.getProps()) {
        conf.set(entry.getKey(), entry.getValue());
    }
    // no point doing sanity checks when running tests.
    conf.setBoolean("hbase.table.sanity.checks", false);
    // set the server rpc controller and rpc scheduler factory, used to configure the cluster
    conf.set(RSRpcServices.REGION_SERVER_RPC_SCHEDULER_FACTORY_CLASS, DEFAULT_RPC_SCHEDULER_FACTORY);
    conf.setLong(HConstants.ZK_SESSION_TIMEOUT, 10 * HConstants.DEFAULT_ZK_SESSION_TIMEOUT);
    conf.setLong(HConstants.ZOOKEEPER_TICK_TIME, 6 * 1000);
    // override any defaults based on overrideProps
    for (Entry<String, String> entry : overrideProps) {
        conf.set(entry.getKey(), entry.getValue());
    }
}
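The test configuration above derives its long values from shipped defaults (ten times the default ZooKeeper session timeout) rather than hard-coding raw numbers. A minimal sketch of that idea, assuming the HConstants fields used above exist in your HBase version; the read-back with getLong is added only for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;

public class ScaledTimeoutExample {
    public static void main(String[] args) {
        Configuration conf = HBaseConfiguration.create();

        // Derive the test value from the shipped default instead of a magic number.
        conf.setLong(HConstants.ZK_SESSION_TIMEOUT, 10L * HConstants.DEFAULT_ZK_SESSION_TIMEOUT);

        // getLong falls back to the supplied default when the key has not been set.
        long timeout = conf.getLong(HConstants.ZK_SESSION_TIMEOUT, HConstants.DEFAULT_ZK_SESSION_TIMEOUT);
        System.out.println("zookeeper session timeout = " + timeout + " ms");
    }
}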
From source file: org.apache.phoenix.transaction.TephraTransactionContext.java
License: Apache License
@Override
public void setTxnConfigs(Configuration config, String tmpFolder, int defaultTxnTimeoutSeconds) throws IOException {
    config.setBoolean(TxConstants.Manager.CFG_DO_PERSIST, false);
    config.set(TxConstants.Service.CFG_DATA_TX_CLIENT_RETRY_STRATEGY, "n-times");
    config.setInt(TxConstants.Service.CFG_DATA_TX_CLIENT_ATTEMPTS, 1);
    config.setInt(TxConstants.Service.CFG_DATA_TX_BIND_PORT, Networks.getRandomPort());
    config.set(TxConstants.Manager.CFG_TX_SNAPSHOT_DIR, tmpFolder);
    config.setInt(TxConstants.Manager.CFG_TX_TIMEOUT, defaultTxnTimeoutSeconds);
    config.unset(TxConstants.Manager.CFG_TX_HDFS_USER);
    config.setLong(TxConstants.Manager.CFG_TX_SNAPSHOT_INTERVAL, 5L);
}
From source file: org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler.java
License: Apache License
/** * The method that creates the Job corresponding to a MapReduceOper. * The assumption is that// ww w .j a v a 2 s. c o m * every MapReduceOper will have a load and a store. The JobConf removes * the load operator and serializes the input filespec so that PigInputFormat can * take over the creation of splits. It also removes the store operator * and serializes the output filespec so that PigOutputFormat can take over * record writing. The remaining portion of the map plan and reduce plans are * serialized and stored for the PigMapReduce or PigMapOnly objects to take over * the actual running of the plans. * The Mapper & Reducer classes and the required key value formats are set. * Checks if this is a map only job and uses PigMapOnly class as the mapper * and uses PigMapReduce otherwise. * If it is a Map Reduce job, it is bound to have a package operator. Remove it from * the reduce plan and serializes it so that the PigMapReduce class can use it to package * the indexed tuples received by the reducer. * @param mro - The MapReduceOper for which the JobConf is required * @param config - the Configuration object from which JobConf is built * @param pigContext - The PigContext passed on from execution engine * @return Job corresponding to mro * @throws JobCreationException */ @SuppressWarnings({ "unchecked" }) private Job getJob(MROperPlan plan, MapReduceOper mro, Configuration config, PigContext pigContext) throws JobCreationException { org.apache.hadoop.mapreduce.Job nwJob = null; try { nwJob = new org.apache.hadoop.mapreduce.Job(config); } catch (Exception e) { throw new JobCreationException(e); } Configuration conf = nwJob.getConfiguration(); ArrayList<FileSpec> inp = new ArrayList<FileSpec>(); ArrayList<List<OperatorKey>> inpTargets = new ArrayList<List<OperatorKey>>(); ArrayList<String> inpSignatureLists = new ArrayList<String>(); ArrayList<Long> inpLimits = new ArrayList<Long>(); ArrayList<POStore> storeLocations = new ArrayList<POStore>(); Path tmpLocation = null; // add settings for pig statistics String setScriptProp = conf.get(PigConfiguration.INSERT_ENABLED, "true"); if (setScriptProp.equalsIgnoreCase("true")) { MRScriptState ss = MRScriptState.get(); ss.addSettingsToConf(mro, conf); } conf.set(MRConfiguration.MAPPER_NEW_API, "true"); conf.set(MRConfiguration.REDUCER_NEW_API, "true"); String buffPercent = conf.get(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT); if (buffPercent == null || Double.parseDouble(buffPercent) <= 0) { log.info(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT + " is not set, set to default 0.3"); conf.set(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT, "0.3"); } else { log.info(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT + " is set to " + conf.get(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT)); } configureCompression(conf); try { //Process the POLoads List<POLoad> lds = PlanHelper.getPhysicalOperators(mro.mapPlan, POLoad.class); if (lds != null && lds.size() > 0) { for (POLoad ld : lds) { LoadFunc lf = ld.getLoadFunc(); lf.setLocation(ld.getLFile().getFileName(), nwJob); //Store the inp filespecs inp.add(ld.getLFile()); } } if (!mro.reducePlan.isEmpty()) { log.info("Reduce phase detected, estimating # of required reducers."); adjustNumReducers(plan, mro, nwJob); } else { nwJob.setNumReduceTasks(0); } for (String udf : mro.UDFs) { if (udf.contains("GFCross")) { Object func = pigContext.instantiateFuncFromSpec(new FuncSpec(udf)); if (func instanceof GFCross) { String crossKey = ((GFCross) func).getCrossKey(); // If non GFCross 
has been processed yet if (pigContext.getProperties() .get(PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey) == null) { pigContext.getProperties().setProperty( PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey, Integer.toString(nwJob.getNumReduceTasks())); } conf.set(PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey, (String) pigContext .getProperties().get(PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey)); } } } if (lds != null && lds.size() > 0) { for (POLoad ld : lds) { //Store the target operators for tuples read //from this input List<PhysicalOperator> ldSucs = mro.mapPlan.getSuccessors(ld); List<OperatorKey> ldSucKeys = new ArrayList<OperatorKey>(); if (ldSucs != null) { for (PhysicalOperator operator2 : ldSucs) { ldSucKeys.add(operator2.getOperatorKey()); } } inpTargets.add(ldSucKeys); inpSignatureLists.add(ld.getSignature()); inpLimits.add(ld.getLimit()); //Remove the POLoad from the plan if (!pigContext.inIllustrator) mro.mapPlan.remove(ld); } } if (!pigContext.inIllustrator && !pigContext.getExecType().isLocal()) { if (okToRunLocal(nwJob, mro, lds)) { log.info(SMALL_JOB_LOG_MSG); // override with the default conf to run in local mode for (Entry<String, String> entry : defaultConf) { String key = entry.getKey(); if (key.equals(MRConfiguration.REDUCE_TASKS) || key.equals(MRConfiguration.JOB_REDUCES)) { // this must not be set back to the default in case it has been set to 0 for example. continue; } if (key.startsWith("fs.")) { // we don't want to change fs settings back continue; } if (key.startsWith("io.")) { // we don't want to change io settings back continue; } String value = entry.getValue(); if (conf.get(key) == null || !conf.get(key).equals(value)) { conf.set(key, value); } } conf.setBoolean(PigImplConstants.CONVERTED_TO_LOCAL, true); } else { log.info(BIG_JOB_LOG_MSG); // Setup the DistributedCache for this job List<URL> allJars = new ArrayList<URL>(); for (URL extraJar : pigContext.extraJars) { if (!allJars.contains(extraJar)) { allJars.add(extraJar); } } for (String scriptJar : pigContext.scriptJars) { URL jar = new File(scriptJar).toURI().toURL(); if (!allJars.contains(jar)) { allJars.add(jar); } } for (String defaultJar : JarManager.getDefaultJars()) { URL jar = new File(defaultJar).toURI().toURL(); if (!allJars.contains(jar)) { allJars.add(jar); } } for (URL jar : allJars) { boolean predeployed = false; for (String predeployedJar : pigContext.predeployedJars) { if (predeployedJar.contains(new File(jar.toURI()).getName())) { predeployed = true; } } if (!predeployed) { log.info("Adding jar to DistributedCache: " + jar); putJarOnClassPathThroughDistributedCache(pigContext, conf, jar); } } File scriptUDFJarFile = JarManager.createPigScriptUDFJar(pigContext); if (scriptUDFJarFile != null) { putJarOnClassPathThroughDistributedCache(pigContext, conf, scriptUDFJarFile.toURI().toURL()); } } } if (Utils.isLocal(pigContext, conf)) { ConfigurationUtil.replaceConfigForLocalMode(conf); } conf.set("pig.inputs", ObjectSerializer.serialize(inp)); conf.set("pig.inpTargets", ObjectSerializer.serialize(inpTargets)); conf.set("pig.inpSignatures", ObjectSerializer.serialize(inpSignatureLists)); conf.set("pig.inpLimits", ObjectSerializer.serialize(inpLimits)); conf.set("pig.pigContext", ObjectSerializer.serialize(pigContext)); conf.set("udf.import.list", ObjectSerializer.serialize(PigContext.getPackageImportList())); // this is for unit tests since some don't create PigServer // if user specified the job name using -D switch, Pig won't reset the 
name then. if (System.getProperty(MRConfiguration.JOB_NAME) == null && pigContext.getProperties().getProperty(PigContext.JOB_NAME) != null) { nwJob.setJobName(pigContext.getProperties().getProperty(PigContext.JOB_NAME)); } if (pigContext.getProperties().getProperty(PigContext.JOB_PRIORITY) != null) { // If the job priority was set, attempt to get the corresponding enum value // and set the hadoop job priority. String jobPriority = pigContext.getProperties().getProperty(PigContext.JOB_PRIORITY).toUpperCase(); try { // Allow arbitrary case; the Hadoop job priorities are all upper case. conf.set(MRConfiguration.JOB_PRIORITY, JobPriority.valueOf(jobPriority).toString()); } catch (IllegalArgumentException e) { StringBuffer sb = new StringBuffer("The job priority must be one of ["); JobPriority[] priorities = JobPriority.values(); for (int i = 0; i < priorities.length; ++i) { if (i > 0) sb.append(", "); sb.append(priorities[i]); } sb.append("]. You specified [" + jobPriority + "]"); throw new JobCreationException(sb.toString()); } } setupDistributedCache(pigContext, conf, pigContext.getProperties(), "pig.streaming.ship.files", true); setupDistributedCache(pigContext, conf, pigContext.getProperties(), "pig.streaming.cache.files", false); nwJob.setInputFormatClass(PigInputFormat.class); // tmp file compression setups // PIG-3741 This must be done before setStoreLocation on POStores Utils.setTmpFileCompressionOnConf(pigContext, conf); //Process POStore and remove it from the plan LinkedList<POStore> mapStores = PlanHelper.getPhysicalOperators(mro.mapPlan, POStore.class); LinkedList<POStore> reduceStores = PlanHelper.getPhysicalOperators(mro.reducePlan, POStore.class); for (POStore st : mapStores) { storeLocations.add(st); StoreFuncInterface sFunc = st.getStoreFunc(); sFunc.setStoreLocation(st.getSFile().getFileName(), nwJob); if (sFunc instanceof OverwritableStoreFunc) { OverwritableStoreFunc osf = (OverwritableStoreFunc) sFunc; if (osf.shouldOverwrite()) { osf.cleanupOutput(st, nwJob); } } } for (POStore st : reduceStores) { storeLocations.add(st); StoreFuncInterface sFunc = st.getStoreFunc(); sFunc.setStoreLocation(st.getSFile().getFileName(), nwJob); if (sFunc instanceof OverwritableStoreFunc) { OverwritableStoreFunc osf = (OverwritableStoreFunc) sFunc; if (osf.shouldOverwrite()) { osf.cleanupOutput(st, nwJob); } } } setOutputFormat(nwJob); if (mapStores.size() + reduceStores.size() == 1) { // single store case log.info("Setting up single store job"); POStore st; if (reduceStores.isEmpty()) { st = mapStores.get(0); if (!pigContext.inIllustrator) mro.mapPlan.remove(st); } else { st = reduceStores.get(0); if (!pigContext.inIllustrator) mro.reducePlan.remove(st); } MapRedUtil.setupStreamingDirsConfSingle(st, pigContext, conf); } else if (mapStores.size() + reduceStores.size() > 0) { // multi store case log.info("Setting up multi store job"); MapRedUtil.setupStreamingDirsConfMulti(pigContext, conf); boolean disableCounter = conf.getBoolean("pig.disable.counter", false); if (disableCounter) { log.info("Disable Pig custom output counters"); } int idx = 0; for (POStore sto : storeLocations) { sto.setDisableCounter(disableCounter); sto.setMultiStore(true); sto.setIndex(idx++); } } // store map key type // this is needed when the key is null to create // an appropriate NullableXXXWritable object conf.set("pig.map.keytype", ObjectSerializer.serialize(new byte[] { mro.mapKeyType })); // set parent plan in all operators in map and reduce plans // currently the parent plan is really used only when POStream 
is present in the plan new PhyPlanSetter(mro.mapPlan).visit(); new PhyPlanSetter(mro.reducePlan).visit(); // this call modifies the ReplFiles names of POFRJoin operators // within the MR plans, must be called before the plans are // serialized setupDistributedCacheForJoin(mro, pigContext, conf); // Search to see if we have any UDFs that need to pack things into the // distributed cache. setupDistributedCacheForUdfs(mro, pigContext, conf); SchemaTupleFrontend.copyAllGeneratedToDistributedCache(pigContext, conf); POPackage pack = null; if (mro.reducePlan.isEmpty()) { //MapOnly Job nwJob.setMapperClass(PigMapOnly.Map.class); if (!pigContext.inIllustrator) conf.set("pig.mapPlan", ObjectSerializer.serialize(mro.mapPlan)); if (mro.isEndOfAllInputSetInMap()) { // this is used in Map.close() to decide whether the // pipeline needs to be rerun one more time in the close() // The pipeline is rerun if there either was a stream or POMergeJoin conf.set(END_OF_INP_IN_MAP, "true"); } } else { //Map Reduce Job //Process the POPackage operator and remove it from the reduce plan if (!mro.combinePlan.isEmpty()) { POPackage combPack = (POPackage) mro.combinePlan.getRoots().get(0); mro.combinePlan.remove(combPack); nwJob.setCombinerClass(PigCombiner.Combine.class); conf.set("pig.combinePlan", ObjectSerializer.serialize(mro.combinePlan)); conf.set("pig.combine.package", ObjectSerializer.serialize(combPack)); } else if (mro.needsDistinctCombiner()) { nwJob.setCombinerClass(DistinctCombiner.Combine.class); log.info("Setting identity combiner class."); } pack = (POPackage) mro.reducePlan.getRoots().get(0); if (!pigContext.inIllustrator) mro.reducePlan.remove(pack); nwJob.setMapperClass(PigMapReduce.Map.class); nwJob.setReducerClass(PigMapReduce.Reduce.class); if (mro.customPartitioner != null) nwJob.setPartitionerClass(PigContext.resolveClassName(mro.customPartitioner)); if (!pigContext.inIllustrator) conf.set("pig.mapPlan", ObjectSerializer.serialize(mro.mapPlan)); if (mro.isEndOfAllInputSetInMap()) { // this is used in Map.close() to decide whether the // pipeline needs to be rerun one more time in the close() // The pipeline is rerun only if there was a stream or merge-join. conf.set(END_OF_INP_IN_MAP, "true"); } if (!pigContext.inIllustrator) conf.set("pig.reducePlan", ObjectSerializer.serialize(mro.reducePlan)); if (mro.isEndOfAllInputSetInReduce()) { // this is used in Map.close() to decide whether the // pipeline needs to be rerun one more time in the close() // The pipeline is rerun only if there was a stream conf.set("pig.stream.in.reduce", "true"); } if (!pigContext.inIllustrator) conf.set("pig.reduce.package", ObjectSerializer.serialize(pack)); conf.set("pig.reduce.key.type", Byte.toString(pack.getPkgr().getKeyType())); if (mro.getUseSecondaryKey()) { nwJob.setGroupingComparatorClass(PigSecondaryKeyGroupComparator.class); nwJob.setPartitionerClass(SecondaryKeyPartitioner.class); nwJob.setSortComparatorClass(PigSecondaryKeyComparator.class); nwJob.setOutputKeyClass(NullableTuple.class); conf.set("pig.secondarySortOrder", ObjectSerializer.serialize(mro.getSecondarySortOrder())); } else { Class<? 
extends WritableComparable> keyClass = HDataType .getWritableComparableTypes(pack.getPkgr().getKeyType()).getClass(); nwJob.setOutputKeyClass(keyClass); selectComparator(mro, pack.getPkgr().getKeyType(), nwJob); } nwJob.setOutputValueClass(NullableTuple.class); } if (mro.isGlobalSort() || mro.isLimitAfterSort()) { if (mro.isGlobalSort()) { String symlink = addSingleFileToDistributedCache(pigContext, conf, mro.getQuantFile(), "pigsample"); conf.set("pig.quantilesFile", symlink); nwJob.setPartitionerClass(WeightedRangePartitioner.class); } if (mro.isUDFComparatorUsed) { boolean usercomparator = false; for (String compFuncSpec : mro.UDFs) { Class comparator = PigContext.resolveClassName(compFuncSpec); if (ComparisonFunc.class.isAssignableFrom(comparator)) { nwJob.setMapperClass(PigMapReduce.MapWithComparator.class); nwJob.setReducerClass(PigMapReduce.ReduceWithComparator.class); conf.set("pig.reduce.package", ObjectSerializer.serialize(pack)); conf.set("pig.usercomparator", "true"); nwJob.setOutputKeyClass(NullableTuple.class); nwJob.setSortComparatorClass(comparator); usercomparator = true; break; } } if (!usercomparator) { String msg = "Internal error. Can't find the UDF comparator"; throw new IOException(msg); } } else { conf.set("pig.sortOrder", ObjectSerializer.serialize(mro.getSortOrder())); } } if (mro.isSkewedJoin()) { String symlink = addSingleFileToDistributedCache(pigContext, conf, mro.getSkewedJoinPartitionFile(), "pigdistkey"); conf.set("pig.keyDistFile", symlink); nwJob.setPartitionerClass(SkewedPartitioner.class); nwJob.setMapperClass(PigMapReduce.MapWithPartitionIndex.class); nwJob.setMapOutputKeyClass(NullablePartitionWritable.class); nwJob.setGroupingComparatorClass(PigGroupingPartitionWritableComparator.class); } if (mro.isCounterOperation()) { if (mro.isRowNumber()) { nwJob.setMapperClass(PigMapReduceCounter.PigMapCounter.class); } else { nwJob.setReducerClass(PigMapReduceCounter.PigReduceCounter.class); } } if (mro.isRankOperation()) { Iterator<String> operationIDs = mro.getRankOperationId().iterator(); while (operationIDs.hasNext()) { String operationID = operationIDs.next(); Iterator<Pair<String, Long>> itPairs = globalCounters.get(operationID).iterator(); Pair<String, Long> pair = null; while (itPairs.hasNext()) { pair = itPairs.next(); conf.setLong(pair.first, pair.second); } } } if (!pigContext.inIllustrator) { // unset inputs for POStore, otherwise, map/reduce plan will be unnecessarily deserialized for (POStore st : mapStores) { st.setInputs(null); st.setParentPlan(null); } for (POStore st : reduceStores) { st.setInputs(null); st.setParentPlan(null); } conf.set(PIG_MAP_STORES, ObjectSerializer.serialize(mapStores)); conf.set(PIG_REDUCE_STORES, ObjectSerializer.serialize(reduceStores)); } String tmp; long maxCombinedSplitSize = 0; if (!mro.combineSmallSplits() || pigContext.getProperties().getProperty("pig.splitCombination", "true").equals("false")) conf.setBoolean("pig.noSplitCombination", true); else if ((tmp = pigContext.getProperties().getProperty("pig.maxCombinedSplitSize", null)) != null) { try { maxCombinedSplitSize = Long.parseLong(tmp); } catch (NumberFormatException e) { log.warn( "Invalid numeric format for pig.maxCombinedSplitSize; use the default maximum combined split size"); } } if (maxCombinedSplitSize > 0) conf.setLong("pig.maxCombinedSplitSize", maxCombinedSplitSize); // It's a hack to set distributed cache file for hadoop 23. 
Once MiniMRCluster do not require local // jar on fixed location, this can be removed if (pigContext.getExecType() == ExecType.MAPREDUCE) { String newfiles = conf.get("alternative.mapreduce.job.cache.files"); if (newfiles != null) { String files = conf.get(MRConfiguration.JOB_CACHE_FILES); conf.set(MRConfiguration.JOB_CACHE_FILES, files == null ? newfiles.toString() : files + "," + newfiles); } } // Serialize the UDF specific context info. UDFContext.getUDFContext().serialize(conf); Job cjob = new Job(new JobConf(conf), new ArrayList<Job>()); jobStoreMap.put(cjob, new Pair<List<POStore>, Path>(storeLocations, tmpLocation)); return cjob; } catch (JobCreationException jce) { throw jce; } catch (Exception e) { int errCode = 2017; String msg = "Internal error creating job configuration."; throw new JobCreationException(msg, errCode, PigException.BUG, e); } }
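Buried in the long method above is one small setLong pattern worth isolating: an optional string property is parsed, validated, and written to the configuration only when it holds a positive number. A standalone sketch of that pattern with the Pig-specific plumbing stripped out; the property name is taken from the code above, while the helper and class names are illustrative.

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;

public class MaxCombinedSplitSizeExample {
    // Copies pig.maxCombinedSplitSize from job properties into the Hadoop conf,
    // mirroring the validation done in JobControlCompiler.getJob() above.
    static void applyMaxCombinedSplitSize(Properties props, Configuration conf) {
        String tmp = props.getProperty("pig.maxCombinedSplitSize", null);
        long maxCombinedSplitSize = 0;
        if (tmp != null) {
            try {
                maxCombinedSplitSize = Long.parseLong(tmp);
            } catch (NumberFormatException e) {
                // Keep 0 so the default combined split size is used downstream.
            }
        }
        if (maxCombinedSplitSize > 0) {
            conf.setLong("pig.maxCombinedSplitSize", maxCombinedSplitSize);
        }
    }

    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("pig.maxCombinedSplitSize", "268435456"); // 256 MB
        Configuration conf = new Configuration();
        applyMaxCombinedSplitSize(props, conf);
        System.out.println(conf.getLong("pig.maxCombinedSplitSize", 0));
    }
}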
From source file: org.apache.pig.backend.hadoop.executionengine.tez.plan.optimizer.LoaderProcessor.java
License: Apache License
/** * Do the final configuration of LoadFuncs and store what goes where. This * will need to be changed as the inputs get un-bundled * * @param tezOp/*from www.java 2 s .c o m*/ * @param conf * @param job * @return true if any POLoads were found, else false. * @throws VisitorException * @throws IOException * @throws InterruptedException * @throws ClassNotFoundException */ private List<POLoad> processLoads(TezOperator tezOp) throws VisitorException, IOException, ClassNotFoundException, InterruptedException { ArrayList<FileSpec> inp = new ArrayList<FileSpec>(); ArrayList<List<OperatorKey>> inpTargets = new ArrayList<List<OperatorKey>>(); ArrayList<String> inpSignatureLists = new ArrayList<String>(); ArrayList<Long> inpLimits = new ArrayList<Long>(); List<POLoad> lds = PlanHelper.getPhysicalOperators(tezOp.plan, POLoad.class); Job job = Job.getInstance(jobConf); Configuration conf = job.getConfiguration(); if (lds != null && lds.size() > 0) { if (lds.size() == 1) { for (POLoad ld : lds) { LoadFunc lf = ld.getLoadFunc(); lf.setLocation(ld.getLFile().getFileName(), job); // Store the inp filespecs inp.add(ld.getLFile()); } } else { throw new VisitorException("There is more than one load for TezOperator " + tezOp); } } if (lds != null && lds.size() > 0) { for (POLoad ld : lds) { // Store the target operators for tuples read // from this input List<PhysicalOperator> ldSucs = new ArrayList<PhysicalOperator>(tezOp.plan.getSuccessors(ld)); List<OperatorKey> ldSucKeys = new ArrayList<OperatorKey>(); if (ldSucs != null) { for (PhysicalOperator operator2 : ldSucs) { ldSucKeys.add(operator2.getOperatorKey()); } } inpTargets.add(ldSucKeys); inpSignatureLists.add(ld.getSignature()); inpLimits.add(ld.getLimit()); // Remove the POLoad from the plan tezOp.plan.remove(ld); // Now add the input handling operator for the Tez backend // TODO: Move this upstream to the PhysicalPlan generation POSimpleTezLoad tezLoad = new POSimpleTezLoad(ld.getOperatorKey(), ld.getLoadFunc()); tezLoad.setLFile(ld.getLFile()); tezLoad.setSignature(ld.getSignature()); tezLoad.setInputKey(ld.getOperatorKey().toString()); tezLoad.copyAliasFrom(ld); tezLoad.setCacheFiles(ld.getCacheFiles()); tezLoad.setShipFiles(ld.getShipFiles()); tezOp.plan.add(tezLoad); for (PhysicalOperator sucs : ldSucs) { tezOp.plan.connect(tezLoad, sucs); } } UDFContext.getUDFContext().serialize(conf); conf.set("udf.import.list", ObjectSerializer.serialize(PigContext.getPackageImportList())); conf.set(PigInputFormat.PIG_INPUTS, ObjectSerializer.serialize(inp)); conf.set(PigInputFormat.PIG_INPUT_TARGETS, ObjectSerializer.serialize(inpTargets)); conf.set(PigInputFormat.PIG_INPUT_SIGNATURES, ObjectSerializer.serialize(inpSignatureLists)); conf.set(PigInputFormat.PIG_INPUT_LIMITS, ObjectSerializer.serialize(inpLimits)); String tmp; long maxCombinedSplitSize = 0; if (!tezOp.combineSmallSplits() || pc.getProperties() .getProperty(PigConfiguration.PIG_SPLIT_COMBINATION, "true").equals("false")) conf.setBoolean(PigConfiguration.PIG_NO_SPLIT_COMBINATION, true); else if ((tmp = pc.getProperties().getProperty(PigConfiguration.PIG_MAX_COMBINED_SPLIT_SIZE, null)) != null) { try { maxCombinedSplitSize = Long.parseLong(tmp); } catch (NumberFormatException e) { LOG.warn( "Invalid numeric format for pig.maxCombinedSplitSize; use the default maximum combined split size"); } } if (maxCombinedSplitSize > 0) conf.setLong("pig.maxCombinedSplitSize", maxCombinedSplitSize); tezOp.getLoaderInfo().setInpSignatureLists(inpSignatureLists); tezOp.getLoaderInfo().setInp(inp); 
tezOp.getLoaderInfo().setInpLimits(inpLimits); // Not using MRInputAMSplitGenerator because delegation tokens are // fetched in FileInputFormat tezOp.getLoaderInfo().setInputSplitInfo(MRInputHelpers.generateInputSplitsToMem(conf, false, 0)); // TODO: Can be set to -1 if TEZ-601 gets fixed and getting input // splits can be moved to if(loads) block below int parallelism = tezOp.getLoaderInfo().getInputSplitInfo().getNumTasks(); tezOp.setRequestedParallelism(parallelism); tezOp.setTotalInputFilesSize(InputSizeReducerEstimator.getTotalInputFileSize(conf, lds, job)); } return lds; }
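For completeness, a sketch of the read side of the same property: a downstream component would retrieve the limit with getLong and a default. This reader is illustrative only, not Pig's actual split-combination code, and the default shown is an assumption.

import org.apache.hadoop.conf.Configuration;

public class ReadCombinedSplitSize {
    // Illustrative reader: a split-combining component would read the limit back with a default.
    static long combinedSplitLimit(Configuration conf) {
        long defaultLimit = 128L * 1024 * 1024; // assumed default, for illustration only
        return conf.getLong("pig.maxCombinedSplitSize", defaultLimit);
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.setLong("pig.maxCombinedSplitSize", 256L * 1024 * 1024);
        System.out.println(combinedSplitLimit(conf)); // 268435456
    }
}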
From source file: org.apache.pig.backend.hadoop.executionengine.tez.TezDagBuilder.java
License: Apache License
private Vertex newVertex(TezOperator tezOp, boolean isMap) throws IOException, ClassNotFoundException, InterruptedException { ProcessorDescriptor procDesc = ProcessorDescriptor.create(tezOp.getProcessorName()); // Pass physical plans to vertex as user payload. JobConf payloadConf = new JobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), false)); // We do this so that dag.getCredentials(), job.getCredentials(), // job.getConfiguration().getCredentials() all reference the same Credentials object // Unfortunately there is no setCredentials() on Job payloadConf.setCredentials(dag.getCredentials()); // We won't actually use this job, but we need it to talk with the Load Store funcs @SuppressWarnings("deprecation") Job job = new Job(payloadConf); payloadConf = (JobConf) job.getConfiguration(); if (tezOp.sampleOperator != null) { payloadConf.set(PigProcessor.SAMPLE_VERTEX, tezOp.sampleOperator.getOperatorKey().toString()); }/*from w w w. j av a 2 s . c o m*/ if (tezOp.sortOperator != null) { payloadConf.set(PigProcessor.SORT_VERTEX, tezOp.sortOperator.getOperatorKey().toString()); } String tmp; long maxCombinedSplitSize = 0; if (!tezOp.combineSmallSplits() || pc.getProperties().getProperty(PigConfiguration.PIG_SPLIT_COMBINATION, "true").equals("false")) payloadConf.setBoolean(PigConfiguration.PIG_NO_SPLIT_COMBINATION, true); else if ((tmp = pc.getProperties().getProperty(PigConfiguration.PIG_MAX_COMBINED_SPLIT_SIZE, null)) != null) { try { maxCombinedSplitSize = Long.parseLong(tmp); } catch (NumberFormatException e) { log.warn( "Invalid numeric format for pig.maxCombinedSplitSize; use the default maximum combined split size"); } } if (maxCombinedSplitSize > 0) payloadConf.setLong("pig.maxCombinedSplitSize", maxCombinedSplitSize); payloadConf.set("pig.inputs", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInp())); payloadConf.set("pig.inpSignatures", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInpSignatureLists())); payloadConf.set("pig.inpLimits", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInpLimits())); // Process stores LinkedList<POStore> stores = processStores(tezOp, payloadConf, job); payloadConf.set("pig.pigContext", ObjectSerializer.serialize(pc)); payloadConf.set("udf.import.list", ObjectSerializer.serialize(PigContext.getPackageImportList())); payloadConf.set("exectype", "TEZ"); payloadConf.setBoolean(MRConfiguration.MAPPER_NEW_API, true); payloadConf.setClass(MRConfiguration.INPUTFORMAT_CLASS, PigInputFormat.class, InputFormat.class); // Set parent plan for all operators in the Tez plan. new PhyPlanSetter(tezOp.plan).visit(); // Set the endOfAllInput flag on the physical plan if certain operators that // use this property (such as STREAM) are present in the plan. 
EndOfAllInputSetter.EndOfAllInputChecker checker = new EndOfAllInputSetter.EndOfAllInputChecker(tezOp.plan); checker.visit(); if (checker.isEndOfAllInputPresent()) { payloadConf.set(JobControlCompiler.END_OF_INP_IN_MAP, "true"); } // Configure the classes for incoming shuffles to this TezOp // TODO: Refactor out resetting input keys, PIG-3957 List<PhysicalOperator> roots = tezOp.plan.getRoots(); if (roots.size() == 1 && roots.get(0) instanceof POPackage) { POPackage pack = (POPackage) roots.get(0); List<PhysicalOperator> succsList = tezOp.plan.getSuccessors(pack); if (succsList != null) { succsList = new ArrayList<PhysicalOperator>(succsList); } byte keyType = pack.getPkgr().getKeyType(); tezOp.plan.remove(pack); payloadConf.set("pig.reduce.package", ObjectSerializer.serialize(pack)); setIntermediateOutputKeyValue(keyType, payloadConf, tezOp); POShuffleTezLoad newPack; newPack = new POShuffleTezLoad(pack); if (tezOp.isSkewedJoin()) { newPack.setSkewedJoins(true); } tezOp.plan.add(newPack); // Set input keys for POShuffleTezLoad. This is used to identify // the inputs that are attached to the POShuffleTezLoad in the // backend. Map<Integer, String> localRearrangeMap = new TreeMap<Integer, String>(); for (TezOperator pred : mPlan.getPredecessors(tezOp)) { if (tezOp.sampleOperator != null && tezOp.sampleOperator == pred) { // skip sample vertex input } else { String inputKey = pred.getOperatorKey().toString(); if (pred.isVertexGroup()) { pred = mPlan.getOperator(pred.getVertexGroupMembers().get(0)); } LinkedList<POLocalRearrangeTez> lrs = PlanHelper.getPhysicalOperators(pred.plan, POLocalRearrangeTez.class); for (POLocalRearrangeTez lr : lrs) { if (lr.isConnectedToPackage() && lr.getOutputKey().equals(tezOp.getOperatorKey().toString())) { localRearrangeMap.put((int) lr.getIndex(), inputKey); } } } } for (Map.Entry<Integer, String> entry : localRearrangeMap.entrySet()) { newPack.addInputKey(entry.getValue()); } if (succsList != null) { for (PhysicalOperator succs : succsList) { tezOp.plan.connect(newPack, succs); } } setIntermediateOutputKeyValue(pack.getPkgr().getKeyType(), payloadConf, tezOp); } else if (roots.size() == 1 && roots.get(0) instanceof POIdentityInOutTez) { POIdentityInOutTez identityInOut = (POIdentityInOutTez) roots.get(0); // TODO Need to fix multiple input key mapping TezOperator identityInOutPred = null; for (TezOperator pred : mPlan.getPredecessors(tezOp)) { if (!pred.isSampleAggregation()) { identityInOutPred = pred; break; } } identityInOut.setInputKey(identityInOutPred.getOperatorKey().toString()); } else if (roots.size() == 1 && roots.get(0) instanceof POValueInputTez) { POValueInputTez valueInput = (POValueInputTez) roots.get(0); LinkedList<String> scalarInputs = new LinkedList<String>(); for (POUserFunc userFunc : PlanHelper.getPhysicalOperators(tezOp.plan, POUserFunc.class)) { if (userFunc.getFunc() instanceof ReadScalarsTez) { scalarInputs.add(((ReadScalarsTez) userFunc.getFunc()).getTezInputs()[0]); } } // Make sure we don't find the scalar for (TezOperator pred : mPlan.getPredecessors(tezOp)) { if (!scalarInputs.contains(pred.getOperatorKey().toString())) { valueInput.setInputKey(pred.getOperatorKey().toString()); break; } } } JobControlCompiler.setOutputFormat(job); // set parent plan in all operators. 
currently the parent plan is really // used only when POStream, POSplit are present in the plan new PhyPlanSetter(tezOp.plan).visit(); // Serialize the execution plan payloadConf.set(PigProcessor.PLAN, ObjectSerializer.serialize(tezOp.plan)); UDFContext.getUDFContext().serialize(payloadConf); MRToTezHelper.processMRSettings(payloadConf, globalConf); if (!pc.inIllustrator) { for (POStore store : stores) { // unset inputs for POStore, otherwise, map/reduce plan will be unnecessarily deserialized store.setInputs(null); store.setParentPlan(null); } // We put them in the reduce because PigOutputCommitter checks the // ID of the task to see if it's a map, and if not, calls the reduce // committers. payloadConf.set(JobControlCompiler.PIG_MAP_STORES, ObjectSerializer.serialize(new ArrayList<POStore>())); payloadConf.set(JobControlCompiler.PIG_REDUCE_STORES, ObjectSerializer.serialize(stores)); } if (tezOp.isNeedEstimateParallelism()) { payloadConf.setBoolean(PigProcessor.ESTIMATE_PARALLELISM, true); log.info("Estimate quantile for sample aggregation vertex " + tezOp.getOperatorKey().toString()); } // Take our assembled configuration and create a vertex UserPayload userPayload = TezUtils.createUserPayloadFromConf(payloadConf); procDesc.setUserPayload(userPayload); Vertex vertex = Vertex.create(tezOp.getOperatorKey().toString(), procDesc, tezOp.getVertexParallelism(), isMap ? MRHelpers.getResourceForMRMapper(globalConf) : MRHelpers.getResourceForMRReducer(globalConf)); Map<String, String> taskEnv = new HashMap<String, String>(); MRHelpers.updateEnvBasedOnMRTaskEnv(globalConf, taskEnv, isMap); vertex.setTaskEnvironment(taskEnv); // All these classes are @InterfaceAudience.Private in Hadoop. Switch to Tez methods in TEZ-1012 // set the timestamps, public/private visibility of the archives and files ClientDistributedCacheManager.determineTimestampsAndCacheVisibilities(globalConf); // get DelegationToken for each cached file ClientDistributedCacheManager.getDelegationTokens(globalConf, job.getCredentials()); MRApps.setupDistributedCache(globalConf, localResources); vertex.addTaskLocalFiles(localResources); vertex.setTaskLaunchCmdOpts(isMap ? MRHelpers.getJavaOptsForMRMapper(globalConf) : MRHelpers.getJavaOptsForMRReducer(globalConf)); log.info("For vertex - " + tezOp.getOperatorKey().toString() + ": parallelism=" + tezOp.getVertexParallelism() + ", memory=" + vertex.getTaskResource().getMemory() + ", java opts=" + vertex.getTaskLaunchCmdOpts()); // Right now there can only be one of each of these. Will need to be // more generic when there can be more. 
for (POLoad ld : tezOp.getLoaderInfo().getLoads()) { // TODO: These should get the globalConf, or a merged version that // keeps settings like pig.maxCombinedSplitSize vertex.setLocationHint( VertexLocationHint.create(tezOp.getLoaderInfo().getInputSplitInfo().getTaskLocationHints())); vertex.addDataSource(ld.getOperatorKey().toString(), DataSourceDescriptor.create( InputDescriptor.create(MRInput.class.getName()) .setUserPayload(UserPayload.create(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder() .setConfigurationBytes(TezUtils.createByteStringFromConf(payloadConf)) .setSplits(tezOp.getLoaderInfo().getInputSplitInfo().getSplitsProto()).build() .toByteString().asReadOnlyByteBuffer())), InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName()), dag.getCredentials())); } for (POStore store : stores) { ArrayList<POStore> emptyList = new ArrayList<POStore>(); ArrayList<POStore> singleStore = new ArrayList<POStore>(); singleStore.add(store); Configuration outputPayLoad = new Configuration(payloadConf); outputPayLoad.set(JobControlCompiler.PIG_MAP_STORES, ObjectSerializer.serialize(emptyList)); outputPayLoad.set(JobControlCompiler.PIG_REDUCE_STORES, ObjectSerializer.serialize(singleStore)); OutputDescriptor storeOutDescriptor = OutputDescriptor.create(MROutput.class.getName()) .setUserPayload(TezUtils.createUserPayloadFromConf(outputPayLoad)); if (tezOp.getVertexGroupStores() != null) { OperatorKey vertexGroupKey = tezOp.getVertexGroupStores().get(store.getOperatorKey()); if (vertexGroupKey != null) { getPlan().getOperator(vertexGroupKey).getVertexGroupInfo() .setStoreOutputDescriptor(storeOutDescriptor); continue; } } vertex.addDataSink(store.getOperatorKey().toString(), new DataSinkDescriptor(storeOutDescriptor, OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), dag.getCredentials())); } // LoadFunc and StoreFunc add delegation tokens to Job Credentials in // setLocation and setStoreLocation respectively. For eg: HBaseStorage // InputFormat add delegation token in getSplits and OutputFormat in // checkOutputSpecs. 
For eg: FileInputFormat and FileOutputFormat if (stores.size() > 0) { new PigOutputFormat().checkOutputSpecs(job); } // Set the right VertexManagerPlugin if (tezOp.getEstimatedParallelism() != -1) { if (tezOp.isGlobalSort() || tezOp.isSkewedJoin()) { // Set VertexManagerPlugin to PartitionerDefinedVertexManager, which is able // to decrease/increase parallelism of sorting vertex dynamically // based on the numQuantiles calculated by sample aggregation vertex vertex.setVertexManagerPlugin( VertexManagerPluginDescriptor.create(PartitionerDefinedVertexManager.class.getName())); log.info("Set VertexManagerPlugin to PartitionerDefinedParallelismVertexManager for vertex " + tezOp.getOperatorKey().toString()); } else { boolean containScatterGather = false; boolean containCustomPartitioner = false; for (TezEdgeDescriptor edge : tezOp.inEdges.values()) { if (edge.dataMovementType == DataMovementType.SCATTER_GATHER) { containScatterGather = true; } if (edge.partitionerClass != null) { containCustomPartitioner = true; } } if (containScatterGather && !containCustomPartitioner) { // Use auto-parallelism feature of ShuffleVertexManager to dynamically // reduce the parallelism of the vertex VertexManagerPluginDescriptor vmPluginDescriptor = VertexManagerPluginDescriptor .create(ShuffleVertexManager.class.getName()); Configuration vmPluginConf = ConfigurationUtil.toConfiguration(pc.getProperties(), false); vmPluginConf.setBoolean(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL, true); if (vmPluginConf.getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM, InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER) != InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER) { vmPluginConf.setLong( ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE, vmPluginConf.getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM, InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER)); } vmPluginDescriptor.setUserPayload(TezUtils.createUserPayloadFromConf(vmPluginConf)); vertex.setVertexManagerPlugin(vmPluginDescriptor); log.info("Set auto parallelism for vertex " + tezOp.getOperatorKey().toString()); } } } // Reset udfcontext jobconf. It is not supposed to be set in the front end UDFContext.getUDFContext().addJobConf(null); return vertex; }
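One setLong call in the vertex builder above follows a copy-if-overridden pattern: a long is read from one key and forwarded to a different key only when the user changed it from the default. A sketch of just that pattern with illustrative key names; the real code uses InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM and a ShuffleVertexManager constant instead.

import org.apache.hadoop.conf.Configuration;

public class ForwardIfOverriddenExample {
    // Illustrative key names, not the constants used in TezDagBuilder above.
    static final String SOURCE_KEY = "example.bytes.per.reducer";
    static final String TARGET_KEY = "example.desired.task.input.size";
    static final long DEFAULT_BYTES_PER_REDUCER = 1000L * 1000 * 1000;

    static void forwardIfOverridden(Configuration conf) {
        long value = conf.getLong(SOURCE_KEY, DEFAULT_BYTES_PER_REDUCER);
        // Only propagate when the user actually changed the source setting.
        if (value != DEFAULT_BYTES_PER_REDUCER) {
            conf.setLong(TARGET_KEY, value);
        }
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.setLong(SOURCE_KEY, 512L * 1024 * 1024);
        forwardIfOverridden(conf);
        System.out.println(conf.getLong(TARGET_KEY, -1)); // 536870912
    }
}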
From source file: org.apache.pig.piggybank.test.storage.TestXMLLoader.java
License: Apache License
/**
 * This test case tests the special case when a non-matching tag spans two file
 * splits in a .bz2 compressed file. At the same time, the part that falls in
 * the first split is a prefix of the matching tag.
 * In other words, till the end of the first split, it looks like the tag is
 * matching but it is not actually matching.
 *
 * @throws Exception
 */
public void testXMLLoaderShouldNotReturnLastNonMatchedTag() throws Exception {
    Configuration conf = new Configuration();
    long blockSize = 100 * 1024;
    conf.setLong("fs.local.block.size", blockSize);
    String tagName = "event";
    PigServer pig = new PigServer(LOCAL, conf);
    FileSystem localFs = FileSystem.getLocal(conf);
    FileStatus[] testFiles = localFs
            .globStatus(new Path("src/test/java/org/apache/pig/piggybank/test/evaluation/xml/data/*xml.bz2"));
    assertTrue("No test files", testFiles.length > 0);
    for (FileStatus testFile : testFiles) {
        String testFileName = testFile.getPath().toUri().getPath().replace("\\", "\\\\");
        String query = "A = LOAD '" + testFileName
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    assertTrue(((String) tuple.get(0)).startsWith("<" + tagName + ">"));
                }
            }
        }
    }
}
From source file: org.apache.pig.piggybank.test.storage.TestXMLLoader.java
License: Apache License
/**
 * This test checks that a multi-line tag spanning two splits should be
 * matched.
 * @throws Exception
 */
public void testXMLLoaderShouldMatchTagSpanningSplits() throws Exception {
    Configuration conf = new Configuration();
    long blockSize = 512;
    conf.setLong("fs.local.block.size", blockSize);
    conf.setLong(MRConfiguration.MAX_SPLIT_SIZE, blockSize);
    String tagName = "event";
    File tempFile = File.createTempFile("long-file", ".xml");
    FileSystem localFs = FileSystem.getLocal(conf);
    FSDataOutputStream directOut = localFs.create(new Path(tempFile.getAbsolutePath()), true);
    String matchingElement = "<event>\ndata\n</event>\n";
    long pos = 0;
    int matchingCount = 0;
    PrintStream ps = new PrintStream(directOut);
    // 1- Write some elements that fit completely in the first block
    while (pos + 2 * matchingElement.length() < blockSize) {
        ps.print(matchingElement);
        pos += matchingElement.length();
        matchingCount++;
    }
    // 2- Write a long element that spans multiple lines and multiple blocks
    String longElement = matchingElement.replace("data",
            "data\ndata\ndata\ndata\ndata\ndata\ndata\ndata\ndata\ndata\ndata\n");
    ps.print(longElement);
    pos += longElement.length();
    matchingCount++;
    // 3- Write some more elements to fill in the second block completely
    while (pos < 2 * blockSize) {
        ps.print(matchingElement);
        pos += matchingElement.length();
        matchingCount++;
    }
    ps.close();
    PigServer pig = new PigServer(LOCAL, conf);
    String tempFileName = tempFile.getAbsolutePath().replace("\\", "\\\\");
    String query = "A = LOAD '" + tempFileName
            + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
    pig.registerQuery(query);
    Iterator<?> it = pig.openIterator("A");
    int count = 0;
    while (it.hasNext()) {
        Tuple tuple = (Tuple) it.next();
        if (tuple == null)
            break;
        else {
            if (tuple.size() > 0) {
                count++;
                // Make sure the returned text is a proper XML element
                DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
                Document doc = docBuilder.parse(new ByteArrayInputStream(((String) tuple.get(0)).getBytes()));
                assertTrue(doc.getDocumentElement().getNodeName().equals(tagName));
            }
        }
    }
    assertEquals(matchingCount, count);
}
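Both XMLLoader tests shrink block and split sizes so that split boundaries fall inside small test data. A minimal sketch of just that setup; "fs.local.block.size" is taken from the tests above, while mapreduce.input.fileinputformat.split.maxsize is assumed to be the property behind MRConfiguration.MAX_SPLIT_SIZE.

import org.apache.hadoop.conf.Configuration;

public class SmallSplitTestConf {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        long blockSize = 512;

        // Shrink the local block size so a small test file spans several splits.
        conf.setLong("fs.local.block.size", blockSize);

        // Cap the split size as well; assumed to be the key behind MRConfiguration.MAX_SPLIT_SIZE.
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", blockSize);

        System.out.println(conf.getLong("fs.local.block.size", -1));
    }
}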
From source file: org.apache.rocketmq.sink.ReplicatorTest.java
License: Apache License
/**
 * This method starts the HBase cluster and the RocketMQ server.
 *
 * @throws Exception
 */
@Before
public void setUp() throws Exception {
    final Configuration hbaseConf = HBaseConfiguration.create();
    hbaseConf.setInt("replication.stats.thread.period.seconds", 5);
    hbaseConf.setLong("replication.sleep.before.failover", 2000);
    hbaseConf.setInt("replication.source.maxretriesmultiplier", 10);
    hbaseConf.setBoolean(HConstants.REPLICATION_ENABLE_KEY, true);
    // Add RocketMQ properties - we prefix each property with 'rocketmq'
    addRocketMQProperties(hbaseConf);
    utility = new HBaseTestingUtility(hbaseConf);
    utility.startMiniCluster();
    utility.getHBaseCluster().getRegionServerThreads().size();
    // setup and start RocketMQ
    startMQ();
}
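The setup above mixes setInt, setLong, and setBoolean on one Configuration before handing it to HBaseTestingUtility. A minimal sketch of that configuration step without the mini-cluster or RocketMQ pieces; the replication property names come from the code above, and the getLong read-back is purely illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

public class ReplicationConfExample {
    public static void main(String[] args) {
        Configuration hbaseConf = HBaseConfiguration.create();

        // Same typed setters as the test above; values end up stored as strings in the conf.
        hbaseConf.setInt("replication.stats.thread.period.seconds", 5);
        hbaseConf.setLong("replication.sleep.before.failover", 2000);
        hbaseConf.setInt("replication.source.maxretriesmultiplier", 10);

        // Illustrative read-back: confirm the long made it into the configuration.
        long sleepBeforeFailover = hbaseConf.getLong("replication.sleep.before.failover", 30000);
        System.out.println("replication.sleep.before.failover = " + sleepBeforeFailover);
    }
}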