Example usage for org.apache.hadoop.conf Configuration setLong

List of usage examples for org.apache.hadoop.conf Configuration setLong

Introduction

On this page you can find usage examples for org.apache.hadoop.conf Configuration setLong.

Prototype

public void setLong(String name, long value) 

Document

Set the value of the name property to a long.
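
As a minimal sketch of the round trip, the value stored by setLong can be read back with the matching getLong(name, defaultValue). The property name and values below are illustrative only; they do not come from any of the projects quoted on this page:

import org.apache.hadoop.conf.Configuration;

public class SetLongExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store a long-valued property; Configuration keeps it as a string internally.
        conf.setLong("example.max.bytes", 128L * 1024 * 1024);

        // Read it back with the matching getter, supplying a default
        // that is used when the key is absent.
        long maxBytes = conf.getLong("example.max.bytes", 64L * 1024 * 1024);
        System.out.println("example.max.bytes = " + maxBytes);
    }
}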

Usage

From source file: org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil.java

License: Apache License

public static void setScrutinyOutputMax(Configuration configuration, long outputMaxRows) {
    Preconditions.checkNotNull(configuration);
    configuration.setLong(SCRUTINY_OUTPUT_MAX, outputMaxRows);
}
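
Values written by such setters are typically read back with the matching getter. A minimal sketch of the consumer side for setScrutinyOutputMax (the getter name and default value here are hypothetical, not necessarily the actual PhoenixConfigurationUtil API):

public static long getScrutinyOutputMax(Configuration configuration) {
    Preconditions.checkNotNull(configuration);
    // The default value below is illustrative only.
    return configuration.getLong(SCRUTINY_OUTPUT_MAX, 1000000L);
}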

From source file: org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil.java

License: Apache License

public static void setScrutinyExecuteTimestamp(Configuration configuration, long ts) {
    Preconditions.checkNotNull(configuration);
    configuration.setLong(SCRUTINY_EXECUTE_TIMESTAMP, ts);
}

From source file: org.apache.phoenix.query.BaseTest.java

License: Apache License

private static void setDefaultTestConfig(Configuration conf, ReadOnlyProps overrideProps) {
    ConfigUtil.setReplicationConfigIfAbsent(conf);
    QueryServices services = new PhoenixTestDriver().getQueryServices();
    for (Entry<String, String> entry : services.getProps()) {
        conf.set(entry.getKey(), entry.getValue());
    }
    //no point doing sanity checks when running tests.
    conf.setBoolean("hbase.table.sanity.checks", false);
    // set the server rpc controller and rpc scheduler factory, used to configure the cluster
    conf.set(RSRpcServices.REGION_SERVER_RPC_SCHEDULER_FACTORY_CLASS, DEFAULT_RPC_SCHEDULER_FACTORY);
    conf.setLong(HConstants.ZK_SESSION_TIMEOUT, 10 * HConstants.DEFAULT_ZK_SESSION_TIMEOUT);
    conf.setLong(HConstants.ZOOKEEPER_TICK_TIME, 6 * 1000);
    // override any defaults based on overrideProps
    for (Entry<String, String> entry : overrideProps) {
        conf.set(entry.getKey(), entry.getValue());
    }
}

From source file: org.apache.phoenix.transaction.TephraTransactionContext.java

License: Apache License

@Override
public void setTxnConfigs(Configuration config, String tmpFolder, int defaultTxnTimeoutSeconds)
        throws IOException {
    config.setBoolean(TxConstants.Manager.CFG_DO_PERSIST, false);
    config.set(TxConstants.Service.CFG_DATA_TX_CLIENT_RETRY_STRATEGY, "n-times");
    config.setInt(TxConstants.Service.CFG_DATA_TX_CLIENT_ATTEMPTS, 1);
    config.setInt(TxConstants.Service.CFG_DATA_TX_BIND_PORT, Networks.getRandomPort());
    config.set(TxConstants.Manager.CFG_TX_SNAPSHOT_DIR, tmpFolder);
    config.setInt(TxConstants.Manager.CFG_TX_TIMEOUT, defaultTxnTimeoutSeconds);
    config.unset(TxConstants.Manager.CFG_TX_HDFS_USER);
    config.setLong(TxConstants.Manager.CFG_TX_SNAPSHOT_INTERVAL, 5L);
}

From source file: org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler.java

License: Apache License

/**
 * The method that creates the Job corresponding to a MapReduceOper.
 * The assumption is that
 * every MapReduceOper will have a load and a store. The JobConf removes
 * the load operator and serializes the input filespec so that PigInputFormat can
 * take over the creation of splits. It also removes the store operator
 * and serializes the output filespec so that PigOutputFormat can take over
 * record writing. The remaining portion of the map plan and reduce plans are
 * serialized and stored for the PigMapReduce or PigMapOnly objects to take over
 * the actual running of the plans.
 * The Mapper &amp; Reducer classes and the required key value formats are set.
 * Checks if this is a map only job and uses PigMapOnly class as the mapper
 * and uses PigMapReduce otherwise.
 * If it is a Map Reduce job, it is bound to have a package operator. Remove it from
 * the reduce plan and serializes it so that the PigMapReduce class can use it to package
 * the indexed tuples received by the reducer.
 * @param mro - The MapReduceOper for which the JobConf is required
 * @param config - the Configuration object from which JobConf is built
 * @param pigContext - The PigContext passed on from execution engine
 * @return Job corresponding to mro
 * @throws JobCreationException
 */
@SuppressWarnings({ "unchecked" })
private Job getJob(MROperPlan plan, MapReduceOper mro, Configuration config, PigContext pigContext)
        throws JobCreationException {
    org.apache.hadoop.mapreduce.Job nwJob = null;

    try {
        nwJob = new org.apache.hadoop.mapreduce.Job(config);
    } catch (Exception e) {
        throw new JobCreationException(e);
    }

    Configuration conf = nwJob.getConfiguration();

    ArrayList<FileSpec> inp = new ArrayList<FileSpec>();
    ArrayList<List<OperatorKey>> inpTargets = new ArrayList<List<OperatorKey>>();
    ArrayList<String> inpSignatureLists = new ArrayList<String>();
    ArrayList<Long> inpLimits = new ArrayList<Long>();
    ArrayList<POStore> storeLocations = new ArrayList<POStore>();
    Path tmpLocation = null;

    // add settings for pig statistics
    String setScriptProp = conf.get(PigConfiguration.INSERT_ENABLED, "true");
    if (setScriptProp.equalsIgnoreCase("true")) {
        MRScriptState ss = MRScriptState.get();
        ss.addSettingsToConf(mro, conf);
    }

    conf.set(MRConfiguration.MAPPER_NEW_API, "true");
    conf.set(MRConfiguration.REDUCER_NEW_API, "true");

    String buffPercent = conf.get(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT);
    if (buffPercent == null || Double.parseDouble(buffPercent) <= 0) {
        log.info(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT + " is not set, set to default 0.3");
        conf.set(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT, "0.3");
    } else {
        log.info(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT + " is set to "
                + conf.get(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT));
    }

    configureCompression(conf);

    try {
        //Process the POLoads
        List<POLoad> lds = PlanHelper.getPhysicalOperators(mro.mapPlan, POLoad.class);

        if (lds != null && lds.size() > 0) {
            for (POLoad ld : lds) {
                LoadFunc lf = ld.getLoadFunc();
                lf.setLocation(ld.getLFile().getFileName(), nwJob);

                //Store the inp filespecs
                inp.add(ld.getLFile());
            }
        }

        if (!mro.reducePlan.isEmpty()) {
            log.info("Reduce phase detected, estimating # of required reducers.");
            adjustNumReducers(plan, mro, nwJob);
        } else {
            nwJob.setNumReduceTasks(0);
        }

        for (String udf : mro.UDFs) {
            if (udf.contains("GFCross")) {
                Object func = pigContext.instantiateFuncFromSpec(new FuncSpec(udf));
                if (func instanceof GFCross) {
                    String crossKey = ((GFCross) func).getCrossKey();
                    // If no parallelism hint has been set for this cross key yet
                    if (pigContext.getProperties()
                            .get(PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey) == null) {
                        pigContext.getProperties().setProperty(
                                PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey,
                                Integer.toString(nwJob.getNumReduceTasks()));
                    }
                    conf.set(PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey, (String) pigContext
                            .getProperties().get(PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey));
                }
            }
        }

        if (lds != null && lds.size() > 0) {
            for (POLoad ld : lds) {
                //Store the target operators for tuples read
                //from this input
                List<PhysicalOperator> ldSucs = mro.mapPlan.getSuccessors(ld);
                List<OperatorKey> ldSucKeys = new ArrayList<OperatorKey>();
                if (ldSucs != null) {
                    for (PhysicalOperator operator2 : ldSucs) {
                        ldSucKeys.add(operator2.getOperatorKey());
                    }
                }
                inpTargets.add(ldSucKeys);
                inpSignatureLists.add(ld.getSignature());
                inpLimits.add(ld.getLimit());
                //Remove the POLoad from the plan
                if (!pigContext.inIllustrator)
                    mro.mapPlan.remove(ld);
            }
        }

        if (!pigContext.inIllustrator && !pigContext.getExecType().isLocal()) {
            if (okToRunLocal(nwJob, mro, lds)) {
                log.info(SMALL_JOB_LOG_MSG);
                // override with the default conf to run in local mode
                for (Entry<String, String> entry : defaultConf) {
                    String key = entry.getKey();
                    if (key.equals(MRConfiguration.REDUCE_TASKS) || key.equals(MRConfiguration.JOB_REDUCES)) {
                        // this must not be set back to the default in case it has been set to 0 for example.
                        continue;
                    }
                    if (key.startsWith("fs.")) {
                        // we don't want to change fs settings back
                        continue;
                    }
                    if (key.startsWith("io.")) {
                        // we don't want to change io settings back
                        continue;
                    }
                    String value = entry.getValue();
                    if (conf.get(key) == null || !conf.get(key).equals(value)) {
                        conf.set(key, value);
                    }
                }

                conf.setBoolean(PigImplConstants.CONVERTED_TO_LOCAL, true);
            } else {
                log.info(BIG_JOB_LOG_MSG);
                // Setup the DistributedCache for this job
                List<URL> allJars = new ArrayList<URL>();

                for (URL extraJar : pigContext.extraJars) {
                    if (!allJars.contains(extraJar)) {
                        allJars.add(extraJar);
                    }
                }

                for (String scriptJar : pigContext.scriptJars) {
                    URL jar = new File(scriptJar).toURI().toURL();
                    if (!allJars.contains(jar)) {
                        allJars.add(jar);
                    }
                }

                for (String defaultJar : JarManager.getDefaultJars()) {
                    URL jar = new File(defaultJar).toURI().toURL();
                    if (!allJars.contains(jar)) {
                        allJars.add(jar);
                    }
                }

                for (URL jar : allJars) {
                    boolean predeployed = false;
                    for (String predeployedJar : pigContext.predeployedJars) {
                        if (predeployedJar.contains(new File(jar.toURI()).getName())) {
                            predeployed = true;
                        }
                    }
                    if (!predeployed) {
                        log.info("Adding jar to DistributedCache: " + jar);
                        putJarOnClassPathThroughDistributedCache(pigContext, conf, jar);
                    }
                }

                File scriptUDFJarFile = JarManager.createPigScriptUDFJar(pigContext);
                if (scriptUDFJarFile != null) {
                    putJarOnClassPathThroughDistributedCache(pigContext, conf,
                            scriptUDFJarFile.toURI().toURL());
                }
            }
        }

        if (Utils.isLocal(pigContext, conf)) {
            ConfigurationUtil.replaceConfigForLocalMode(conf);
        }
        conf.set("pig.inputs", ObjectSerializer.serialize(inp));
        conf.set("pig.inpTargets", ObjectSerializer.serialize(inpTargets));
        conf.set("pig.inpSignatures", ObjectSerializer.serialize(inpSignatureLists));
        conf.set("pig.inpLimits", ObjectSerializer.serialize(inpLimits));
        conf.set("pig.pigContext", ObjectSerializer.serialize(pigContext));
        conf.set("udf.import.list", ObjectSerializer.serialize(PigContext.getPackageImportList()));
        // this is for unit tests since some don't create PigServer

        // if user specified the job name using -D switch, Pig won't reset the name then.
        if (System.getProperty(MRConfiguration.JOB_NAME) == null
                && pigContext.getProperties().getProperty(PigContext.JOB_NAME) != null) {
            nwJob.setJobName(pigContext.getProperties().getProperty(PigContext.JOB_NAME));
        }

        if (pigContext.getProperties().getProperty(PigContext.JOB_PRIORITY) != null) {
            // If the job priority was set, attempt to get the corresponding enum value
            // and set the hadoop job priority.
            String jobPriority = pigContext.getProperties().getProperty(PigContext.JOB_PRIORITY).toUpperCase();
            try {
                // Allow arbitrary case; the Hadoop job priorities are all upper case.
                conf.set(MRConfiguration.JOB_PRIORITY, JobPriority.valueOf(jobPriority).toString());

            } catch (IllegalArgumentException e) {
                StringBuffer sb = new StringBuffer("The job priority must be one of [");
                JobPriority[] priorities = JobPriority.values();
                for (int i = 0; i < priorities.length; ++i) {
                    if (i > 0)
                        sb.append(", ");
                    sb.append(priorities[i]);
                }
                sb.append("].  You specified [" + jobPriority + "]");
                throw new JobCreationException(sb.toString());
            }
        }

        setupDistributedCache(pigContext, conf, pigContext.getProperties(), "pig.streaming.ship.files", true);
        setupDistributedCache(pigContext, conf, pigContext.getProperties(), "pig.streaming.cache.files", false);

        nwJob.setInputFormatClass(PigInputFormat.class);

        // tmp file compression setups
        // PIG-3741 This must be done before setStoreLocation on POStores
        Utils.setTmpFileCompressionOnConf(pigContext, conf);

        //Process POStore and remove it from the plan
        LinkedList<POStore> mapStores = PlanHelper.getPhysicalOperators(mro.mapPlan, POStore.class);
        LinkedList<POStore> reduceStores = PlanHelper.getPhysicalOperators(mro.reducePlan, POStore.class);

        for (POStore st : mapStores) {
            storeLocations.add(st);
            StoreFuncInterface sFunc = st.getStoreFunc();
            sFunc.setStoreLocation(st.getSFile().getFileName(), nwJob);
            if (sFunc instanceof OverwritableStoreFunc) {
                OverwritableStoreFunc osf = (OverwritableStoreFunc) sFunc;
                if (osf.shouldOverwrite()) {
                    osf.cleanupOutput(st, nwJob);
                }
            }
        }

        for (POStore st : reduceStores) {
            storeLocations.add(st);
            StoreFuncInterface sFunc = st.getStoreFunc();
            sFunc.setStoreLocation(st.getSFile().getFileName(), nwJob);
            if (sFunc instanceof OverwritableStoreFunc) {
                OverwritableStoreFunc osf = (OverwritableStoreFunc) sFunc;
                if (osf.shouldOverwrite()) {
                    osf.cleanupOutput(st, nwJob);
                }
            }
        }

        setOutputFormat(nwJob);

        if (mapStores.size() + reduceStores.size() == 1) { // single store case
            log.info("Setting up single store job");

            POStore st;
            if (reduceStores.isEmpty()) {
                st = mapStores.get(0);
                if (!pigContext.inIllustrator)
                    mro.mapPlan.remove(st);
            } else {
                st = reduceStores.get(0);
                if (!pigContext.inIllustrator)
                    mro.reducePlan.remove(st);
            }

            MapRedUtil.setupStreamingDirsConfSingle(st, pigContext, conf);
        } else if (mapStores.size() + reduceStores.size() > 0) { // multi store case
            log.info("Setting up multi store job");
            MapRedUtil.setupStreamingDirsConfMulti(pigContext, conf);

            boolean disableCounter = conf.getBoolean("pig.disable.counter", false);
            if (disableCounter) {
                log.info("Disable Pig custom output counters");
            }
            int idx = 0;
            for (POStore sto : storeLocations) {
                sto.setDisableCounter(disableCounter);
                sto.setMultiStore(true);
                sto.setIndex(idx++);
            }
        }

        // store map key type
        // this is needed when the key is null to create
        // an appropriate NullableXXXWritable object
        conf.set("pig.map.keytype", ObjectSerializer.serialize(new byte[] { mro.mapKeyType }));

        // set parent plan in all operators in map and reduce plans
        // currently the parent plan is really used only when POStream is present in the plan
        new PhyPlanSetter(mro.mapPlan).visit();
        new PhyPlanSetter(mro.reducePlan).visit();

        // this call modifies the ReplFiles names of POFRJoin operators
        // within the MR plans, must be called before the plans are
        // serialized
        setupDistributedCacheForJoin(mro, pigContext, conf);

        // Search to see if we have any UDFs that need to pack things into the
        // distributed cache.
        setupDistributedCacheForUdfs(mro, pigContext, conf);

        SchemaTupleFrontend.copyAllGeneratedToDistributedCache(pigContext, conf);

        POPackage pack = null;
        if (mro.reducePlan.isEmpty()) {
            //MapOnly Job
            nwJob.setMapperClass(PigMapOnly.Map.class);
            if (!pigContext.inIllustrator)
                conf.set("pig.mapPlan", ObjectSerializer.serialize(mro.mapPlan));
            if (mro.isEndOfAllInputSetInMap()) {
                // this is used in Map.close() to decide whether the
                // pipeline needs to be rerun one more time in the close()
                // The pipeline is rerun only if there was a stream or a POMergeJoin
                conf.set(END_OF_INP_IN_MAP, "true");
            }
        } else {
            //Map Reduce Job
            //Process the POPackage operator and remove it from the reduce plan
            if (!mro.combinePlan.isEmpty()) {
                POPackage combPack = (POPackage) mro.combinePlan.getRoots().get(0);
                mro.combinePlan.remove(combPack);
                nwJob.setCombinerClass(PigCombiner.Combine.class);
                conf.set("pig.combinePlan", ObjectSerializer.serialize(mro.combinePlan));
                conf.set("pig.combine.package", ObjectSerializer.serialize(combPack));
            } else if (mro.needsDistinctCombiner()) {
                nwJob.setCombinerClass(DistinctCombiner.Combine.class);
                log.info("Setting identity combiner class.");
            }
            pack = (POPackage) mro.reducePlan.getRoots().get(0);
            if (!pigContext.inIllustrator)
                mro.reducePlan.remove(pack);
            nwJob.setMapperClass(PigMapReduce.Map.class);
            nwJob.setReducerClass(PigMapReduce.Reduce.class);

            if (mro.customPartitioner != null)
                nwJob.setPartitionerClass(PigContext.resolveClassName(mro.customPartitioner));

            if (!pigContext.inIllustrator)
                conf.set("pig.mapPlan", ObjectSerializer.serialize(mro.mapPlan));
            if (mro.isEndOfAllInputSetInMap()) {
                // this is used in Map.close() to decide whether the
                // pipeline needs to be rerun one more time in the close()
                // The pipeline is rerun only if there was a stream or merge-join.
                conf.set(END_OF_INP_IN_MAP, "true");
            }
            if (!pigContext.inIllustrator)
                conf.set("pig.reducePlan", ObjectSerializer.serialize(mro.reducePlan));
            if (mro.isEndOfAllInputSetInReduce()) {
                // this is used in Map.close() to decide whether the
                // pipeline needs to be rerun one more time in the close()
                // The pipeline is rerun only if there was a stream
                conf.set("pig.stream.in.reduce", "true");
            }
            if (!pigContext.inIllustrator)
                conf.set("pig.reduce.package", ObjectSerializer.serialize(pack));
            conf.set("pig.reduce.key.type", Byte.toString(pack.getPkgr().getKeyType()));

            if (mro.getUseSecondaryKey()) {
                nwJob.setGroupingComparatorClass(PigSecondaryKeyGroupComparator.class);
                nwJob.setPartitionerClass(SecondaryKeyPartitioner.class);
                nwJob.setSortComparatorClass(PigSecondaryKeyComparator.class);
                nwJob.setOutputKeyClass(NullableTuple.class);
                conf.set("pig.secondarySortOrder", ObjectSerializer.serialize(mro.getSecondarySortOrder()));

            } else {
                Class<? extends WritableComparable> keyClass = HDataType
                        .getWritableComparableTypes(pack.getPkgr().getKeyType()).getClass();
                nwJob.setOutputKeyClass(keyClass);
                selectComparator(mro, pack.getPkgr().getKeyType(), nwJob);
            }
            nwJob.setOutputValueClass(NullableTuple.class);
        }

        if (mro.isGlobalSort() || mro.isLimitAfterSort()) {
            if (mro.isGlobalSort()) {
                String symlink = addSingleFileToDistributedCache(pigContext, conf, mro.getQuantFile(),
                        "pigsample");
                conf.set("pig.quantilesFile", symlink);
                nwJob.setPartitionerClass(WeightedRangePartitioner.class);
            }

            if (mro.isUDFComparatorUsed) {
                boolean usercomparator = false;
                for (String compFuncSpec : mro.UDFs) {
                    Class comparator = PigContext.resolveClassName(compFuncSpec);
                    if (ComparisonFunc.class.isAssignableFrom(comparator)) {
                        nwJob.setMapperClass(PigMapReduce.MapWithComparator.class);
                        nwJob.setReducerClass(PigMapReduce.ReduceWithComparator.class);
                        conf.set("pig.reduce.package", ObjectSerializer.serialize(pack));
                        conf.set("pig.usercomparator", "true");
                        nwJob.setOutputKeyClass(NullableTuple.class);
                        nwJob.setSortComparatorClass(comparator);
                        usercomparator = true;
                        break;
                    }
                }
                if (!usercomparator) {
                    String msg = "Internal error. Can't find the UDF comparator";
                    throw new IOException(msg);
                }

            } else {
                conf.set("pig.sortOrder", ObjectSerializer.serialize(mro.getSortOrder()));
            }
        }

        if (mro.isSkewedJoin()) {
            String symlink = addSingleFileToDistributedCache(pigContext, conf, mro.getSkewedJoinPartitionFile(),
                    "pigdistkey");
            conf.set("pig.keyDistFile", symlink);
            nwJob.setPartitionerClass(SkewedPartitioner.class);
            nwJob.setMapperClass(PigMapReduce.MapWithPartitionIndex.class);
            nwJob.setMapOutputKeyClass(NullablePartitionWritable.class);
            nwJob.setGroupingComparatorClass(PigGroupingPartitionWritableComparator.class);
        }

        if (mro.isCounterOperation()) {
            if (mro.isRowNumber()) {
                nwJob.setMapperClass(PigMapReduceCounter.PigMapCounter.class);
            } else {
                nwJob.setReducerClass(PigMapReduceCounter.PigReduceCounter.class);
            }
        }

        if (mro.isRankOperation()) {
            Iterator<String> operationIDs = mro.getRankOperationId().iterator();

            while (operationIDs.hasNext()) {
                String operationID = operationIDs.next();
                Iterator<Pair<String, Long>> itPairs = globalCounters.get(operationID).iterator();
                Pair<String, Long> pair = null;
                while (itPairs.hasNext()) {
                    pair = itPairs.next();
                    conf.setLong(pair.first, pair.second);
                }
            }
        }

        if (!pigContext.inIllustrator) {
            // unset inputs for POStore, otherwise, map/reduce plan will be unnecessarily deserialized
            for (POStore st : mapStores) {
                st.setInputs(null);
                st.setParentPlan(null);
            }
            for (POStore st : reduceStores) {
                st.setInputs(null);
                st.setParentPlan(null);
            }
            conf.set(PIG_MAP_STORES, ObjectSerializer.serialize(mapStores));
            conf.set(PIG_REDUCE_STORES, ObjectSerializer.serialize(reduceStores));
        }

        String tmp;
        long maxCombinedSplitSize = 0;
        if (!mro.combineSmallSplits()
                || pigContext.getProperties().getProperty("pig.splitCombination", "true").equals("false"))
            conf.setBoolean("pig.noSplitCombination", true);
        else if ((tmp = pigContext.getProperties().getProperty("pig.maxCombinedSplitSize", null)) != null) {
            try {
                maxCombinedSplitSize = Long.parseLong(tmp);
            } catch (NumberFormatException e) {
                log.warn(
                        "Invalid numeric format for pig.maxCombinedSplitSize; use the default maximum combined split size");
            }
        }
        if (maxCombinedSplitSize > 0)
            conf.setLong("pig.maxCombinedSplitSize", maxCombinedSplitSize);

        // It's a hack to set the distributed cache file for hadoop 23. Once MiniMRCluster no longer
        // requires a local jar at a fixed location, this can be removed
        if (pigContext.getExecType() == ExecType.MAPREDUCE) {
            String newfiles = conf.get("alternative.mapreduce.job.cache.files");
            if (newfiles != null) {
                String files = conf.get(MRConfiguration.JOB_CACHE_FILES);
                conf.set(MRConfiguration.JOB_CACHE_FILES,
                        files == null ? newfiles.toString() : files + "," + newfiles);
            }
        }
        // Serialize the UDF specific context info.
        UDFContext.getUDFContext().serialize(conf);
        Job cjob = new Job(new JobConf(conf), new ArrayList<Job>());
        jobStoreMap.put(cjob, new Pair<List<POStore>, Path>(storeLocations, tmpLocation));
        return cjob;

    } catch (JobCreationException jce) {
        throw jce;
    } catch (Exception e) {
        int errCode = 2017;
        String msg = "Internal error creating job configuration.";
        throw new JobCreationException(msg, errCode, PigException.BUG, e);
    }
}

From source file: org.apache.pig.backend.hadoop.executionengine.tez.plan.optimizer.LoaderProcessor.java

License: Apache License

/**
 * Do the final configuration of LoadFuncs and store what goes where. This
 * will need to be changed as the inputs get un-bundled
 *
 * @param tezOp the TezOperator whose POLoads are processed
 * @return the list of POLoads found in the Tez operator's plan
 * @throws VisitorException
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private List<POLoad> processLoads(TezOperator tezOp)
        throws VisitorException, IOException, ClassNotFoundException, InterruptedException {
    ArrayList<FileSpec> inp = new ArrayList<FileSpec>();
    ArrayList<List<OperatorKey>> inpTargets = new ArrayList<List<OperatorKey>>();
    ArrayList<String> inpSignatureLists = new ArrayList<String>();
    ArrayList<Long> inpLimits = new ArrayList<Long>();

    List<POLoad> lds = PlanHelper.getPhysicalOperators(tezOp.plan, POLoad.class);

    Job job = Job.getInstance(jobConf);
    Configuration conf = job.getConfiguration();

    if (lds != null && lds.size() > 0) {
        if (lds.size() == 1) {
            for (POLoad ld : lds) {
                LoadFunc lf = ld.getLoadFunc();
                lf.setLocation(ld.getLFile().getFileName(), job);

                // Store the inp filespecs
                inp.add(ld.getLFile());
            }
        } else {
            throw new VisitorException("There is more than one load for TezOperator " + tezOp);
        }
    }

    if (lds != null && lds.size() > 0) {
        for (POLoad ld : lds) {
            // Store the target operators for tuples read
            // from this input
            List<PhysicalOperator> ldSucs = new ArrayList<PhysicalOperator>(tezOp.plan.getSuccessors(ld));
            List<OperatorKey> ldSucKeys = new ArrayList<OperatorKey>();
            if (ldSucs != null) {
                for (PhysicalOperator operator2 : ldSucs) {
                    ldSucKeys.add(operator2.getOperatorKey());
                }
            }
            inpTargets.add(ldSucKeys);
            inpSignatureLists.add(ld.getSignature());
            inpLimits.add(ld.getLimit());
            // Remove the POLoad from the plan
            tezOp.plan.remove(ld);
            // Now add the input handling operator for the Tez backend
            // TODO: Move this upstream to the PhysicalPlan generation
            POSimpleTezLoad tezLoad = new POSimpleTezLoad(ld.getOperatorKey(), ld.getLoadFunc());
            tezLoad.setLFile(ld.getLFile());
            tezLoad.setSignature(ld.getSignature());
            tezLoad.setInputKey(ld.getOperatorKey().toString());
            tezLoad.copyAliasFrom(ld);
            tezLoad.setCacheFiles(ld.getCacheFiles());
            tezLoad.setShipFiles(ld.getShipFiles());
            tezOp.plan.add(tezLoad);
            for (PhysicalOperator sucs : ldSucs) {
                tezOp.plan.connect(tezLoad, sucs);
            }
        }
        UDFContext.getUDFContext().serialize(conf);
        conf.set("udf.import.list", ObjectSerializer.serialize(PigContext.getPackageImportList()));
        conf.set(PigInputFormat.PIG_INPUTS, ObjectSerializer.serialize(inp));
        conf.set(PigInputFormat.PIG_INPUT_TARGETS, ObjectSerializer.serialize(inpTargets));
        conf.set(PigInputFormat.PIG_INPUT_SIGNATURES, ObjectSerializer.serialize(inpSignatureLists));
        conf.set(PigInputFormat.PIG_INPUT_LIMITS, ObjectSerializer.serialize(inpLimits));
        String tmp;
        long maxCombinedSplitSize = 0;
        if (!tezOp.combineSmallSplits() || pc.getProperties()
                .getProperty(PigConfiguration.PIG_SPLIT_COMBINATION, "true").equals("false"))
            conf.setBoolean(PigConfiguration.PIG_NO_SPLIT_COMBINATION, true);
        else if ((tmp = pc.getProperties().getProperty(PigConfiguration.PIG_MAX_COMBINED_SPLIT_SIZE,
                null)) != null) {
            try {
                maxCombinedSplitSize = Long.parseLong(tmp);
            } catch (NumberFormatException e) {
                LOG.warn(
                        "Invalid numeric format for pig.maxCombinedSplitSize; use the default maximum combined split size");
            }
        }
        if (maxCombinedSplitSize > 0)
            conf.setLong("pig.maxCombinedSplitSize", maxCombinedSplitSize);
        tezOp.getLoaderInfo().setInpSignatureLists(inpSignatureLists);
        tezOp.getLoaderInfo().setInp(inp);
        tezOp.getLoaderInfo().setInpLimits(inpLimits);
        // Not using MRInputAMSplitGenerator because delegation tokens are
        // fetched in FileInputFormat
        tezOp.getLoaderInfo().setInputSplitInfo(MRInputHelpers.generateInputSplitsToMem(conf, false, 0));
        // TODO: Can be set to -1 if TEZ-601 gets fixed and getting input
        // splits can be moved to if(loads) block below
        int parallelism = tezOp.getLoaderInfo().getInputSplitInfo().getNumTasks();
        tezOp.setRequestedParallelism(parallelism);
        tezOp.setTotalInputFilesSize(InputSizeReducerEstimator.getTotalInputFileSize(conf, lds, job));
    }
    return lds;
}

From source file: org.apache.pig.backend.hadoop.executionengine.tez.TezDagBuilder.java

License: Apache License

private Vertex newVertex(TezOperator tezOp, boolean isMap)
        throws IOException, ClassNotFoundException, InterruptedException {
    ProcessorDescriptor procDesc = ProcessorDescriptor.create(tezOp.getProcessorName());

    // Pass physical plans to vertex as user payload.
    JobConf payloadConf = new JobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), false));

    // We do this so that dag.getCredentials(), job.getCredentials(),
    // job.getConfiguration().getCredentials() all reference the same Credentials object
    // Unfortunately there is no setCredentials() on Job
    payloadConf.setCredentials(dag.getCredentials());
    // We won't actually use this job, but we need it to talk with the Load Store funcs
    @SuppressWarnings("deprecation")
    Job job = new Job(payloadConf);
    payloadConf = (JobConf) job.getConfiguration();

    if (tezOp.sampleOperator != null) {
        payloadConf.set(PigProcessor.SAMPLE_VERTEX, tezOp.sampleOperator.getOperatorKey().toString());
    }

    if (tezOp.sortOperator != null) {
        payloadConf.set(PigProcessor.SORT_VERTEX, tezOp.sortOperator.getOperatorKey().toString());
    }

    String tmp;
    long maxCombinedSplitSize = 0;
    if (!tezOp.combineSmallSplits()
            || pc.getProperties().getProperty(PigConfiguration.PIG_SPLIT_COMBINATION, "true").equals("false"))
        payloadConf.setBoolean(PigConfiguration.PIG_NO_SPLIT_COMBINATION, true);
    else if ((tmp = pc.getProperties().getProperty(PigConfiguration.PIG_MAX_COMBINED_SPLIT_SIZE,
            null)) != null) {
        try {
            maxCombinedSplitSize = Long.parseLong(tmp);
        } catch (NumberFormatException e) {
            log.warn(
                    "Invalid numeric format for pig.maxCombinedSplitSize; use the default maximum combined split size");
        }
    }
    if (maxCombinedSplitSize > 0)
        payloadConf.setLong("pig.maxCombinedSplitSize", maxCombinedSplitSize);

    payloadConf.set("pig.inputs", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInp()));
    payloadConf.set("pig.inpSignatures",
            ObjectSerializer.serialize(tezOp.getLoaderInfo().getInpSignatureLists()));
    payloadConf.set("pig.inpLimits", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInpLimits()));
    // Process stores
    LinkedList<POStore> stores = processStores(tezOp, payloadConf, job);

    payloadConf.set("pig.pigContext", ObjectSerializer.serialize(pc));
    payloadConf.set("udf.import.list", ObjectSerializer.serialize(PigContext.getPackageImportList()));
    payloadConf.set("exectype", "TEZ");
    payloadConf.setBoolean(MRConfiguration.MAPPER_NEW_API, true);
    payloadConf.setClass(MRConfiguration.INPUTFORMAT_CLASS, PigInputFormat.class, InputFormat.class);

    // Set parent plan for all operators in the Tez plan.
    new PhyPlanSetter(tezOp.plan).visit();

    // Set the endOfAllInput flag on the physical plan if certain operators that
    // use this property (such as STREAM) are present in the plan.
    EndOfAllInputSetter.EndOfAllInputChecker checker = new EndOfAllInputSetter.EndOfAllInputChecker(tezOp.plan);
    checker.visit();
    if (checker.isEndOfAllInputPresent()) {
        payloadConf.set(JobControlCompiler.END_OF_INP_IN_MAP, "true");
    }

    // Configure the classes for incoming shuffles to this TezOp
    // TODO: Refactor out resetting input keys, PIG-3957
    List<PhysicalOperator> roots = tezOp.plan.getRoots();
    if (roots.size() == 1 && roots.get(0) instanceof POPackage) {
        POPackage pack = (POPackage) roots.get(0);

        List<PhysicalOperator> succsList = tezOp.plan.getSuccessors(pack);
        if (succsList != null) {
            succsList = new ArrayList<PhysicalOperator>(succsList);
        }
        byte keyType = pack.getPkgr().getKeyType();
        tezOp.plan.remove(pack);
        payloadConf.set("pig.reduce.package", ObjectSerializer.serialize(pack));
        setIntermediateOutputKeyValue(keyType, payloadConf, tezOp);
        POShuffleTezLoad newPack;
        newPack = new POShuffleTezLoad(pack);
        if (tezOp.isSkewedJoin()) {
            newPack.setSkewedJoins(true);
        }
        tezOp.plan.add(newPack);

        // Set input keys for POShuffleTezLoad. This is used to identify
        // the inputs that are attached to the POShuffleTezLoad in the
        // backend.
        Map<Integer, String> localRearrangeMap = new TreeMap<Integer, String>();
        for (TezOperator pred : mPlan.getPredecessors(tezOp)) {
            if (tezOp.sampleOperator != null && tezOp.sampleOperator == pred) {
                // skip sample vertex input
            } else {
                String inputKey = pred.getOperatorKey().toString();
                if (pred.isVertexGroup()) {
                    pred = mPlan.getOperator(pred.getVertexGroupMembers().get(0));
                }
                LinkedList<POLocalRearrangeTez> lrs = PlanHelper.getPhysicalOperators(pred.plan,
                        POLocalRearrangeTez.class);
                for (POLocalRearrangeTez lr : lrs) {
                    if (lr.isConnectedToPackage()
                            && lr.getOutputKey().equals(tezOp.getOperatorKey().toString())) {
                        localRearrangeMap.put((int) lr.getIndex(), inputKey);
                    }
                }
            }
        }
        for (Map.Entry<Integer, String> entry : localRearrangeMap.entrySet()) {
            newPack.addInputKey(entry.getValue());
        }

        if (succsList != null) {
            for (PhysicalOperator succs : succsList) {
                tezOp.plan.connect(newPack, succs);
            }
        }

        setIntermediateOutputKeyValue(pack.getPkgr().getKeyType(), payloadConf, tezOp);
    } else if (roots.size() == 1 && roots.get(0) instanceof POIdentityInOutTez) {
        POIdentityInOutTez identityInOut = (POIdentityInOutTez) roots.get(0);
        // TODO Need to fix multiple input key mapping
        TezOperator identityInOutPred = null;
        for (TezOperator pred : mPlan.getPredecessors(tezOp)) {
            if (!pred.isSampleAggregation()) {
                identityInOutPred = pred;
                break;
            }
        }
        identityInOut.setInputKey(identityInOutPred.getOperatorKey().toString());
    } else if (roots.size() == 1 && roots.get(0) instanceof POValueInputTez) {
        POValueInputTez valueInput = (POValueInputTez) roots.get(0);

        LinkedList<String> scalarInputs = new LinkedList<String>();
        for (POUserFunc userFunc : PlanHelper.getPhysicalOperators(tezOp.plan, POUserFunc.class)) {
            if (userFunc.getFunc() instanceof ReadScalarsTez) {
                scalarInputs.add(((ReadScalarsTez) userFunc.getFunc()).getTezInputs()[0]);
            }
        }
        // Make sure we don't find the scalar
        for (TezOperator pred : mPlan.getPredecessors(tezOp)) {
            if (!scalarInputs.contains(pred.getOperatorKey().toString())) {
                valueInput.setInputKey(pred.getOperatorKey().toString());
                break;
            }
        }
    }
    JobControlCompiler.setOutputFormat(job);

    // set parent plan in all operators. currently the parent plan is really
    // used only when POStream, POSplit are present in the plan
    new PhyPlanSetter(tezOp.plan).visit();

    // Serialize the execution plan
    payloadConf.set(PigProcessor.PLAN, ObjectSerializer.serialize(tezOp.plan));

    UDFContext.getUDFContext().serialize(payloadConf);

    MRToTezHelper.processMRSettings(payloadConf, globalConf);

    if (!pc.inIllustrator) {
        for (POStore store : stores) {
            // unset inputs for POStore, otherwise, map/reduce plan will be unnecessarily deserialized
            store.setInputs(null);
            store.setParentPlan(null);
        }
        // We put them in the reduce because PigOutputCommitter checks the
        // ID of the task to see if it's a map, and if not, calls the reduce
        // committers.
        payloadConf.set(JobControlCompiler.PIG_MAP_STORES,
                ObjectSerializer.serialize(new ArrayList<POStore>()));
        payloadConf.set(JobControlCompiler.PIG_REDUCE_STORES, ObjectSerializer.serialize(stores));
    }

    if (tezOp.isNeedEstimateParallelism()) {
        payloadConf.setBoolean(PigProcessor.ESTIMATE_PARALLELISM, true);
        log.info("Estimate quantile for sample aggregation vertex " + tezOp.getOperatorKey().toString());
    }

    // Take our assembled configuration and create a vertex
    UserPayload userPayload = TezUtils.createUserPayloadFromConf(payloadConf);
    procDesc.setUserPayload(userPayload);

    Vertex vertex = Vertex.create(tezOp.getOperatorKey().toString(), procDesc, tezOp.getVertexParallelism(),
            isMap ? MRHelpers.getResourceForMRMapper(globalConf)
                    : MRHelpers.getResourceForMRReducer(globalConf));

    Map<String, String> taskEnv = new HashMap<String, String>();
    MRHelpers.updateEnvBasedOnMRTaskEnv(globalConf, taskEnv, isMap);
    vertex.setTaskEnvironment(taskEnv);

    // All these classes are @InterfaceAudience.Private in Hadoop. Switch to Tez methods in TEZ-1012
    // set the timestamps, public/private visibility of the archives and files
    ClientDistributedCacheManager.determineTimestampsAndCacheVisibilities(globalConf);
    // get DelegationToken for each cached file
    ClientDistributedCacheManager.getDelegationTokens(globalConf, job.getCredentials());
    MRApps.setupDistributedCache(globalConf, localResources);
    vertex.addTaskLocalFiles(localResources);

    vertex.setTaskLaunchCmdOpts(isMap ? MRHelpers.getJavaOptsForMRMapper(globalConf)
            : MRHelpers.getJavaOptsForMRReducer(globalConf));

    log.info("For vertex - " + tezOp.getOperatorKey().toString() + ": parallelism="
            + tezOp.getVertexParallelism() + ", memory=" + vertex.getTaskResource().getMemory() + ", java opts="
            + vertex.getTaskLaunchCmdOpts());

    // Right now there can only be one of each of these. Will need to be
    // more generic when there can be more.
    for (POLoad ld : tezOp.getLoaderInfo().getLoads()) {

        // TODO: These should get the globalConf, or a merged version that
        // keeps settings like pig.maxCombinedSplitSize
        vertex.setLocationHint(
                VertexLocationHint.create(tezOp.getLoaderInfo().getInputSplitInfo().getTaskLocationHints()));
        vertex.addDataSource(ld.getOperatorKey().toString(), DataSourceDescriptor.create(
                InputDescriptor.create(MRInput.class.getName())
                        .setUserPayload(UserPayload.create(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
                                .setConfigurationBytes(TezUtils.createByteStringFromConf(payloadConf))
                                .setSplits(tezOp.getLoaderInfo().getInputSplitInfo().getSplitsProto()).build()
                                .toByteString().asReadOnlyByteBuffer())),
                InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName()),
                dag.getCredentials()));
    }

    for (POStore store : stores) {

        ArrayList<POStore> emptyList = new ArrayList<POStore>();
        ArrayList<POStore> singleStore = new ArrayList<POStore>();
        singleStore.add(store);

        Configuration outputPayLoad = new Configuration(payloadConf);
        outputPayLoad.set(JobControlCompiler.PIG_MAP_STORES, ObjectSerializer.serialize(emptyList));
        outputPayLoad.set(JobControlCompiler.PIG_REDUCE_STORES, ObjectSerializer.serialize(singleStore));

        OutputDescriptor storeOutDescriptor = OutputDescriptor.create(MROutput.class.getName())
                .setUserPayload(TezUtils.createUserPayloadFromConf(outputPayLoad));
        if (tezOp.getVertexGroupStores() != null) {
            OperatorKey vertexGroupKey = tezOp.getVertexGroupStores().get(store.getOperatorKey());
            if (vertexGroupKey != null) {
                getPlan().getOperator(vertexGroupKey).getVertexGroupInfo()
                        .setStoreOutputDescriptor(storeOutDescriptor);
                continue;
            }
        }
        vertex.addDataSink(store.getOperatorKey().toString(), new DataSinkDescriptor(storeOutDescriptor,
                OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), dag.getCredentials()));
    }

    // LoadFunc and StoreFunc add delegation tokens to Job Credentials in
    // setLocation and setStoreLocation respectively. For eg: HBaseStorage
    // InputFormat add delegation token in getSplits and OutputFormat in
    // checkOutputSpecs. For eg: FileInputFormat and FileOutputFormat
    if (stores.size() > 0) {
        new PigOutputFormat().checkOutputSpecs(job);
    }

    // Set the right VertexManagerPlugin
    if (tezOp.getEstimatedParallelism() != -1) {
        if (tezOp.isGlobalSort() || tezOp.isSkewedJoin()) {
            // Set VertexManagerPlugin to PartitionerDefinedVertexManager, which is able
            // to decrease/increase parallelism of sorting vertex dynamically
            // based on the numQuantiles calculated by sample aggregation vertex
            vertex.setVertexManagerPlugin(
                    VertexManagerPluginDescriptor.create(PartitionerDefinedVertexManager.class.getName()));
            log.info("Set VertexManagerPlugin to PartitionerDefinedParallelismVertexManager for vertex "
                    + tezOp.getOperatorKey().toString());
        } else {
            boolean containScatterGather = false;
            boolean containCustomPartitioner = false;
            for (TezEdgeDescriptor edge : tezOp.inEdges.values()) {
                if (edge.dataMovementType == DataMovementType.SCATTER_GATHER) {
                    containScatterGather = true;
                }
                if (edge.partitionerClass != null) {
                    containCustomPartitioner = true;
                }
            }
            if (containScatterGather && !containCustomPartitioner) {
                // Use auto-parallelism feature of ShuffleVertexManager to dynamically
                // reduce the parallelism of the vertex
                VertexManagerPluginDescriptor vmPluginDescriptor = VertexManagerPluginDescriptor
                        .create(ShuffleVertexManager.class.getName());
                Configuration vmPluginConf = ConfigurationUtil.toConfiguration(pc.getProperties(), false);
                vmPluginConf.setBoolean(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL,
                        true);
                if (vmPluginConf.getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
                        InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER) != InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER) {
                    vmPluginConf.setLong(
                            ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE,
                            vmPluginConf.getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
                                    InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER));
                }
                vmPluginDescriptor.setUserPayload(TezUtils.createUserPayloadFromConf(vmPluginConf));
                vertex.setVertexManagerPlugin(vmPluginDescriptor);
                log.info("Set auto parallelism for vertex " + tezOp.getOperatorKey().toString());
            }
        }
    }

    // Reset udfcontext jobconf. It is not supposed to be set in the front end
    UDFContext.getUDFContext().addJobConf(null);
    return vertex;
}

From source file: org.apache.pig.piggybank.test.storage.TestXMLLoader.java

License: Apache License

/**
 * This test case tests the special case when a non-matching tag spans two file
 * splits in a .bz2 compressed file. At the same time, the part that falls in
 * the first split is a prefix of the matching tag.
 * In other words, till the end of the first split, it looks like the tag is
 * matching but it is not actually matching.
 *
 * @throws Exception
 */
public void testXMLLoaderShouldNotReturnLastNonMatchedTag() throws Exception {
    Configuration conf = new Configuration();
    long blockSize = 100 * 1024;
    conf.setLong("fs.local.block.size", blockSize);

    String tagName = "event";

    PigServer pig = new PigServer(LOCAL, conf);
    FileSystem localFs = FileSystem.getLocal(conf);
    FileStatus[] testFiles = localFs
            .globStatus(new Path("src/test/java/org/apache/pig/piggybank/test/evaluation/xml/data/*xml.bz2"));
    assertTrue("No test files", testFiles.length > 0);
    for (FileStatus testFile : testFiles) {
        String testFileName = testFile.getPath().toUri().getPath().replace("\\", "\\\\");
        String query = "A = LOAD '" + testFileName
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    assertTrue(((String) tuple.get(0)).startsWith("<" + tagName + ">"));
                }
            }
        }
    }
}

From source file: org.apache.pig.piggybank.test.storage.TestXMLLoader.java

License: Apache License

/**
 * This test checks that a multi-line tag spanning two splits is
 * matched.
 * @throws Exception
 */
public void testXMLLoaderShouldMatchTagSpanningSplits() throws Exception {
    Configuration conf = new Configuration();
    long blockSize = 512;
    conf.setLong("fs.local.block.size", blockSize);
    conf.setLong(MRConfiguration.MAX_SPLIT_SIZE, blockSize);

    String tagName = "event";
    File tempFile = File.createTempFile("long-file", ".xml");
    FileSystem localFs = FileSystem.getLocal(conf);
    FSDataOutputStream directOut = localFs.create(new Path(tempFile.getAbsolutePath()), true);

    String matchingElement = "<event>\ndata\n</event>\n";
    long pos = 0;
    int matchingCount = 0;
    PrintStream ps = new PrintStream(directOut);
    // 1- Write some elements that fit completely in the first block
    while (pos + 2 * matchingElement.length() < blockSize) {
        ps.print(matchingElement);
        pos += matchingElement.length();
        matchingCount++;
    }
    // 2- Write a long element that spans multiple lines and multiple blocks
    String longElement = matchingElement.replace("data",
            "data\ndata\ndata\ndata\ndata\ndata\ndata\ndata\ndata\ndata\ndata\n");
    ps.print(longElement);
    pos += longElement.length();
    matchingCount++;
    // 3- Write some more elements to fill in the second block completely
    while (pos < 2 * blockSize) {
        ps.print(matchingElement);
        pos += matchingElement.length();
        matchingCount++;
    }
    ps.close();

    PigServer pig = new PigServer(LOCAL, conf);
    String tempFileName = tempFile.getAbsolutePath().replace("\\", "\\\\");
    String query = "A = LOAD '" + tempFileName
            + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
    pig.registerQuery(query);
    Iterator<?> it = pig.openIterator("A");

    int count = 0;
    while (it.hasNext()) {
        Tuple tuple = (Tuple) it.next();
        if (tuple == null)
            break;
        else {
            if (tuple.size() > 0) {
                count++;
                // Make sure the returned text is a proper XML element
                DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
                Document doc = docBuilder.parse(new ByteArrayInputStream(((String) tuple.get(0)).getBytes()));
                assertTrue(doc.getDocumentElement().getNodeName().equals(tagName));
            }
        }
    }
    assertEquals(matchingCount, count);
}

From source file: org.apache.rocketmq.sink.ReplicatorTest.java

License: Apache License

/**
 * This method starts the HBase cluster and the RocketMQ server.
 *
 * @throws Exception
 */
@Before
public void setUp() throws Exception {
    final Configuration hbaseConf = HBaseConfiguration.create();
    hbaseConf.setInt("replication.stats.thread.period.seconds", 5);
    hbaseConf.setLong("replication.sleep.before.failover", 2000);
    hbaseConf.setInt("replication.source.maxretriesmultiplier", 10);
    hbaseConf.setBoolean(HConstants.REPLICATION_ENABLE_KEY, true);

    // Add RocketMQ properties - we prefix each property with 'rocketmq'
    addRocketMQProperties(hbaseConf);

    utility = new HBaseTestingUtility(hbaseConf);
    utility.startMiniCluster();
    utility.getHBaseCluster().getRegionServerThreads().size();

    // setup and start RocketMQ
    startMQ();
}