List of usage examples for org.apache.hadoop.conf Configuration setLong
public void setLong(String name, long value)
Sets the value of the name property to a long.
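Minimal usage sketch (the property key "example.max.bytes" is an arbitrary placeholder chosen for illustration, not a Hadoop-defined key): setLong stores the number under the given name as a string, and getLong reads it back, returning the supplied default if the property is unset.

import org.apache.hadoop.conf.Configuration;

public class SetLongExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store a long under a property name; Configuration keeps it internally as a string.
        conf.setLong("example.max.bytes", 128L * 1024 * 1024);

        // Read it back; the second argument is the default used when the property is absent.
        long maxBytes = conf.getLong("example.max.bytes", 0L);
        System.out.println("example.max.bytes = " + maxBytes);
    }
}

The same pattern appears throughout the examples below, typically to set byte sizes, timeouts, and row counts before submitting a MapReduce job.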
From source file:com.splicemachine.test.SpliceTestPlatformConfig.java
License:Apache License
public static Configuration create(String hbaseRootDirUri, Integer masterPort, Integer masterInfoPort,
        Integer regionServerPort, Integer regionServerInfoPort, Integer derbyPort, boolean failTasksRandomly) {

    Configuration config = HConfiguration.unwrapDelegate();

    config.set(SQLConfiguration.STORAGE_FACTORY_HOME, hbaseRootDirUri);

    //
    // Coprocessors
    //
    config.set("hbase.coprocessor.regionserver.classes", getRegionServerCoprocessorsAsString());
    config.set("hbase.coprocessor.region.classes", getRegionCoprocessorsAsString());
    config.set("hbase.coprocessor.master.classes", getMasterCoprocessorsAsString());

    //
    // Networking
    //
    config.set("hbase.zookeeper.quorum", "127.0.0.1:2181");
    config.setInt("hbase.master.port", masterPort);
    config.setInt("hbase.master.info.port", masterInfoPort);
    config.setInt("hbase.regionserver.port", regionServerPort);
    config.setInt("hbase.regionserver.info.port", regionServerInfoPort);
    config.setInt("hbase.master.jmx.port", HConfiguration.DEFAULT_JMX_BIND_PORT); // this is set because the HBase master and regionserver are running on the same machine and in the same JVM
    config.setInt(SQLConfiguration.NETWORK_BIND_PORT, derbyPort);
    config.setClass(DefaultStoreEngine.DEFAULT_COMPACTOR_CLASS_KEY, SpliceDefaultCompactor.class, Compactor.class);
    // config.setClass(ConsistencyControlUtils.MVCC_IMPL, SIMultiVersionConsistencyControl.class, ConsistencyControl.class);
    config.setClass(DefaultStoreEngine.DEFAULT_COMPACTION_POLICY_CLASS_KEY, SpliceDefaultCompactionPolicy.class, CompactionPolicy.class);

    //
    // Networking -- interfaces
    //
    // force use of loop back interface on MacOSX, else don't set it
    // if (System.getProperty("os.name").contains("Mac") ) {
    //     String interfaceName = "lo0";
    //     config.set("hbase.zookeeper.dns.interface", interfaceName);
    //     config.set("hbase.master.dns.interface", interfaceName);
    //     config.set("hbase.regionserver.dns.interface", interfaceName);
    // }

    //
    // File System
    //
    config.set("fs.defaultFS", "file:///"); // MapR Hack, tells it local filesystem
    // fs.default.name is deprecated
    config.set(FileSystem.FS_DEFAULT_NAME_KEY, "file:///");
    config.setDouble("yarn.nodemanager.resource.io-spindles", 2.0);
    config.set("fs.default.name", "file:///");
    config.set("yarn.nodemanager.container-executor.class",
            "org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor");

    // Must allow Cygwin instance to config its own rootURI
    if (!"CYGWIN".equals(hbaseRootDirUri)) {
        config.set("hbase.rootdir", hbaseRootDirUri);
    }

    //
    // Threads, timeouts
    //
    config.setLong("hbase.rpc.timeout", MINUTES.toMillis(2));
    config.setLong("hbase.client.scanner.timeout.period", MINUTES.toMillis(2)); // hbase.regionserver.lease.period is deprecated
    config.setLong("hbase.client.operation.timeout", MINUTES.toMillis(2));
    config.setLong("hbase.regionserver.handler.count", 200);
    config.setLong("hbase.regionserver.msginterval", 1000);
    config.setLong("hbase.master.event.waiting.time", 20);
    config.setLong("hbase.master.lease.thread.wakefrequency", SECONDS.toMillis(3));
    // config.setBoolean("hbase.master.loadbalance.bytable", true);
    config.setInt("hbase.balancer.period", 5000);
    config.setLong("hbase.server.thread.wakefrequency", SECONDS.toMillis(1));
    config.setLong("hbase.client.pause", 100);

    //
    // Compaction Controls
    //
    config.setLong("hbase.hstore.compaction.min", 5); // min number of eligible files before we compact
    config.setLong("hbase.hstore.compaction.max", 10); // max files to be selected for a single minor compaction
    config.setLong("hbase.hstore.compaction.min.size", 16 * MiB); // store files smaller than this will always be eligible for minor compaction. HFiles this size or larger are evaluated by hbase.hstore.compaction.ratio to determine if they are eligible
    config.setLong("hbase.hstore.compaction.max.size", 248 * MiB); // store files larger than this will be excluded from compaction
    config.setFloat("hbase.hstore.compaction.ratio", 1.25f); // default is 1.2f, at one point we had this set to 0.25f and 25f (which was likely a typo)

    //
    // Memstore, store files, splits
    //
    config.setLong(HConstants.HREGION_MAX_FILESIZE, 32 * MiB); // hbase.hregion.max.filesize
    config.setLong("hbase.hregion.memstore.flush.size", 128 * MiB); // was 512 MiB
    config.setLong("hbase.hregion.memstore.block.multiplier", 4);
    config.setFloat("hbase.regionserver.global.memstore.size", 0.25f); // set mem store to 25% of heap
    config.setLong("hbase.hstore.blockingStoreFiles", 20);
    // config.set("hbase.regionserver.region.split.policy", "org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy"); // change default split policy. this makes more sense for a standalone/single regionserver

    // Support SI
    //config.setClass(HConstants.MVCC_IMPL, SIMultiVersionConsistencyControl.class, ConsistencyControl.class);

    //
    // HFile
    //
    config.setInt("hfile.index.block.max.size", 16 * 1024); // 16KiB
    config.setFloat("hfile.block.cache.size", 0.25f); // set block cache to 25% of heap
    config.setFloat("io.hfile.bloom.error.rate", (float) 0.005);
    config.setBoolean(CacheConfig.CACHE_BLOOM_BLOCKS_ON_WRITE_KEY, true); // hfile.block.bloom.cacheonwrite
    //config.set("hbase.master.hfilecleaner.plugins", getHFileCleanerAsString());
    config.set("hbase.master.hfilecleaner.plugins", getHFileCleanerAsString());

    //
    // Misc
    //
    config.set("hbase.cluster.distributed", "true"); // don't start zookeeper for us
    config.set("hbase.master.distributed.log.splitting", "false"); // TODO: explain why we are setting this

    // AWS Credentials for test...
    //
    config.set(ACCESS_KEY, "AKIAJ6HBMCK5ALHVBFPQ");
    config.set(SECRET_KEY, "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");

    //
    // Splice
    //
    config.setLong("splice.ddl.drainingWait.maximum", SECONDS.toMillis(15)); // wait 15 seconds before bailing on bad ddl statements
    config.setLong("splice.ddl.maxWaitSeconds", 120000);

    //
    // Snapshots
    //
    config.setBoolean("hbase.snapshot.enabled", true);

    HConfiguration.reloadConfiguration(config);
    return HConfiguration.unwrapDelegate();
}
From source file:com.taobao.adfs.distributed.DistributedDataTest.java
License:Apache License
@BeforeClass
static public void setupAfterClass() throws Throwable {
    Utilities.configureLog4j(null, "distributed.logger.conf.", Level.DEBUG);
    Configuration conf = new Configuration(false);
    conf.set("distributed.data.path", "target/test" + DistributedDataTest.class.getSimpleName());
    conf.setLong("distributed.data.delete.check.interval.time", 1);
    conf.set("distributed.data.format", "true");
    exampleData = new ExampleData(conf);
    exampleData.format();
}
From source file:com.teradata.benchto.generator.HiveTypesGenerator.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            Option.builder("format").required().hasArg().desc("file format (orc, parquet or text)").build());
    options.addOption(Option.builder("type").required().hasArg().desc(
            "hive type to be generated (bigint, int, boolean, double, binary, date, timestamp, string, decimal or varchar)")
            .build());
    options.addOption(Option.builder("rows").required().hasArg().desc("total row count").build());
    options.addOption(Option.builder("mappers").required().hasArg().desc("total mappers count").build());
    options.addOption(Option.builder("path").hasArg()
            .desc("base path for generating files, default is: /benchmarks/benchto/types").build());
    options.addOption(Option.builder("regex").numberOfArgs(3)
            .desc("generate varchars from regex pattern, arguments are: pattern, min length, max length")
            .build());

    CommandLine line;
    String format;
    String hiveType;
    long numberOfRows;
    long numberOfFiles;
    String basePath;
    Optional<String> regexPattern = Optional.absent();
    Optional<Integer> regexMinLength = Optional.absent();
    Optional<Integer> regexMaxLength = Optional.absent();

    try {
        line = new DefaultParser().parse(options, args);
        format = line.getOptionValue("format");
        hiveType = line.getOptionValue("type");
        numberOfRows = parseLong(line.getOptionValue("rows"));
        numberOfFiles = parseLong(line.getOptionValue("mappers"));
        basePath = line.getOptionValue("path", "/benchmarks/benchto/types");
        if (line.hasOption("regex")) {
            String[] values = line.getOptionValues("regex");
            regexPattern = Optional.of(values[0]);
            regexMinLength = Optional.of(parseInt(values[1]));
            regexMaxLength = Optional.of(parseInt(values[2]));
        }
    } catch (Exception e) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("benchto-generator", options);
        throw e;
    }

    String jobName = format("GenerateData-%s-%s-%d", format, hiveType, numberOfRows);
    Path outputDir = new Path(format("%s/%s-%s/%d", basePath, format, hiveType, numberOfRows));
    Class<? extends OutputFormat> outputFormatClass = getOutputFormatClass(format);

    LOG.info("Generating " + numberOfRows + " " + hiveType + "s, directory: " + outputDir
            + ", number of files: " + numberOfFiles);

    Configuration configuration = new Configuration();
    configuration.set(FORMAT_PROPERTY_NAME, format);
    configuration.set(HIVE_TYPE_PROPERTY_NAME, hiveType);
    configuration.setLong(NUM_ROWS_PROPERTY_NAME, numberOfRows);
    configuration.setLong(NUM_MAPS, numberOfFiles);
    if (regexPattern.isPresent()) {
        configuration.set(REGEX_PATTERN, regexPattern.get());
        configuration.setInt(REGEX_MIN_LENGTH, regexMinLength.get());
        configuration.setInt(REGEX_MAX_LENGTH, regexMaxLength.get());
    }

    Job generatorJob = Job.getInstance(configuration, jobName);
    FileOutputFormat.setOutputPath(generatorJob, outputDir);
    ParquetOutputFormat.setWriteSupportClass(generatorJob, DataWritableWriteSupport.class);
    generatorJob.setJarByClass(HiveTypesGenerator.class);
    generatorJob.setMapperClass(HiveTypesMapper.class);
    generatorJob.setNumReduceTasks(0);
    generatorJob.setOutputKeyClass(NullWritable.class);
    generatorJob.setOutputValueClass(Writable.class);
    generatorJob.setInputFormatClass(CounterInputFormat.class);
    generatorJob.setOutputFormatClass(outputFormatClass);

    return generatorJob.waitForCompletion(true) ? 0 : 1;
}
From source file:com.twitter.algebra.nmf.NMFCommon.java
License:Apache License
public static void setNumberOfMapSlots(Configuration conf, FileSystem fs, Path[] paths, String joblabel) {
    if (conf.get(MAPSPLOTS) == null)
        return;
    int mapSlots = conf.getInt(MAPSPLOTS, 1);
    mapSlots = conf.getInt(MAPSPLOTS + "." + joblabel, mapSlots);
    long du = 0;
    try {
        for (Path path : paths)
            du += MapDir.du(path, fs);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    long splitSize = du / mapSlots;
    log.info("du: " + du + " mapSlots: " + mapSlots + " splitSize: " + splitSize);
    long minSplitSize = (long) (splitSize * 0.9);
    long maxSplitSize = Math.max((long) (splitSize * 1.1), 1024 * 1024);
    conf.setLong("mapred.min.split.size", minSplitSize);
    conf.setLong("mapreduce.min.split.size", minSplitSize);
    conf.setLong("mapred.max.split.size", maxSplitSize);
    conf.setLong("mapreduce.max.split.size", maxSplitSize);
}
From source file:com.twitter.hraven.etl.JobFileProcessor.java
License:Apache License
/**
 * @param conf
 *          used to connect to HBase
 * @param cluster
 *          for which we are processing
 * @param reprocess
 *          Reprocess those records that may have been processed already.
 *          Otherwise successfully processed job files are skipped.
 * @param reprocessOnly
 *          process only those raw records that were marked to be reprocessed.
 *          When true then reprocess argument is ignored and is assumed to be
 *          true.
 * @param batchSize
 *          the total number of jobs to process in a batch (a MR job scanning
 *          these many records in the raw table).
 * @param minJobId
 *          used to start the scan. If null then there is no min limit on
 *          JobId.
 * @param maxJobId
 *          used to end the scan (inclusive). If null then there is no max
 *          limit on jobId.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 * @throws ExecutionException
 * @throws RowKeyParseException
 */
private List<JobRunner> getJobRunners(Configuration conf, String cluster, boolean reprocess, int batchSize,
        String minJobId, String maxJobId)
        throws IOException, InterruptedException, ClassNotFoundException, RowKeyParseException {
    List<JobRunner> jobRunners = new LinkedList<JobRunner>();

    JobHistoryRawService jobHistoryRawService = new JobHistoryRawService(conf);
    try {
        // Bind all MR jobs together with one runID.
        long now = System.currentTimeMillis();
        conf.setLong(Constants.MR_RUN_CONF_KEY, now);

        List<Scan> scanList = jobHistoryRawService.getHistoryRawTableScans(cluster, minJobId, maxJobId,
                reprocess, batchSize);

        for (Scan scan : scanList) {
            Job job = getProcessingJob(conf, scan, scanList.size());
            JobRunner jobRunner = new JobRunner(job, null);
            jobRunners.add(jobRunner);
        }
    } finally {
        IOException caught = null;
        try {
            jobHistoryRawService.close();
        } catch (IOException ioe) {
            caught = ioe;
        }
        if (caught != null) {
            throw caught;
        }
    }
    return jobRunners;
}
From source file:com.twitter.hraven.etl.JobFileRawLoader.java
License:Apache License
/**
 * @param myHBaseConf
 *          used to contact HBase and to run jobs against. Should be an HBase
 *          configuration.
 * @param cluster
 *          for which to process records.
 * @param processFileSubstring
 *          return rows where the process file path contains this string. If
 *          <code>null</code> or empty string, then no filtering is applied.
 * @param forceReprocess
 *          whether all jobs for which a file is loaded needs to be
 *          reprocessed.
 * @return whether all job files for all processRecords were properly
 *         processed.
 * @throws IOException
 * @throws ClassNotFoundException
 *           when problems occur setting up the job.
 * @throws InterruptedException
 */
private boolean processRecordsFromHBase(Configuration myHBaseConf, String cluster,
        String processFileSubstring, boolean forceReprocess)
        throws IOException, InterruptedException, ClassNotFoundException {

    int failures = 0;

    ProcessRecordService processRecordService = new ProcessRecordService(myHBaseConf);

    // Grab all records.
    List<ProcessRecord> processRecords = processRecordService.getProcessRecords(cluster, PREPROCESSED,
            Integer.MAX_VALUE, processFileSubstring);

    try {
        LOG.info("ProcessRecords for " + cluster + ": " + processRecords.size());

        // Bind all MR jobs together with one runID.
        long now = System.currentTimeMillis();
        myHBaseConf.setLong(Constants.MR_RUN_CONF_KEY, now);
        myHBaseConf.setBoolean(Constants.FORCE_REPROCESS_CONF_KEY, forceReprocess);

        // Iterate over 0 based list in reverse order
        for (int j = processRecords.size() - 1; j >= 0; j--) {
            ProcessRecord processRecord = processRecords.get(j);
            LOG.info("Processing " + processRecord);

            boolean success = runRawLoaderJob(myHBaseConf, processRecord.getProcessFile(),
                    processRecords.size());
            // Bail out on first failure.
            if (success) {
                processRecordService.setProcessState(processRecord, ProcessState.LOADED);
            } else {
                failures++;
            }
        }
    } finally {
        processRecordService.close();
    }

    return (failures == 0);
}
From source file:com.wandisco.s3hdfs.rewrite.filter.TestBase.java
License:Apache License
/**
 * @throws java.lang.Exception
 */
@Before
public void setUp() throws Exception {
    Configuration conf = new HdfsConfiguration(new S3HdfsConfiguration());
    conf.setInt(S3_PROXY_PORT_KEY, PROXY_PORT);
    conf.setBoolean(DFS_WEBHDFS_ENABLED_KEY, true);
    conf.setInt(DFS_DATANODE_SCAN_PERIOD_HOURS_KEY, 100);
    conf.setLong(DFS_BLOCK_SIZE_KEY, 1024);
    conf.setLong(DFS_NAMENODE_MIN_BLOCK_SIZE_KEY, 512);
    // ^ has to be a multiple of 512

    FsPermission.setUMask(conf, FsPermission.createImmutable((short) 0));
    // ^ eliminate the UMask in HDFS to remove perm denied exceptions in s3Dir

    hostName = conf.get(S3_SERVICE_HOSTNAME_KEY);
    System.out.println("S3HDFS ServiceHostName: " + hostName);
    s3Directory = conf.get(S3_DIRECTORY_KEY);

    cluster = new MiniDFSCluster.Builder(conf).nameNodeHttpPort(HTTP_PORT).numDataNodes(3).build();
    cluster.waitActive();
    hdfs = cluster.getFileSystem();

    // initialize s3 directory
    Path s3Path = new Path(s3Directory);
    assertTrue(hdfs.mkdirs(s3Path));

    testUtil = new S3HdfsTestUtil(hdfs, s3Directory);
    s3Service = testUtil.configureS3Service(hostName, PROXY_PORT);
}
From source file:com.wipro.ats.bdre.datagen.mr.Driver.java
License:Apache License
/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = getConf();
    GetGeneralConfig generalConfig = new GetGeneralConfig();
    GeneralConfig gc = generalConfig.byConigGroupAndKey("imconfig", "common.default-fs-name");
    conf.set("fs.defaultFS", gc.getDefaultVal());

    String processId = args[0];
    Path outputDir = new Path(ResolvePath.replaceVars(args[1]));

    Properties dataProps = Config.getDataProperties(processId);
    Properties tableProps = Config.getTableProperties(processId);

    TableUtil tableUtil = new TableUtil();
    Table table = tableUtil.formTableFromConfig(processId);
    FileSystem fs = FileSystem.get(conf);
    LOGGER.info("Default FS =" + conf.get("fs.defaultFS"));

    // set in the conf for mappers to use
    conf.set(Config.SEPARATOR_KEY, tableProps.getProperty("separator"));
    conf.set(Config.PID_KEY, processId);
    conf.setLong(Config.NUM_ROWS_KEY, Long.parseLong(dataProps.getProperty("numRows")));
    conf.setInt(Config.NUM_SPLITS_KEY, Integer.parseInt(dataProps.getProperty("numSplits")));

    Job job = Job.getInstance(conf);
    Path mrOutputPath = new Path(outputDir.toString() + "/MROUT/" + table.getTableName());
    FileOutputFormat.setOutputPath(job, mrOutputPath);
    job.setJobName("Datagen-" + table.getTableName());
    job.setJarByClass(Driver.class);
    job.setMapperClass(RecordGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.waitForCompletion(true);

    // merge and create a single file
    Path srcDir = mrOutputPath;
    Path destFile = new Path(outputDir.toString() + "/" + table.getTableName());
    FileUtil.copyMerge(fs, srcDir, fs, destFile, true, conf, "");

    // Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash("0");
    registerFileInfo.setFileSize(0L);
    registerFileInfo.setPath(destFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}
From source file:com.yahoo.glimmer.indexing.generator.TripleIndexGenerator.java
License:Open Source License
public int run(String[] args) throws Exception {
    SimpleJSAP jsap = new SimpleJSAP(TripleIndexGenerator.class.getName(),
            "Generates a keyword index from RDF data.",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', "withoutContexts",
                            "Don't process the contexts for each tuple."),
                    new FlaggedOption(METHOD_ARG, JSAP.STRING_PARSER, "horizontal", JSAP.REQUIRED, 'm',
                            METHOD_ARG, "horizontal or vertical."),
                    new FlaggedOption(PREDICATES_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
                            'p', PREDICATES_ARG, "Subset of the properties to be indexed."),
                    new FlaggedOption(RESOURCE_PREFIX_ARG, JSAP.STRING_PARSER, "@", JSAP.NOT_REQUIRED, 'r',
                            RESOURCE_PREFIX_ARG,
                            "Prefix to add to object resource hash values when indexing. Stops queries for numbers matching resource hash values. Default is '@'"),
                    new UnflaggedOption("input", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the input data."),
                    new UnflaggedOption(NUMBER_OF_DOCS_ARG, JSAP.LONG_PARSER, JSAP.REQUIRED,
                            "Number of documents to index"),
                    new UnflaggedOption("output", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the output."),
                    new UnflaggedOption(RESOURCES_HASH_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location of the resources hash file."), });

    JSAPResult jsapResult = jsap.parse(args);

    // check whether the command line was valid, and if it wasn't,
    // display usage information and exit.
    if (!jsapResult.success()) {
        System.err.println();
        System.err.println("Usage: java " + TripleIndexGenerator.class.getName());
        System.err.println(" " + jsap.getUsage());
        System.err.println();
        System.exit(1);
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(TripleIndexGenerator.class);
    job.setJobName("TripleIndexGenerator" + System.currentTimeMillis());

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString("input")));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(DocumentMapper.class);
    job.setMapOutputKeyClass(TermKey.class);
    job.setMapOutputValueClass(TermValue.class);

    job.setPartitionerClass(TermKey.FirstPartitioner.class);
    job.setGroupingComparatorClass(TermKey.FirstGroupingComparator.class);

    job.setReducerClass(TermReduce.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IndexRecordWriterValue.class);
    job.setOutputFormatClass(IndexRecordWriter.OutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(jsapResult.getString("output")));

    Configuration conf = job.getConfiguration();
    conf.setClass("mapred.output.key.comparator.class", TermKey.Comparator.class, WritableComparator.class);
    conf.set("mapreduce.user.classpath.first", "true");

    long numDocs = jsapResult.getLong(NUMBER_OF_DOCS_ARG);
    conf.setLong(NUMBER_OF_DOCUMENTS, numDocs);

    // Set this in an attempt to get around the 2GB of ram task limit on our cluster.
    // Setting this in the hope of fixing Direct buffer memory errors
    conf.setInt(INDEX_WRITER_CACHE_SIZE, 1024 * 1024);

    conf.set(OUTPUT_DIR, jsapResult.getString("output"));

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_HORIZONTAL)) {
        HorizontalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG));
    } else if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_VERTICAL)) {
        if (!jsapResult.contains(PREDICATES_ARG)) {
            throw new IllegalArgumentException("When '" + METHOD_ARG + "' is '" + METHOD_ARG_VALUE_VERTICAL
                    + "' you have to give a predicates file too.");
        }
        VerticalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG), jsapResult.getString(PREDICATES_ARG));
    } else {
        throw new IllegalArgumentException(METHOD_ARG + " should be '" + METHOD_ARG_VALUE_HORIZONTAL + "' or '"
                + METHOD_ARG_VALUE_VERTICAL + "'");
    }

    conf.setInt("mapreduce.input.linerecordreader.line.maxlength", 1024 * 1024);

    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
}
From source file:de.l3s.content.timex.extracting.ClueWeb09Timex.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("input").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("output").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("column").hasArg()
            .withDescription("column to store row data into (must exist)").create(COLUMN));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    cmdline = parser.parse(options, args);

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    if (!cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    // String column = cmdline.getOptionValue(COLUMN);

    LOG.info("Tool name: " + ClueWeb09Timex.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);
    // LOG.info(" - column: " + column);

    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", "node05.ib,node03.ib,node04.ib");
    conf.set("hbase.zookeeper.property.clientPort", "2181");
    conf.set("hbase.master", "master.ib");
    // conf.set("conf.column", column);

    long milliSeconds = 10000 * 60 * 60; // x10 default
    conf.setLong("mapred.task.timeout", milliSeconds);

    Job job = Job.getInstance(conf, ClueWeb09Timex.class.getSimpleName()
            + " time-confident extraction + annotation + HBase import: " + input);
    //Configuration conf = new Configuration();
    //Job job = Job.getInstance(conf, "web pages count");
    job.setJarByClass(ClueWeb09Timex.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(ClueWeb09InputFormat.class);
    job.setOutputFormatClass(TableOutputFormat.class);
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, output);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    job.setMapperClass(TMapper.class);
    //job.setReducerClass(IntSumReducer.class);
    //job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(input));
    //FileOutputFormat.setOutputPath(job, new Path(output));

    job.waitForCompletion(true);
    return 0;
}