List of usage examples for org.apache.hadoop.conf Configuration setLong
public void setLong(String name, long value)
Sets the value of the name property to a long.
Parameters: name – the property name to set; value – the long value of the property.
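Before the source-file examples, here is a minimal, self-contained sketch of the call; the property key used below is made up for illustration and is not a standard Hadoop key. It stores a long under a name and reads it back with getLong, which returns the supplied default when the key is unset.

import org.apache.hadoop.conf.Configuration;

public class SetLongExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // "example.scan.interval.ms" is an illustrative key, not a standard Hadoop property
        conf.setLong("example.scan.interval.ms", 10L * 60 * 1000);
        // getLong returns the stored value, or the default (0L here) if the key is unset
        long interval = conf.getLong("example.scan.interval.ms", 0L);
        System.out.println(interval); // prints 600000
    }
}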
From source file:de.l3s.content.timex.extracting.ClueWeb09TimexWriteToHDFS.java
License:Apache License
/** Runs this tool. */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("input").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("output").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    cmdline = parser.parse(options, args);
    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    if (!cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    LOG.info("Tool name: " + ClueWeb09TimexWriteToHDFS.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);
    Configuration conf = new Configuration();
    long milliSeconds = 10000 * 60 * 60; // 10,000 * 60 * 60 ms = 10 hours (default task timeout is 10 minutes)
    conf.setLong("mapred.task.timeout", milliSeconds);
    Job job = Job.getInstance(conf, "extract CW tempex and output to HDFS");
    job.setJarByClass(ClueWeb09TimexWriteToHDFS.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(ClueWeb09InputFormat.class);
    job.setMapperClass(TMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));
    job.waitForCompletion(true);
    return 0;
}
From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java
License:Apache License
/** pass2: perform the LLR calculation */
private static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal,
        boolean emitUnigrams, float minValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    conf.setLong(AssocReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    conf.setFloat(AssocReducer.MIN_VALUE, minValue);
    conf.setInt("mapred.job.map.memory.mb", 1280);
    conf.setInt("mapred.job.reduce.memory.mb", 2560);
    conf.set("mapred.reduce.child.java.opts", "-Xmx2G");
    conf.setInt("mapred.task.timeout", 6000000);
    conf.set(AssocReducer.ASSOC_METRIC, "llr");

    Job job = new Job(conf);
    job.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output + " pruning: " + minValue);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(Gram.class);
    job.setMapOutputValueClass(Gram.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(job, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY + "_llr");
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.setReducerClass(AssocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    // Defines additional single text based output 'text' for the job
    MultipleOutputs.addNamedOutput(job, "contingency", TextOutputFormat.class, Text.class, Text.class);
    // Defines additional multi sequencefile based output 'sequence' for the job
    MultipleOutputs.addNamedOutput(job, "llr", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "pmi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "chi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "dice", TextOutputFormat.class, Text.class, DoubleWritable.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.standalone.WarcBoilerplateRemoval.java
License:Apache License
public static void processWarcGzFile(File input, File outFile, boolean keepMinimalHtml) throws IOException {
    System.out.printf("Reading from %s, writing to %s%n", input, outFile);

    Configuration conf = new Configuration();
    // set limit to 100 GB (= almost unlimited)
    conf.setLong("warc.output.segment.size", WARCFileWriter.DEFAULT_MAX_SEGMENT_SIZE * 100);

    // Opens a file for reading.
    CompressionCodec codec = WARCFileWriter.getGzipCodec(conf);
    InputStream byteStream = new BufferedInputStream(new FileInputStream(input));
    DataInputStream dataStream = new DataInputStream(
            codec == null ? byteStream : codec.createInputStream(byteStream));

    BoilerPlateRemoval boilerPlateRemoval = new JusTextBoilerplateRemoval();

    long startTime = System.currentTimeMillis();
    int counter = 0;
    int recordsRead = 0;

    Path outputPath = new Path(outFile.getAbsolutePath());
    WARCFileWriter warcFileWriter = new WARCFileWriter(conf, codec, outputPath);

    // detecting the correct charset
    final CharsetDetector charsetDetector = new ICUCharsetDetectorWrapper();

    while (true) {
        try {
            // Reads the next record from the file.
            WARCRecord wc = new WARCRecord(dataStream);

            // detect charset
            byte[] bytes = wc.getContent();
            Charset charset = charsetDetector.detectCharset(bytes);
            String html = new String(bytes, charset);

            // strip HTTP header
            html = html.substring(html.indexOf("\r\n\r\n") + 4);

            String plainText;
            if (keepMinimalHtml) {
                plainText = boilerPlateRemoval.getMinimalHtml(html, null);
            } else {
                plainText = boilerPlateRemoval.getPlainText(html, null);
            }

            counter++;
            if (counter % 100 == 0) {
                System.out.printf(Locale.ENGLISH, "~%.1f entries per second%n",
                        counter * 1000f / (double) (System.currentTimeMillis() - startTime));
                System.out.printf(Locale.ENGLISH, "%d records processed%n", recordsRead);
            }
            recordsRead++;

            // create copy of WarcRecord
            WARCRecord newWarcRecord = new WARCRecord(wc);
            newWarcRecord.setContent(plainText);

            warcFileWriter.write(newWarcRecord);
        } catch (EOFException e) {
            break;
        }
    }
    warcFileWriter.close();

    // rename from out.warc.gz.seg-00000.warc.gz to out.warc.gz
    File actualOutputFile = new File(outFile.getAbsolutePath() + ".seg-00000.warc.gz");
    if (!actualOutputFile.exists()) {
        throw new IOException("File " + actualOutputFile + " does not exist");
    }
    if (!actualOutputFile.renameTo(outFile)) {
        throw new IOException("Renaming file " + actualOutputFile + " to " + outFile + " failed");
    }

    // delete .crc file
    File crcFile = new File(actualOutputFile.getParentFile(), "." + actualOutputFile.getName() + ".crc");
    if (!crcFile.delete()) {
        throw new IOException(crcFile + " was not deleted");
    }

    System.out.printf(Locale.ENGLISH, "%d records written to %s, total time %f%n", recordsRead,
            outFile.getName(), counter * 1000f / (double) (System.currentTimeMillis() - startTime));
}
From source file:edu.rosehulman.CollocDriver.java
License:Apache License
/** pass2: perform the LLR calculation */
private static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal,
        boolean emitUnigrams, float minLLRValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    conf.setLong(LLRReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    conf.setFloat(LLRReducer.MIN_LLR, minLLRValue);

    Job job = new Job(conf);
    job.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(Gram.class);
    job.setMapOutputValueClass(Gram.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(job, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(LLRReducer.class);
    job.setNumReduceTasks(reduceTasks);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:hadoop.SleepJob.java
License:Apache License
public Job createJob(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) throws IOException {
    Configuration conf = getConf();
    conf.setLong(MAP_SLEEP_TIME, mapSleepTime);
    conf.setLong(REDUCE_SLEEP_TIME, reduceSleepTime);
    conf.setInt(MAP_SLEEP_COUNT, mapSleepCount);
    conf.setInt(REDUCE_SLEEP_COUNT, reduceSleepCount);
    conf.setInt(MRJobConfig.NUM_MAPS, numMapper);
    Job job = new Job(conf, "sleep");
    job.setNumReduceTasks(numReducer);
    job.setJarByClass(SleepJob.class);
    job.setMapperClass(SleepMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(SleepReducer.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setInputFormatClass(SleepInputFormat.class);
    job.setPartitionerClass(SleepJobPartitioner.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    return job;
}
From source file:hudson.gridmaven.gridlayer.DataNodeStartTask.java
License:Open Source License
public Void call() throws IOException {
    System.out.println("Starting data node");
    //System.setProperty("java.net.preferIPv4Stack", "true");
    Configuration conf = new Configuration();
    conf.set("fs.default.name", hdfsUrl);
    conf.set("dfs.data.dir", new File(new File(rootPath), "hadoop/datanode").getAbsolutePath());
    conf.set("dfs.datanode.address", "0.0.0.0:0");
    conf.set("dfs.datanode.http.address", "0.0.0.0:0");
    conf.set("dfs.datanode.ipc.address", "0.0.0.0:0");
    conf.set("slave.host.name", slaveHostName);
    conf.set("dfs.safemode.extension", "1");
    conf.set("dfs.namenode.logging.level", "ALL");
    conf.set("dfs.block.size", "1048576");

    // TODO: make this configurable
    // make room for builds
    conf.setLong("dfs.datanode.du.reserved", 1L * 1024 * 1024 * 1024);

    DataNode dn = DataNode.instantiateDataNode(new String[0], conf);
    DataNode.runDatanodeDaemon(dn);
    return null;
}
From source file:hudson.plugins.hadoop.DataNodeStartTask.java
License:Open Source License
public Void call() throws IOException {
    System.out.println("Starting data node");

    Configuration conf = new Configuration();
    conf.set("fs.default.name", hdfsUrl);
    conf.set("dfs.data.dir", new File(new File(rootPath), "hadoop/datanode").getAbsolutePath());
    conf.set("dfs.datanode.address", "0.0.0.0:0");
    conf.set("dfs.datanode.http.address", "0.0.0.0:0");
    conf.set("dfs.datanode.ipc.address", "0.0.0.0:0");
    conf.set("slave.host.name", slaveHostName);

    // TODO: make this configurable
    // make room for builds
    conf.setLong("dfs.datanode.du.reserved", 10L * 1024 * 1024 * 1024);

    DataNode dn = DataNode.instantiateDataNode(new String[0], conf);
    DataNode.runDatanodeDaemon(dn);
    return null;
}
From source file:io.covert.dns.storage.accumulo.AccumuloStorageModuleFactory.java
License:Apache License
public static void configure(Job job, String inst, String zooKeepers, String user, String password,
        long maxMemory, long maxLatency, int maxWriteThreads,
        Collection<Class<? extends MutationGeneratorFactory>> generatorFactoryClasses) {
    Configuration conf = job.getConfiguration();

    StringBuilder factories = new StringBuilder();
    boolean first = true;
    for (Class<? extends MutationGeneratorFactory> clz : generatorFactoryClasses) {
        if (first) {
            first = false;
            factories.append(clz.getName());
        } else {
            factories.append(",").append(clz.getName());
        }
    }

    conf.set("storage.module.factory", AccumuloStorageModuleFactory.class.getName());
    conf.set("accumulo.storage.module.mutation.generator.factories", factories.toString());
    conf.set("accumulo.storage.module.instance.name", inst);
    conf.set("accumulo.storage.module.zookeepers", zooKeepers);
    conf.set("accumulo.storage.module.user", user);
    conf.set("accumulo.storage.module.password", password);
    conf.setLong("accumulo.storage.module.max.memory", maxMemory);
    conf.setLong("accumulo.storage.module.max.latency", maxLatency);
    conf.setInt("accumulo.storage.module.max.write.threads", maxWriteThreads);
}
From source file:io.dataapps.chlorine.hadoop.HDFSScanMR.java
License:Apache License
public static Job makeJob(Configuration conf, Path in, Path out, String matchPath, long scanSince,
        String chlorineConfigFilePath, String queue, String maskPath) throws IOException {
    conf.setBoolean("mapred.output.compress", false);
    conf.setLong("scanSince", scanSince);
    conf.set("matchPath", matchPath);
    conf.set("maskPath", maskPath);
    conf.set("inputPath", in.toString());
    if (queue != null) {
        conf.set("mapred.job.queue.name", queue);
    }
    conf.set("fs.permissions.umask-mode", "007");
    conf.setInt("input_path_depth", in.depth());

    Job job = Job.getInstance(conf, "Chlorine_HDFS_Scan");
    job.setJarByClass(HDFSScanMR.class);
    if (chlorineConfigFilePath != null) {
        try {
            job.addCacheFile(new URI(chlorineConfigFilePath));
            conf.set("finder_file", (new File(chlorineConfigFilePath)).getName());
        } catch (URISyntaxException e) {
            LOG.error(e);
        }
    }
    job.setMapperClass(DeepScanMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, in);
    TextInputFormat.setInputDirRecursive(job, true);
    TextInputFormat.setInputPathFilter(job, NewFilesFilter.class);
    FileOutputFormat.setOutputPath(job, out);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    return job;
}
From source file:io.hops.security.HopsUtil.java
License:Apache License
private static Configuration generateSSLServerConf(Configuration conf, String cryptoMaterialPassword) {
    Configuration sslConf = new Configuration(false);
    sslConf.set(
            FileBasedKeyStoresFactory.resolvePropertyName(SSLFactory.Mode.SERVER,
                    FileBasedKeyStoresFactory.SSL_KEYSTORE_LOCATION_TPL_KEY),
            HopsSSLSocketFactory.LOCALIZED_KEYSTORE_FILE_NAME);
    sslConf.set(FileBasedKeyStoresFactory.resolvePropertyName(SSLFactory.Mode.SERVER,
            FileBasedKeyStoresFactory.SSL_KEYSTORE_PASSWORD_TPL_KEY), cryptoMaterialPassword);
    sslConf.set(FileBasedKeyStoresFactory.resolvePropertyName(SSLFactory.Mode.SERVER,
            FileBasedKeyStoresFactory.SSL_KEYSTORE_KEYPASSWORD_TPL_KEY), cryptoMaterialPassword);
    sslConf.set(
            FileBasedKeyStoresFactory.resolvePropertyName(SSLFactory.Mode.SERVER,
                    FileBasedKeyStoresFactory.SSL_TRUSTSTORE_LOCATION_TPL_KEY),
            HopsSSLSocketFactory.LOCALIZED_TRUSTSTORE_FILE_NAME);
    sslConf.set(FileBasedKeyStoresFactory.resolvePropertyName(SSLFactory.Mode.SERVER,
            FileBasedKeyStoresFactory.SSL_TRUSTSTORE_PASSWORD_TPL_KEY), cryptoMaterialPassword);
    sslConf.set(
            FileBasedKeyStoresFactory.resolvePropertyName(SSLFactory.Mode.SERVER,
                    FileBasedKeyStoresFactory.SSL_PASSWORDFILE_LOCATION_TPL_KEY),
            HopsSSLSocketFactory.LOCALIZED_PASSWD_FILE_NAME);

    Configuration sslClientConf = new Configuration(false);
    String sslClientResource = conf.get(SSLFactory.SSL_CLIENT_CONF_KEY, "ssl-client.xml");
    sslClientConf.addResource(sslClientResource);
    long keyStoreReloadInterval = sslClientConf.getLong(
            FileBasedKeyStoresFactory.resolvePropertyName(SSLFactory.Mode.CLIENT,
                    FileBasedKeyStoresFactory.SSL_KEYSTORE_RELOAD_INTERVAL_TPL_KEY),
            FileBasedKeyStoresFactory.DEFAULT_SSL_KEYSTORE_RELOAD_INTERVAL);
    String timeUnitStr = sslClientConf.get(
            FileBasedKeyStoresFactory.resolvePropertyName(SSLFactory.Mode.CLIENT,
                    FileBasedKeyStoresFactory.SSL_KEYSTORE_RELOAD_TIMEUNIT_TPL_KEY),
            FileBasedKeyStoresFactory.DEFAULT_SSL_KEYSTORE_RELOAD_TIMEUNIT);
    long trustStoreReloadInterval = sslClientConf.getLong(
            FileBasedKeyStoresFactory.resolvePropertyName(SSLFactory.Mode.CLIENT,
                    FileBasedKeyStoresFactory.SSL_TRUSTSTORE_RELOAD_INTERVAL_TPL_KEY),
            FileBasedKeyStoresFactory.DEFAULT_SSL_TRUSTSTORE_RELOAD_INTERVAL);

    sslConf.setLong(FileBasedKeyStoresFactory.resolvePropertyName(SSLFactory.Mode.SERVER,
            FileBasedKeyStoresFactory.SSL_KEYSTORE_RELOAD_INTERVAL_TPL_KEY), keyStoreReloadInterval);
    sslConf.set(FileBasedKeyStoresFactory.resolvePropertyName(SSLFactory.Mode.SERVER,
            FileBasedKeyStoresFactory.SSL_KEYSTORE_RELOAD_TIMEUNIT_TPL_KEY), timeUnitStr);
    sslConf.setLong(
            FileBasedKeyStoresFactory.resolvePropertyName(SSLFactory.Mode.SERVER,
                    FileBasedKeyStoresFactory.SSL_TRUSTSTORE_RELOAD_INTERVAL_TPL_KEY),
            trustStoreReloadInterval);

    return sslConf;
}