List of usage examples for org.apache.hadoop.conf.Configuration.getBoolean

public boolean getBoolean(String name, boolean defaultValue)

Gets the value of the name property as a boolean. If no such property is specified, or if the specified value is not a valid boolean, then defaultValue is returned.
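Before the project examples below, here is a minimal, self-contained sketch of the call itself. The property name my.feature.enabled is a hypothetical placeholder, not taken from any of the source files that follow:

import org.apache.hadoop.conf.Configuration;

public class GetBooleanExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // The property is not set, so the supplied default (false) is returned.
        boolean enabled = conf.getBoolean("my.feature.enabled", false);
        System.out.println(enabled); // prints: false
        // Once the property is set, the configured value takes precedence over the default.
        conf.setBoolean("my.feature.enabled", true);
        System.out.println(conf.getBoolean("my.feature.enabled", false)); // prints: true
    }
}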
From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocReducer.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    this.minSupport = conf.getInt(MIN_SUPPORT, DEFAULT_MIN_SUPPORT);
    boolean emitUnigrams = conf.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
    // Note: the configured value is immediately overridden here.
    emitUnigrams = true;
    log.info("Min support is {}", minSupport);
    log.info("Emit Unigrams is {}", emitUnigrams);
}
From source file:edu.indiana.d2i.htrc.corpus.clean.CleanCorpusDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    /*
     * Specify the # of reducers through -D mapred.reduce.tasks=<numOfReducers>
     * in the hadoop command line. Specify whether compression is used through
     * -D user.args.compression=<true/false>.
     */
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] </path/to/input/directory> </path/to/output/directory>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }
    Configuration conf = getConf();
    Job job = new Job(conf, "HTRC Cleaning Raw Corpus");
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJarByClass(CleanCorpusDriver.class);
    job.setMapperClass(CleanCorpusMapper.class);
    job.setReducerClass(CleanCorpusReducer.class);
    if (conf.getBoolean("user.args.compression", false)) {
        /* use compression */
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    }
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TextArrayWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.indiana.d2i.htrc.corpus.retrieve.RetrieveRawCorpusDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    /*
     * Specify the # of reducers through -D mapred.reduce.tasks=<numOfReducers>
     * in the hadoop command line. Specify using compression through
     * -D user.args.compression=true.
     */
    if (args.length != 3) {
        System.err.printf(
                "Usage: %s [generic options] </path/to/input/directory> </path/to/output/directory> </path/to/property/file>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }
    Configuration conf = getConf();
    Properties prop = new Properties();
    prop.load(new FileInputStream(args[2]));
    // Set configuration parameters.
    // Data API related parameters:
    conf.set(Constants.DATA_API_EPR, prop.getProperty(Constants.DATA_API_EPR));
    conf.set(Constants.DATA_API_CONCAT, prop.getProperty(Constants.DATA_API_CONCAT));
    conf.set(Constants.DATA_API_SELFSIGN, prop.getProperty(Constants.DATA_API_SELFSIGN));
    conf.set(Constants.DATA_API_DELIMITER, prop.getProperty(Constants.DATA_API_DELIMITER));
    conf.set(Constants.DATA_API_VOL_PREFIX, prop.getProperty(Constants.DATA_API_VOL_PREFIX));
    conf.set(Constants.DATA_API_PAGE_PREFIX, prop.getProperty(Constants.DATA_API_PAGE_PREFIX));
    conf.set(Constants.DATA_API_REQ_SIZE, prop.getProperty(Constants.DATA_API_REQ_SIZE));
    // OAuth2 related parameters:
    conf.set(Constants.OAUTH2_EPR, prop.getProperty(Constants.OAUTH2_EPR));
    conf.set(Constants.OAUTH2_USER_NAME, prop.getProperty(Constants.OAUTH2_USER_NAME));
    conf.set(Constants.OAUTH2_USER_PASSWORD, prop.getProperty(Constants.OAUTH2_USER_PASSWORD));
    // Set # of lines (volumes in our case) to be processed by one map task.
    conf.set("mapreduce.input.lineinputformat.linespermap",
            prop.getProperty(Constants.NUM_VOLUMES_PER_MAPPER));
    Job job = new Job(conf, "HTRC Retrieving Raw Corpus");
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJarByClass(RetrieveRawCorpusDriver.class);
    job.setMapperClass(RetrieveRawCorpusMapper.class);
    job.setReducerClass(RetrieveRawCorpusReducer.class);
    if (conf.getBoolean("user.args.compression", false)) {
        /* use compression */
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    }
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TextArrayWritable.class);
    job.setInputFormatClass(NLineInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.indiana.d2i.htrc.corpus.transform.CorpusTransformDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    /*
     * Specify the # of reducers through -D mapred.reduce.tasks=<numOfReducers>
     * in the hadoop command line. Specify whether compression is used through
     * -D user.args.compression=true, use -D user.args.wordset.filename=<wordset_filename>
     * to set the wordset filename, and use -files </local/path/to/wordset_file>
     * to distribute wordset_file to each compute node.
     */
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] </path/to/input/directory> </path/to/output/directory>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }
    Configuration conf = getConf();
    Job job = new Job(conf, "HTRC Transforming Corpus");
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJarByClass(CorpusTransformDriver.class);
    job.setMapperClass(CorpusTransformMapper.class);
    job.setReducerClass(CorpusTransformReducer.class);
    if (conf.getBoolean("user.args.compression", false)) {
        /* use compression */
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    }
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TextArrayWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.indiana.d2i.htrc.util.DataAPITestDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    String dataAPIConfClassName = args[0];
    int maxIdsPerReq = Integer.valueOf(args[1]);
    String queryStr = args[2];
    Configuration conf = getConf();
    Utilities.setDataAPIConf(conf, dataAPIConfClassName, maxIdsPerReq);
    int maxIdRetrieved = conf.getInt(HTRCConstants.MAX_ID_RETRIEVED, 100);
    String dataEPR = conf.get(HTRCConstants.HOSTS_SEPARATEDBY_COMMA).split(",")[0];
    String delimitor = conf.get(HTRCConstants.DATA_API_URL_DELIMITOR, "|");
    String clientID = conf.get(HTRCConstants.DATA_API_CLIENTID, "yim");
    String clientSecrete = conf.get(HTRCConstants.DATA_API_CLIENTSECRETE, "yim");
    String tokenLoc = conf.get(HTRCConstants.DATA_API_TOKENLOC,
            "https://129-79-49-119.dhcp-bl.indiana.edu:25443/oauth2/token?grant_type=client_credentials");
    boolean selfsigned = conf.getBoolean(HTRCConstants.DATA_API_SELFSIGNED, true);
    if (dataEPR.equals(HTRCConstants.DATA_API_DEFAULT_URL)) {
        dataEPR = HTRCConstants.DATA_API_DEFAULT_URL_PREFIX + dataEPR;
    }
    HTRCDataAPIClient dataClient = new HTRCDataAPIClient.Builder(dataEPR, delimitor).authentication(true)
            .selfsigned(selfsigned).clientID(clientID).clientSecrete(clientSecrete).tokenLocation(tokenLoc)
            .build();
    // String queryStr = "yale.39002052249902|uc2.ark:/13960/t88g8h13f|uc2.ark:/13960/t6sx67388|uc2.ark:/13960/t5j96547r|uc2.ark:/13960/t6ww79z3v|yale.39002085406669|miua.4918260.0305.001|uc2.ark:/13960/t3416xb23|uc2.ark:/13960/t86h4mv25|loc.ark:/13960/t2k64mv58|";
    Iterable<Entry<String, String>> entries = dataClient.getID2Content(queryStr);
    for (Entry<String, String> entry : entries) {
        System.out.println(entry.getKey());
    }
    return 0;
}
From source file:edu.indiana.d2i.htrc.util.DataCopyValidation.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String outputPath = args[0]; // result
    String dataAPIConfClassName = args[1];
    int maxIdsPerReq = Integer.valueOf(args[2]);
    logger.info("DataValidation ");
    logger.info(" - output: " + outputPath);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);
    Utilities.setDataAPIConf(conf, dataAPIConfClassName, maxIdsPerReq);
    // HTRCDataAPIClient client = Utilities.creatDataAPIClient(conf);
    String dataEPR = conf.get(HTRCConstants.HOSTS_SEPARATEDBY_COMMA,
            "https://129-79-49-119.dhcp-bl.indiana.edu:25443/data-api");
    String delimitor = conf.get(HTRCConstants.DATA_API_URL_DELIMITOR, "|");
    String clientID = conf.get(HTRCConstants.DATA_API_CLIENTID, "yim");
    String clientSecrete = conf.get(HTRCConstants.DATA_API_CLIENTSECRETE, "yim");
    String tokenLoc = conf.get(HTRCConstants.DATA_API_TOKENLOC,
            "https://129-79-49-119.dhcp-bl.indiana.edu:25443/oauth2/token?grant_type=client_credentials");
    boolean selfsigned = conf.getBoolean(HTRCConstants.DATA_API_SELFSIGNED, true);
    HTRCDataAPIClient client = new HTRCDataAPIClient.Builder(dataEPR, delimitor).authentication(true)
            .selfsigned(selfsigned).clientID(clientID).clientSecrete(clientSecrete).tokenLocation(tokenLoc)
            .build();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(outputPath), Utilities.HIDDEN_FILE_FILTER);
    Text key = new Text();
    Text value = new Text();
    for (int i = 0; i < status.length; i++) {
        SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[i].getPath(), conf);
        while (seqReader.next(key, value)) {
            // logger.info(key.toString());
            Iterable<Entry<String, String>> content = client.getID2Content(key.toString());
            Iterator<Entry<String, String>> iterator = content.iterator();
            Entry<String, String> entry = iterator.next();
            if (!entry.getValue().equals(value.toString())) {
                logger.error("Book : " + key.toString() + " corrupts!");
            }
        }
    }
    logger.info("Finish validation.");
    return 0;
}
From source file:edu.indiana.d2i.htrc.util.IDValidation.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    System.out.println("args.length " + args.length);
    String dataAPIConfClassName = args[0];
    int maxIdsPerReq = Integer.valueOf(args[1]);
    String idfile = args[2];
    String accurateIdfile = args[3];
    Configuration conf = getConf();
    Utilities.setDataAPIConf(conf, dataAPIConfClassName, maxIdsPerReq);
    int maxIdRetrieved = conf.getInt(HTRCConstants.MAX_ID_RETRIEVED, 100);
    String dataEPR = conf.get(HTRCConstants.HOSTS_SEPARATEDBY_COMMA).split(",")[0];
    String delimitor = conf.get(HTRCConstants.DATA_API_URL_DELIMITOR, "|");
    String clientID = conf.get(HTRCConstants.DATA_API_CLIENTID, "yim");
    String clientSecrete = conf.get(HTRCConstants.DATA_API_CLIENTSECRETE, "yim");
    String tokenLoc = conf.get(HTRCConstants.DATA_API_TOKENLOC,
            "https://129-79-49-119.dhcp-bl.indiana.edu:25443/oauth2/token?grant_type=client_credentials");
    boolean selfsigned = conf.getBoolean(HTRCConstants.DATA_API_SELFSIGNED, true);
    if (dataEPR.equals(HTRCConstants.DATA_API_DEFAULT_URL)) {
        dataEPR = HTRCConstants.DATA_API_DEFAULT_URL_PREFIX + dataEPR;
    }
    HTRCDataAPIClient dataClient = new HTRCDataAPIClient.Builder(dataEPR, delimitor).authentication(true)
            .selfsigned(selfsigned).clientID(clientID).clientSecrete(clientSecrete).tokenLocation(tokenLoc)
            .build();
    BufferedReader reader = new BufferedReader(new FileReader(idfile));
    BufferedWriter writer = new BufferedWriter(new FileWriter(accurateIdfile));
    String line = null;
    int count = 0;
    while ((line = reader.readLine()) != null) {
        Iterable<Entry<String, String>> content = dataClient.getID2Content(line);
        if (content != null)
            writer.write(line + "\n");
        if ((++count) % 1000 == 0)
            System.out.println("Finish " + count + " volumes.");
    }
    reader.close();
    writer.close();
    // String queryStr = "yale.39002052249902|uc2.ark:/13960/t88g8h13f|uc2.ark:/13960/t6sx67388|uc2.ark:/13960/t5j96547r|uc2.ark:/13960/t6ww79z3v|yale.39002085406669|miua.4918260.0305.001|uc2.ark:/13960/t3416xb23|uc2.ark:/13960/t86h4mv25|loc.ark:/13960/t2k64mv58|";
    // Iterable<Entry<String, String>> entries = dataClient.getID2Content(queryStr);
    // for (Entry<String, String> entry : entries) {
    //     System.out.println(entry.getKey());
    // }
    return 0;
}
From source file:edu.indiana.d2i.htrc.util.Utilities.java
License:Apache License
public static HTRCDataAPIClient creatDataAPIClient(Configuration conf) {
    String dataEPR = conf.get(HTRCConstants.DATA_API_EPR, "129-79-49-119.dhcp-bl.indiana.edu:25443");
    String delimitor = conf.get(HTRCConstants.DATA_API_URL_DELIMITOR, "|");
    String clientID = conf.get(HTRCConstants.DATA_API_CLIENTID, "yim");
    String clientSecrete = conf.get(HTRCConstants.DATA_API_CLIENTSECRETE, "yim");
    String tokenLoc = conf.get(HTRCConstants.DATA_API_TOKENLOC,
            "https://129-79-49-119.dhcp-bl.indiana.edu:25443/oauth2/token?grant_type=client_credentials");
    boolean selfsigned = conf.getBoolean(HTRCConstants.DATA_API_SELFSIGNED, true);
    return new HTRCDataAPIClient.Builder(dataEPR, delimitor).authentication(true).selfsigned(selfsigned)
            .clientID(clientID).clientSecrete(clientSecrete).tokenLocation(tokenLoc).build();
}
From source file:edu.indiana.d2i.htrc.util.Utilities.java
License:Apache License
public static void setDataAPIConf(Configuration conf, String dataAPIConfClassName, int maxIdsPerReq)
        throws ClassNotFoundException {
    Class<?> dataAPIConfClass = Class.forName(dataAPIConfClassName);
    DataAPIDefaultConf confInstance = (DataAPIDefaultConf) ReflectionUtils.newInstance(dataAPIConfClass, conf);
    confInstance.configurate(conf, maxIdsPerReq);
    logger.info("Data API configuration");
    logger.info(" - host: " + conf.get(HTRCConstants.HOSTS_SEPARATEDBY_COMMA,
            "129-79-49-119.dhcp-bl.indiana.edu:25443/data-api"));
    logger.info(" - delimitor: " + conf.get(HTRCConstants.DATA_API_URL_DELIMITOR, "|"));
    logger.info(" - clientID: " + conf.get(HTRCConstants.DATA_API_CLIENTID, "yim"));
    logger.info(" - clientSecret: " + conf.get(HTRCConstants.DATA_API_CLIENTSECRETE, "yim"));
    logger.info(" - tokenLoc: " + conf.get(HTRCConstants.DATA_API_TOKENLOC,
            "https://129-79-49-119.dhcp-bl.indiana.edu:25443/oauth2/token?grant_type=client_credentials"));
    logger.info(" - selfsigned: " + conf.getBoolean(HTRCConstants.DATA_API_SELFSIGNED, true));
    logger.info(" - maxIDRetrieved: " + conf.getInt(HTRCConstants.MAX_ID_RETRIEVED, 100));
}
From source file:edu.indiana.d2i.htrc.util.Utilities.java
License:Apache License
public static void filterUnexistID(String input, String output) throws Exception {
    BufferedReader reader = new BufferedReader(new FileReader(input));
    BufferedWriter writer = new BufferedWriter(new FileWriter(output));
    Configuration conf = new Configuration();
    Utilities.setDataAPIConf(conf, "edu.indiana.d2i.htrc.io.DataAPISilvermapleConf", 1);
    int maxIdRetrieved = conf.getInt(HTRCConstants.MAX_ID_RETRIEVED, 100);
    String dataEPR = conf.get(HTRCConstants.HOSTS_SEPARATEDBY_COMMA).split(",")[0];
    String delimitor = conf.get(HTRCConstants.DATA_API_URL_DELIMITOR, "|");
    String clientID = conf.get(HTRCConstants.DATA_API_CLIENTID, "yim");
    String clientSecrete = conf.get(HTRCConstants.DATA_API_CLIENTSECRETE, "yim");
    String tokenLoc = conf.get(HTRCConstants.DATA_API_TOKENLOC,
            "https://129-79-49-119.dhcp-bl.indiana.edu:25443/oauth2/token?grant_type=client_credentials");
    boolean selfsigned = conf.getBoolean(HTRCConstants.DATA_API_SELFSIGNED, true);
    if (dataEPR.equals(HTRCConstants.DATA_API_DEFAULT_URL)) {
        dataEPR = HTRCConstants.DATA_API_DEFAULT_URL_PREFIX + dataEPR;
    }
    HTRCDataAPIClient dataClient = new HTRCDataAPIClient.Builder(dataEPR, delimitor).authentication(true)
            .selfsigned(selfsigned).clientID(clientID).clientSecrete(clientSecrete).tokenLocation(tokenLoc)
            .build();
    String line = null;
    while ((line = reader.readLine()) != null) {
        Iterable<Entry<String, String>> content = dataClient.getID2Content(line);
        if (content != null)
            writer.write(line + "\n");
    }
    writer.close();
    reader.close();
}