List of usage examples for org.apache.hadoop.conf Configuration getInt
public int getInt(String name, int defaultValue)
Parameter: name - the name of the property.
Returns: the value of the name property as an int, or defaultValue if no such property exists.
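A minimal, self-contained sketch of the call pattern (the property names example.buffer.mb and example.num.threads are hypothetical, used only for illustration):

import org.apache.hadoop.conf.Configuration;

public class GetIntExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // "example.buffer.mb" is a hypothetical property name used only for this sketch
        conf.set("example.buffer.mb", "256");

        // the property is set and parses as an int, so 256 is returned
        int bufferMb = conf.getInt("example.buffer.mb", 100);

        // the property is absent, so the supplied default (4) is returned
        int numThreads = conf.getInt("example.num.threads", 4);

        System.out.println(bufferMb + " " + numThreads);
    }
}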
From source file:eastcircle.terasort.TotalOrderPartitioner.java
License:Apache License
/**
 * Read the cut points from the given sequence file.
 * @param fs the file system
 * @param p the path to read
 * @param conf the job config
 * @return the strings to split the partitions on
 * @throws IOException
 */
private static Text[] readPartitions(FileSystem fs, Path p, Configuration conf) throws IOException {
    int reduces = conf.getInt(MRJobConfig.NUM_REDUCES, 1);
    Text[] result = new Text[reduces - 1];
    DataInputStream reader = fs.open(p);
    for (int i = 0; i < reduces - 1; ++i) {
        result[i] = new Text();
        result[i].readFields(reader);
    }
    reader.close();
    return result;
}
From source file:edu.indiana.d2i.htrc.corpus.analysis.LDAAnalysisMapper.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();

    String mappingTableFileName = conf.get("user.args.mapping.table.filename");
    String topicsFileName = conf.get("user.args.topics.filename");

    BufferedReader reader = null;
    String line = null;

    // load mapping table
    try {
        reader = new BufferedReader(new InputStreamReader(new FileInputStream(mappingTableFileName)));
        /*
         * each line is a mapping: <word> <index of the word in full word set>
         */
        while ((line = reader.readLine()) != null) {
            String trimmedLine = line.trim();
            int idx = trimmedLine.lastIndexOf(' ');
            mappingTable.add(new MappingTableEntry(trimmedLine.substring(0, idx),
                    Integer.parseInt(trimmedLine.substring(idx + 1))));
        }
    } finally {
        if (reader != null)
            reader.close();
    }

    mappingIndices = CorpusProcessingUtils.extractIdxFromMappingTable(mappingTable);

    reader = null;
    line = null;

    // load topics
    try {
        reader = new BufferedReader(new InputStreamReader(new FileInputStream(topicsFileName)));
        /* Each line is a topic */
        while ((line = reader.readLine()) != null) {
            topics.add(line.trim());
        }
    } finally {
        if (reader != null)
            reader.close();
    }

    // load LDA state, stateFilePath is the path in HDFS
    String stateFilePath = conf.get("user.args.lda.state.filepath");
    int stepSize = conf.getInt("user.args.topdoctable.capacity.stepsize",
            Integer.parseInt(Constants.LDA_ANALYSIS_DEFAULT_STEP_SIZE));

    if (stateFilePath == null) {
        // No previous state for initialization (first iteration)
        ldaAnalyzer = new LDAAnalyzer(mappingTable, topics, stepSize);
    } else {
        // second and following iterations
        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Reader seqFileReader = null;
        try {
            seqFileReader = new SequenceFile.Reader(fs, new Path(stateFilePath), conf);
            Text key = (Text) ReflectionUtils.newInstance(seqFileReader.getKeyClass(), conf);
            LDAState ldaState = (LDAState) ReflectionUtils.newInstance(seqFileReader.getValueClass(), conf);
            // the sequence file should only have one record
            seqFileReader.next(key, ldaState);
            ldaAnalyzer = new LDAAnalyzer(ldaState.getWordsTopicsTable(), ldaState.getTopicsDocumentsTable(),
                    mappingTable, topics);
        } finally {
            IOUtils.closeStream(seqFileReader);
        }
    }
}
From source file:edu.indiana.d2i.htrc.io.index.lucene.LuceneClient.java
License:Apache License
private LuceneClient(Configuration conf) throws IOException {
    String directory = conf.get(HTRCConstants.LUCENE_INDEX_PATH);
    // String directory = conf.get("htrc.lucene.index.path");

    FileSystem fs = FileSystem.get(conf);
    Path indexPath = new Path(directory);
    Directory dir = new FileSystemDirectory(fs, indexPath, false, conf);
    indexSearcher = new IndexSearcher(dir);
    indexReader = IndexReader.open(dir);

    dictionary = new Dictionary(conf);

    // dynamic load the filter ??
    // filter = new StopWordFilter();
    filter = new StopWordFilter("stopwords.txt"); // found in the classpath
    filter.addNextFilter(new DictionaryFilter(dictionary));
    filter.addNextFilter(new FrequencyFilter(conf.getInt(HTRCConstants.FILTER_WORD_MIN_FREQUENCE, 2)));
    filter.addNextFilter(new WordLengthFilter(conf.getInt(HTRCConstants.FILTER_WORD_MIN_LENGTH, 2)));
}
From source file:edu.indiana.d2i.htrc.io.index.solr.SolrClient.java
License:Apache License
private void initFilters(Configuration conf) throws IOException {
    dictionary = new Dictionary(conf);
    filter = new StopWordFilter("stopwords.txt"); // found in the classpath
    filter.addNextFilter(new DictionaryFilter(dictionary));
    filter.addNextFilter(new FrequencyFilter(conf.getInt(HTRCConstants.FILTER_WORD_MIN_FREQUENCE, 2)));
    filter.addNextFilter(new WordLengthFilter(conf.getInt(HTRCConstants.FILTER_WORD_MIN_LENGTH, 2)));
}
From source file:edu.indiana.d2i.htrc.io.mem.MemCachedRecordWriter.java
License:Apache License
public MemCachedRecordWriter(Configuration conf) {
    // read configuration
    MAX_EXPIRE = conf.getInt(HTRCConstants.MEMCACHED_MAX_EXPIRE, -1);
    int numClients = conf.getInt(HTRCConstants.MEMCACHED_CLIENT_NUM, -1);
    String[] hostArray = conf.getStrings(HTRCConstants.MEMCACHED_HOSTS);
    List<String> hosts = Arrays.asList(hostArray);
    Class<?> writableClass = conf.getClass("mapred.output.value.class", Writable.class);

    String namespace = conf.get(MemKMeansConfig.KEY_NS);
    if (namespace != null)
        NameSpace = namespace;

    client = ThreadedMemcachedClient.getThreadedMemcachedClient(numClients, hosts);
    transcoder = new HadoopWritableTranscoder<V>(conf, writableClass);
}
From source file:edu.indiana.d2i.htrc.io.mem.ThreadedMemcachedClient.java
License:Apache License
public static ThreadedMemcachedClient getThreadedMemcachedClient(Configuration conf) {
    int numClients = conf.getInt(HTRCConstants.MEMCACHED_CLIENT_NUM, 1);
    String[] hostArray = conf.getStrings(HTRCConstants.MEMCACHED_HOSTS);
    List<String> hosts = Arrays.asList(hostArray);
    return getThreadedMemcachedClient(numClients, hosts);
}
From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java
License:Apache License
private void sequentialTransform() throws Exception {
    Configuration conf = getConf();
    setupConfiguration(conf);

    HTRCDataAPIClient client = Utilities.creatDataAPIClient(conf);

    // set up analyzer, filter
    Analyzer analyzer = ClassUtils.instantiateAs(
            conf.get(DocumentProcessor.ANALYZER_CLASS, DefaultAnalyzer.class.getName()), Analyzer.class);
    HTRCFilter filter = new StopWordFilter("stopwords.txt"); // found in the classpath
    Dictionary dictionary = new Dictionary(conf);
    filter.addNextFilter(new DictionaryFilter(dictionary));
    filter.addNextFilter(new WordLengthFilter(conf.getInt(HTRCConstants.FILTER_WORD_MIN_LENGTH, 2)));

    // memcached client
    ThreadedMemcachedClient memcachedClient = ThreadedMemcachedClient.getThreadedMemcachedClient(conf);
    MemcachedClient cache = memcachedClient.getCache();
    int maxExpir = conf.getInt(HTRCConstants.MEMCACHED_MAX_EXPIRE, -1);
    Transcoder<VectorWritable> transcoder = new HadoopWritableTranscoder<VectorWritable>(conf,
            VectorWritable.class);

    Path input = new Path(idListDir);
    FileSystem fs = input.getFileSystem(conf);
    DataInputStream fsinput = new DataInputStream(fs.open(input));
    BufferedReader reader = new BufferedReader(new InputStreamReader(fsinput));
    String line = null;
    int idNumThreshold = maxIdsPerReq;
    int idNum = 0;
    StringBuilder idList = new StringBuilder();
    VectorWritable vectorWritable = new VectorWritable();
    while ((line = reader.readLine()) != null) {
        idList.append(line + "|");
        if ((++idNum) >= idNumThreshold) {
            // <id, content>
            Iterable<Entry<String, String>> content = client.getID2Content(idList.toString());
            for (Entry<String, String> entry : content) {
                Vector result = transform2Vector(entry.getValue(), entry.getKey(), analyzer, filter,
                        dictionary);
                vectorWritable.set(result);
                cache.set(entry.getKey(), maxExpir, vectorWritable, transcoder);

                // validate
                VectorWritable vecWritable = cache.get(entry.getKey(), transcoder);
                if (vecWritable == null) {
                    throw new RuntimeException(entry.getKey() + " is not written to Memcached.");
                } else {
                    System.out.println(entry.getKey());
                }
            }
            idList = new StringBuilder();
            idNum = 0;
        }
    }

    if (idList.length() > 0) {
        Iterable<Entry<String, String>> content = client.getID2Content(idList.toString());
        for (Entry<String, String> entry : content) {
            Vector result = transform2Vector(entry.getValue(), entry.getKey(), analyzer, filter, dictionary);
            vectorWritable.set(result);
            cache.set(entry.getKey(), maxExpir, vectorWritable, transcoder);

            // validate
            VectorWritable vecWritable = cache.get(entry.getKey(), transcoder);
            if (vecWritable == null) {
                throw new RuntimeException(entry.getKey() + " is not written to Memcached.");
            } else {
                System.out.println(entry.getKey());
            }
        }
    }
}
From source file:edu.indiana.d2i.htrc.kmeans.MemKMeansUtil.java
License:Apache License
public static void loadClusterInfo(Configuration conf, Collection<Cluster> clusters) {
    int k = conf.getInt(MemKMeansConfig.CLUSTER_NUM, -1);
    if (k == -1)
        throw new IllegalArgumentException("Number of cluster is -1!");

    ThreadedMemcachedClient client = ThreadedMemcachedClient.getThreadedMemcachedClient(conf);
    MemcachedClient cache = client.getCache();
    Transcoder<Cluster> clusterTranscoder = new HadoopWritableTranscoder<Cluster>(conf, Cluster.class);

    for (int i = 0; i < k; i++) {
        Cluster cluster = cache.get(toClusterName(i), clusterTranscoder);
        if (cluster != null) {
            clusters.add(cluster);
        } else {
            // logger.error("cannot find VectorWritable for " + id);
            client.close();
            throw new RuntimeException("can't find cluster " + toClusterName(i));
        }
    }
    client.close();
}
From source file:edu.indiana.d2i.htrc.kmeans.MemKMeansUtil.java
License:Apache License
public static boolean isConverged(Configuration conf) {
    int k = conf.getInt(MemKMeansConfig.CLUSTER_NUM, -1);
    if (k == -1)
        throw new IllegalArgumentException("Number of cluster is -1!");

    ThreadedMemcachedClient client = ThreadedMemcachedClient.getThreadedMemcachedClient(conf);
    MemcachedClient cache = client.getCache();
    Transcoder<Cluster> clusterTranscoder = new HadoopWritableTranscoder<Cluster>(conf, Cluster.class);

    for (int i = 0; i < k; i++) {
        Cluster cluster = cache.get(toClusterName(i), clusterTranscoder);
        if (cluster != null) {
            if (!cluster.isConverged())
                return false;
        } else {
            throw new RuntimeException("can't find cluster " + toClusterName(i));
        }
    }
    client.close();
    return true;
}
From source file:edu.indiana.d2i.htrc.kmeans.MemKMeansUtil.java
License:Apache License
public static void writeClusters2HDFS(Configuration conf, Path des) throws IOException {
    int k = conf.getInt(MemKMeansConfig.CLUSTER_NUM, -1);
    if (k == -1)
        throw new IllegalArgumentException("Number of cluster is -1!");

    ThreadedMemcachedClient client = ThreadedMemcachedClient.getThreadedMemcachedClient(conf);
    MemcachedClient cache = client.getCache();
    Transcoder<Cluster> clusterTranscoder = new HadoopWritableTranscoder<Cluster>(conf, Cluster.class);

    SequenceFile.Writer writer = new SequenceFile.Writer(FileSystem.get(conf), conf, des, Text.class,
            Cluster.class);
    Text key = new Text();
    for (int i = 0; i < k; i++) {
        Cluster cluster = cache.get(toClusterName(i), clusterTranscoder);
        key.set(cluster.getIdentifier());
        writer.append(key, cluster);
    }
    writer.close();
    client.close();
}