List of usage examples for org.apache.hadoop.io.Text.set
public void set(Text other)
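Text.set replaces the contents of an existing Text instance in place, which is why the mappers below reuse a single Text object across records instead of allocating a new one for every key or value. Before the collected examples, here is a minimal, self-contained sketch of both the Text and String overloads; the class name TextSetExample is made up for illustration and is not taken from any of the source files listed below.

import org.apache.hadoop.io.Text;

public class TextSetExample {
    public static void main(String[] args) {
        // Reuse one Text instance instead of allocating a new one per record.
        Text word = new Text();

        // set(Text other): copy the bytes of another Text into this instance.
        Text other = new Text("hello");
        word.set(other);
        System.out.println(word); // prints: hello

        // set(String string): replace the contents from a String.
        word.set("world");
        System.out.println(word); // prints: world
    }
}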
From source file:microbench.WordCountOnHDFSDataLocal.java
License:Apache License
public static void main(String[] args) throws IOException, InterruptedException {
    try {
        parseArgs(args);
        HashMap<String, String> conf = new HashMap<String, String>();
        initConf(conf);
        MPI_D.Init(args, MPI_D.Mode.Common, conf);
        JobConf jobConf = new JobConf(confPath);
        if (MPI_D.COMM_BIPARTITE_O != null) {
            // O communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
            int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
            if (rank == 0) {
                DataMPIUtil.printArgs(args);
            }
            System.out.println("The O task " + rank + " of " + size + " is working...");
            HadoopReader<LongWritable, Text> reader = HadoopIOUtil.getReader(jobConf, inDir,
                    TextInputFormat.class, rank, MPI_D.COMM_BIPARTITE_O);
            Text word = new Text();
            IntWritable one = new IntWritable(1);
            LongWritable khead = reader.createKey();
            Text vhead = reader.createValue();
            while (reader.next(khead, vhead)) {
                StringTokenizer itr = new StringTokenizer(vhead.toString());
                while (itr.hasMoreTokens()) {
                    word.set(itr.nextToken());
                    // send key-value
                    MPI_D.Send(word, one);
                }
            }
            reader.close();
        } else if (MPI_D.COMM_BIPARTITE_A != null) {
            // A communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
            int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_A);
            System.out.println("The A task " + rank + " of " + size + " is working...");
            HadoopWriter<Text, IntWritable> outrw = HadoopIOUtil.getNewWriter(jobConf, outDir, Text.class,
                    IntWritable.class, TextOutputFormat.class, null, rank, MPI_D.COMM_BIPARTITE_A);
            Text oldKey = null;
            IntWritable valueData = new IntWritable();
            int sum = 0;
            Object[] keyValue = MPI_D.Recv();
            while (keyValue != null) {
                Text key = (Text) keyValue[0];
                IntWritable value = (IntWritable) keyValue[1];
                if (oldKey == null) {
                    oldKey = key;
                    sum = value.get();
                } else {
                    if (key.equals(oldKey)) {
                        sum += value.get();
                    } else {
                        valueData.set(sum);
                        outrw.write(oldKey, valueData);
                        oldKey = key;
                        sum = value.get();
                    }
                }
                keyValue = MPI_D.Recv();
            }
            if (oldKey != null) {
                valueData.set(sum);
                outrw.write(oldKey, valueData);
            }
            outrw.close();
        }
        MPI_D.Finalize();
    } catch (MPI_D_Exception e) {
        e.printStackTrace();
    }
}
From source file:mlbench.pagerank.PagerankNaive.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) public static void main(String[] args) throws IOException, InterruptedException { try {/*from w ww .jav a2 s .co m*/ parseArgs(args); HashMap<String, String> conf = new HashMap<String, String>(); initConf(conf); MPI_D.Init(args, MPI_D.Mode.Common, conf); JobConf jobConf = new JobConf(confPath); if (MPI_D.COMM_BIPARTITE_O != null) { // O communicator int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O); if (rank == 0) { LOG.info(PagerankNaive.class.getSimpleName() + " O start."); } FileSplit[] inputs1 = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, jobConf, edgeDir, rank); FileSplit[] inputs2 = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, jobConf, vecDir, rank); FileSplit[] inputs = (FileSplit[]) ArrayUtils.addAll(inputs2, inputs1); for (int i = 0; i < inputs.length; i++) { FileSplit fsplit = inputs[i]; LineRecordReader kvrr = new LineRecordReader(jobConf, fsplit); LongWritable key = kvrr.createKey(); Text value = kvrr.createValue(); { IntWritable k = new IntWritable(); Text v = new Text(); while (kvrr.next(key, value)) { String line_text = value.toString(); // ignore comments in edge file if (line_text.startsWith("#")) continue; final String[] line = line_text.split("\t"); if (line.length < 2) continue; // vector : ROWID VALUE('vNNNN') if (line[1].charAt(0) == 'v') { k.set(Integer.parseInt(line[0])); v.set(line[1]); MPI_D.Send(k, v); } else { /* * In other matrix-vector multiplication, we * output (dst, src) here However, In PageRank, * the matrix-vector computation formula is M^T * * v. Therefore, we output (src,dst) here. */ int src_id = Integer.parseInt(line[0]); int dst_id = Integer.parseInt(line[1]); k.set(src_id); v.set(line[1]); MPI_D.Send(k, v); if (make_symmetric == 1) { k.set(dst_id); v.set(line[0]); MPI_D.Send(k, v); } } } } } } else if (MPI_D.COMM_BIPARTITE_A != null) { // A communicator int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A); if (rank == 0) { LOG.info(PagerankNaive.class.getSimpleName() + " A start."); } HadoopWriter<IntWritable, Text> outrw = HadoopIOUtil.getNewWriter(jobConf, outDir, IntWritable.class, Text.class, TextOutputFormat.class, null, rank, MPI_D.COMM_BIPARTITE_A); IntWritable oldKey = null; int i; double cur_rank = 0; ArrayList<Integer> dst_nodes_list = new ArrayList<Integer>(); Object[] keyValue = MPI_D.Recv(); while (keyValue != null) { IntWritable key = (IntWritable) keyValue[0]; Text value = (Text) keyValue[1]; if (oldKey == null) { oldKey = key; } // A new key arrives if (!key.equals(oldKey)) { outrw.write(oldKey, new Text("s" + cur_rank)); int outdeg = dst_nodes_list.size(); if (outdeg > 0) { cur_rank = cur_rank / (double) outdeg; } for (i = 0; i < outdeg; i++) { outrw.write(new IntWritable(dst_nodes_list.get(i)), new Text("v" + cur_rank)); } oldKey = key; cur_rank = 0; dst_nodes_list = new ArrayList<Integer>(); } // common record String line_text = value.toString(); final String[] line = line_text.split("\t"); if (line.length == 1) { if (line_text.charAt(0) == 'v') { // vector : VALUE cur_rank = Double.parseDouble(line_text.substring(1)); } else { // edge : ROWID dst_nodes_list.add(Integer.parseInt(line[0])); } } keyValue = MPI_D.Recv(); } // write the left part if (cur_rank != 0) { outrw.write(oldKey, new Text("s" + cur_rank)); int outdeg = dst_nodes_list.size(); if (outdeg > 0) { cur_rank = cur_rank / (double) outdeg; } for (i = 0; i < outdeg; i++) { outrw.write(new 
IntWritable(dst_nodes_list.get(i)), new Text("v" + cur_rank)); } } outrw.close(); } MPI_D.Finalize(); } catch (MPI_D_Exception e) { e.printStackTrace(); } }
From source file:mx.itam.metodos.lshclustering.MinhashEmitMapper.java
License:Apache License
@Override
public void map(Text id, IntArrayWritable values, Context context) throws IOException, InterruptedException {
    for (int i = 0; i < functionsCount; i++) {
        hashValues[i] = Integer.MAX_VALUE;
    }
    for (int i = 0; i < functionsCount; i++) {
        HashFunction hf = functions[i];
        for (Writable wr : values.get()) {
            IntWritable value = (IntWritable) wr;
            int hash = hf.hashInt(value.get()).asInt();
            if (hash < hashValues[i]) {
                hashValues[i] = hash;
            }
        }
    }
    Text sketch = new Text();
    Hasher hasher = lsh.newHasher();
    int band = 0;
    for (int i = 0; i < functionsCount; i++) {
        hasher.putInt(hashValues[i]);
        if (i > 0 && (i % rows) == 0) {
            sketch.set(band + "-" + hasher.hash().toString());
            context.write(new SecondarySortKey(sketch, id), id);
            hasher = lsh.newHasher();
            band++;
        }
    }
    sketch.set(band + "-" + hasher.hash().toString());
    context.write(new SecondarySortKey(sketch, id), id);
}
From source file:mx.itam.metodos.minhashing.MinhashMapper.java
License:Apache License
@Override
public void map(Text id, IntArrayWritable values, Context ctx) throws IOException, InterruptedException {
    for (int i = 0; i < functionsCount; i++) {
        hashValues[i] = Integer.MAX_VALUE;
    }
    for (int i = 0; i < functionsCount; i++) {
        HashFunction hf = functions[i];
        for (Writable wr : values.get()) {
            IntWritable value = (IntWritable) wr;
            int hash = hf.hashInt(value.get()).asInt();
            if (hash < hashValues[i]) {
                hashValues[i] = hash;
            }
        }
    }
    Text sketch = new Text();
    Hasher hasher = lsh.newHasher();
    int band = 0;
    for (int i = 0; i < functionsCount; i++) {
        hasher.putInt(hashValues[i]);
        if (i > 0 && (i % rows) == 0) {
            sketch.set(band + "-" + hasher.hash().toString());
            write(id, sketch, ctx);
            hasher = lsh.newHasher();
            band++;
        }
    }
    sketch.set(band + "-" + hasher.hash().toString());
    write(id, sketch, ctx);
}
From source file:mx.iteso.msc.asn.mrwordcount.MyMapper.java
License:Apache License
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
    Text word = new Text();
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, new IntWritable(1));
    }
}
From source file:net.peacesoft.nutch.crawl.ReIndexerMapReduce.java
License:Apache License
public void map(Text key, Writable value, OutputCollector<Text, NutchWritable> output, Reporter reporter)
        throws IOException {
    String urlString = filterUrl(normalizeUrl(key.toString()));
    if (urlString == null) {
        return;
    } else {
        key.set(urlString);
    }
    output.collect(key, new NutchWritable(value));
}
From source file:oracle.kv.hadoop.hive.table.TableHiveRecordReader.java
License:Open Source License
@Override
public boolean next(Text key, Text value) {
    LOG.trace("next [key = " + key + ", value = " + value + "]");
    if (key == null || value == null) {
        return false;
    }
    boolean ret = false;
    try {
        key.clear();
        value.clear();
        ret = v2RecordReader.nextKeyValue();
        if (ret) {
            final Row curRow = v2RecordReader.getCurrentValue();
            assert curRow != null;
            key.set(curRow.createPrimaryKey().toString());
            value.set(curRow.toString());
        }
    } catch (Exception e) {
        LOG.error("TableHiveRecordReader " + this + " caught: " + e, e);
    }
    return ret;
}
From source file:org.ahanna.DoubleConversionMapper.java
License:Apache License
public void map(Text json, Text nothing, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
    try {
        Text outJson = new Text();
        String jsonStr = json.toString();
        JSONTokener tokener = new JSONTokener(jsonStr);
        JSONObject jsonObj = new JSONObject(tokener);
        fixATweet(jsonObj);
        if (jsonObj.has("retweeted_status")) {
            Object retweetObj = jsonObj.get("retweeted_status");
            // compare string contents, not object references
            if (!"null".equals(retweetObj.toString())) {
                fixATweet((JSONObject) retweetObj);
            }
        }
        outJson.set(jsonObj.toString());
        output.collect(outJson, nothing);
    } catch (EOFException e) {
        // do nothing
    } catch (JSONException e) {
        // do nothing
    }
}
From source file:org.ankus.mapreduce.algorithms.correlation.booleanset.BooleanSetMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String row = value.toString();
    String[] columns = row.split(delimiter);
    StringBuffer uniqueKeyStringBuffer = new StringBuffer();
    for (int i = 0; i < columns.length; i++) {
        String column = columns[i];
        if (i == Integer.parseInt(keyIndex)) {
            uniqueKeyStringBuffer.append(column);
        } else {
            continue;
        }
    }
    for (int i = 1; i < columns.length; i++) {
        // If the data value is not equal 0 or 1, the value is 1.
        if (columns[i].equals("0") || columns[i].equals("1")) {
            value.set(columns[i]);
        } else {
            value.set("1");
        }
        TextIntegerPairWritableComparable textIntegerPairWritableComparable =
                new TextIntegerPairWritableComparable(uniqueKeyStringBuffer.toString(),
                        Integer.parseInt(value.toString()));
        context.write(new Text("item-" + i), textIntegerPairWritableComparable);
    }
}
From source file:org.ankus.mapreduce.algorithms.correlation.numericset.NumericSetMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String row = value.toString();
    String[] columns = row.split(delimiter);
    StringBuffer uniqueKeyStringBuffer = new StringBuffer();
    for (int i = 0; i < columns.length; i++) {
        String column = columns[i];
        if (i == Integer.parseInt(keyIndex)) {
            uniqueKeyStringBuffer.append(column);
        } else {
            continue;
        }
    }
    for (int k = 1; k < columns.length; k++) {
        value.set(columns[k]);
        TextDoublePairWritableComparable textDoublePairWritableComparable =
                new TextDoublePairWritableComparable(uniqueKeyStringBuffer.toString(),
                        Double.parseDouble(value.toString()));
        context.write(new Text("item-" + k), textDoublePairWritableComparable);
    }
}