List of usage examples for org.apache.hadoop.io Text toString
@Override
public String toString()
From source file:clustering.similarity.ISimMapper.java
License:Apache License
/** * @param key groupId1,groupId2// w ww . j a va 2s . com * @param value sim * {@inheritDoc} */ @Override public void map(Text key, Text value, Context context) throws IOException, InterruptedException { String[] pair = key.toString().split(","); this.outputKey.set(Integer.valueOf(pair[0]), Integer.valueOf(pair[1])); this.outputValue.set(Double.valueOf(value.toString())); // wrap input and write to reducer context.write(this.outputKey, this.outputValue); }
From source file:clustering.similarity.PreMapper.java
License:Apache License
/** * @param key termId:term/*from w w w .java2 s . c o m*/ * @param value group_id=tf-idf,... * {@inheritDoc} */ @Override public void map(Text key, Text value, Context context) throws IOException, InterruptedException { String[] docs = value.toString().split(","); if (docs.length > this.lengthThreshold) { /* set the length of each container */ int docsInSeg = docs.length / this.splitNum; if (docs.length % this.splitNum != 0) { docsInSeg++; } for (int i = 0; i < this.splitNum - 1; i++) { // seg_i, self join String partI = getSegment(docs, i, docsInSeg); this.outputKey.set(this.bigIndex++, 0); this.outputValue.set(partI); context.write(this.outputKey, this.outputValue); for (int j = i + 1; j < this.splitNum; j++) { // seg_i#seg_j, cross join /* continue from seg_i */ String partJ = getSegment(docs, j, docsInSeg); this.outputKey.set(this.bigIndex++, 1); this.outputValue.set(partI + "#" + partJ); context.write(this.outputKey, this.outputValue); } } // last seg, self join String lastSeg = getSegment(docs, this.splitNum - 1, docsInSeg); this.outputKey.set(this.bigIndex++, 0); this.outputValue.set(lastSeg); context.write(this.outputKey, this.outputValue); } else if (docs.length > 1) { this.outputKey.set(this.smallIndex++, 0); // container_id,flag \t term_id:group_id=tf-idf,... context.write(this.outputKey, value); } }
From source file:clustering.similarity.PreReducer.java
License:Apache License
/** * @param values docId=TF-IDF,docId=TF-IDF... *//* w w w .j a v a 2s . co m*/ private void selfJoin(Iterable<Text> values, Context context) throws IOException, InterruptedException { for (Text value : values) { String[] contents = value.toString().split(","); if (contents.length > 1) { // should always be true, just for security for (int i = 0; i < contents.length; i++) { for (int j = i + 1; j < contents.length; j++) { String[] idAndWeighti = contents[i].split("="); String[] idAndWeightj = contents[j].split("="); output(idAndWeighti[0], idAndWeightj[0], idAndWeighti[1], idAndWeightj[1], context); } } } } }
From source file:clustering.similarity.PreReducer.java
License:Apache License
/** * @param values docId=TF-IDF,docId=TF-IDF...#docId=TF-IDF,docId=TF-IDF... *///w w w. jav a 2 s. com private void crossJoin(Iterable<Text> values, Context context) throws IOException, InterruptedException { for (Text value : values) { // termId, docId=TF-IDF,docId=TF-IDF... String[] sets = value.toString().split("#"); String[] set1 = sets[0].split(","); String[] set2 = sets[1].split(","); for (String aPart1 : set1) { for (String aPart2 : set2) { String[] idAndWeight1 = aPart1.split("="); String[] idAndWeight2 = aPart2.split("="); output(idAndWeight1[0], idAndWeight2[0], idAndWeight1[1], idAndWeight2[1], context); } } } }
From source file:clustering.tf_idf.TermCountMapper.java
License:Apache License
/** * @param key group_id//from w w w. j av a 2 s .co m * @param value entry_id@@g_no::g_name##[g_model] * {@inheritDoc} */ @Override public void map(Text key, Text value, Context context) throws IOException, InterruptedException { String[] line = value.toString().split("::"); String[] nameAndModel = line[1].split("##"); String group_id = key.toString(); if (nameAndModel.length < 1) { // should not be, just for secure. return; } String[] nameTerms = nameAndModel[0].split(" "); for (String term : nameTerms) { this.outputKey.set(term + "@@@" + group_id + "::title"); // term@@@group_id::title \t 1 context.write(this.outputKey, this.outputValue); } if (nameAndModel.length == 2) { String[] modelTerms = nameAndModel[1].split(" "); for (String term : modelTerms) { this.outputKey.set(term + "@@@" + group_id + "::content"); // term@@@group_id::content \t 1 context.write(this.outputKey, this.outputValue); } } }
From source file:clustering.tf_idf.TermFreqMapper.java
License:Apache License
/** * @param key term@@@group_id::position * @param value count//from w w w .j av a 2s. c o m * {@inheritDoc} */ @Override public void map(Text key, Text value, Context context) throws IOException, InterruptedException { // termAndDoc[0] = term // termAndDoc[1] = group_id::position String[] termAndDoc = key.toString().split("@@@"); // idAndPosition[0] = group_id // idAndPosition[1] = position String[] idAndPosition = termAndDoc[1].split("::"); // id this.outputKey.set(Integer.valueOf(idAndPosition[0])); // position::term=count this.outputValue.set(idAndPosition[1] + "::" + termAndDoc[0] + "=" + value.toString()); // group_id \t position::term=count context.write(this.outputKey, this.outputValue); }
From source file:clustering.tf_idf.TermFreqReducer.java
License:Apache License
/** * @param key group_id/*from w ww .j a v a 2 s . c o m*/ * @param values position::term=count * {@inheritDoc} */ @Override protected void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException { int termsCntInDoc = 0; this.termWeightMap.clear(); for (Text val : values) { // positionTermCnt[0] = position // positionTermCnt[1] = term=count String[] positionTermCnt = val.toString().split("::"); String position = positionTermCnt[0]; String[] termCnt = positionTermCnt[1].split("="); int count = Integer.valueOf(termCnt[1]); termsCntInDoc += count; // TODO: 17-4-24 is it necessary to make it enum or a class? double weightedCount = position.equals("title") ? this.weight * count : count; // term : weight CollectionUtils.updateCountMap(this.termWeightMap, termCnt[0], weightedCount); } for (Map.Entry<String, Double> entry : this.termWeightMap.entrySet()) { // term this.outputKey.set(entry.getKey()); // group_id=weighted_tf double wtf = entry.getValue() / termsCntInDoc; this.outputValue.set(key.toString() + "=" + wtf); context.write(this.outputKey, this.outputValue); } }
From source file:clustering.tf_idf.TF_IDF_Reducer.java
License:Apache License
/** * @param key term//from ww w. ja v a 2 s. com * @param values group_id=weighted_tf * {@inheritDoc} */ @Override protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { this.docAndWTF.clear(); // total appear times of this term int appearInAll = 0; /* count the total appear times of each term * and store them with their weighted tf in a map */ for (Text value : values) { appearInAll++; // docAndFreq[0] = group_id // docAndFreq[1] = weighted_tf String[] docAndFreq = value.toString().split("="); this.docAndWTF.put(docAndFreq[0], docAndFreq[1]); } for (Map.Entry<String, String> entry : this.docAndWTF.entrySet()) { double wtf = Double.valueOf(entry.getValue()); double idf = Math.log((double) this.documentNumber / (double) (appearInAll + 1)); this.outputKey.set(entry.getKey()); this.outputValue.set(key.toString() + "=" + wtf * idf); // group_id \t term=tf_idf context.write(this.outputKey, this.outputValue); } }
From source file:cn.ac.ncic.mastiff.io.coding.ORCStringEcnodingUtil.java
License:Apache License
public String readEachValue(Text previous) throws IOException { Text result = null; int entry = (int) reader.next(); if (previous == null) { result = new Text(); } else {//from w w w . j av a 2 s . com result = (Text) previous; } int offset = dictionaryOffsets[entry]; int length; // if it isn't the last entry, subtract the offsets otherwise use // the buffer length. if (entry < dictionaryOffsets.length - 1) { length = dictionaryOffsets[entry + 1] - offset; } else { length = dictionaryBuffer.size() - offset; } // If the column is just empty strings, the size will be zero, // so the buffer will be null, in that case just return result // as it will default to empty if (dictionaryBuffer != null) { dictionaryBuffer.setText(result, offset, length); } else { result.clear(); } return result.toString(); }
From source file:cn.ac.ncic.mastiff.io.coding.RedBlackTreeStringReader.java
License:Apache License
public String readEachValue(Text previous) throws IOException { Text result = null; int entry = (int) reader.next(); if (previous == null) { result = new Text(); } else {//from w w w . ja v a 2 s . co m result = (Text) previous; } int offset = dictionaryOffsets[entry]; int length; if (entry < dictionaryOffsets.length - 1) { length = dictionaryOffsets[entry + 1] - offset; } else { length = dictionaryBuffer.size() - offset; } // If the column is just empty strings, the size will be zero, // so the buffer will be null, in that case just return result // as it will default to empty if (dictionaryBuffer != null) { dictionaryBuffer.setText(result, offset, length); } else { result.clear(); } // } return result.toString(); }