Example usage for org.apache.hadoop.io Text toString

Introduction

This page collects example usages of org.apache.hadoop.io.Text.toString().

Prototype

@Override
public String toString()

Source Link

Document

Converts the Text back to a String.

Usage

From source file:clustering.similarity.ISimMapper.java

License:Apache License

/**
 * Parses one similarity record and forwards it to the reducer.
 *
 * <p>The key is split on {@code ","}; its first two fields are parsed as
 * integers and stored in {@code outputKey}. The value is parsed as a double
 * and stored in {@code outputValue}. The pair is then written to the context.
 *
 * {@inheritDoc}
 *
 * @param key     groupId1,groupId2
 * @param value   sim
 * @param context receives the ({@code outputKey}, {@code outputValue}) pair
 * @throws NumberFormatException          if an id or the similarity is not numeric
 * @throws ArrayIndexOutOfBoundsException if the key holds fewer than two fields
 */
@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {

    String[] pair = key.toString().split(",");
    // parseInt/parseDouble return primitives, so no wrapper object is
    // created for every input record.
    this.outputKey.set(Integer.parseInt(pair[0]), Integer.parseInt(pair[1]));
    this.outputValue.set(Double.parseDouble(value.toString()));
    // wrap input and write to reducer
    context.write(this.outputKey, this.outputValue);
}

From source file:clustering.similarity.PreMapper.java

License:Apache License

/**
 * @param key   termId:term/*from w w w  .java2 s .  c o  m*/
 * @param value group_id=tf-idf,...
 *              {@inheritDoc}
 */
@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {

    String[] docs = value.toString().split(",");

    if (docs.length > this.lengthThreshold) {
        /* set the length of each container */
        int docsInSeg = docs.length / this.splitNum;
        if (docs.length % this.splitNum != 0) {
            docsInSeg++;
        }
        for (int i = 0; i < this.splitNum - 1; i++) {
            // seg_i, self join
            String partI = getSegment(docs, i, docsInSeg);
            this.outputKey.set(this.bigIndex++, 0);
            this.outputValue.set(partI);
            context.write(this.outputKey, this.outputValue);

            for (int j = i + 1; j < this.splitNum; j++) {
                // seg_i#seg_j, cross join
                /* continue from seg_i */
                String partJ = getSegment(docs, j, docsInSeg);

                this.outputKey.set(this.bigIndex++, 1);
                this.outputValue.set(partI + "#" + partJ);
                context.write(this.outputKey, this.outputValue);
            }
        }
        // last seg, self join
        String lastSeg = getSegment(docs, this.splitNum - 1, docsInSeg);
        this.outputKey.set(this.bigIndex++, 0);
        this.outputValue.set(lastSeg);
        context.write(this.outputKey, this.outputValue);
    } else if (docs.length > 1) {
        this.outputKey.set(this.smallIndex++, 0);
        // container_id,flag \t term_id:group_id=tf-idf,...
        context.write(this.outputKey, value);
    }
}

From source file:clustering.similarity.PreReducer.java

License:Apache License

/**
 * Passes every unordered pair of entries inside each value to {@code output}.
 *
 * <p>Each value is a comma-separated list of {@code docId=TF-IDF} entries.
 * For every pair (i, j) with i &lt; j, {@code output} receives the two ids
 * followed by the two weights.
 *
 * @param values  docId=TF-IDF,docId=TF-IDF...
 * @param context handed through to {@code output}
 * @throws ArrayIndexOutOfBoundsException if a paired entry has no {@code "="} part
 */
private void selfJoin(Iterable<Text> values, Context context) throws IOException, InterruptedException {

    for (Text value : values) {
        String[] contents = value.toString().split(",");

        // Split every entry exactly once; the pair loop below would otherwise
        // re-split the same strings for each of the O(n^2) combinations.
        String[][] idAndWeight = new String[contents.length][];
        for (int i = 0; i < contents.length; i++) {
            idAndWeight[i] = contents[i].split("=");
        }

        // A list with fewer than two entries has no pairs, so the loops are no-ops.
        for (int i = 0; i < idAndWeight.length; i++) {
            for (int j = i + 1; j < idAndWeight.length; j++) {
                output(idAndWeight[i][0], idAndWeight[j][0], idAndWeight[i][1], idAndWeight[j][1], context);
            }
        }
    }
}

From source file:clustering.similarity.PreReducer.java

License:Apache License

/**
 * Passes every combination of an entry from the first set with an entry
 * from the second set to {@code output}.
 *
 * <p>Each value holds two comma-separated lists of {@code docId=TF-IDF}
 * entries, separated by {@code "#"}. {@code output} receives the two ids
 * followed by the two weights.
 *
 * @param values  docId=TF-IDF,docId=TF-IDF...#docId=TF-IDF,docId=TF-IDF...
 * @param context handed through to {@code output}
 * @throws ArrayIndexOutOfBoundsException if a value has no {@code "#"} part or
 *                                        a paired entry has no {@code "="} part
 */
private void crossJoin(Iterable<Text> values, Context context) throws IOException, InterruptedException {

    for (Text value : values) {
        String[] sets = value.toString().split("#");
        String[] set1 = sets[0].split(",");
        String[] set2 = sets[1].split(",");

        // Split the second set once; it is reused for every entry of the first set.
        String[][] idAndWeights2 = new String[set2.length][];
        for (int k = 0; k < set2.length; k++) {
            idAndWeights2[k] = set2[k].split("=");
        }

        for (String aPart1 : set1) {
            // Invariant for the whole inner loop, so split outside of it.
            String[] idAndWeight1 = aPart1.split("=");
            for (String[] idAndWeight2 : idAndWeights2) {
                output(idAndWeight1[0], idAndWeight2[0], idAndWeight1[1], idAndWeight2[1], context);
            }
        }
    }
}

From source file:clustering.tf_idf.TermCountMapper.java

License:Apache License

/**
 * Emits one record per term occurrence, tagged with the group id and with
 * the field ("title" or "content") the term came from.
 *
 * <p>The part of the value after {@code "::"} is split on {@code "##"} into a
 * name and an optional model. Name terms are written with the suffix
 * {@code ::title}; when exactly two parts are present, model terms are
 * written with the suffix {@code ::content}. Terms are separated by single
 * spaces. The written value is the mapper's {@code outputValue} field, which
 * this method does not modify.
 *
 * {@inheritDoc}
 *
 * @param key   group_id
 * @param value entry_id@@g_no::g_name##[g_model]
 */
@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {

    final String[] fields = value.toString().split("::");
    final String[] parts = fields[1].split("##");
    final String groupId = key.toString();

    // Nothing after "::" to tokenize; guard kept purely for safety.
    if (parts.length == 0) {
        return;
    }

    // term@@@group_id::title \t 1
    final String titleSuffix = "@@@" + groupId + "::title";
    for (String term : parts[0].split(" ")) {
        this.outputKey.set(term + titleSuffix);
        context.write(this.outputKey, this.outputValue);
    }

    // The model part is optional.
    if (parts.length != 2) {
        return;
    }

    // term@@@group_id::content \t 1
    final String contentSuffix = "@@@" + groupId + "::content";
    for (String term : parts[1].split(" ")) {
        this.outputKey.set(term + contentSuffix);
        context.write(this.outputKey, this.outputValue);
    }
}

From source file:clustering.tf_idf.TermFreqMapper.java

License:Apache License

/**
 * Re-keys a term count by group id.
 *
 * <p>Input key {@code term@@@group_id::position} is taken apart and the
 * record is written as {@code group_id \t position::term=count}.
 *
 * {@inheritDoc}
 *
 * @param key   term@@@group_id::position
 * @param value count
 * @throws NumberFormatException          if the group id is not an integer
 * @throws ArrayIndexOutOfBoundsException if the key lacks the {@code "@@@"} or
 *                                        {@code "::"} separated part
 */
@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {

    // termAndDoc[0] = term
    // termAndDoc[1] = group_id::position
    String[] termAndDoc = key.toString().split("@@@");

    // idAndPosition[0] = group_id
    // idAndPosition[1] = position
    String[] idAndPosition = termAndDoc[1].split("::");

    // id; parseInt returns a primitive, avoiding a throw-away Integer per record
    this.outputKey.set(Integer.parseInt(idAndPosition[0]));
    // position::term=count
    this.outputValue.set(idAndPosition[1] + "::" + termAndDoc[0] + "=" + value.toString());
    // group_id \t position::term=count
    context.write(this.outputKey, this.outputValue);
}

From source file:clustering.tf_idf.TermFreqReducer.java

License:Apache License

/**
 * @param key    group_id/*from  w  ww .j  a  v  a  2  s . c  o m*/
 * @param values position::term=count
 *               {@inheritDoc}
 */
@Override
protected void reduce(IntWritable key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    int termsCntInDoc = 0;
    this.termWeightMap.clear();

    for (Text val : values) {
        // positionTermCnt[0] = position
        // positionTermCnt[1] = term=count
        String[] positionTermCnt = val.toString().split("::");
        String position = positionTermCnt[0];

        String[] termCnt = positionTermCnt[1].split("=");

        int count = Integer.valueOf(termCnt[1]);
        termsCntInDoc += count;
        // TODO: 17-4-24 is it necessary to make it enum or a class?
        double weightedCount = position.equals("title") ? this.weight * count : count;

        // term : weight
        CollectionUtils.updateCountMap(this.termWeightMap, termCnt[0], weightedCount);
    }

    for (Map.Entry<String, Double> entry : this.termWeightMap.entrySet()) {
        // term
        this.outputKey.set(entry.getKey());
        // group_id=weighted_tf
        double wtf = entry.getValue() / termsCntInDoc;
        this.outputValue.set(key.toString() + "=" + wtf);
        context.write(this.outputKey, this.outputValue);
    }
}

From source file:clustering.tf_idf.TF_IDF_Reducer.java

License:Apache License

/**
 * Turns the weighted term frequencies of one term into TF-IDF scores.
 *
 * <p>Every value is {@code group_id=weighted_tf}. The number of values is
 * used as the number of groups containing the term, and the inverse document
 * frequency is {@code ln(documentNumber / (appearances + 1))}. For each group
 * the reducer writes {@code group_id \t term=tf_idf}.
 *
 * {@inheritDoc}
 *
 * @param key    term
 * @param values group_id=weighted_tf
 * @throws NumberFormatException          if a weighted tf is not a number
 * @throws ArrayIndexOutOfBoundsException if a value has no {@code "="} part
 */
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {

    this.docAndWTF.clear();
    // total appear times of this term
    int appearInAll = 0;
    /* count the total appear times of each term
     * and store them with their weighted tf in a map  */
    for (Text value : values) {
        appearInAll++;
        // docAndFreq[0] = group_id
        // docAndFreq[1] = weighted_tf
        String[] docAndFreq = value.toString().split("=");
        this.docAndWTF.put(docAndFreq[0], docAndFreq[1]);
    }

    // Both depend only on the term, not on the group, so compute them once
    // instead of once per map entry.
    double idf = Math.log((double) this.documentNumber / (double) (appearInAll + 1));
    String term = key.toString();

    for (Map.Entry<String, String> entry : this.docAndWTF.entrySet()) {
        double wtf = Double.parseDouble(entry.getValue());

        this.outputKey.set(entry.getKey());
        this.outputValue.set(term + "=" + wtf * idf);
        // group_id \t term=tf_idf
        context.write(this.outputKey, this.outputValue);
    }
}

From source file:cn.ac.ncic.mastiff.io.coding.ORCStringEcnodingUtil.java

License:Apache License

/**
 * Reads the next dictionary-encoded value and returns it as a String.
 *
 * <p>The next number from {@code reader} is used as an index into
 * {@code dictionaryOffsets}; the entry's bytes are taken from
 * {@code dictionaryBuffer} between that offset and the next one (or the end
 * of the buffer for the last entry).
 *
 * @param previous a Text to reuse as scratch space, or {@code null} to
 *                 allocate a new one; its content is overwritten
 * @return the decoded value; empty when there is no dictionary buffer
 * @throws IOException if reading the next entry index fails
 */
public String readEachValue(Text previous) throws IOException {
    int entry = (int) reader.next();
    Text result = (previous == null) ? new Text() : previous;

    // If the column is just empty strings, the size will be zero,
    // so the buffer will be null. Check this before touching the buffer:
    // computing the last entry's length calls dictionaryBuffer.size().
    if (dictionaryBuffer == null) {
        result.clear();
        return result.toString();
    }

    int offset = dictionaryOffsets[entry];
    int length;
    // if it isn't the last entry, subtract the offsets otherwise use
    // the buffer length.
    if (entry < dictionaryOffsets.length - 1) {
        length = dictionaryOffsets[entry + 1] - offset;
    } else {
        length = dictionaryBuffer.size() - offset;
    }
    dictionaryBuffer.setText(result, offset, length);
    return result.toString();
}

From source file:cn.ac.ncic.mastiff.io.coding.RedBlackTreeStringReader.java

License:Apache License

/**
 * Reads the next dictionary-encoded value and returns it as a String.
 *
 * <p>The next number from {@code reader} is used as an index into
 * {@code dictionaryOffsets}; the entry's bytes are taken from
 * {@code dictionaryBuffer} between that offset and the next one (or the end
 * of the buffer for the last entry).
 *
 * @param previous a Text to reuse as scratch space, or {@code null} to
 *                 allocate a new one; its content is overwritten
 * @return the decoded value; empty when there is no dictionary buffer
 * @throws IOException if reading the next entry index fails
 */
public String readEachValue(Text previous) throws IOException {
    int entry = (int) reader.next();
    Text result = (previous == null) ? new Text() : previous;

    // If the column is just empty strings, the size will be zero,
    // so the buffer will be null. Check this before touching the buffer:
    // computing the last entry's length calls dictionaryBuffer.size().
    if (dictionaryBuffer == null) {
        result.clear();
        return result.toString();
    }

    int offset = dictionaryOffsets[entry];
    int length;
    // The last entry has no following offset, so it runs to the buffer's end.
    if (entry < dictionaryOffsets.length - 1) {
        length = dictionaryOffsets[entry + 1] - offset;
    } else {
        length = dictionaryBuffer.size() - offset;
    }
    dictionaryBuffer.setText(result, offset, length);
    return result.toString();
}