Example usage for org.apache.hadoop.io Text getBytes

List of usage examples for org.apache.hadoop.io Text getBytes

Introduction

On this page you can find example usage for org.apache.hadoop.io Text getBytes.

Prototype

@Override
public byte[] getBytes() 

Document

Returns the raw bytes; however, only data up to #getLength() is valid.
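
As a quick illustration of that contract, here is a minimal, self-contained sketch (class and variable names are illustrative, not taken from any of the projects below). Text reuses its internal buffer when a record is replaced, so bytes past getLength() may be stale and should never be decoded:

import java.nio.charset.CharacterCodingException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class GetBytesSketch {
    public static void main(String[] args) throws CharacterCodingException {
        Text text = new Text();

        // Load a long record first, then reuse the same Text for a shorter one.
        byte[] first = "hello world".getBytes(StandardCharsets.UTF_8);
        byte[] second = "hi".getBytes(StandardCharsets.UTF_8);
        text.set(first, 0, first.length);
        text.set(second, 0, second.length); // the internal buffer is reused, not shrunk

        byte[] raw = text.getBytes(); // backing array; its length may exceed the valid data
        int valid = text.getLength(); // only bytes [0, valid) belong to the current value

        // Correct: bound the decode by getLength().
        String ok = Text.decode(raw, 0, valid); // "hi"

        // Incorrect: decoding the whole array may pick up stale bytes from the previous record.
        String wrong = new String(raw, StandardCharsets.UTF_8);

        System.out.println("valid=" + valid + " raw.length=" + raw.length);
        System.out.println("bounded decode: " + ok);
        System.out.println("unbounded decode: " + wrong);
    }
}

The examples below all follow this pattern: every call to getBytes() is paired with getLength() or with explicit field offsets, usually through Text.decode(bytes, start, length).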

Usage

From source file:it.crs4.seal.common.CutText.java

License:Open Source License

public void loadRecord(Text record) throws FormatException {
    int pos = 0; // the byte position within the record
    int fieldno = 0; // the field index within the record
    int colno = 0; // the index within the list of requested fields (columns)
    try {
        while (pos < record.getLength() && colno < columns.size()) // iterate over each field
        {
            int endpos = record.find(delim, pos); // the field's end position
            if (endpos < 0)
                endpos = record.getLength();

            if (columns.get(colno) == fieldno) // if we're at a requested field
            {
                extractedFields[colno] = Text.decode(record.getBytes(), pos, endpos - pos);
                extractedFieldPositions[colno] = pos;
                colno += 1; // advance column
            }

            pos = endpos + 1; // the next starting position is the current end + 1
            fieldno += 1;
        }
    } catch (java.nio.charset.CharacterCodingException e) {
        throw new FormatException("character coding exception.  Message: " + e.getMessage(), record);
    }

    if (colno < columns.size())
        throw new FormatException("Missing field(s) in record. Field " + colno + " (zero-based) not found.",
                record);
}

From source file:it.crs4.seal.common.TextSamMapping.java

License:Open Source License

public TextSamMapping(Text sam) throws FormatException {
    unparsedData = new Text();
    cutter = new CutText(Delim, 0, 1, 2, 3, 4, 5, 6, 7, 8); // all fields up to and including insert size

    try {
        cutter.loadRecord(sam);
        flag = Integer.parseInt(cutter.getField(1)); // set flag first so we can use the flag methods
        mapQ = Byte.parseByte(cutter.getField(4));

        if (isMapped())
            pos5 = Integer.parseInt(cutter.getField(3));
        if (isMateMapped())
            matePos5 = Integer.parseInt(cutter.getField(7));
        if (isMapped() && isMateMapped())
            insertSize = Integer.parseInt(cutter.getField(8));
    } catch (CutText.FormatException e) {
        throw new FormatException("sam formatting problem: " + e + ". Record: " + sam);
    } catch (NumberFormatException e) {
        throw new FormatException("sam formatting problem.  Found text in place of a number.  Record: " + sam);
    }

    int seqStart = cutter.getFieldPos(8) + cutter.getField(8).length() + 1;
    if (seqStart > sam.getLength())
        throw new FormatException("Incomplete SAM record -- missing fields. Record: " + sam);
    // copy the sequence and tag data to our internal buffer
    unparsedData.set(sam.getBytes(), seqStart, sam.getLength() - seqStart);

    // Find the end of the sequence field.  Search for a Delim after the insert size field.
    int end = unparsedData.find(Delim);
    if (end < 0)
        throw new FormatException("Bad SAM format.  Missing terminator for sequence field.  SAM: " + sam);
    seqLen = end;

    // now repeat for the quality field
    qualityStart = end + 1;
    if (qualityStart > unparsedData.getLength())
        throw new FormatException("Incomplete SAM record -- missing quality field. Record: " + sam);
    end = unparsedData.find(Delim, qualityStart);
    if (end < 0)
        end = unparsedData.getLength();
    if (seqLen != end - qualityStart) {
        throw new FormatException(
                "Length of sequence (" + seqLen + ") is different from length of quality string ("
                        + (end - qualityStart) + "). Record: " + sam);
    }

    tagsStart = end + 1;
}

From source file:it.crs4.seal.prq.PairReadsQSeqMapper.java

License:Open Source License

public void map(Text readId, SequencedFragment read, IMRContext<SequenceId, Text> context)
        throws IOException, InterruptedException {
    // build the key
    builder.delete(0, builder.length());

    // Fields up to and including the index number go into the location; the read number goes on its own.
    if (read.getRead() == null)
        throw new RuntimeException("Cannot get read number from read: " + readId);

    if (read.getLane() != null && read.getTile() != null && read.getXpos() != null && read.getYpos() != null) {
        appendIdToBuilder(builder, read); // appends the read id to the builder provided
        // finally the index field
        builder.append("#").append(read.getIndexSequence() == null ? '0' : read.getIndexSequence());
        sequenceKey.set(builder.toString(), read.getRead());
    } else {
        // maybe it's a fastq id with a trailing read number (/1 or /2)
        if (readId.getLength() > 2) {
            int last = readId.getLength() - 1;
            if (readId.charAt(last - 1) == '/') {
                // truncate the /[12] from the read id
                // last == length - 1.  We want length - 2 bytes, which is equal to last - 1
                sequenceKey.set(Text.decode(readId.getBytes(), 0, last - 1), read.getRead());
            } else
                throw new RuntimeException(
                        "Didn't find /read_number at end of the read id.  Please use qseq files or fastq with illumina-formatted name tags.");
        } else
            throw new RuntimeException("Read id " + readId
                    + " is too short.   Please use qseq files or fastq with illumina-formatted name tags.");
    }

    // then the tab-delimited value
    sequenceValue.clear();
    sequenceValue.append(read.getSequence().getBytes(), 0, read.getSequence().getLength());
    sequenceValue.append(Delim, 0, Delim.length);
    sequenceValue.append(read.getQuality().getBytes(), 0, read.getQuality().getLength());
    sequenceValue.append(Delim, 0, Delim.length);
    // the filter flag is optional.  If it's absent we assume the read passes filtering.
    sequenceValue.append(ZeroOne, (read.getFilterPassed() == null || read.getFilterPassed() ? 1 : 0), 1);

    context.write(sequenceKey, sequenceValue);
    context.progress();
}

From source file:it.crs4.seal.prq.PairReadsQSeqReducer.java

License:Open Source License

public void reduce(SequenceId key, Iterable<Text> values, IMRContext<Text, ReadPair> context)
        throws IOException, InterruptedException {
    outputKey.set(key.getLocation());
    outputValue.clear();

    int nReads = 0;
    int nBadReads = 0;
    for (Text read : values) {
        ++nReads;
        if (nReads > nReadsPerTemplate)
            throw new RuntimeException("got more than " + nReadsPerTemplate + " reads for sequence key " + key
                    + ". Record: " + read);

        int[] fieldsPos = findFields(read);
        // filtered read?
        // If dropFailedFilter is false it shortcuts the test and sets filterPassed directly to true.
        // If it's true then we check whether the field is equal to '1'
        boolean filterPassed = !dropFailedFilter || read.getBytes()[fieldsPos[2]] == (byte) '1';

        if (!filterPassed) {
            context.increment(ReadCounters.FailedFilter, 1);
            ++nBadReads;
        } else if (!checkReadQuality(read, fieldsPos)) {
            context.increment(ReadCounters.NotEnoughBases, 1);
            ++nBadReads;
        }

        // In here we do all the work to prepare the read for output.  It will be written to the
        // appropriate WritableMapping, which will in turn be inserted into the ReadPair outputValue.
        prepMapping(read.getBytes(), fieldsPos, nReads - 1);
    }

    if (nReads < nReadsPerTemplate) {
        context.increment(ReadCounters.Unpaired, nReads);
        String msg = String.format("Too few reads for template! (found %s). Key: %s", nReads, key);
        if (warnOnlyIfUnpaired)
            LOG.warn(msg);
        else
            throw new RuntimeException(msg + "\nread: " + outputValue.toString());
    }
    // nReads can't be > nReadsPerTemplate since that should be caught in the loop above.

    // If we have a complete template and not all of its reads are bad, write it.  Unpaired reads are dropped.
    if (nReads == nReadsPerTemplate && nBadReads < nReads)
        context.write(outputKey, outputValue);
    else
        context.increment(ReadCounters.Dropped, nReads);

    context.progress();
}

From source file:it.crs4.seal.prq.PairReadsQSeqReducer.java

License:Open Source License

/**
 * Verify whether a read satisfies quality standards.
 * For now this method verifies whether the read has at least
 * minBasesThreshold known bases (ignoring unknown bases N).
 */
protected boolean checkReadQuality(Text read, int[] fieldsPos) {
    /* The read's delimiter is the byte just before the second field starts */
    int readEnd = fieldsPos[1] - 1;

    // The condition is "min number of valid bases".  However, we consider
    // the inverse condition "max number of unknowns".
    // readEnd is also the length of the read fragment
    // readEnd - minBasesThreshold gives us the maximum number of unknowns acceptable.
    int nAcceptableUnknowns = readEnd - minBasesThreshold;

    if (nAcceptableUnknowns < 0) // the fragment is shorter than minBasesThreshold
        return false;

    int nUnknownBases = 0;
    byte[] data = read.getBytes(); // we can work directly in bytes as long as we only have ASCII characters
    for (int pos = 0; pos < readEnd; ++pos) {
        if (data[pos] == UnknownBase) {
            ++nUnknownBases;
            if (nUnknownBases > nAcceptableUnknowns)
                return false;
        }
    }
    return true;
}

From source file:it.uniroma1.hadoop.pagerank.job1.PageRankJob1Mapper.java

License:Open Source License

@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    /* The Job #1 mapper simply parses a line of the input graph, emitting key-value pairs.
     * Input format is the following (separator is TAB):
     *
     *     <nodeA>    <nodeB>
     * 
     * which denotes an edge going from <nodeA> to <nodeB>.
     * We need to skip comment lines (denoted by a # character at the beginning of the line).
     * We also collect all the distinct nodes in our graph: this is needed to compute the initial
     * pagerank value in the Job #1 reducer and also in later jobs.
     */

    if (value.charAt(0) != '#') {

        int tabIndex = value.find("\t");
        String nodeA = Text.decode(value.getBytes(), 0, tabIndex);
        String nodeB = Text.decode(value.getBytes(), tabIndex + 1, value.getLength() - (tabIndex + 1));
        context.write(new Text(nodeA), new Text(nodeB));

        // add the current source node to the node list so we can 
        // compute the total amount of nodes of our graph in Job#2
        PageRank.NODES.add(nodeA);
        // also add the target node to the same list: we may have a target node 
        // with no outlinks (so it will never be parsed as source)
        PageRank.NODES.add(nodeB);

    }

}

From source file:it.uniroma1.hadoop.pagerank.job2.PageRankJob2Mapper.java

License:Open Source License

@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    /* PageRank calculation algorithm (mapper)
     * Input file format (separator is TAB):
     *
     *     <title>    <page-rank>    <link1>,<link2>,<link3>,<link4>,... ,<linkN>
     * 
     * The output has two kinds of records:
     * One record composed of the collection of links of each page:
     *     
     *     <title>   |<link1>,<link2>,<link3>,<link4>, ... , <linkN>
     *     
     * Another record composed of a linked page, the page rank of the source page,
     * and the total number of outgoing links of the source page:
     *  
     *     <link>    <page-rank>    <total-links>
     */

    int tIdx1 = value.find("\t");
    int tIdx2 = value.find("\t", tIdx1 + 1);

    // extract tokens from the current line
    String page = Text.decode(value.getBytes(), 0, tIdx1);
    String pageRank = Text.decode(value.getBytes(), tIdx1 + 1, tIdx2 - (tIdx1 + 1));
    String links = Text.decode(value.getBytes(), tIdx2 + 1, value.getLength() - (tIdx2 + 1));

    String[] allOtherPages = links.split(",");
    for (String otherPage : allOtherPages) {
        Text pageRankWithTotalLinks = new Text(pageRank + "\t" + allOtherPages.length);
        context.write(new Text(otherPage), pageRankWithTotalLinks);
    }

    // put the original links so the reducer is able to produce the correct output
    context.write(new Text(page), new Text(PageRank.LINKS_SEPARATOR + links));

}

From source file:it.uniroma1.hadoop.pagerank.job3.PageRankJob3Mapper.java

License:Open Source License

@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    /* Rank Ordering (mapper only)
     * Input file format (separator is TAB):
     *
     *     <title>    <page-rank>    <link1>,<link2>,<link3>,<link4>,... ,<linkN>
     * 
     * This is a simple job which does the ordering of our documents according to the computed pagerank.
     * We will map the pagerank (key) to its value (page) and Hadoop will do the sorting on keys for us.
     * There is no need to implement a reducer: the mapping and sorting are enough for our purpose.
     */

    int tIdx1 = value.find("\t");
    int tIdx2 = value.find("\t", tIdx1 + 1);

    // extract tokens from the current line
    String page = Text.decode(value.getBytes(), 0, tIdx1);
    float pageRank = Float.parseFloat(Text.decode(value.getBytes(), tIdx1 + 1, tIdx2 - (tIdx1 + 1)));

    context.write(new DoubleWritable(pageRank), new Text(page));

}

From source file:mapred.io.CustomRecordReader.java

License:Apache License

private int skipUtfByteOrderMark() throws IOException {
    Text value = new Text();
    // Strip the BOM (Byte Order Mark).
    // Text only supports UTF-8, so we only need to check for the UTF-8 BOM
    // (0xEF, 0xBB, 0xBF) at the start of the text stream.
    int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength, Integer.MAX_VALUE);
    int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos));
    // Even though we read 3 extra bytes for the first line, we don't alter
    // existing behavior (no backwards-incompatibility issue), because newSize
    // is less than maxLineLength and the number of bytes copied into the Text
    // is never more than newSize.
    // If the size returned by readLine is not less than maxLineLength,
    // we discard the current line and read the next one.
    pos += newSize;
    int textLength = value.getLength();
    byte[] textBytes = value.getBytes();
    if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
            && (textBytes[2] == (byte) 0xBF)) {
        // Found a UTF-8 BOM; strip it.
        LOG.info("Found UTF-8 BOM and skipped it");
        textLength -= 3;
        newSize -= 3;
        if (textLength > 0) {
            // It may work to use the same buffer and not do the copyBytes
            textBytes = value.copyBytes();
            value.set(textBytes, 3, textLength);
        } else {
            value.clear();
        }
    }
    return newSize;
}

From source file:mr.MyFileRecordReader2.java

License:Apache License

private int skipUtfByteOrderMark(Text value) throws IOException {
    // Strip the BOM (Byte Order Mark).
    // Text only supports UTF-8, so we only need to check for the UTF-8 BOM
    // (0xEF, 0xBB, 0xBF) at the start of the text stream.
    int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength, Integer.MAX_VALUE);
    int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos));
    // Even though we read 3 extra bytes for the first line, we don't alter
    // existing behavior (no backwards-incompatibility issue), because newSize
    // is less than maxLineLength and the number of bytes copied into the Text
    // is never more than newSize.
    // If the size returned by readLine is not less than maxLineLength,
    // we discard the current line and read the next one.
    pos += newSize;
    int textLength = value.getLength();
    byte[] textBytes = value.getBytes();
    if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
            && (textBytes[2] == (byte) 0xBF)) {
        // Found a UTF-8 BOM; strip it.
        LOG.info("Found UTF-8 BOM and skipped it");
        textLength -= 3;
        newSize -= 3;
        if (textLength > 0) {
            // It may work to use the same buffer and not do the copyBytes
            textBytes = value.copyBytes();
            value.set(textBytes, 3, textLength);
        } else {
            value.clear();
        }
    }
    return newSize;
}