Example usage for org.apache.hadoop.io Text charAt

Introduction

This page collects example usages of the charAt method of org.apache.hadoop.io.Text.

Prototype

public int charAt(int position)

Source Link

Document

Returns the Unicode scalar value (a 32-bit integer) of the character that starts at the given byte position, or -1 if the position is out of bounds.

Usage

From source file:com.lovelysystems.hive.udf.ESHashUDF.java

License:Apache License

/**
 * Computes a DJB-style hash over the given {@link Text}.
 *
 * <p>The accumulator starts at 5381 and, for every offset from 0 up to
 * {@code value.getLength()}, is multiplied by 33 and increased by
 * {@code value.charAt(offset)}. Arithmetic is plain {@code long} arithmetic, so
 * overflow wraps silently.
 *
 * @param value the text to hash
 * @return the resulting hash
 */
private static long DJB_HASH(Text value) {
    long hash = 5381;
    int offset = 0;

    while (offset < value.getLength()) {
        // hash * 33 is the same value as (hash << 5) + hash under long wraparound
        hash = hash * 33 + value.charAt(offset);
        offset++;
    }
    return hash;
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
public void text() {
    // One character each of 1, 2, 3 and 4 UTF-8 bytes: U+0041, U+00DF, U+6771, U+10400.
    final String sample = "\u0041\u00DF\u6771\uD801\uDC00";
    final Text text = new Text(sample);

    // Text reports its length in encoded bytes: 1 + 2 + 3 + 4.
    assertThat(text.getLength(), is(10));

    // find() answers with byte offsets, and charAt() takes a byte offset and
    // yields the code point that starts there.
    assertThat(text.find("\u0041"), is(0));
    assertThat(text.charAt(0), is(0x0041));

    assertThat(text.find("\u00DF"), is(1));
    assertThat(text.charAt(1), is(0x00DF));

    assertThat(text.find("\u6771"), is(3));
    assertThat(text.charAt(3), is(0x6771));

    assertThat(text.find("\uD801\uDC00"), is(6));
    assertThat(text.charAt(6), is(0x10400));
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
public void test() throws IOException {
    // vv TextTest
    final Text word = new Text("hadoop");

    // Six ASCII characters occupy six bytes, and the backing array is exactly that long here.
    assertThat(word.getLength(), is(6));
    final byte[] backing = word.getBytes();
    assertThat(backing.length, is(6));

    // charAt() returns an int code point, and -1 for a position outside the text.
    final int thirdChar = word.charAt(2);
    assertThat(thirdChar, is((int) 'd'));
    assertThat("Out of bounds", word.charAt(100), is(-1));
    // ^^ TextTest
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
public void withSupplementaryCharacters() throws IOException {
    // U+0041, U+00DF, U+6771 and the supplementary character U+10400 (a surrogate pair).
    final String sample = "\u0041\u00DF\u6771\uD801\uDC00";

    // String works in UTF-16 code units, so the surrogate pair counts as two chars.
    assertThat(sample.length(), is(5));
    final byte[] utf8 = sample.getBytes("UTF-8");
    assertThat(utf8.length, is(10));

    // indexOf() yields char indices; each half of the surrogate pair has its own index.
    assertThat(sample.indexOf('\u0041'), is(0));
    assertThat(sample.indexOf('\u00DF'), is(1));
    assertThat(sample.indexOf('\u6771'), is(2));
    assertThat(sample.indexOf('\uD801'), is(3));
    assertThat(sample.indexOf('\uDC00'), is(4));

    // String.charAt() returns individual code units, including lone surrogates.
    assertThat(sample.charAt(0), is('\u0041'));
    assertThat(sample.charAt(1), is('\u00DF'));
    assertThat(sample.charAt(2), is('\u6771'));
    assertThat(sample.charAt(3), is('\uD801'));
    assertThat(sample.charAt(4), is('\uDC00'));

    final Text text = new Text(sample);

    assertThat(serializeToString(text), is("0a41c39fe69db1f0909080"));

    // Text.charAt() at the byte offset reported by find() yields the whole code point.
    final int offsetLatinA = text.find("\u0041");
    assertThat(text.charAt(offsetLatinA), is(0x0041));

    final int offsetSharpS = text.find("\u00DF");
    assertThat(text.charAt(offsetSharpS), is(0x00DF));

    final int offsetCjk = text.find("\u6771");
    assertThat(text.charAt(offsetCjk), is(0x6771));

    final int offsetSupplementary = text.find("\uD801\uDC00");
    assertThat(text.charAt(offsetSupplementary), is(0x10400));
}

From source file:gov.jgi.meta.hadoop.input.FastaBlockLineReader.java

License:Open Source License

/**
 * Reads a block of FASTA records from the underlying stream and stores every record
 * found in it into {@code set}.
 *
 * <p>The method first skips forward to the next {@code '>'} byte. It then accumulates
 * bytes until it meets a {@code '>'} after more than {@code maxBytesToConsume} bytes
 * have been counted, or until the stream is exhausted. Unless the stream ended,
 * {@code bufferPosn} is left on the {@code '>'} that opens the following block.
 *
 * <p>Within the block, each record's map key is the header text up to the first space,
 * tab, CR or LF; the value is the concatenation of the record's sequence lines,
 * lower-cased.
 *
 * @param key               not referenced by this method
 * @param set               receives one entry per record (header word to lower-cased sequence)
 * @param maxLineLength     not referenced by this method
 * @param maxBytesToConsume byte count that must be exceeded before a {@code '>'} ends the block
 * @return the number of bytes counted as consumed, including bytes skipped before the
 *         first {@code '>'}
 * @throws IOException if reading from the underlying stream fails
 */
public int readLine(Text key, Map<String, String> set, int maxLineLength, long maxBytesToConsume)
        throws IOException {

    int totalBytesRead = 0;
    boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            // the buffer is about to be replaced, so offsets into the old one are void
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '>');

    /*
    if we hit the end of file already, there is no record to parse: report only
    the bytes that were skipped
     */
    if (eof) {
        return totalBytesRead;
    }

    /*
    now bufferPosn is one past the ">" that starts a fasta record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a ">"

    /*
    find the next record start
     */
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }

    } while (buffer[bufferPosn++] != '>' || (totalBytesRead + bufferPosn - startPosn) <= maxBytesToConsume);

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads
     */

    Text k = new Text();
    Text s = new Text();
    int i = 1; // skip initial record separator ">"
    int j = 1;
    do {
        k.clear();
        s.clear();
        /*
        first parse the key
         */
        i = j;
        // Exclusive end of the key. It only advances past bytes that belong to the key,
        // so a header that reaches the end of the block without a terminator keeps its
        // last byte, and an empty header yields a zero-length append.
        int keyEnd = j;
        boolean junkOnLine = false;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                break;
            } else if (c == ' ' || c == '\t') {
                junkOnLine = true;
                break;
            }
            keyEnd = j;
        }
        k.append(recordBlock.getBytes(), i, keyEnd - i);

        /*
        in case there is additional metadata on the header line, ignore everything after
        the first word.
         */
        if (junkOnLine) {
            while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }

        /*
        now skip the newlines
        */
        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;

        /*
        now read the sequence, one line per iteration
        */
        do {
            i = j;
            // Exclusive end of this sequence line; same reasoning as keyEnd above.
            int lineEnd = j;
            while (j < recordBlock.getLength()) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    break;
                }
                lineEnd = j;
            }
            s.append(recordBlock.getBytes(), i, lineEnd - i);
            // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
            set.put(k.toString(), s.toString().toLowerCase(java.util.Locale.ROOT));

            while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;

        } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '>');

        /*
        now skip characters (newline or carriage return most likely) till record start
        */
        while (j < recordBlock.getLength() && recordBlock.charAt(j) != '>') {
            j++;
        }

        j++; // skip the ">"

    } while (j < recordBlock.getLength());

    return totalBytesRead;
}

From source file:gov.jgi.meta.hadoop.input.FastqBlockLineReader.java

License:Open Source License

/**
 * Reads a block of FASTQ records from the underlying stream and stores every record
 * found in it into {@code set}.
 *
 * <p>The method first skips forward to the next {@code '@'} byte. It then accumulates
 * bytes until it meets an {@code '@'} once at least {@code maxBytesToConsume} bytes
 * have been counted, or until the stream is exhausted. Unless the stream ended,
 * {@code bufferPosn} is left on the {@code '@'} that opens the following block.
 *
 * <p>Within the block, each record's map key is the header text up to the first space,
 * tab, CR or LF; the value is the concatenation of the lines that follow, up to a line
 * starting with {@code '+'}, lower-cased. Everything from that {@code '+'} up to the
 * next {@code '@'} is skipped.
 *
 * <p>NOTE(review): every {@code '@'} byte is treated as a possible record start, both
 * while delimiting the block and while skipping past the {@code '+'} section. Confirm
 * that the input never carries {@code '@'} inside the skipped section.
 *
 * @param key               not referenced by this method
 * @param set               receives one entry per record (header word to lower-cased sequence)
 * @param maxLineLength     not referenced by this method
 * @param maxBytesToConsume byte count that must be reached before an {@code '@'} ends the block
 * @return the number of bytes counted as consumed, including bytes skipped before the
 *         first {@code '@'}
 * @throws IOException if reading from the underlying stream fails
 */
public int readLine(Text key, Map<String, String> set, int maxLineLength, int maxBytesToConsume)
        throws IOException {

    int totalBytesRead = 0;
    boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            // the buffer is about to be replaced, so offsets into the old one are void
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '@');

    /*
    if we hit the end of file already, there is no record to parse: report only
    the bytes that were skipped
     */
    if (eof) {
        return totalBytesRead;
    }

    /*
    now bufferPosn is one past the "@" that starts a fastq record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@"

    /*
    find the next record start
     */
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }

    } while (buffer[bufferPosn++] != '@' || (totalBytesRead + bufferPosn - startPosn) < maxBytesToConsume);

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads
     */

    Text k = new Text();
    Text s = new Text();
    int i = 1; // skip initial record separator "@"
    int j = 1;
    do {
        k.clear();
        s.clear();
        /*
        first parse the key
         */
        i = j;
        // Exclusive end of the key. It only advances past bytes that belong to the key,
        // so a header that reaches the end of the block without a terminator keeps its
        // last byte, and an empty header yields a zero-length append.
        int keyEnd = j;
        boolean junkOnLine = false;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                break;
            } else if (c == ' ' || c == '\t') {
                junkOnLine = true;
                break;
            }
            keyEnd = j;
        }
        k.append(recordBlock.getBytes(), i, keyEnd - i);

        /*
        in case there is additional metadata on the header line, ignore everything after
        the first word.
         */
        if (junkOnLine) {
            while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }

        /*
        now skip the newlines
        */
        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;

        /*
        now read the sequence, one line per iteration, until the "+" separator line
        */
        do {
            i = j;
            // Exclusive end of this sequence line; same reasoning as keyEnd above.
            int lineEnd = j;
            while (j < recordBlock.getLength()) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    break;
                }
                lineEnd = j;
            }
            s.append(recordBlock.getBytes(), i, lineEnd - i);
            // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
            set.put(k.toString(), s.toString().toLowerCase(java.util.Locale.ROOT));

            while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;

        } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+');

        /*
        now skip everything (the "+" line and what follows it) till the next record start
        */
        while (j < recordBlock.getLength() && recordBlock.charAt(j) != '@') {
            j++;
        }

        j++; // skip the "@"

    } while (j < recordBlock.getLength());

    return totalBytesRead;
}

From source file:gov.jgi.meta.hadoop.input.FastqLineReader.java

License:Open Source License

/**
 * Reads one FASTQ record from the InputStream into the given Texts.
 *
 * <p>The method skips forward to the next {@code '@'} byte and then accumulates bytes
 * until it meets an {@code '@'} after at least four CR/LF bytes have been seen, or
 * until the stream is exhausted. Unless the stream ended, {@code bufferPosn} is left
 * on the {@code '@'} that opens the following record.
 *
 * <p>From the accumulated record, {@code key} receives the header text up to the first
 * space, tab, CR or LF, and {@code str} receives the concatenation of the lines that
 * follow, up to a line starting with {@code '+'}. The rest of the record is ignored.
 *
 * <p>NOTE(review): CR and LF bytes are counted individually, so a CRLF-terminated line
 * adds two to the counter. Confirm whether CRLF input has to be supported.
 *
 * @param key               receives the first word of the record's header line
 * @param str               receives the record's sequence (without newlines)
 * @param maxLineLength     not referenced by this method
 * @param maxBytesToConsume not referenced by this method
 * @return the number of bytes counted as consumed, including bytes skipped before the
 *         record's {@code '@'}
 * @throws java.io.IOException if the underlying stream throws
 */
public int readLine(Text key, Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    int totalBytesRead = 0;
    boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            // the buffer is about to be replaced, so offsets into the old one are void
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '@');

    /*
    if we hit the end of file already, there is no record to parse: report only
    the bytes that were skipped
     */
    if (eof) {
        return totalBytesRead;
    }

    /*
    now bufferPosn is one past the "@" that starts a fastq record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@"

    /*
    find the next record start: the first "@" seen after four newline bytes,
    so that only one record is read at a time
     */
    int numOfNewlines = 0;
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
        if (buffer[bufferPosn] == CR || buffer[bufferPosn] == LF) {
            numOfNewlines++;
        }
        if ((numOfNewlines >= 4) && buffer[bufferPosn] == '@') {
            bufferPosn++;
            break;
        }
        bufferPosn++;
    } while (true);

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads;
    it holds a single record, so it is parsed exactly once
     */
    key.clear();
    str.clear();

    /*
    first parse the key
     */
    int j = 1; // skip initial record separator "@"
    int i = j;
    // Exclusive end of the key. It only advances past bytes that belong to the key,
    // so a header that reaches the end of the block without a terminator keeps its
    // last byte, and an empty header yields a zero-length append.
    int keyEnd = j;
    boolean junkOnLine = false;
    while (j < recordBlock.getLength()) {
        int c = recordBlock.charAt(j++);
        if (c == CR || c == LF) {
            break;
        } else if (c == ' ' || c == '\t') {
            junkOnLine = true;
            break;
        }
        keyEnd = j;
    }
    key.append(recordBlock.getBytes(), i, keyEnd - i);

    /*
    in case there is additional metadata on the header line, ignore everything after
    the first word.
     */
    if (junkOnLine) {
        while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
            j++;
    }

    /*
    now skip the newlines
    */
    while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
        j++;

    /*
    now read the sequence, one line per iteration, until the "+" separator line;
    whatever follows in the record block is deliberately ignored
    */
    do {
        i = j;
        // Exclusive end of this sequence line; same reasoning as keyEnd above.
        int lineEnd = j;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                break;
            }
            lineEnd = j;
        }
        str.append(recordBlock.getBytes(), i, lineEnd - i);

        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;

    } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+');

    return totalBytesRead;
}

From source file:io.aos.hdfs.StringTextComparisonTest.java

License:Apache License

@Test
public void text() {
    // One character each of 1, 2, 3 and 4 UTF-8 bytes: U+0041, U+00DF, U+6771, U+10400.
    final String sample = "\u0041\u00DF\u6771\uD801\uDC00";
    final Text text = new Text(sample);

    // Text reports its length in encoded bytes: 1 + 2 + 3 + 4.
    assertThat(text.getLength(), is(10));

    // find() answers with byte offsets, and charAt() takes a byte offset and
    // yields the code point that starts there.
    assertThat(text.find("\u0041"), is(0));
    assertThat(text.charAt(0), is(0x0041));

    assertThat(text.find("\u00DF"), is(1));
    assertThat(text.charAt(1), is(0x00DF));

    assertThat(text.find("\u6771"), is(3));
    assertThat(text.charAt(3), is(0x6771));

    assertThat(text.find("\uD801\uDC00"), is(6));
    assertThat(text.charAt(6), is(0x10400));
}

From source file:io.aos.hdfs.TextTest.java

License:Apache License

@Test
public void test() throws IOException {
    // vv TextTest
    final Text word = new Text("hadoop");

    // Six ASCII characters occupy six bytes, and the backing array is exactly that long here.
    assertThat(word.getLength(), is(6));
    final byte[] backing = word.getBytes();
    assertThat(backing.length, is(6));

    // charAt() returns an int code point, and -1 for a position outside the text.
    final int thirdChar = word.charAt(2);
    assertThat(thirdChar, is((int) 'd'));
    assertThat("Out of bounds", word.charAt(100), is(-1));
    // ^^ TextTest
}

From source file:io.aos.hdfs.TextTest.java

License:Apache License

@Test
public void withSupplementaryCharacters() throws IOException {
    // U+0041, U+00DF, U+6771 and the supplementary character U+10400 (a surrogate pair).
    final String sample = "\u0041\u00DF\u6771\uD801\uDC00";

    // String works in UTF-16 code units, so the surrogate pair counts as two chars.
    assertThat(sample.length(), is(5));
    final byte[] utf8 = sample.getBytes("UTF-8");
    assertThat(utf8.length, is(10));

    // indexOf() yields char indices; each half of the surrogate pair has its own index.
    assertThat(sample.indexOf('\u0041'), is(0));
    assertThat(sample.indexOf('\u00DF'), is(1));
    assertThat(sample.indexOf('\u6771'), is(2));
    assertThat(sample.indexOf('\uD801'), is(3));
    assertThat(sample.indexOf('\uDC00'), is(4));

    // String.charAt() returns individual code units, including lone surrogates.
    assertThat(sample.charAt(0), is('\u0041'));
    assertThat(sample.charAt(1), is('\u00DF'));
    assertThat(sample.charAt(2), is('\u6771'));
    assertThat(sample.charAt(3), is('\uD801'));
    assertThat(sample.charAt(4), is('\uDC00'));

    final Text text = new Text(sample);

    assertThat(serializeToString(text), is("0a41c39fe69db1f0909080"));

    // Text.charAt() at the byte offset reported by find() yields the whole code point.
    final int offsetLatinA = text.find("\u0041");
    assertThat(text.charAt(offsetLatinA), is(0x0041));

    final int offsetSharpS = text.find("\u00DF");
    assertThat(text.charAt(offsetSharpS), is(0x00DF));

    final int offsetCjk = text.find("\u6771");
    assertThat(text.charAt(offsetCjk), is(0x6771));

    final int offsetSupplementary = text.find("\uD801\uDC00");
    assertThat(text.charAt(offsetSupplementary), is(0x10400));
}