List of usage examples for org.apache.hadoop.io Text charAt
public int charAt(int position)
position
From source file:com.lovelysystems.hive.udf.ESHashUDF.java
License:Apache License
private static long DJB_HASH(Text value) { long hash = 5381; for (int i = 0; i < value.getLength(); i++) { hash = ((hash << 5) + hash) + value.charAt(i); }//www . j a va2 s . co m return hash; }
From source file:crunch.MaxTemperature.java
License:Apache License
/**
 * Checks that {@code Text} reports length, {@code find} results and
 * {@code charAt} positions in terms of the UTF-8 encoding: the four
 * characters of the sample sit at offsets 0, 1, 3 and 6, for a total
 * length of 10.
 */
@Test
public void text() {
    Text sample = new Text("\u0041\u00DF\u6771\uD801\uDC00");

    // Overall encoded length.
    assertThat(sample.getLength(), is(10));

    // Offset at which each character starts.
    assertThat(sample.find("\u0041"), is(0));
    assertThat(sample.find("\u00DF"), is(1));
    assertThat(sample.find("\u6771"), is(3));
    assertThat(sample.find("\uD801\uDC00"), is(6));

    // Code point found at each of those offsets.
    assertThat(sample.charAt(0), is(0x0041));
    assertThat(sample.charAt(1), is(0x00DF));
    assertThat(sample.charAt(3), is(0x6771));
    assertThat(sample.charAt(6), is(0x10400));
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Test public void test() throws IOException { // vv TextTest Text t = new Text("hadoop"); assertThat(t.getLength(), is(6)); assertThat(t.getBytes().length, is(6)); assertThat(t.charAt(2), is((int) 'd')); assertThat("Out of bounds", t.charAt(100), is(-1)); // ^^ TextTest }// www .j a v a 2 s. co m
From source file:crunch.MaxTemperature.java
License:Apache License
/**
 * Contrasts {@code String} indexing with {@code Text} indexing on a string
 * that ends in a supplementary character (a surrogate pair).
 *
 * <p>The {@code String} assertions address each {@code char} individually,
 * including both halves of the surrogate pair. The {@code Text} assertions
 * look characters up through {@code find} and expect {@code charAt} to
 * return the full code point, 0x10400 for the pair.
 */
@Test
public void withSupplementaryCharacters() throws IOException {
    // --- java.lang.String view ---
    String utf16 = "\u0041\u00DF\u6771\uD801\uDC00";
    assertThat(utf16.length(), is(5));
    assertThat(utf16.getBytes("UTF-8").length, is(10));

    assertThat(utf16.indexOf('\u0041'), is(0));
    assertThat(utf16.indexOf('\u00DF'), is(1));
    assertThat(utf16.indexOf('\u6771'), is(2));
    assertThat(utf16.indexOf('\uD801'), is(3));
    assertThat(utf16.indexOf('\uDC00'), is(4));

    assertThat(utf16.charAt(0), is('\u0041'));
    assertThat(utf16.charAt(1), is('\u00DF'));
    assertThat(utf16.charAt(2), is('\u6771'));
    assertThat(utf16.charAt(3), is('\uD801'));
    assertThat(utf16.charAt(4), is('\uDC00'));

    // --- org.apache.hadoop.io.Text view ---
    Text utf8 = new Text("\u0041\u00DF\u6771\uD801\uDC00");
    assertThat(serializeToString(utf8), is("0a41c39fe69db1f0909080"));

    assertThat(utf8.charAt(utf8.find("\u0041")), is(0x0041));
    assertThat(utf8.charAt(utf8.find("\u00DF")), is(0x00DF));
    assertThat(utf8.charAt(utf8.find("\u6771")), is(0x6771));
    assertThat(utf8.charAt(utf8.find("\uD801\uDC00")), is(0x10400));
}
From source file:gov.jgi.meta.hadoop.input.FastaBlockLineReader.java
License:Open Source License
public int readLine(Text key, Map<String, String> set, int maxLineLength, long maxBytesToConsume) throws IOException { int totalBytesRead = 0; int numRecordsRead = 0; Boolean eof = false;// ww w.j a va2 s. co m int startPosn; Text recordBlock = new Text(); /* first thing to do is to move forward till you see a start character */ startPosn = bufferPosn; do { if (bufferPosn >= bufferLength) { totalBytesRead += bufferPosn - startPosn; bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) { eof = true; break; // EOF } } } while (buffer[bufferPosn++] != '>'); /* if we hit the end of file already, then just return 0 bytes processed */ if (eof) return totalBytesRead; /* now bufferPosn should be at the start of a fasta record */ totalBytesRead += (bufferPosn - 1) - startPosn; startPosn = bufferPosn - 1; // startPosn guaranteed to be at a ">" /* find the next record start */ eof = false; do { if (bufferPosn >= bufferLength) { /* copy the current buffer before refreshing the buffer */ int appendLength = bufferPosn - startPosn; recordBlock.append(buffer, startPosn, appendLength); totalBytesRead += appendLength; startPosn = bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) { eof = true; break; // EOF } } } while (buffer[bufferPosn++] != '>' || (totalBytesRead + bufferPosn - startPosn) <= maxBytesToConsume); if (!eof) { bufferPosn--; // make sure we leave bufferPosn pointing to the next record int appendLength = bufferPosn - startPosn; recordBlock.append(buffer, startPosn, appendLength); totalBytesRead += appendLength; } /* record block now has the byte array we want to process for reads */ Text k = new Text(); Text s = new Text(); int i = 1; // skip initial record seperator ">" int j = 1; do { k.clear(); s.clear(); /* first parse the key */ i = j; Boolean junkOnLine = false; while (j < recordBlock.getLength()) { int c = recordBlock.charAt(j++); if (c == CR || c == LF) { break; } else if (c == ' ' || c == '\t') { junkOnLine = true; break; } 
} k.append(recordBlock.getBytes(), i, j - i - 1); /* in case there is additional metadata on the header line, ignore everything after the first word. */ if (junkOnLine) { while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF) j++; } //LOG.info ("key = " + k.toString()); /* now skip the newlines */ while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) j++; /* now read the sequence */ do { i = j; while (j < recordBlock.getLength()) { int c = recordBlock.charAt(j++); if (c == CR || c == LF) { break; } } s.append(recordBlock.getBytes(), i, j - i - 1); set.put(k.toString(), s.toString().toLowerCase()); while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) j++; } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '>'); numRecordsRead++; /* now skip characters (newline or carige return most likely) till record start */ while (j < recordBlock.getLength() && recordBlock.charAt(j) != '>') { j++; } j++; // skip the ">" } while (j < recordBlock.getLength()); return totalBytesRead; }
From source file:gov.jgi.meta.hadoop.input.FastqBlockLineReader.java
License:Open Source License
public int readLine(Text key, Map<String, String> set, int maxLineLength, int maxBytesToConsume) throws IOException { int totalBytesRead = 0; int numRecordsRead = 0; Boolean eof = false;/* ww w .j av a 2 s. c om*/ int startPosn; Text recordBlock = new Text(); /* first thing to do is to move forward till you see a start character */ startPosn = bufferPosn; do { if (bufferPosn >= bufferLength) { totalBytesRead += bufferPosn - startPosn; bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) { eof = true; break; // EOF } } } while (buffer[bufferPosn++] != '@'); /* if we hit the end of file already, then just return 0 bytes processed */ if (eof) return totalBytesRead; /* now bufferPosn should be at the start of a fasta record */ totalBytesRead += (bufferPosn - 1) - startPosn; startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@" /* find the next record start */ eof = false; do { if (bufferPosn >= bufferLength) { /* copy the current buffer before refreshing the buffer */ int appendLength = bufferPosn - startPosn; recordBlock.append(buffer, startPosn, appendLength); totalBytesRead += appendLength; startPosn = bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) { eof = true; break; // EOF } } } while (buffer[bufferPosn++] != '@' || (totalBytesRead + bufferPosn - startPosn) < maxBytesToConsume); if (!eof) { bufferPosn--; // make sure we leave bufferPosn pointing to the next record int appendLength = bufferPosn - startPosn; recordBlock.append(buffer, startPosn, appendLength); totalBytesRead += appendLength; } /* record block now has the byte array we want to process for reads */ Text k = new Text(); Text s = new Text(); int i = 1; // skip initial record seperator ">" int j = 1; do { k.clear(); s.clear(); /* first parse the key */ i = j; Boolean junkOnLine = false; while (j < recordBlock.getLength()) { int c = recordBlock.charAt(j++); if (c == CR || c == LF) { break; } else if (c == ' ' || c == '\t') { junkOnLine = true; break; 
} } k.append(recordBlock.getBytes(), i, j - i - 1); /* in case there is additional metadata on the header line, ignore everything after the first word. */ if (junkOnLine) { while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF) j++; } //LOG.info ("key = " + k.toString()); /* now skip the newlines */ while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) j++; /* now read the sequence */ do { i = j; while (j < recordBlock.getLength()) { int c = recordBlock.charAt(j++); if (c == CR || c == LF) { break; } } s.append(recordBlock.getBytes(), i, j - i - 1); set.put(k.toString(), s.toString().toLowerCase()); while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) j++; } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+'); numRecordsRead++; /* now skip characters (newline or carige return most likely) till record start */ while (j < recordBlock.getLength() && recordBlock.charAt(j) != '@') { j++; } j++; // skip the "@" } while (j < recordBlock.getLength()); return totalBytesRead; }
From source file:gov.jgi.meta.hadoop.input.FastqLineReader.java
License:Open Source License
/** * Read one line from the InputStream into the given Text. A line * can be terminated by one of the following: '\n' (LF) , '\r' (CR), * or '\r\n' (CR+LF). EOF also terminates an otherwise unterminated * line.//from ww w. j a v a2s . c o m * * @param str the object to store the given line (without newline) * @param maxLineLength the maximum number of bytes to store into str; * the rest of the line is silently discarded. * @param maxBytesToConsume the maximum number of bytes to consume * in this call. This is only a hint, because if the line cross * this threshold, we allow it to happen. It can overshoot * potentially by as much as one buffer length. * @return the number of bytes read including the (longest) newline * found. * @throws java.io.IOException if the underlying stream throws */ public int readLine(Text key, Text str, int maxLineLength, int maxBytesToConsume) throws IOException { int totalBytesRead = 0; int numRecordsRead = 0; Boolean eof = false; int startPosn; Text recordBlock = new Text(); /* first thing to do is to move forward till you see a start character */ startPosn = bufferPosn; do { if (bufferPosn >= bufferLength) { totalBytesRead += bufferPosn - startPosn; bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) { eof = true; break; // EOF } } } while (buffer[bufferPosn++] != '@'); /* if we hit the end of file already, then just return 0 bytes processed */ if (eof) return totalBytesRead; /* now bufferPosn should be at the start of a fastq record */ totalBytesRead += (bufferPosn - 1) - startPosn; startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@" /* find the next record start */ eof = false; int numOfNewlines = 0;//Added by lanhin do { if (bufferPosn >= bufferLength) { /* copy the current buffer before refreshing the buffer */ int appendLength = bufferPosn - startPosn; recordBlock.append(buffer, startPosn, appendLength); totalBytesRead += appendLength; startPosn = bufferPosn = 0; bufferLength = in.read(buffer); if 
(bufferLength <= 0) { eof = true; break; // EOF } } //Modefied by lanhin if (buffer[bufferPosn] == CR || buffer[bufferPosn] == LF) { numOfNewlines++; } if ((numOfNewlines >= 4) && buffer[bufferPosn] == '@') { bufferPosn++; break; } bufferPosn++; } while (true);//buffer[bufferPosn++] != '@'); // only read one record at a time //Modefied by lanhin end if (!eof) { bufferPosn--; // make sure we leave bufferPosn pointing to the next record int appendLength = bufferPosn - startPosn; recordBlock.append(buffer, startPosn, appendLength); totalBytesRead += appendLength; } /* record block now has the byte array we want to process for reads */ int i = 1; // skip initial record seperator "@" int j = 1; do { key.clear(); str.clear(); /* first parse the key */ i = j; Boolean junkOnLine = false; while (j < recordBlock.getLength()) { int c = recordBlock.charAt(j++); if (c == CR || c == LF) { break; } else if (c == ' ' || c == '\t') { junkOnLine = true; break; } } key.append(recordBlock.getBytes(), i, j - i - 1); /* in case there is additional metadata on the header line, ignore everything after the first word. 
*/ if (junkOnLine) { while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF) j++; } //LOG.info ("key = " + k.toString()); /* now skip the newlines */ while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) j++; /* now read the sequence */ do { i = j; while (j < recordBlock.getLength()) { int c = recordBlock.charAt(j++); if (c == CR || c == LF) { break; } } str.append(recordBlock.getBytes(), i, j - i - 1); while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) j++; } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+'); numRecordsRead++; /* now skip characters (newline or carige return most likely) till record start */ while (j < recordBlock.getLength()) { // && recordBlock.charAt(j) != '@') { // Modified by lanhin /* Should go straight to the end of recordBlock, ignore all the left info. --lanhin*/ j++; } j++; // skip the "@" } while (j < recordBlock.getLength()); return totalBytesRead; }
From source file:io.aos.hdfs.StringTextComparisonTest.java
License:Apache License
/**
 * Checks that {@code Text} reports length, {@code find} results and
 * {@code charAt} positions in terms of the UTF-8 encoding: the four
 * characters of the sample sit at offsets 0, 1, 3 and 6, for a total
 * length of 10.
 */
@Test
public void text() {
    Text sample = new Text("\u0041\u00DF\u6771\uD801\uDC00");

    // Overall encoded length.
    assertThat(sample.getLength(), is(10));

    // Offset at which each character starts.
    assertThat(sample.find("\u0041"), is(0));
    assertThat(sample.find("\u00DF"), is(1));
    assertThat(sample.find("\u6771"), is(3));
    assertThat(sample.find("\uD801\uDC00"), is(6));

    // Code point found at each of those offsets.
    assertThat(sample.charAt(0), is(0x0041));
    assertThat(sample.charAt(1), is(0x00DF));
    assertThat(sample.charAt(3), is(0x6771));
    assertThat(sample.charAt(6), is(0x10400));
}
From source file:io.aos.hdfs.TextTest.java
License:Apache License
@Test public void test() throws IOException { // vv TextTest Text t = new Text("hadoop"); assertThat(t.getLength(), is(6));/*from w w w . jav a 2s . c o m*/ assertThat(t.getBytes().length, is(6)); assertThat(t.charAt(2), is((int) 'd')); assertThat("Out of bounds", t.charAt(100), is(-1)); // ^^ TextTest }
From source file:io.aos.hdfs.TextTest.java
License:Apache License
/**
 * Contrasts {@code String} indexing with {@code Text} indexing on a string
 * that ends in a supplementary character (a surrogate pair).
 *
 * <p>The {@code String} assertions address each {@code char} individually,
 * including both halves of the surrogate pair. The {@code Text} assertions
 * look characters up through {@code find} and expect {@code charAt} to
 * return the full code point, 0x10400 for the pair.
 */
@Test
public void withSupplementaryCharacters() throws IOException {
    // --- java.lang.String view ---
    String utf16 = "\u0041\u00DF\u6771\uD801\uDC00";
    assertThat(utf16.length(), is(5));
    assertThat(utf16.getBytes("UTF-8").length, is(10));

    assertThat(utf16.indexOf('\u0041'), is(0));
    assertThat(utf16.indexOf('\u00DF'), is(1));
    assertThat(utf16.indexOf('\u6771'), is(2));
    assertThat(utf16.indexOf('\uD801'), is(3));
    assertThat(utf16.indexOf('\uDC00'), is(4));

    assertThat(utf16.charAt(0), is('\u0041'));
    assertThat(utf16.charAt(1), is('\u00DF'));
    assertThat(utf16.charAt(2), is('\u6771'));
    assertThat(utf16.charAt(3), is('\uD801'));
    assertThat(utf16.charAt(4), is('\uDC00'));

    // --- org.apache.hadoop.io.Text view ---
    Text utf8 = new Text("\u0041\u00DF\u6771\uD801\uDC00");
    assertThat(serializeToString(utf8), is("0a41c39fe69db1f0909080"));

    assertThat(utf8.charAt(utf8.find("\u0041")), is(0x0041));
    assertThat(utf8.charAt(utf8.find("\u00DF")), is(0x00DF));
    assertThat(utf8.charAt(utf8.find("\u6771")), is(0x6771));
    assertThat(utf8.charAt(utf8.find("\uD801\uDC00")), is(0x10400));
}