Usage examples for org.apache.hadoop.io.Text.append
public void append(byte[] utf8, int start, int len)
From source file:Importer.java
License:Open Source License
public static void copyFile(File file) throws Exception { // String TEST_PREFIX = ""; File destFile = new File(outDir, file.getName() + ".seq"); Path dest = new Path(destFile.getAbsolutePath()); Configuration conf = new Configuration(); FileSystem fileSys = org.apache.hadoop.fs.FileSystem.get(new java.net.URI(conf.get("fs.default.name")), conf);/* w w w . j a v a 2s. c om*/ CompressionCodec codec = new DefaultCodec(); fileSys.mkdirs(dest.getParent()); FSDataOutputStream outputStr = fileSys.create(dest); seqFileWriter = SequenceFile.createWriter(conf, outputStr, Text.class, Text.class, SequenceFile.CompressionType.BLOCK, codec); String filename = file.getName(); InputStream in = new BufferedInputStream(new FileInputStream(file)); if (filename.endsWith(".bz2")) { in.read(); in.read(); //snarf header in = new CBZip2InputStream(in); } BufferedReader br = new BufferedReader(new InputStreamReader(in, "US-ASCII")); System.out.println("working on file " + file); int records = 0; long bytes = 0, bytes_since_status = 0; long startTime = System.currentTimeMillis(); String s = null; Text content = new Text(); while ((s = br.readLine()) != null) { if (s.startsWith("---END.OF.DOCUMENT---")) { Text name = new Text(hash(content)); seqFileWriter.append(name, content); records++; content = new Text(); } else { byte[] line_as_bytes = (s + " ").getBytes(); for (byte b : line_as_bytes) { assert b < 128 : "found an unexpected high-bit set"; } content.append(line_as_bytes, 0, line_as_bytes.length); bytes += line_as_bytes.length; /* bytes_since_status += line_as_bytes.length; if(bytes_since_status > 10 * 1024 * 1024) { //every 10 MB System.err.print('.'); bytes_since_status = 0; }*/ } } //end while if (content.getLength() > 5) { Text name = new Text(hash(content)); seqFileWriter.append(name, content); records++; } totalBytes += bytes; totalRecords += records; long time = (System.currentTimeMillis() - startTime) / 1000 + 1; long kbSec = bytes / 1024 / time; System.out.println(new 
java.util.Date()); System.out.println("File " + file.getName() + " " + records + " records, " + bytes + " bytes in " + time + " seconds (" + kbSec + " KB/sec)."); in.close(); seqFileWriter.close(); outputStr.close(); }
From source file:TweetTweetTweet.java
License:Open Source License
@Override public Text toText(Text text) { tweet1.toText(text);// w w w. j a va 2 s.c om text.append(Tab, 0, Tab.length); tweet2.toText(text); text.append(Tab, 0, Tab.length); tweet3.toText(text); return text; }
From source file:accumulo.ingest.AbstractAccumuloCsvIngest.java
License:Apache License
/**
 * Rebuilds {@code buffer} as a row id: the bytes of {@code fileName}
 * followed by the bytes {@code lex.encode(recordCount)} produces.
 *
 * @param buffer      reused output buffer; its previous content is discarded
 * @param fileName    source of the row id prefix
 * @param recordCount value encoded into the row id suffix
 */
protected void setRowId(Text buffer, Text fileName, long recordCount) {
    final byte[] encodedCount = lex.encode(recordCount);

    buffer.clear();

    // Text's backing array may be longer than its content, so pair it with getLength().
    final byte[] nameBytes = fileName.getBytes();
    final int nameLength = fileName.getLength();
    buffer.append(nameBytes, 0, nameLength);

    buffer.append(encodedCount, 0, encodedCount.length);
}
From source file:brush.FastqRecordReader.java
License:Apache License
/** * Parses a read from an interleaved FASTQ file. * * Only reads a single record./* www . jav a 2 s. c o m*/ * * @param readName Text record containing read name. Output parameter. * @param value Text record containing full record. Output parameter. * @return Returns true if read was successful (did not hit EOF). * * @throws RuntimeException Throws exception if FASTQ record doesn't * have proper formatting (e.g., record doesn't start with @). */ protected boolean lowLevelFastqRead(Text readName, Text value) throws IOException { // ID line readName.clear(); long skipped = appendLineInto(readName, true); pos += skipped; if (skipped == 0) { return false; // EOF } if (readName.getBytes()[0] != '@') { throw new RuntimeException("unexpected fastq record didn't start with '@' at " + makePositionMessage() + ". Line: " + readName + ". \n"); } value.append(readName.getBytes(), 0, readName.getLength()); // sequence appendLineInto(value, false); // separator line appendLineInto(value, false); // quality appendLineInto(value, false); return true; }
From source file:brush.FastqRecordReader.java
License:Apache License
/** * Reads a newline into a text record from the underlying line reader. * * @param dest Text record to read line into. * @param eofOk Whether an EOF is acceptable in this line. * @return Returns the number of bytes read. * * @throws EOFException Throws if eofOk was false and we hit an EOF in * the current line./*from w w w .j av a 2 s . c om*/ */ private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException { Text buf = new Text(); int bytesRead = lineReader.readLine(buf, MAX_LINE_LENGTH); if (bytesRead < 0 || (bytesRead == 0 && !eofOk)) throw new EOFException(); dest.append(buf.getBytes(), 0, buf.getLength()); dest.append(newline, 0, 1); pos += bytesRead; return bytesRead; }
From source file:com.alexholmes.hadooputils.sort.DelimitedLineReader.java
License:Apache License
/**
 * Read a line terminated by one of CR, LF, or CRLF.
 *
 * <p>The line content (without its terminator) is stored in {@code str},
 * truncated to at most {@code maxLineLength} bytes. Reading continues across
 * buffer refills until a terminator is found, the stream is exhausted, or
 * at least {@code maxBytesToConsume} bytes have been consumed.
 *
 * @param str               output record; cleared before reading
 * @param maxLineLength     maximum number of bytes stored into {@code str}
 * @param maxBytesToConsume consumption threshold after which the loop stops
 *                          even if no terminator has been seen
 * @return the number of bytes consumed, terminator included
 * @throws IOException if refilling the buffer fails, or if more than
 *                     {@code Integer.MAX_VALUE} bytes were consumed
 */
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR. In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need consume LF as well, so next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    int newlineLength = 0; // length of terminating newline
    boolean prevCharCR = false; // true if prev char was CR
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; // starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            // buffer exhausted: refill it and restart scanning from its beginning
            startPosn = bufferPosn = 0;
            if (prevCharCR) {
                ++bytesConsumed; // account for CR from previous read
            }
            bufferLength = fillBuffer(in, buffer, prevCharCR);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
            if (buffer[bufferPosn] == LF) {
                newlineLength = (prevCharCR) ? 2 : 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
            if (prevCharCR) { // CR + notLF, we are at notLF
                newlineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPosn] == CR);
        }
        int readLength = bufferPosn - startPosn;
        if (prevCharCR && newlineLength == 0) {
            --readLength; // CR at the end of the buffer
        }
        bytesConsumed += readLength;
        int appendLength = readLength - newlineLength;
        // never store more than maxLineLength bytes in total; the excess is consumed but dropped
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    // the count is tracked as a long but returned as an int, so reject values that would not fit
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}
From source file:com.alexholmes.hadooputils.sort.DelimitedLineReader.java
License:Apache License
/** * Read a line terminated by a custom delimiter. *//*from w w w.ja va 2 s . c om*/ private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from inputStream, but the head of the stream may be * already captured in the previous buffer, so we have several cases: * * 1. The buffer tail does not contain any character sequence which * matches with the head of delimiter. We count it as a * ambiguous byte count = 0 * * 2. The buffer tail contains a X number of characters, * that forms a sequence, which matches with the * head of delimiter. We count ambiguous byte count = X * * // *** eg: A segment of input file is as follows * * " record 1792: I found this bug very interesting and * I have completely read about it. record 1793: This bug * can be solved easily record 1794: This ." * * delimiter = "record"; * * supposing:- String at the end of buffer = * "I found this bug very interesting and I have completely re" * There for next buffer = "ad about it. record 179 ...." * * The matching characters in the input * buffer tail and delimiter head = "re" * Therefore, ambiguous byte count = 2 **** // * * 2.1 If the following bytes are the remaining characters of * the delimiter, then we have to capture only up to the starting * position of delimiter. That means, we need not include the * ambiguous characters in str. * * 2.2 If the following bytes are not the remaining characters of * the delimiter ( as mentioned in the example ), * then we have to include the ambiguous characters in str. 
*/ str.clear(); int txtLength = 0; // tracks str.getLength(), as an optimization long bytesConsumed = 0; int delPosn = 0; int ambiguousByteCount = 0; // To capture the ambiguous characters count do { int startPosn = bufferPosn; // Start from previous end position if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0); if (bufferLength <= 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) { delPosn++; if (delPosn >= recordDelimiterBytes.length) { bufferPosn++; break; } } else if (delPosn != 0) { bufferPosn--; delPosn = 0; } } int readLength = bufferPosn - startPosn; bytesConsumed += readLength; int appendLength = readLength - delPosn; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { if (ambiguousByteCount > 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); //appending the ambiguous characters (refer case 2.2) bytesConsumed += ambiguousByteCount; ambiguousByteCount = 0; } str.append(buffer, startPosn, appendLength); txtLength += appendLength; } if (bufferPosn >= bufferLength) { if (delPosn > 0 && delPosn < recordDelimiterBytes.length) { ambiguousByteCount = delPosn; bytesConsumed -= ambiguousByteCount; //to be consumed in next } } } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long) Integer.MAX_VALUE) { throw new IOException("Too many bytes before delimiter: " + bytesConsumed); } return (int) bytesConsumed; }
From source file:com.asakusafw.runtime.io.line.BasicLineInput.java
License:Apache License
private void append(Text entity, int len) throws IOException { ByteBuffer bs = byteBuffer;/*from w ww . ja va2 s . c o m*/ CharBuffer cs = charBuffer; int limit = cs.limit(); // slice the buffer cs.limit(cs.position() + len); while (true) { bs.clear(); CoderResult result = encoder.encode(cs, bs, true); if (result.isError() == false) { bs.flip(); entity.append(bs.array(), bs.position(), bs.limit()); if (result.isUnderflow()) { break; } } else { assert result.isError(); try { result.throwException(); } catch (CharacterCodingException e) { throw new IOException(MessageFormat.format("exception occurred while encoding text: {0}", path), e); } } } cs.limit(limit); }
From source file:com.asakusafw.runtime.io.line.Utf8LineInput.java
License:Apache License
private State appendBufferTo(Text entity) { assert bufferOffset < bufferLimit; byte[] b = buffer; // skip LF after CR if (sawCr && b[bufferOffset] == '\n') { bufferOffset++;/*from w w w .j a va 2 s . c o m*/ } boolean eol = false; int lim = bufferLimit; int off = bufferOffset; int len = 0; // scan buffer until CR/LF/buffer limit for (int i = bufferOffset; i < lim; i++) { byte c = b[i]; if (c == '\r' || c == '\n') { eol = true; sawCr = c == '\r'; break; } else { len++; } } // advance buffer cursor bufferOffset += len + (eol ? 1 : 0); assert bufferOffset <= bufferLimit; if (len == 0) { return eol ? State.LINE_BREAK : State.NOTHING; } else { entity.append(b, off, len); return eol ? State.LINE_BREAK : State.CONTINUE; } }
From source file:com.asakusafw.runtime.io.text.value.StringOptionFieldAdapter.java
License:Apache License
@Override protected void doParse(CharSequence contents, StringOption property) { property.reset();// w w w.j ava 2 s .com Text text = property.get(); CharBuffer cbuf = CharBuffer.wrap(contents); ByteBuffer bbuf = encodeBuffer; CharsetEncoder enc = encoder; enc.reset(); while (cbuf.hasRemaining()) { bbuf.clear(); CoderResult result = enc.encode(cbuf, bbuf, true); if (result.isError()) { throw new IllegalArgumentException( MessageFormat.format("cannot map input string to UTF-8: {0}", TextUtil.quote(contents))); } bbuf.flip(); if (bbuf.hasRemaining()) { text.append(bbuf.array(), bbuf.arrayOffset() + bbuf.position(), bbuf.remaining()); } bbuf.clear(); enc.flush(bbuf); bbuf.flip(); if (bbuf.hasRemaining()) { text.append(bbuf.array(), bbuf.arrayOffset() + bbuf.position(), bbuf.remaining()); } } }