Example usage for org.apache.hadoop.io Text append

Introduction

On this page you can find example usages of org.apache.hadoop.io Text append.

Prototype

public void append(byte[] utf8, int start, int len)

Source Link

Document

Append a range of bytes to the end of this text.

Usage

From source file:Importer.java

License:Open Source License

/**
 * Converts one input file into a block-compressed SequenceFile named
 * {@code <file name>.seq} under {@code outDir}.
 *
 * <p>The input is read line by line (decoded as US-ASCII). Lines are joined with
 * a single space into the current document until a line starting with
 * {@code ---END.OF.DOCUMENT---} is seen; the document is then written as a
 * record whose key is {@code hash(content)}. Trailing content longer than five
 * bytes after the last marker is written as a final record.
 *
 * <p>Files whose name ends in {@code .bz2} are decompressed on the fly; the two
 * leading header bytes are skipped before the stream is handed to
 * {@code CBZip2InputStream}.
 *
 * <p>Side effects: assigns the static {@code seqFileWriter}, adds to
 * {@code totalBytes} and {@code totalRecords}, and prints a summary to stdout.
 *
 * @param file the input file to convert
 * @throws Exception on any I/O, URI or codec failure; all opened streams are
 *         closed before the exception propagates
 */
public static void copyFile(File file) throws Exception {
    File destFile = new File(outDir, file.getName() + ".seq");
    Path dest = new Path(destFile.getAbsolutePath());

    Configuration conf = new Configuration();
    FileSystem fileSys = org.apache.hadoop.fs.FileSystem.get(new java.net.URI(conf.get("fs.default.name")),
            conf);
    CompressionCodec codec = new DefaultCodec();
    fileSys.mkdirs(dest.getParent());
    FSDataOutputStream outputStr = fileSys.create(dest);
    // Track the writer and input locally so the cleanup below only closes
    // what this invocation actually opened.
    SequenceFile.Writer writer = null;
    InputStream in = null;
    try {
        writer = SequenceFile.createWriter(conf, outputStr, Text.class, Text.class,
                SequenceFile.CompressionType.BLOCK, codec);
        seqFileWriter = writer;
        String filename = file.getName();
        in = new BufferedInputStream(new FileInputStream(file));
        if (filename.endsWith(".bz2")) {
            in.read();
            in.read(); //snarf header
            in = new CBZip2InputStream(in);
        }
        BufferedReader br = new BufferedReader(new InputStreamReader(in, "US-ASCII"));

        System.out.println("working on file " + file);
        int records = 0;
        long bytes = 0;
        long startTime = System.currentTimeMillis();
        String s = null;
        Text content = new Text();
        while ((s = br.readLine()) != null) {
            if (s.startsWith("---END.OF.DOCUMENT---")) {
                Text name = new Text(hash(content));
                writer.append(name, content);
                records++;
                content = new Text();
            } else {
                // Text stores UTF-8, so encode explicitly instead of relying on
                // the platform default charset.
                byte[] line_as_bytes = (s + " ").getBytes("UTF-8");
                for (byte b : line_as_bytes) {
                    // Java bytes are signed: a set high bit shows up as a negative
                    // value, so "b < 128" could never fail.
                    assert b >= 0 : "found an unexpected high-bit set";
                }

                content.append(line_as_bytes, 0, line_as_bytes.length);
                bytes += line_as_bytes.length;
            }
        } //end while
        if (content.getLength() > 5) {
            Text name = new Text(hash(content));
            writer.append(name, content);
            records++;
        }
        totalBytes += bytes;
        totalRecords += records;
        // +1 avoids a division by zero for files processed in under a second
        long time = (System.currentTimeMillis() - startTime) / 1000 + 1;
        long kbSec = bytes / 1024 / time;
        System.out.println(new java.util.Date());
        System.out.println("File " + file.getName() + " " + records + " records, " + bytes + " bytes in " + time
                + " seconds (" + kbSec + " KB/sec).");
    } finally {
        // Close in the original order (input, writer, raw output); nested
        // finally blocks ensure one failing close does not skip the others.
        try {
            if (in != null) {
                in.close();
            }
        } finally {
            try {
                if (writer != null) {
                    writer.close();
                }
            } finally {
                outputStr.close();
            }
        }
    }
}

From source file:TweetTweetTweet.java

License:Open Source License

/**
 * Passes {@code text} to {@code toText} of the three tweets in order,
 * appending the {@code Tab} bytes after the first and after the second.
 *
 * @param text the text instance handed to each tweet and appended to
 * @return the same {@code text} instance that was passed in
 */
@Override
public Text toText(Text text) {
    final byte[] separator = Tab;

    tweet1.toText(text);
    text.append(separator, 0, separator.length);

    tweet2.toText(text);
    text.append(separator, 0, separator.length);

    tweet3.toText(text);
    return text;
}

From source file:accumulo.ingest.AbstractAccumuloCsvIngest.java

License:Apache License

/**
 * Rebuilds {@code buffer} as the bytes of {@code fileName} followed by the
 * bytes {@code lex.encode(recordCount)} returns.
 *
 * @param buffer text to overwrite; it is cleared before anything is appended
 * @param fileName text whose valid bytes form the first part of the row id
 * @param recordCount value encoded via {@code lex} to form the suffix
 */
protected void setRowId(Text buffer, Text fileName, long recordCount) {
    // Encode first so a failing encoder leaves the buffer untouched.
    final byte[] encodedCount = lex.encode(recordCount);

    buffer.clear();

    // Only the first getLength() bytes of the backing array are valid content.
    final byte[] nameBytes = fileName.getBytes();
    buffer.append(nameBytes, 0, fileName.getLength());
    buffer.append(encodedCount, 0, encodedCount.length);
}

From source file:brush.FastqRecordReader.java

License:Apache License

/**
 * Parses a read from an interleaved FASTQ file.
 *
 * Only reads a single record./* www .  jav a 2 s.  c o m*/
 *
 * @param readName Text record containing read name. Output parameter.
 * @param value Text record containing full record. Output parameter.
 * @return Returns true if read was successful (did not hit EOF).
 *
 * @throws RuntimeException Throws exception if FASTQ record doesn't
 *   have proper formatting (e.g., record doesn't start with @).
 */
protected boolean lowLevelFastqRead(Text readName, Text value) throws IOException {
    // ID line
    readName.clear();
    long skipped = appendLineInto(readName, true);
    pos += skipped;
    if (skipped == 0) {
        return false; // EOF
    }

    if (readName.getBytes()[0] != '@') {
        throw new RuntimeException("unexpected fastq record didn't start with '@' at " + makePositionMessage()
                + ". Line: " + readName + ". \n");
    }

    value.append(readName.getBytes(), 0, readName.getLength());

    // sequence
    appendLineInto(value, false);

    // separator line
    appendLineInto(value, false);

    // quality
    appendLineInto(value, false);

    return true;
}

From source file:brush.FastqRecordReader.java

License:Apache License

/**
 * Reads a newline into a text record from the underlying line reader.
 *
 * @param dest Text record to read line into.
 * @param eofOk Whether an EOF is acceptable in this line.
 * @return Returns the number of bytes read.
 *
 * @throws EOFException Throws if eofOk was false and we hit an EOF in
 *    the current line./*from w w  w .j  av a 2 s  .  c om*/
 */
private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException {
    Text buf = new Text();
    int bytesRead = lineReader.readLine(buf, MAX_LINE_LENGTH);

    if (bytesRead < 0 || (bytesRead == 0 && !eofOk))
        throw new EOFException();

    dest.append(buf.getBytes(), 0, buf.getLength());
    dest.append(newline, 0, 1);
    pos += bytesRead;

    return bytesRead;
}

From source file:com.alexholmes.hadooputils.sort.DelimitedLineReader.java

License:Apache License

/**
 * Read a line terminated by one of CR, LF, or CRLF.
 *
 * @param str destination text; cleared first, then filled with the bytes of
 *            the line, excluding the terminator
 * @param maxLineLength maximum number of bytes stored into {@code str};
 *            further bytes of the line are still consumed but not stored
 * @param maxBytesToConsume once at least this many bytes have been consumed
 *            without finding a terminator, no further buffer is processed
 * @return the number of bytes consumed, including the terminator if one was
 *         found; 0 when nothing could be read
 * @throws IOException if more than {@code Integer.MAX_VALUE} bytes were
 *         consumed; presumably also propagated from {@code fillBuffer}
 */
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR.  In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need consume LF as well, so next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; //tracks str.getLength(), as an optimization
    int newlineLength = 0; //length of terminating newline
    boolean prevCharCR = false; //true if prev char was CR
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; //starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            // buffer exhausted: restart at its beginning and refill it
            startPosn = bufferPosn = 0;
            if (prevCharCR) {
                ++bytesConsumed; //account for CR from previous read
            }
            bufferLength = fillBuffer(in, buffer, prevCharCR);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
            if (buffer[bufferPosn] == LF) {
                newlineLength = (prevCharCR) ? 2 : 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
            if (prevCharCR) { //CR + notLF, we are at notLF
                // the lone CR terminated the line; the current byte is left
                // unconsumed because bufferPosn is not advanced past it
                newlineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPosn] == CR);
        }
        int readLength = bufferPosn - startPosn;
        if (prevCharCR && newlineLength == 0) {
            --readLength; //CR at the end of the buffer
        }
        bytesConsumed += readLength;
        // bytes of line content in this pass, i.e. without the terminator
        int appendLength = readLength - newlineLength;
        if (appendLength > maxLineLength - txtLength) {
            // truncate what is stored; the excess stays counted as consumed
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    // the count is accumulated as a long but returned as an int
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}

From source file:com.alexholmes.hadooputils.sort.DelimitedLineReader.java

License:Apache License

/**
 * Read a line terminated by a custom delimiter.
 *
 * @param str destination text; cleared first, then filled with the bytes
 *            that precede the delimiter
 * @param maxLineLength maximum number of bytes taken from the buffer into
 *            {@code str}; further bytes are still consumed but not stored
 * @param maxBytesToConsume once at least this many bytes have been consumed
 *            without completing a delimiter, no further buffer is processed
 * @return the number of bytes consumed
 * @throws IOException if more than {@code Integer.MAX_VALUE} bytes were
 *         consumed; presumably also propagated from {@code fillBuffer}
 */
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from inputStream, but the head of the stream may be
     * already captured in the previous buffer, so we have several cases:
     *
     * 1. The buffer tail does not contain any character sequence which
     *    matches the head of the delimiter. We count it as
     *    ambiguous byte count = 0
     *
     * 2. The buffer tail contains X characters that form a sequence
     *    matching the head of the delimiter.
     *    We count ambiguous byte count = X
     *
     *    // ***  eg: A segment of input file is as follows
     *
     *    " record 1792: I found this bug very interesting and
     *     I have completely read about it. record 1793: This bug
     *     can be solved easily record 1794: This ."
     *
     *    delimiter = "record";
     *
     *    supposing:- String at the end of buffer =
     *    "I found this bug very interesting and I have completely re"
     *    Therefore next buffer = "ad about it. record 179       ...."
     *
     *     The matching characters in the input
     *     buffer tail and delimiter head = "re"
     *     Therefore, ambiguous byte count = 2 ****   //
     *
     *     2.1 If the following bytes are the remaining characters of
     *         the delimiter, then we have to capture only up to the starting
     *         position of delimiter. That means, we need not include the
     *         ambiguous characters in str.
     *
     *     2.2 If the following bytes are not the remaining characters of
     *         the delimiter ( as mentioned in the example ),
     *         then we have to include the ambiguous characters in str.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    long bytesConsumed = 0;
    int delPosn = 0; // number of delimiter bytes matched so far
    int ambiguousByteCount = 0; // To capture the ambiguous characters count
    do {
        int startPosn = bufferPosn; // Start from previous end position
        if (bufferPosn >= bufferLength) {
            // buffer exhausted: restart at its beginning and refill it
            startPosn = bufferPosn = 0;
            bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0);
            if (bufferLength <= 0) {
                // EOF after a partial delimiter match: those bytes were data
                str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
                delPosn++;
                if (delPosn >= recordDelimiterBytes.length) {
                    // full delimiter matched; resume after it on the next call
                    bufferPosn++;
                    break;
                }
            } else if (delPosn != 0) {
                // Partial match failed: step back so that, after the loop's
                // ++, the current byte is re-tested against the delimiter start.
                // NOTE(review): only the current byte is re-tested; bytes inside
                // the abandoned partial match are not rescanned, so a delimiter
                // that begins within them (e.g. delimiter "aab" in input "aaab")
                // appears to be missed - verify.
                bufferPosn--;
                delPosn = 0;
            }
        }
        int readLength = bufferPosn - startPosn;
        bytesConsumed += readLength;
        // exclude the (possibly partial) delimiter bytes matched at the end
        int appendLength = readLength - delPosn;
        if (appendLength > maxLineLength - txtLength) {
            // truncate what is stored; the excess stays counted as consumed
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            if (ambiguousByteCount > 0) {
                str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                //appending the ambiguous characters (refer case 2.2)
                bytesConsumed += ambiguousByteCount;
                ambiguousByteCount = 0;
            }
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
        if (bufferPosn >= bufferLength) {
            // buffer ended in the middle of a possible delimiter: remember how
            // many bytes are undecided until the next buffer is inspected
            if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
                ambiguousByteCount = delPosn;
                bytesConsumed -= ambiguousByteCount; //to be consumed in next
            }
        }
    } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume);
    // the count is accumulated as a long but returned as an int
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}

From source file:com.asakusafw.runtime.io.line.BasicLineInput.java

License:Apache License

/**
 * Encodes the next {@code len} characters of {@code charBuffer} with
 * {@code encoder} and appends the resulting bytes to {@code entity}.
 *
 * <p>The char buffer's limit is narrowed to the requested range while
 * encoding and is restored afterwards, also when encoding fails. Encoding is
 * repeated through {@code byteBuffer} until the encoder reports underflow, so
 * ranges larger than the byte buffer are handled in several passes.
 *
 * @param entity the text to append the encoded bytes to
 * @param len the number of characters to encode, starting at the char
 *        buffer's current position
 * @throws IOException if the characters cannot be encoded
 */
private void append(Text entity, int len) throws IOException {
    ByteBuffer bs = byteBuffer;
    CharBuffer cs = charBuffer;
    int limit = cs.limit();
    // slice the buffer
    cs.limit(cs.position() + len);
    try {
        while (true) {
            bs.clear();
            CoderResult result = encoder.encode(cs, bs, true);
            if (!result.isError()) {
                bs.flip();
                // Address the backing array through arrayOffset() and pass a
                // length (remaining), not the limit, as the third argument.
                entity.append(bs.array(), bs.arrayOffset() + bs.position(), bs.remaining());
                if (result.isUnderflow()) {
                    break;
                }
                // overflow: the byte buffer was filled, encode the rest
            } else {
                try {
                    result.throwException();
                } catch (CharacterCodingException e) {
                    throw new IOException(MessageFormat.format("exception occurred while encoding text: {0}", path),
                            e);
                }
            }
        }
    } finally {
        // undo the slice even if encoding failed
        cs.limit(limit);
    }
}

From source file:com.asakusafw.runtime.io.line.Utf8LineInput.java

License:Apache License

/**
 * Appends the bytes from the current buffer offset up to the next CR, LF or
 * the buffer limit to {@code entity}, and advances {@code bufferOffset} past
 * the appended bytes and past the line break byte, if one was found.
 *
 * <p>If the previous line break was a CR and the first byte is an LF, that
 * LF is skipped first. {@code sawCr} is only updated when a line break byte
 * is found.
 * NOTE(review): {@code sawCr} is not cleared here when no line break is
 * found; presumably the caller accounts for that - verify.
 *
 * @param entity the text to append the scanned bytes to
 * @return {@code LINE_BREAK} if a CR or LF was reached; otherwise
 *         {@code CONTINUE} if bytes were appended, or {@code NOTHING} if not
 */
private State appendBufferTo(Text entity) {
    assert bufferOffset < bufferLimit;
    final byte[] bytes = buffer;
    // skip LF after CR
    if (sawCr && bytes[bufferOffset] == '\n') {
        bufferOffset++;
    }

    final int start = bufferOffset;
    final int limit = bufferLimit;

    // find the first CR/LF, or stop at the buffer limit
    int end = start;
    while (end < limit && bytes[end] != '\r' && bytes[end] != '\n') {
        end++;
    }
    final boolean lineBreak = end < limit;
    if (lineBreak) {
        sawCr = bytes[end] == '\r';
    }
    final int length = end - start;

    // advance buffer cursor, consuming the line break byte as well
    bufferOffset = lineBreak ? end + 1 : end;
    assert bufferOffset <= bufferLimit;

    if (length > 0) {
        entity.append(bytes, start, length);
        return lineBreak ? State.LINE_BREAK : State.CONTINUE;
    }
    return lineBreak ? State.LINE_BREAK : State.NOTHING;
}

From source file:com.asakusafw.runtime.io.text.value.StringOptionFieldAdapter.java

License:Apache License

/**
 * Resets {@code property} and fills its text with the bytes produced by
 * encoding {@code contents} with {@code encoder}.
 *
 * <p>The contents are encoded through {@code encodeBuffer} in as many passes
 * as needed, and the encoder is flushed once after all characters have been
 * consumed. Flushing inside the encode loop would leave the encoder in its
 * flushed state, so the next pass over contents larger than the buffer would
 * fail with an {@code IllegalStateException}.
 *
 * @param contents the characters to store
 * @param property the option to reset and fill
 * @throws IllegalArgumentException if the contents cannot be encoded
 * @throws IllegalStateException if the encode buffer is too small for the
 *         encoder to make any progress
 */
@Override
protected void doParse(CharSequence contents, StringOption property) {
    property.reset();
    Text text = property.get();
    CharBuffer cbuf = CharBuffer.wrap(contents);
    ByteBuffer bbuf = encodeBuffer;
    CharsetEncoder enc = encoder;
    enc.reset();
    if (!cbuf.hasRemaining()) {
        // Nothing to encode; flush() is not legal on a freshly reset encoder.
        return;
    }
    while (cbuf.hasRemaining()) {
        bbuf.clear();
        CoderResult result = enc.encode(cbuf, bbuf, true);
        if (result.isError()) {
            throw new IllegalArgumentException(
                    MessageFormat.format("cannot map input string to UTF-8: {0}", TextUtil.quote(contents)));
        }
        bbuf.flip();
        if (bbuf.hasRemaining()) {
            text.append(bbuf.array(), bbuf.arrayOffset() + bbuf.position(), bbuf.remaining());
        } else if (result.isOverflow()) {
            // overflow without any output would repeat forever
            throw new IllegalStateException("encode buffer is too small");
        }
    }
    // All input is consumed: flush once, repeating only while the buffer overflows.
    CoderResult flushed;
    do {
        bbuf.clear();
        flushed = enc.flush(bbuf);
        bbuf.flip();
        if (bbuf.hasRemaining()) {
            text.append(bbuf.array(), bbuf.arrayOffset() + bbuf.position(), bbuf.remaining());
        } else if (flushed.isOverflow()) {
            throw new IllegalStateException("encode buffer is too small");
        }
    } while (flushed.isOverflow());
}