Example usage for org.apache.hadoop.io Text append

List of usage examples for org.apache.hadoop.io Text append

Introduction

On this page you can find example usages of org.apache.hadoop.io.Text.append.

Prototype

public void append(byte[] utf8, int start, int len) 

Source Link

Document

Appends a range of bytes to the end of the given text.

Usage

From source file:Importer.java

License:Open Source License

/**
 * Converts one text input file into a block-compressed SequenceFile named
 * {@code <file name>.seq} under {@code outDir}.
 *
 * <p>Input lines are accumulated (each followed by a single space) into one document until a
 * line starting with {@code ---END.OF.DOCUMENT---} is read; every document is written as a
 * {@code (hash(content), content)} pair of {@link Text} values. Content left over after the
 * last terminator line is written as a final record only if it is longer than 5 bytes.
 * The byte and record counts are added to {@code totalBytes} and {@code totalRecords}, and a
 * summary line is printed to standard output.
 *
 * <p>All streams opened here are closed even when reading or writing fails.
 *
 * @param file the input file; a name ending in {@code .bz2} is read through
 *             {@code CBZip2InputStream}
 * @throws Exception if the file system, the input file or the sequence file writer fails
 */
public static void copyFile(File file) throws Exception {
    File destFile = new File(outDir, file.getName() + ".seq");
    Path dest = new Path(destFile.getAbsolutePath());

    Configuration conf = new Configuration();
    FileSystem fileSys = org.apache.hadoop.fs.FileSystem.get(new java.net.URI(conf.get("fs.default.name")),
            conf);
    CompressionCodec codec = new DefaultCodec();
    fileSys.mkdirs(dest.getParent());
    FSDataOutputStream outputStr = fileSys.create(dest);
    try {
        seqFileWriter = SequenceFile.createWriter(conf, outputStr, Text.class, Text.class,
                SequenceFile.CompressionType.BLOCK, codec);
        try {
            String filename = file.getName();
            InputStream in = new BufferedInputStream(new FileInputStream(file));
            try {
                if (filename.endsWith(".bz2")) {
                    // Skip two leading bytes before handing the stream to the decompressor
                    // (presumably the "BZ" magic - confirm against the CBZip2InputStream in use).
                    in.read();
                    in.read();
                    in = new CBZip2InputStream(in);
                }
                BufferedReader br = new BufferedReader(new InputStreamReader(in, "US-ASCII"));

                System.out.println("working on file " + file);
                int records = 0;
                long bytes = 0;
                long startTime = System.currentTimeMillis();
                String s = null;
                Text content = new Text();
                while ((s = br.readLine()) != null) {
                    if (s.startsWith("---END.OF.DOCUMENT---")) {
                        Text name = new Text(hash(content));
                        seqFileWriter.append(name, content);
                        records++;
                        content = new Text();
                    } else {
                        // Text stores UTF-8, so encode explicitly instead of relying on the
                        // platform default charset.
                        byte[] line_as_bytes = (s + " ").getBytes("UTF-8");
                        for (byte b : line_as_bytes) {
                            // Java bytes are signed: a set high bit shows up as a negative
                            // value, so "b < 128" could never fail.
                            assert b >= 0 : "found an unexpected high-bit set";
                        }

                        content.append(line_as_bytes, 0, line_as_bytes.length);
                        bytes += line_as_bytes.length;
                    }
                } //end while
                if (content.getLength() > 5) {
                    Text name = new Text(hash(content));
                    seqFileWriter.append(name, content);
                    records++;
                }
                totalBytes += bytes;
                totalRecords += records;
                // "+ 1" keeps the divisor non-zero for files processed in under a second.
                long time = (System.currentTimeMillis() - startTime) / 1000 + 1;
                long kbSec = bytes / 1024 / time;
                System.out.println(new java.util.Date());
                System.out.println("File " + file.getName() + " " + records + " records, " + bytes + " bytes in " + time
                        + " seconds (" + kbSec + " KB/sec).");
            } finally {
                in.close();
            }
        } finally {
            seqFileWriter.close();
        }
    } finally {
        outputStr.close();
    }
}

From source file:TweetTweetTweet.java

License:Open Source License

/**
 * Passes {@code text} to each of the three tweets' {@code toText} in order, appending the
 * {@code Tab} bytes after the first and after the second.
 *
 * @param text the buffer handed to every tweet and extended with the separators
 * @return the same {@code text} instance that was passed in
 */
@Override
public Text toText(Text text) {
    final byte[] separator = Tab;

    tweet1.toText(text);
    text.append(separator, 0, separator.length);

    tweet2.toText(text);
    text.append(separator, 0, separator.length);

    tweet3.toText(text);
    return text;
}

From source file:accumulo.ingest.AbstractAccumuloCsvIngest.java

License:Apache License

/**
 * Rebuilds the row id held in {@code buffer}: the valid bytes of {@code fileName} followed by
 * the bytes {@code lex.encode} produces for {@code recordCount}.
 *
 * @param buffer      the Text that is cleared and then receives the row id
 * @param fileName    the Text whose first {@code getLength()} bytes form the row id prefix
 * @param recordCount the value encoded into the row id suffix
 */
protected void setRowId(Text buffer, Text fileName, long recordCount) {
    // Encode first so that a failure in the encoder leaves the buffer untouched.
    final byte[] encodedCount = lex.encode(recordCount);

    buffer.clear();

    final byte[] nameBytes = fileName.getBytes();
    final int nameLength = fileName.getLength();
    buffer.append(nameBytes, 0, nameLength);
    buffer.append(encodedCount, 0, encodedCount.length);
}

From source file:brush.FastqRecordReader.java

License:Apache License

/**
 * Parses a read from an interleaved FASTQ file.
 *
 * Only reads a single record./* www .  jav a 2 s.  c o m*/
 *
 * @param readName Text record containing read name. Output parameter.
 * @param value Text record containing full record. Output parameter.
 * @return Returns true if read was successful (did not hit EOF).
 *
 * @throws RuntimeException Throws exception if FASTQ record doesn't
 *   have proper formatting (e.g., record doesn't start with @).
 */
protected boolean lowLevelFastqRead(Text readName, Text value) throws IOException {
    // ID line
    readName.clear();
    long skipped = appendLineInto(readName, true);
    pos += skipped;
    if (skipped == 0) {
        return false; // EOF
    }

    if (readName.getBytes()[0] != '@') {
        throw new RuntimeException("unexpected fastq record didn't start with '@' at " + makePositionMessage()
                + ". Line: " + readName + ". \n");
    }

    value.append(readName.getBytes(), 0, readName.getLength());

    // sequence
    appendLineInto(value, false);

    // separator line
    appendLineInto(value, false);

    // quality
    appendLineInto(value, false);

    return true;
}

From source file:brush.FastqRecordReader.java

License:Apache License

/**
 * Reads a newline into a text record from the underlying line reader.
 *
 * @param dest Text record to read line into.
 * @param eofOk Whether an EOF is acceptable in this line.
 * @return Returns the number of bytes read.
 *
 * @throws EOFException Throws if eofOk was false and we hit an EOF in
 *    the current line./*from w w  w .j  av a 2 s  .  c om*/
 */
private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException {
    Text buf = new Text();
    int bytesRead = lineReader.readLine(buf, MAX_LINE_LENGTH);

    if (bytesRead < 0 || (bytesRead == 0 && !eofOk))
        throw new EOFException();

    dest.append(buf.getBytes(), 0, buf.getLength());
    dest.append(newline, 0, 1);
    pos += bytesRead;

    return bytesRead;
}

From source file:com.alexholmes.hadooputils.sort.DelimitedLineReader.java

License:Apache License

/**
 * Read a line terminated by one of CR, LF, or CRLF.
 *
 * @param str receives the line content without its terminator; it is cleared first
 * @param maxLineLength the maximum number of bytes stored into {@code str}; bytes beyond
 *        this are still consumed from the input but not appended
 * @param maxBytesToConsume reading stops once at least this many bytes have been consumed;
 *        the limit is only checked after each pass over the buffer, so it may be exceeded
 * @return the number of bytes consumed, including any terminator bytes that were consumed
 * @throws IOException if more than {@code Integer.MAX_VALUE} bytes were consumed before a
 *         terminator was found (and presumably if refilling the buffer fails)
 */
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR.  In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need to consume LF as well, so next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; //tracks str.getLength(), as an optimization
    int newlineLength = 0; //length of terminating newline
    boolean prevCharCR = false; //true if prev char was CR
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; //starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            // buffer exhausted: restart at its beginning and refill it
            startPosn = bufferPosn = 0;
            if (prevCharCR) {
                ++bytesConsumed; //account for CR from previous read
            }
            bufferLength = fillBuffer(in, buffer, prevCharCR);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
            if (buffer[bufferPosn] == LF) {
                newlineLength = (prevCharCR) ? 2 : 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
            if (prevCharCR) { //CR + notLF, we are at notLF
                // bufferPosn is left on the non-LF byte so the next call starts there
                newlineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPosn] == CR);
        }
        int readLength = bufferPosn - startPosn;
        if (prevCharCR && newlineLength == 0) {
            --readLength; //CR at the end of the buffer
        }
        bytesConsumed += readLength;
        // only the bytes before the terminator are candidates for str
        int appendLength = readLength - newlineLength;
        if (appendLength > maxLineLength - txtLength) {
            // truncate to the remaining capacity allowed by maxLineLength
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    // the running count is a long; refuse to narrow it if it does not fit the int return type
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}

From source file:com.alexholmes.hadooputils.sort.DelimitedLineReader.java

License:Apache License

/**
 * Read a line terminated by a custom delimiter.
 *
 * @param str receives the record content without the delimiter; it is cleared first
 * @param maxLineLength the maximum number of buffer bytes stored into {@code str}; bytes
 *        beyond this are still consumed from the input but not appended
 * @param maxBytesToConsume reading stops once at least this many bytes have been consumed;
 *        the limit is only checked after each pass over the buffer, so it may be exceeded
 * @return the number of bytes counted as consumed by this call
 * @throws IOException if more than {@code Integer.MAX_VALUE} bytes were consumed before the
 *         delimiter was found (and presumably if refilling the buffer fails)
 */
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /* We're reading data from inputStream, but the head of the stream may be
     * already captured in the previous buffer, so we have several cases:
     *
     * 1. The buffer tail does not contain any character sequence which
     *    matches the head of the delimiter. We count this as
     *    ambiguous byte count = 0.
     *
     * 2. The buffer tail contains X characters that form a sequence
     *    matching the head of the delimiter. We count this as
     *    ambiguous byte count = X.
     *
     *    Example: a segment of the input file is as follows
     *
     *    " record 1792: I found this bug very interesting and
     *     I have completely read about it. record 1793: This bug
     *     can be solved easily record 1794: This ."
     *
     *    delimiter = "record";
     *
     *    Suppose the string at the end of the buffer is
     *    "I found this bug very interesting and I have completely re"
     *    Therefore the next buffer = "ad about it. record 179       ...."
     *
     *    The matching characters in the input buffer tail and
     *    the delimiter head = "re".
     *    Therefore, ambiguous byte count = 2.
     *
     *    2.1 If the following bytes are the remaining characters of
     *        the delimiter, then we have to capture only up to the starting
     *        position of the delimiter. That means we need not include the
     *        ambiguous characters in str.
     *
     *    2.2 If the following bytes are not the remaining characters of
     *        the delimiter (as in the example above),
     *        then we have to include the ambiguous characters in str.
     */
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    long bytesConsumed = 0;
    int delPosn = 0; // number of delimiter bytes matched so far
    int ambiguousByteCount = 0; // To capture the ambiguous characters count
    do {
        int startPosn = bufferPosn; // Start from previous end position
        if (bufferPosn >= bufferLength) {
            // buffer exhausted: restart at its beginning and refill it
            startPosn = bufferPosn = 0;
            bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0);
            if (bufferLength <= 0) {
                // A partial delimiter match at the very end of the input is ordinary data.
                // NOTE(review): these bytes were subtracted from bytesConsumed when they
                // became ambiguous and are not added back here, so the returned count
                // excludes them - confirm callers do not rely on it for this last record.
                str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                break; // EOF
            }
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) {
            if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
                delPosn++;
                if (delPosn >= recordDelimiterBytes.length) {
                    // full delimiter matched: step past its last byte and stop scanning
                    bufferPosn++;
                    break;
                }
            } else if (delPosn != 0) {
                // Partial match failed: re-examine the current byte as a possible start of
                // the delimiter (the loop's ++bufferPosn brings us back to it).
                // NOTE(review): this steps back one byte rather than delPosn bytes, so a
                // delimiter with a repeated prefix (e.g. "aab" in "aaab") looks like it can
                // be missed - confirm whether such delimiters need to be supported.
                bufferPosn--;
                delPosn = 0;
            }
        }
        int readLength = bufferPosn - startPosn;
        bytesConsumed += readLength;
        // exclude the (possibly partial) delimiter match at the end of the scanned range
        int appendLength = readLength - delPosn;
        if (appendLength > maxLineLength - txtLength) {
            // truncate to the remaining capacity allowed by maxLineLength
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            if (ambiguousByteCount > 0) {
                str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                //appending the ambiguous characters (refer case 2.2)
                bytesConsumed += ambiguousByteCount;
                ambiguousByteCount = 0;
            }
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
        if (bufferPosn >= bufferLength) {
            if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
                // buffer ended in the middle of a possible delimiter: remember how many
                // bytes are undecided until the next buffer has been inspected
                ambiguousByteCount = delPosn;
                bytesConsumed -= ambiguousByteCount; //to be consumed in next
            }
        }
    } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume);
    // the running count is a long; refuse to narrow it if it does not fit the int return type
    if (bytesConsumed > (long) Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
    }
    return (int) bytesConsumed;
}

From source file:com.asakusafw.runtime.io.line.BasicLineInput.java

License:Apache License

/**
 * Encodes the next {@code len} characters of {@code charBuffer} with {@code encoder} and
 * appends the resulting bytes to {@code entity}.
 *
 * <p>The character buffer's limit is narrowed to the requested range for the duration of the
 * call and is always restored afterwards, including when encoding fails. Encoding is repeated
 * through {@code byteBuffer} until the encoder reports underflow, so the range may be larger
 * than one byte-buffer fill.
 *
 * @param entity the text that receives the encoded bytes
 * @param len the number of characters to encode, starting at the buffer's current position
 * @throws IOException if the encoder reports a malformed or unmappable character
 */
private void append(Text entity, int len) throws IOException {
    ByteBuffer bs = byteBuffer;
    CharBuffer cs = charBuffer;
    int limit = cs.limit();
    // slice the buffer
    cs.limit(cs.position() + len);
    try {
        while (true) {
            bs.clear();
            CoderResult result = encoder.encode(cs, bs, true);
            if (result.isError() == false) {
                bs.flip();
                // Address the encoded bytes relative to the backing array and pass their
                // count (not the limit) as the length, so a buffer with a non-zero array
                // offset is handled correctly as well.
                entity.append(bs.array(), bs.arrayOffset() + bs.position(), bs.remaining());
                if (result.isUnderflow()) {
                    break;
                }
                // overflow: byteBuffer was full, loop to encode the remaining characters
            } else {
                assert result.isError();
                try {
                    result.throwException();
                } catch (CharacterCodingException e) {
                    throw new IOException(MessageFormat.format("exception occurred while encoding text: {0}", path),
                            e);
                }
            }
        }
    } finally {
        // undo the temporary slice even if encoding failed
        cs.limit(limit);
    }
}

From source file:com.asakusafw.runtime.io.line.Utf8LineInput.java

License:Apache License

/**
 * Appends the bytes from the current buffer offset up to (but excluding) the next CR/LF, or
 * up to the buffer limit when no line break is found, and advances the buffer offset past
 * the consumed bytes and the line-break byte, if any.
 *
 * NOTE(review): sawCr is only updated here when a line break is found; confirm that it is
 * reset elsewhere, otherwise a stale value could swallow an LF at the start of a later buffer.
 *
 * @param entity the text that receives the scanned bytes
 * @return LINE_BREAK if a CR or LF was consumed; otherwise CONTINUE if bytes were appended,
 *         or NOTHING if none were
 */
private State appendBufferTo(Text entity) {
    assert bufferOffset < bufferLimit;
    final byte[] bytes = buffer;

    // a CR ended the previous line: an LF right after it belongs to the same line break
    if (sawCr && bytes[bufferOffset] == '\n') {
        bufferOffset++;
    }

    final int begin = bufferOffset;
    final int end = bufferLimit;

    // find the first CR/LF at or after begin
    int cursor = begin;
    boolean lineBreak = false;
    while (cursor < end) {
        final byte current = bytes[cursor];
        if (current == '\r' || current == '\n') {
            lineBreak = true;
            sawCr = current == '\r';
            break;
        }
        cursor++;
    }
    final int length = cursor - begin;

    // step over the scanned content and, if present, the line-break byte itself
    bufferOffset = lineBreak ? cursor + 1 : cursor;
    assert bufferOffset <= bufferLimit;

    if (length > 0) {
        entity.append(bytes, begin, length);
    }
    if (lineBreak) {
        return State.LINE_BREAK;
    }
    return length == 0 ? State.NOTHING : State.CONTINUE;
}

From source file:com.asakusafw.runtime.io.text.value.StringOptionFieldAdapter.java

License:Apache License

/**
 * Resets {@code property} and fills its {@code Text} with the bytes produced by encoding
 * {@code contents} with {@code encoder}.
 *
 * <p>The input is encoded through {@code encodeBuffer} in as many passes as needed, and the
 * encoder is flushed exactly once after all input has been encoded. Flushing between passes
 * is not allowed by {@link CharsetEncoder}: a further {@code encode} call after {@code flush}
 * raises {@link IllegalStateException}, which previously happened whenever the encoded
 * contents did not fit into the buffer in one pass.
 *
 * @param contents the characters to store
 * @param property the option to reset and fill
 * @throws IllegalArgumentException if {@code contents} cannot be encoded
 * @throws IllegalStateException if {@code encodeBuffer} is too small to make any progress
 */
@Override
protected void doParse(CharSequence contents, StringOption property) {
    property.reset();
    Text text = property.get();
    CharBuffer cbuf = CharBuffer.wrap(contents);
    ByteBuffer bbuf = encodeBuffer;
    CharsetEncoder enc = encoder;
    enc.reset();
    if (cbuf.hasRemaining() == false) {
        // nothing to encode: leave the freshly reset text as it is
        return;
    }

    // encode all input, draining the byte buffer whenever it fills up
    CoderResult result;
    do {
        bbuf.clear();
        result = enc.encode(cbuf, bbuf, true);
        if (result.isError()) {
            throw new IllegalArgumentException(
                    MessageFormat.format("cannot map input string to UTF-8: {0}", TextUtil.quote(contents)));
        }
        bbuf.flip();
        if (bbuf.hasRemaining()) {
            text.append(bbuf.array(), bbuf.arrayOffset() + bbuf.position(), bbuf.remaining());
        } else if (result.isOverflow()) {
            // an empty buffer could not take a single character: retrying would never end
            throw new IllegalStateException("encode buffer is too small to encode any character");
        }
    } while (result.isOverflow());

    // flush once, after the final encode call, draining the buffer the same way
    do {
        bbuf.clear();
        result = enc.flush(bbuf);
        bbuf.flip();
        if (bbuf.hasRemaining()) {
            text.append(bbuf.array(), bbuf.arrayOffset() + bbuf.position(), bbuf.remaining());
        } else if (result.isOverflow()) {
            throw new IllegalStateException("encode buffer is too small to flush the encoder");
        }
    } while (result.isOverflow());
}