List of usage examples for org.apache.hadoop.io.Text getBytes()
@Override public byte[] getBytes()
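Most of the examples below pair getBytes() with getLength(): the array returned by Text.getBytes() is the object's backing buffer, which may be longer than the valid UTF-8 data when the Text has been reused. A minimal sketch of the pattern (class and variable names here are illustrative, not taken from the projects below):

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

public class GetBytesSketch {
    public static void main(String[] args) {
        Text text = new Text("a fairly long first value");
        text.set("short"); // reuse shrinks the valid length, not the backing buffer

        byte[] backing = text.getBytes(); // may contain stale bytes past getLength()
        int len = text.getLength();       // number of valid bytes

        // Decode or copy only the valid region
        String decoded = new String(backing, 0, len, StandardCharsets.UTF_8);
        System.out.println(decoded);      // prints "short"
    }
}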
From source file:com.ricemap.spateDB.io.TextSerializerHelper.java
License:Apache License
public static void consumeMap(Text text, Map<String, String> tags) {
    tags.clear();
    if (text.getLength() > 0) {
        byte[] tagsBytes = text.getBytes();
        if (tagsBytes[0] != Separators[MapStart])
            return;
        int i1 = 1;
        while (i1 < text.getLength() && tagsBytes[i1] != Separators[MapEnd]) {
            int i2 = i1 + 1;
            while (i2 < text.getLength() && tagsBytes[i2] != Separators[KeyValueSeparator])
                i2++;
            String key = new String(tagsBytes, i1, i2 - i1);
            i1 = i2 + 1;

            i2 = i1 + 1;
            while (i2 < text.getLength() && tagsBytes[i2] != Separators[FieldSeparator]
                    && tagsBytes[i2] != Separators[MapEnd])
                i2++;
            String value = new String(tagsBytes, i1, i2 - i1);
            tags.put(key, value);
            i1 = i2;
            if (i1 < text.getLength() && tagsBytes[i1] == Separators[FieldSeparator])
                i1++;
        }
        text.set(tagsBytes, i1, text.getLength() - i1);
    }
}
From source file:com.ricemap.spateDB.mapred.SpatialRecordReader.java
License:Apache License
/**
 * Reads the next line from the input and returns true if a line was read.
 * If no more lines are available in this split, false is returned.
 * @param value
 * @return
 * @throws IOException
 */
protected boolean nextLine(Text value) throws IOException {
    if (blockType == BlockType.RTREE && pos == 8) {
        // File is positioned at the RTree header
        // Skip the header and go to first data object in file
        pos += RTree.skipHeader(in);
        LOG.info("Skipped R-tree to position: " + pos);
        // Reinitialize record reader at the new position
        lineReader = new LineReader(in);
    }
    while (getFilePosition() <= end) {
        value.clear();
        int b = 0;
        if (buffer != null) {
            // Read the first line encountered in buffer
            int eol = RTree.skipToEOL(buffer, 0);
            b += eol;
            value.append(buffer, 0, eol);
            if (eol < buffer.length) {
                // There are still some bytes remaining in buffer; keep them for the next call
                byte[] tmp = new byte[buffer.length - eol];
                System.arraycopy(buffer, eol, tmp, 0, tmp.length);
                buffer = tmp;
            } else {
                buffer = null;
            }
            // Check if a complete line has been read from the buffer
            byte last_byte = value.getBytes()[value.getLength() - 1];
            if (last_byte == '\n' || last_byte == '\r')
                return true;
        }

        // Read the first line from stream
        Text temp = new Text();
        b += lineReader.readLine(temp);
        if (b == 0) {
            // Indicates an end of stream
            return false;
        }
        pos += b;

        // Append the part read from stream to the part extracted from buffer
        value.append(temp.getBytes(), 0, temp.getLength());

        if (value.getLength() > 1) {
            // Read a non-empty line. Note that the end-of-line character is included
            return true;
        }
    }
    // Reached end of file
    return false;
}
From source file:com.ricemap.spateDB.operations.Tail.java
License:Apache License
/**
 * Reads a maximum of n lines from the stream starting from its current
 * position and going backward.
 *
 * @param in - An input stream. It is scanned from its current position
 *   backward to position 0.
 * @param n - Maximum number of lines to return
 * @param stockObject - An object used to deserialize lines read. It can
 *   be set to <code>null</code> if output is also <code>null</code>. In this
 *   case, nothing is reported to the output.
 * @param output - An output collector used to report lines read.
 * @return - The position of the beginning of the earliest line read from
 *   the buffer.
 * @throws IOException
 */
public static <T extends TextSerializable> long tail(FSDataInputStream in, int n, T stockObject,
        ResultCollector<T> output) throws IOException {
    int lines_read = 0;
    long end = in.getPos();
    long offset_of_last_eol = end;
    long last_read_byte = end;

    LongWritable line_offset = new LongWritable();
    Text read_line = new Text();
    Text remainder_from_last_buffer = new Text();
    byte[] buffer = new byte[4096];

    while (last_read_byte > 0 && lines_read < n) {
        // Read next chunk from the back
        long first_byte_to_read = (last_read_byte - 1) - (last_read_byte - 1) % buffer.length;
        in.seek(first_byte_to_read);
        int bytes_to_read = (int) (last_read_byte - first_byte_to_read);
        in.read(buffer, 0, bytes_to_read);
        last_read_byte = first_byte_to_read;

        // Iterate over bytes in this buffer
        int i_last_byte_consumed_in_buffer = bytes_to_read;
        int i_last_byte_examined_in_buffer = bytes_to_read;
        while (i_last_byte_examined_in_buffer > 0 && lines_read < n) {
            byte byte_examined = buffer[--i_last_byte_examined_in_buffer];
            if (byte_examined == '\n' || byte_examined == '\r') {
                // Found an end-of-line character
                // Report this to output unless it's empty
                long offset_of_this_eol = first_byte_to_read + i_last_byte_examined_in_buffer;
                if (offset_of_last_eol - offset_of_this_eol > 1) {
                    if (output != null) {
                        read_line.clear();
                        // +1 is to skip the EOL at the beginning
                        read_line.append(buffer, i_last_byte_examined_in_buffer + 1,
                                i_last_byte_consumed_in_buffer - (i_last_byte_examined_in_buffer + 1));
                        // Also append bytes remaining from last buffer
                        if (remainder_from_last_buffer.getLength() > 0) {
                            read_line.append(remainder_from_last_buffer.getBytes(), 0,
                                    remainder_from_last_buffer.getLength());
                        }
                        line_offset.set(offset_of_this_eol + 1);
                        stockObject.fromText(read_line);
                        output.collect(stockObject);
                    }
                    lines_read++;
                    remainder_from_last_buffer.clear();
                }
                i_last_byte_consumed_in_buffer = i_last_byte_examined_in_buffer;
                offset_of_last_eol = offset_of_this_eol;
            }
        }
        if (i_last_byte_consumed_in_buffer > 0) {
            // There are still some bytes not consumed in buffer
            if (remainder_from_last_buffer.getLength() == 0) {
                // Store whatever is remaining in remainder_from_last_buffer
                remainder_from_last_buffer.append(buffer, 0, i_last_byte_consumed_in_buffer);
            } else {
                // Prepend remaining bytes to Text
                Text t = new Text();
                t.append(buffer, 0, i_last_byte_consumed_in_buffer);
                t.append(remainder_from_last_buffer.getBytes(), 0, remainder_from_last_buffer.getLength());
                remainder_from_last_buffer = t;
            }
        }
    }

    if (lines_read < n && remainder_from_last_buffer.getLength() > 0) {
        // There is still one last line that needs to be reported
        lines_read++;
        if (output != null) {
            read_line = remainder_from_last_buffer;
            line_offset.set(0);
            stockObject.fromText(read_line);
            output.collect(stockObject);
        }
        offset_of_last_eol = -1;
    }

    return offset_of_last_eol + 1;
}
From source file:com.rramos.bigdata.utils.GenericUDFSha2.java
License:Apache License
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
    if (digest == null) {
        return null;
    }

    digest.reset();
    if (isStr) {
        Text n = GenericUDFParamUtils.getTextValue(arguments, 0, converters);
        if (n == null) {
            return null;
        }
        digest.update(n.getBytes(), 0, n.getLength());
    } else {
        BytesWritable bWr = GenericUDFParamUtils.getBinaryValue(arguments, 0, converters);
        if (bWr == null) {
            return null;
        }
        digest.update(bWr.getBytes(), 0, bWr.getLength());
    }
    byte[] resBin = digest.digest();
    String resStr = Hex.encodeHexString(resBin);
    output.set(resStr);
    return output;
}
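For reference, a stand-alone sketch of the same getBytes()/getLength() hashing pattern outside Hive's UDF plumbing, assuming only MessageDigest and commons-codec (class and method names are illustrative):

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import org.apache.commons.codec.binary.Hex;
import org.apache.hadoop.io.Text;

public class TextDigestSketch {
    // Hash only the valid bytes of the Text, not the whole backing array
    public static String sha256Hex(Text t) throws NoSuchAlgorithmException {
        MessageDigest digest = MessageDigest.getInstance("SHA-256");
        digest.update(t.getBytes(), 0, t.getLength());
        return Hex.encodeHexString(digest.digest());
    }
}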
From source file:com.spotify.hdfs2cass.CassandraPartitioner.java
License:Apache License
@Override
public int getPartition(Text key, Text value, int numReducers) {
    final int partition;

    final BigIntegerToken token = partitioner.getToken(ByteBuffer.wrap(key.getBytes()));

    final int index = Collections.binarySearch(tokenNodes, new TokenNode(token), SEARCH_COMPARATOR);
    if (index >= 0) {
        final int multiple = numReducers / tokenNodes.size();
        partition = index + (multiple * RANDOM.nextInt(multiple));
    } else {
        throw new RuntimeException("Failed to find a node for token " + token);
    }

    return partition;
}
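Unlike most of the other examples, this one wraps the whole backing array. If the key Text were reused, a hedged alternative (illustrative names, same idea) would bound the wrap to the valid bytes:

import java.nio.ByteBuffer;
import org.apache.hadoop.io.Text;

public class WrapValidBytesSketch {
    public static ByteBuffer wrapValid(Text key) {
        // Wrap only the valid region of the backing array
        return ByteBuffer.wrap(key.getBytes(), 0, key.getLength());
    }
}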
From source file:com.transwarp.hbase.bulkload.withindex.TextWithIndexSortReducer.java
License:Apache License
@Override
protected void reduce(ImmutableBytesWritable rowKey, java.lang.Iterable<Text> lines,
        Reducer<ImmutableBytesWritable, Text, ImmutableBytesWritable, KeyValue>.Context context)
        throws java.io.IOException, InterruptedException {
    // although reduce() is called per-row, handle pathological case
    long threshold = context.getConfiguration().getLong("reducer.row.threshold", 1L * (1 << 30));
    Iterator<Text> iter = lines.iterator();
    boolean qualifier = context.getConfiguration().getBoolean("indexqualifier", false);
    while (iter.hasNext()) {
        // Get the prefix to judge whether this is the primary table (prefix == 0)
        // or an index table (prefix > 0)
        int rowkeyPrefix = Bytes.toInt(rowKey.get(), 0, 4);
        byte[] rowKeyWithoutPrefix = Bytes.tail(rowKey.get(), rowKey.get().length - 4);
        Set<KeyValue> map = new TreeSet<KeyValue>(KeyValue.COMPARATOR);
        long curSize = 0;
        // stop at the end or the RAM threshold
        while (iter.hasNext() && curSize < threshold) {
            Text line = iter.next();
            String lineStr = line.toString();
            try {
                Put p = null;
                if (rowkeyPrefix == 0) {
                    ArrayList<String> parsedLine = ParsedLine.parse(converter.getRecordSpec(), lineStr);
                    p = converter.convert(parsedLine, rowKeyWithoutPrefix);
                } else {
                    p = new Put(rowKeyWithoutPrefix);
                    if (qualifier) {
                        p.add(family, line.getBytes(), emptyByte);
                    } else {
                        p.add(family, this.qualifier, line.getBytes());
                    }
                }

                if (p != null) {
                    for (List<KeyValue> kvs : p.getFamilyMap().values()) {
                        for (KeyValue kv : kvs) {
                            map.add(kv);
                            curSize += kv.getLength();
                        }
                    }
                }
            } catch (FormatException badLine) {
                if (skipBadLines) {
                    System.err.println("Bad line." + badLine.getMessage());
                    incrementBadLineCount(1);
                    return;
                }
                throw new IOException(badLine);
            } catch (IllegalArgumentException e) {
                if (skipBadLines) {
                    System.err.println("Bad line." + e.getMessage());
                    incrementBadLineCount(1);
                    return;
                }
                throw new IOException(e);
            }
        }
        context.setStatus("Read " + map.size() + " entries of " + map.getClass() + "("
                + StringUtils.humanReadableInt(curSize) + ")");
        int index = 0;
        for (KeyValue kv : map) {
            context.write(rowKey, kv);
            if (++index > 0 && index % 100 == 0)
                context.setStatus("Wrote " + index + " key values.");
        }

        // if we have more entries to process
        if (iter.hasNext()) {
            // force flush because we cannot guarantee intra-row sorted order
            context.write(null, null);
        }
    }
}
From source file:com.uber.hoodie.common.table.timeline.HoodieArchivedTimeline.java
License:Apache License
public HoodieArchivedTimeline(HoodieTableMetaClient metaClient) {
    // Read back the commits to make sure
    Path archiveLogPath = HoodieArchivedTimeline.getArchiveLogPath(metaClient.getArchivePath());
    try (SequenceFile.Reader reader = new SequenceFile.Reader(metaClient.getHadoopConf(),
            SequenceFile.Reader.file(archiveLogPath))) {
        Text key = new Text();
        Text val = new Text();
        while (reader.next(key, val)) {
            // TODO - limit the number of commits loaded in memory. this could get very large.
            // This is okay because only tooling will load the archived commit timeline today
            readCommits.put(key.toString(), Arrays.copyOf(val.getBytes(), val.getLength()));
        }
        this.setInstants(readCommits.keySet().stream()
                .map(s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s))
                .collect(Collectors.toList()));
    } catch (IOException e) {
        throw new HoodieIOException("Could not load archived commit timeline from path " + archiveLogPath, e);
    }
    // multiple casts will make this lambda serializable -
    // http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16
    this.details = (Function<HoodieInstant, Optional<byte[]>> & Serializable) this::getInstantDetails;
    this.metaClient = metaClient;
}
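The Arrays.copyOf(val.getBytes(), val.getLength()) call is what makes it safe to reuse the same Text instances across reader.next() calls: each stored array is an independent copy trimmed to the valid length. A minimal, self-contained sketch of the same read loop for an arbitrary SequenceFile of Text pairs (class name and path handling are illustrative):

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ReadTextSequenceFileSketch {
    public static Map<String, byte[]> readAll(Configuration conf, Path path) throws Exception {
        Map<String, byte[]> entries = new HashMap<>();
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();
            Text val = new Text();
            while (reader.next(key, val)) {
                // key and val are reused on the next iteration, so copy only the valid bytes
                entries.put(key.toString(), Arrays.copyOf(val.getBytes(), val.getLength()));
            }
        }
        return entries;
    }
}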
From source file:com.yahoo.glimmer.indexing.generator.DocumentMapper.java
License:Open Source License
@Override
public void map(LongWritable key, Text record, Context context) throws IOException, InterruptedException {
    doc.setContent(record.getBytes(), record.getLength());

    if (doc == null || doc.getSubject() == null) {
        // Failed parsing
        context.getCounter(Counters.FAILED_PARSING).increment(1);
        LOG.error("Document failed parsing");
        return;
    }

    if (doc.getId() < 0) {
        throw new IllegalStateException("Negative docId:" + doc.getId() + " subject:" + doc.getSubject());
    }

    // This is used to write the position of the last occurrence and to test
    // if the fakeDocOccurrrence for the term has already been written.
    Map<String, DocStat> termToDocStatMap = new HashMap<String, DocStat>();

    // Iterate over all indices
    for (int indexId = 0; indexId < fields.length; indexId++) {
        String fieldName = fields[indexId];
        if (fieldName.startsWith("NOINDEX")) {
            continue;
        }

        TermValue indexIdValue = new TermValue(Type.INDEX_ID, indexId);

        // Iterate in parallel over the words of the indices
        MutableString term = new MutableString("");
        MutableString nonWord = new MutableString("");
        WordReader termReader = doc.content(indexId);
        int position = 0;

        while (termReader.next(term, nonWord)) {
            // Read next property as well
            if (term != null && term.length() > 0) {
                String termString = term.toString();

                // Report progress
                context.setStatus(fields[indexId] + "=" + term.substring(0, Math.min(term.length(), 50)));

                // Create an occurrence at the next position
                TermValue occurrenceValue = new TermValue(Type.OCCURRENCE, doc.getId(), position);
                context.write(new TermKey(termString, indexId, occurrenceValue), occurrenceValue);

                DocStat docStat = termToDocStatMap.get(termString);
                if (docStat == null) {
                    if (doc.getIndexType() == RDFDocumentFactory.IndexType.VERTICAL) {
                        // For the Alignment Index, we write the predicate id (which is equal
                        // to the index id for a VERTICAL index) the first time we encounter a term.
                        // The 'Alignment Index' is an index without counts or positions. It's used
                        // for query optimization in the query parser. The resulting 'alignment index'
                        // is basically used as a map from term to the predicates that the term occurs in.
                        context.write(new TermKey(termString, ALIGNMENT_INDEX, indexIdValue), indexIdValue);
                    }
                    docStat = new DocStat();
                    docStat.last = position;
                    docStat.count = 1;
                    termToDocStatMap.put(termString, docStat);
                } else {
                    docStat.last = position;
                    docStat.count++;
                }

                position++;
                context.getCounter(Counters.INDEXED_OCCURRENCES).increment(1);
            } else {
                LOG.info("Next term is null");
            }
        }

        if (doc.getIndexType() == RDFDocumentFactory.IndexType.HORIZONTAL && position > 0) {
            TermValue docSizeValue = new TermValue(Type.DOC_SIZE, doc.getId(), position);
            context.write(new TermKey(TermKey.DOC_SIZE_TERM, indexId, docSizeValue), docSizeValue);
        }

        for (String termString : termToDocStatMap.keySet()) {
            DocStat docStat = termToDocStatMap.get(termString);
            TermValue occurrenceCountValue = new TermValue(Type.TERM_STATS, docStat.count, docStat.last);
            context.write(new TermKey(termString, indexId, occurrenceCountValue), occurrenceCountValue);
        }
        termToDocStatMap.clear();
    }
    context.getCounter(Counters.NUMBER_OF_RECORDS).increment(1);
}
From source file:core.data.ConditionalMutation.java
License:Apache License
public ConditionalMutation(Text row) {
    this(row.getBytes(), 0, row.getLength());
}
From source file:cosmos.impl.IndexToMultimapRecord.java
License:Apache License
@Override
public MultimapRecord apply(Entry<Key, Value> input) {
    Key k = input.getKey();

    Text colqual = k.getColumnQualifier();

    int index = colqual.find(Defaults.NULL_BYTE_STR);
    if (-1 == index) {
        throw new RuntimeException("Was provided unexpected Key: " + k);
    }

    int start = index + 1;
    try {
        String docId = Text.decode(colqual.getBytes(), start, colqual.getLength() - start);

        return sorts.contents(id, docId);
    } catch (TableNotFoundException e) {
        throw new RuntimeException(e);
    } catch (UnexpectedStateException e) {
        throw new RuntimeException(e);
    } catch (CharacterCodingException e) {
        throw new RuntimeException(e);
    }
}
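A small stand-alone sketch of the Text.decode(byte[], int, int) idiom used above, splitting a column qualifier on a null-byte separator (the separator constant mirrors Defaults.NULL_BYTE_STR and is an assumption here; class and method names are illustrative):

import java.nio.charset.CharacterCodingException;
import org.apache.hadoop.io.Text;

public class DecodeAfterSeparatorSketch {
    private static final String NULL_BYTE_STR = "\u0000"; // assumed to match Defaults.NULL_BYTE_STR

    public static String docIdAfterSeparator(Text colqual) throws CharacterCodingException {
        int index = colqual.find(NULL_BYTE_STR);
        if (index == -1) {
            throw new IllegalArgumentException("No separator in: " + colqual);
        }
        int start = index + 1;
        // Decode only the bytes after the separator, bounded by getLength()
        return Text.decode(colqual.getBytes(), start, colqual.getLength() - start);
    }
}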