List of usage examples for org.apache.hadoop.io.Text.copyBytes()
public byte[] copyBytes()
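copyBytes() returns a fresh byte array whose length is exactly getLength(). In contrast, getBytes() exposes the internal backing buffer, which may be longer than the valid data and can contain stale trailing bytes left over from an earlier, longer value. A minimal sketch of the difference, assuming a Hadoop 2.x/3.x client on the classpath (the class name CopyBytesDemo is illustrative only):

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class CopyBytesDemo {
    public static void main(String[] args) {
        Text text = new Text("hello, world");            // 12 valid bytes
        text.set("hi".getBytes(StandardCharsets.UTF_8)); // reuse the instance: 2 valid bytes,
                                                         // but the backing buffer is not shrunk

        byte[] copy = text.copyBytes(); // fresh array, length == text.getLength() == 2
        byte[] raw = text.getBytes();   // backing buffer: length >= 2, trailing bytes are stale

        System.out.println(copy.length);      // 2
        System.out.println(raw.length);       // larger than 2 here; only the first
                                              // text.getLength() bytes are valid
        System.out.println(text.getLength()); // 2 -- always pair getBytes() with this
    }
}

This is why the examples below call copyBytes() whenever the bytes escape the Text instance, e.g. into an Accumulo Value, an HBase Put, or a String constructor.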
From source file:com.facebook.presto.accumulo.io.AccumuloPageSink.java
License:Apache License
/**
 * Converts a {@link Row} to an Accumulo mutation.
 *
 * @param row Row object
 * @param rowIdOrdinal Ordinal in the list of columns that is the row ID. This isn't checked
 *            at all, so I hope you're right. Also, it is expected that the list of column
 *            handles is sorted in ordinal order. This is a very demanding function.
 * @param columns All column handles for the Row, sorted by ordinal
 * @param serializer Instance of {@link AccumuloRowSerializer} used to encode the values of the row to the Mutation
 * @return Mutation
 */
public static Mutation toMutation(Row row, int rowIdOrdinal, List<AccumuloColumnHandle> columns,
        AccumuloRowSerializer serializer)
{
    // Set our value to the row ID
    Text value = new Text();
    Field rowField = row.getField(rowIdOrdinal);
    if (rowField.isNull()) {
        throw new PrestoException(INVALID_FUNCTION_ARGUMENT,
                "Column mapped as the Accumulo row ID cannot be null");
    }
    setText(rowField, value, serializer);

    // Iterate through all the column handles, setting the Mutation's columns
    Mutation mutation = new Mutation(value);

    // Store row ID in a special column
    mutation.put(ROW_ID_COLUMN, ROW_ID_COLUMN, new Value(value.copyBytes()));

    for (AccumuloColumnHandle columnHandle : columns) {
        // Skip the row ID ordinal
        if (columnHandle.getOrdinal() == rowIdOrdinal) {
            continue;
        }

        // If the value of the field is not null
        if (!row.getField(columnHandle.getOrdinal()).isNull()) {
            // Serialize the value to the text
            setText(row.getField(columnHandle.getOrdinal()), value, serializer);

            // And add the bytes to the Mutation
            mutation.put(columnHandle.getFamily().get(), columnHandle.getQualifier().get(),
                    new Value(value.copyBytes()));
        }
    }

    return mutation;
}
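Note the design choice here: a single Text instance, value, is reused for every column, with setText overwriting its contents in place. copyBytes() is what gives each Value an independent, exactly-sized snapshot of the current contents; passing getBytes() instead could carry stale trailing bytes from a previous, longer field into the Mutation.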
From source file:com.facebook.presto.accumulo.serializers.StringRowSerializer.java
License:Apache License
@Override
public byte[] encode(Type type, Object value)
{
    Text text = new Text();
    if (Types.isArrayType(type)) {
        throw new PrestoException(NOT_SUPPORTED, "arrays are not (yet?) supported for StringRowSerializer");
    }
    else if (Types.isMapType(type)) {
        throw new PrestoException(NOT_SUPPORTED, "maps are not (yet?) supported for StringRowSerializer");
    }
    else if (type.equals(BIGINT) && value instanceof Integer) {
        setLong(text, ((Integer) value).longValue());
    }
    else if (type.equals(BIGINT) && value instanceof Long) {
        setLong(text, (Long) value);
    }
    else if (type.equals(BOOLEAN)) {
        setBoolean(text, value.equals(Boolean.TRUE));
    }
    else if (type.equals(DATE)) {
        setDate(text, (Date) value);
    }
    else if (type.equals(DOUBLE)) {
        setDouble(text, (Double) value);
    }
    else if (type.equals(INTEGER) && value instanceof Integer) {
        setInt(text, (Integer) value);
    }
    else if (type.equals(INTEGER) && value instanceof Long) {
        setInt(text, ((Long) value).intValue());
    }
    else if (type.equals(REAL)) {
        setFloat(text, (Float) value);
    }
    else if (type.equals(SMALLINT)) {
        setShort(text, (Short) value);
    }
    else if (type.equals(TIME)) {
        setTime(text, (Time) value);
    }
    else if (type.equals(TIMESTAMP)) {
        setTimestamp(text, (Timestamp) value);
    }
    else if (type.equals(TINYINT)) {
        setByte(text, (Byte) value);
    }
    else if (type.equals(VARBINARY) && value instanceof byte[]) {
        setVarbinary(text, (byte[]) value);
    }
    else if (type.equals(VARBINARY) && value instanceof Slice) {
        setVarbinary(text, ((Slice) value).getBytes());
    }
    else if (type.equals(VARCHAR) && value instanceof String) {
        setVarchar(text, ((String) value));
    }
    else if (type.equals(VARCHAR) && value instanceof Slice) {
        setVarchar(text, ((Slice) value).toStringUtf8());
    }
    else {
        throw new PrestoException(NOT_SUPPORTED,
                format("StringLexicoder does not support encoding type %s, object class is %s", type,
                        value.getClass()));
    }

    return text.copyBytes();
}
From source file:com.facebook.presto.accumulo.tools.RewriteIndex.java
License:Apache License
private void addIndexEntries(Connector connector, AccumuloTable table, long start)
{
    LOG.info(format("Scanning data table %s to add index entries", table.getFullTableName()));
    BatchScanner scanner = null;
    BatchWriter indexWriter = null;
    try {
        // Create index writer and metrics writer, but we are never going to flush the metrics writer
        indexWriter = connector.createBatchWriter(table.getIndexTableName(), bwc);
        Indexer indexer = new Indexer(connector, table, indexWriter,
                table.getMetricsStorageInstance(connector).newWriter(table));
        LOG.info("Created indexer against " + table.getIndexTableName());

        scanner = connector.createBatchScanner(table.getFullTableName(), auths, 10);
        LOG.info(format("Created batch scanner against %s with auths %s", table.getFullTableName(), auths));

        IteratorSetting timestampFilter = new IteratorSetting(21, "timestamp", TimestampFilter.class);
        TimestampFilter.setRange(timestampFilter, 0L, start);
        scanner.addScanIterator(timestampFilter);

        scanner.setRanges(connector.tableOperations().splitRangeByTablets(table.getFullTableName(),
                new Range(), Integer.MAX_VALUE));

        long numRows = 0L;
        long numIndexEntries = 0L;
        Text prevRow = null;
        Text row = new Text();
        Text cf = new Text();
        Text cq = new Text();
        Mutation mutation = null;
        for (Entry<Key, Value> entry : scanner) {
            entry.getKey().getRow(row);
            entry.getKey().getColumnFamily(cf);
            entry.getKey().getColumnQualifier(cq);

            // If the rows do not match, index the mutation
            if (prevRow != null && !prevRow.equals(row)) {
                if (!dryRun) {
                    indexer.index(mutation);
                }
                ++numRows;
                mutation = null;

                if (numRows % 500000 == 0) {
                    if (dryRun) {
                        LOG.info(format(
                                "In progress, would have re-indexed %s rows containing %s index entries",
                                numRows, numIndexEntries));
                    }
                    else {
                        LOG.info(format("In progress, re-indexed %s rows containing %s index entries",
                                numRows, numIndexEntries));
                    }
                }
            }

            if (mutation == null) {
                mutation = new Mutation(row);
            }

            mutation.put(cf, cq, entry.getKey().getColumnVisibilityParsed(), entry.getKey().getTimestamp(),
                    entry.getValue());

            if (table.getColumns().stream()
                    .filter(column -> column.isIndexed()
                            && column.getFamily().isPresent()
                            && column.getQualifier().isPresent()
                            && column.getFamily().get().equals(new String(cf.copyBytes(), UTF_8))
                            && column.getQualifier().get().equals(new String(cq.copyBytes(), UTF_8)))
                    .count() > 0) {
                ++numIndexEntries;
            }

            if (prevRow == null) {
                prevRow = new Text(row);
            }
            else {
                prevRow.set(row);
            }
        }

        // Index the final mutation
        if (mutation != null) {
            if (!dryRun) {
                indexer.index(mutation);
            }
            ++numRows;
        }

        if (dryRun) {
            LOG.info(format(
                    "Finished dry run of rewriting index entries. Would have re-indexed %s rows containing %s index entries",
                    numRows, numIndexEntries));
        }
        else {
            LOG.info(format("Finished adding index entries. Re-indexed %s rows containing %s index entries",
                    numRows, numIndexEntries));
        }
    }
    catch (AccumuloException | AccumuloSecurityException e) {
        LOG.error("Accumulo exception", e);
    }
    catch (TableNotFoundException e) {
        LOG.error("Table not found, must have been deleted during process", e);
    }
    finally {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            }
            catch (MutationsRejectedException e) {
                LOG.error("Server rejected mutations", e);
            }
        }
        if (scanner != null) {
            scanner.close();
        }
    }
}
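In this example, copyBytes() feeds the new String(..., UTF_8) conversions used to compare the scanned column family and qualifier against the table metadata. Since getBytes() can return an array longer than the valid data, copyBytes() is the correct input for the String constructor that consumes the whole array.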
From source file:com.facebook.presto.accumulo.tools.RewriteMetricsTask.java
License:Apache License
private void incrementTimestampMetric(Map<Text, Map<Text, Map<ColumnVisibility, AtomicLong>>> rowMap,
        Text family, ColumnVisibility visibility, Text timestampValue)
{
    for (Entry<TimestampPrecision, Long> entry : getTruncatedTimestamps(
            serializer.decode(TIMESTAMP, timestampValue.copyBytes())).entrySet()) {
        Text timestampFamily = new Text(
                Bytes.concat(family.copyBytes(), TIMESTAMP_CARDINALITY_FAMILIES.get(entry.getKey())));

        Text row = new Text(serializer.encode(TIMESTAMP, entry.getValue()));

        Map<Text, Map<ColumnVisibility, AtomicLong>> familyMap = rowMap.get(row);
        if (familyMap == null) {
            familyMap = new HashMap<>();
            rowMap.put(row, familyMap);
        }

        Map<ColumnVisibility, AtomicLong> visibilityMap = familyMap.get(timestampFamily);
        if (visibilityMap == null) {
            visibilityMap = new HashMap<>();
            visibilityMap.put(new ColumnVisibility(), new AtomicLong(0));
            familyMap.put(timestampFamily, visibilityMap);
        }

        if (visibilityMap.containsKey(visibility)) {
            visibilityMap.get(visibility).incrementAndGet();
        }
        else {
            visibilityMap.put(visibility, new AtomicLong(1));
        }
    }
}
From source file:com.philiphubbard.digraph.MRBuildVerticesTest.java
License:Open Source License
private static void setupTest(Configuration conf) throws IOException {
    FileSystem fileSystem = FileSystem.get(conf);

    Path path = new Path(testInput);
    if (fileSystem.exists(path))
        fileSystem.delete(path, true);

    ArrayList<MRVertex> vertices = new ArrayList<MRVertex>();

    MRVertex v0 = new MRVertex(0, conf);
    v0.addEdgeTo(2);
    vertices.add(v0);

    MRVertex v1 = new MRVertex(1, conf);
    v1.addEdgeTo(2);
    vertices.add(v1);

    MRVertex v2 = new MRVertex(2, conf);
    v2.addEdgeTo(3);
    vertices.add(v2);

    MRVertex v3 = new MRVertex(3, conf);
    v3.addEdgeTo(4);
    vertices.add(v3);

    MRVertex v4 = new MRVertex(4, conf);
    v4.addEdgeTo(5);
    v4.addEdgeTo(6);
    vertices.add(v4);

    MRVertex v5 = new MRVertex(5, conf);
    vertices.add(v5);

    MRVertex v6 = new MRVertex(6, conf);
    v6.addEdgeTo(7);
    vertices.add(v6);

    MRVertex v7 = new MRVertex(7, conf);
    vertices.add(v7);

    FSDataOutputStream out = fileSystem.create(path);
    for (MRVertex vertex : vertices) {
        Text text = vertex.toText(MRVertex.EdgeFormat.EDGES_TO);
        byte[] bytes = text.copyBytes();
        for (byte b : bytes)
            out.write(b);
        out.write('\n');
    }
    out.close();

    fileSystem.close();
}
From source file:com.philiphubbard.sabe.MRAssemblerTest1.java
License:Open Source License
private static void setupTest(Configuration conf) throws IOException {
    FileSystem fileSystem = FileSystem.get(conf);

    Path path = new Path(testInput);
    if (fileSystem.exists(path))
        fileSystem.delete(path, true);

    ArrayList<Text> reads = new ArrayList<Text>();

    // Goal: AATTCGGCCTTCGGCAT

    reads.add(new Text("AATTCGGC\n"));
    reads.add(new Text("CTTCGGCAT\n"));

    reads.add(new Text("AATT\n"));
    reads.add(new Text("CGGCCTTCGGCAT\n"));

    reads.add(new Text("AATTCGGCCTTCG\n"));
    reads.add(new Text("GCAT\n"));

    FSDataOutputStream out = fileSystem.create(path);
    for (Text read : reads) {
        byte[] bytes = read.copyBytes();
        for (byte b : bytes)
            out.write(b);
    }
    out.close();

    fileSystem.close();
}
From source file:com.philiphubbard.sabe.MRAssemblerTest2.java
License:Open Source License
private static void setupTest(Configuration conf) throws IOException {
    FileSystem fileSystem = FileSystem.get(conf);

    Path path = new Path(testInput);
    if (fileSystem.exists(path))
        fileSystem.delete(path, true);

    ArrayList<Text> reads = new ArrayList<Text>();

    // The expected result:
    // CCCTTTCTGTTGACCCATCATTGTTTAGTAACCCGCGGGATGCCTGGCAGACCCGCGGGACGATCTCCTCTGACCCATCATCGAAATTCC
    // Note that it has the following pattern:
    // segment 0: CCCTTTCTGT
    // segment 1, which will be repeated: TGACCCATCA
    // segment 2: TTGTTTAGTA
    // segment 3, which will be repeated: ACCCGCGGGA
    // segment 4: TGCCTGGCAG
    // segment 3, again: ACCCGCGGGA
    // segment 5: CGATCTCCTC
    // segment 1, again: TGACCCATCA
    // segment 6: TCGAAATTCC

    reads.add(new Text("CCCTTTC\n"));
    // Error: initial T omitted.
    reads.add(new Text("GTTGACCCATCATTGTTTAGTAACCCGCGGGATGCCTGGCAGACC"));
    reads.add(new Text("CGCGGGACGAT\n"));
    // Error: final C omitted.
    reads.add(new Text("CTCCTCTGACCCATCATCGAAATTC\n"));

    reads.add(new Text("CCCTTTCTGTTGACCCAT\n"));
    // Error: final C replaced with G.
    reads.add(new Text("CATTGTTTAGTAACCCGCGGGATGCCTGGCAGACG\n"));
    reads.add(new Text("CGCGGGACGATCTCCTCTGACCCATCATCGAAATTCC\n"));

    // Error: C at index 14 replaced with A.
    reads.add(new Text("CCCTTTCTGTTGACACATCATTGTTTAGTAAC"));
    reads.add(new Text("CCGCGGGATGCC\n"));
    // Error: C at index 25 omitted.
    reads.add(new Text("TGGCAGACCCGCGGGACGATCTCCTTGACCCATCATCGAAATTCC\n"));

    reads.add(new Text("CCCTTTCTGTTGACCCATCATTGTTTAGTAACCCGCGGGATGCCTG\n"));
    // Error: G at index 10 replaced with T.
    reads.add(new Text("GCAGACCCGCTGGACGA\n"));
    reads.add(new Text("TCTCCTCTGACCCATCATCGAAATTCC\n"));

    reads.add(new Text("CCCTTTCTGTTGACCCATCATTGTTTAGTAACCCGCGGGATGC"));
    // Error: final G omitted.
    reads.add(new Text("CTGGCAGACCCGC\n"));
    reads.add(new Text("GGACGATCTCCTCT\n"));
    // Error: CG at index 10 transposed to GC.
    reads.add(new Text("GACCCATCATCGAAATTCC\n"));

    FSDataOutputStream out = fileSystem.create(path);
    for (Text read : reads) {
        byte[] bytes = read.copyBytes();
        for (byte b : bytes)
            out.write(b);
    }
    out.close();

    fileSystem.close();
}
From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java
License:Apache License
/**
 * Reading a bunch of lines of file paths into a list. The code in this method is
 * redistributed from the Hadoop LineRecordReader.
 *
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();

    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;
    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fis, decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        }
        else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    }
    else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;
    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {
        if (pos == 0) {
            // Strip BOM (Byte Order Mark). Text only supports UTF-8, so we only need
            // to check for the UTF-8 BOM (0xEF,0xBB,0xBF) at the start of the stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // Found a UTF-8 BOM; strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                }
                else {
                    nextLine.clear();
                }
            }
        }
        else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }

        paths.add(nextLine.toString());
        LOG.info("Read line of size " + newSize + " at pos " + (pos - newSize));
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    }
    finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
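This is the one example where the authors question whether the copy is necessary: after detecting a UTF-8 BOM, the code takes copyBytes() before calling nextLine.set(textBytes, 3, textLength). As the inline comment notes, re-setting a Text from a slice of its own backing array may in fact work, but copying first makes the operation unambiguously safe.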
From source file:dz.lab.mapred.hbase.custom_output.StartsWithCountReducer_HBase.java
@Override
protected void reduce(Text key, Iterable<IntWritable> counts, Context context)
        throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable count : counts) {
        sum += count.get();
    }
    // The reducer must output either a Put or a Delete object
    Put put = new Put(key.copyBytes());
    put.add(toBytes(FAMILY), toBytes(RESULT_COLUMN), toBytes(Integer.toString(sum)));
    context.write(null, put);
}
From source file:io.fluo.stress.trie.Init.java
License:Apache License
private Collection<Text> writeSplits(FluoConfiguration props, FileSystem fs, Connector conn, Path splitsPath)
        throws Exception {
    Collection<Text> splits1 = conn.tableOperations().listSplits(props.getAccumuloTable());
    OutputStream out = new BufferedOutputStream(fs.create(splitsPath));
    for (Text split : splits1) {
        out.write(Base64.encodeBase64(split.copyBytes()));
        out.write('\n');
    }
    out.close();
    return splits1;
}