List of usage examples for org.apache.hadoop.io.Text.copyBytes()
public byte[] copyBytes()
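copyBytes() returns a fresh byte array whose length is exactly getLength(). In contrast, getBytes() exposes the internal backing buffer, which may be longer than the valid data and can contain stale trailing bytes left over from an earlier, longer value. A minimal sketch of the difference, assuming a Hadoop 2.x/3.x client on the classpath (the class name CopyBytesDemo is illustrative only):

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class CopyBytesDemo {
    public static void main(String[] args) {
        Text text = new Text("hello, world");            // 12 valid bytes
        text.set("hi".getBytes(StandardCharsets.UTF_8)); // reuse the instance: 2 valid bytes,
                                                         // but the backing buffer is not shrunk

        byte[] copy = text.copyBytes(); // fresh array, length == text.getLength() == 2
        byte[] raw = text.getBytes();   // backing buffer: length >= 2, trailing bytes are stale

        System.out.println(copy.length);      // 2
        System.out.println(raw.length);       // larger than 2 here; only the first
                                              // text.getLength() bytes are valid
        System.out.println(text.getLength()); // 2 -- always pair getBytes() with this
    }
}

This is why the examples below call copyBytes() whenever the bytes escape the Text instance, e.g. into an Accumulo Value, an HBase Put, or a String constructor.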
From source file:com.facebook.presto.accumulo.io.AccumuloPageSink.java
License:Apache License
/**
 * Converts a {@link Row} to an Accumulo mutation.
 *
 * @param row Row object
 * @param rowIdOrdinal Ordinal in the list of columns that is the row ID. This isn't checked
 *            at all, so I hope you're right. Also, it is expected that the list of column
 *            handles is sorted in ordinal order. This is a very demanding function.
 * @param columns All column handles for the Row, sorted by ordinal
 * @param serializer Instance of {@link AccumuloRowSerializer} used to encode the values of the row to the Mutation
 * @return Mutation
 */
public static Mutation toMutation(Row row, int rowIdOrdinal, List<AccumuloColumnHandle> columns,
        AccumuloRowSerializer serializer)
{
    // Set our value to the row ID
    Text value = new Text();
    Field rowField = row.getField(rowIdOrdinal);
    if (rowField.isNull()) {
        throw new PrestoException(INVALID_FUNCTION_ARGUMENT,
                "Column mapped as the Accumulo row ID cannot be null");
    }
    setText(rowField, value, serializer);

    // Iterate through all the column handles, setting the Mutation's columns
    Mutation mutation = new Mutation(value);

    // Store row ID in a special column
    mutation.put(ROW_ID_COLUMN, ROW_ID_COLUMN, new Value(value.copyBytes()));

    for (AccumuloColumnHandle columnHandle : columns) {
        // Skip the row ID ordinal
        if (columnHandle.getOrdinal() == rowIdOrdinal) {
            continue;
        }

        // If the value of the field is not null
        if (!row.getField(columnHandle.getOrdinal()).isNull()) {
            // Serialize the value to the text
            setText(row.getField(columnHandle.getOrdinal()), value, serializer);

            // And add the bytes to the Mutation
            mutation.put(columnHandle.getFamily().get(), columnHandle.getQualifier().get(),
                    new Value(value.copyBytes()));
        }
    }

    return mutation;
}
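Note the design choice here: a single Text instance, value, is reused for every column, with setText overwriting its contents in place. copyBytes() is what gives each Value an independent, exactly-sized snapshot of the current contents; passing getBytes() instead could carry stale trailing bytes from a previous, longer field into the Mutation.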
From source file:com.facebook.presto.accumulo.serializers.StringRowSerializer.java
License:Apache License
@Override
public byte[] encode(Type type, Object value)
{
    Text text = new Text();
    if (Types.isArrayType(type)) {
        throw new PrestoException(NOT_SUPPORTED, "arrays are not (yet?) supported for StringRowSerializer");
    }
    else if (Types.isMapType(type)) {
        throw new PrestoException(NOT_SUPPORTED, "maps are not (yet?) supported for StringRowSerializer");
    }
    else if (type.equals(BIGINT) && value instanceof Integer) {
        setLong(text, ((Integer) value).longValue());
    }
    else if (type.equals(BIGINT) && value instanceof Long) {
        setLong(text, (Long) value);
    }
    else if (type.equals(BOOLEAN)) {
        setBoolean(text, value.equals(Boolean.TRUE));
    }
    else if (type.equals(DATE)) {
        setDate(text, (Date) value);
    }
    else if (type.equals(DOUBLE)) {
        setDouble(text, (Double) value);
    }
    else if (type.equals(INTEGER) && value instanceof Integer) {
        setInt(text, (Integer) value);
    }
    else if (type.equals(INTEGER) && value instanceof Long) {
        setInt(text, ((Long) value).intValue());
    }
    else if (type.equals(REAL)) {
        setFloat(text, (Float) value);
    }
    else if (type.equals(SMALLINT)) {
        setShort(text, (Short) value);
    }
    else if (type.equals(TIME)) {
        setTime(text, (Time) value);
    }
    else if (type.equals(TIMESTAMP)) {
        setTimestamp(text, (Timestamp) value);
    }
    else if (type.equals(TINYINT)) {
        setByte(text, (Byte) value);
    }
    else if (type.equals(VARBINARY) && value instanceof byte[]) {
        setVarbinary(text, (byte[]) value);
    }
    else if (type.equals(VARBINARY) && value instanceof Slice) {
        setVarbinary(text, ((Slice) value).getBytes());
    }
    else if (type.equals(VARCHAR) && value instanceof String) {
        setVarchar(text, ((String) value));
    }
    else if (type.equals(VARCHAR) && value instanceof Slice) {
        setVarchar(text, ((Slice) value).toStringUtf8());
    }
    else {
        throw new PrestoException(NOT_SUPPORTED,
                format("StringLexicoder does not support encoding type %s, object class is %s", type,
                        value.getClass()));
    }

    return text.copyBytes();
}
From source file:com.facebook.presto.accumulo.tools.RewriteIndex.java
License:Apache License
private void addIndexEntries(Connector connector, AccumuloTable table, long start)
{
    LOG.info(format("Scanning data table %s to add index entries", table.getFullTableName()));
    BatchScanner scanner = null;
    BatchWriter indexWriter = null;
    try {
        // Create index writer and metrics writer, but we are never going to flush the metrics writer
        indexWriter = connector.createBatchWriter(table.getIndexTableName(), bwc);
        Indexer indexer = new Indexer(connector, table, indexWriter,
                table.getMetricsStorageInstance(connector).newWriter(table));
        LOG.info("Created indexer against " + table.getIndexTableName());

        scanner = connector.createBatchScanner(table.getFullTableName(), auths, 10);
        LOG.info(format("Created batch scanner against %s with auths %s", table.getFullTableName(), auths));

        IteratorSetting timestampFilter = new IteratorSetting(21, "timestamp", TimestampFilter.class);
        TimestampFilter.setRange(timestampFilter, 0L, start);
        scanner.addScanIterator(timestampFilter);

        scanner.setRanges(connector.tableOperations().splitRangeByTablets(table.getFullTableName(),
                new Range(), Integer.MAX_VALUE));

        long numRows = 0L;
        long numIndexEntries = 0L;
        Text prevRow = null;
        Text row = new Text();
        Text cf = new Text();
        Text cq = new Text();
        Mutation mutation = null;
        for (Entry<Key, Value> entry : scanner) {
            entry.getKey().getRow(row);
            entry.getKey().getColumnFamily(cf);
            entry.getKey().getColumnQualifier(cq);

            // If the rows do not match, index the mutation
            if (prevRow != null && !prevRow.equals(row)) {
                if (!dryRun) {
                    indexer.index(mutation);
                }
                ++numRows;
                mutation = null;

                if (numRows % 500000 == 0) {
                    if (dryRun) {
                        LOG.info(format(
                                "In progress, would have re-indexed %s rows containing %s index entries",
                                numRows, numIndexEntries));
                    }
                    else {
                        LOG.info(format("In progress, re-indexed %s rows containing %s index entries",
                                numRows, numIndexEntries));
                    }
                }
            }

            if (mutation == null) {
                mutation = new Mutation(row);
            }

            mutation.put(cf, cq, entry.getKey().getColumnVisibilityParsed(), entry.getKey().getTimestamp(),
                    entry.getValue());

            if (table.getColumns().stream()
                    .filter(column -> column.isIndexed()
                            && column.getFamily().isPresent()
                            && column.getQualifier().isPresent()
                            && column.getFamily().get().equals(new String(cf.copyBytes(), UTF_8))
                            && column.getQualifier().get().equals(new String(cq.copyBytes(), UTF_8)))
                    .count() > 0) {
                ++numIndexEntries;
            }

            if (prevRow == null) {
                prevRow = new Text(row);
            }
            else {
                prevRow.set(row);
            }
        }

        // Index the final mutation
        if (mutation != null) {
            if (!dryRun) {
                indexer.index(mutation);
            }
            ++numRows;
        }

        if (dryRun) {
            LOG.info(format(
                    "Finished dry run of rewriting index entries. Would have re-indexed %s rows containing %s index entries",
                    numRows, numIndexEntries));
        }
        else {
            LOG.info(format("Finished adding index entries. Re-indexed %s rows containing %s index entries",
                    numRows, numIndexEntries));
        }
    }
    catch (AccumuloException | AccumuloSecurityException e) {
        LOG.error("Accumulo exception", e);
    }
    catch (TableNotFoundException e) {
        LOG.error("Table not found, must have been deleted during process", e);
    }
    finally {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            }
            catch (MutationsRejectedException e) {
                LOG.error("Server rejected mutations", e);
            }
        }
        if (scanner != null) {
            scanner.close();
        }
    }
}
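In this example, copyBytes() feeds the new String(..., UTF_8) conversions used to compare the scanned column family and qualifier against the table metadata. Since getBytes() can return an array longer than the valid data, copyBytes() is the correct input for the String constructor that consumes the whole array.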
From source file:com.facebook.presto.accumulo.tools.RewriteMetricsTask.java
License:Apache License
private void incrementTimestampMetric(Map<Text, Map<Text, Map<ColumnVisibility, AtomicLong>>> rowMap,
        Text family, ColumnVisibility visibility, Text timestampValue)
{
    for (Entry<TimestampPrecision, Long> entry : getTruncatedTimestamps(
            serializer.decode(TIMESTAMP, timestampValue.copyBytes())).entrySet()) {
        Text timestampFamily = new Text(
                Bytes.concat(family.copyBytes(), TIMESTAMP_CARDINALITY_FAMILIES.get(entry.getKey())));

        Text row = new Text(serializer.encode(TIMESTAMP, entry.getValue()));

        Map<Text, Map<ColumnVisibility, AtomicLong>> familyMap = rowMap.get(row);
        if (familyMap == null) {
            familyMap = new HashMap<>();
            rowMap.put(row, familyMap);
        }

        Map<ColumnVisibility, AtomicLong> visibilityMap = familyMap.get(timestampFamily);
        if (visibilityMap == null) {
            visibilityMap = new HashMap<>();
            visibilityMap.put(new ColumnVisibility(), new AtomicLong(0));
            familyMap.put(timestampFamily, visibilityMap);
        }

        if (visibilityMap.containsKey(visibility)) {
            visibilityMap.get(visibility).incrementAndGet();
        }
        else {
            visibilityMap.put(visibility, new AtomicLong(1));
        }
    }
}
From source file:com.philiphubbard.digraph.MRBuildVerticesTest.java
License:Open Source License
private static void setupTest(Configuration conf) throws IOException {
    FileSystem fileSystem = FileSystem.get(conf);

    Path path = new Path(testInput);
    if (fileSystem.exists(path))
        fileSystem.delete(path, true);

    ArrayList<MRVertex> vertices = new ArrayList<MRVertex>();

    MRVertex v0 = new MRVertex(0, conf);
    v0.addEdgeTo(2);
    vertices.add(v0);

    MRVertex v1 = new MRVertex(1, conf);
    v1.addEdgeTo(2);
    vertices.add(v1);

    MRVertex v2 = new MRVertex(2, conf);
    v2.addEdgeTo(3);
    vertices.add(v2);

    MRVertex v3 = new MRVertex(3, conf);
    v3.addEdgeTo(4);
    vertices.add(v3);

    MRVertex v4 = new MRVertex(4, conf);
    v4.addEdgeTo(5);
    v4.addEdgeTo(6);
    vertices.add(v4);

    MRVertex v5 = new MRVertex(5, conf);
    vertices.add(v5);

    MRVertex v6 = new MRVertex(6, conf);
    v6.addEdgeTo(7);
    vertices.add(v6);

    MRVertex v7 = new MRVertex(7, conf);
    vertices.add(v7);

    FSDataOutputStream out = fileSystem.create(path);
    for (MRVertex vertex : vertices) {
        Text text = vertex.toText(MRVertex.EdgeFormat.EDGES_TO);
        byte[] bytes = text.copyBytes();
        for (byte b : bytes)
            out.write(b);
        out.write('\n');
    }
    out.close();

    fileSystem.close();
}
From source file:com.philiphubbard.sabe.MRAssemblerTest1.java
License:Open Source License
private static void setupTest(Configuration conf) throws IOException {
    FileSystem fileSystem = FileSystem.get(conf);

    Path path = new Path(testInput);
    if (fileSystem.exists(path))
        fileSystem.delete(path, true);

    ArrayList<Text> reads = new ArrayList<Text>();

    // Goal: AATTCGGCCTTCGGCAT

    reads.add(new Text("AATTCGGC\n"));
    reads.add(new Text("CTTCGGCAT\n"));

    reads.add(new Text("AATT\n"));
    reads.add(new Text("CGGCCTTCGGCAT\n"));

    reads.add(new Text("AATTCGGCCTTCG\n"));
    reads.add(new Text("GCAT\n"));

    FSDataOutputStream out = fileSystem.create(path);
    for (Text read : reads) {
        byte[] bytes = read.copyBytes();
        for (byte b : bytes)
            out.write(b);
    }
    out.close();

    fileSystem.close();
}
From source file:com.philiphubbard.sabe.MRAssemblerTest2.java
License:Open Source License
private static void setupTest(Configuration conf) throws IOException {
    FileSystem fileSystem = FileSystem.get(conf);

    Path path = new Path(testInput);
    if (fileSystem.exists(path))
        fileSystem.delete(path, true);

    ArrayList<Text> reads = new ArrayList<Text>();

    // The expected result:
    // CCCTTTCTGTTGACCCATCATTGTTTAGTAACCCGCGGGATGCCTGGCAGACCCGCGGGACGATCTCCTCTGACCCATCATCGAAATTCC
    // Note that it has the following pattern:
    // segment 0: CCCTTTCTGT
    // segment 1, which will be repeated: TGACCCATCA
    // segment 2: TTGTTTAGTA
    // segment 3, which will be repeated: ACCCGCGGGA
    // segment 4: TGCCTGGCAG
    // segment 3, again: ACCCGCGGGA
    // segment 5: CGATCTCCTC
    // segment 1, again: TGACCCATCA
    // segment 6: TCGAAATTCC

    reads.add(new Text("CCCTTTC\n"));
    // Error: initial T omitted.
    reads.add(new Text("GTTGACCCATCATTGTTTAGTAACCCGCGGGATGCCTGGCAGACC"));
    reads.add(new Text("CGCGGGACGAT\n"));
    // Error: final C omitted.
    reads.add(new Text("CTCCTCTGACCCATCATCGAAATTC\n"));

    reads.add(new Text("CCCTTTCTGTTGACCCAT\n"));
    // Error: final C replaced with G.
    reads.add(new Text("CATTGTTTAGTAACCCGCGGGATGCCTGGCAGACG\n"));
    reads.add(new Text("CGCGGGACGATCTCCTCTGACCCATCATCGAAATTCC\n"));

    // Error: C at index 14 replaced with A.
    reads.add(new Text("CCCTTTCTGTTGACACATCATTGTTTAGTAAC"));
    reads.add(new Text("CCGCGGGATGCC\n"));
    // Error: C at index 25 omitted.
    reads.add(new Text("TGGCAGACCCGCGGGACGATCTCCTTGACCCATCATCGAAATTCC\n"));

    reads.add(new Text("CCCTTTCTGTTGACCCATCATTGTTTAGTAACCCGCGGGATGCCTG\n"));
    // Error: G at index 10 replaced with T.
    reads.add(new Text("GCAGACCCGCTGGACGA\n"));
    reads.add(new Text("TCTCCTCTGACCCATCATCGAAATTCC\n"));

    reads.add(new Text("CCCTTTCTGTTGACCCATCATTGTTTAGTAACCCGCGGGATGC"));
    // Error: final G omitted.
    reads.add(new Text("CTGGCAGACCCGC\n"));
    reads.add(new Text("GGACGATCTCCTCT\n"));
    // Error: CG at index 10 transposed to GC.
    reads.add(new Text("GACCCATCATCGAAATTCC\n"));

    FSDataOutputStream out = fileSystem.create(path);
    for (Text read : reads) {
        byte[] bytes = read.copyBytes();
        for (byte b : bytes)
            out.write(b);
    }
    out.close();

    fileSystem.close();
}
From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java
License:Apache License
/**
 * Reading a bunch of lines of file paths into a list. The code in this method is
 * redistributed from the Hadoop LineRecordReader.
 *
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();

    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;
    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fis, decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        }
        else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    }
    else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;
    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {
        if (pos == 0) {
            // Strip BOM (Byte Order Mark). Text only supports UTF-8, so we only need
            // to check for the UTF-8 BOM (0xEF,0xBB,0xBF) at the start of the stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // Found a UTF-8 BOM; strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                }
                else {
                    nextLine.clear();
                }
            }
        }
        else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }

        paths.add(nextLine.toString());
        LOG.info("Read line of size " + newSize + " at pos " + (pos - newSize));
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    }
    finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
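This is the one example where the authors question whether the copy is necessary: after detecting a UTF-8 BOM, the code takes copyBytes() before calling nextLine.set(textBytes, 3, textLength). As the inline comment notes, re-setting a Text from a slice of its own backing array may in fact work, but copying first makes the operation unambiguously safe.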
From source file:dz.lab.mapred.hbase.custom_output.StartsWithCountReducer_HBase.java
@Override
protected void reduce(Text key, Iterable<IntWritable> counts, Context context)
        throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable count : counts) {
        sum += count.get();
    }
    // The reducer must output either a Put or a Delete object
    Put put = new Put(key.copyBytes());
    put.add(toBytes(FAMILY), toBytes(RESULT_COLUMN), toBytes(Integer.toString(sum)));
    context.write(null, put);
}
From source file:io.fluo.stress.trie.Init.java
License:Apache License
private Collection<Text> writeSplits(FluoConfiguration props, FileSystem fs, Connector conn, Path splitsPath)
        throws Exception {
    Collection<Text> splits1 = conn.tableOperations().listSplits(props.getAccumuloTable());
    OutputStream out = new BufferedOutputStream(fs.create(splitsPath));
    for (Text split : splits1) {
        out.write(Base64.encodeBase64(split.copyBytes()));
        out.write('\n');
    }
    out.close();
    return splits1;
}