List of usage examples for org.apache.hadoop.io.Text.getBytes()
@Override public byte[] getBytes()
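A note on the contract before the examples: getBytes() returns Text's internal backing buffer, which may be longer than the logical content, so only the first getLength() bytes are valid. A minimal sketch of the safe pattern (the class and variable names here are illustrative, not from any of the source files below):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.hadoop.io.Text;

public class TextGetBytesSketch {
    public static void main(String[] args) {
        Text t = new Text("hadoop");
        t.set(new Text("pig")); // reuses the existing 6-byte buffer; only the logical length shrinks

        byte[] raw = t.getBytes(); // backing buffer: still 6 bytes, with stale bytes past getLength()
        int len = t.getLength();   // 3 valid bytes

        // Take an exact-length copy before handing the bytes to code that
        // trusts array length instead of an explicit (offset, length) pair.
        byte[] exact = Arrays.copyOf(raw, len);
        System.out.println(new String(exact, StandardCharsets.UTF_8)); // prints "pig"
    }
}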
From source file:cosmos.mapred.MediawikiMapper.java
License:Apache License
/**
 * Called once for each key/value pair in the input split. Most applications
 * should override this, but the default is the identity function.
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    Object o;
    try {
        o = unmarshaller.unmarshal(new ByteArrayInputStream(value.getBytes(), 0, value.getLength()));
    } catch (JAXBException e) {
        throw new IOException("Couldn't unmarshall '" + value + "'", e);
    }

    PageType pageType = (PageType) o;
    Page page = pageTypeToPage(pageType);

    Value protobufValue = new Value(page.toByteArray());

    Mutation m = new Mutation(Long.toString(page.getId()));
    m.put(empty, empty, protobufValue);

    context.write(tableName, m);
}
From source file:crunch.MaxTemperature.java
License:Apache License
public static void main(String[] args) {
    Text t = new Text("\u0041\u00DF\u6771\uD801\uDC00");
    ByteBuffer buf = ByteBuffer.wrap(t.getBytes(), 0, t.getLength());
    int cp;
    while (buf.hasRemaining() && (cp = Text.bytesToCodePoint(buf)) != -1) {
        System.out.println(Integer.toHexString(cp));
    }
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Test
public void test() throws IOException {
    // vv TextTest
    Text t = new Text("hadoop");
    assertThat(t.getLength(), is(6));
    assertThat(t.getBytes().length, is(6));

    assertThat(t.charAt(2), is((int) 'd'));
    assertThat("Out of bounds", t.charAt(100), is(-1));
    // ^^ TextTest
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Test
public void mutability() throws IOException {
    // vv TextTest-Mutability
    Text t = new Text("hadoop");
    t.set("pig");
    assertThat(t.getLength(), is(3));
    assertThat(t.getBytes().length, is(3));
    // ^^ TextTest-Mutability
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Test
public void byteArrayNotShortened() throws IOException {
    // vv TextTest-ByteArrayNotShortened
    Text t = new Text("hadoop");
    t.set(/*[*/new Text("pig")/*]*/);
    assertThat(t.getLength(), is(3));
    assertThat("Byte length not shortened", t.getBytes().length, /*[*/is(6)/*]*/);
    // ^^ TextTest-ByteArrayNotShortened
}
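When an exact-length array is needed, recent Hadoop versions also provide Text.copyBytes(), which returns a fresh copy trimmed to getLength(). A small companion sketch to the test above (not from the original source file):

@Test
public void copyBytesIsShortened() throws IOException {
    Text t = new Text("hadoop");
    t.set(new Text("pig"));
    assertThat(t.getBytes().length, is(6));  // backing buffer keeps its old capacity
    assertThat(t.copyBytes().length, is(3)); // copyBytes() trims to getLength()
}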
From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java
License:Apache License
/**
 * Reads a list of file paths, one per line.
 * The code in this method is redistributed from Hadoop LineRecordReader.
 *
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();

    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);

    SplitLineReader in;
    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fis,
                    decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }

    long pos = begin;
    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {
        if (pos == 0) {
            // Strip the BOM (Byte Order Mark). Text only supports UTF-8,
            // so we only need to check for the UTF-8 BOM (0xEF,0xBB,0xBF)
            // at the start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // Found the UTF-8 BOM; strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }

        if (newSize == 0) {
            // EOF: readLine() returned nothing, so stop before adding an empty entry.
            break;
        }

        paths.add(nextLine.toString());
        LOG.info("Read line of size " + newSize + " at pos " + (pos - newSize));
    }
    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step2DistinctDataJobTest.java
License:Apache License
@Test
public void testSplit() throws Exception {
    Text key = new Text("123_456789");

    // hard-split using array copy
    int i = key.find("_", 0);

    Text outputKey = new Text("");
    byte[] bytes = key.getBytes();
    // getBytes() returns the backing buffer, which may be longer than the
    // logical content, so compute lengths from getLength(), not bytes.length.
    outputKey.append(bytes, i + 1, key.getLength() - i - 1);
    String fileName = new String(bytes, 0, i);

    assertEquals("123", fileName);
    assertEquals("456789", outputKey.toString());
}
From source file:diamondmapreduce.NLineRecordReader.java
License:Apache License
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (key == null) {
        key = new LongWritable();
    }
    key.set(pos);
    if (value == null) {
        value = new Text();
    }
    value.clear();
    final Text endline = new Text("\n");
    int newSize = 0;
    for (int i = 0; i < NLINESTOPROCESS; i++) {
        Text v = new Text();
        while (pos < end) {
            newSize = in.readLine(v, maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
            value.append(v.getBytes(), 0, v.getLength());
            value.append(endline.getBytes(), 0, endline.getLength());
            if (newSize == 0) {
                break;
            }
            pos += newSize;
            if (newSize < maxLineLength) {
                break;
            }
        }
    }
    if (newSize == 0) {
        key = null;
        value = null;
        return false;
    } else {
        return true;
    }
}
From source file:eastcircle.terasort.TotalOrderPartitioner.java
License:Apache License
/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly.
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper, Text prefix, int maxDepth) {
    int depth = prefix.getLength();
    if (depth >= maxDepth || lower == upper) {
        return new LeafTrieNode(depth, splits, lower, upper);
    }
    InnerTrieNode result = new InnerTrieNode(depth);
    Text trial = new Text(prefix);
    // append an extra byte on to the prefix
    trial.append(new byte[1], 0, 1);
    int currentBound = lower;
    for (int ch = 0; ch < 255; ++ch) {
        trial.getBytes()[depth] = (byte) (ch + 1);
        lower = currentBound;
        while (currentBound < upper) {
            if (splits[currentBound].compareTo(trial) >= 0) {
                break;
            }
            currentBound += 1;
        }
        trial.getBytes()[depth] = (byte) ch;
        result.child[ch] = buildTrie(splits, lower, currentBound, trial, maxDepth);
    }
    // pick up the rest
    trial.getBytes()[depth] = (byte) 255;
    result.child[255] = buildTrie(splits, currentBound, upper, trial, maxDepth);
    return result;
}
From source file:edu.american.student.redis.hadoop.RedisBigTableRecordWriter.java
License:Apache License
@Override
public void write(RedisBigTableKey key, Text value) throws IOException, InterruptedException {
    try {
        // Note: getBytes() returns the backing buffer, which may be longer
        // than getLength(), so trailing stale bytes can be written as well.
        foreman.write(table, key, value.getBytes());
    } catch (RedisForemanException e) {
        throw new IOException(MessageFactory.objective("Write key/value").objects(key, value).toString(), e);
    }
}
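Since getBytes() hands back the whole backing buffer, a writer like this one usually wants to trim to getLength() first. A hedged variant of the call above (foreman and table come from the surrounding class; java.util.Arrays is assumed imported):

foreman.write(table, key, Arrays.copyOf(value.getBytes(), value.getLength()));
// or, equivalently, on recent Hadoop versions:
foreman.write(table, key, value.copyBytes());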