List of usage examples for org.apache.hadoop.io.Text.getBytes()
@Override public byte[] getBytes()
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java
License:Apache License
static void validateArcFileItemSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits, RecordReader<Text, ArcFileItem> reader) throws IOException, InterruptedException { int splitDataIndex = getIndexOfSplit(splits, split); Assert.assertTrue(splitDataIndex != -1); List<TestRecord> records = splits.get(splitDataIndex).e1; int itemIndex = 0; // iterate and validate stuff ... while (reader.nextKeyValue()) { Text key = reader.getCurrentKey(); ArcFileItem value = reader.getCurrentValue(); TestRecord testRecord = records.get(itemIndex++); // get test key bytes as utf-8 bytes ... byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8")); // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters // with ?, which causes our test case (which does use invalid characters to from the key, to break. Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);/* w ww . j ava 2s . c o m*/ // retured bytes represent the header(encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator // we search for this specific byte pattern to locate start of content, then compare it against source ... 
Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length, value.getContent().getReadOnlyBytes(), value.getContent().getOffset(), value.getContent().getCount()) == 0); NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems()); // validate metadata Assert.assertEquals("text/html", headers.findValue(Constants.ARCFileHeader_ARC_MimeType)); Assert.assertEquals(value.getArcFilePos(), testRecord.streamPos); Assert.assertEquals(value.getArcFileSize(), testRecord.rawSize); Assert.assertEquals("test-value", headers.findValue("test")); Assert.assertEquals(value.getArcFileName(), ((FileSplit) split).getPath().getName()); } reader.close(); Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); splits.remove(splitDataIndex); }
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileRecordReaderTests.java
License:Apache License
@Test public void TestARCFileRecordReader() throws IOException, InterruptedException { Configuration conf = new Configuration(); FileSystem fs = LocalFileSystem.get(conf); Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test")); List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); FSDataOutputStream os = fs.create(path); try {// ww w . j ava2 s. c om // write the ARC File into memory ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis()); long testAttemptTime = System.currentTimeMillis(); for (TestRecord record : records) { ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime); } os.flush(); } finally { os.close(); } FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]); ARCFileRecordReader reader = new ARCFileRecordReader(); reader.initialize(split, new TaskAttemptContext(conf, new TaskAttemptID())); int index = 0; // iterate and validate stuff ... while (reader.nextKeyValue()) { Text key = reader.getCurrentKey(); BytesWritable value = reader.getCurrentValue(); TestRecord testRecord = records.get(index++); // get test key bytes as utf-8 bytes ... byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8")); // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters // with ?, which causes our test case (which does use invalid characters to from the key, to break. Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0); // retured bytes represent the header(encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator // we search for this specific byte pattern to locate start of content, then compare it against source ... 
int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(), "\r\n\r\n".getBytes()); indexofHeaderTerminator += 4; Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0); } reader.close(); Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); fs.delete(path, false); }
From source file:org.commoncrawl.util.JoinValue.java
License:Open Source License
public JoinValue(TextBytes tag, Text value) { _tag = tag;/*from ww w . j a v a2 s . c om*/ _type = TEXT_TYPE_JOIN_VALUE; _textValue = new TextBytes(); _textValue.set(value.getBytes(), 0, value.getLength()); }
From source file:org.commoncrawl.util.shared.ArcFileReaderTests.java
License:Apache License
/** * test basic reader functionality by creating a mock ARCFile in memory and then reading it back and validating the contents... *///from w w w .j ava2s .com @Test public void testReader() { DataOutputBuffer os = new DataOutputBuffer(); long timestamp = System.currentTimeMillis(); try { // write the ARC File into memory writeFirstRecord(os, "test", timestamp); List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT); long testAttemptTime = System.currentTimeMillis(); for (TestRecord record : records) { NIOHttpHeaders headers = new NIOHttpHeaders(); for (int i = 0; i < record.headers.size(); ++i) { headers.set(record.headers.get(i).e0, record.headers.get(i).e1); } write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers, "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime); } os.flush(); os.close(); final AtomicBoolean streamClosed = new AtomicBoolean(); // setup ArcFileReader to read the file InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) { public synchronized int read(byte b[], int off, int len) { len = 1; return super.read(b, off, len); } public void close() throws IOException { super.close(); streamClosed.set(true); } }; ARCFileReader reader = new ARCFileReader(in); int index = 0; Text key = new Text(); BytesWritable value = new BytesWritable(); // iterate and validate stuff ... while (reader.hasMoreItems()) { reader.nextKeyValue(key, value); TestRecord testRecord = records.get(index++); // get test key bytes as utf-8 bytes ... byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8")); // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters // with ?, which causes our test case (which does use invalid characters to from the key, to break. 
Assert.assertTrue( compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0); // retured bytes represent the header(encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator // we search for this specific byte pattern to locate start of content, then compare it against source ... int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(), "\r\n\r\n".getBytes()); if (indexofHeaderTerminator == -1) { throw new IOException("No Header Terminator found in Value!"); } indexofHeaderTerminator += 4; // read headers ... String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator, Charset.forName("UTF-8")); NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText); for (int i = 0; i < testRecord.headers.size(); ++i) { Pair<String, String> testHeaderRecord = testRecord.headers.get(i); Assert.assertNotNull(headers.findValue(testHeaderRecord.e0)); Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0)); } Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0); } reader.close(); Assert.assertEquals(index, BASIC_TEST_RECORD_COUNT); Assert.assertTrue(streamClosed.get()); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException(e); } }
From source file:org.commoncrawl.util.TextBytes.java
License:Open Source License
/**
 * Sets this instance from the given text.
 *
 * <p>Delegates to {@code set(byte[], int, int)} with the text's backing array, offset 0 and
 * the text's length, so bytes beyond {@code getLength()} are not included.
 *
 * @param other the text to take the bytes from
 */
public void set(Text other) {
  final byte[] backing = other.getBytes();
  final int validLength = other.getLength();
  set(backing, 0, validLength);
}
From source file:org.commoncrawl.util.URLUtils.java
License:Open Source License
/**
 * Extracts the host name from a URL stored in a {@code Text} key.
 *
 * @param key the URL; only its first {@code getLength()} bytes are examined
 * @return the host name, or {@code null} when {@code fastGetHostFromTextURL} returns
 *         {@code null} or a zero-length result
 */
public static String getHostNameFromURLKey(Text key) {
  fastGetResult result = fastGetHostFromTextURL(key.getBytes(), 0, key.getLength());
  if (result == null || result.length == 0) {
    return null;
  }
  // Text holds UTF-8 encoded bytes, so decode with UTF-8 explicitly rather than relying on the
  // platform default charset.
  return new String(key.getBytes(), result.offset, result.length,
      java.nio.charset.Charset.forName("UTF-8"));
}
From source file:org.culturegraph.mf.cluster.job.merge.ResultMapper.java
License:Apache License
@Override public void map(final Text tag, final TextArrayWritable members, final Context context) throws IOException, InterruptedException { if (tag.equals(Union.OPEN)) { return;//from ww w. j a v a2 s. c om } memberSet.clear(); members.copyTo(memberSet); final Text representative = memberSet.pollFirst(); for (Text member : memberSet) { final Put put = new Put(member.getBytes()); put.add(Column.Family.PROPERTY, REDIRECT, representative.getBytes()); htable.put(put); } context.getCounter(Union.UNION_FIND, "redirects written").increment(memberSet.size()); }
From source file:org.elasticsearch.hadoop.mr.MapReduceWriter.java
License:Apache License
@SuppressWarnings("unchecked") public boolean write(Writable writable, Generator generator) { if (writable == null || writable instanceof NullWritable) { generator.writeNull();//w ww .j ava2s . c om } else if (writable instanceof Text) { Text text = (Text) writable; generator.writeUTF8String(text.getBytes(), 0, text.getLength()); } else if (writable instanceof UTF8) { UTF8 utf8 = (UTF8) writable; generator.writeUTF8String(utf8.getBytes(), 0, utf8.getLength()); } else if (writable instanceof IntWritable) { generator.writeNumber(((IntWritable) writable).get()); } else if (writable instanceof LongWritable) { generator.writeNumber(((LongWritable) writable).get()); } else if (writable instanceof VLongWritable) { generator.writeNumber(((VLongWritable) writable).get()); } else if (writable instanceof VIntWritable) { generator.writeNumber(((VIntWritable) writable).get()); } else if (writable instanceof ByteWritable) { generator.writeNumber(((ByteWritable) writable).get()); } else if (writable instanceof DoubleWritable) { generator.writeNumber(((DoubleWritable) writable).get()); } else if (writable instanceof FloatWritable) { generator.writeNumber(((FloatWritable) writable).get()); } else if (writable instanceof BooleanWritable) { generator.writeBoolean(((BooleanWritable) writable).get()); } else if (writable instanceof BytesWritable) { BytesWritable bw = (BytesWritable) writable; generator.writeBinary(bw.getBytes(), 0, bw.getLength()); } else if (writable instanceof MD5Hash) { generator.writeString(writable.toString()); } else if (writable instanceof ArrayWritable) { generator.writeBeginArray(); for (Writable wrt : ((ArrayWritable) writable).get()) { if (!write(wrt, generator)) { return false; } } generator.writeEndArray(); } else if (writable instanceof AbstractMapWritable) { Map<Writable, Writable> map = (Map<Writable, Writable>) writable; generator.writeBeginObject(); // ignore handling sets (which are just maps with null values) for (Entry<Writable, Writable> entry : 
map.entrySet()) { generator.writeFieldName(entry.getKey().toString()); if (!write(entry.getValue(), generator)) { return false; } } generator.writeEndObject(); } else { if (writeUnknownTypes) { return handleUnknown(writable, generator); } return false; } return true; }
From source file:org.elasticsearch.hadoop.mr.SafeWritableConverter.java
License:Apache License
public void invoke(Object from, BytesArray to) { // handle common cases if (from instanceof Text) { Text t = (Text) from; to.bytes(t.getBytes(), t.getLength()); }/* ww w .ja v a 2s . c o m*/ if (from instanceof BytesWritable) { BytesWritable b = (BytesWritable) from; to.bytes(b.getBytes(), b.getLength()); } }
From source file:org.elasticsearch.hadoop.mr.WritableBytesConverter.java
License:Apache License
@Override public void convert(Object from, BytesArray to) { // handle common cases if (from instanceof Text) { Text t = (Text) from; to.bytes(t.getBytes(), t.getLength()); return;/*from ww w .jav a 2 s .c o m*/ } if (from instanceof BytesWritable) { BytesWritable b = (BytesWritable) from; to.bytes(b.getBytes(), b.getLength()); return; } super.convert(from, to); }