List of usage examples for org.apache.hadoop.io.Text.getBytes()
@Override public byte[] getBytes()
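Note: getBytes() returns Text's internal backing array, which can be longer than the valid UTF-8 content; that is why the examples below pair it with getLength() or an explicit offset/length. The following is a minimal standalone sketch of that pitfall (TextGetBytesDemo is an illustrative name, not taken from the sources below):

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

public class TextGetBytesDemo {
    public static void main(String[] args) {
        Text text = new Text("hello, example value");          // allocates a backing array for the long value
        byte[] shorter = "short".getBytes(StandardCharsets.UTF_8);
        text.set(shorter, 0, shorter.length);                   // reuses the existing, larger backing array

        byte[] backing = text.getBytes();                       // backing array, may exceed the logical content
        int length = text.getLength();                          // number of valid bytes

        // Wrong: decodes the whole backing array, including unspecified bytes past getLength()
        // (often left over from earlier content).
        String wrong = new String(backing, StandardCharsets.UTF_8);
        // Right: restrict decoding to the valid range.
        String right = new String(backing, 0, length, StandardCharsets.UTF_8);

        System.out.println("backing length = " + backing.length + ", valid bytes = " + length);
        System.out.println("wrong = [" + wrong + "], right = [" + right + "]");
    }
}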
From source file:org.cloudata.examples.web.TermUploadJob.java
License:Apache License
public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermUploadJob <num of repeats> termUpload <inputPath> [#reduce]");
        System.exit(0);
    }
    JobConf jobConf = new JobConf(TermUploadJob.class);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 1) {
        maxReduce = Integer.parseInt(options[1]);
    }
    jobConf.setInt("mapred.task.timeout", 60 * 60 * 1000);

    FileSystem fs = FileSystem.get(jobConf);
    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TERM_TABLE)) {
        // Table
        Path path = new Path("blogdata/tmp/weight");
        FileStatus[] paths = fs.listStatus(path);
        if (paths == null || paths.length == 0) {
            LOG.error("No Partition info:" + path);
            return;
        }
        SortedSet<Text> terms = new TreeSet<Text>();
        Text text = new Text();
        for (FileStatus eachPath : paths) {
            CloudataLineReader reader = new CloudataLineReader(fs.open(eachPath.getPath()));
            while (true) {
                int length = reader.readLine(text);
                if (length <= 0) {
                    break;
                }
                terms.add(new Text(text));
            }
        }
        int termsPerTablet = terms.size() / (maxReduce - 1);
        int count = 0;
        List<Row.Key> rowKeys = new ArrayList<Row.Key>();
        for (Text term : terms) {
            count++;
            if (count == termsPerTablet) {
                rowKeys.add(new Row.Key(term.getBytes()));
                count = 0;
            }
        }
        rowKeys.add(Row.Key.MAX_KEY);

        TableSchema termTableInfo = new TableSchema(TERM_TABLE, "Test", TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, termTableInfo, rowKeys.toArray(new Row.Key[] {}));
    }
    CTable termTable = CTable.openTable(nconf, TERM_TABLE);
    TabletInfo[] tabletInfos = termTable.listTabletInfos();

    Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis());

    jobConf.setJobName("TermUploadJob" + "(" + new Date() + ")");
    FileInputFormat.addInputPath(jobConf, new Path(options[0]));

    // <MAP>
    jobConf.setMapperClass(TermUploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, TERM_TABLE);
    jobConf.setPartitionerClass(WebKeyRangePartitioner.class);
    jobConf.setMaxMapAttempts(0);
    // </MAP>

    // <REDUCE>
    jobConf.setReducerClass(TermUploadReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setNumReduceTasks(tabletInfos.length);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce);
    jobConf.setMaxReduceAttempts(0);
    // </REDUCE>

    // Run Job
    JobClient.runJob(jobConf);

    fs.delete(tempOutputPath);
}
From source file:org.cloudata.examples.web.TermUploadMap.java
License:Apache License
public void map(WritableComparable key, Writable value,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    collector.collect((Text) value, new Text(""));
    count++;
    if (count % 50000 == 0) {
        Text tValue = (Text) value;
        String keyStr = new String(tValue.getBytes(), 0, tValue.getLength(), "EUC-KR");
        System.out.println(keyStr);
    }
}
From source file:org.cloudata.examples.web.TermUploadReduce.java
License:Apache License
public void reduce(WritableComparable key, Iterator<Writable> values,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    if (exception != null) {
        throw exception;
    }
    Text tKey = (Text) key;
    int keyIndex = tKey.find("\t");
    if (keyIndex < 0) {
        LOG.error("invalid value:" + tKey);
        return;
    }

    Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, keyIndex);

    String keyStr = new String(tKey.getBytes(), keyIndex + 1, (tKey.getLength() - keyIndex - 1), "EUC-KR");

    // term, ?(tf), documentId url, freq, weight
    // term, ?(df), df
    String[] valueTokens = keyStr.split("\t");

    if (rowKey.getLength() < TestWebPage.MIN_TERM_LENGTH) {
        return;
    }

    count++;
    if (count % 50000 == 0) {
        System.out.println(new Date() + ":" + keyStr);
    }

    if (valueTokens.length == 2 && "df".equals(valueTokens[0])) {
        Row row = new Row(rowKey);
        row.addCell("df", new Cell(Cell.Key.EMPTY_KEY, valueTokens[1].getBytes()));
        dfUploader.put(row);
    } else if (valueTokens.length == 4 && "tf".equals(valueTokens[0])) {
        Row row = new Row(rowKey);
        String documentId = valueTokens[1];
        String freq = valueTokens[2];
        String weight = valueTokens[3];

        row.addCell("tf", new Cell(new Cell.Key(documentId), freq.getBytes()));
        row.addCell("weight", new Cell(new Cell.Key(documentId), weight.getBytes()));

        byte[] documentIdBytes = documentId.getBytes();
        row.addCell("i_weight", new Cell(
                new Cell.Key((df.format(1.0 - Double.parseDouble(weight)) + documentId).getBytes()),
                documentIdBytes));
        weightUploader.put(row);
    } else {
        LOG.error("invalid value:" + valueTokens.length + "," + count + "," + valueTokens[1] + "," + keyStr);
        return;
    }
}
From source file:org.cloudata.examples.web.TermWeightReduce.java
License:Apache License
public void reduce(WritableComparable key, Iterator<Writable> values,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    if (exception != null) {
        throw exception;
    }
    // key: term, value: documentId, freq, docLength
    Text tKey = (Text) key;
    Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, tKey.getLength());

    String keyStr = new String(tKey.getBytes(), 0, tKey.getLength(), "EUC-KR");
    if (tKey.getLength() == 0 || keyStr.trim().length() < TestWebPage.MIN_TERM_LENGTH) {
        return;
    }

    List<Object[]> termFreqs = new ArrayList<Object[]>(100);
    Set<String> docs = new HashSet<String>();

    while (values.hasNext()) {
        Text tValue = (Text) values.next();
        String valueStr = tValue.toString();
        String[] valueTokens = valueStr.split("\t");
        if (valueTokens.length < 3) {
            LOG.error("valueTokens != 3:" + valueStr);
            return;
        }
        String documentId = valueTokens[0];
        int freq = Integer.parseInt(valueTokens[1]);
        long docLength = Long.parseLong(valueTokens[2]);

        docs.add(documentId);
        termFreqs.add(new Object[] { documentId, freq, docLength });
        if (termFreqs.size() > 100000) {
            LOG.info("Too many tf:term=" + keyStr);
            break;
        }
    }

    int numOfdocument = docs.size();

    for (Object[] eachValue : termFreqs) {
        String documentId = (String) eachValue[0];
        int freq = (Integer) eachValue[1];
        long docLength = (Long) eachValue[2];

        double weight = getTermWeight(freq, docLength, avgDocLength, sumDocCount, numOfdocument);

        collector.collect(tKey,
                new Text("tf\t" + documentId + "\t" + String.valueOf(freq) + "\t" + df.format(weight)));

        termCount++;
        if (termCount % 100000 == 0) {
            System.out.println("term=" + keyStr + ",document=" + documentId + ",freq=" + freq + ",df="
                    + numOfdocument + "weight=" + df.format(weight));
        }
    }
    collector.collect(tKey, new Text("df\t" + numOfdocument));

    if (termCount % 100 == 0) {
        partitionOut.write(tKey.getBytes());
        partitionOut.write("\n".getBytes());
    }
}
From source file:org.cloudata.examples.web.TermWeightReduceOnline.java
License:Apache License
public void reduce(WritableComparable key, Iterator<Writable> values,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    if (exception != null) {
        throw exception;
    }
    // key: term, value: documentId, freq, docLength
    Text tKey = (Text) key;
    Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, tKey.getLength());

    String keyStr = new String(tKey.getBytes(), 0, tKey.getLength(), "EUC-KR");
    if (tKey.getLength() == 0 || keyStr.trim().length() < TestWebPage.MIN_TERM_LENGTH) {
        return;
    }

    Row row = termTable.get(rowKey, "df");
    if (row == null || row.getColumnSize() == 0) {
        LOG.error("No df for term:" + keyStr);
        return;
    }

    int docFreq = row.getOne("df").getValue().getValueAsInt();

    Row iRow = new Row(rowKey);
    int count = 0;

    List<ColumnValue> tfColumnValues = new ArrayList<ColumnValue>();
    List<ColumnValue> weightColumnValues = new ArrayList<ColumnValue>();
    List<ColumnValue> iWeightColumnValues = new ArrayList<ColumnValue>();

    while (values.hasNext()) {
        Text tValue = (Text) values.next();
        String valueStr = tValue.toString();
        String[] valueTokens = valueStr.split("\t");
        if (valueTokens.length < 3) {
            LOG.error("valueTokens != 3:" + valueStr);
            return;
        }
        String documentId = valueTokens[0];
        int freq = Integer.parseInt(valueTokens[1]);
        long docLength = Long.parseLong(valueTokens[2]);

        double weight = getTermWeight(freq, docLength, avgDocLength, sumDocCount, docFreq);

        iRow.addCell("tf", new Cell(new Cell.Key(documentId), Integer.toString(freq).getBytes()));
        iRow.addCell("weigth", new Cell(new Cell.Key(documentId), Integer.toString(freq).getBytes()));

        byte[] documentIdBytes = documentId.getBytes();
        iRow.addCell("i_weight",
                new Cell(new Cell.Key((df.format(1.0 - weight) + documentId).getBytes()), documentIdBytes));

        if (termCount % 100000 == 0) {
            System.out.println("term=" + keyStr + ",document=" + documentId + ",freq=" + freq + ",df=" + docFreq
                    + "weight=" + df.format(weight));
        }
        termCount++;

        count++;
        if (count % 500 == 0) {
            try {
                termTable.put(iRow);
            } catch (Exception e) {
                LOG.error(e);
            }
        }
        iRow = new Row(rowKey);
    }

    try {
        termTable.put(iRow);
    } catch (Exception e) {
        LOG.error(e);
    }
}
From source file:org.cloudata.examples.web.WebKeyRangePartitioner.java
License:Apache License
public int getPartition(WritableComparable key, Writable value, int numPartitions) {
    if (confException != null) {
        LOG.error(confException.getMessage(), confException);
        return -1;
    }

    if (numPartitions != tabletInfoSet.size()) {
        LOG.error("tablet count(" + tabletInfoSet.size() + ") not equals numPartitions (" + numPartitions + ")");
        return -1;
    }
    if (tabletInfoSet.size() == 0) {
        LOG.error("tablet partition size is zero");
        return -1;
    }
    int partitionNumber = 0;
    Text tKey = (Text) key;
    Row.Key rowKey;

    int keyIndex = tKey.find("\t");
    if (keyIndex < 0) {
        LOG.error("invalid value:" + tKey);
        rowKey = Row.Key.MAX_KEY;
    } else {
        rowKey = new Row.Key(tKey.getBytes(), 0, keyIndex);
    }

    SortedSet<RowKeyItem> tailSet = tabletInfoSet.tailSet(new RowKeyItem(rowKey, 0));
    RowKeyItem item = null;
    if (tailSet.size() > 0) {
        item = tailSet.first();
        partitionNumber = item.index;
    } else {
        item = tabletInfoSet.last();
        partitionNumber = item.index;
    }

    if (partitionNumber >= numPartitions) {
        LOG.info("Partition Number is : " + partitionNumber + ", numPartitions : " + numPartitions
                + ", Row.Key : " + key.toString());
        partitionNumber = numPartitions - 1;
    }
    // LOG.info("tablet partition num:" + partitionNumber);
    count++;
    if (count % 5000 == 0) {
        try {
            System.out.println("Partitioned:" + new String(rowKey.getBytes(), "EUC-KR") + ","
                    + new String(item.rowKey.getBytes(), "EUC-KR"));
        } catch (UnsupportedEncodingException e) {
        }
    }
    return partitionNumber;
}
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java
License:Apache License
static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;

    // iterate and validate stuff ...
    Text key = new Text();
    BytesWritable value = new BytesWritable();
    while (reader.next(key, value)) {

        TestRecord testRecord = records.get(itemIndex++);

        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping code replaces
        // invalid characters with '?', which would break our test case, which uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The content follows
        // this terminator; we search for this specific byte pattern to locate the start of content, then compare
        // it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);
}
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java
License:Apache License
static void validateArcFileItemSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, ArcFileItem> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;

    // iterate and validate stuff ...
    Text key = new Text();
    ArcFileItem value = new ArcFileItem();

    while (reader.next(key, value)) {

        TestRecord testRecord = records.get(itemIndex++);

        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping code replaces
        // invalid characters with '?', which would break our test case, which uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // the ArcFileItem already carries the decoded content, so compare its bytes directly against the source data ...
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getContent().getReadOnlyBytes(), value.getContent().getOffset(),
                value.getContent().getCount()) == 0);
        NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems());
        // validate metadata
        Assert.assertEquals("text/html", headers.findValue(Constants.ARCFileHeader_ARC_MimeType));
        Assert.assertEquals(value.getArcFilePos(), testRecord.streamPos);
        Assert.assertEquals(value.getArcFileSize(), testRecord.rawSize);
        Assert.assertEquals("test-value", headers.findValue("test"));
        Assert.assertEquals(value.getArcFileName(), ((FileSplit) split).getPath().getName());
    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);
}
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileRecordReaderTests.java
License:Apache License
@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {

    JobConf conf = new JobConf();
    FileSystem fs = LocalFileSystem.get(conf);
    Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));

    List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    FSDataOutputStream os = fs.create(path);
    try {
        // write the ARC File into memory
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                    testAttemptTime);
        }
        os.flush();
    } finally {
        os.close();
    }

    FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);

    ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(conf, split);

    int index = 0;

    // iterate and validate stuff ...
    Text key = reader.createKey();
    BytesWritable value = reader.createValue();

    while (reader.next(key, value)) {

        TestRecord testRecord = records.get(index++);

        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping code replaces
        // invalid characters with '?', which would break our test case, which uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The content follows
        // this terminator; we search for this specific byte pattern to locate the start of content, then compare
        // it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    fs.delete(path, false);
}
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java
License:Apache License
static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits,
        RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException {

    int splitDataIndex = getIndexOfSplit(splits, split);

    Assert.assertTrue(splitDataIndex != -1);

    List<TestRecord> records = splits.get(splitDataIndex).e1;

    int itemIndex = 0;

    // iterate and validate stuff ...
    while (reader.nextKeyValue()) {

        Text key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();

        TestRecord testRecord = records.get(itemIndex++);

        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's utf-8 mapping code replaces
        // invalid characters with '?', which would break our test case, which uses invalid characters to form the key).
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The content follows
        // this terminator; we search for this specific byte pattern to locate the start of content, then compare
        // it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    splits.remove(splitDataIndex);
}