List of usage examples for org.apache.hadoop.io.Text.decode
public static String decode(byte[] utf8, int start, int length) throws CharacterCodingException
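Before the project-specific examples below, here is a minimal, self-contained sketch of calling the three-argument overload directly to decode a sub-range of a UTF-8 byte array. It is not taken from any of the projects listed on this page; the class and variable names are invented for illustration.

import java.nio.charset.CharacterCodingException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class TextDecodeSketch {
    public static void main(String[] args) throws CharacterCodingException {
        byte[] utf8 = "hello\tworld".getBytes(StandardCharsets.UTF_8);

        // Decode only the five bytes that follow the tab (byte positions 6..10).
        String word = Text.decode(utf8, 6, 5);
        System.out.println(word); // prints "world"
    }
}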
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.chipster.SummarySort.java
License:Open Source License
@Override
public boolean nextKeyValue() throws IOException, CharacterCodingException {
    if (!lineRR.nextKeyValue())
        return false;

    Text line = getCurrentValue();

    int tabOne = line.find("\t");
    int rid = Integer.parseInt(Text.decode(line.getBytes(), 0, tabOne));

    int tabTwo = line.find("\t", tabOne + 1);
    int posBeg = tabOne + 1;
    int posEnd = tabTwo - 1;
    int pos = Integer.parseInt(Text.decode(line.getBytes(), posBeg, posEnd - posBeg + 1));

    key.set(BAMRecordReader.getKey0(rid, pos));
    return true;
}
From source file:hivemall.utils.hadoop.JsonSerdeUtils.java
License:Apache License
@SuppressWarnings("deprecation")
@Nullable
private static Object extractCurrentField(@Nonnull final JsonParser p,
        @Nonnull final HCatFieldSchema hcatFieldSchema, final boolean isTokenCurrent) throws IOException {
    JsonToken valueToken;
    if (isTokenCurrent) {
        valueToken = p.getCurrentToken();
    } else {
        valueToken = p.nextToken();
    }

    final Object val;
    switch (hcatFieldSchema.getType()) {
        case INT:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getIntValue();
            break;
        case TINYINT:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getByteValue();
            break;
        case SMALLINT:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getShortValue();
            break;
        case BIGINT:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getLongValue();
            break;
        case BOOLEAN:
            String bval = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
            if (bval != null) {
                val = Boolean.valueOf(bval);
            } else {
                val = null;
            }
            break;
        case FLOAT:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getFloatValue();
            break;
        case DOUBLE:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getDoubleValue();
            break;
        case STRING:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
            break;
        case BINARY:
            String b = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
            if (b != null) {
                try {
                    String t = Text.decode(b.getBytes(), 0, b.getBytes().length);
                    return t.getBytes();
                } catch (CharacterCodingException e) {
                    throw new IOException("Error generating json binary type from object.", e);
                }
            } else {
                val = null;
            }
            break;
        case DATE:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : Date.valueOf(p.getText());
            break;
        case TIMESTAMP:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : Timestamp.valueOf(p.getText());
            break;
        case DECIMAL:
            val = (valueToken == JsonToken.VALUE_NULL) ? null : HiveDecimal.create(p.getText());
            break;
        case VARCHAR:
            int vLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength();
            val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveVarchar(p.getText(), vLen);
            break;
        case CHAR:
            int cLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength();
            val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveChar(p.getText(), cLen);
            break;
        case ARRAY:
            if (valueToken == JsonToken.VALUE_NULL) {
                val = null;
                break;
            }
            if (valueToken != JsonToken.START_ARRAY) {
                throw new IOException("Start of Array expected");
            }
            final List<Object> arr = new ArrayList<>();
            final HCatFieldSchema elemSchema = hcatFieldSchema.getArrayElementSchema().get(0);
            while ((valueToken = p.nextToken()) != JsonToken.END_ARRAY) {
                arr.add(extractCurrentField(p, elemSchema, true));
            }
            val = arr;
            break;
        case MAP:
            if (valueToken == JsonToken.VALUE_NULL) {
                val = null;
                break;
            }
            if (valueToken != JsonToken.START_OBJECT) {
                throw new IOException("Start of Object expected");
            }
            final Map<Object, Object> map = new LinkedHashMap<>();
            final HCatFieldSchema valueSchema = hcatFieldSchema.getMapValueSchema().get(0);
            while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
                Object k = getObjectOfCorrespondingPrimitiveType(p.getCurrentName(),
                        hcatFieldSchema.getMapKeyTypeInfo());
                Object v = extractCurrentField(p, valueSchema, false);
                map.put(k, v);
            }
            val = map;
            break;
        case STRUCT:
            if (valueToken == JsonToken.VALUE_NULL) {
                val = null;
                break;
            }
            if (valueToken != JsonToken.START_OBJECT) {
                throw new IOException("Start of Object expected");
            }
            HCatSchema subSchema = hcatFieldSchema.getStructSubSchema();
            int sz = subSchema.getFieldNames().size();
            List<Object> struct = new ArrayList<>(Collections.nCopies(sz, null));
            while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
                populateRecord(struct, valueToken, p, subSchema);
            }
            val = struct;
            break;
        default:
            throw new IOException("Unknown type found: " + hcatFieldSchema.getType());
    }
    return val;
}
From source file:hivemall.utils.hadoop.JsonSerdeUtils.java
License:Apache License
@Nonnull
private static Object getObjectOfCorrespondingPrimitiveType(String s, PrimitiveTypeInfo mapKeyType)
        throws IOException {
    switch (Type.getPrimitiveHType(mapKeyType)) {
        case INT:
            return Integer.valueOf(s);
        case TINYINT:
            return Byte.valueOf(s);
        case SMALLINT:
            return Short.valueOf(s);
        case BIGINT:
            return Long.valueOf(s);
        case BOOLEAN:
            return (s.equalsIgnoreCase("true"));
        case FLOAT:
            return Float.valueOf(s);
        case DOUBLE:
            return Double.valueOf(s);
        case STRING:
            return s;
        case BINARY:
            try {
                String t = Text.decode(s.getBytes(), 0, s.getBytes().length);
                return t.getBytes();
            } catch (CharacterCodingException e) {
                throw new IOException("Error generating json binary type from object.", e);
            }
        case DATE:
            return Date.valueOf(s);
        case TIMESTAMP:
            return Timestamp.valueOf(s);
        case DECIMAL:
            return HiveDecimal.create(s);
        case VARCHAR:
            return new HiveVarchar(s, ((BaseCharTypeInfo) mapKeyType).getLength());
        case CHAR:
            return new HiveChar(s, ((BaseCharTypeInfo) mapKeyType).getLength());
        default:
            throw new IOException("Could not convert from string to map type " + mapKeyType.getTypeName());
    }
}
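The two JsonSerdeUtils examples above pass the bytes of a JSON string straight into Text.decode(). As a separate round-trip sketch (not part of Hivemall; the class name is invented), Text.encode() can be used to produce the UTF-8 bytes that Text.decode() then reads back, which also shows where the start and length arguments come from.

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;

import org.apache.hadoop.io.Text;

public class EncodeDecodeRoundTrip {
    public static void main(String[] args) throws CharacterCodingException {
        // Text.encode() returns a ByteBuffer whose backing array may be larger
        // than the encoded data, so the valid byte range is [0, limit()).
        ByteBuffer buf = Text.encode("caffè latte");
        String roundTripped = Text.decode(buf.array(), 0, buf.limit());
        System.out.println(roundTripped); // caffè latte
    }
}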
From source file:it.crs4.seal.common.CutText.java
License:Open Source License
public void loadRecord(Text record) throws FormatException {
    int pos = 0;     // the byte position within the record
    int fieldno = 0; // the field index within the record
    int colno = 0;   // the index within the list of requested fields (columns)
    try {
        while (pos < record.getLength() && colno < columns.size()) // iterate over each field
        {
            int endpos = record.find(delim, pos); // the field's end position
            if (endpos < 0)
                endpos = record.getLength();

            if (columns.get(colno) == fieldno) // if we're at a requested field
            {
                extractedFields[colno] = Text.decode(record.getBytes(), pos, endpos - pos);
                extractedFieldPositions[colno] = pos;
                colno += 1; // advance column
            }

            pos = endpos + 1; // the next starting position is the current end + 1
            fieldno += 1;
        }
    } catch (java.nio.charset.CharacterCodingException e) {
        throw new FormatException("character coding exception. Message: " + e.getMessage(), record);
    }

    if (colno < columns.size())
        throw new FormatException("Missing field(s) in record. Field " + colno + " (zero-based) not found.",
                record);
}
From source file:it.crs4.seal.common.TextSamMapping.java
License:Open Source License
protected String getTagText(String name) {
    if (tagsStart >= unparsedData.getLength()) // no tags
        return null;

    String text = null;
    try {
        int pos = unparsedData.find(Delim + name, tagsStart - 1);
        if (pos >= 0) {
            int fieldEnd = unparsedData.find(Delim, pos + 1);
            // fieldEnd: index one position beyond the last char of the field
            if (fieldEnd < 0)
                fieldEnd = unparsedData.getLength();
            // decode n bytes from start
            //   start = pos + 1 (+1 to skip the delimiter)
            //   n = fieldEnd - start
            //     = fieldEnd - (pos + 1)
            //     = fieldEnd - pos - 1
            text = Text.decode(unparsedData.getBytes(), pos + 1, fieldEnd - pos - 1);
        }
    } catch (java.nio.charset.CharacterCodingException e) {
        throw new RuntimeException(
                "character coding error retrieving tag '" + name + "' from SAM record " + this.toString());
    }

    return text;
}
From source file:it.crs4.seal.prq.PairReadsQSeqMapper.java
License:Open Source License
public void map(Text readId, SequencedFragment read, IMRContext<SequenceId, Text> context)
        throws IOException, InterruptedException {
    // build the key
    builder.delete(0, builder.length());

    // fields up to and including the index number go in the location; the read number goes on its own
    if (read.getRead() == null)
        throw new RuntimeException("Cannot get read number from read: " + readId);

    if (read.getLane() != null && read.getTile() != null && read.getXpos() != null && read.getYpos() != null) {
        appendIdToBuilder(builder, read); // appends the read id to the builder provided
        // finally the index field
        builder.append("#").append(read.getIndexSequence() == null ? '0' : read.getIndexSequence());
        sequenceKey.set(builder.toString(), read.getRead());
    } else {
        // maybe it's a fastq id with a trailing read number (/1 or /2)
        if (readId.getLength() > 2) {
            int last = readId.getLength() - 1;
            if (readId.charAt(last - 1) == '/') {
                // truncate the /[12] from the read id
                // last == length - 1. We want length - 2 bytes, which is equal to last - 1
                sequenceKey.set(Text.decode(readId.getBytes(), 0, last - 1), read.getRead());
            } else
                throw new RuntimeException(
                        "Didn't find /read_number at end of the read id. Please use qseq files or fastq with illumina-formatted name tags.");
        } else
            throw new RuntimeException("Read id " + readId
                    + " is too short. Please use qseq files or fastq with illumina-formatted name tags.");
    }

    // then the tab-delimited value
    sequenceValue.clear();
    sequenceValue.append(read.getSequence().getBytes(), 0, read.getSequence().getLength());
    sequenceValue.append(Delim, 0, Delim.length);
    sequenceValue.append(read.getQuality().getBytes(), 0, read.getQuality().getLength());
    sequenceValue.append(Delim, 0, Delim.length);
    // the filter flag is optional. If it's absent we assume the read passes filtering.
    sequenceValue.append(ZeroOne, (read.getFilterPassed() == null || read.getFilterPassed() ? 1 : 0), 1);

    context.write(sequenceKey, sequenceValue);
    context.progress();
}
From source file:it.uniroma1.hadoop.pagerank.job1.PageRankJob1Mapper.java
License:Open Source License
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    /* The Job #1 mapper simply parses a line of the input graph, emitting key-value pairs.
     * The input format is the following (separator is TAB):
     *
     *     <nodeA>    <nodeB>
     *
     * which denotes an edge going from <nodeA> to <nodeB>.
     * We need to skip comment lines (denoted by a '#' character at the beginning of the line).
     * We also collect all the distinct nodes in our graph: this is needed to compute the initial
     * pagerank value in the Job #1 reducer and also in later jobs.
     */
    if (value.charAt(0) != '#') {
        int tabIndex = value.find("\t");
        String nodeA = Text.decode(value.getBytes(), 0, tabIndex);
        String nodeB = Text.decode(value.getBytes(), tabIndex + 1, value.getLength() - (tabIndex + 1));
        context.write(new Text(nodeA), new Text(nodeB));

        // add the current source node to the node list so we can
        // compute the total amount of nodes of our graph in Job#2
        PageRank.NODES.add(nodeA);
        // also add the target node to the same list: we may have a target node
        // with no outlinks (so it will never be parsed as source)
        PageRank.NODES.add(nodeB);
    }
}
From source file:it.uniroma1.hadoop.pagerank.job2.PageRankJob2Mapper.java
License:Open Source License
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    /* PageRank calculation algorithm (mapper)
     * Input file format (separator is TAB):
     *
     *     <title>    <page-rank>    <link1>,<link2>,<link3>,<link4>,...,<linkN>
     *
     * The output has two kinds of records.
     * One record composed of the collection of links of each page:
     *
     *     <title>    |<link1>,<link2>,<link3>,<link4>,...,<linkN>
     *
     * Another record composed of the linked page, the page rank of the source page,
     * and the total number of outlinks of the source page:
     *
     *     <link>    <page-rank>    <total-links>
     */
    int tIdx1 = value.find("\t");
    int tIdx2 = value.find("\t", tIdx1 + 1);

    // extract tokens from the current line
    String page = Text.decode(value.getBytes(), 0, tIdx1);
    String pageRank = Text.decode(value.getBytes(), tIdx1 + 1, tIdx2 - (tIdx1 + 1));
    String links = Text.decode(value.getBytes(), tIdx2 + 1, value.getLength() - (tIdx2 + 1));

    String[] allOtherPages = links.split(",");
    for (String otherPage : allOtherPages) {
        Text pageRankWithTotalLinks = new Text(pageRank + "\t" + allOtherPages.length);
        context.write(new Text(otherPage), pageRankWithTotalLinks);
    }

    // put the original links so the reducer is able to produce the correct output
    context.write(new Text(page), new Text(PageRank.LINKS_SEPARATOR + links));
}
From source file:it.uniroma1.hadoop.pagerank.job3.PageRankJob3Mapper.java
License:Open Source License
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    /* Rank Ordering (mapper only)
     * Input file format (separator is TAB):
     *
     *     <title>    <page-rank>    <link1>,<link2>,<link3>,<link4>,...,<linkN>
     *
     * This is a simple job which does the ordering of our documents according to the computed pagerank.
     * We map the pagerank (key) to its value (page) and Hadoop does the sorting on keys for us.
     * There is no need to implement a reducer: the mapping and sorting is enough for our purpose.
     */
    int tIdx1 = value.find("\t");
    int tIdx2 = value.find("\t", tIdx1 + 1);

    // extract tokens from the current line
    String page = Text.decode(value.getBytes(), 0, tIdx1);
    float pageRank = Float.parseFloat(Text.decode(value.getBytes(), tIdx1 + 1, tIdx2 - (tIdx1 + 1)));

    context.write(new DoubleWritable(pageRank), new Text(page));
}
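The three PageRank mappers above share the same parsing idiom: locate the tab separators with Text.find() and decode only the byte ranges that are needed, instead of converting the whole line with toString() and splitting it. A stand-alone sketch of that idiom follows; it is not taken from the project, and the sample record is invented. Note that Text.getBytes() returns the backing array, which may be longer than the actual data, so every decode must stay within [0, getLength()).

import java.nio.charset.CharacterCodingException;

import org.apache.hadoop.io.Text;

public class TabSplitSketch {
    public static void main(String[] args) throws CharacterCodingException {
        Text line = new Text("pageA\t0.25\tpageB,pageC");

        int tIdx1 = line.find("\t");            // byte position of the first tab
        int tIdx2 = line.find("\t", tIdx1 + 1); // byte position of the second tab

        // decode(bytes, start, length): only the requested byte range is converted.
        String title = Text.decode(line.getBytes(), 0, tIdx1);
        String rank  = Text.decode(line.getBytes(), tIdx1 + 1, tIdx2 - (tIdx1 + 1));
        String links = Text.decode(line.getBytes(), tIdx2 + 1, line.getLength() - (tIdx2 + 1));

        System.out.println(title + " | " + rank + " | " + links); // pageA | 0.25 | pageB,pageC
    }
}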
From source file:mvm.rya.indexing.accumulo.freetext.AccumuloFreeTextIndexer.java
License:Apache License
private static CloseableIteration<Statement, QueryEvaluationException> getIteratorWrapper(final Scanner s) {

    final Iterator<Entry<Key, Value>> i = s.iterator();

    return new CloseableIteration<Statement, QueryEvaluationException>() {
        @Override
        public boolean hasNext() {
            return i.hasNext();
        }

        @Override
        public Statement next() throws QueryEvaluationException {
            Entry<Key, Value> entry = i.next();
            Value v = entry.getValue();
            try {
                String dataString = Text.decode(v.get(), 0, v.getSize());
                Statement s = StatementSerializer.readStatement(dataString);
                return s;
            } catch (CharacterCodingException e) {
                logger.error("Error decoding value", e);
                throw new QueryEvaluationException(e);
            } catch (IOException e) {
                logger.error("Error deserializing statement", e);
                throw new QueryEvaluationException(e);
            }
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException("Remove not implemented");
        }

        @Override
        public void close() throws QueryEvaluationException {
            if (s != null) {
                s.close();
            }
        }
    };
}