List of usage examples for org.apache.hadoop.io Text getBytes
@Override public byte[] getBytes()
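Note that Text.getBytes() returns the Text object's backing array, which is usually longer than the valid data: only the bytes up to getLength() are meaningful. Every example below pairs the two calls. A minimal self-contained sketch of that contract (the class and variable names are ours, not from any example):

import org.apache.hadoop.io.Text;

public class GetBytesContract {
    public static void main(String[] args) throws Exception {
        Text text = new Text("hello world");
        byte[] raw = text.getBytes(); // backing buffer; may be longer than the valid data
        int len = text.getLength();   // number of valid bytes
        // decode only the valid UTF-8 slice, never the whole array
        String decoded = Text.decode(raw, 0, len);
        System.out.println(decoded);  // hello world
    }
}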
From source file:it.crs4.seal.common.CutText.java
License:Open Source License
public void loadRecord(Text record) throws FormatException {
    int pos = 0;     // the byte position within the record
    int fieldno = 0; // the field index within the record
    int colno = 0;   // the index within the list of requested fields (columns)
    try {
        // iterate over each field
        while (pos < record.getLength() && colno < columns.size()) {
            int endpos = record.find(delim, pos); // the field's end position
            if (endpos < 0)
                endpos = record.getLength();
            if (columns.get(colno) == fieldno) { // if we're at a requested field
                extractedFields[colno] = Text.decode(record.getBytes(), pos, endpos - pos);
                extractedFieldPositions[colno] = pos;
                colno += 1; // advance column
            }
            pos = endpos + 1; // the next starting position is the current end + 1
            fieldno += 1;
        }
    } catch (java.nio.charset.CharacterCodingException e) {
        throw new FormatException("character coding exception. Message: " + e.getMessage(), record);
    }

    if (colno < columns.size())
        throw new FormatException("Missing field(s) in record. Field " + colno + " (zero-based) not found.",
                record);
}
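A hedged usage sketch of CutText, inferred from this method and from the TextSamMapping constructor in the next example (the String delimiter, the varargs column indices, and getField/getFieldPos being indexed by position in the requested-column list are all assumptions based on extractedFields[colno] above, not a documented API):

CutText cutter = new CutText("\t", 0, 2); // request fields 0 and 2 of a tab-delimited record
cutter.loadRecord(new Text("alpha\tbeta\tgamma")); // may throw FormatException
String a = cutter.getField(0);   // "alpha" -- first requested column (field 0)
String g = cutter.getField(1);   // "gamma" -- second requested column (field 2)
int off = cutter.getFieldPos(1); // byte offset of field 2 within the record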
From source file:it.crs4.seal.common.TextSamMapping.java
License:Open Source License
public TextSamMapping(Text sam) throws FormatException {
    unparsedData = new Text();
    cutter = new CutText(Delim, 0, 1, 2, 3, 4, 5, 6, 7, 8); // all fields up to and including insert size

    try {
        cutter.loadRecord(sam);
        flag = Integer.parseInt(cutter.getField(1)); // set flag first so we can use the flag methods
        mapQ = Byte.parseByte(cutter.getField(4));
        if (isMapped())
            pos5 = Integer.parseInt(cutter.getField(3));
        if (isMateMapped())
            matePos5 = Integer.parseInt(cutter.getField(7));
        if (isMapped() && isMateMapped())
            insertSize = Integer.parseInt(cutter.getField(8));
    } catch (CutText.FormatException e) {
        throw new FormatException("sam formatting problem: " + e + ". Record: " + sam);
    } catch (NumberFormatException e) {
        throw new FormatException("sam formatting problem. Found text in place of a number. Record: " + sam);
    }

    int seqStart = cutter.getFieldPos(8) + cutter.getField(8).length() + 1;
    if (seqStart > sam.getLength())
        throw new FormatException("Incomplete SAM record -- missing fields. Record: " + sam);

    // copy the sequence and tag data to our internal buffer
    unparsedData.set(sam.getBytes(), seqStart, sam.getLength() - seqStart);

    // Find the end of the sequence field. Search for a Delim after the insert size field.
    int end = unparsedData.find(Delim);
    if (end < 0)
        throw new FormatException("Bad SAM format. Missing terminator for sequence field. SAM: " + sam);
    seqLen = end;

    // now repeat for the quality field
    qualityStart = end + 1;
    if (qualityStart > unparsedData.getLength())
        throw new FormatException("Incomplete SAM record -- missing quality field. Record: " + sam);
    end = unparsedData.find(Delim, qualityStart);
    if (end < 0)
        end = unparsedData.getLength();
    if (seqLen != end - qualityStart) {
        throw new FormatException("Length of sequence (" + seqLen + ") is different from length of quality string ("
                + (end - qualityStart) + "). Record: " + sam);
    }
    tagsStart = end + 1;
}
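Text.set(byte[], int, int) copies the given byte range into the Text, so unparsedData above holds its own copy of the tail of the SAM record. A small self-contained illustration of that slice-copy pattern (the record content is invented):

import org.apache.hadoop.io.Text;

public class SliceCopy {
    public static void main(String[] args) {
        Text whole = new Text("header\tpayload");
        int start = whole.find("\t") + 1; // first byte after the tab
        Text tail = new Text();
        // copy the bytes after the tab into an independent Text
        tail.set(whole.getBytes(), start, whole.getLength() - start);
        System.out.println(tail); // payload
    }
}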
From source file:it.crs4.seal.prq.PairReadsQSeqMapper.java
License:Open Source License
public void map(Text readId, SequencedFragment read, IMRContext<SequenceId, Text> context)
        throws IOException, InterruptedException {
    // build the key
    builder.delete(0, builder.length());

    // everything up to and including the index number goes in the location; the read number is on its own
    if (read.getRead() == null)
        throw new RuntimeException("Cannot get read number from read: " + readId);

    if (read.getLane() != null && read.getTile() != null && read.getXpos() != null && read.getYpos() != null) {
        appendIdToBuilder(builder, read); // appends the read id to the builder provided
        // finally the index field
        builder.append("#").append(read.getIndexSequence() == null ? '0' : read.getIndexSequence());
        sequenceKey.set(builder.toString(), read.getRead());
    } else {
        // maybe it's a fastq id with a trailing read number (/1 or /2)
        if (readId.getLength() > 2) {
            int last = readId.getLength() - 1;
            if (readId.charAt(last - 1) == '/') {
                // truncate the /[12] from the read id.
                // last == length - 1; we want length - 2 bytes, which is equal to last - 1
                sequenceKey.set(Text.decode(readId.getBytes(), 0, last - 1), read.getRead());
            } else
                throw new RuntimeException(
                        "Didn't find /read_number at end of the read id. Please use qseq files or fastq with illumina-formatted name tags.");
        } else
            throw new RuntimeException("Read id " + readId
                    + " is too short. Please use qseq files or fastq with illumina-formatted name tags.");
    }

    // then the tab-delimited value
    sequenceValue.clear();
    sequenceValue.append(read.getSequence().getBytes(), 0, read.getSequence().getLength());
    sequenceValue.append(Delim, 0, Delim.length);
    sequenceValue.append(read.getQuality().getBytes(), 0, read.getQuality().getLength());
    sequenceValue.append(Delim, 0, Delim.length);
    // the filter flag is optional; if it's absent we assume the read passes filtering
    sequenceValue.append(ZeroOne, (read.getFilterPassed() == null || read.getFilterPassed() ? 1 : 0), 1);

    context.write(sequenceKey, sequenceValue);
    context.progress();
}
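Text.append(byte[], int, int) lets the mapper build the tab-delimited value without intermediate String objects. A reduced sketch of the same pattern (the field contents and delimiter are invented):

import org.apache.hadoop.io.Text;

public class AppendValue {
    public static void main(String[] args) {
        byte[] delim = { '\t' };
        Text seq = new Text("ACGT");
        Text qual = new Text("IIII");

        Text value = new Text();
        value.append(seq.getBytes(), 0, seq.getLength());   // sequence field
        value.append(delim, 0, delim.length);
        value.append(qual.getBytes(), 0, qual.getLength()); // quality field
        System.out.println(value); // ACGT<tab>IIII
    }
}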
From source file:it.crs4.seal.prq.PairReadsQSeqReducer.java
License:Open Source License
public void reduce(SequenceId key, Iterable<Text> values, IMRContext<Text, ReadPair> context)
        throws IOException, InterruptedException {
    outputKey.set(key.getLocation());
    outputValue.clear();

    int nReads = 0;
    int nBadReads = 0;
    for (Text read : values) {
        ++nReads;
        if (nReads > nReadsPerTemplate)
            throw new RuntimeException("got more than " + nReadsPerTemplate + " reads for sequence key " + key
                    + ". Record: " + read);

        int[] fieldsPos = findFields(read);

        // Filtered read?
        // If dropFailedFilter is false it short-circuits the test and sets filterPassed directly to true.
        // If it's true, we check whether the field is equal to '1'.
        boolean filterPassed = !dropFailedFilter || read.getBytes()[fieldsPos[2]] == (byte) '1';
        if (!filterPassed) {
            context.increment(ReadCounters.FailedFilter, 1);
            ++nBadReads;
        } else if (!checkReadQuality(read, fieldsPos)) {
            context.increment(ReadCounters.NotEnoughBases, 1);
            ++nBadReads;
        }

        // Here we do all the work to prepare the read for output. It is written to the
        // appropriate WritableMapping, which in turn is inserted into the ReadPair outputValue.
        prepMapping(read.getBytes(), fieldsPos, nReads - 1);
    }

    if (nReads < nReadsPerTemplate) {
        context.increment(ReadCounters.Unpaired, nReads);
        String msg = String.format("Too few reads for template! (found %s). Key: %s", nReads, key);
        if (warnOnlyIfUnpaired)
            LOG.warn(msg);
        else
            throw new RuntimeException(msg + "\nread: " + outputValue.toString());
    }

    // nReads can't be > nReadsPerTemplate since that is caught in the loop above.
    // If the template is complete and not all its reads are bad, write it; unpaired reads are dropped.
    if (nReads == nReadsPerTemplate && nBadReads < nReads)
        context.write(outputKey, outputValue);
    else
        context.increment(ReadCounters.Dropped, nReads);

    context.progress();
}
From source file:it.crs4.seal.prq.PairReadsQSeqReducer.java
License:Open Source License
/**
 * Verify whether a read satisfies quality standards.
 * For now this method verifies whether the read has at least
 * minBasesThreshold known bases (ignoring unknown bases N).
 */
protected boolean checkReadQuality(Text read, int[] fieldsPos) {
    // The read's delimiter is at the byte before the second field starts.
    int readEnd = fieldsPos[1] - 1;
    // The condition is "min number of valid bases", but we check the inverse
    // condition "max number of unknowns" instead.
    // readEnd is also the length of the read fragment, so
    // readEnd - minBasesThreshold gives us the maximum acceptable number of unknowns.
    int nAcceptableUnknowns = readEnd - minBasesThreshold;
    if (nAcceptableUnknowns < 0) // the fragment is shorter than minBasesThreshold
        return false;

    int nUnknownBases = 0;
    byte[] data = read.getBytes(); // we can work directly in bytes as long as we only have ASCII characters
    for (int pos = 0; pos < readEnd; ++pos) {
        if (data[pos] == UnknownBase) {
            ++nUnknownBases;
            if (nUnknownBases > nAcceptableUnknowns)
                return false;
        }
    }
    return true;
}
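The same byte-level scan works for any ASCII test on a Text, provided the loop is bounded by getLength() rather than by the length of the array returned by getBytes(). A self-contained version of the counting idiom (class and method names are ours):

import org.apache.hadoop.io.Text;

public class CountBases {
    // count occurrences of an ASCII byte within the valid region of a Text
    static int count(Text t, byte target) {
        byte[] data = t.getBytes();
        int n = 0;
        for (int i = 0; i < t.getLength(); ++i) { // bound by getLength(), not data.length
            if (data[i] == target)
                ++n;
        }
        return n;
    }

    public static void main(String[] args) {
        Text read = new Text("ACGTNNAC");
        System.out.println(count(read, (byte) 'N')); // prints 2
    }
}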
From source file:it.uniroma1.hadoop.pagerank.job1.PageRankJob1Mapper.java
License:Open Source License
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    /* Job #1 mapper simply parses a line of the input graph, emitting key-value pairs.
     * The input format is the following (separator is TAB):
     *
     *     <nodeA>    <nodeB>
     *
     * which denotes an edge going from <nodeA> to <nodeB>.
     * We need to skip comment lines (denoted by a # character at the beginning of the line).
     * We also collect all the distinct nodes in our graph: this is needed to compute the initial
     * pagerank value in the Job #1 reducer and also in later jobs.
     */
    if (value.charAt(0) != '#') {
        int tabIndex = value.find("\t");
        String nodeA = Text.decode(value.getBytes(), 0, tabIndex);
        String nodeB = Text.decode(value.getBytes(), tabIndex + 1, value.getLength() - (tabIndex + 1));
        context.write(new Text(nodeA), new Text(nodeB));

        // add the current source node to the node list so we can
        // compute the total number of nodes of our graph in Job #2
        PageRank.NODES.add(nodeA);
        // also add the target node to the same list: we may have a target node
        // with no outlinks (so it will never be parsed as a source)
        PageRank.NODES.add(nodeB);
    }
}
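Text.decode(byte[], int, int) interprets a slice of the backing buffer as UTF-8, which is how the mapper splits a line without first materializing the whole String. The same split, distilled into a runnable sketch (the input line is made up):

import org.apache.hadoop.io.Text;

public class TabSplit {
    public static void main(String[] args) throws Exception {
        Text value = new Text("nodeA\tnodeB");
        int tabIndex = value.find("\t"); // byte offset of the tab, or -1 if absent
        String nodeA = Text.decode(value.getBytes(), 0, tabIndex);
        String nodeB = Text.decode(value.getBytes(), tabIndex + 1, value.getLength() - (tabIndex + 1));
        System.out.println(nodeA + " -> " + nodeB); // nodeA -> nodeB
    }
}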
From source file:it.uniroma1.hadoop.pagerank.job2.PageRankJob2Mapper.java
License:Open Source License
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    /* PageRank calculation algorithm (mapper).
     * Input file format (separator is TAB):
     *
     *     <title>    <page-rank>    <link1>,<link2>,<link3>,...,<linkN>
     *
     * The output has two kinds of records.
     * One record is composed of the collection of links of each page:
     *
     *     <title>    |<link1>,<link2>,<link3>,...,<linkN>
     *
     * The other is composed of the linked page, the page rank of the source page,
     * and the total number of out-links of the source page:
     *
     *     <link>    <page-rank>    <total-links>
     */
    int tIdx1 = value.find("\t");
    int tIdx2 = value.find("\t", tIdx1 + 1);

    // extract tokens from the current line
    String page = Text.decode(value.getBytes(), 0, tIdx1);
    String pageRank = Text.decode(value.getBytes(), tIdx1 + 1, tIdx2 - (tIdx1 + 1));
    String links = Text.decode(value.getBytes(), tIdx2 + 1, value.getLength() - (tIdx2 + 1));

    String[] allOtherPages = links.split(",");
    for (String otherPage : allOtherPages) {
        Text pageRankWithTotalLinks = new Text(pageRank + "\t" + allOtherPages.length);
        context.write(new Text(otherPage), pageRankWithTotalLinks);
    }

    // emit the original links so the reducer is able to produce the correct output
    context.write(new Text(page), new Text(PageRank.LINKS_SEPARATOR + links));
}
From source file:it.uniroma1.hadoop.pagerank.job3.PageRankJob3Mapper.java
License:Open Source License
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    /* Rank Ordering (mapper only).
     * Input file format (separator is TAB):
     *
     *     <title>    <page-rank>    <link1>,<link2>,<link3>,...,<linkN>
     *
     * This is a simple job which orders our documents according to the computed pagerank.
     * We map the pagerank (key) to its value (page) and Hadoop does the sorting on keys for us.
     * There is no need to implement a reducer: the mapping and sorting are enough for our purpose.
     */
    int tIdx1 = value.find("\t");
    int tIdx2 = value.find("\t", tIdx1 + 1);

    // extract tokens from the current line
    String page = Text.decode(value.getBytes(), 0, tIdx1);
    float pageRank = Float.parseFloat(Text.decode(value.getBytes(), tIdx1 + 1, tIdx2 - (tIdx1 + 1)));

    context.write(new DoubleWritable(pageRank), new Text(page));
}
From source file:mapred.io.CustomRecordReader.java
License:Apache License
private int skipUtfByteOrderMark() throws IOException {
    Text value = new Text();
    // Strip the BOM (Byte Order Mark).
    // Text only supports UTF-8, so we only need to check for the UTF-8 BOM
    // (0xEF,0xBB,0xBF) at the start of the text stream.
    int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength, Integer.MAX_VALUE);
    int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos));
    // Even though we read 3 extra bytes for the first line, we don't alter existing
    // behavior (no backwards-incompatibility issue), because newSize is less than
    // maxLineLength and the number of bytes copied to Text is always no more than newSize.
    // If the size returned from readLine is not less than maxLineLength,
    // we discard the current line and read the next line.
    pos += newSize;
    int textLength = value.getLength();
    byte[] textBytes = value.getBytes();
    if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
            && (textBytes[2] == (byte) 0xBF)) {
        // found the UTF-8 BOM; strip it
        LOG.info("Found UTF-8 BOM and skipped it");
        textLength -= 3;
        newSize -= 3;
        if (textLength > 0) {
            // It may work to use the same buffer and not do the copyBytes
            textBytes = value.copyBytes();
            value.set(textBytes, 3, textLength);
        } else {
            value.clear();
        }
    }
    return newSize;
}
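The BOM test itself needs only the first three valid bytes. Here it is in isolation (a minimal sketch with our own names, not the record reader's API):

import org.apache.hadoop.io.Text;

public class BomCheck {
    // true if the Text starts with the UTF-8 byte order mark 0xEF 0xBB 0xBF
    static boolean startsWithUtf8Bom(Text value) {
        byte[] b = value.getBytes();
        return value.getLength() >= 3
                && b[0] == (byte) 0xEF && b[1] == (byte) 0xBB && b[2] == (byte) 0xBF;
    }

    public static void main(String[] args) {
        byte[] bytes = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 'h', 'i' };
        Text withBom = new Text();
        withBom.set(bytes, 0, bytes.length);
        System.out.println(startsWithUtf8Bom(withBom)); // true
    }
}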
From source file:mr.MyFileRecordReader2.java
License:Apache License
private int skipUtfByteOrderMark(Text value) throws IOException {
    // Strip the BOM (Byte Order Mark).
    // Text only supports UTF-8, so we only need to check for the UTF-8 BOM
    // (0xEF,0xBB,0xBF) at the start of the text stream.
    int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength, Integer.MAX_VALUE);
    int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos));
    // Even though we read 3 extra bytes for the first line, we don't alter existing
    // behavior (no backwards-incompatibility issue), because newSize is less than
    // maxLineLength and the number of bytes copied to Text is always no more than newSize.
    // If the size returned from readLine is not less than maxLineLength,
    // we discard the current line and read the next line.
    pos += newSize;
    int textLength = value.getLength();
    byte[] textBytes = value.getBytes();
    if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
            && (textBytes[2] == (byte) 0xBF)) {
        // found the UTF-8 BOM; strip it
        LOG.info("Found UTF-8 BOM and skipped it");
        textLength -= 3;
        newSize -= 3;
        if (textLength > 0) {
            // It may work to use the same buffer and not do the copyBytes
            textBytes = value.copyBytes();
            value.set(textBytes, 3, textLength);
        } else {
            value.clear();
        }
    }
    return newSize;
}