List of usage examples for org.apache.hadoop.io Text getBytes
@Override public byte[] getBytes()
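Text.getBytes() returns the object's internal backing array, not a trimmed copy: only the first getLength() bytes are valid, and the array may retain stale bytes from an earlier, longer value. Several of the examples below account for this, and one notes it explicitly. A minimal sketch of the safe pattern, using only the Hadoop Text class (TextGetBytesDemo is an illustrative name):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.hadoop.io.Text;

public class TextGetBytesDemo {
    public static void main(String[] args) {
        Text text = new Text("a longer initial value");
        text.set("short"); // Text reuses its backing array when shrinking

        byte[] backing = text.getBytes();
        // backing.length is typically still 22 here, but getLength() == 5
        byte[] valid = Arrays.copyOf(backing, text.getLength());

        System.out.println(new String(valid, StandardCharsets.UTF_8)); // prints "short"
    }
}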
From source file:gaffer.accumulo.utils.IngestUtils.java
License:Apache License
/**
 * Get the existing splits from a table in Accumulo and write a splits file.
 * The number of splits is returned.
 *
 * @param conn An existing connection to an Accumulo instance
 * @param table The table name
 * @param fs The FileSystem in which to create the splits file
 * @param splitsFile A path for the splits file
 * @return The number of splits in the table
 * @throws TableNotFoundException
 * @throws IOException
 */
public static int createSplitsFile(Connector conn, String table, FileSystem fs, Path splitsFile)
        throws TableNotFoundException, IOException {
    // Get the splits from the table
    Collection<Text> splits = conn.tableOperations().getSplits(table);
    // Write the splits to file
    if (splits.isEmpty()) {
        return 0;
    }
    PrintStream out = new PrintStream(new BufferedOutputStream(fs.create(splitsFile, true)));
    for (Text split : splits) {
        out.println(new String(Base64.encodeBase64(split.getBytes())));
    }
    out.close();
    return splits.size();
}
From source file:gaffer.accumulo.utils.IngestUtils.java
License:Apache License
/**
 * Given some split points, write a Base64 encoded splits file.
 *
 * @param splits The split points
 * @param fs The FileSystem in which to create the splits file
 * @param splitsFile The location of the output splits file
 * @throws IOException
 */
public static void writeSplitsFile(Collection<Text> splits, FileSystem fs, Path splitsFile) throws IOException {
    PrintStream out = null;
    try {
        out = new PrintStream(new BufferedOutputStream(fs.create(splitsFile, true)));
        for (Text split : splits) {
            out.println(new String(Base64.encodeBase64(split.getBytes())));
        }
    } finally {
        IOUtils.closeStream(out);
    }
}
From source file:gaffer.accumulostore.utils.IngestUtils.java
License:Apache License
/**
 * Get the existing splits from a table in Accumulo and write a splits file.
 * The number of splits is returned.
 *
 * @param conn - An existing connection to an Accumulo instance
 * @param table - The table name
 * @param fs - The FileSystem in which to create the splits file
 * @param splitsFile - A Path for the output splits file
 * @param maxSplits - The maximum number of splits
 * @return The number of splits in the table
 * @throws IOException for any IO issues reading from the file system. Other accumulo
 *         exceptions are caught and wrapped in an IOException.
 */
public static int createSplitsFile(final Connector conn, final String table, final FileSystem fs,
        final Path splitsFile, final int maxSplits) throws IOException {
    LOGGER.info("Creating splits file in location {} from table {} with maximum splits {}",
            splitsFile, table, maxSplits);
    // Get the splits from the table
    Collection<Text> splits;
    try {
        splits = conn.tableOperations().listSplits(table, maxSplits);
    } catch (TableNotFoundException | AccumuloSecurityException | AccumuloException e) {
        throw new IOException(e.getMessage(), e);
    }
    // This should have returned at most maxSplits splits, but this is not implemented
    // properly in MockInstance.
    if (splits.size() > maxSplits) {
        if (conn instanceof MockConnector) {
            LOGGER.info("Manually reducing the number of splits to {} due to MockInstance not implementing"
                    + " listSplits(table, maxSplits) properly", maxSplits);
        } else {
            LOGGER.info("Manually reducing the number of splits to {} (number of splits was {})",
                    maxSplits, splits.size());
        }
        final Collection<Text> filteredSplits = new TreeSet<>();
        final int outputEveryNth = splits.size() / maxSplits;
        LOGGER.info("Outputting every {}-th split from {} total", outputEveryNth, splits.size());
        int i = 0;
        for (final Text text : splits) {
            if (i % outputEveryNth == 0) {
                filteredSplits.add(text);
            }
            i++;
            if (filteredSplits.size() >= maxSplits) {
                break;
            }
        }
        splits = filteredSplits;
    }
    LOGGER.info("Found {} splits from table {}", splits.size(), table);
    try (final PrintStream out = new PrintStream(new BufferedOutputStream(fs.create(splitsFile, true)),
            false, CommonConstants.UTF_8)) {
        // Write the splits to file
        if (splits.isEmpty()) {
            out.close();
            return 0;
        }
        for (final Text split : splits) {
            out.println(new String(Base64.encodeBase64(split.getBytes()), CommonConstants.UTF_8));
        }
    }
    return splits.size();
}
From source file:gaffer.accumulostore.utils.IngestUtils.java
License:Apache License
/**
 * Given some split points, write a Base64 encoded splits file.
 *
 * @param splits - A Collection of splits
 * @param fs - The FileSystem in which to create the splits file
 * @param splitsFile - A Path for the output splits file
 * @throws IOException for any IO issues writing to the file system.
 */
public static void writeSplitsFile(final Collection<Text> splits, final FileSystem fs, final Path splitsFile)
        throws IOException {
    try (final PrintStream out = new PrintStream(new BufferedOutputStream(fs.create(splitsFile, true)),
            false, CommonConstants.UTF_8)) {
        for (final Text split : splits) {
            out.println(new String(Base64.encodeBase64(split.getBytes()), CommonConstants.UTF_8));
        }
    }
}
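For context, the inverse of the two write methods above can be sketched as follows. This is an illustrative helper (readSplitsFile is a hypothetical name, not part of Gaffer), assuming the same commons-codec Base64, Hadoop FileSystem, and Text classes:

// Assumed imports: java.io.*, java.nio.charset.StandardCharsets, java.util.*,
// org.apache.commons.codec.binary.Base64, org.apache.hadoop.fs.*, org.apache.hadoop.io.Text
public static SortedSet<Text> readSplitsFile(final FileSystem fs, final Path splitsFile) throws IOException {
    final SortedSet<Text> splits = new TreeSet<>();
    try (final BufferedReader reader = new BufferedReader(
            new InputStreamReader(fs.open(splitsFile), StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            // decodeBase64 reverses the encodeBase64 call used when the file was written
            splits.add(new Text(Base64.decodeBase64(line.getBytes(StandardCharsets.UTF_8))));
        }
    }
    return splits;
}

Note that the writers above pass split.getBytes() directly to Base64.encodeBase64, which encodes the whole backing array; this is only correct if each Text is exactly sized. A defensive variant would encode Arrays.copyOf(split.getBytes(), split.getLength()) instead.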
From source file:gov.jgi.meta.hadoop.input.FastaBlockLineReader.java
License:Open Source License
public int readLine(Text key, Map<String, String> set, int maxLineLength, long maxBytesToConsume)
        throws IOException {
    int totalBytesRead = 0;
    int numRecordsRead = 0;
    boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /* first thing to do is to move forward till you see a start character */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '>');

    /* if we hit the end of file already, then just return 0 bytes processed */
    if (eof)
        return totalBytesRead;

    /* now bufferPosn should be at the start of a fasta record */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a ">"

    /* find the next record start */
    eof = false;
    do {
        if (bufferPosn >= bufferLength) {
            /* copy the current buffer before refreshing the buffer */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '>' || (totalBytesRead + bufferPosn - startPosn) <= maxBytesToConsume);

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /* record block now has the byte array we want to process for reads */
    Text k = new Text();
    Text s = new Text();
    int i = 1; // skip initial record separator ">"
    int j = 1;
    do {
        k.clear();
        s.clear();
        /* first parse the key */
        i = j;
        boolean junkOnLine = false;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                break;
            } else if (c == ' ' || c == '\t') {
                junkOnLine = true;
                break;
            }
        }
        k.append(recordBlock.getBytes(), i, j - i - 1);
        /* in case there is additional metadata on the header line, ignore everything after the first word */
        if (junkOnLine) {
            while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }
        /* now skip the newlines */
        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;
        /* now read the sequence */
        do {
            i = j;
            while (j < recordBlock.getLength()) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    break;
                }
            }
            s.append(recordBlock.getBytes(), i, j - i - 1);
            set.put(k.toString(), s.toString().toLowerCase());
            while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;
        } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '>');
        numRecordsRead++;
        /* now skip characters (newline or carriage return most likely) till record start */
        while (j < recordBlock.getLength() && recordBlock.charAt(j) != '>') {
            j++;
        }
        j++; // skip the ">"
    } while (j < recordBlock.getLength());

    return totalBytesRead;
}
From source file:gov.jgi.meta.hadoop.input.FastqBlockLineReader.java
License:Open Source License
public int readLine(Text key, Map<String, String> set, int maxLineLength, int maxBytesToConsume)
        throws IOException {
    int totalBytesRead = 0;
    int numRecordsRead = 0;
    boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /* first thing to do is to move forward till you see a start character */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '@');

    /* if we hit the end of file already, then just return 0 bytes processed */
    if (eof)
        return totalBytesRead;

    /* now bufferPosn should be at the start of a fastq record */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@"

    /* find the next record start */
    eof = false;
    do {
        if (bufferPosn >= bufferLength) {
            /* copy the current buffer before refreshing the buffer */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '@' || (totalBytesRead + bufferPosn - startPosn) < maxBytesToConsume);

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /* record block now has the byte array we want to process for reads */
    Text k = new Text();
    Text s = new Text();
    int i = 1; // skip initial record separator "@"
    int j = 1;
    do {
        k.clear();
        s.clear();
        /* first parse the key */
        i = j;
        boolean junkOnLine = false;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                break;
            } else if (c == ' ' || c == '\t') {
                junkOnLine = true;
                break;
            }
        }
        k.append(recordBlock.getBytes(), i, j - i - 1);
        /* in case there is additional metadata on the header line, ignore everything after the first word */
        if (junkOnLine) {
            while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }
        /* now skip the newlines */
        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;
        /* now read the sequence */
        do {
            i = j;
            while (j < recordBlock.getLength()) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    break;
                }
            }
            s.append(recordBlock.getBytes(), i, j - i - 1);
            set.put(k.toString(), s.toString().toLowerCase());
            while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;
        } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+');
        numRecordsRead++;
        /* now skip characters (newline or carriage return most likely) till record start */
        while (j < recordBlock.getLength() && recordBlock.charAt(j) != '@') {
            j++;
        }
        j++; // skip the "@"
    } while (j < recordBlock.getLength());

    return totalBytesRead;
}
From source file:gov.jgi.meta.hadoop.input.FastqLineReader.java
License:Open Source License
/**
 * Read one line from the InputStream into the given Text. A line
 * can be terminated by one of the following: '\n' (LF), '\r' (CR),
 * or '\r\n' (CR+LF). EOF also terminates an otherwise unterminated
 * line.
 *
 * @param str the object to store the given line (without newline)
 * @param maxLineLength the maximum number of bytes to store into str;
 *   the rest of the line is silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume
 *   in this call. This is only a hint, because if the line crosses
 *   this threshold, we allow it to happen. It can overshoot
 *   potentially by as much as one buffer length.
 * @return the number of bytes read including the (longest) newline
 *   found.
 * @throws java.io.IOException if the underlying stream throws
 */
public int readLine(Text key, Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    int totalBytesRead = 0;
    int numRecordsRead = 0;
    boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /* first thing to do is to move forward till you see a start character */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '@');

    /* if we hit the end of file already, then just return 0 bytes processed */
    if (eof)
        return totalBytesRead;

    /* now bufferPosn should be at the start of a fastq record */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@"

    /* find the next record start */
    eof = false;
    int numOfNewlines = 0; // Added by lanhin
    do {
        if (bufferPosn >= bufferLength) {
            /* copy the current buffer before refreshing the buffer */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
        // Modified by lanhin
        if (buffer[bufferPosn] == CR || buffer[bufferPosn] == LF) {
            numOfNewlines++;
        }
        if ((numOfNewlines >= 4) && buffer[bufferPosn] == '@') {
            bufferPosn++;
            break;
        }
        bufferPosn++;
    } while (true); // was: buffer[bufferPosn++] != '@' -- only read one record at a time
    // Modified by lanhin end

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /* record block now has the byte array we want to process for reads */
    int i = 1; // skip initial record separator "@"
    int j = 1;
    do {
        key.clear();
        str.clear();
        /* first parse the key */
        i = j;
        boolean junkOnLine = false;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                break;
            } else if (c == ' ' || c == '\t') {
                junkOnLine = true;
                break;
            }
        }
        key.append(recordBlock.getBytes(), i, j - i - 1);
        /* in case there is additional metadata on the header line, ignore everything after the first word */
        if (junkOnLine) {
            while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }
        /* now skip the newlines */
        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;
        /* now read the sequence */
        do {
            i = j;
            while (j < recordBlock.getLength()) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    break;
                }
            }
            str.append(recordBlock.getBytes(), i, j - i - 1);
            while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;
        } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+');
        numRecordsRead++;
        /* go straight to the end of recordBlock, ignoring everything that is left. --lanhin */
        while (j < recordBlock.getLength()) {
            j++;
        }
        j++; // skip the "@"
    } while (j < recordBlock.getLength());

    return totalBytesRead;
}
From source file:gov.jgi.meta.pig.storage.FastaStorage.java
License:Open Source License
/**
 * returns the next sequence from the block
 */
@Override
public Tuple getNext() throws IOException {
    if (mProtoTuple == null) {
        mProtoTuple = new ArrayList<Object>();
    }
    try {
        boolean notDone = in.nextKeyValue();
        if (!notDone) {
            return (null);
        }
        /* check the id of the sequence to see if it's a paired read */
        String seqid = (in.getCurrentKey()).toString();
        String seqkey = null;
        String seqkey2;
        String header = "";
        String direction;
        for (int i = 0; i < seqid.length(); i++) {
            if (seqid.charAt(i) == ' ' || seqid.charAt(i) == '\t') {
                seqkey = seqid.substring(0, i);
                header = seqid.substring(i, seqid.length());
                break;
            }
        }
        if (seqkey == null)
            seqkey = seqid;
        if (seqkey.indexOf("/") >= 0) {
            String[] a = seqkey.split("/");
            seqkey2 = a[0];
            direction = a[1];
        } else {
            seqkey2 = seqkey;
            direction = "0";
        }
        Text value = ((Text) in.getCurrentValue());
        mProtoTuple.add(new DataByteArray(seqkey2.getBytes(), 0, seqkey2.length())); // add key
        mProtoTuple.add(new DataByteArray(direction.getBytes(), 0, direction.length())); // add direction
        mProtoTuple.add(new DataByteArray(value.getBytes(), 0, value.getLength())); // add sequence
        mProtoTuple.add(new DataByteArray(header.getBytes(), 0, header.length())); // add header
        Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
        mProtoTuple = null;
        return (t);
    } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }
}
From source file:gov.jgi.meta.sequence.SequenceString.java
License:Open Source License
public static String byteArrayToSequence(Text seq) {
    init();
    StringBuffer sb = new StringBuffer();
    byte[] ba = seq.getBytes();
    // iterate only up to getLength(): the backing array may be longer than the valid data
    for (int i = 0; i < seq.getLength(); i++) {
        sb.append(reverseHash.get(ba[i]));
    }
    return sb.toString();
}
From source file:gov.jgi.meta.sequence.SequenceStringCompress.java
License:Open Source License
/**
 * Third version: different input type.
 * Text.bytes.length is not always the right length to use, so trim the
 * backing array to getLength() before converting.
 */
public static String byteArrayToSequence(Text seq) {
    // copy only the valid bytes; getBytes() may return a larger backing array
    byte[] ba = Arrays.copyOf(seq.getBytes(), seq.getLength());
    return byteArrayToSequence(ba);
}