Usage examples for org.apache.hadoop.io.Text#set
The snippets below exercise several overloads of this method: public void set(String string), public void set(byte[] utf8, int start, int len), and public void set(Text other).
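Before the per-project snippets, here is a minimal, self-contained sketch of those overloads (the class name TextSetDemo and the sample strings are illustrative only, not drawn from any of the projects listed below):

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

public class TextSetDemo {
    public static void main(String[] args) {
        Text key = new Text();

        // set(String): replace the contents with the UTF-8 encoding of a string
        key.set("row-0001");

        // set(byte[], int, int): copy a slice of a byte array
        byte[] raw = "prefix:payload".getBytes(StandardCharsets.UTF_8);
        key.set(raw, 7, raw.length - 7); // key now holds "payload"

        // set(Text): copy the contents of another Text instance
        Text copy = new Text();
        copy.set(key);

        System.out.println(copy); // prints "payload"
    }
}

Each overload copies the supplied bytes into the Text object's internal buffer, which is why the examples below can safely reuse a single key or value instance across successive writer.append calls.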
From source file:io.covert.binary.analysis.BuildSequenceFile.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    File inDir = new File(args[0]);
    Path name = new Path(args[1]);
    Text key = new Text();
    BytesWritable val = new BytesWritable();

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, name, Text.class, BytesWritable.class,
            CompressionType.RECORD);

    for (File file : inDir.listFiles()) {
        if (!file.isFile()) {
            System.out.println("Skipping " + file + " (not a file) ...");
            continue;
        }

        FileInputStream fileIn = new FileInputStream(file);
        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream((int) file.length());
        int b;
        while (-1 != (b = fileIn.read())) {
            bytesOut.write(b);
        }
        fileIn.close();
        bytesOut.close();
        byte[] bytes = bytesOut.toByteArray();

        val.set(bytes, 0, bytes.length);
        key.set(file.getName());
        writer.append(key, val);
    }
    writer.close();
    return 0;
}
From source file:io.covert.binary.analysis.BuildSequenceFileFromTarball.java
License:Apache License
public void load(FileSystem fs, Configuration conf, File inputTarball, Path outputDir) throws Exception {
    Text key = new Text();
    BytesWritable val = new BytesWritable();

    Path sequenceName = new Path(outputDir, inputTarball.getName() + ".seq");
    System.out.println("Writing to " + sequenceName);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, sequenceName, Text.class,
            BytesWritable.class, CompressionType.RECORD);

    InputStream is = new FileInputStream(inputTarball);
    if (inputTarball.toString().toLowerCase().endsWith(".gz")) {
        is = new GZIPInputStream(is);
    } else if (inputTarball.toString().toLowerCase().endsWith(".bz")
            || inputTarball.toString().endsWith(".bz2")) {
        is.read(); // read 'B'
        is.read(); // read 'Z'
        is = new CBZip2InputStream(is);
    }

    final TarArchiveInputStream debInputStream = (TarArchiveInputStream) new ArchiveStreamFactory()
            .createArchiveInputStream("tar", is);
    TarArchiveEntry entry = null;
    while ((entry = (TarArchiveEntry) debInputStream.getNextEntry()) != null) {
        if (!entry.isDirectory()) {
            try {
                final ByteArrayOutputStream outputFileStream = new ByteArrayOutputStream();
                IOUtils.copy(debInputStream, outputFileStream);
                outputFileStream.close();
                byte[] outputFile = outputFileStream.toByteArray();

                val.set(outputFile, 0, outputFile.length);

                MessageDigest md = MessageDigest.getInstance("MD5");
                md.update(outputFile);
                byte[] digest = md.digest();
                String hexdigest = "";
                for (int i = 0; i < digest.length; i++) {
                    hexdigest += Integer.toString((digest[i] & 0xff) + 0x100, 16).substring(1);
                }
                key.set(hexdigest);
                writer.append(key, val);
            } catch (IOException e) {
                // Truncated tarball
                System.err.println("Warning: tarball may be truncated: " + inputTarball);
                break;
            }
        }
    }
    debInputStream.close();
    writer.close();
}
From source file:io.covert.binary.analysis.BuildTarBzSequenceFile.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    File inDir = new File(args[0]);
    Path name = new Path(args[1]);
    Text key = new Text();
    BytesWritable val = new BytesWritable();

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    if (!fs.exists(name)) {
        fs.mkdirs(name);
    }

    for (File file : inDir.listFiles()) {
        Path sequenceName = new Path(name, file.getName() + ".seq");
        System.out.println("Writing to " + sequenceName);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, sequenceName, Text.class,
                BytesWritable.class, CompressionType.RECORD);

        if (!file.isFile()) {
            System.out.println("Skipping " + file + " (not a file) ...");
            continue;
        }

        final InputStream is = new FileInputStream(file);
        final TarArchiveInputStream debInputStream = (TarArchiveInputStream) new ArchiveStreamFactory()
                .createArchiveInputStream("tar", is);
        TarArchiveEntry entry = null;
        while ((entry = (TarArchiveEntry) debInputStream.getNextEntry()) != null) {
            if (!entry.isDirectory()) {
                final ByteArrayOutputStream outputFileStream = new ByteArrayOutputStream();
                IOUtils.copy(debInputStream, outputFileStream);
                outputFileStream.close();
                byte[] outputFile = outputFileStream.toByteArray();

                val.set(outputFile, 0, outputFile.length);

                MessageDigest md = MessageDigest.getInstance("MD5");
                md.update(outputFile);
                byte[] digest = md.digest();
                String hexdigest = "";
                for (int i = 0; i < digest.length; i++) {
                    hexdigest += Integer.toString((digest[i] & 0xff) + 0x100, 16).substring(1);
                }
                key.set(hexdigest);
                writer.append(key, val);
            }
        }
        debInputStream.close();
        writer.close();
    }
    return 0;
}
From source file:io.github.thammegowda.Local2SeqFile.java
License:Apache License
private void writeOutput(RemoteIterator<? extends FileStatus> input) throws IOException {
    Path outPath = new Path(output);
    if (distribFs.exists(outPath)) {
        throw new IllegalArgumentException("Output file already exists, Not overwriting it:" + output);
    }

    Writer writer = SequenceFile.createWriter(distribFs.getConf(), Writer.file(outPath),
            Writer.keyClass(Text.class), Writer.valueClass(BytesWritable.class),
            Writer.compression(SequenceFile.CompressionType.RECORD));

    Text key = new Text();
    BytesWritable value = new BytesWritable();
    long skipped = 0;
    long copied = 0;
    while (input.hasNext()) {
        FileStatus next = input.next();
        if (filter(next)) {
            key.set(next.getPath().toString());
            FSDataInputStream stream = localFs.open(next.getPath());
            // CAUTION: this could cause memory overflow
            byte[] bytes = IOUtils.toByteArray(stream);
            value.set(bytes, 0, bytes.length);
            writer.append(key, value);
            copied++;
        } else {
            skipped++;
        }
    }
    writer.close();
    System.out.println("Files copied ::" + copied);
    System.out.println("Files skipped ::" + skipped);
}
From source file:io.prestosql.plugin.accumulo.io.AccumuloRecordCursor.java
License:Apache License
public AccumuloRecordCursor(AccumuloRowSerializer serializer, BatchScanner scanner, String rowIdName,
        List<AccumuloColumnHandle> columnHandles, List<AccumuloColumnConstraint> constraints) {
    this.columnHandles = requireNonNull(columnHandles, "columnHandles is null");
    this.scanner = requireNonNull(scanner, "scanner is null");
    this.serializer = requireNonNull(serializer, "serializer is null");
    this.serializer.setRowIdName(requireNonNull(rowIdName, "rowIdName is null"));

    requireNonNull(columnHandles, "columnHandles is null");
    requireNonNull(constraints, "constraints is null");

    if (retrieveOnlyRowIds(rowIdName)) {
        this.scanner.addScanIterator(new IteratorSetting(1, "firstentryiter", FirstEntryInRowIterator.class));

        fieldToColumnName = new String[1];
        fieldToColumnName[0] = rowIdName;

        // Set a flag on the serializer saying we are only going to be retrieving the row ID
        this.serializer.setRowOnly(true);
    }
    else {
        // Else, we will be scanning some more columns here
        this.serializer.setRowOnly(false);

        // Fetch the reserved row ID column
        this.scanner.fetchColumn(ROW_ID_COLUMN, ROW_ID_COLUMN);

        Text family = new Text();
        Text qualifier = new Text();

        // Create an array which maps the column ordinal to the name of the column
        fieldToColumnName = new String[columnHandles.size()];
        for (int i = 0; i < columnHandles.size(); ++i) {
            AccumuloColumnHandle columnHandle = columnHandles.get(i);
            fieldToColumnName[i] = columnHandle.getName();

            // Make sure to skip the row ID!
            if (!columnHandle.getName().equals(rowIdName)) {
                // Set the mapping of presto column name to the family/qualifier
                this.serializer.setMapping(columnHandle.getName(), columnHandle.getFamily().get(),
                        columnHandle.getQualifier().get());

                // Set our scanner to fetch this family/qualifier column
                // This will help us prune which data we receive from Accumulo
                family.set(columnHandle.getFamily().get());
                qualifier.set(columnHandle.getQualifier().get());
                this.scanner.fetchColumn(family, qualifier);
            }
        }
    }

    IteratorSetting setting = new IteratorSetting(WHOLE_ROW_ITERATOR_PRIORITY, WholeRowIterator.class);
    scanner.addScanIterator(setting);
    iterator = this.scanner.iterator();
}
From source file:io.prestosql.plugin.accumulo.serializers.StringRowSerializer.java
License:Apache License
@Override
public void setDate(Text text, Date value) {
    text.set(Long.toString(MILLISECONDS.toDays(value.getTime())).getBytes(UTF_8));
}
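The value written here is the number of days since the Unix epoch rendered as a decimal string. A minimal read-side sketch (class and method names are hypothetical, not part of the Presto source above) that inverts this encoding:

import java.sql.Date;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.io.Text;

public class DateTextDecodeSketch {
    // Inverse of the setDate encoding above: the Text holds days-since-epoch as a decimal string.
    static Date decodeDate(Text text) {
        long days = Long.parseLong(text.toString());
        return new Date(TimeUnit.DAYS.toMillis(days));
    }

    public static void main(String[] args) {
        Text text = new Text();
        text.set("19000"); // roughly 2022-01-08 in days since 1970-01-01
        System.out.println(decodeDate(text));
    }
}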
From source file:io.sanfran.wikiTrends.extraction.hadoop.FileNameLineRecordReader.java
License:Open Source License
/** Read a line. */
public synchronized boolean next(Text key, Text value) throws IOException {
    try {
        // We always read one extra line, which lies outside the upper
        // split limit i.e. (end - 1)
        while (getFilePosition() <= end) {
            key.set(fileName);

            int newSize = in.readLine(value, maxLineLength, Math.max(maxBytesToConsume(pos), maxLineLength));
            if (newSize == 0) {
                return false;
            }
            pos += newSize;
            if (newSize < maxLineLength) {
                return true;
            }

            // line too long. try again
            LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
        }
    } catch (EOFException corruptFileException) {
        LOG.warn("corrupt file: " + fileName);
    }
    return false;
}
From source file:it.uniroma1.bdc.piccioli.tesi.TriangleCounting.java
License:Apache License
@Override
public void compute(Vertex<Text, Text, NullWritable> vertex, Iterable<Text> messages) throws IOException {
    if (getSuperstep() == 0) {
        // Build a Text containing the list of neighbors
        Text neigborhood = new Text();
        Iterable<Edge<Text, NullWritable>> edges = vertex.getEdges();
        for (Edge<Text, NullWritable> edge : edges) {
            neigborhood.set(neigborhood.toString() + "-" + edge.getTargetVertexId().toString());
        }

        for (Edge<Text, NullWritable> edge : edges) {
            this.sendMessage(edge.getTargetVertexId(), neigborhood);
        }
    } else if (getSuperstep() == 1) {
        Double T = 0.0;

        // Compare the "missing" edges received in messages with this vertex's neighbor list
        for (Text message : messages) {
            String[] msgSplit = message.toString().split("-"); // neighborhood list
            Iterable<Edge<Text, NullWritable>> edges = vertex.getEdges();
            for (Edge<Text, NullWritable> edge : edges) {
                for (String missEdge : msgSplit) {
                    if (missEdge.equals(edge.getTargetVertexId().toString())) {
                        T++;
                    }
                }
            }
        }
        T = T / 6;

        // Send the aggregate value
        aggregate(SOMMA, new DoubleWritable(T));

        vertex.setValue(new Text(T.toString()));
        vertex.voteToHalt();
    }
}
From source file:it.uniroma1.bdc.tesi.piccioli.giraphstandalone.trash.TriangleCount_OLD.java
License:Apache License
@Override
public void compute(Vertex<Text, Text, NullWritable> vertex, Iterable<Text> messages) throws IOException {
    if (getSuperstep() == 0) {
        // Build a Text containing the list of neighbors
        Text neigborhood = new Text();
        Iterable<Edge<Text, NullWritable>> edges = vertex.getEdges();
        for (Edge<Text, NullWritable> edge : edges) {
            neigborhood.set(neigborhood.toString() + "-" + edge.getTargetVertexId().toString());
        }

        for (Edge<Text, NullWritable> edge : edges) {
            this.sendMessage(edge.getTargetVertexId(), neigborhood);
        }
    } else if (getSuperstep() == 1) {
        Double T = 0.0;
        Iterable<Edge<Text, NullWritable>> edges = vertex.getEdges();

        // Compare the "missing" edges received in messages with this vertex's neighbor list
        for (Text message : messages) {
            String[] msgSplit = message.toString().split("-"); // neighborhood list
            for (Edge<Text, NullWritable> edge : edges) {
                for (String missEdge : msgSplit) {
                    if (missEdge.equals(edge.getTargetVertexId().toString())) {
                        T++;
                    }
                }
            }
        }
        T = T / 6;

        vertex.setValue(new Text(T.toString()));
        vertex.voteToHalt();

        aggregate(SOMMA, new DoubleWritable(T));
        getAggregatedValue(SOMMA);
        System.out.println("DEBUG " + getAggregatedValue(SOMMA));
    }
}
From source file:jadoop.HadoopGridJob.java
License:Open Source License
/**
 * Create a tasks.seq sequence file in the input directory for each task.
 * This file contains the key and command that defines the map task. The key
 * is the key that was associated with the task when it was added to the
 * job. The value is a TextArrayWritable object with the following contents:
 * <UL>
 * <LI>true/false - indicating if standard output is to be captured.
 * <LI>true/false - indicating if standard error is to be captured.
 * <LI>cmd - the command to be run in the mapper task.
 * <LI>... - any successive elements contain an argument to the cmd.
 * </UL>
 *
 * @see HadoopGridTaskRunner
 *
 * @param hdfsInputDir
 *            the input directory on the HDFS where the tasks.seq file is to
 *            be created.
 *
 * @throws IOException
 *             if there is a problem creating the tasks.seq file.
 */
private void writeTasksSequenceFiles(Path hdfsInputDir) throws IOException {
    /*
     * Seems as if we should be able to just write one task file with
     * multiple key/value pairs in it. However, hadoop did not seem to want
     * to send each entry to a different node. Rather one node processed
     * many of the tasks. It seems as if this could be fixed by defining how
     * hadoop is to split up the sequence file, but we were unable to get
     * that to work. Writing a different task file for each task is a bit of
     * a hack solution, but it works. Each task is then run on a different
     * node, as desired.
     */
    Text mapperKey = new Text();
    TextArrayWritable mapperVal = new TextArrayWritable();

    // for each task in the job...
    int index = 0;
    for (HadoopGridTask hgt : taskMap.values()) {
        Path seqFileInDirPath = new Path(hdfsInputDir.toString() + "/tasks" + index + ".seq");
        SequenceFile.Writer writer = SequenceFile.createWriter(job.getConfiguration(),
                Writer.file(seqFileInDirPath), Writer.keyClass(Text.class),
                Writer.valueClass(TextArrayWritable.class));

        String taskKey = hgt.getKey();
        String[] taskVal = hgt.getCommand();

        // set the key for the sequence file entry for this task
        mapperKey.set(taskKey);

        /*
         * Build an array of Writable holding the flags that indicate if
         * standard output/error are to be captured, the timeout and the
         * command and its arguments.
         */
        Writable[] vals = new Writable[taskVal.length + 3];

        // put the flags in the array.
        vals[0] = new Text(String.valueOf(hgt.captureStandardOutput()));
        vals[1] = new Text(String.valueOf(hgt.captureStandardError()));
        vals[2] = new Text(String.valueOf(hgt.getTimeout()));

        // put the command and its arguments into the array.
        for (int i = 3; i < taskVal.length + 3; i++) {
            vals[i] = new Text(taskVal[i - 3]);
        }

        /*
         * Set the value for the sequence file entry for this task to be the array.
         */
        mapperVal.set(vals);

        writer.append(mapperKey, mapperVal);
        writer.close();

        index++;
    }
}