List of usage examples for org.apache.hadoop.io.Text.clear()
public void clear()
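Before the examples, here is a minimal, self-contained sketch (not taken from any of the source files below) that illustrates what clear() does: it resets the logical length of the Text to zero but keeps the backing byte array, so the same object can be reused across records without reallocation. Note that getBytes() may still expose stale bytes past getLength() after a clear.

import org.apache.hadoop.io.Text;

public class TextClearDemo {
  public static void main(String[] args) {
    Text t = new Text("hello world");
    System.out.println(t.getLength()); // 11
    t.clear();                         // Length becomes 0; the buffer is kept for reuse
    System.out.println(t.getLength()); // 0
    t.append("hi".getBytes(), 0, 2);   // Reuses the existing buffer
    System.out.println(t);             // Prints "hi"
  }
}

This reuse pattern is why the examples below call clear() at the top of each loop iteration before refilling the same Text object with the next record.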
From source file: edu.umn.cs.spatialHadoop.mapred.SpatialRecordReader.java
License: Open Source License
/**
 * Reads the next line from input and returns true if a line was read.
 * If no more lines are available in this split, false is returned.
 * @param value The text object to fill with the next line
 * @return <code>true</code> if a line was read; <code>false</code> otherwise.
 * @throws IOException If an error occurs while reading from disk.
 */
protected boolean nextLine(Text value) throws IOException {
  if (blockType == BlockType.RTREE && pos == 8) {
    // File is positioned at the RTree header.
    // Skip the header and go to the first data object in the file.
    pos += RTree.skipHeader(in);
    LOG.info("Skipped R-tree to position: " + pos);
    // Reinitialize the record reader at the new position
    lineReader = new LineReader(in);
  }
  while (getFilePosition() <= end) {
    value.clear();
    int b = 0;
    if (buffer != null) {
      // Read the first line encountered in the buffer
      int eol = RTree.skipToEOL(buffer, 0);
      b += eol;
      value.append(buffer, 0, eol);
      if (eol < buffer.length) {
        // There are still some bytes remaining in the buffer
        byte[] tmp = new byte[buffer.length - eol];
        System.arraycopy(buffer, eol, tmp, 0, tmp.length);
        buffer = tmp;
      } else {
        buffer = null;
      }
      // Check if a complete line has been read from the buffer
      byte last_byte = value.getBytes()[value.getLength() - 1];
      if (last_byte == '\n' || last_byte == '\r')
        return true;
    }
    // Read the first line from the stream
    Text temp = new Text();
    b += lineReader.readLine(temp);
    if (b == 0) {
      // Indicates an end of stream
      return false;
    }
    pos += b;
    // Append the part read from the stream to the part extracted from the buffer
    value.append(temp.getBytes(), 0, temp.getLength());
    if (value.getLength() > 1) {
      // Read a non-empty line. Note that the end-of-line character is included
      return true;
    }
  }
  // Reached end of file
  return false;
}
From source file: edu.umn.cs.spatialHadoop.mapreduce.SpatialRecordReader3.java
License: Open Source License
/**
 * Reads the next line from input and returns true if a line was read.
 * If no more lines are available in this split, false is returned.
 * @param value The text object to fill with the next line
 * @return <code>true</code> if a line was read; <code>false</code> otherwise.
 * @throws IOException If an error occurs while reading from disk.
 */
protected boolean nextLine(Text value) throws IOException {
  while (getPos() <= end) {
    value.clear();
    int lineLength;
    // Read the first line from the stream
    if ((lineLength = lineReader.readLine(value)) <= 0) {
      // Indicates an end of stream
      return false;
    }
    // Update the number of bytes read so far
    bytesRead += lineLength;
    if (value.getLength() > 1) {
      // Read a non-empty line. Note that the end-of-line character is included
      return true;
    }
  }
  // Reached end of file
  return false;
}
From source file: edu.umn.cs.spatialHadoop.operations.FileMBR.java
License: Open Source License
public static Partition fileMBRLocal(Path[] inFiles, final OperationsParams params)
    throws IOException, InterruptedException {
  // 1- Split the input path/file to get splits that can be processed independently
  final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
  Job job = Job.getInstance(params);
  SpatialInputFormat3.setInputPaths(job, inFiles);
  final List<org.apache.hadoop.mapreduce.InputSplit> splits = inputFormat.getSplits(job);
  int parallelism = params.getInt("parallel", Runtime.getRuntime().availableProcessors());
  // 2- Process splits in parallel
  List<Map<String, Partition>> allMbrs = Parallel.forEach(splits.size(),
      new RunnableRange<Map<String, Partition>>() {
    @Override
    public Map<String, Partition> run(int i1, int i2) {
      Map<String, Partition> mbrs = new HashMap<String, Partition>();
      for (int i = i1; i < i2; i++) {
        try {
          org.apache.hadoop.mapreduce.lib.input.FileSplit fsplit =
              (org.apache.hadoop.mapreduce.lib.input.FileSplit) splits.get(i);
          final RecordReader<Rectangle, Iterable<Shape>> reader =
              inputFormat.createRecordReader(fsplit, null);
          if (reader instanceof SpatialRecordReader3) {
            ((SpatialRecordReader3) reader).initialize(fsplit, params);
          } else if (reader instanceof RTreeRecordReader3) {
            ((RTreeRecordReader3) reader).initialize(fsplit, params);
          } else if (reader instanceof HDFRecordReader) {
            ((HDFRecordReader) reader).initialize(fsplit, params);
          } else {
            throw new RuntimeException("Unknown record reader");
          }
          Partition p = mbrs.get(fsplit.getPath().getName());
          if (p == null) {
            p = new Partition();
            p.filename = fsplit.getPath().getName();
            p.cellId = p.filename.hashCode();
            p.size = 0;
            p.recordCount = 0;
            p.set(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
            mbrs.put(p.filename, p);
          }
          Text temp = new Text2();
          while (reader.nextKeyValue()) {
            Iterable<Shape> shapes = reader.getCurrentValue();
            for (Shape s : shapes) {
              Rectangle mbr = s.getMBR();
              if (mbr != null)
                p.expand(mbr);
              p.recordCount++;
              temp.clear();
              s.toText(temp);
              p.size += temp.getLength() + 1;
            }
          }
        } catch (IOException e) {
          throw new RuntimeException(e);
        } catch (InterruptedException e) {
          throw new RuntimeException(e);
        }
      }
      return mbrs;
    }
  }, parallelism);
  Map<String, Partition> mbrs = allMbrs.remove(allMbrs.size() - 1);
  for (Map<String, Partition> list : allMbrs) {
    for (Partition p1 : list.values()) {
      Partition p2 = mbrs.get(p1.filename);
      if (p2 != null) {
        p2.expand(p1);
      } else {
        mbrs.put(p1.filename, p1);
      }
    }
  }
  // Cache the final result, if needed
  for (Path inFile : inFiles) {
    FileSystem inFs = inFile.getFileSystem(params);
    if (!inFs.getFileStatus(inFile).isDir())
      continue;
    Path gindex_path = new Path(inFile, "_master.heap");
    // The answer has already been cached (possibly by another job)
    if (inFs.exists(gindex_path))
      continue;
    FileStatus[] files = inFs.listStatus(inFile, SpatialSite.NonHiddenFileFilter);
    PrintStream wktout = new PrintStream(inFs.create(new Path(inFile, "_heap.wkt"), false));
    PrintStream gout = new PrintStream(inFs.create(gindex_path, false));
    Text text = new Text2();
    for (FileStatus file : files) {
      text.clear();
      Partition p = mbrs.get(file.getPath().getName());
      gout.println(p.toText(text).toString());
      wktout.println(p.toWKT());
    }
    wktout.close();
    gout.close();
  }
  // Return the final answer
  Partition finalResult = new Partition();
  finalResult.size = finalResult.recordCount = 0;
  finalResult.x1 = finalResult.y1 = Double.MAX_VALUE;
  finalResult.x2 = finalResult.y2 = -Double.MAX_VALUE;
  for (Partition p2 : mbrs.values())
    finalResult.expand(p2);
  return finalResult;
}
From source file: edu.umn.cs.spatialHadoop.operations.KNN.java
License: Open Source License
private static <S extends Shape> long knnLocal(Path inFile, Path outPath, OperationsParams params)
    throws IOException, InterruptedException {
  int iterations = 0;
  FileSystem fs = inFile.getFileSystem(params);
  Point queryPoint = (Point) OperationsParams.getShape(params, "point");
  int k = params.getInt("k", 1);
  // Top-k objects are retained in this object
  PriorityQueue<ShapeWithDistance<S>> knn = new KNNObjects<ShapeWithDistance<S>>(k);
  SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
  final GlobalIndex<Partition> gIndex = SpatialSite.getGlobalIndex(fs, inFile);
  double kthDistance = Double.MAX_VALUE;
  if (gIndex != null) {
    // There is a global index, use it
    PriorityQueue<ShapeWithDistance<Partition>> partitionsToProcess =
        new PriorityQueue<KNN.ShapeWithDistance<Partition>>() {
      {
        initialize(gIndex.size());
      }

      @Override
      protected boolean lessThan(Object a, Object b) {
        return ((ShapeWithDistance<Partition>) a).distance
            < ((ShapeWithDistance<Partition>) b).distance;
      }
    };
    for (Partition p : gIndex) {
      double distance = p.getMinDistanceTo(queryPoint.x, queryPoint.y);
      partitionsToProcess.insert(new ShapeWithDistance<Partition>(p.clone(), distance));
    }
    while (partitionsToProcess.size() > 0 && partitionsToProcess.top().distance <= kthDistance) {
      ShapeWithDistance<Partition> partitionToProcess = partitionsToProcess.pop();
      // Process this partition
      Path partitionPath = new Path(inFile, partitionToProcess.shape.filename);
      long length = fs.getFileStatus(partitionPath).getLen();
      FileSplit fsplit = new FileSplit(partitionPath, 0, length, new String[0]);
      RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(fsplit, null);
      if (reader instanceof SpatialRecordReader3) {
        ((SpatialRecordReader3) reader).initialize(fsplit, params);
      } else if (reader instanceof RTreeRecordReader3) {
        ((RTreeRecordReader3) reader).initialize(fsplit, params);
      } else if (reader instanceof HDFRecordReader) {
        ((HDFRecordReader) reader).initialize(fsplit, params);
      } else {
        throw new RuntimeException("Unknown record reader");
      }
      iterations++;
      while (reader.nextKeyValue()) {
        Iterable<Shape> shapes = reader.getCurrentValue();
        for (Shape shape : shapes) {
          double distance = shape.distanceTo(queryPoint.x, queryPoint.y);
          if (distance <= kthDistance)
            knn.insert(new ShapeWithDistance<S>((S) shape.clone(), distance));
        }
      }
      reader.close();
      if (knn.size() >= k)
        kthDistance = knn.top().distance;
    }
  } else {
    // No global index, have to scan the whole file
    Job job = new Job(params);
    SpatialInputFormat3.addInputPath(job, inFile);
    List<InputSplit> splits = inputFormat.getSplits(job);
    for (InputSplit split : splits) {
      RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(split, null);
      if (reader instanceof SpatialRecordReader3) {
        ((SpatialRecordReader3) reader).initialize(split, params);
      } else if (reader instanceof RTreeRecordReader3) {
        ((RTreeRecordReader3) reader).initialize(split, params);
      } else if (reader instanceof HDFRecordReader) {
        ((HDFRecordReader) reader).initialize(split, params);
      } else {
        throw new RuntimeException("Unknown record reader");
      }
      iterations++;
      while (reader.nextKeyValue()) {
        Iterable<Shape> shapes = reader.getCurrentValue();
        for (Shape shape : shapes) {
          double distance = shape.distanceTo(queryPoint.x, queryPoint.y);
          knn.insert(new ShapeWithDistance<S>((S) shape.clone(), distance));
        }
      }
      reader.close();
    }
    if (knn.size() >= k)
      kthDistance = knn.top().distance;
  }
  long resultCount = knn.size();
  if (outPath != null && params.getBoolean("output", true)) {
    FileSystem outFS = outPath.getFileSystem(params);
    PrintStream ps = new PrintStream(outFS.create(outPath));
    Vector<ShapeWithDistance<S>> resultsOrdered = new Vector<ShapeWithDistance<S>>((int) resultCount);
    resultsOrdered.setSize((int) resultCount);
    while (knn.size() > 0) {
      ShapeWithDistance<S> nextAnswer = knn.pop();
      resultsOrdered.set(knn.size(), nextAnswer);
    }
    Text text = new Text();
    for (ShapeWithDistance<S> answer : resultsOrdered) {
      text.clear();
      TextSerializerHelper.serializeDouble(answer.distance, text, ',');
      answer.shape.toText(text);
      ps.println(text);
    }
    ps.close();
  }
  TotalIterations.addAndGet(iterations);
  return resultCount;
}
From source file: edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License: Open Source License
/**
 * Samples records from a stream with the given sampling ratio. The stream has
 * to be scanned, and each record is selected with a probability equal to the
 * given sampling ratio.
 * @param in The input stream to sample
 * @param ratio The sampling ratio in the range [0, 1]
 * @param seed The seed of the random number generator
 * @param output The collector that receives sampled lines
 * @return The number of sampled lines
 * @throws IOException
 */
private static int sampleStreamByRatio(InputStream in, double ratio, long seed,
    ResultCollector<Text> output) throws IOException {
  Random rand = new Random(seed);
  Text line = new Text2();
  int sampleSize = 0;
  while (readUntilEOL(in, line) > 0) {
    if (rand.nextDouble() < ratio) {
      if (output != null)
        output.collect(line);
      sampleSize++;
    }
    line.clear();
  }
  return sampleSize;
}
From source file: edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License: Open Source License
/**
 * Reads a sample of a specific count from a stream. With a stream, we cannot
 * randomly seek to any position, so we have to use reservoir sampling techniques.
 * @param in The input stream to sample
 * @param streamLength The total number of bytes to consume from the stream
 * @param count The desired sample size
 * @param seed The seed of the random number generator
 * @param output The collector that receives sampled lines
 * @return The number of sampled lines
 * @throws IOException
 */
private static int sampleStreamByCount(InputStream in, long streamLength, int count, long seed,
    ResultCollector<Text> output) throws IOException {
  Random rand = new Random(seed);
  Text dummyLine = new Text2();
  Text[] sample = new Text[count];
  long pos = 0;
  int k = 0;
  while (pos < streamLength) {
    if (k < count) {
      // Phase 1- Fill in the reservoir
      pos += readUntilEOL(in, sample[k] = new Text2());
    } else {
      // Phase 2- Replace an existing item with probability p=(count/k)
      if (rand.nextInt(k) < count) {
        // Replace a randomly selected item
        int victim = rand.nextInt(count);
        sample[victim].clear();
        pos += readUntilEOL(in, sample[victim]);
      } else {
        // Skip this item
        dummyLine.clear();
        pos += readUntilEOL(in, dummyLine);
      }
    }
    k++;
  }
  // Report sampled items
  int sampleSize = Math.min(k, count);
  if (output != null) {
    for (int i = 0; i < sampleSize; i++)
      output.collect(sample[i]);
  }
  return sampleSize;
}
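For context, the method above is a variant of reservoir sampling. The textbook formulation (Algorithm R) folds the replacement decision and the choice of victim into a single random draw. Below is a self-contained sketch under the assumption of an in-memory line source; the class and method names are illustrative and not part of SpatialHadoop.

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

public class ReservoirSketch {
  /** Returns a uniform random sample of at most count lines. */
  static List<String> sample(Iterable<String> lines, int count, long seed) {
    Random rand = new Random(seed);
    List<String> reservoir = new ArrayList<String>(count);
    int k = 0; // Number of lines seen so far
    for (String line : lines) {
      if (k < count) {
        reservoir.add(line); // Phase 1: fill the reservoir
      } else {
        // Phase 2: keep the new line with probability count/(k+1),
        // replacing the item at the randomly drawn index
        int j = rand.nextInt(k + 1);
        if (j < count)
          reservoir.set(j, line);
      }
      k++;
    }
    return reservoir;
  }
}

Every line ends up in the sample with equal probability, which is the property the streaming sampler above relies on to produce an unbiased sample without seeking.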
From source file: edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License: Open Source License
/**
 * Samples a specific number of lines from a given file split.
 * @param file The file split to sample
 * @param conf The underlying configuration
 * @param count The desired sample size
 * @param seed The seed of the random number generator
 * @param output The collector that receives sampled lines
 * @return The number of sampled lines
 * @throws IOException
 */
private static int sampleFileSplitByCount(FileSplit file, Configuration conf, int count, long seed,
    ResultCollector<Text> output) throws IOException {
  InputStream in = null;
  Decompressor decompressor = null;
  try {
    CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());
    // Open the file and read the sample
    FileSystem fs = file.getPath().getFileSystem(conf);
    in = fs.open(file.getPath());
    int sampledLines = 0;
    if (codec != null) {
      // Special handling for compressed files as we cannot compute the actual
      // size of the underlying data
      decompressor = CodecPool.getDecompressor(codec);
      if (codec instanceof SplittableCompressionCodec) {
        // A splittable compression codec; can seek to the desired input position
        final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec)
            .createInputStream(in, decompressor, file.getStart(),
                file.getStart() + file.getLength(),
                SplittableCompressionCodec.READ_MODE.BYBLOCK);
        in = cIn;
        // Adjust the start and the end based on the compressed data
        long start = cIn.getAdjustedStart();
        long end = cIn.getAdjustedEnd();
        sampledLines = sampleStreamByCount(in, end - start, count, seed, output);
      } else {
        // Non-splittable input, need to start from the beginning
        in = codec.createInputStream(in, decompressor);
        sampledLines = sampleStreamByCount(in, Long.MAX_VALUE, count, seed, output);
      }
    } else {
      long pos = 0; // Current position in file
      // Generate random offsets and keep them sorted for I/O efficiency
      Random rand = new Random(seed);
      long[] sampleOffsets = new long[count];
      for (int i = 0; i < count; i++)
        sampleOffsets[i] = Math.abs(rand.nextLong()) % file.getLength() + file.getStart();
      Arrays.sort(sampleOffsets);
      // Sample at the generated offsets
      Text line = new Text2();
      for (int i = 0; i < count; i++) {
        pos += in.skip(sampleOffsets[i] - pos);
        // Skip until the end of the current (likely partial) line
        line.clear();
        pos += readUntilEOL(in, line);
        // Read the next full line
        line.clear();
        if ((pos += readUntilEOL(in, line)) > 1) {
          sampledLines++;
          if (output != null)
            output.collect(line);
        }
      }
    }
    return sampledLines;
  } finally {
    if (in != null)
      in.close();
    if (decompressor != null)
      CodecPool.returnDecompressor(decompressor);
  }
}
From source file: edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License: Open Source License
/**
 * Samples text lines from the given split with the given sampling ratio.
 * @param file The file split to sample
 * @param conf The underlying configuration
 * @param ratio The sampling ratio in the range [0, 1]
 * @param seed The seed of the random number generator
 * @param output The collector that receives sampled lines
 * @return The number of sampled lines
 * @throws IOException
 */
private static int sampleFileSplitByRatio(FileSplit file, Configuration conf, float ratio, long seed,
    ResultCollector<Text> output) throws IOException {
  InputStream in = null;
  Decompressor decompressor = null;
  int sampledLines;
  Text line = new Text2();
  try {
    CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecFactory.getCodec(file.getPath());
    FileSystem fs = file.getPath().getFileSystem(conf);
    in = fs.open(file.getPath());
    if (codec != null) {
      // Special handling for compressed files as we cannot compute the actual
      // size of the underlying data
      decompressor = CodecPool.getDecompressor(codec);
      if (codec instanceof SplittableCompressionCodec) {
        // A splittable compression codec; can seek to the desired input position
        final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec)
            .createInputStream(in, decompressor, file.getStart(),
                file.getStart() + file.getLength(),
                SplittableCompressionCodec.READ_MODE.BYBLOCK);
        in = cIn;
        // Adjust the start and the end based on the compressed data
        long start = cIn.getAdjustedStart();
        long end = cIn.getAdjustedEnd();
        // Skip the first (partial) line if this is not the start of the file
        if (file.getStart() > 0)
          start += readUntilEOL(cIn, line);
        sampledLines = sampleStreamByRatio(in, ratio, seed, output);
      } else {
        // Non-splittable input, need to start from the beginning.
        // No need to skip the first line because we actually read the file
        // from the beginning.
        in = codec.createInputStream(in, decompressor);
        sampledLines = sampleStreamByRatio(in, ratio, seed, output);
      }
    } else {
      // Not a compressed file. Apply a more efficient, though approximate, solution.
      // Open the file and read the sample
      long pos = 0; // Current position in file
      if (file.getStart() > 0) {
        pos += in.skip(file.getStart());
        pos += readUntilEOL(in, line);
      }
      // Initialize the random variable which is used for sampling
      Random rand = new Random(seed);
      sampledLines = 0;
      // Read the first 10 lines to estimate the average record size
      long end = file.getStart() + file.getLength();
      for (int i = 0; i < 10 && pos < end; i++) {
        line.clear();
        pos += readUntilEOL(in, line);
        if (rand.nextFloat() < ratio) {
          sampledLines++;
          if (output != null)
            output.collect(line);
        }
      }
      int averageLineSize = (int) ((pos - file.getStart()) / 10);
      int count = Math.round(ratio * file.getLength() / averageLineSize) - sampledLines;
      long[] sampleOffsets = new long[count];
      for (int i = 0; i < count; i++)
        sampleOffsets[i] = Math.abs(rand.nextLong()) % (end - pos) + file.getStart();
      Arrays.sort(sampleOffsets);
      // Sample at the generated offsets
      for (int i = 0; i < count; i++) {
        pos += in.skip(sampleOffsets[i] - pos);
        // Skip until the end of the current (likely partial) line
        line.clear();
        pos += readUntilEOL(in, line);
        // Read the next full line
        line.clear();
        if ((pos += readUntilEOL(in, line)) > 1) {
          sampledLines++;
          if (output != null)
            output.collect(line);
        }
      }
    }
  } finally {
    // Note: the original code called in.close() a second time after this
    // finally block; that redundant close has been removed.
    if (in != null)
      in.close();
    if (decompressor != null)
      CodecPool.returnDecompressor(decompressor);
  }
  return sampledLines;
}
From source file: edu.umn.cs.spatialHadoop.operations.Sampler.java
License: Open Source License
/**
 * Creates a proxy ResultCollector that takes as input objects of type T
 * and converts them to objects of type O. The returned object has a collect
 * method that takes an object of type T (i.e., inObj), converts it to
 * type O (i.e., outObj), and passes the result to output#collect.
 * @param <O> The output type
 * @param <T> The input type
 * @param output The collector that receives the converted objects
 * @param inObj An instance of the input type
 * @param outObj A reusable instance of the output type that is filled and collected
 * @return A collector of the input type, or null if output is null
 */
private static <O extends TextSerializable, T extends TextSerializable> ResultCollector<T> createConverter(
    final ResultCollector<O> output, T inObj, final O outObj) {
  if (output == null)
    return null;
  if (inObj.getClass() == outObj.getClass()) {
    return new ResultCollector<T>() {
      @Override
      public void collect(T r) {
        output.collect((O) r);
      }
    };
  } else if (inObj instanceof Shape && outObj instanceof Point) {
    final Point out_pt = (Point) outObj;
    return new ResultCollector<T>() {
      @Override
      public void collect(T r) {
        Shape s = (Shape) r;
        if (s == null)
          return;
        Rectangle mbr = s.getMBR();
        if (mbr == null)
          return;
        Point pt = mbr.getCenterPoint();
        out_pt.x = pt.x;
        out_pt.y = pt.y;
        output.collect(outObj);
      }
    };
  } else if (inObj instanceof Shape && outObj instanceof Rectangle) {
    final Rectangle out_rect = (Rectangle) outObj;
    return new ResultCollector<T>() {
      @Override
      public void collect(T r) {
        out_rect.set((Shape) r);
        output.collect(outObj);
      }
    };
  } else if (outObj instanceof Text) {
    final Text text = (Text) outObj;
    return new ResultCollector<T>() {
      @Override
      public void collect(T r) {
        text.clear();
        r.toText(text);
        output.collect(outObj);
      }
    };
  } else if (inObj instanceof Text) {
    final Text text = (Text) inObj;
    return new ResultCollector<T>() {
      @Override
      public void collect(T r) {
        outObj.fromText(text);
        output.collect(outObj);
      }
    };
  } else {
    throw new RuntimeException("Cannot convert from " + inObj.getClass() + " to " + outObj.getClass());
  }
}
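A hypothetical sketch of how such a converter might be wired up from inside the same class (this driver is not in the source file; Text2 is SpatialHadoop's TextSerializable subclass of Text, and the Point coordinates are made up):

// Hypothetical usage: serialize every collected Point to text via the Text branch above
ResultCollector<Text2> textOutput = new ResultCollector<Text2>() {
  @Override
  public void collect(Text2 line) {
    System.out.println(line); // In practice, write to the sample output
  }
};
// outObj is the reusable Text2 that the converter clears and refills per record
ResultCollector<Point> converter = createConverter(textOutput, new Point(), new Text2());
converter.collect(new Point(3.0, 7.0)); // Emits the point's text serialization

Passing a single reusable outObj avoids allocating a fresh output object per record, which is the same buffer-reuse idea behind calling text.clear() before each conversion.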
From source file: edu.umn.cs.spatialHadoop.util.JspSpatialHelper.java
License: Open Source License
/**
 * Runs the given process and returns the exit code. Feeds the given string
 * to the stdin of the run process. If stdout or stderr is non-null, it is
 * filled with the stdout or stderr of the run process, respectively.
 * If wait is set to true, the process is run in synchronous mode where we
 * wait until it is finished. Otherwise, this function call returns
 * immediately and leaves the process running in the background. In the latter
 * case, stdout, stderr, and the return value are not valid.
 *
 * @param workingDir The working directory in which to run the command. Set to
 *   null for the default.
 * @param cmd The command line to run, including all parameters
 * @param stdin The string to feed to the stdin of the run process
 * @param stdout If non-null, the stdout of the process is stored here
 * @param stderr If non-null, the stderr of the process is stored here
 * @param wait Set to true to wait until the process exits
 * @return The exit code of the process, or 1 if it was interrupted
 * @throws IOException
 */
public static int runProcess(File workingDir, String cmd, String stdin, Text stdout, Text stderr,
    boolean wait) throws IOException {
  // (An unused FilenameFilter on a nonexistent "asdf" directory in the original
  // source appeared to be leftover debug code and has been dropped.)
  Process process;
  if (workingDir == null)
    process = Runtime.getRuntime().exec(cmd);
  else
    process = Runtime.getRuntime().exec(cmd, null, workingDir);
  if (stdin != null) {
    PrintStream ps = new PrintStream(process.getOutputStream());
    ps.print(stdin);
    ps.close();
  }
  if (!wait)
    return 0;
  try {
    int exitCode = process.waitFor();
    byte[] buffer = new byte[4096];
    if (stdout != null) {
      stdout.clear();
      InputStream in = process.getInputStream();
      while (in.available() > 0) {
        int bytesRead = in.read(buffer);
        stdout.append(buffer, 0, bytesRead);
      }
      in.close();
    }
    if (stderr != null) {
      stderr.clear();
      InputStream err = process.getErrorStream();
      while (err.available() > 0) {
        int bytesRead = err.read(buffer);
        stderr.append(buffer, 0, bytesRead);
      }
      err.close();
    }
    return exitCode;
  } catch (InterruptedException e) {
    e.printStackTrace();
    return 1;
  }
}
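A quick hypothetical call for reference (the command is illustrative; note that runProcess clears the supplied Text objects before filling them, so they can be reused across calls):

Text out = new Text();
Text err = new Text();
// Run synchronously in the default working directory with no stdin
int exitCode = JspSpatialHelper.runProcess(null, "ls -l", null, out, err, true);
System.out.println("exit=" + exitCode + " stdout=" + out + " stderr=" + err);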