Example usage for org.apache.hadoop.io Text clear

Introduction

In this page you can find the example usage for org.apache.hadoop.io Text clear.

Prototype

public void clear()

Source Link

Document

Clear the string to empty.

Usage

From source file:edu.isi.mavuno.util.TokenWritable.java

License:Apache License

protected static void safeSet(Text t, String s) {
    if (s == null) {
        t.clear();
    } else {/*from   w  w w  . j  a  v  a2 s . c  om*/
        t.set(s);
    }
}

From source file:edu.stanford.pigir.warc.WarcRecord.java

License:Open Source License

/**
 * The actual heavy lifting of reading wbRecordReader the next WARC record. The
 * readContent parameter is used to support cases when the original
 * Pig query project out the content. We save time if we don't need
 * that content./*from  w  w w .j a v a2s .co  m*/
 * 
 * @param warcLineReader a line reader
 * @param readContent indicate whether the content of the record is needed, as opposed to just the WARC header info.
 * @return the content bytes (w/ the headerBuffer populated)
 * @throws java.io.IOException
 */
private static byte[] readNextRecord(LineAndChunkReader warcLineReader, boolean readContent)
        throws IOException {
    if (warcLineReader == null) {
        return null;
    }

    Text txtBuf = new Text();
    byte[] retContent = null;

    tmpOptionalHeaderKeys.clear();
    tmpGrandTotalBytesRead = 0L;
    tmpHeaderMap.clear();
    // Find our WARC header
    boolean foundWARCHeader = scanToRecordStart(warcLineReader, txtBuf);
    txtBuf.clear();

    // No WARC header found?
    if (!foundWARCHeader) {
        return null;
    }

    // Read the header (up to the first empty line).
    // Make sure we get the (mandatory) content length 
    // is wbRecordReader the header, because we rely on it below. 
    // We do not check for the other mandatory header fields:
    int contentLength = pullHeaderFromStream(warcLineReader, txtBuf);
    txtBuf.clear();

    if (contentLength < 0) {
        return null;
    }

    if (readContent) {
        // Pull the bytes of the content from the stream:
        retContent = new byte[contentLength];
        Integer totalRead = pullContent(warcLineReader, retContent, contentLength);
        if (totalRead == null)
            throw new IOException("Could not read content from WARC record ID: "
                    + tmpHeaderMap.get(WARC_RECORD_ID) + " of supposed content length "
                    + tmpHeaderMap.get(CONTENT_LENGTH) + ". Reason is other than EOF.");

        if (totalRead < contentLength) {
            // Did we hit EOF wbRecordReader the middle of the WARC record's content?
            throw new IOException("Hit end of file while reading content of WARC record ID: "
                    + tmpHeaderMap.get(WARC_RECORD_ID) + " of supposed content length "
                    + tmpHeaderMap.get(CONTENT_LENGTH) + ".");
        }
        tmpGrandTotalBytesRead += totalRead;
        return retContent;
    } else {
        return new byte[0];
    }
}

From source file:edu.stanford.pigir.warc.WarcRecord.java

License:Open Source License

/**
 * @param warcLineReader//from w ww . jav  a2  s .c  o  m
 * @param txtBuf
 * @param inHeader
 * @return
 * @throws IOException
 */
private static int pullHeaderFromStream(LineAndChunkReader warcLineReader, Text txtBuf) throws IOException {
    boolean inHeader = true;
    String line;
    int bytesRead;
    int contentLength = -1;
    String headerAttrName;
    String headerAttrValue;
    txtBuf.clear();
    while (inHeader && ((bytesRead = warcLineReader.readLine(txtBuf)) != 0)) {
        line = txtBuf.toString();
        tmpGrandTotalBytesRead += bytesRead;
        if (line.trim().length() == 0) {
            inHeader = false;
        } else {
            String[] thisHeaderPieceParts = line.split(":", 2);
            if (thisHeaderPieceParts.length == 2) {
                headerAttrName = (thisHeaderPieceParts[0]).trim().toLowerCase();
                headerAttrValue = thisHeaderPieceParts[1].trim();
                tmpHeaderMap.put(headerAttrName, headerAttrValue);

                // Accumulate a list of optional header keys:
                if (mandatoryHeaderFieldsLookup.get(headerAttrName) == null)
                    tmpOptionalHeaderKeys.add(headerAttrName);

                if (headerAttrName.startsWith(CONTENT_LENGTH)) {
                    try {
                        contentLength = Integer.parseInt(headerAttrValue.trim());
                    } catch (NumberFormatException nfEx) {
                        contentLength = -1;
                    }
                }
            }
        }
        txtBuf.clear();
    }
    return contentLength;
}

From source file:edu.stanford.pigir.warc.WarcRecord.java

License:Open Source License

/**
 * @param warcLineReader//w w  w  . j  a  v  a  2  s  .  com
 * @param txtBuf
 * @return success true/false
 * @throws IOException
 */
private static boolean scanToRecordStart(LineAndChunkReader warcLineReader, Text txtBuf) throws IOException {
    String line;
    boolean foundMark = false;
    int bytesRead;
    while ((!foundMark) && ((bytesRead = warcLineReader.readLine(txtBuf)) != 0)) {
        line = txtBuf.toString();
        tmpGrandTotalBytesRead += bytesRead;

        for (String acceptableWarcVersion : WARC_VERSIONS) {
            if (line.startsWith(acceptableWarcVersion)) {
                foundMark = true;
            }
        }
        txtBuf.clear();
    }
    return foundMark;
}

From source file:edu.stolaf.cs.wmrserver.streaming.StreamKeyValUtil.java

License:Apache License

/**
 * Read a utf8 encoded line from a data input stream. 
 * @param lineReader LineReader to read the line from.
 * @param out Text to read into//from www  .j  a  v  a  2s.  c om
 * @return number of bytes read 
 * @throws IOException
 */
public static int readLine(LineReader lineReader, Text out) throws IOException {
    out.clear();
    return lineReader.readLine(out);
}

From source file:edu.umn.cs.spatialHadoop.core.RTree.java

License:Open Source License

/**
 * Builds the RTree given a serialized list of elements. It uses the given
 * stockObject to deserialize these elements using
 * {@link TextSerializable#fromText(Text)} and build the tree. Also writes the
 * created tree to the disk directly.// w  ww.ja v  a 2 s .c  o  m
 * 
 * @param element_bytes
 *          - serialization of all elements separated by new lines
 * @param offset
 *          - offset of the first byte to use in elements_bytes
 * @param len
 *          - number of bytes to use in elements_bytes
 * @param degree
 *          - Degree of the R-tree to build in terms of number of children per
 *          node
 * @param dataOut
 *          - output stream to write the result to.
 * @param fast_sort
 *          - setting this to <code>true</code> allows the method to run
 *          faster by materializing the offset of each element in the list
 *          which speeds up the comparison. However, this requires an
 *          additional 16 bytes per element. So, for each 1M elements, the
 *          method will require an additional 16 M bytes (approximately).
 */
public void bulkLoadWrite(final byte[] element_bytes, final int offset, final int len, final int degree,
        DataOutput dataOut, final boolean fast_sort) {
    try {

        // Count number of elements in the given text
        int i_start = offset;
        final Text line = new Text();
        while (i_start < offset + len) {
            int i_end = skipToEOL(element_bytes, i_start);
            // Extract the line without end of line character
            line.set(element_bytes, i_start, i_end - i_start - 1);
            stockObject.fromText(line);
            elementCount++;
            i_start = i_end;
        }
        LOG.info("Bulk loading an RTree with " + elementCount + " elements");

        // It turns out the findBestDegree returns the best degree when the whole
        // tree is loaded to memory when processed. However, as current algorithms
        // process the tree while it's on disk, a higher degree should be selected
        // such that a node fits one file block (assumed to be 4K).
        //final int degree = findBestDegree(bytesAvailable, elementCount);
        LOG.info("Writing an RTree with degree " + degree);

        int height = Math.max(1, (int) Math.ceil(Math.log(elementCount) / Math.log(degree)));
        int leafNodeCount = (int) Math.pow(degree, height - 1);
        if (elementCount < 2 * leafNodeCount && height > 1) {
            height--;
            leafNodeCount = (int) Math.pow(degree, height - 1);
        }
        int nodeCount = (int) ((Math.pow(degree, height) - 1) / (degree - 1));
        int nonLeafNodeCount = nodeCount - leafNodeCount;

        // Keep track of the offset of each element in the text
        final int[] offsets = new int[elementCount];
        final double[] xs = fast_sort ? new double[elementCount] : null;
        final double[] ys = fast_sort ? new double[elementCount] : null;

        i_start = offset;
        line.clear();
        for (int i = 0; i < elementCount; i++) {
            offsets[i] = i_start;
            int i_end = skipToEOL(element_bytes, i_start);
            if (xs != null) {
                // Extract the line with end of line character
                line.set(element_bytes, i_start, i_end - i_start - 1);
                stockObject.fromText(line);
                // Sample center of the shape
                xs[i] = (stockObject.getMBR().x1 + stockObject.getMBR().x2) / 2;
                ys[i] = (stockObject.getMBR().y1 + stockObject.getMBR().y2) / 2;
            }
            i_start = i_end;
        }

        /**A struct to store information about a split*/
        class SplitStruct extends Rectangle {
            /**Start and end index for this split*/
            int index1, index2;
            /**Direction of this split*/
            byte direction;
            /**Index of first element on disk*/
            int offsetOfFirstElement;

            static final byte DIRECTION_X = 0;
            static final byte DIRECTION_Y = 1;

            SplitStruct(int index1, int index2, byte direction) {
                this.index1 = index1;
                this.index2 = index2;
                this.direction = direction;
            }

            @Override
            public void write(DataOutput out) throws IOException {
                out.writeInt(offsetOfFirstElement);
                super.write(out);
            }

            void partition(Queue<SplitStruct> toBePartitioned) {
                IndexedSortable sortableX;
                IndexedSortable sortableY;

                if (fast_sort) {
                    // Use materialized xs[] and ys[] to do the comparisons
                    sortableX = new IndexedSortable() {
                        @Override
                        public void swap(int i, int j) {
                            // Swap xs
                            double tempx = xs[i];
                            xs[i] = xs[j];
                            xs[j] = tempx;
                            // Swap ys
                            double tempY = ys[i];
                            ys[i] = ys[j];
                            ys[j] = tempY;
                            // Swap id
                            int tempid = offsets[i];
                            offsets[i] = offsets[j];
                            offsets[j] = tempid;
                        }

                        @Override
                        public int compare(int i, int j) {
                            if (xs[i] < xs[j])
                                return -1;
                            if (xs[i] > xs[j])
                                return 1;
                            return 0;
                        }
                    };

                    sortableY = new IndexedSortable() {
                        @Override
                        public void swap(int i, int j) {
                            // Swap xs
                            double tempx = xs[i];
                            xs[i] = xs[j];
                            xs[j] = tempx;
                            // Swap ys
                            double tempY = ys[i];
                            ys[i] = ys[j];
                            ys[j] = tempY;
                            // Swap id
                            int tempid = offsets[i];
                            offsets[i] = offsets[j];
                            offsets[j] = tempid;
                        }

                        @Override
                        public int compare(int i, int j) {
                            if (ys[i] < ys[j])
                                return -1;
                            if (ys[i] > ys[j])
                                return 1;
                            return 0;
                        }
                    };
                } else {
                    // No materialized xs and ys. Always deserialize objects to compare
                    sortableX = new IndexedSortable() {
                        @Override
                        public void swap(int i, int j) {
                            // Swap id
                            int tempid = offsets[i];
                            offsets[i] = offsets[j];
                            offsets[j] = tempid;
                        }

                        @Override
                        public int compare(int i, int j) {
                            // Get end of line
                            int eol = skipToEOL(element_bytes, offsets[i]);
                            line.set(element_bytes, offsets[i], eol - offsets[i] - 1);
                            stockObject.fromText(line);
                            double xi = (stockObject.getMBR().x1 + stockObject.getMBR().x2) / 2;

                            eol = skipToEOL(element_bytes, offsets[j]);
                            line.set(element_bytes, offsets[j], eol - offsets[j] - 1);
                            stockObject.fromText(line);
                            double xj = (stockObject.getMBR().x1 + stockObject.getMBR().x2) / 2;
                            if (xi < xj)
                                return -1;
                            if (xi > xj)
                                return 1;
                            return 0;
                        }
                    };

                    sortableY = new IndexedSortable() {
                        @Override
                        public void swap(int i, int j) {
                            // Swap id
                            int tempid = offsets[i];
                            offsets[i] = offsets[j];
                            offsets[j] = tempid;
                        }

                        @Override
                        public int compare(int i, int j) {
                            int eol = skipToEOL(element_bytes, offsets[i]);
                            line.set(element_bytes, offsets[i], eol - offsets[i] - 1);
                            stockObject.fromText(line);
                            double yi = (stockObject.getMBR().y1 + stockObject.getMBR().y2) / 2;

                            eol = skipToEOL(element_bytes, offsets[j]);
                            line.set(element_bytes, offsets[j], eol - offsets[j] - 1);
                            stockObject.fromText(line);
                            double yj = (stockObject.getMBR().y1 + stockObject.getMBR().y2) / 2;
                            if (yi < yj)
                                return -1;
                            if (yi > yj)
                                return 1;
                            return 0;
                        }
                    };
                }

                final IndexedSorter sorter = new QuickSort();

                final IndexedSortable[] sortables = new IndexedSortable[2];
                sortables[SplitStruct.DIRECTION_X] = sortableX;
                sortables[SplitStruct.DIRECTION_Y] = sortableY;

                sorter.sort(sortables[direction], index1, index2);

                // Partition into maxEntries partitions (equally) and
                // create a SplitStruct for each partition
                int i1 = index1;
                for (int iSplit = 0; iSplit < degree; iSplit++) {
                    int i2 = index1 + (index2 - index1) * (iSplit + 1) / degree;
                    SplitStruct newSplit = new SplitStruct(i1, i2, (byte) (1 - direction));
                    toBePartitioned.add(newSplit);
                    i1 = i2;
                }
            }
        }

        // All nodes stored in level-order traversal
        Vector<SplitStruct> nodes = new Vector<SplitStruct>();
        final Queue<SplitStruct> toBePartitioned = new LinkedList<SplitStruct>();
        toBePartitioned.add(new SplitStruct(0, elementCount, SplitStruct.DIRECTION_X));

        while (!toBePartitioned.isEmpty()) {
            SplitStruct split = toBePartitioned.poll();
            if (nodes.size() < nonLeafNodeCount) {
                // This is a non-leaf
                split.partition(toBePartitioned);
            }
            nodes.add(split);
        }

        if (nodes.size() != nodeCount) {
            throw new RuntimeException(
                    "Expected node count: " + nodeCount + ". Real node count: " + nodes.size());
        }

        // Now we have our data sorted in the required order. Start building
        // the tree.
        // Store the offset of each leaf node in the tree
        FSDataOutputStream fakeOut = null;
        try {
            fakeOut = new FSDataOutputStream(new java.io.OutputStream() {
                // Null output stream
                @Override
                public void write(int b) throws IOException {
                    // Do nothing
                }

                @Override
                public void write(byte[] b, int off, int len) throws IOException {
                    // Do nothing
                }

                @Override
                public void write(byte[] b) throws IOException {
                    // Do nothing
                }
            }, null, TreeHeaderSize + nodes.size() * NodeSize);
            for (int i_leaf = nonLeafNodeCount, i = 0; i_leaf < nodes.size(); i_leaf++) {
                nodes.elementAt(i_leaf).offsetOfFirstElement = (int) fakeOut.getPos();
                if (i != nodes.elementAt(i_leaf).index1)
                    throw new RuntimeException();
                double x1, y1, x2, y2;

                // Initialize MBR to first object
                int eol = skipToEOL(element_bytes, offsets[i]);
                fakeOut.write(element_bytes, offsets[i], eol - offsets[i]);
                line.set(element_bytes, offsets[i], eol - offsets[i] - 1);
                stockObject.fromText(line);
                Rectangle mbr = stockObject.getMBR();
                x1 = mbr.x1;
                y1 = mbr.y1;
                x2 = mbr.x2;
                y2 = mbr.y2;
                i++;

                while (i < nodes.elementAt(i_leaf).index2) {
                    eol = skipToEOL(element_bytes, offsets[i]);
                    fakeOut.write(element_bytes, offsets[i], eol - offsets[i]);
                    line.set(element_bytes, offsets[i], eol - offsets[i] - 1);
                    stockObject.fromText(line);
                    mbr = stockObject.getMBR();
                    if (mbr.x1 < x1)
                        x1 = mbr.x1;
                    if (mbr.y1 < y1)
                        y1 = mbr.y1;
                    if (mbr.x2 > x2)
                        x2 = mbr.x2;
                    if (mbr.y2 > y2)
                        y2 = mbr.y2;
                    i++;
                }
                nodes.elementAt(i_leaf).set(x1, y1, x2, y2);
            }

        } finally {
            if (fakeOut != null)
                fakeOut.close();
        }

        // Calculate MBR and offsetOfFirstElement for non-leaves
        for (int i_node = nonLeafNodeCount - 1; i_node >= 0; i_node--) {
            int i_first_child = i_node * degree + 1;
            nodes.elementAt(i_node).offsetOfFirstElement = nodes.elementAt(i_first_child).offsetOfFirstElement;
            int i_child = 0;
            Rectangle mbr;
            mbr = nodes.elementAt(i_first_child + i_child);
            double x1 = mbr.x1;
            double y1 = mbr.y1;
            double x2 = mbr.x2;
            double y2 = mbr.y2;
            i_child++;

            while (i_child < degree) {
                mbr = nodes.elementAt(i_first_child + i_child);
                if (mbr.x1 < x1)
                    x1 = mbr.x1;
                if (mbr.y1 < y1)
                    y1 = mbr.y1;
                if (mbr.x2 > x2)
                    x2 = mbr.x2;
                if (mbr.y2 > y2)
                    y2 = mbr.y2;
                i_child++;
            }
            nodes.elementAt(i_node).set(x1, y1, x2, y2);
        }

        // Start writing the tree
        // write tree header (including size)
        // Total tree size. (== Total bytes written - 8 bytes for the size itself)
        dataOut.writeInt(TreeHeaderSize + NodeSize * nodeCount + len);
        // Tree height
        dataOut.writeInt(height);
        // Degree
        dataOut.writeInt(degree);
        dataOut.writeInt(elementCount);

        // write nodes
        for (SplitStruct node : nodes) {
            node.write(dataOut);
        }
        // write elements
        for (int element_i = 0; element_i < elementCount; element_i++) {
            int eol = skipToEOL(element_bytes, offsets[element_i]);
            dataOut.write(element_bytes, offsets[element_i], eol - offsets[element_i]);
        }

    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:edu.umn.cs.spatialHadoop.delaunay.DelaunayTriangulation.java

License:Open Source License

/**
 * Compute the Deluanay triangulation in the local machine
 * @param inPaths/*w w w . j ava  2 s  . c o m*/
 * @param outPath
 * @param params
 * @throws IOException
 * @throws InterruptedException
 */
public static void delaunayLocal(Path[] inPaths, Path outPath, final OperationsParams params)
        throws IOException, InterruptedException {
    if (params.getBoolean("mem", false))
        MemoryReporter.startReporting();
    // 1- Split the input path/file to get splits that can be processed
    // independently
    final SpatialInputFormat3<Rectangle, Point> inputFormat = new SpatialInputFormat3<Rectangle, Point>();
    Job job = Job.getInstance(params);
    SpatialInputFormat3.setInputPaths(job, inPaths);
    final List<InputSplit> splits = inputFormat.getSplits(job);
    final Point[][] allLists = new Point[splits.size()][];

    // 2- Read all input points in memory
    LOG.info("Reading points from " + splits.size() + " splits");
    List<Integer> numsPoints = Parallel.forEach(splits.size(), new RunnableRange<Integer>() {
        @Override
        public Integer run(int i1, int i2) {
            try {
                int numPoints = 0;
                for (int i = i1; i < i2; i++) {
                    List<Point> points = new ArrayList<Point>();
                    FileSplit fsplit = (FileSplit) splits.get(i);
                    final RecordReader<Rectangle, Iterable<Point>> reader = inputFormat
                            .createRecordReader(fsplit, null);
                    if (reader instanceof SpatialRecordReader3) {
                        ((SpatialRecordReader3) reader).initialize(fsplit, params);
                    } else if (reader instanceof RTreeRecordReader3) {
                        ((RTreeRecordReader3) reader).initialize(fsplit, params);
                    } else if (reader instanceof HDFRecordReader) {
                        ((HDFRecordReader) reader).initialize(fsplit, params);
                    } else {
                        throw new RuntimeException("Unknown record reader");
                    }
                    while (reader.nextKeyValue()) {
                        Iterable<Point> pts = reader.getCurrentValue();
                        for (Point p : pts) {
                            points.add(p.clone());
                        }
                    }
                    reader.close();
                    numPoints += points.size();
                    allLists[i] = points.toArray(new Point[points.size()]);
                }
                return numPoints;
            } catch (IOException e) {
                e.printStackTrace();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            return null;
        }
    }, params.getInt("parallel", Runtime.getRuntime().availableProcessors()));

    int totalNumPoints = 0;
    for (int numPoints : numsPoints)
        totalNumPoints += numPoints;

    LOG.info("Read " + totalNumPoints + " points and merging into one list");
    Point[] allPoints = new Point[totalNumPoints];
    int pointer = 0;

    for (int iList = 0; iList < allLists.length; iList++) {
        System.arraycopy(allLists[iList], 0, allPoints, pointer, allLists[iList].length);
        pointer += allLists[iList].length;
        allLists[iList] = null; // To let the GC collect it
    }

    if (params.getBoolean("dedup", true)) {
        float threshold = params.getFloat("threshold", 1E-5f);
        allPoints = SpatialAlgorithms.deduplicatePoints(allPoints, threshold);
    }

    LOG.info("Computing DT for " + allPoints.length + " points");
    GSDTAlgorithm dtAlgorithm = new GSImprovedAlgorithm(allPoints, null);
    LOG.info("DT computed");

    Rectangle mbr = FileMBR.fileMBR(inPaths, params);
    double buffer = Math.max(mbr.getWidth(), mbr.getHeight()) / 10;
    Rectangle bigMBR = mbr.buffer(buffer, buffer);
    if (outPath != null && params.getBoolean("output", true)) {
        LOG.info("Writing the output as a soup of triangles");
        Triangulation answer = dtAlgorithm.getFinalTriangulation();
        FileSystem outFS = outPath.getFileSystem(params);
        PrintStream out = new PrintStream(outFS.create(outPath));

        Text text = new Text2();
        byte[] tab = "\t".getBytes();
        for (Point[] triangle : answer.iterateTriangles()) {
            text.clear();
            triangle[0].toText(text);
            text.append(tab, 0, tab.length);
            triangle[1].toText(text);
            text.append(tab, 0, tab.length);
            triangle[2].toText(text);
            out.println(text);
        }
        out.close();
    }

    //    dtAlgorithm.getFinalTriangulation().draw();
    //Triangulation finalPart = new Triangulation();
    //Triangulation nonfinalPart = new Triangulation();
    //dtAlgorithm.splitIntoFinalAndNonFinalParts(new Rectangle(-180, -90, 180, 90), finalPart, nonfinalPart);
}

From source file:edu.umn.cs.spatialHadoop.delaunay.Triangulation.java

License:Open Source License

/**
 * Draw as SVG Rasem commands.//w  w w  .  j a va2 s  .  c o  m
 */
public void draw(PrintStream out, Rectangle mbr, double scale) {
    System.out.println("group {");
    Text text = new Text();
    for (Point s : sites) {
        text.clear();
        System.out.printf("circle %f, %f, 0.5 # %s\n", (s.x - mbr.x1) * scale, (s.y - mbr.y1) * scale,
                s.toText(text).toString());
    }
    System.out.println("}");
    System.out.println("group {");
    for (int i = 0; i < edgeStarts.length; i++) {
        if (edgeStarts[i] < edgeEnds[i])
            System.out.printf("line %f, %f, %f, %f\n", (sites[edgeStarts[i]].x - mbr.x1) * scale,
                    (sites[edgeStarts[i]].y - mbr.y1) * scale, (sites[edgeEnds[i]].x - mbr.x1) * scale,
                    (sites[edgeEnds[i]].y - mbr.y1) * scale);
    }
    System.out.println("}");
}

From source file:edu.umn.cs.spatialHadoop.indexing.RTree.java

License:Open Source License

/**
 * Builds the RTree given a serialized list of elements. It uses the given
 * stockObject to deserialize these elements using
 * {@link TextSerializable#fromText(Text)} and build the tree. Also writes the
 * created tree to the disk directly./*from  ww w  .  j  a v a 2  s.c  o  m*/
 * 
 * @param element_bytes
 *          - serialization of all elements separated by new lines
 * @param offset
 *          - offset of the first byte to use in elements_bytes
 * @param len
 *          - number of bytes to use in elements_bytes
 * @param degree
 *          - Degree of the R-tree to build in terms of number of children per
 *          node
 * @param dataOut
 *          - output stream to write the result to.
 * @param fast_sort
 *          - setting this to <code>true</code> allows the method to run
 *          faster by materializing the offset of each element in the list
 *          which speeds up the comparison. However, this requires an
 *          additional 16 bytes per element. So, for each 1M elements, the
 *          method will require an additional 16 M bytes (approximately).
 */
public static void bulkLoadWrite(final byte[] element_bytes, final int offset, final int len, final int degree,
        DataOutput dataOut, final Shape stockObject, final boolean fast_sort) {
    try {

        int elementCount = 0;
        // Count number of elements in the given text
        int i_start = offset;
        final Text line = new Text();
        while (i_start < offset + len) {
            int i_end = skipToEOL(element_bytes, i_start);
            // Extract the line without end of line character
            line.set(element_bytes, i_start, i_end - i_start - 1);
            stockObject.fromText(line);
            elementCount++;
            i_start = i_end;
        }
        LOG.info("Bulk loading an RTree with " + elementCount + " elements");

        // It turns out the findBestDegree returns the best degree when the whole
        // tree is loaded to memory when processed. However, as current algorithms
        // process the tree while it's on disk, a higher degree should be selected
        // such that a node fits one file block (assumed to be 4K).
        //final int degree = findBestDegree(bytesAvailable, elementCount);

        int height = Math.max(1, (int) Math.ceil(Math.log(elementCount) / Math.log(degree)));
        int leafNodeCount = (int) Math.pow(degree, height - 1);
        if (elementCount < 2 * leafNodeCount && height > 1) {
            height--;
            leafNodeCount = (int) Math.pow(degree, height - 1);
        }
        int nodeCount = (int) ((Math.pow(degree, height) - 1) / (degree - 1));
        int nonLeafNodeCount = nodeCount - leafNodeCount;

        // Keep track of the offset of each element in the text
        final int[] offsets = new int[elementCount];
        final double[] xs = fast_sort ? new double[elementCount] : null;
        final double[] ys = fast_sort ? new double[elementCount] : null;

        i_start = offset;
        line.clear();
        for (int i = 0; i < elementCount; i++) {
            offsets[i] = i_start;
            int i_end = skipToEOL(element_bytes, i_start);
            if (xs != null) {
                // Extract the line with end of line character
                line.set(element_bytes, i_start, i_end - i_start - 1);
                stockObject.fromText(line);
                // Sample center of the shape
                xs[i] = (stockObject.getMBR().x1 + stockObject.getMBR().x2) / 2;
                ys[i] = (stockObject.getMBR().y1 + stockObject.getMBR().y2) / 2;
            }
            i_start = i_end;
        }

        /**A struct to store information about a split*/
        class SplitStruct extends Rectangle {
            /**Start and end index for this split*/
            int index1, index2;
            /**Direction of this split*/
            byte direction;
            /**Index of first element on disk*/
            int offsetOfFirstElement;

            static final byte DIRECTION_X = 0;
            static final byte DIRECTION_Y = 1;

            SplitStruct(int index1, int index2, byte direction) {
                this.index1 = index1;
                this.index2 = index2;
                this.direction = direction;
            }

            @Override
            public void write(DataOutput out) throws IOException {
                out.writeInt(offsetOfFirstElement);
                super.write(out);
            }

            void partition(Queue<SplitStruct> toBePartitioned) {
                IndexedSortable sortableX;
                IndexedSortable sortableY;

                if (fast_sort) {
                    // Use materialized xs[] and ys[] to do the comparisons
                    sortableX = new IndexedSortable() {
                        @Override
                        public void swap(int i, int j) {
                            // Swap xs
                            double tempx = xs[i];
                            xs[i] = xs[j];
                            xs[j] = tempx;
                            // Swap ys
                            double tempY = ys[i];
                            ys[i] = ys[j];
                            ys[j] = tempY;
                            // Swap id
                            int tempid = offsets[i];
                            offsets[i] = offsets[j];
                            offsets[j] = tempid;
                        }

                        @Override
                        public int compare(int i, int j) {
                            if (xs[i] < xs[j])
                                return -1;
                            if (xs[i] > xs[j])
                                return 1;
                            return 0;
                        }
                    };

                    sortableY = new IndexedSortable() {
                        @Override
                        public void swap(int i, int j) {
                            // Swap xs
                            double tempx = xs[i];
                            xs[i] = xs[j];
                            xs[j] = tempx;
                            // Swap ys
                            double tempY = ys[i];
                            ys[i] = ys[j];
                            ys[j] = tempY;
                            // Swap id
                            int tempid = offsets[i];
                            offsets[i] = offsets[j];
                            offsets[j] = tempid;
                        }

                        @Override
                        public int compare(int i, int j) {
                            if (ys[i] < ys[j])
                                return -1;
                            if (ys[i] > ys[j])
                                return 1;
                            return 0;
                        }
                    };
                } else {
                    // No materialized xs and ys. Always deserialize objects to compare
                    sortableX = new IndexedSortable() {
                        @Override
                        public void swap(int i, int j) {
                            // Swap id
                            int tempid = offsets[i];
                            offsets[i] = offsets[j];
                            offsets[j] = tempid;
                        }

                        @Override
                        public int compare(int i, int j) {
                            // Get end of line
                            int eol = skipToEOL(element_bytes, offsets[i]);
                            line.set(element_bytes, offsets[i], eol - offsets[i] - 1);
                            stockObject.fromText(line);
                            double xi = (stockObject.getMBR().x1 + stockObject.getMBR().x2) / 2;

                            eol = skipToEOL(element_bytes, offsets[j]);
                            line.set(element_bytes, offsets[j], eol - offsets[j] - 1);
                            stockObject.fromText(line);
                            double xj = (stockObject.getMBR().x1 + stockObject.getMBR().x2) / 2;
                            if (xi < xj)
                                return -1;
                            if (xi > xj)
                                return 1;
                            return 0;
                        }
                    };

                    sortableY = new IndexedSortable() {
                        @Override
                        public void swap(int i, int j) {
                            // Swap id
                            int tempid = offsets[i];
                            offsets[i] = offsets[j];
                            offsets[j] = tempid;
                        }

                        @Override
                        public int compare(int i, int j) {
                            int eol = skipToEOL(element_bytes, offsets[i]);
                            line.set(element_bytes, offsets[i], eol - offsets[i] - 1);
                            stockObject.fromText(line);
                            double yi = (stockObject.getMBR().y1 + stockObject.getMBR().y2) / 2;

                            eol = skipToEOL(element_bytes, offsets[j]);
                            line.set(element_bytes, offsets[j], eol - offsets[j] - 1);
                            stockObject.fromText(line);
                            double yj = (stockObject.getMBR().y1 + stockObject.getMBR().y2) / 2;
                            if (yi < yj)
                                return -1;
                            if (yi > yj)
                                return 1;
                            return 0;
                        }
                    };
                }

                final IndexedSorter sorter = new QuickSort();

                final IndexedSortable[] sortables = new IndexedSortable[2];
                sortables[SplitStruct.DIRECTION_X] = sortableX;
                sortables[SplitStruct.DIRECTION_Y] = sortableY;

                sorter.sort(sortables[direction], index1, index2);

                // Partition into maxEntries partitions (equally) and
                // create a SplitStruct for each partition
                int i1 = index1;
                for (int iSplit = 0; iSplit < degree; iSplit++) {
                    int i2 = index1 + (index2 - index1) * (iSplit + 1) / degree;
                    SplitStruct newSplit = new SplitStruct(i1, i2, (byte) (1 - direction));
                    toBePartitioned.add(newSplit);
                    i1 = i2;
                }
            }
        }

        // All nodes stored in level-order traversal
        Vector<SplitStruct> nodes = new Vector<SplitStruct>();
        final Queue<SplitStruct> toBePartitioned = new LinkedList<SplitStruct>();
        toBePartitioned.add(new SplitStruct(0, elementCount, SplitStruct.DIRECTION_X));

        while (!toBePartitioned.isEmpty()) {
            SplitStruct split = toBePartitioned.poll();
            if (nodes.size() < nonLeafNodeCount) {
                // This is a non-leaf
                split.partition(toBePartitioned);
            }
            nodes.add(split);
        }

        if (nodes.size() != nodeCount) {
            throw new RuntimeException(
                    "Expected node count: " + nodeCount + ". Real node count: " + nodes.size());
        }

        // Now we have our data sorted in the required order. Start building
        // the tree.
        // Store the offset of each leaf node in the tree
        FSDataOutputStream fakeOut = null;
        try {
            fakeOut = new FSDataOutputStream(new java.io.OutputStream() {
                // Null output stream
                @Override
                public void write(int b) throws IOException {
                    // Do nothing
                }

                @Override
                public void write(byte[] b, int off, int len) throws IOException {
                    // Do nothing
                }

                @Override
                public void write(byte[] b) throws IOException {
                    // Do nothing
                }
            }, null, TreeHeaderSize + nodes.size() * NodeSize);
            for (int i_leaf = nonLeafNodeCount, i = 0; i_leaf < nodes.size(); i_leaf++) {
                nodes.elementAt(i_leaf).offsetOfFirstElement = (int) fakeOut.getPos();
                if (i != nodes.elementAt(i_leaf).index1)
                    throw new RuntimeException();
                double x1, y1, x2, y2;

                // Initialize MBR to first object
                int eol = skipToEOL(element_bytes, offsets[i]);
                fakeOut.write(element_bytes, offsets[i], eol - offsets[i]);
                line.set(element_bytes, offsets[i], eol - offsets[i] - 1);
                stockObject.fromText(line);
                Rectangle mbr = stockObject.getMBR();
                x1 = mbr.x1;
                y1 = mbr.y1;
                x2 = mbr.x2;
                y2 = mbr.y2;
                i++;

                while (i < nodes.elementAt(i_leaf).index2) {
                    eol = skipToEOL(element_bytes, offsets[i]);
                    fakeOut.write(element_bytes, offsets[i], eol - offsets[i]);
                    line.set(element_bytes, offsets[i], eol - offsets[i] - 1);
                    stockObject.fromText(line);
                    mbr = stockObject.getMBR();
                    if (mbr.x1 < x1)
                        x1 = mbr.x1;
                    if (mbr.y1 < y1)
                        y1 = mbr.y1;
                    if (mbr.x2 > x2)
                        x2 = mbr.x2;
                    if (mbr.y2 > y2)
                        y2 = mbr.y2;
                    i++;
                }
                nodes.elementAt(i_leaf).set(x1, y1, x2, y2);
            }

        } finally {
            if (fakeOut != null)
                fakeOut.close();
        }

        // Calculate MBR and offsetOfFirstElement for non-leaves
        for (int i_node = nonLeafNodeCount - 1; i_node >= 0; i_node--) {
            int i_first_child = i_node * degree + 1;
            nodes.elementAt(i_node).offsetOfFirstElement = nodes.elementAt(i_first_child).offsetOfFirstElement;
            int i_child = 0;
            Rectangle mbr;
            mbr = nodes.elementAt(i_first_child + i_child);
            double x1 = mbr.x1;
            double y1 = mbr.y1;
            double x2 = mbr.x2;
            double y2 = mbr.y2;
            i_child++;

            while (i_child < degree) {
                mbr = nodes.elementAt(i_first_child + i_child);
                if (mbr.x1 < x1)
                    x1 = mbr.x1;
                if (mbr.y1 < y1)
                    y1 = mbr.y1;
                if (mbr.x2 > x2)
                    x2 = mbr.x2;
                if (mbr.y2 > y2)
                    y2 = mbr.y2;
                i_child++;
            }
            nodes.elementAt(i_node).set(x1, y1, x2, y2);
        }

        // Start writing the tree
        // write tree header (including size)
        // Total tree size. (== Total bytes written - 8 bytes for the size itself)
        dataOut.writeInt(TreeHeaderSize + NodeSize * nodeCount + len);
        // Tree height
        dataOut.writeInt(height);
        // Degree
        dataOut.writeInt(degree);
        dataOut.writeInt(elementCount);

        // write nodes
        for (SplitStruct node : nodes) {
            node.write(dataOut);
        }
        // write elements
        for (int element_i = 0; element_i < elementCount; element_i++) {
            int eol = skipToEOL(element_bytes, offsets[element_i]);
            dataOut.write(element_bytes, offsets[element_i], eol - offsets[element_i]);
        }

    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java

License:Open Source License

public static synchronized Geometry consumeGeometryJTS(Text text, char separator) {
    // Check whether this text is a Well Known Text (WKT) or a hexed string
    boolean wkt = false;
    byte[] bytes = text.getBytes();
    int length = text.getLength();
    Geometry geom;//from ww  w  .  j a va 2 s  . c o m
    int i1, i2; // Start and end offset of the geometry being parsed
    int i_next; // Beginning of the next field
    boolean isWKT = false;
    boolean isHex = false;
    if (bytes[0] == '\'' || bytes[0] == '\"') {
        // A quoted string. Find terminating quote and trim the quotes
        i1 = 1;
        i2 = 2;
        while (i2 < length && bytes[i2] != bytes[0])
            i2++;
        if (i2 == length)
            throw new RuntimeException("Unterminated quoted string");
        i_next = i2 + 1;
        i2--; // Back one step to remove the terminating quote
        isWKT = true; // Assume any quoted string to be WKT
    } else {
        // Not a quoted string, check if the type is WKT
        int i_shape = 0;
        while (!wkt && i_shape < ShapeNames.length) {
            byte[] shapeName = ShapeNames[i_shape];
            if (length > shapeName.length) {
                int i = 0;
                while (i < shapeName.length && shapeName[i] == bytes[i])
                    i++;
                if (i == shapeName.length) {
                    wkt = true;
                    break;
                }
            }
            i_shape++;
        }

        if (i_shape < ShapeNames.length) {
            isWKT = true;
            // Look for the terminator of the shape text
            i1 = 0;
            i2 = 1;
            // Search for the first open parenthesis
            while (i2 < length && bytes[i2] != '(')
                i2++;
            if (i2 < length)
                i2++; // Skip the open parenthesis itself
            int nesting = 1;
            while (i2 < length && nesting > 0) {
                if (bytes[i2] == '(')
                    nesting++;
                else if (bytes[i2] == ')')
                    nesting--;
                i2++;
            }
            i_next = i2 + 1;
        } else {
            // Check if the type is hex-encoded WKB
            i1 = 0;
            i2 = 0;
            while (i2 < length && IsHex[bytes[i2]])
                i2++;
            isHex = i2 > 1;
            i_next = i2;
        }
    }

    String geom_text = new String(bytes, i1, i2);

    try {
        if (isWKT) {
            geom = wktReader.read(geom_text);
        } else if (isHex) {
            byte[] binary = hexToBytes(geom_text);
            geom = wkbReader.read(binary);
        } else {
            geom = null;
        }
    } catch (ParseException e) {
        throw new RuntimeException(String.format("Error parsing '%s'", geom_text), e);
    }

    // Remove consumed bytes from the text
    if (i_next >= text.getLength())
        text.clear();
    else {
        if (bytes[i_next] == separator)
            i_next++;
        text.set(bytes, i_next, length - i_next);
    }

    return geom;
}