org.apache.lucene.server.handlers.BulkAddDocumentHandler.java Source code

Introduction

Here is the source code for org.apache.lucene.server.handlers.BulkAddDocumentHandler.java. The handler implements bulk document indexing for the Lucene server: it streams doc-per-line JSON from the request body, splits the character stream into fixed-size chunks, and hands each chunk to a separate indexing task that parses and indexes its documents, stitching together the one document that may span a chunk boundary.

Source

package org.apache.lucene.server.handlers;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.CharArrayReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Semaphore;

import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.BinaryPoint;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.server.FieldDef;
import org.apache.lucene.server.FinishRequest;
import org.apache.lucene.server.GlobalState;
import org.apache.lucene.server.IndexState;
import org.apache.lucene.server.Server;
import org.apache.lucene.server.ShardState;
import org.apache.lucene.server.params.*;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.IOUtils;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;

import net.minidev.json.JSONArray;
import net.minidev.json.JSONObject;

/** Bulk addDocument from JSON single-doc-per-line encoding */

public class BulkAddDocumentHandler extends Handler {

    private static StructType TYPE = new StructType();

    // nocommit what about \r\n!?
    private static final char NEWLINE = '\n';

    /** We break the incoming JSON chars into chunks of this size and send each chunk off to separate threads for parsing and indexing */
    private static final int CHUNK_SIZE_KB = 512;

    /** Sole constructor. */
    public BulkAddDocumentHandler(GlobalState state) {
        super(state);
    }

    @Override
    public StructType getType() {
        return TYPE;
    }

    @Override
    public boolean doStream() {
        return true;
    }

    /** Parses and indexes documents from one chunk of doc-per-line JSON */
    private static class ParseAndIndexOneChunk implements Runnable {

        private static final JsonFactory jsonFactory = new JsonFactory();

        /** Shared context across all docs being indexed in this one stream */
        private final ShardState.IndexingContext ctx;

        private final IndexState indexState;
        private final char[] chars;
        private final Semaphore semaphore;
        private final AddDocumentHandler addDocHandler;

        /** Char starting offset of our chunk in the total incoming char stream; we use this to locate errors */
        private final long globalOffset;

        /** The job handling the chunk just before us */
        private ParseAndIndexOneChunk prev;

        private int endFragmentStartOffset = -1;

        private char[] nextStartFragment;
        private int nextStartFragmentOffset;
        private int nextStartFragmentLength;

        public ParseAndIndexOneChunk(long globalOffset, ShardState.IndexingContext ctx, ParseAndIndexOneChunk prev,
                IndexState indexState, char[] chars, Semaphore semaphore, AddDocumentHandler addDocHandler)
                throws InterruptedException {
            this.ctx = ctx;
            ctx.inFlightChunks.register();
            this.prev = prev;
            this.indexState = indexState;
            this.chars = chars;
            this.semaphore = semaphore;
            this.globalOffset = globalOffset;
            this.addDocHandler = addDocHandler;
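            // Acquire one permit up front; it is released in indexSplitDoc(), so the streaming thread blocks in this ctor once too many chunks are in flight: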
            semaphore.acquire();
        }

        /** Indexes the one document that spans across the end of our chunk.  This is invoked when the chunk after us first starts, or when we
         *  finish processing all whole docs in our chunk, whichever comes last. */
        private void indexSplitDoc() {
            try {
                _indexSplitDoc();
            } finally {
                // nocommit only one semaphore!!
                semaphore.release();
                indexState.globalState.indexingJobsRunning.release();
                ctx.inFlightChunks.arrive();
            }
        }

        private void _indexSplitDoc() {

            if (endFragmentStartOffset == -2) {
                assert prev != null;
                // nocommit for very large docs this gluing together is O(N^2) ... fix this to be a List<char[]> instead:
                // Our entire chunk was inside a single document; instead of indexing a split doc, we combine our whole fragment and
                // the next start fragment and pass back to the previous chunk:
                char[] allChars = new char[chars.length + nextStartFragmentLength];
                System.arraycopy(chars, 0, allChars, 0, chars.length);
                System.arraycopy(nextStartFragment, 0, allChars, chars.length, nextStartFragmentLength);
                prev.setNextStartFragment(allChars, 0, allChars.length);
                prev = null;
                return;
            }

            ShardState shardState = indexState.getShard(0);
            int endFragmentLength = chars.length - endFragmentStartOffset;
            if (endFragmentLength + nextStartFragmentLength > 0) {
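                // Join our trailing fragment with the next chunk's leading fragment; together they form the one document that crossed the chunk boundary: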
                char[] allChars = new char[endFragmentLength + nextStartFragmentLength];
                System.arraycopy(chars, endFragmentStartOffset, allChars, 0, endFragmentLength);
                System.arraycopy(nextStartFragment, 0, allChars, endFragmentLength, nextStartFragmentLength);

                // TODO: can/should we make a single parser and reuse it?  It's thread private here...
                JsonParser parser;
                try {
                    parser = jsonFactory.createJsonParser(new CharArrayReader(allChars));
                } catch (IOException ioe) {
                    throw new RuntimeException(ioe);
                }

                Document doc = new Document();

                try {
                    addDocHandler.parseFields(indexState, doc, parser);
                } catch (Throwable t) {
                    ctx.setError(t);
                    return;
                }

                ctx.addCount.incrementAndGet();
                if (indexState.hasFacets()) {
                    try {
                        doc = indexState.facetsConfig.build(shardState.taxoWriter, doc);
                    } catch (IOException ioe) {
                        ctx.setError(new RuntimeException(
                                "document at offset " + (globalOffset + parser.getCurrentLocation().getCharOffset())
                                        + " hit exception building facets",
                                ioe));
                        return;
                    }
                }

                try {
                    shardState.indexDocument(doc);
                } catch (Throwable t) {
                    ctx.setError(new RuntimeException(
                            "failed to index document at offset " + (globalOffset + endFragmentStartOffset), t));
                    return;
                }

                // At most one document spans across two chunks:
                assert parser.getCurrentLocation().getCharOffset() == allChars.length - 1 : " parser location="
                        + parser.getCurrentLocation().getCharOffset() + " vs " + allChars.length + " tail: "
                        + new String(allChars, allChars.length - 20, 20);
            }
        }

        /** The chunk after us calls this with its prefix fragment */
        public synchronized void setNextStartFragment(char[] chars, int offset, int length) {
            if (nextStartFragment != null) {
                throw new IllegalStateException("setNextStartFragment was already called");
            }
            nextStartFragment = chars;
            nextStartFragmentOffset = offset;
            nextStartFragmentLength = length;
            if (endFragmentStartOffset != -1) {
                // OK we already know our end fragment; together, these are one document; parse and index it now:
                indexSplitDoc();
            }
        }

        /** We call this internally with the trailing fragment in our chunk */
        private synchronized void setEndFragment(int offset) {
            endFragmentStartOffset = offset;
            if (nextStartFragment != null) {
                indexSplitDoc();
            }
        }

        @Override
        public void run() {
            try {
                _run();
            } catch (Throwable t) {
                System.out.println("FAILED:");
                t.printStackTrace(System.out);
                throw new RuntimeException(t);
            }
        }

        private void _run() {

            ShardState shardState = indexState.getShard(0);

            // find the start of our first document:
            int upto;
            if (prev != null) {
                upto = 0;
                while (upto < chars.length) {
                    if (chars[upto] == NEWLINE) {
                        break;
                    }
                    upto++;
                }
            } else {
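                // First chunk: there is no previous chunk to hand a leading fragment to, so start at -1 and let the ++ below move us to offset 0: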
                upto = -1;
            }

            if (upto < chars.length) {

                // skip the NEWLINE:
                upto++;

                // Give the previous chunk the leading fragment:

                if (prev != null) {
                    prev.setNextStartFragment(chars, 0, upto);
                    //System.out.println("CHUNK @ " + globalOffset + " done setNextStartFragment");

                    // break the link so GC can promptly drop finished chunks:
                    prev = null;
                }

                final int startUpto = upto;

                // add all documents in this chunk as a block:
                final int[] endOffset = new int[1];
                try {
                    // nocommit what if this Iterable produces 0 docs?  does IW get angry?
                    shardState.writer.addDocuments(new Iterable<Document>() {
                        @Override
                        public Iterator<Document> iterator() {

                            // now parse & index whole documents:

                            final boolean hasFacets = indexState.hasFacets();

                            return new Iterator<Document>() {
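                                // Look-ahead iterator: hasNext() parses the next newline-delimited JSON document and stashes it; next() returns it and clears the stash: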
                                private Document nextDoc;
                                private boolean nextSet;
                                private int upto = startUpto;

                                @Override
                                public boolean hasNext() {
                                    if (nextSet == false) {

                                        // TODO: can we avoid this initial pass?  Would be somewhat hairy: we'd need to hang onto the JSONParser and
                                        // (separately) feed it the last fragment:
                                        int end = upto;
                                        while (end < chars.length && chars[end] != NEWLINE) {
                                            end++;
                                        }

                                        if (end == chars.length) {
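                                            // No newline before the end of our chunk: the remaining chars start a document that spills into the next chunk; record where it starts and stop iterating: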
                                            // nocommit we must assert that there was a trailing newline here.  make test!
                                            endOffset[0] = upto;
                                            nextDoc = null;
                                            nextSet = true;
                                            return false;
                                        }

                                        //System.out.println("NOW PARSE: " + new String(chars, upto, end-upto));

                                        // TODO: can/should we make a single parser and reuse it?  It's thread private here...
                                        JsonParser parser;
                                        try {
                                            parser = jsonFactory
                                                    .createJsonParser(new CharArrayReader(chars, upto, end - upto));
                                        } catch (IOException ioe) {
                                            throw new RuntimeException(ioe);
                                        }

                                        nextDoc = new Document();
                                        try {
                                            addDocHandler.parseFields(indexState, nextDoc, parser);
                                        } catch (Throwable t) {
                                            nextSet = true;
                                            nextDoc = null;
                                            ctx.setError(t);
                                            return false;
                                        }
                                        //System.out.println("ADD: " + nextDoc);

                                        ctx.addCount.incrementAndGet();
                                        if (hasFacets) {
                                            try {
                                                nextDoc = indexState.facetsConfig.build(shardState.taxoWriter,
                                                        nextDoc);
                                            } catch (IOException ioe) {
                                                nextSet = true;
                                                nextDoc = null;
                                                ctx.setError(new RuntimeException("document at offset "
                                                        + (globalOffset
                                                                + parser.getCurrentLocation().getCharOffset())
                                                        + " hit exception building facets", ioe));
                                                return false;
                                            }
                                        }
                                        // nocommit: live field values
                                        nextSet = true;

                                        // skip NEWLINE:
                                        upto = end + 1;
                                        return true;
                                    } else {
                                        return nextDoc != null;
                                    }
                                }

                                @Override
                                public Document next() {
                                    assert nextSet;
                                    try {
                                        return nextDoc;
                                    } finally {
                                        nextSet = false;
                                        nextDoc = null;
                                    }
                                }
                            };
                        }
                    });
                } catch (Throwable t) {
                    ctx.setError(t);
                }

                // nocommit need test showing you MUST have the trailing newline

                //System.out.println("CHUNK @ " + globalOffset + ": done parsing; end fragment length=" + (bytes.length-offset));
                setEndFragment(endOffset[0]);

            } else {
                // exotic case: the entire chunk is inside one document
                setEndFragment(-2);
                // nocommit also handle the exotic case where the chunk split right at a doc boundary
            }
        }
    }

    @Override
    public String handleStreamed(Reader reader, Map<String, List<String>> params) throws Exception {

        if (params.get("indexName") == null) {
            throw new IllegalArgumentException("required parameter \"indexName\" is missing");
        }

        if (params.get("indexName").size() != 1) {
            throw new IllegalArgumentException("only one \"indexName\" value is allowed");
        }

        String indexName = params.get("indexName").get(0);

        // Make sure the index does in fact exist
        IndexState indexState = globalState.get(indexName);
        ShardState shardState = indexState.getShard(0);

        // Make sure the index is started:
        if (indexState.isStarted() == false) {
            throw new IllegalArgumentException("index \"" + indexName + "\" isn't started: cannot index documents");
        }

        ShardState.IndexingContext ctx = new ShardState.IndexingContext();

        // nocommit tune this .. core count?

        // Use this to limit how many in-flight indexing chunks (CHUNK_SIZE_KB each) we allow into the JVM at once:

        // nocommit this should be in GlobalState so it's across all incoming indexing:
        Semaphore semaphore = new Semaphore(64);

        boolean done = false;

        // create the first chunk buffer; CHUNK_SIZE_KB is in kilobytes and each char is 2 bytes, hence the divide by 2:
        char[] buffer = new char[CHUNK_SIZE_KB * 1024 / 2];
        int bufferUpto = 0;

        long globalOffset = 0;

        ParseAndIndexOneChunk prev = null;
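        // Each chunk registers with this Phaser in its constructor and arrives when it finishes, so awaitAdvance(phase) below blocks until every in-flight chunk completes: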
        int phase = ctx.inFlightChunks.getPhase();

        AddDocumentHandler addDocHandler = (AddDocumentHandler) globalState.getHandler("addDocument");

        while (done == false && ctx.getError() == null) {
            int count = reader.read(buffer, bufferUpto, buffer.length - bufferUpto);
            if (count == -1 || bufferUpto + count == buffer.length) {
                if (count != -1) {
                    bufferUpto += count;
                } else if (bufferUpto < buffer.length) {
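                    // Reader hit EOF before filling this buffer; shrink it so the final chunk only sees the chars actually read: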
                    char[] realloc = new char[bufferUpto];
                    System.arraycopy(buffer, 0, realloc, 0, bufferUpto);
                    buffer = realloc;
                }
                // NOTE: This ctor will stall when it tries to acquire the semaphore if we already have too many in-flight indexing chunks:
                prev = new ParseAndIndexOneChunk(globalOffset, ctx, prev, indexState, buffer, semaphore,
                        addDocHandler);
                globalState.submitIndexingTask(prev);
                if (count == -1) {
                    // the end
                    prev.setNextStartFragment(new char[0], 0, 0);
                    done = true;
                    break;
                } else {
                    globalOffset += buffer.length;
                    // not done yet, make the next buffer:
                    buffer = new char[CHUNK_SIZE_KB * 1024 / 2];
                    bufferUpto = 0;
                }
            } else {
                bufferUpto += count;
            }
        }

        if (done == false) {
            // we exited loop due to error; force last indexing chunk to finish up:
            prev.setNextStartFragment(new char[0], 0, 0);
        }

        // Wait for all chunks to finish indexing:
        ctx.inFlightChunks.awaitAdvance(phase);

        Throwable t = ctx.getError();
        if (t != null) {
            IOUtils.reThrow(t);
            return null;
        } else {
            JSONObject o = new JSONObject();
            o.put("indexGen", shardState.writer.getMaxCompletedSequenceNumber());
            o.put("indexedDocumentCount", ctx.addCount.get());
            return o.toString();
        }
    }

    @Override
    public String getTopDoc() {
        return "Add more than one document in a single request, encoded as doc-per-line JSON.";
    }

    @Override
    public FinishRequest handle(IndexState state, Request r, Map<String, List<String>> params) {
        throw new UnsupportedOperationException();
    }
}
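
Example

The handler above reads raw doc-per-line JSON from the request body: one JSON document per line, each line terminated by '\n' (the nocommit comments note that the trailing newline after the last document is currently required), with the target index named by the indexName parameter. Below is a minimal client sketch using java.net.HttpURLConnection from the JDK. The host, port, endpoint path (/bulkAddDocument), and the per-line field layout are illustrative assumptions only; the exact per-line document shape is whatever AddDocumentHandler.parseFields expects, which is not shown in this file.

import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class BulkAddDocumentClientSketch {
    public static void main(String[] args) throws Exception {
        // One JSON document per line; note the newline after the last document as well.
        // The field names here ("id", "title") are hypothetical and must match the index's registered fields.
        String body = "{\"id\": \"1\", \"title\": \"first document\"}\n"
                + "{\"id\": \"2\", \"title\": \"second document\"}\n";

        // Assumed endpoint: the real path depends on how GlobalState registers this handler.
        URL url = new URL("http://localhost:4000/bulkAddDocument?indexName=myIndex");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/json; charset=UTF-8");

        // Stream the doc-per-line body to the server:
        try (OutputStream out = conn.getOutputStream()) {
            out.write(body.getBytes(StandardCharsets.UTF_8));
        }

        // On success the handler returns a small JSON object:
        // {"indexGen": <max completed sequence number>, "indexedDocumentCount": <count>}
        try (InputStream in = conn.getInputStream()) {
            System.out.println(new String(in.readAllBytes(), StandardCharsets.UTF_8));
        }
    }
}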