org.elasticlib.node.repository.Index.java — source code listing

Java tutorial

Introduction

Below is the complete source code for org.elasticlib.node.repository.Index.java.

Source

/*
 * Copyright 2014 Guillaume Masclet <guillaume.masclet@yahoo.fr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.elasticlib.node.repository;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import static com.google.common.io.BaseEncoding.base16;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import static java.lang.Math.min;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import static java.util.Collections.emptyList;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools.Resolution;
import static org.apache.lucene.document.DateTools.timeToString;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.SingleInstanceLockFactory;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.elasticlib.common.exception.BadRequestException;
import org.elasticlib.common.exception.IOFailureException;
import org.elasticlib.common.exception.InvalidRepositoryPathException;
import org.elasticlib.common.exception.RepositoryClosedException;
import org.elasticlib.common.hash.Hash;
import org.elasticlib.common.model.IndexEntry;
import org.elasticlib.common.model.RevisionTree;
import org.elasticlib.common.value.Value;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A Lucene index on a repository.
 */
/**
 * A Lucene index on a repository. For each content, the index stores its hash, its head revisions,
 * its length and its metadata, along with any text Tika manages to extract from the content
 * itself, enabling full-text search over the repository.
 */
class Index {

    private static final String INDEX = "index";
    private static final String CONTENT = "content";
    private static final String LENGTH = "length";
    private static final String REVISION = "revision";
    private static final String BODY = "body";
    private static final Logger LOG = LoggerFactory.getLogger(Index.class);

    private final String name;
    private final Directory directory;
    private final Analyzer analyzer;

    private Index(String name, Path path) throws IOException {
        this.name = name;
        directory = FSDirectory.open(path.resolve(INDEX), new SingleInstanceLockFactory());
        analyzer = new StandardAnalyzer();
    }

    /**
     * Opens a writer on the index directory. The caller is responsible for closing it.
     *
     * @return A new IndexWriter.
     * @throws IOException If an I/O error occurs.
     */
    private IndexWriter newIndexWriter() throws IOException {
        return new IndexWriter(directory, new IndexWriterConfig(analyzer));
    }

    /**
     * Closes this index. I/O failures are logged rather than propagated, as this is expected to be
     * called on shutdown paths where throwing would mask the primary outcome.
     */
    public void close() {
        try {
            directory.close();

        } catch (IOException e) {
            // Parameterized logging, consistent with the other log statements in this class.
            LOG.error("[{}] Failed to close index", name, e);
        }
    }

    /**
     * Create a new index.
     *
     * @param name repository name.
     * @param path repository path.
     * @return Created index.
     */
    public static Index create(String name, Path path) {
        try {
            Files.createDirectory(path.resolve(INDEX));
            return new Index(name, path);

        } catch (IOException e) {
            throw new IOFailureException(e);
        }
    }

    /**
     * Open an existing index.
     *
     * @param name repository name.
     * @param path repository path.
     * @return Opened index.
     */
    public static Index open(String name, Path path) {
        if (!Files.isDirectory(path.resolve(INDEX))) {
            throw new InvalidRepositoryPathException();
        }
        try {
            return new Index(name, path);

        } catch (IOException ex) {
            throw new IOFailureException(ex);
        }
    }

    /**
     * Index supplied content. Does nothing if the content is already indexed at its current head.
     * If Tika fails to extract any text from the content, only its info (hash, head revisions,
     * length, metadata) is indexed.
     *
     * @param revisionTree Revision tree associated with content to index.
     * @param inputStream Input stream of the content to index.
     */
    public void index(RevisionTree revisionTree, InputStream inputStream) {
        LOG.info("[{}] Indexing {}, at revision {}", name, revisionTree.getContent(), revisionTree.getHead());
        Optional<IndexEntry> existing = getEntry(revisionTree.getContent());
        if (existing.isPresent() && existing.get().getRevisions().equals(revisionTree.getHead())) {
            // Already indexed at this very head: nothing to do.
            return;
        }
        try {
            if (!indexInfoAndContent(revisionTree, inputStream)) {
                // Fallback if Tika fails to extract content.
                indexInfo(revisionTree);
            }
        } catch (AlreadyClosedException e) {
            throw new RepositoryClosedException(e);

        } catch (IOException e) {
            throw new IOFailureException(e);
        }
    }

    /**
     * Indexes both info and extracted text of the supplied content.
     *
     * @return True if extraction succeeded and the document was written, false if Tika failed
     * (in which case the write is rolled back and the caller should fall back to indexInfo).
     */
    private boolean indexInfoAndContent(RevisionTree revisionTree, InputStream inputStream) throws IOException {
        try (IndexWriter writer = newIndexWriter()) {
            // First delete any existing document.
            writer.deleteDocuments(new Term(CONTENT, revisionTree.getContent().asHexadecimalString()));

            // Then (re)create the document.
            try (Reader reader = new Tika().parse(inputStream)) {
                Document document = newDocument(revisionTree);
                document.add(new TextField(BODY, reader));
                writer.addDocument(document);
                return true;

            } catch (IOException e) {
                // Tika reports extraction failures as IOException with a TikaException cause.
                if (e.getCause() instanceof TikaException) {
                    LOG.error("[{}] Failed to index content from {}", name, revisionTree.getContent(), e);
                    writer.rollback();
                    return false;
                }
                throw e;
            }
        }
    }

    /**
     * Indexes only the info of the supplied content, without extracting its text.
     */
    private void indexInfo(RevisionTree revisionTree) throws IOException {
        try (IndexWriter writer = newIndexWriter()) {
            // First delete any existing document.
            writer.deleteDocuments(new Term(CONTENT, revisionTree.getContent().asHexadecimalString()));

            // Here we do not extract and index content.
            writer.addDocument(newDocument(revisionTree));
        }
    }

    /**
     * Builds the Lucene document holding the info of the supplied revision tree: content hash,
     * head revisions, length and head metadata. The body field is added separately by the caller.
     */
    private static Document newDocument(RevisionTree revisionTree) {
        Document document = new Document();
        document.add(new TextField(CONTENT, revisionTree.getContent().asHexadecimalString(), Store.YES));
        revisionTree.getHead().forEach(rev -> {
            document.add(new TextField(REVISION, rev.asHexadecimalString(), Store.YES));
        });
        document.add(new LongField(LENGTH, revisionTree.getLength(), Store.NO));
        headMetadata(revisionTree).asMap().entrySet().stream()
                .forEach(entry -> entry.getValue().forEach(value -> add(document, entry.getKey(), value)));

        return document;
    }

    /**
     * Collects the metadata of all head revisions of the supplied tree into a single multimap
     * (a key may map to different values across concurrent head revisions).
     */
    private static Multimap<String, Value> headMetadata(RevisionTree revisionTree) {
        Multimap<String, Value> metadata = HashMultimap.create();
        revisionTree.getHead().stream().flatMap(rev -> revisionTree.get(rev).getMetadata().entrySet().stream())
                .forEach(entry -> metadata.put(entry.getKey(), entry.getValue()));

        return metadata;
    }

    /**
     * Adds a metadata value to the document as an indexed (non-stored) field, choosing the field
     * type by the value's dynamic type. Arrays are flattened under the same key; object members
     * are added under dotted keys; nulls are skipped.
     */
    private static void add(Document document, String key, Value value) {
        switch (value.type()) {
        case BOOLEAN:
            document.add(new TextField(key, value.asBoolean() ? "true" : "false", Store.NO));
            return;

        case INTEGER:
            document.add(new LongField(key, value.asLong(), Store.NO));
            return;

        case DECIMAL:
            document.add(new DoubleField(key, value.asBigDecimal().doubleValue(), Store.NO));
            return;

        case STRING:
            document.add(new TextField(key, value.asString(), Store.NO));
            return;

        case DATE:
            // Second resolution is enough for metadata search.
            String formatted = timeToString(value.asInstant().toEpochMilli(), Resolution.SECOND);
            document.add(new TextField(key, formatted, Store.NO));
            return;

        case BINARY:
            document.add(new TextField(key, base16().lowerCase().encode(value.asByteArray()), Store.NO));
            return;

        case ARRAY:
            value.asList().forEach(item -> add(document, key, item));
            return;

        case OBJECT:
            value.asMap().entrySet().forEach(entry -> add(document, key + "." + entry.getKey(), entry.getValue()));
            return;

        case NULL:
            return;

        default:
            throw new IllegalArgumentException(value.type().toString());
        }
    }

    /**
     * Looks up the index entry of the content whose hash is supplied, if any.
     */
    private Optional<IndexEntry> getEntry(Hash hash) {
        try {
            // An empty directory cannot be opened by DirectoryReader: short-circuit.
            if (directory.listAll().length == 0) {
                return Optional.empty();
            }
            try (DirectoryReader reader = DirectoryReader.open(directory)) {
                IndexSearcher searcher = new IndexSearcher(reader);
                TermQuery query = new TermQuery(new Term(CONTENT, hash.asHexadecimalString()));
                ScoreDoc[] hits = searcher.search(query, 1).scoreDocs;
                if (hits.length == 0) {
                    return Optional.empty();
                }
                return Optional.of(newIndexEntry(searcher.doc(hits[0].doc)));
            }

        } catch (AlreadyClosedException e) {
            throw new RepositoryClosedException(e);

        } catch (IOException e) {
            throw new IOFailureException(e);
        }
    }

    /**
     * Delete all index entries about the content whose hash is supplied.
     *
     * @param hash Hash of this content.
     */
    public void delete(Hash hash) {
        LOG.info("[{}] Deleting {}", name, hash);
        try (IndexWriter writer = newIndexWriter()) {
            writer.deleteDocuments(new Term(CONTENT, hash.asHexadecimalString()));

        } catch (AlreadyClosedException e) {
            throw new RepositoryClosedException(e);

        } catch (IOException e) {
            throw new IOFailureException(e);
        }
    }

    /**
     * Find index entries matching supplied query.
     *
     * @param query Search query.
     * @param first First result to return. A negative value shrinks the page accordingly.
     * @param number Number of results to return.
     * @return A list of matching index entries.
     */
    public List<IndexEntry> find(String query, int first, int number) {
        try {
            // Work on locals rather than mutating the parameters.
            int from = first;
            int count = number;
            if (from < 0) {
                // Clamp to the start of the result set, shrinking the page by the overshoot.
                count += from;
                from = 0;
            }
            if (directory.listAll().length == 0 || count <= 0) {
                return emptyList();
            }
            try (DirectoryReader reader = DirectoryReader.open(directory)) {
                IndexSearcher searcher = new IndexSearcher(reader);
                QueryParser parser = new QueryParser(BODY, analyzer);
                ScoreDoc[] hits = searcher.search(parser.parse(query), from + count).scoreDocs;
                List<IndexEntry> entries = new ArrayList<>(count);
                int last = min(from + count, hits.length);
                for (int i = from; i < last; i++) {
                    // Reuse the shared mapping helper instead of duplicating its logic here.
                    entries.add(newIndexEntry(searcher.doc(hits[i].doc)));
                }
                return entries;
            }
        } catch (AlreadyClosedException e) {
            throw new RepositoryClosedException(e);

        } catch (ParseException e) {
            throw new BadRequestException(e);

        } catch (IOException e) {
            throw new IOFailureException(e);
        }
    }

    /**
     * Maps a Lucene document to an index entry (content hash + head revision hashes).
     * Static: reads only the supplied document, no instance state.
     */
    private static IndexEntry newIndexEntry(Document document) {
        Hash hash = new Hash(document.getValues(CONTENT)[0]);
        Set<Hash> head = new HashSet<>();
        for (String value : document.getValues(REVISION)) {
            head.add(new Hash(value));
        }
        return new IndexEntry(hash, head);
    }
}