org.lilyproject.linkindex.LinkIndex.java Source code

Java tutorial

Introduction

Here is the source code for org.lilyproject.linkindex.LinkIndex.java

Source

/*
 * Copyright 2010 Outerthought bvba
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.lilyproject.linkindex;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.google.common.collect.Sets;
import org.apache.hadoop.hbase.util.Bytes;
import org.lilyproject.hbaseindex.Index;
import org.lilyproject.hbaseindex.IndexDefinition;
import org.lilyproject.hbaseindex.IndexEntry;
import org.lilyproject.hbaseindex.IndexManager;
import org.lilyproject.hbaseindex.IndexNotFoundException;
import org.lilyproject.hbaseindex.Query;
import org.lilyproject.hbaseindex.QueryResult;
import org.lilyproject.linkindex.LinkIndexMetrics.Action;
import org.lilyproject.repository.api.AbsoluteRecordId;
import org.lilyproject.repository.api.IdGenerator;
import org.lilyproject.repository.api.RecordId;
import org.lilyproject.repository.api.RepositoryException;
import org.lilyproject.repository.api.RepositoryManager;
import org.lilyproject.repository.api.SchemaId;
import org.lilyproject.util.Pair;
import org.lilyproject.util.hbase.LilyHBaseSchema.Table;
import org.lilyproject.util.io.Closer;

/**
 * The index of links that exist between documents.
 *
 * <p>Terminology:
 *
 * <ul>
 * <li>referrers = backwards links = incoming links</li>
 * <li>forward links = outgoing links</li>
 * </ul>
 */
// IMPORTANT implementation note: the order in which changes are applied, first to the forward or first to
// the backward table, is not arbitrary. It is such that if the process would fail in between, there would
// never be left any state in the backward table which would not be found via the forward index.
public class LinkIndex {
    private RepositoryManager repositoryManager;
    private IdGenerator lazyIdGenerator;
    private LinkIndexMetrics metrics;
    private Index forwardIndex;
    private Index backwardIndex;

    private static final byte[] SOURCE_FIELD_KEY = Bytes.toBytes("sf");
    private static final byte[] VTAG_KEY = Bytes.toBytes("vt");

    public LinkIndex(final IndexManager indexManager, RepositoryManager repositoryManager)
            throws IndexNotFoundException, IOException, InterruptedException {
        metrics = new LinkIndexMetrics("linkIndex");
        this.repositoryManager = repositoryManager;

        // About the structure of these indexes:
        //  - the vtag comes after the recordid because this way we can delete all
        //    entries for a record without having to know the vtags under which they occur
        //  - the sourcefield will often by optional in queries, that's why it comes last

        final int schemaIdByteLength = 16; // see SchemaIdImpl
        {
            IndexDefinition indexDef = new IndexDefinition("links-forward");
            // For the record ID we use a variable length byte array field of which the first two bytes are fixed length
            // The first byte is actually the record identifier byte.
            // The second byte really is the first byte of the record id. We put this in the fixed length part
            // (safely because a record id should at least be a single byte long) because this prevents BCD encoding
            // on the first byte, thus making it easier to configure table splitting based on the original input.
            indexDef.addVariableLengthByteField("source", 2);
            indexDef.addByteField("vtag", schemaIdByteLength);
            indexDef.addByteField("sourcefield", schemaIdByteLength);
            forwardIndex = indexManager.getIndex(indexDef);
        }

        {
            IndexDefinition indexDef = new IndexDefinition("links-backward");
            // Same remark as in the forwardIndex.
            indexDef.addVariableLengthByteField("target", 2);
            indexDef.addByteField("vtag", schemaIdByteLength);
            indexDef.addByteField("sourcefield", schemaIdByteLength);
            backwardIndex = indexManager.getIndex(indexDef);
        }
    }

    public void deleteLinks(RecordId sourceRecord) throws LinkIndexException, InterruptedException {
        deleteLinks(getAbsoluteId(sourceRecord));
    }

    /**
     * Deletes all links of a record, irrespective of the vtag.
     */
    public void deleteLinks(AbsoluteRecordId sourceRecord) throws LinkIndexException, InterruptedException {
        long before = System.currentTimeMillis();
        try {
            byte[] sourceAsBytes = sourceRecord.toBytes();

            // Read links from the forwards table
            Set<Pair<FieldedLink, SchemaId>> oldLinks = getAllForwardLinks(sourceRecord);

            // Delete existing entries from the backwards table
            List<IndexEntry> entries = new ArrayList<IndexEntry>(oldLinks.size());
            for (Pair<FieldedLink, SchemaId> link : oldLinks) {
                IndexEntry entry = createBackwardIndexEntry(link.getV2(), link.getV1().getAbsoluteRecordId(),
                        link.getV1().getFieldTypeId());
                entry.setIdentifier(sourceAsBytes);
                entries.add(entry);
            }
            backwardIndex.removeEntries(entries);

            // Delete existing entries from the forwards table
            entries.clear();
            for (Pair<FieldedLink, SchemaId> link : oldLinks) {
                IndexEntry entry = createForwardIndexEntry(link.getV2(), sourceRecord,
                        link.getV1().getFieldTypeId());
                entry.setIdentifier(link.getV1().getAbsoluteRecordId().toBytes());
                entries.add(entry);
            }
            forwardIndex.removeEntries(entries);
        } catch (LinkIndexException e) {
            throw new LinkIndexException("Error deleting links for record '" + sourceRecord + "'", e);
        } catch (IOException e) {
            throw new LinkIndexException("Error deleting links for record '" + sourceRecord + "'", e);
        } finally {
            metrics.report(Action.DELETE_LINKS, System.currentTimeMillis() - before);
        }
    }

    public void deleteLinks(RecordId sourceRecord, SchemaId vtag) throws LinkIndexException, InterruptedException {
        deleteLinks(getAbsoluteId(sourceRecord), vtag);
    }

    public void deleteLinks(AbsoluteRecordId sourceRecord, SchemaId vtag)
            throws LinkIndexException, InterruptedException {
        long before = System.currentTimeMillis();
        try {
            byte[] sourceAsBytes = sourceRecord.toBytes();

            // Read links from the forwards table
            Set<FieldedLink> oldLinks = getFieldedForwardLinks(sourceRecord, vtag);

            // Delete existing entries from the backwards table
            List<IndexEntry> entries = new ArrayList<IndexEntry>(oldLinks.size());
            for (FieldedLink link : oldLinks) {
                IndexEntry entry = createBackwardIndexEntry(vtag, link.getAbsoluteRecordId(),
                        link.getFieldTypeId());
                entry.setIdentifier(sourceAsBytes);
                entries.add(entry);
            }
            backwardIndex.removeEntries(entries);

            // Delete existing entries from the forwards table
            entries.clear();
            for (FieldedLink link : oldLinks) {
                IndexEntry entry = createForwardIndexEntry(vtag, sourceRecord, link.getFieldTypeId());
                entry.setIdentifier(link.getAbsoluteRecordId().toBytes());
                entries.add(entry);
            }
            forwardIndex.removeEntries(entries);
        } catch (LinkIndexException e) {
            throw new LinkIndexException(
                    "Error deleting links for record '" + sourceRecord + "', vtag '" + vtag + "'", e);
        } catch (IOException e) {
            throw new LinkIndexException(
                    "Error deleting links for record '" + sourceRecord + "', vtag '" + vtag + "'", e);
        } finally {
            metrics.report(Action.DELETE_LINKS_VTAG, System.currentTimeMillis() - before);
        }
    }

    public void updateLinks(RecordId sourceRecord, SchemaId vtag, Set<FieldedLink> links)
            throws LinkIndexException, InterruptedException {
        updateLinks(sourceRecord, vtag, links, false);
    }

    public void updateLinks(AbsoluteRecordId sourceRecord, SchemaId vtag, Set<FieldedLink> links)
            throws LinkIndexException, InterruptedException {
        updateLinks(sourceRecord, vtag, links, false);
    }

    /**
     * @param links       if this set is empty, then calling this method is equivalent to calling deleteLinks
     * @param isNewRecord if this is a new record, then we can skip querying the existing links, thus gaining some
     *                    time.
     */
    public void updateLinks(RecordId sourceRecord, SchemaId vtag, Set<FieldedLink> links, boolean isNewRecord)
            throws LinkIndexException, InterruptedException {
        updateLinks(getAbsoluteId(sourceRecord), vtag, links, isNewRecord);
    }

    public void updateLinks(AbsoluteRecordId sourceRecord, SchemaId vtag, Set<FieldedLink> links,
            boolean isNewRecord) throws LinkIndexException, InterruptedException {
        long before = System.currentTimeMillis();
        try {
            // We could simply delete all the old entries using deleteLinks() and then add
            // all new entries, but instead we find out what actually needs adding or removing and only
            // perform that. This is to avoid running into problems due to http://search-hadoop.com/m/rNnhN15Xecu
            // (= delete and put within the same millisecond).

            Set<FieldedLink> oldLinks = isNewRecord ? Collections.<FieldedLink>emptySet()
                    : getFieldedForwardLinks(sourceRecord, vtag);

            if (links.isEmpty() && oldLinks.isEmpty()) {
                // No links to add, no links to remove
                return;
            }

            // Find out what changed
            Set<FieldedLink> removedLinks = new HashSet<FieldedLink>(oldLinks);
            removedLinks.removeAll(links);
            Set<FieldedLink> addedLinks = new HashSet<FieldedLink>(links);
            addedLinks.removeAll(oldLinks);

            // Apply added links
            byte[] sourceAsBytes = sourceRecord.toBytes();
            List<IndexEntry> fwdEntries = null;
            List<IndexEntry> bkwdEntries = null;
            if (addedLinks.size() > 0) {
                fwdEntries = new ArrayList<IndexEntry>(Math.max(addedLinks.size(), removedLinks.size()));
                bkwdEntries = new ArrayList<IndexEntry>(fwdEntries.size());
                for (FieldedLink link : addedLinks) {
                    IndexEntry fwdEntry = createForwardIndexEntry(vtag, sourceRecord, link.getFieldTypeId());
                    fwdEntry.setIdentifier(link.getAbsoluteRecordId().toBytes());
                    fwdEntries.add(fwdEntry);

                    IndexEntry bkwdEntry = createBackwardIndexEntry(vtag, link.getAbsoluteRecordId(),
                            link.getFieldTypeId());
                    bkwdEntry.setIdentifier(sourceAsBytes);
                    bkwdEntries.add(bkwdEntry);
                }
                forwardIndex.addEntries(fwdEntries);
                backwardIndex.addEntries(bkwdEntries);
            }

            // Apply removed links
            if (removedLinks.size() > 0) {
                if (fwdEntries != null) {
                    fwdEntries.clear();
                    bkwdEntries.clear();
                } else {
                    fwdEntries = new ArrayList<IndexEntry>(removedLinks.size());
                    bkwdEntries = new ArrayList<IndexEntry>(fwdEntries.size());
                }

                for (FieldedLink link : removedLinks) {
                    IndexEntry bkwdEntry = createBackwardIndexEntry(vtag, link.getAbsoluteRecordId(),
                            link.getFieldTypeId());
                    bkwdEntry.setIdentifier(sourceAsBytes);
                    bkwdEntries.add(bkwdEntry);

                    IndexEntry fwdEntry = createForwardIndexEntry(vtag, sourceRecord, link.getFieldTypeId());
                    fwdEntry.setIdentifier(link.getAbsoluteRecordId().toBytes());
                    fwdEntries.add(fwdEntry);
                }
                backwardIndex.removeEntries(bkwdEntries);
                forwardIndex.removeEntries(fwdEntries);
            }
        } catch (IOException e) {
            throw new LinkIndexException(
                    "Error updating links for record '" + sourceRecord + "', vtag '" + vtag + "'", e);
        } finally {
            metrics.report(Action.UPDATE_LINKS, System.currentTimeMillis() - before);
        }
    }

    private IndexEntry createBackwardIndexEntry(SchemaId vtag, AbsoluteRecordId target, SchemaId sourceField) {
        IndexEntry entry = new IndexEntry(backwardIndex.getDefinition());

        entry.addField("vtag", vtag.getBytes());
        entry.addField("target", target.toBytes());
        entry.addField("sourcefield", sourceField.getBytes());

        entry.addData(SOURCE_FIELD_KEY, sourceField.getBytes());

        return entry;
    }

    private IndexEntry createForwardIndexEntry(SchemaId vtag, AbsoluteRecordId source, SchemaId sourceField) {
        IndexEntry entry = new IndexEntry(forwardIndex.getDefinition());

        entry.addField("vtag", vtag.getBytes());
        entry.addField("source", source.toBytes());
        entry.addField("sourcefield", sourceField.getBytes());

        entry.addData(SOURCE_FIELD_KEY, sourceField.getBytes());
        entry.addData(VTAG_KEY, vtag.getBytes());

        return entry;
    }

    public Set<RecordId> getReferrers(RecordId record, SchemaId vtag)
            throws LinkIndexException, InterruptedException {
        return getReferrers(record, vtag, null);
    }

    public Set<AbsoluteRecordId> getAbsoluteReferrers(AbsoluteRecordId record, SchemaId vtag)
            throws LinkIndexException, InterruptedException {
        return getAbsoluteReferrers(record, vtag, null);
    }

    public Set<RecordId> getReferrers(RecordId record, SchemaId vtag, SchemaId sourceField)
            throws LinkIndexException, InterruptedException {
        return getReferrers(getAbsoluteId(record), vtag, sourceField);
    }

    public Set<RecordId> getReferrers(AbsoluteRecordId record, SchemaId vtag, SchemaId sourceField)
            throws LinkIndexException, InterruptedException {
        Set<AbsoluteRecordId> absoluteReferrers = getAbsoluteReferrers(record, vtag, sourceField);
        Set<RecordId> referrers = Sets.newHashSetWithExpectedSize(absoluteReferrers.size());
        for (AbsoluteRecordId absoluteReferrer : absoluteReferrers) {
            referrers.add(absoluteReferrer.getRecordId());
        }
        return referrers;
    }

    public Set<AbsoluteRecordId> getAbsoluteReferrers(AbsoluteRecordId record, SchemaId vtag, SchemaId sourceField)
            throws LinkIndexException, InterruptedException {
        long before = System.currentTimeMillis();
        try {
            Query query = new Query();
            query.addEqualsCondition("target", record.toBytes());
            if (vtag != null) {
                query.addEqualsCondition("vtag", vtag.getBytes());
            }
            if (sourceField != null) {
                query.addEqualsCondition("sourcefield", sourceField.getBytes());
            }

            Set<AbsoluteRecordId> result = Sets.newHashSet();

            QueryResult qr = backwardIndex.performQuery(query);
            byte[] id;
            while ((id = qr.next()) != null) {
                result.add(getIdGenerator().absoluteFromBytes(id));
            }
            Closer.close(qr); // Not closed in finally block: avoid HBase contact when there could be connection problems.

            return result;
        } catch (IOException e) {
            throw new LinkIndexException("Error getting referrers for record '" + record + "', vtag '" + vtag
                    + "', field '" + sourceField + "'", e);
        } finally {
            metrics.report(Action.GET_REFERRERS, System.currentTimeMillis() - before);
        }
    }

    public Set<FieldedLink> getFieldedReferrers(RecordId record, SchemaId vtag)
            throws LinkIndexException, InterruptedException {
        long before = System.currentTimeMillis();
        try {
            Query query = new Query();
            query.addEqualsCondition("target", record.toBytes());
            if (vtag != null) {
                query.addEqualsCondition("vtag", vtag.getBytes());
            }

            Set<FieldedLink> result = new HashSet<FieldedLink>();

            QueryResult qr = backwardIndex.performQuery(query);
            byte[] id;
            while ((id = qr.next()) != null) {
                SchemaId sourceField = getIdGenerator().getSchemaId(qr.getData(SOURCE_FIELD_KEY));
                result.add(new FieldedLink(getIdGenerator().absoluteFromBytes(id), sourceField));
            }
            Closer.close(qr); // Not closed in finally block: avoid HBase contact when there could be connection problems.

            return result;
        } catch (IOException e) {
            throw new LinkIndexException("Error getting referrers for record '" + record + "', vtag '" + vtag + "'",
                    e);
        } finally {
            metrics.report(Action.GET_FIELDED_REFERRERS, System.currentTimeMillis() - before);
        }
    }

    public Set<Pair<FieldedLink, SchemaId>> getAllForwardLinks(RecordId record)
            throws LinkIndexException, InterruptedException {
        return this.getAllForwardLinks(getAbsoluteId(record));
    }

    public Set<Pair<FieldedLink, SchemaId>> getAllForwardLinks(AbsoluteRecordId record)
            throws LinkIndexException, InterruptedException {
        long before = System.currentTimeMillis();
        try {
            Query query = new Query();
            query.addEqualsCondition("source", record.toBytes());

            Set<Pair<FieldedLink, SchemaId>> result = new HashSet<Pair<FieldedLink, SchemaId>>();

            QueryResult qr = forwardIndex.performQuery(query);
            byte[] id;
            while ((id = qr.next()) != null) {
                SchemaId sourceField = getIdGenerator().getSchemaId(qr.getData(SOURCE_FIELD_KEY));
                SchemaId vtag = getIdGenerator().getSchemaId(qr.getData(VTAG_KEY));
                result.add(new Pair<FieldedLink, SchemaId>(
                        new FieldedLink(getIdGenerator().absoluteFromBytes(id), sourceField), vtag));
            }
            Closer.close(qr); // Not closed in finally block: avoid HBase contact when there could be connection problems.

            return result;
        } catch (IOException e) {
            throw new LinkIndexException("Error getting forward links for record '" + record + "'", e);
        } finally {
            metrics.report(Action.GET_ALL_FW_LINKS, System.currentTimeMillis() - before);
        }
    }

    public Set<RecordId> getForwardLinks(RecordId record, SchemaId vtag)
            throws LinkIndexException, InterruptedException {
        return getForwardLinks(record, vtag, null);
    }

    public Set<RecordId> getForwardLinks(RecordId record, SchemaId vtag, SchemaId sourceField)
            throws LinkIndexException, InterruptedException {
        Set<AbsoluteRecordId> absoluteLinks = getForwardLinks(getAbsoluteId(record), vtag, sourceField);
        Set<RecordId> relativeLinks = Sets.newHashSetWithExpectedSize(absoluteLinks.size());
        for (AbsoluteRecordId absoluteLink : absoluteLinks) {
            relativeLinks.add(absoluteLink.getRecordId());
        }
        return relativeLinks;
    }

    public Set<AbsoluteRecordId> getForwardLinks(AbsoluteRecordId record, SchemaId vtag, SchemaId sourceField)
            throws LinkIndexException, InterruptedException {
        long before = System.currentTimeMillis();
        try {
            Query query = new Query();
            query.addEqualsCondition("source", record.toBytes());
            if (vtag != null) {
                query.addEqualsCondition("vtag", vtag.getBytes());
            }
            if (sourceField != null) {
                query.addEqualsCondition("sourcefield", sourceField.getBytes());
            }

            Set<AbsoluteRecordId> result = new HashSet<AbsoluteRecordId>();

            QueryResult qr = forwardIndex.performQuery(query);
            byte[] id;
            while ((id = qr.next()) != null) {
                result.add(getIdGenerator().absoluteFromBytes(id));
            }
            Closer.close(qr); // Not closed in finally block: avoid HBase contact when there could be connection problems.

            return result;
        } catch (IOException e) {
            throw new LinkIndexException("Error getting forward links for record '" + record + "', vtag '" + vtag
                    + "', field '" + sourceField + "'", e);
        } finally {
            metrics.report(Action.GET_FW_LINKS, System.currentTimeMillis() - before);
        }
    }

    public Set<FieldedLink> getFieldedForwardLinks(RecordId record, SchemaId vtag)
            throws LinkIndexException, InterruptedException {
        return getFieldedForwardLinks(getAbsoluteId(record), vtag);
    }

    public Set<FieldedLink> getFieldedForwardLinks(AbsoluteRecordId record, SchemaId vtag)
            throws LinkIndexException, InterruptedException {
        long before = System.currentTimeMillis();
        try {
            Query query = new Query();
            query.addEqualsCondition("source", record.toBytes());
            if (vtag != null) {
                query.addEqualsCondition("vtag", vtag.getBytes());
            }

            Set<FieldedLink> result = new HashSet<FieldedLink>();

            QueryResult qr = forwardIndex.performQuery(query);
            byte[] id;
            while ((id = qr.next()) != null) {
                SchemaId sourceField = getIdGenerator().getSchemaId(qr.getData(SOURCE_FIELD_KEY));
                result.add(new FieldedLink(getIdGenerator().absoluteFromBytes(id), sourceField));
            }
            Closer.close(qr); // Not closed in finally block: avoid HBase contact when there could be connection problems.

            return result;
        } catch (IOException e) {
            throw new LinkIndexException(
                    "Error getting forward links for record '" + record + "', vtag '" + vtag + "'", e);
        } finally {
            metrics.report(Action.GET_FW_LINKS, System.currentTimeMillis() - before);
        }
    }

    private IdGenerator getIdGenerator() throws InterruptedException, LinkIndexException {
        // synchronization not an issue, doesn't matter if this happens twice
        // can't assign IdGenerator in constructor since the repository is a premature one
        if (lazyIdGenerator == null) {
            try {
                lazyIdGenerator = repositoryManager.getDefaultRepository().getIdGenerator();
            } catch (RepositoryException e) {
                throw new LinkIndexException(e);
            }
        }

        return lazyIdGenerator;
    }

    private AbsoluteRecordId getAbsoluteId(RecordId recordId) throws LinkIndexException, InterruptedException {
        return getIdGenerator().newAbsoluteRecordId(Table.RECORD.name, recordId);
    }

}