com.yahoo.glimmer.indexing.VerticalDocument.java Source code

Introduction

Here is the source code for com.yahoo.glimmer.indexing.VerticalDocument.java
Source

package com.yahoo.glimmer.indexing;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 *  Unless required by applicable law or agreed to in writing, software distributed under the License is 
 *  distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and limitations under the License.
 *  See accompanying LICENSE file.
 */

import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.semanticweb.yars.nx.BNode;
import org.semanticweb.yars.nx.Resource;
import org.semanticweb.yars.nx.namespace.RDF;

import com.yahoo.glimmer.indexing.RDFDocumentFactory.IndexType;
import com.yahoo.glimmer.indexing.RDFDocumentFactory.RdfCounters;
import com.yahoo.glimmer.indexing.RDFDocumentFactory.ResourceHashLookupException;

/**
 * An RDF document for the {@link IndexType#VERTICAL} index type. It keeps one
 * list of terms per field, where the number of fields is taken from the
 * factory at construction time.
 * 
 * <p>
 * The term lists are only (re)built when {@link #content(int)} asks for them,
 * so operations that do not need the content do not require parsing.
 */
class VerticalDocument extends RDFDocument {
    private static final Log LOG = LogFactory.getLog(VerticalDocument.class);

    /** Terms collected for each field; the outer list is indexed by field index. */
    private final List<List<String>> fields;

    /**
     * Creates a document with one empty term list per field of the factory.
     * 
     * @param factory
     *            the factory this document belongs to; its field count fixes
     *            the number of term lists.
     */
    protected VerticalDocument(VerticalDocumentFactory factory) {
        super(factory);
        final int fieldCount = factory.getFieldCount();
        fields = new ArrayList<List<String>>(fieldCount);
        for (int i = 0; i < fieldCount; i++) {
            fields.add(new ArrayList<String>());
        }
    }

    @Override
    public IndexType getIndexType() {
        return IndexType.VERTICAL;
    }

    /**
     * Rebuilds the per-field term lists from the given relations.
     * 
     * <p>
     * All term lists are cleared first. Relations whose predicate is
     * blacklisted, or for which the factory returns no field index, are
     * counted and skipped. Objects that are a {@link Resource} or
     * {@link BNode} are added in their looked-up (hashed) form, every other
     * object is split into words.
     * 
     * @param relations
     *            the relations of this document.
     * @throws IOException
     *             if reading the words of a literal value fails.
     */
    protected void ensureParsed_(Iterator<Relation> relations) throws IOException {
        for (List<String> field : fields) {
            field.clear();
        }

        while (relations.hasNext()) {
            Relation relation = relations.next();
            String predicate = relation.getPredicate().toString();

            if (RDFDocumentFactory.isOnPredicateBlacklist(predicate)) {
                factory.incrementCounter(RdfCounters.BLACKLISTED_TRIPLES, 1);
                continue;
            }

            // -1 means the predicate has no field, so the triple is not indexed.
            int fieldIndex = factory.getFieldIndex(predicate);
            if (fieldIndex == -1) {
                factory.incrementCounter(RdfCounters.UNINDEXED_PREDICATE_TRIPLES, 1);
                continue;
            }

            List<String> fieldForPredicate = fields.get(fieldIndex);

            boolean indexed;
            if (relation.getObject() instanceof Resource || relation.getObject() instanceof BNode) {
                indexed = addResourceTerms(relation, predicate, fieldForPredicate);
            } else {
                addLiteralTerms(relation, fieldForPredicate);
                indexed = true;
            }

            // A triple whose object could not be looked up is not counted as indexed.
            if (indexed) {
                factory.incrementCounter(RdfCounters.INDEXED_TRIPLES, 1);
            }
        }
    }

    /**
     * Adds the looked-up id of a resource or bnode object to the given terms
     * and, for rdf:type triples, the ids of the ancestors the factory returns
     * for that object.
     * 
     * @return false if the object itself could not be looked up (nothing is
     *         added in that case), true otherwise.
     */
    private boolean addResourceTerms(Relation relation, String predicate, List<String> terms) throws IOException {
        String object = relation.getObject().toString();

        String objectId;
        try {
            objectId = factory.lookupResource(object, true);
        } catch (ResourceHashLookupException rhle) {
            factory.incrementCounter(RdfCounters.OBJECT_NOT_IN_HASH, 1);
            // Log the object that failed. The context is appended through
            // string concatenation so that a missing context cannot throw here.
            LOG.info("Object not in hash:" + object + " (context: " + relation.getContext() + ")");
            return false;
        }
        terms.add(objectId);

        if (predicate.equals(RDF.TYPE.toString())) {
            // For rdf:type triples the super types provided by the factory
            // (from the ontology, if one is set) are indexed as well.
            factory.incrementCounter(RdfCounters.RDF_TYPE_TRIPLES, 1);

            for (String ancestor : factory.getAncestors(object)) {
                String ancestorId;
                try {
                    ancestorId = factory.lookupResource(ancestor, true);
                } catch (ResourceHashLookupException rhle) {
                    factory.incrementCounter(RdfCounters.ANCESTOR_OBJECT_NOT_IN_HASH, 1);
                    LOG.info("Ancestor(" + ancestor + ") of " + object
                            + " not in resources hash function!. Was the same ontology used with the PrepTool?");
                    continue;
                }
                if (ancestorId == null) {
                    throw new IllegalStateException("Resource lookup returned null for ancestor " + ancestor + " of " + object);
                }
                terms.add(ancestorId);
            }
        }
        return true;
    }

    /**
     * Splits the string form of a non-resource object into words and adds
     * every non-empty word accepted by the {@link CombinedTermProcessor} to
     * the given terms.
     */
    private void addLiteralTerms(Relation relation, List<String> terms) throws IOException {
        String object = relation.getObject().toString();

        FastBufferedReader fbr = new FastBufferedReader(object.toCharArray());
        try {
            MutableString word = new MutableString();
            MutableString nonWord = new MutableString();
            while (fbr.next(word, nonWord)) {
                // processTerm receives the mutable word, so the string is
                // only taken after processing.
                if (word.length() != 0 && CombinedTermProcessor.getInstance().processTerm(word)) {
                    terms.add(word.toString());
                }
            }
        } finally {
            // Close the reader even when reading or term processing fails.
            fbr.close();
        }
    }

    /**
     * Returns a reader over the terms collected for the given field, parsing
     * the document first if needed.
     * 
     * @param field
     *            the index of the field; checked with the factory's
     *            ensureFieldIndex before use (presumably rejecting invalid
     *            indices - confirm against the factory).
     * @return a {@link WordReader} over the terms of that field.
     * @throws IOException
     *             if parsing the document fails.
     */
    @Override
    public WordReader content(final int field) throws IOException {
        factory.ensureFieldIndex(field);
        ensureParsed();
        return new WordArrayReader(fields.get(field));
    }
}