io.ecarf.core.term.dictionary.TermDictionary.java Source code

Introduction

Here is the source code for io.ecarf.core.term.dictionary.TermDictionary.java
Source

/**
 * The contents of this file may be used under the terms of the Apache License, Version 2.0
 * in which case, the provisions of the Apache License Version 2.0 are applicable instead of those above.
 *
 * Copyright 2014, Ecarf.io
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.ecarf.core.term.dictionary;

import io.cloudex.framework.utils.ObjectUtils;
import io.ecarf.core.term.TermUtils;
import io.ecarf.core.triple.SchemaURIType;
import io.ecarf.core.utils.BiMapJsonDeserializer;
import io.ecarf.core.utils.NumberUtils;
import io.ecarf.core.utils.Utils;

import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Serializable;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.google.common.base.Stopwatch;
import com.google.common.collect.BiMap;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.stream.JsonReader;

/**
 * Represents a term dictionary holding URI parts and blank nodes as the keys and their encoded
 * integer ids as the values. Literals are not encoded by this dictionary.
 *
 * <p>Encoded terms are {@code long} values. Values from 0 up to and including
 * {@link #RESOURCE_ID_INVALID} are used as-is (built-in RDF/OWL URIs and the invalid resource marker).
 * Every other encoded value stores three flag bits in its lowest bits, with the dictionary id(s) of the
 * term part(s) in the remaining higher bits:
 * <ul>
 *   <li>bit 0 - set when the term consists of one part, clear when it consists of two parts</li>
 *   <li>bit 1 - clear for http, set for https</li>
 *   <li>bit 2 - set when the URL ends with the URI separator</li>
 * </ul>
 * Blank nodes are encoded as a single part (bit 0 set, other flags clear) and are recognised when
 * decoding by the leading underscore of the dictionary key.
 *
 * @author Omer Dawelbeit (omerio)
 *
 */
public abstract class TermDictionary implements Serializable {

    private static final long serialVersionUID = 1314487458787673648L;

    private static final Log log = LogFactory.getLog(TermDictionary.class);

    /** Gson instance with a {@link BiMap} type adapter registered, used to read dictionaries from JSON. */
    public static final Gson GSON = new GsonBuilder().registerTypeAdapter(BiMap.class, new BiMapJsonDeserializer())
            .create();

    /** Encoded value returned by {@link #encode(String)} for URIs that cannot be split into parts. */
    public static final int RESOURCE_ID_INVALID = 100;

    /** Term returned by {@link #decode(long)} for {@link #RESOURCE_ID_INVALID}. */
    public static final String INVALID_RESOURCE = "<invalid>";

    /**
     * Resources start at 101, leaving 0 to 99 to be assigned to RDF, RDFS, OWL & OWL2 URIs.
     * Note that {@link #add(String)} increments the counter before assigning, so the first id it
     * hands out is {@code RESOURCE_ID_START + 1}.
     */
    public static final int RESOURCE_ID_START = 101;

    /** Number of low-order bits of an encoded value reserved for the flags below. */
    private static final int FLAG_BITS = 3;

    /** Mask selecting the flag bits of an encoded value. */
    private static final int FLAG_MASK = 0b111;

    /** Set when the term has one part, clear when it has two parts. */
    private static final int FLAG_SINGLE_PART = 0b001;

    /** Clear for http, set for https. */
    private static final int FLAG_HTTPS = 0b010;

    /** Set when the URL ends with the URI separator. */
    private static final int FLAG_TRAILING_SLASH = 0b100;

    /** The largest id handed out or registered so far. */
    private int largestResourceId = RESOURCE_ID_START;

    // The default number of bits used to store information about the parts of an encoded dictionary entry
    // 0, to specify a blank node, 1 to n to specify n number of parts (currently unused)
    //private int infoBits = 8;

    /**
     * Using a key get a value
     * @param key the dictionary key (a URI part or a blank node)
     * @return the id stored for the key; callers in this class treat {@code null} as "not found"
     */
    public abstract Integer get(String key);

    /**
     * Given a value return a key
     * @param value the id to look up
     * @return the key stored for the id; callers in this class treat {@code null} as "not found"
     */
    public abstract String get(Integer value);

    /**
     * @return the size of this dictionary as reported by the implementation
     */
    public abstract int size();

    /**
     * Store a key with its id
     * @param key the dictionary key
     * @param value the id for the key
     */
    public abstract void put(String key, Integer value);

    /**
     * @param key the dictionary key
     * @return true if the dictionary already holds the key
     */
    public abstract boolean containsKey(String key);

    /**
     * Store all the entries of the provided map
     * @param map the entries to store
     */
    protected abstract void putAll(Map<? extends String, ? extends Integer> map);

    /**
     * Encode a blank node. The result is the dictionary id of the blank node shifted left by the
     * three flag bits, with only the "one part" flag set.
     * @param blankNode the blank node, which must already be in the dictionary
     * @return the encoded value
     * @throws IllegalArgumentException if the blank node is not in the dictionary
     */
    public long encodeBlankNode(String blankNode) {
        // widen to long BEFORE shifting, otherwise the shift is done in 32-bit arithmetic
        // and ids of 2^28 or more overflow
        long id = this.requireId(blankNode);

        return (id << FLAG_BITS) | FLAG_SINGLE_PART;
    }

    /**
     * Encode a term in the format <http://dbpedia.org/resource/Alexander_II_of_Russia>.
     * Built-in RDF/OWL terms are encoded as their dictionary id. Other terms are split into parts
     * using {@link TermUtils#splitIntoTwo(String)}, the ids of the parts are joined using
     * {@link NumberUtils#join(long...)} and the three flag bits are appended. Terms for which no parts
     * are returned are encoded as {@link #RESOURCE_ID_INVALID}.
     * @param term the term to encode, including the enclosing angle brackets
     * @return the encoded value
     * @throws IllegalArgumentException if the term, or one of its parts, is not in the dictionary
     */
    public long encode(String term) {

        if (SchemaURIType.RDF_OWL_TERMS.contains(term)) {
            // a missing entry is reported the same way as a missing part, rather than
            // failing with a NullPointerException on unboxing
            return this.requireId(term);
        }

        // drop the first and last characters (the enclosing angle brackets)
        String url = term.substring(1, term.length() - 1);

        // TermUtils.split is not used here because we need to efficiently check for https.
        // NOTE(review): anything that does not start with TermUtils.HTTP is flagged as https,
        // including URLs with other schemes - this is the existing behaviour, confirm it is intended
        boolean https = StringUtils.removeStart(url, TermUtils.HTTP).length() == url.length();

        // this is a lot faster than String.split or StringUtils.split
        List<String> parts = TermUtils.splitIntoTwo(term);

        if (parts.isEmpty()) {
            // invalid URIs, e.g. <http:///www.taotraveller.com> is parsed by NxParser as http:///
            return RESOURCE_ID_INVALID;
        }

        // 1- encode parts
        long[] values = new long[parts.size()];
        int index = 0;
        for (String part : parts) {
            values[index++] = this.requireId(part);
        }

        // 2- compose
        long value = NumberUtils.join(values);

        // 3- add flags
        int flags = 0;

        if (parts.size() == 1) {
            flags |= FLAG_SINGLE_PART;
        }

        if (https) {
            flags |= FLAG_HTTPS;
        }

        // do we have a slash at the end of the URL or not
        if (url.lastIndexOf(TermUtils.URI_SEP) == (url.length() - 1)) {
            flags |= FLAG_TRAILING_SLASH;
        }

        return (value << FLAG_BITS) | flags;
    }

    /**
     * Decode the value provided into a term. Values from 0 to {@link #RESOURCE_ID_INVALID} are looked
     * up directly (or mapped to {@link #INVALID_RESOURCE}); all other values are treated as the
     * output of {@link #encode(String)} or {@link #encodeBlankNode(String)}.
     * @param value the encoded value
     * @return the decoded term; blank nodes are returned as stored, URIs are returned enclosed in
     *      angle brackets. May be {@code null} for an unknown built-in id
     * @throws IllegalArgumentException if a single part value has no entry in the dictionary
     */
    public String decode(long value) {

        if (value >= 0 && value <= RESOURCE_ID_INVALID) {

            if (value == RESOURCE_ID_INVALID) {
                return INVALID_RESOURCE;
            }

            // built in RDF or OWL URI
            return this.get((int) value);
        }

        // extract the flags
        int flags = (int) (value & FLAG_MASK);
        boolean singlePart = (flags & FLAG_SINGLE_PART) != 0;
        String protocol = ((flags & FLAG_HTTPS) != 0) ? TermUtils.HTTPS : TermUtils.HTTP;
        boolean hasSlash = (flags & FLAG_TRAILING_SLASH) != 0;

        // remove the 3 flags bits
        long encoded = value >> FLAG_BITS;

        String term;

        if (singlePart) {

            term = this.get((int) encoded);

            if (term == null) {
                throw new IllegalArgumentException("No dictionary entry found for id: " + encoded);
            }

            if (term.startsWith("_")) {
                // blank nodes are returned as stored, without protocol or angle brackets
                return term;
            }

        } else {

            // 2- decompose
            long[] values = NumberUtils.disjoin(2, encoded);

            // 3- decode parts
            List<String> parts = new ArrayList<>(values.length);
            for (long partValue : values) {
                parts.add(this.get((int) partValue));
            }

            term = StringUtils.join(parts, TermUtils.URI_SEP);
        }

        // 4- Assemble the term values
        StringBuilder termBuilder = (new StringBuilder(250)).append('<').append(protocol).append(term);

        if (hasSlash) {
            termBuilder.append(TermUtils.URI_SEP);
        }

        return termBuilder.append('>').toString();
    }

    /**
     * add a part to the dictionary (part of a URI or a blank node). Parts that are already in the
     * dictionary are left untouched, new parts are assigned the next id.
     * @param part the part to add
     */
    public void add(String part) {
        if (!this.containsKey(part)) {
            this.largestResourceId++;
            this.put(part, this.largestResourceId);
        }
    }

    /**
     * Add an entry to the dictionary with an explicit id, raising the largest resource id
     * if the provided id is larger.
     * @param term the dictionary key
     * @param id the id for the key
     */
    protected void add(String term, Integer id) {
        if (this.largestResourceId < id) {
            this.largestResourceId = id;
        }
        this.put(term, id);
    }

    /**
     * Populate the provided dictionary with the default RDF and OWL URIs
     * (every value of {@link SchemaURIType} with its id)
     * @param dictionary the dictionary to populate
     * @return the same dictionary instance
     */
    public static TermDictionary populateRDFOWLData(TermDictionary dictionary) {

        for (SchemaURIType uri : SchemaURIType.values()) {
            dictionary.add(uri.uri, uri.id);
        }

        return dictionary;
    }

    //-------------------------------------------------------- Binary Serialization 
    /**
     * Serialize this dictionary to a binary file, optionally compressed
     * @param filename the file to write to
     * @param compress passed through to {@link Utils#objectToFile}
     * @return the filename provided
     * @throws IOException
     */
    public String toFile(String filename, boolean compress) throws IOException {
        Stopwatch stopwatch = Stopwatch.createStarted();

        Utils.objectToFile(filename, this, compress, false);

        log.debug("TIMER# serialized dictionary to file: " + filename + ", in: " + stopwatch);

        return filename;
    }

    /**
     * De-serialize a TermDictionary instance from a binary file.
     * @param file - a file path
     * @param compressed passed through to {@link Utils#objectFromFile}
     * @return a TermDictionary instance
     * @throws FileNotFoundException if the file is not found
     * @throws IOException if the io operation fails
     * @throws ClassNotFoundException 
     */
    public static TermDictionary fromFile(String file, boolean compressed)
            throws FileNotFoundException, IOException, ClassNotFoundException {

        Stopwatch stopwatch = Stopwatch.createStarted();

        TermDictionary dictionary = Utils.objectFromFile(file, TermDictionary.class, compressed, false);

        log.debug("TIMER# deserialized dictionary from file: " + file + ", in: " + stopwatch);

        return dictionary;

    }

    //-------------------------------------------------------- JSON Serialization 
    /**
     * Convert to JSON. Note this uses {@link ObjectUtils#GSON} rather than {@link #GSON}.
     * @return json representation of this object.
     */
    public String toJson() {
        return ObjectUtils.GSON.toJson(this);
    }

    /**
     * De-serialize a TermDictionary instance from an uncompressed json file.
     * @param jsonFile - a file path
     * @return a TermDictionary instance
     * @throws FileNotFoundException if the file is not found
     * @throws IOException if the json conversion fails
     */
    public static TermDictionary fromJsonFile(String jsonFile) throws FileNotFoundException, IOException {
        return fromJsonFile(jsonFile, false);
    }

    /**
     * De-serialize a TermDictionary instance from a json file. When the file is compressed it is
     * first uncompressed using {@link Utils#unCompressFile(String)} and the resulting file is read.
     * @param jsonFile - a file path
     * @param compressed true if the file needs to be uncompressed first
     * @return a TermDictionary instance
     * @throws FileNotFoundException if the file is not found
     * @throws IOException if the json conversion fails
     */
    public static TermDictionary fromJsonFile(String jsonFile, boolean compressed)
            throws FileNotFoundException, IOException {
        Stopwatch stopwatch = Stopwatch.createStarted();

        String filename = compressed ? Utils.unCompressFile(jsonFile) : jsonFile;

        // NOTE(review): FileReader decodes using the platform default charset; left as-is because the
        // charset used when writing the file is not visible from here - confirm both sides agree
        // NOTE(review): TermDictionary is abstract - confirm Gson is able to create an instance for it
        try (FileReader reader = new FileReader(filename)) {
            TermDictionary dictionary = GSON.fromJson(new JsonReader(reader), TermDictionary.class);

            log.debug("TIMER# deserialized dictionary from JSON file: " + jsonFile + ", in: " + stopwatch);

            return dictionary;
        }
    }

    /**
     * Serialize this dictionary to a json file, optionally compressed
     * @param filename the file to write the json to
     * @param compress true to compress the written file using {@link Utils#compressFile(String)}
     * @return the provided filename, or the value returned by {@link Utils#compressFile(String)}
     *      when compress is true
     * @throws IOException
     */
    public String toJsonFile(String filename, boolean compress) throws IOException {
        Stopwatch stopwatch = Stopwatch.createStarted();
        Utils.objectToJsonFile(filename, this);

        String output = compress ? Utils.compressFile(filename) : filename;

        log.debug("TIMER# serialized dictionary to JSON file: " + output + ", in: " + stopwatch);

        return output;
    }

    /**
     * De-serialize a TermDictionary instance from a json string.
     * @param json - a json string
     * @return a TermDictionary instance
     */
    public static TermDictionary fromJsonString(String json) {
        return GSON.fromJson(json, TermDictionary.class);
    }

    /**
     * @return the largestResourceId
     */
    public int getLargestResourceId() {
        return largestResourceId;
    }

    /**
     * @param largestResourceId the largestResourceId to set
     */
    protected void setLargestResourceId(int largestResourceId) {
        this.largestResourceId = largestResourceId;
    }

    /* (non-Javadoc)
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        return new ToStringBuilder(this).append("largestResourceId", this.largestResourceId)
                .append("size", this.size()).toString();
    }

    /**
     * Look up the id of a key, failing with a descriptive error when the key is missing.
     * @param key the dictionary key
     * @return the id stored for the key
     * @throws IllegalArgumentException if the key is not in the dictionary
     */
    private int requireId(String key) {
        Integer id = this.get(key);
        if (id == null) {
            throw new IllegalArgumentException("Term part not found in the dictionary: " + key);
        }
        return id;
    }

}