Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.marmotta.kiwi.io; import org.apache.commons.lang3.StringUtils; import org.apache.marmotta.commons.io.DataIO; import org.apache.marmotta.commons.vocabulary.SCHEMA; import org.apache.marmotta.commons.vocabulary.XSD; import org.apache.marmotta.kiwi.model.rdf.*; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import org.openrdf.model.vocabulary.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.Date; import java.util.HashMap; import java.util.Locale; import java.util.Map; import java.util.zip.DataFormatException; import java.util.zip.Deflater; import java.util.zip.Inflater; /** * Add file description here! * * @author Sebastian Schaffert (sschaffert@apache.org) */ public class KiWiIO { public static final String NS_DBPEDIA = "http://dbpedia.org/resource/"; public static final String NS_FREEBASE = "http://rdf.freebase.com/ns/"; private static Logger log = LoggerFactory.getLogger(KiWiIO.class); /** * Minimum length of content where we start using compression. */ private static final int LITERAL_COMPRESS_LENGTH = 500; private static final int PREFIX_UNKNOWN = 0; private static final int PREFIX_XSD = 1; private static final int PREFIX_RDF = 2; private static final int PREFIX_RDFS = 3; private static final int PREFIX_SKOS = 4; private static final int PREFIX_DC = 5; private static final int PREFIX_DCT = 6; private static final int PREFIX_OWL = 7; private static final int PREFIX_LOCAL = 8; private static final int PREFIX_REDLINK = 9; private static final int PREFIX_SCHEMA = 10; private static final int PREFIX_DBPEDIA = 11; private static final int PREFIX_FREEBASE = 12; private static final int TYPE_URI = 1; private static final int TYPE_BNODE = 2; private static final int TYPE_BOOLEAN = 3; private static final int TYPE_DATE = 4; private static final int TYPE_DOUBLE = 5; private static final int TYPE_INT = 6; private static final int TYPE_STRING = 7; public static final int MODE_DEFAULT = 1; // no compression public static final int MODE_PREFIX = 2; // prefix compression for some known URI prefixes public static final int MODE_COMPRESSED = 3; // reserved: ZLIB string compression for long literals private static final int LANG_UNKNOWN = 0; private static final int LANG_EN = 1; private static final int LANG_DE = 2; private static final int LANG_FR = 3; private static final int LANG_ES = 4; private static final int LANG_NL = 5; private static final int LANG_PT = 6; private static final int LANG_RU = 7; private static final int LANG_SV = 8; private static final int LANG_NO = 9; private static final int LANG_FI = 10; private static final int LANG_DK = 11; private static final int LANG_IT = 12; private static final int LANG_PL = 13; public static final String HTTP_LOCALHOST = "http://localhost"; public static final String NS_REDLINK = "http://data.redlink.io"; private static Map<Class<? extends KiWiNode>, Integer> classTable = new HashMap<>(); static { classTable.put(KiWiUriResource.class, TYPE_URI); classTable.put(KiWiAnonResource.class, TYPE_BNODE); classTable.put(KiWiBooleanLiteral.class, TYPE_BOOLEAN); classTable.put(KiWiDateLiteral.class, TYPE_DATE); classTable.put(KiWiDoubleLiteral.class, TYPE_DOUBLE); classTable.put(KiWiIntLiteral.class, TYPE_INT); classTable.put(KiWiStringLiteral.class, TYPE_STRING); } private static Map<String, Integer> langTable = new HashMap<>(); static { langTable.put("en", LANG_EN); langTable.put("de", LANG_DE); langTable.put("fr", LANG_FR); langTable.put("es", LANG_ES); langTable.put("nl", LANG_NL); langTable.put("pt", LANG_PT); langTable.put("ru", LANG_RU); langTable.put("sv", LANG_SV); langTable.put("no", LANG_NO); langTable.put("fi", LANG_FI); langTable.put("dk", LANG_DK); langTable.put("it", LANG_IT); langTable.put("pl", LANG_PL); } /** * Efficiently serialize a KiWiNode to a DataOutput destination. The type of node will be encoded with a single * byte usinbg the TYPE_* constants defined in this class * * @param output DataOutput destination * @param node KiWiNode to serialize * @throws IOException */ public static void writeNode(DataOutput output, KiWiNode node) throws IOException { if (node == null) { output.writeByte(0); } else { int type = classTable.get(node.getClass()); output.writeByte(type); switch (type) { case TYPE_URI: writeURI(output, (KiWiUriResource) node); break; case TYPE_BNODE: writeBNode(output, (KiWiAnonResource) node); break; case TYPE_BOOLEAN: writeBooleanLiteral(output, (KiWiBooleanLiteral) node); break; case TYPE_DATE: writeDateLiteral(output, (KiWiDateLiteral) node); break; case TYPE_DOUBLE: writeDoubleLiteral(output, (KiWiDoubleLiteral) node); break; case TYPE_INT: writeIntLiteral(output, (KiWiIntLiteral) node); break; case TYPE_STRING: writeStringLiteral(output, (KiWiStringLiteral) node); break; default: throw new IllegalArgumentException("unknown KiWiNode type: " + node.getClass()); } } } /** * Read a KiWiNode serialized with writeNode and return it. The type indicator is used to determine which type * of resource to instantiate. * * @param input DataInput source * @return an instance of a subclass of KiWiNode, depending on the type indicator read from the source * @throws IOException */ public static KiWiNode readNode(DataInput input) throws IOException { int type = input.readByte(); switch (type) { case 0: return null; case TYPE_URI: return readURI(input); case TYPE_BNODE: return readBNode(input); case TYPE_BOOLEAN: return readBooleanLiteral(input); case TYPE_DATE: return readDateLiteral(input); case TYPE_DOUBLE: return readDoubleLiteral(input); case TYPE_INT: return readIntLiteral(input); case TYPE_STRING: return readStringLiteral(input); default: throw new IllegalArgumentException("unknown KiWiNode type: " + type); } } /** * Efficiently serialize a KiWiUriResource to a DataOutput destination, using prefix compression for commonly used * prefixes. * * @param out DataOutput destination * @param uri KiWiUriResource to serialize * @throws IOException */ public static void writeURI(DataOutput out, KiWiUriResource uri) throws IOException { if (uri == null) { out.writeLong(-1L); } else { out.writeLong(uri.getId()); // compression for commonly used constant prefixes if (uri.stringValue().startsWith(XSD.NAMESPACE)) { out.writeByte(PREFIX_XSD); DataIO.writeString(out, uri.stringValue().substring(XSD.NAMESPACE.length())); } else if (uri.stringValue().startsWith(RDF.NAMESPACE)) { out.writeByte(PREFIX_RDF); DataIO.writeString(out, uri.stringValue().substring(RDF.NAMESPACE.length())); } else if (uri.stringValue().startsWith(RDFS.NAMESPACE)) { out.writeByte(PREFIX_RDFS); DataIO.writeString(out, uri.stringValue().substring(RDFS.NAMESPACE.length())); } else if (uri.stringValue().startsWith(SKOS.NAMESPACE)) { out.writeByte(PREFIX_SKOS); DataIO.writeString(out, uri.stringValue().substring(SKOS.NAMESPACE.length())); } else if (uri.stringValue().startsWith(DC.NAMESPACE)) { out.writeByte(PREFIX_DC); DataIO.writeString(out, uri.stringValue().substring(DC.NAMESPACE.length())); } else if (uri.stringValue().startsWith(DCTERMS.NAMESPACE)) { out.writeByte(PREFIX_DCT); DataIO.writeString(out, uri.stringValue().substring(DCTERMS.NAMESPACE.length())); } else if (uri.stringValue().startsWith(OWL.NAMESPACE)) { out.writeByte(PREFIX_OWL); DataIO.writeString(out, uri.stringValue().substring(OWL.NAMESPACE.length())); } else if (uri.stringValue().startsWith(SCHEMA.NAMESPACE)) { out.writeByte(PREFIX_SCHEMA); DataIO.writeString(out, uri.stringValue().substring(SCHEMA.NAMESPACE.length())); } else if (uri.stringValue().startsWith(NS_REDLINK)) { out.writeByte(PREFIX_REDLINK); DataIO.writeString(out, uri.stringValue().substring(NS_REDLINK.length())); } else if (uri.stringValue().startsWith(NS_DBPEDIA)) { out.writeByte(PREFIX_DBPEDIA); DataIO.writeString(out, uri.stringValue().substring(NS_DBPEDIA.length())); } else if (uri.stringValue().startsWith(NS_FREEBASE)) { out.writeByte(PREFIX_FREEBASE); DataIO.writeString(out, uri.stringValue().substring(NS_FREEBASE.length())); } else if (uri.stringValue().startsWith(HTTP_LOCALHOST)) { out.writeByte(PREFIX_LOCAL); DataIO.writeString(out, uri.stringValue().substring(HTTP_LOCALHOST.length())); } else { out.writeByte(PREFIX_UNKNOWN); DataIO.writeString(out, uri.stringValue()); } out.writeLong(uri.getCreated().getTime()); } } /** * Read a KiWiUriResource serialized with writeURI and return it. * * @param input DataInput source * @return a KiWiUriResource * @throws IOException */ public static KiWiUriResource readURI(DataInput input) throws IOException { long id = input.readLong(); if (id == -1) { return null; } else { int prefixMode = input.readByte(); String uriPrefix = ""; String uriSuffix = DataIO.readString(input); switch (prefixMode) { case PREFIX_XSD: uriPrefix = XSD.NAMESPACE; break; case PREFIX_RDF: uriPrefix = RDF.NAMESPACE; break; case PREFIX_RDFS: uriPrefix = RDFS.NAMESPACE; break; case PREFIX_SKOS: uriPrefix = SKOS.NAMESPACE; break; case PREFIX_DC: uriPrefix = DC.NAMESPACE; break; case PREFIX_DCT: uriPrefix = DCTERMS.NAMESPACE; break; case PREFIX_OWL: uriPrefix = OWL.NAMESPACE; break; case PREFIX_SCHEMA: uriPrefix = SCHEMA.NAMESPACE; break; case PREFIX_REDLINK: uriPrefix = NS_REDLINK; break; case PREFIX_DBPEDIA: uriPrefix = NS_DBPEDIA; break; case PREFIX_FREEBASE: uriPrefix = NS_FREEBASE; break; case PREFIX_LOCAL: uriPrefix = HTTP_LOCALHOST; break; default: uriPrefix = ""; break; } Date created = new Date(input.readLong()); KiWiUriResource r = new KiWiUriResource(uriPrefix + uriSuffix, created); r.setId(id); return r; } } /** * Efficiently serialize a KiWiAnonResource to a DataOutput destination. * * @param out the destination * @param bnode the KiWiAnonResource to serialize * @throws IOException */ public static void writeBNode(DataOutput out, KiWiAnonResource bnode) throws IOException { if (bnode == null) { out.writeLong(-1L); } else { out.writeLong(bnode.getId()); DataIO.writeString(out, bnode.stringValue()); out.writeLong(bnode.getCreated().getTime()); } } /** * Read a KiWiAnonResource serialized with writeBNode from a DataInput source * * @param input the source * @return the de-serialized KiWiAnonResource * @throws IOException */ public static KiWiAnonResource readBNode(DataInput input) throws IOException { long id = input.readLong(); if (id == -1) { return null; } else { String anonId = DataIO.readString(input); Date created = new Date(input.readLong()); KiWiAnonResource r = new KiWiAnonResource(anonId, created); r.setId(id); return r; } } /** * Efficiently serialize a KiWiBooleanLiteral to a DataOutput destination. * * @param out the destination * @param literal the KiWiBooleanLiteral to serialize * @throws IOException */ public static void writeBooleanLiteral(DataOutput out, KiWiBooleanLiteral literal) throws IOException { if (literal == null) { out.writeLong(-1L); } else { out.writeLong(literal.getId()); out.writeBoolean(literal.booleanValue()); writeURI(out, literal.getType()); out.writeLong(literal.getCreated().getTime()); } } /** * Read a KiWiBooleanLiteral serialized with writeBooleanLiteral from a DataInput source * * @param input the source * @return the de-serialized KiWiBooleanLiteral * @throws IOException */ public static KiWiBooleanLiteral readBooleanLiteral(DataInput input) throws IOException { long id = input.readLong(); if (id == -1) { return null; } else { boolean content = input.readBoolean(); KiWiUriResource dtype = readURI(input); Date created = new Date(input.readLong()); KiWiBooleanLiteral r = new KiWiBooleanLiteral(content, dtype, created); r.setId(id); return r; } } /** * Efficiently serialize a KiWiDateLiteral to a DataOutput destination. * * @param out the destination * @param literal the KiWiDateLiteral to serialize * @throws IOException */ public static void writeDateLiteral(DataOutput out, KiWiDateLiteral literal) throws IOException { if (literal == null) { out.writeLong(-1L); } else { out.writeLong(literal.getId()); out.writeLong(literal.getDateContent().getMillis()); out.writeInt(literal.getDateContent().getZone().getOffset(literal.getDateContent())); writeURI(out, literal.getType()); out.writeLong(literal.getCreated().getTime()); } } /** * Read a KiWiDateLiteral serialized with writeDateLiteral from a DataInput source * * @param input the source * @return the de-serialized KiWiDateLiteral * @throws IOException */ public static KiWiDateLiteral readDateLiteral(DataInput input) throws IOException { long id = input.readLong(); if (id == -1) { return null; } else { DateTime content = new DateTime(input.readLong(), DateTimeZone.forOffsetMillis(input.readInt())); KiWiUriResource dtype = readURI(input); Date created = new Date(input.readLong()); KiWiDateLiteral r = new KiWiDateLiteral(content, dtype, created); r.setId(id); return r; } } /** * Efficiently serialize a KiWiDoubleLiteral to a DataOutput destination. * * @param out the destination * @param literal the KiWiDoubleLiteral to serialize * @throws IOException */ public static void writeDoubleLiteral(DataOutput out, KiWiDoubleLiteral literal) throws IOException { if (literal == null) { out.writeLong(-1L); } else { out.writeLong(literal.getId()); out.writeDouble(literal.getDoubleContent()); writeURI(out, literal.getType()); out.writeLong(literal.getCreated().getTime()); } } /** * Read a KiWiDoubleLiteral serialized with writeDoubleLiteral from a DataInput source * * @param input the source * @return the de-serialized KiWiDoubleLiteral * @throws IOException */ public static KiWiDoubleLiteral readDoubleLiteral(DataInput input) throws IOException { long id = input.readLong(); if (id == -1) { return null; } else { double content = input.readDouble(); KiWiUriResource dtype = readURI(input); Date created = new Date(input.readLong()); KiWiDoubleLiteral r = new KiWiDoubleLiteral(content, dtype, created); r.setId(id); return r; } } /** * Efficiently serialize a KiWiIntLiteral to a DataOutput destination. * * @param out the destination * @param literal the KiWiIntLiteral to serialize * @throws IOException */ public static void writeIntLiteral(DataOutput out, KiWiIntLiteral literal) throws IOException { if (literal == null) { out.writeLong(-1L); } else { out.writeLong(literal.getId()); out.writeLong(literal.getIntContent()); writeURI(out, literal.getType()); out.writeLong(literal.getCreated().getTime()); } } /** * Read a KiWiIntLiteral serialized with writeIntLiteral from a DataInput source * * @param input the source * @return the de-serialized KiWiIntLiteral * @throws IOException */ public static KiWiIntLiteral readIntLiteral(DataInput input) throws IOException { long id = input.readLong(); if (id == -1) { return null; } else { long content = input.readLong(); KiWiUriResource dtype = readURI(input); Date created = new Date(input.readLong()); KiWiIntLiteral r = new KiWiIntLiteral(content, dtype, created); r.setId(id); return r; } } /** * Efficiently serialize a KiWiStringLiteral to a DataOutput destination. * * @param out the destination * @param literal the KiWiStringLiteral to serialize * @throws IOException */ public static void writeStringLiteral(DataOutput out, KiWiStringLiteral literal) throws IOException { if (literal == null) { out.writeLong(-1L); } else { out.writeLong(literal.getId()); writeContent(out, literal.getContent()); if (langTable.containsKey(literal.getLanguage())) { out.writeByte(langTable.get(literal.getLanguage())); } else { out.writeByte(LANG_UNKNOWN); DataIO.writeString(out, literal.getLanguage()); } writeURI(out, literal.getType()); out.writeLong(literal.getCreated().getTime()); } } /** * Read a KiWiStringLiteral serialized with writeStringLiteral from a DataInput source * * @param input the source * @return the de-serialized KiWiStringLiteral * @throws IOException */ public static KiWiStringLiteral readStringLiteral(DataInput input) throws IOException { long id = input.readLong(); if (id == -1) { return null; } else { String content = readContent(input); byte langB = input.readByte(); String lang; switch (langB) { case LANG_EN: lang = "en"; break; case LANG_DE: lang = "de"; break; case LANG_FR: lang = "fr"; break; case LANG_ES: lang = "es"; break; case LANG_IT: lang = "it"; break; case LANG_PT: lang = "pt"; break; case LANG_NL: lang = "nl"; break; case LANG_SV: lang = "sv"; break; case LANG_NO: lang = "no"; break; case LANG_FI: lang = "fi"; break; case LANG_RU: lang = "ru"; break; case LANG_DK: lang = "dk"; break; case LANG_PL: lang = "pl"; break; default: lang = DataIO.readString(input); } KiWiUriResource dtype = readURI(input); Date created = new Date(input.readLong()); KiWiStringLiteral r = new KiWiStringLiteral(content, lang != null ? Locale.forLanguageTag(lang) : null, dtype, created); r.setId(id); return r; } } /** * Efficiently serialize a KiWiTriple to a DataOutput destination. * * @param output the destination * @param triple the KiWiTriple to serialize * @throws IOException */ public static void writeTriple(DataOutput output, KiWiTriple triple) throws IOException { output.writeLong(triple.getId()); // in case subject and object are both uris we use a special prefix-compressed mode if (triple.getSubject().isUriResource() && triple.getObject().isUriResource()) { String sUri = triple.getSubject().stringValue(); String oUri = triple.getObject().stringValue(); String prefix = StringUtils.getCommonPrefix(sUri, oUri); output.writeByte(MODE_PREFIX); DataIO.writeString(output, prefix); output.writeLong(triple.getSubject().getId()); DataIO.writeString(output, sUri.substring(prefix.length())); output.writeLong(triple.getSubject().getCreated().getTime()); writeURI(output, triple.getPredicate()); output.writeLong(triple.getObject().getId()); DataIO.writeString(output, oUri.substring(prefix.length())); output.writeLong(triple.getObject().getCreated().getTime()); } else { output.writeByte(MODE_DEFAULT); writeNode(output, triple.getSubject()); writeURI(output, triple.getPredicate()); writeNode(output, triple.getObject()); } writeNode(output, triple.getContext()); writeNode(output, triple.getCreator()); output.writeBoolean(triple.isDeleted()); output.writeBoolean(triple.isInferred()); output.writeBoolean(triple.isNewTriple()); output.writeLong(triple.getCreated().getTime()); if (triple.getDeletedAt() != null) { output.writeLong(triple.getDeletedAt().getTime()); } else { output.writeLong(0); } } /** * Read a KiWiTriple serialized with writeTriple from a DataInput source * * @param input the source * @return the de-serialized KiWiTriple * @throws IOException */ public static KiWiTriple readTriple(DataInput input) throws IOException { KiWiTriple result = new KiWiTriple(); result.setId(input.readLong()); int mode = input.readByte(); if (mode == MODE_PREFIX) { String prefix = DataIO.readString(input); long sId = input.readLong(); String sUri = prefix + DataIO.readString(input); long sTime = input.readLong(); KiWiUriResource s = new KiWiUriResource(sUri); s.setId(sId); s.setCreated(new Date(sTime)); result.setSubject(s); result.setPredicate(readURI(input)); long oId = input.readLong(); String oUri = prefix + DataIO.readString(input); long oTime = input.readLong(); KiWiUriResource o = new KiWiUriResource(oUri); o.setId(oId); o.setCreated(new Date(oTime)); result.setObject(o); } else { result.setSubject((KiWiResource) readNode(input)); result.setPredicate(readURI(input)); result.setObject(readNode(input)); } result.setContext((KiWiResource) readNode(input)); result.setCreator((KiWiResource) readNode(input)); result.setDeleted(input.readBoolean()); result.setInferred(input.readBoolean()); result.setNewTriple(input.readBoolean()); result.setCreated(new Date(input.readLong())); long deletedAt = input.readLong(); if (deletedAt > 0) { result.setDeletedAt(new Date(deletedAt)); } return result; } /** * Read a potentially compressed string from the data input. * * @param in * @return * @throws IOException */ private static String readContent(DataInput in) throws IOException { int mode = in.readByte(); if (mode == MODE_COMPRESSED) { try { int strlen = in.readInt(); int buflen = in.readInt(); byte[] buffer = new byte[buflen]; in.readFully(buffer); Inflater decompressor = new Inflater(true); decompressor.setInput(buffer); byte[] data = new byte[strlen]; decompressor.inflate(data); decompressor.end(); return new String(data, "UTF-8"); } catch (DataFormatException ex) { throw new IllegalStateException("input data is not valid", ex); } } else { return DataIO.readString(in); } } /** * Write a string to the data output. In case the string length exceeds LITERAL_COMPRESS_LENGTH, uses a LZW * compressed format, otherwise writes the plain bytes. * * @param out output destination to write to * @param content string to write * @throws IOException */ private static void writeContent(DataOutput out, String content) throws IOException { if (content.length() > LITERAL_COMPRESS_LENGTH) { // temporary buffer of the size of bytes in the content string (assuming that the compressed data will fit into it) byte[] data = content.getBytes("UTF-8"); byte[] buffer = new byte[data.length]; Deflater compressor = new Deflater(Deflater.BEST_COMPRESSION, true); compressor.setInput(data); compressor.finish(); int length = compressor.deflate(buffer); // only use compressed version if it is smaller than the number of bytes used by the string if (length < buffer.length) { log.debug("compressed string with {} bytes; compression ratio {}", data.length, (double) length / data.length); out.writeByte(MODE_COMPRESSED); out.writeInt(data.length); out.writeInt(length); out.write(buffer, 0, length); } else { log.warn("compressed length exceeds string buffer: {} > {}", length, buffer.length); out.writeByte(MODE_DEFAULT); DataIO.writeString(out, content); } compressor.end(); } else { out.writeByte(MODE_DEFAULT); DataIO.writeString(out, content); } } }