// Java tutorial
/**
 * Copyright 2011, Campinas Stephane
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy
 * of the License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
/**
 * @project trec-entity-tool
 * @author Campinas Stephane [ 3 Jun 2011 ]
 * @link stephane.campinas@deri.org
 */
package org.sindice.siren.index;

import java.io.IOException;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListSet;

import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.lang.StringUtils;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RDFParser;
import org.openrdf.rio.helpers.StatementCollector;
import org.openrdf.rio.ntriples.NTriplesParser;

/**
 * Helper functions for indexing.
 * <p>
 * This class keeps shared static scratch state ({@link #bbuffer}, {@link #sb},
 * the N-Triples parser); every public method is {@code synchronized} so that
 * concurrent callers do not corrupt that state.
 */
public class Utils {

  /*
   * Byte buffer used for reading the compressed tar files. Shared scratch
   * state: only accessed from the synchronized getFile method.
   */
  private static final ByteBuffer bbuffer = ByteBuffer.allocate(1024);

  /* Predicates that receive special treatment during flattening. */
  private static final String RDF_TYPE =
      "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>";
  private static final String DC_DESCRIPTION =
      "<http://purl.org/dc/elements/1.1/description>";
  private static final String DBP_ABSTRACT =
      "<http://dbpedia.org/ontology/abstract>";
  private static final String RDFS_LABEL =
      "<http://www.w3.org/2000/01/rdf-schema#label>";

  /* Scratch builder shared by the synchronized methods below. */
  private static final StringBuilder sb = new StringBuilder();

  /* Lazily created N-Triples parser and its statement sink (see initParser). */
  private static RDFParser parser = null;
  private static StatementCollector collector = null;

  /*
   * Efficient byte-to-char conversion: a precomputed table mapping every byte
   * value (-128..127, shifted by -Byte.MIN_VALUE) to the char it decodes to
   * in the platform default encoding.
   */
  private static final int BYTE_RANGE = (1 + Byte.MAX_VALUE) - Byte.MIN_VALUE;
  private static byte[] allBytes = new byte[BYTE_RANGE];
  private static char[] byteToChars = new char[BYTE_RANGE];

  static {
    for (int i = Byte.MIN_VALUE; i <= Byte.MAX_VALUE; i++) {
      allBytes[i - Byte.MIN_VALUE] = (byte) i;
    }
    /*
     * Decode each byte individually. The previous code decoded
     * (Byte.MAX_VALUE - Byte.MIN_VALUE) == 255 bytes in one String, which
     * (a) left the last table entry (byte 127) unmapped, and (b) misaligned
     * charAt(i) with byte i whenever the platform charset is multi-byte.
     * Per-byte decoding keeps index i aligned with byte i in all charsets.
     * NOTE(review): assumes a single-byte platform encoding for faithful
     * round-tripping of bytes >= 0x80 — multi-byte charsets yield
     * replacement chars for lone high bytes.
     */
    for (int i = 0; i < BYTE_RANGE; i++) {
      byteToChars[i] = new String(allBytes, i, 1).charAt(0);
    }
  }

  /**
   * Read {@code size} bytes from the reader at the current position and
   * append their decoded characters to {@code data}.
   *
   * @param reader the TarArchiveInputStream to read from
   * @param size the number of bytes to read
   * @param data the buffer the decoded content is appended to
   * @throws IOException on a read error, or if the stream ends before
   *           {@code size} bytes could be read
   */
  public static synchronized void getFile(final TarArchiveInputStream reader,
                                          long size,
                                          final StringBuilder data)
  throws IOException {
    bbuffer.clear();
    while (size > bbuffer.capacity()) {
      readFully(reader, bbuffer.array(), bbuffer.capacity());
      size -= bbuffer.capacity();
      toAsciiString(data, bbuffer.capacity());
      bbuffer.clear();
    }
    readFully(reader, bbuffer.array(), (int) size);
    toAsciiString(data, (int) size);
  }

  /**
   * Fill {@code buf[0..length)} from the reader, looping over partial reads.
   * InputStream.read may return fewer bytes than requested; the previous code
   * ignored the return value and could silently append stale buffer content.
   *
   * @throws IOException if EOF is reached before {@code length} bytes were read
   */
  private static void readFully(final TarArchiveInputStream reader,
                                final byte[] buf,
                                final int length)
  throws IOException {
    int off = 0;
    while (off < length) {
      final int n = reader.read(buf, off, length - off);
      if (n < 0) {
        throw new IOException("Unexpected end of stream: expected " + length +
                              " bytes, got " + off);
      }
      off += n;
    }
  }

  /**
   * Convert the first {@code length} bytes of the shared buffer through the
   * precomputed byte-to-char table and append them to {@code data}.
   *
   * @param data the string buffer
   * @param length number of bytes to decode
   */
  private static final void toAsciiString(final StringBuilder data,
                                          final int length) {
    for (int i = 0; i < length; i++) {
      // shift byte value (-128..127) into table index (0..255)
      data.append(byteToChars[(int) bbuffer.get(i) - Byte.MIN_VALUE]);
    }
  }

  /**
   * Sort and flatten a list of triples to n-tuples containing many objects
   * for the same predicate. Generate one n-tuple per predicate. The tuples
   * are ordered by predicate. <br>
   * The sorted and flatten representation is generally more efficient in
   * term of index size than the normal flatten approach.
   *
   * @param triples the raw N-Triples text; cleared on return
   * @param map predicate -&gt; set of objects (or subjects, see isOut)
   * @param types collects objects of rdf:type statements (may be null)
   * @param label collects objects of rdfs:label statements (may be null)
   * @param description collects objects of dc:description / dbpedia:abstract
   *          statements (may be null)
   * @param isOut when true store the object under the predicate, otherwise
   *          store the subject (incoming edge)
   */
  public static synchronized void sortAndFlattenNTriples(
      final StringBuilder triples,
      final ConcurrentHashMap<String, ConcurrentSkipListSet<String>> map,
      final ConcurrentSkipListSet<String> types,
      final ConcurrentSkipListSet<String> label,
      final ConcurrentSkipListSet<String> description,
      final boolean isOut) {
    // The map and sets are sorted containers, so flattening already yields
    // the sorted representation.
    flattenNTriples(triples, map, types, label, description, isOut);
  }

  /**
   * Lazily create the shared N-Triples parser on first use, then reset its
   * statement collector so a previous parse does not leak into this one.
   */
  private static void initParser() {
    if (parser == null) {
      parser = new NTriplesParser(); // NTriplesParser is already an RDFParser
      collector = new StatementCollector();
      parser.setRDFHandler(collector);
    }
    collector.clear();
  }

  /**
   * Flatten a list of triples to n-tuples containing many objects for the
   * same predicate. Generate one n-tuple per predicate.
   * Parsing is best-effort: malformed input is silently dropped, but the
   * input buffer is always cleared.
   *
   * @param triples the list of n-triples; cleared on return
   */
  private static synchronized void flattenNTriples(
      final StringBuilder triples,
      final ConcurrentHashMap<String, ConcurrentSkipListSet<String>> map,
      final ConcurrentSkipListSet<String> types,
      final ConcurrentSkipListSet<String> label,
      final ConcurrentSkipListSet<String> description,
      final boolean isOut) {
    try {
      initParser();
      parser.parse(new StringReader(triples.toString()), "");
      for (Statement st : collector.getStatements()) {
        // Re-wrap the parsed terms in the <...> N-Triples form used as keys.
        sb.setLength(0);
        final String subject =
            sb.append('<').append(st.getSubject().toString()).append('>').toString();
        sb.setLength(0);
        final String predicate =
            sb.append('<').append(st.getPredicate().toString()).append('>').toString();
        sb.setLength(0);
        // Literals keep their native toString form; only URIs are wrapped.
        final String object = (st.getObject() instanceof URI)
            ? sb.append('<').append(st.getObject().toString()).append('>').toString()
            : st.getObject().toString();

        if (label != null && predicate.equals(RDFS_LABEL))
          label.add(object);
        if (description != null && predicate.equals(DC_DESCRIPTION))
          description.add(object);
        if (description != null && predicate.equals(DBP_ABSTRACT))
          description.add(object);
        if (types != null && predicate.equals(RDF_TYPE)) {
          types.add(object);
        } else {
          // Everything that is not collected as a type (including labels and
          // descriptions) also lands in the per-predicate map.
          ConcurrentSkipListSet<String> hs = map.get(predicate);
          final String toAdd = isOut ? object : subject;
          if (hs == null) {
            hs = new ConcurrentSkipListSet<String>();
            map.put(predicate, hs);
          }
          if (hs.size() < 65535) // cap at 2 ^ 16 - 1 values per predicate
            hs.add(toAdd);
        }
      }
    } catch (RDFParseException e1) {
      // intentional: malformed triples are skipped (best-effort indexing)
    } catch (RDFHandlerException e1) {
      // intentional: see above
    } catch (IOException e1) {
      // intentional: reading from an in-memory StringReader; cannot recover
    }
    triples.setLength(0);
  }

  /**
   * Outputs elements of the set into a string, separated by a whitespace
   * and ending with a dot.
   *
   * @param set the values to serialise
   * @return the space-separated values, terminated by &quot;.\n&quot;
   */
  public static synchronized String toString(final ConcurrentSkipListSet<String> set) {
    sb.setLength(0);
    // ConcurrentSkipListSet iterators are weakly consistent already; the
    // Collections.synchronizedSet wrapper used previously added no safety
    // (it does not synchronize iteration) and has been dropped.
    for (String s : set) {
      sb.append(s).append(' ');
    }
    sb.append(".\n");
    return sb.toString();
  }
}