de.dfki.km.perspecting.obie.connection.RDFTripleParser.java Source code

Java tutorial

Introduction

Here is the source code for de.dfki.km.perspecting.obie.connection.RDFTripleParser.java

Source

/*
Copyright (c) 2011, 
Benjamin Adrian <benjamin.horak@gmail.com>
German Research Center for Artificial Intelligence (DFKI) <info@dfki.de>
    
All rights reserved.
    
This file is part of SCOOBIE.
    
SCOOBIE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
    
SCOOBIE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with SCOOBIE.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.dfki.km.perspecting.obie.connection;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Locale;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.openrdf.model.Literal;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandler;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParser;
import org.openrdf.rio.RDFParserFactory;
import org.openrdf.rio.RDFParserRegistry;

import de.dfki.km.perspecting.obie.transducer.model.LiteralHashing;
import de.dfki.km.perspecting.obie.vocabulary.MediaType;

public class RDFTripleParser {

    /**
     * Base URI handed to the RDF parser when the caller supplies no absolute
     * base URI of its own.
     */
    private static final String BASEURI = "http://www.dfki.de";

    /** Logger shared by the parser, its RDF handlers and the value factory. */
    private static final Logger log = Logger.getLogger(RDFTripleParser.class.getName());

    /** Language tag substituted for literals that carry no language of their own. */
    private static final String ALL = "all";

    /**
     * Executor on which the per-stream parsing tasks are run.
     * NOTE(review): no shutdown of this pool is visible in this file.
     */
    private static final ExecutorService pool = Executors.newCachedThreadPool();

    /**
     * Maximum accepted length (in characters) of a subject, predicate or
     * object-URI string; statements exceeding it are skipped.
     */
    private static final int URISIZE = 120;

    /**
     * Monitor guarding the dump writers and the triple counters while several
     * input streams are written concurrently.
     */
    final Object SEMAPHOR = new Object();

    /** Hash function applied to the lower-cased value of every literal that is dumped. */
    private final LiteralHashing hashing;

    /**
     * Creates a parser that hashes literal values with the given function.
     *
     * @param hashing
     *            hash function applied to the lower-cased value of each
     *            literal; must not be {@code null}
     * @throws NullPointerException
     *             if {@code hashing} is {@code null}
     */
    public RDFTripleParser(LiteralHashing hashing) {
        if (hashing == null) {
            // Fail fast: a missing hash function would otherwise only show up
            // as a NullPointerException for every literal statement while a
            // dump is being written.
            throw new NullPointerException("hashing must not be null");
        }
        this.hashing = hashing;
    }

    /**
     * Wraps the raw input stream in a stream that unpacks the given container
     * format.
     * <p>
     * BZIP and GZIP input is wrapped in the matching decompressor. ZIP input is
     * wrapped in an archive stream that is advanced to the first non-directory
     * entry, because an archive stream delivers no data until it has been
     * positioned on an entry. Only that first entry is exposed to the caller.
     * Any other media type returns the stream unchanged.
     *
     * @param stream
     *            raw input stream
     * @param mediatype
     *            container/compression format of {@code stream}
     * @return stream delivering the unpacked content
     * @throws Exception
     *             if the compressed stream cannot be opened or the ZIP entries
     *             cannot be read
     */
    private static InputStream getStream(InputStream stream, MediaType mediatype) throws Exception {

        switch (mediatype) {
        case BZIP:
            return new BZip2CompressorInputStream(stream);
        case GZIP:
            return new GzipCompressorInputStream(stream);
        case ZIP: {
            final ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
            // Skip directory entries; they carry no content to parse.
            org.apache.commons.compress.archivers.ArchiveEntry entry = zip.getNextEntry();
            while (entry != null && entry.isDirectory()) {
                entry = zip.getNextEntry();
            }
            if (entry == null) {
                // Keep the previous behaviour for empty archives (the stream
                // reads as empty) but make the situation visible.
                log.warning("ZIP archive contains no file entry");
            }
            return zip;
        }

        default:
            return stream;

        }
    }

    /**
     * Outcome of a parsing run: the two dump files and the number of rows
     * counted for each of them.
     */
    static class TripleStats {

        /** Dump file for triples whose object is a literal. */
        public File datatypeProps;

        /** Number of literal triples counted during the run; starts at 0. */
        public volatile int datatypePropsSize;

        /** Dump file for triples whose object is a URI. */
        public File objectProps;

        /** Number of URI-valued triples counted during the run; starts at 0. */
        public volatile int objectPropsSize;

    }

    /**
     * Returns the characters of the given string surrounded by double quotes.
     * The content itself is copied verbatim; nothing is escaped.
     *
     * @param uri
     *            text to quote
     * @return a new array holding {@code "} + text + {@code "}
     */
    private static char[] encloseCharacterString(String uri) {
        final int length = uri.length();
        final char[] quoted = new char[length + 2];
        // Copy the text straight into the middle of the result array.
        uri.getChars(0, length, quoted, 1);
        quoted[0] = '"';
        quoted[length + 1] = '"';
        return quoted;
    }

    /**
     * Parses the given RDF input streams concurrently and dumps their triples
     * into two comma-separated files below {@code <sessionPath>/dump/}.
     * <p>
     * Triples with a URI object are appended to {@code objectProperties.lst} as
     * {@code "subject","predicate","object"}. Triples with a literal object are
     * appended to {@code datatypeProperties.lst} as
     * {@code "subject","predicate","literal",hash}, where the hash is computed
     * from the lower-cased literal. Statements whose subject, predicate or
     * object URI is longer than {@link #URISIZE} characters are skipped, as are
     * literals shorter than 2 or longer than 100 characters after clean-up.
     * <p>
     * If both dump files already exist, nothing is parsed and the returned
     * counters remain 0. If parsing fails, the incomplete dump files are
     * deleted so that they cannot be mistaken for a finished dump later on.
     * 
     * @param input
     *            RDF input streams; each one is parsed in its own task and
     *            closed when that task ends
     * @param rdf_mimetype
     *            RDF serialization of the streams, used to select the parser
     * @param sessionPath
     *            directory below which the {@code dump} folder is created
     * @param absoluteBaseURI
     *            base URI for resolving relative URIs; when {@code null},
     *            {@link #BASEURI} is used
     * @param file_mimetype
     *            container/compression format of the streams
     * @return the dump files together with the number of rows written to each
     * @throws Exception
     *             if a stream cannot be parsed, no parser can be created or a
     *             dump file cannot be written
     */
    public TripleStats parseTriples(final InputStream[] input, final MediaType rdf_mimetype, final File sessionPath,
            final String absoluteBaseURI, final MediaType file_mimetype) throws Exception {

        final TripleStats stats = new TripleStats();

        new File(sessionPath.getAbsolutePath() + "/dump/").mkdirs();

        stats.datatypeProps = new File(sessionPath.getAbsolutePath() + "/dump/datatypeProperties.lst");
        stats.objectProps = new File(sessionPath.getAbsolutePath() + "/dump/objectProperties.lst");

        stats.datatypeProps.deleteOnExit();
        stats.objectProps.deleteOnExit();

        // Only effective for dump files that already exist.
        stats.datatypeProps.setReadable(true, false);
        stats.objectProps.setReadable(true, false);

        if (stats.datatypeProps.exists() && stats.objectProps.exists()) {
            return stats;
        }

        final BufferedWriter datatypePropertiesWriter = new BufferedWriter(
                new FileWriter(stats.datatypeProps, false));

        final BufferedWriter objectPropertiesWriter;
        try {
            objectPropertiesWriter = new BufferedWriter(new FileWriter(stats.objectProps, false));
        } catch (IOException e) {
            closeAfterFailure(datatypePropertiesWriter);
            stats.datatypeProps.delete();
            throw e;
        }

        // Opening the writers created the files, so the permissions can be
        // applied to them now.
        stats.datatypeProps.setReadable(true, false);
        stats.objectProps.setReadable(true, false);

        boolean succeeded = false;
        try {
            final ArrayList<Callable<Boolean>> tasks = new ArrayList<Callable<Boolean>>();

            int sourceCount = 0;

            for (final InputStream stream : input) {

                final String source = Integer.toString(++sourceCount);
                log.info("Parsing: " + source + " from ( " + input.length + " )");

                final RDFParser parser = getParser(rdf_mimetype);
                parser.setRDFHandler(new RDFHandler() {

                    long tripleCount = 0;

                    @Override
                    public void startRDF() throws RDFHandlerException {
                        log.info("Start parsing RDF triples");
                    }

                    @Override
                    public void handleStatement(Statement stmt) throws RDFHandlerException {
                        try {

                            tripleCount++;

                            if (tripleCount % 10000 == 0) {
                                log.info(source + ": Parsed " + tripleCount + " RDF triples");
                            }
                            // get triple components
                            String p = stmt.getPredicate().toString();
                            String s = stmt.getSubject().toString();
                            String o = stmt.getObject().toString();

                            // test URIs
                            if (s.length() > URISIZE) {
                                log.warning("Skipping too long subject " + s);
                                return;
                            }

                            if (p.length() > URISIZE) {
                                log.warning("Skipping too long predicate " + p);
                                return;
                            }

                            if (stmt.getSubject() instanceof URI) {
                                s = fixJavaURI(s);
                            }

                            p = fixJavaURI(p);

                            if (stmt.getObject() instanceof URI) {
                                // object property: over-long object URIs are dropped silently
                                if (o.length() > URISIZE) {
                                    return;
                                }
                                o = fixJavaURI(o);
                                appendObjectTriple(s, p, o);
                            } else if (stmt.getObject() instanceof Literal) {
                                // strip characters that would break the row format
                                o = stmt.getObject().stringValue().replaceAll("[\n\t\\\\\"]", "").trim();

                                if (o.length() < 2 || o.length() > 100) {
                                    return;
                                }

                                appendLiteralTriple(s, p, o, ((Literal) stmt.getObject()).getLanguage());
                            } else {
                                log.warning("Skipping bad triple " + stmt);
                            }

                        } catch (Exception e) {
                            // A single bad statement must not abort the whole stream.
                            log.log(Level.SEVERE, "Error in parsing: " + source, e);
                        }

                    }

                    /**
                     * Percent-encodes the characters that make the given string
                     * an invalid {@link java.net.URI} (e.g. "|") and returns
                     * the repaired string.
                     * 
                     * @param uri
                     *            uri to encode
                     * @return the uri itself if it is valid, otherwise its
                     *         repaired form
                     * @throws IllegalArgumentException
                     *             if the uri cannot be repaired by encoding
                     *             single characters
                     */
                    private String fixJavaURI(String uri) {

                        String candidate = uri;

                        // Each round encodes every occurrence of one offending
                        // character; the bound guards against inputs that never
                        // become valid.
                        for (int round = 0; round <= uri.length(); round++) {
                            try {
                                new java.net.URI(candidate);
                                return candidate;
                            } catch (URISyntaxException e) {
                                final int index = e.getIndex();
                                if (index < 0 || index >= candidate.length()) {
                                    // The error position is unknown or lies behind the last
                                    // character, so there is no character to encode.
                                    throw new IllegalArgumentException("Cannot repair URI: " + uri, e);
                                }

                                final String badChar = Character.toString(candidate.charAt(index));
                                final String encoded;
                                try {
                                    encoded = URLEncoder.encode(badChar, "utf-8");
                                } catch (UnsupportedEncodingException e1) {
                                    throw new RuntimeException(e1);
                                }

                                if (encoded.equals(badChar)) {
                                    // Encoding leaves this character untouched, so another
                                    // round would fail in exactly the same way.
                                    throw new IllegalArgumentException("Cannot repair URI: " + uri, e);
                                }

                                log.fine("Fixing bad uri: " + candidate);
                                candidate = candidate.replace(badChar, encoded);
                            }
                        }

                        throw new IllegalArgumentException("Cannot repair URI: " + uri);
                    }

                    /**
                     * Appends one row to the datatype-properties dump. The
                     * language is normalised but currently not written.
                     */
                    private void appendLiteralTriple(String subject, String predicate, String literal,
                            String language) throws IOException {

                        if (language == null) {
                            language = ALL;
                        }

                        synchronized (SEMAPHOR) {
                            // Hash before writing anything so that a failing hash
                            // function cannot leave a truncated row behind.
                            final String hash = Integer.toString(hashing.hash(literal.toLowerCase(Locale.US)));

                            datatypePropertiesWriter.write(RDFTripleParser.encloseCharacterString(subject));
                            datatypePropertiesWriter.append(',');
                            datatypePropertiesWriter.write(RDFTripleParser.encloseCharacterString(predicate));
                            datatypePropertiesWriter.append(',');
                            datatypePropertiesWriter.write(RDFTripleParser.encloseCharacterString(literal));
                            datatypePropertiesWriter.append(',');
                            datatypePropertiesWriter.write(hash);
                            datatypePropertiesWriter.newLine();

                            stats.datatypePropsSize++;
                        }

                    }

                    /** Appends one row to the object-properties dump. */
                    private void appendObjectTriple(String subject, String predicate, String object)
                            throws IOException {

                        synchronized (SEMAPHOR) {
                            objectPropertiesWriter.write(RDFTripleParser.encloseCharacterString(subject));
                            objectPropertiesWriter.append(',');
                            objectPropertiesWriter.write(RDFTripleParser.encloseCharacterString(predicate));
                            objectPropertiesWriter.append(',');
                            objectPropertiesWriter.write(RDFTripleParser.encloseCharacterString(object));
                            objectPropertiesWriter.newLine();

                            stats.objectPropsSize++;
                        }

                    }

                    @Override
                    public void handleNamespace(String arg0, String arg1) throws RDFHandlerException {
                    }

                    @Override
                    public void handleComment(String arg0) throws RDFHandlerException {
                    }

                    @Override
                    public void endRDF() throws RDFHandlerException {
                        log.info("Finished parsing RDF triples " + tripleCount + " RDF triples");

                    }
                });

                tasks.add(new Callable<Boolean>() {
                    /*
                     * (non-Javadoc)
                     * 
                     * @see java.util.concurrent.Callable#call()
                     */
                    @Override
                    public Boolean call() throws Exception {

                        InputStream unpackedStream = null;
                        try {
                            unpackedStream = getStream(stream, file_mimetype);
                            parser.parse(unpackedStream, absoluteBaseURI != null ? absoluteBaseURI : BASEURI);
                            return true;
                        } catch (Exception e) {
                            log.log(Level.SEVERE,
                                    "Error during parsing " + source + " with mimetype " + file_mimetype, e);
                            return false;
                        } finally {
                            // Close the raw stream directly when unpacking
                            // itself failed and no wrapper exists.
                            if (unpackedStream != null) {
                                unpackedStream.close();
                            } else {
                                stream.close();
                            }
                        }
                    }
                });

            }

            for (Future<Boolean> future : pool.invokeAll(tasks)) {
                if (!future.get()) {
                    throw new Exception("error occured during parsing");
                }
            }

            // Closing flushes the buffers; a failure here means the dump is incomplete.
            objectPropertiesWriter.close();
            datatypePropertiesWriter.close();
            succeeded = true;

        } finally {
            if (!succeeded) {
                closeAfterFailure(objectPropertiesWriter);
                closeAfterFailure(datatypePropertiesWriter);
                // Remove the partial dump; otherwise the "files already exist"
                // shortcut above would return it as a valid result next time.
                stats.objectProps.delete();
                stats.datatypeProps.delete();
            }
        }

        return stats;
    }

    /** Closes a dump writer on the failure path, logging instead of masking the original error. */
    private static void closeAfterFailure(BufferedWriter writer) {
        try {
            writer.close();
        } catch (IOException e) {
            log.log(Level.WARNING, "Could not close dump writer", e);
        }
    }

    /**
     * Creates a lenient RDF parser for the given serialization. The parser
     * uses {@link MyValueFactoryImpl}, does not verify data and does not stop
     * at the first error.
     *
     * @param mimetype
     *            RDF serialization; its string form is looked up as a MIME
     *            type in the parser registry
     * @return a freshly configured parser
     * @throws IllegalArgumentException
     *             if no RDF format or no parser is registered for the MIME
     *             type
     */
    private RDFParser getParser(MediaType mimetype) {
        final RDFParserRegistry parserRegistry = RDFParserRegistry.getInstance();

        final RDFFormat format = parserRegistry.getFileFormatForMIMEType(mimetype.toString());
        if (format == null) {
            throw new IllegalArgumentException("No RDF format registered for MIME type " + mimetype);
        }

        final RDFParserFactory parserFactory = parserRegistry.get(format);
        if (parserFactory == null) {
            throw new IllegalArgumentException("No RDF parser available for format " + format);
        }

        final RDFParser parser = parserFactory.getParser();
        parser.setValueFactory(new MyValueFactoryImpl());
        parser.setVerifyData(false);
        parser.setStopAtFirstError(false);
        return parser;
    }

    /**
     * Value factory that tries to repair URI strings which the default factory
     * rejects.
     */
    static class MyValueFactoryImpl extends ValueFactoryImpl {

        /**
         * Creates a URI and retries with a repaired string when the superclass
         * rejects the input: every "|" is percent-encoded and creation is
         * attempted again; a string without "|" gets an "http://" prefix
         * instead.
         * 
         * @see org.openrdf.model.impl.ValueFactoryImpl#createURI(java.lang.String)
         */
        @Override
        public URI createURI(String uri) {
            try {
                return super.createURI(uri);
            } catch (Exception rejected) {

                // No pipe to encode: the only remaining repair is the scheme prefix.
                if (!uri.contains("|")) {
                    log.fine("Fixed URI: " + uri + " to http://" + uri);
                    return super.createURI("http://" + uri);
                }

                final String encodedUri;
                try {
                    encodedUri = uri.replace("|", URLEncoder.encode("|", "utf-8"));
                } catch (UnsupportedEncodingException e1) {
                    throw new RuntimeException(e1);
                }

                log.fine("Fixed URI: " + uri + " to " + encodedUri);
                // Re-enter so that the prefix repair can still apply afterwards.
                return createURI(encodedUri);
            }
        }

    }

}