examples.RdfSerializationExample.java Source code

Introduction

Here is the source code for examples.RdfSerializationExample.java
Source

package examples;

/*
 * #%L
 * Wikidata Toolkit Examples
 * %%
 * Copyright (C) 2014 Wikidata Toolkit Developers
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;

import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipParameters;
import org.openrdf.rio.RDFFormat;
import org.wikidata.wdtk.datamodel.interfaces.Sites;
import org.wikidata.wdtk.dumpfiles.DumpProcessingController;
import org.wikidata.wdtk.rdf.PropertyRegister;
import org.wikidata.wdtk.rdf.RdfSerializer;

/**
 * This class shows how to convert data from wikidata.org to RDF in N-Triples
 * format. The gzip-compressed output will be written into an output file.
 * <p>
 * The Wikidata Toolkit command line client provides a stand-alone tool for
 * generating RDF exports and it supports a range of parameters. This example is
 * merely used to illustrate how to achieve this from your own code if needed.
 *
 * @author Michael G&uuml;nther
 * @author Markus Kroetzsch
 */
public class RdfSerializationExample {

    /**
     * Size in bytes of both the pipe between the producing thread and the
     * writer thread and of the buffer the writer thread copies with.
     */
    private static final int PIPE_BUFFER_SIZE = 1024 * 1024 * 10;

    /**
     * Runs the example: configures logging, prints a short description,
     * fetches the sites information, opens a gzip-compressed output file and
     * serializes the simple statements of all items into it.
     *
     * @param args
     *            command line arguments (not used)
     * @throws IOException
     *             if the output streams could not be set up
     */
    public static void main(String[] args) throws IOException {

        // Define where log messages go
        ExampleHelpers.configureLogging();

        // Print information about this program
        printDocumentation();

        // Initialize sites; only needed to link to Wikipedia pages in RDF
        DumpProcessingController dumpProcessingController = new DumpProcessingController("wikidatawiki");
        dumpProcessingController.setOfflineMode(ExampleHelpers.OFFLINE_MODE);
        Sites sites = dumpProcessingController.getSitesInformation();

        // Prepare a compressed output stream to write the data to
        // (admittedly, this is slightly over-optimized for an example)
        OutputStream bufferedFileOutputStream = new BufferedOutputStream(
                ExampleHelpers.openExampleFileOuputStream("wikidata-simple-statements.nt.gz"), 1024 * 1024 * 5);
        GzipParameters gzipParameters = new GzipParameters();
        gzipParameters.setCompressionLevel(7);
        OutputStream compressorOutputStream = new GzipCompressorOutputStream(bufferedFileOutputStream,
                gzipParameters);
        OutputStream exportOutputStream = asynchronousOutputStream(compressorOutputStream);

        try {
            // Create a serializer processor
            RdfSerializer serializer = new RdfSerializer(RDFFormat.NTRIPLES, exportOutputStream, sites,
                    PropertyRegister.getWikidataPropertyRegister());
            // Serialize simple statements (and nothing else) for all items
            serializer.setTasks(RdfSerializer.TASK_ITEMS | RdfSerializer.TASK_SIMPLE_STATEMENTS);

            // Run serialization
            serializer.open();
            ExampleHelpers.processEntitiesFromWikidataDump(serializer);
            serializer.close();
        } finally {
            // Always signal end-of-stream to the writer thread, also when
            // processing failed, so that it drains the pipe, closes the
            // compressed file and terminates. Closing the pipe a second time
            // is harmless should the serializer already have closed it.
            close(exportOutputStream);
        }
    }

    /**
     * Print some basic documentation about this program.
     */
    private static void printDocumentation() {
        System.out.println("********************************************************************");
        System.out.println("*** Wikidata Toolkit: RDF Serialization Example");
        System.out.println("*** ");
        System.out.println(
                "*** This program will download dumps from Wikidata and serialize the data in a RDF format.");
        System.out.println("*** Downloading may take some time initially. After that, files");
        System.out.println("*** are stored on disk and are used until newer dumps are available.");
        System.out.println("*** You can delete files manually when no longer needed (see ");
        System.out.println("*** message below for the directory where dump files are found).");
        System.out.println("********************************************************************");
    }

    /**
     * Creates a separate thread for writing into the given output stream and
     * returns a pipe output stream that can be used to pass data to this
     * thread.
     * <p>
     * The thread copies everything written to the returned stream into the
     * given stream. It stops when the returned stream is closed (or when an
     * I/O error occurs) and then closes the given stream, so callers must
     * close the returned stream to have the output completed. Errors that
     * occur in the thread are printed to standard error.
     * <p>
     * This code is inspired by
     * http://stackoverflow.com/questions/12532073/gzipoutputstream
     * -that-does-its-compression-in-a-separate-thread
     *
     * @param outputStream
     *            the stream to write to in the thread
     * @return a new stream that data should be written to
     * @throws IOException
     *             if the pipes could not be created for some reason
     */
    public static OutputStream asynchronousOutputStream(final OutputStream outputStream) throws IOException {
        final PipedOutputStream pos = new PipedOutputStream();
        final PipedInputStream pis = new PipedInputStream(pos, PIPE_BUFFER_SIZE);
        new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    byte[] bytes = new byte[PIPE_BUFFER_SIZE];
                    int len;
                    // read() reports -1 once the writing end was closed and
                    // all pending data has been consumed
                    while ((len = pis.read(bytes)) != -1) {
                        outputStream.write(bytes, 0, len);
                    }
                } catch (IOException ioException) {
                    ioException.printStackTrace();
                } finally {
                    // Closing the reading end first makes further writes of
                    // the producer fail instead of blocking on a full pipe.
                    close(pis);
                    closeReportingFailure(outputStream);
                }
            }
        }, "async-output-stream").start();
        return pos;
    }

    /**
     * Closes the stream that receives the actual output and prints any
     * failure. Closing may still have to write out buffered data, so an error
     * at this point can mean that the output is incomplete and must not go
     * unnoticed.
     *
     * @param closeable
     *            the object to close; may be null, in which case nothing
     *            happens
     */
    private static void closeReportingFailure(Closeable closeable) {
        if (closeable == null) {
            return;
        }
        try {
            closeable.close();
        } catch (IOException closeException) {
            closeException.printStackTrace();
        }
    }

    /**
     * Closes a Closeable and swallows any exceptions that might occur in the
     * process.
     *
     * @param closeable
     *            the object to close; may be null, in which case nothing
     *            happens
     */
    static void close(Closeable closeable) {
        if (closeable != null) {
            try {
                closeable.close();
            } catch (IOException ignored) {
                // deliberately ignored: this helper is for best-effort
                // cleanup where a failure to close cannot be acted upon
            }
        }
    }
}