examples.JsonSerializationProcessor.java Source code

Java tutorial

Introduction

Here is the source code for examples.JsonSerializationProcessor.java

Source

package examples;

/*
 * #%L
 * Wikidata Toolkit Examples
 * %%
 * Copyright (C) 2014 - 2015 Wikidata Toolkit Developers
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
import org.wikidata.wdtk.datamodel.helpers.DatamodelConverter;
import org.wikidata.wdtk.datamodel.interfaces.EntityDocumentProcessor;
import org.wikidata.wdtk.datamodel.interfaces.ItemDocument;
import org.wikidata.wdtk.datamodel.interfaces.ItemIdValue;
import org.wikidata.wdtk.datamodel.interfaces.PropertyDocument;
import org.wikidata.wdtk.datamodel.interfaces.PropertyIdValue;
import org.wikidata.wdtk.datamodel.interfaces.Statement;
import org.wikidata.wdtk.datamodel.interfaces.StatementGroup;
import org.wikidata.wdtk.datamodel.interfaces.Value;
import org.wikidata.wdtk.datamodel.interfaces.ValueSnak;
import org.wikidata.wdtk.datamodel.json.jackson.JacksonObjectFactory;
import org.wikidata.wdtk.datamodel.json.jackson.JsonSerializer;

/**
 * This example illustrates how to create a JSON serialization of some of the
 * data found in a dump. It uses a {@link DatamodelConverter} with filter
 * settings to eliminate some of the data.
 * <p>
 * As an example, the program only serializes data for people who were born in
 * Dresden, Germany. This can be changed by modifying the code in
 * {@link #includeDocument(ItemDocument)}.
 *
 * @author Markus Kroetzsch
 *
 */
public class JsonSerializationProcessor implements EntityDocumentProcessor {

    static final String OUTPUT_FILE_NAME = "json-serialization-example.json.gz";

    final JsonSerializer jsonSerializer;

    /**
     * Object used to make simplified copies of Wikidata documents for
     * re-serialization in JSON.
     */
    final DatamodelConverter datamodelConverter;

    /**
     * Runs the example program.
     *
     * @param args
     * @throws IOException
     *             if there was a problem in writing the output file
     */
    public static void main(String[] args) throws IOException {
        ExampleHelpers.configureLogging();
        JsonSerializationProcessor.printDocumentation();

        JsonSerializationProcessor jsonSerializationProcessor = new JsonSerializationProcessor();
        ExampleHelpers.processEntitiesFromWikidataDump(jsonSerializationProcessor);
        jsonSerializationProcessor.close();
    }

    /**
     * Constructor. Initializes various helper objects we use for the JSON
     * serialization, and opens the file that we want to write to.
     *
     * @throws IOException
     *             if there is a problem opening the output file
     */
    public JsonSerializationProcessor() throws IOException {
        // The converter is used to copy selected parts of the data. We use this
        // to remove some parts from the documents we serialize. The converter
        // uses a JacksonObjectFactory to be sure that we create objects for
        // which we know how to write JSON. A similar converter without the
        // filtering enabled would also be used to create JSON-serializable
        // objects from any other implementation of the Wikidata Toolkit
        // datamodel.
        this.datamodelConverter = new DatamodelConverter(new JacksonObjectFactory());
        // Do not copy references at all:
        this.datamodelConverter.setOptionDeepCopyReferences(false);
        // Only copy English labels, descriptions, and aliases:
        this.datamodelConverter.setOptionLanguageFilter(Collections.singleton("en"));
        // Only copy statements of some properties:
        Set<PropertyIdValue> propertyFilter = new HashSet<>();
        propertyFilter.add(Datamodel.makeWikidataPropertyIdValue("P18")); // image
        propertyFilter.add(Datamodel.makeWikidataPropertyIdValue("P106")); // occupation
        propertyFilter.add(Datamodel.makeWikidataPropertyIdValue("P569")); // birthdate
        this.datamodelConverter.setOptionPropertyFilter(propertyFilter);
        // Do not copy any sitelinks:
        this.datamodelConverter.setOptionSiteLinkFilter(Collections.<String>emptySet());

        // The (compressed) file we write to.
        OutputStream outputStream = new GzipCompressorOutputStream(
                new BufferedOutputStream(ExampleHelpers.openExampleFileOuputStream(OUTPUT_FILE_NAME)));
        this.jsonSerializer = new JsonSerializer(outputStream);

        this.jsonSerializer.open();
    }

    @Override
    public void processItemDocument(ItemDocument itemDocument) {
        if (includeDocument(itemDocument)) {
            this.jsonSerializer.processItemDocument(this.datamodelConverter.copy(itemDocument));
        }
    }

    @Override
    public void processPropertyDocument(PropertyDocument propertyDocument) {
        // we do not serialize any properties
    }

    /**
     * Prints some basic documentation about this program.
     */
    public static void printDocumentation() {
        System.out.println("********************************************************************");
        System.out.println("*** Wikidata Toolkit: JsonSerializationProcessor");
        System.out.println("*** ");
        System.out.println("*** This program will download and process dumps from Wikidata.");
        System.out.println("*** It will filter the data and store the results in a new JSON file.");
        System.out.println("*** See source code for further details.");
        System.out.println("********************************************************************");
    }

    /**
     * Closes the output. Should be called after the JSON serialization was
     * finished.
     *
     * @throws IOException
     *             if there was a problem closing the output
     */
    public void close() throws IOException {
        System.out.println("Serialized " + this.jsonSerializer.getEntityDocumentCount()
                + " item documents to JSON file " + OUTPUT_FILE_NAME + ".");
        this.jsonSerializer.close();
    }

    /**
     * Returns true if the given document should be included in the
     * serialization.
     *
     * @param itemDocument
     *            the document to check
     * @return true if the document should be serialized
     */
    private boolean includeDocument(ItemDocument itemDocument) {
        for (StatementGroup sg : itemDocument.getStatementGroups()) {
            // "P19" is "place of birth" on Wikidata
            if (!"P19".equals(sg.getProperty().getId())) {
                continue;
            }
            for (Statement s : sg.getStatements()) {
                if (s.getClaim().getMainSnak() instanceof ValueSnak) {
                    Value v = ((ValueSnak) s.getClaim().getMainSnak()).getValue();
                    // "Q1731" is "Dresden" on Wikidata
                    if (v instanceof ItemIdValue && "Q1731".equals(((ItemIdValue) v).getId())) {
                        return true;
                    }
                }
            }
        }
        return false;
    }
}