deepschema.ExtractingTool.java Source code

Introduction

Here is the source code for deepschema.ExtractingTool.java. The tool reads a Wikidata dump with Wikidata Toolkit, extracts classes (and optionally instances) together with their subclassOf and instanceOf relations, and writes the result as gzipped TSV, JSON, RDF (N-Triples), or plain-text output.

Source

package deepschema;

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static deepschema.Parameters.classes;
import static deepschema.Parameters.classesStream;
import static deepschema.Parameters.datamodelConverter;
import static deepschema.Parameters.includeInstances;
import static deepschema.Parameters.instanceOfRelationsStream;
import static deepschema.Parameters.instances;
import static deepschema.Parameters.instancesStream;
import static deepschema.Parameters.jsonSerializer;
import static deepschema.Parameters.jsonStream;
import static deepschema.Parameters.operation;
import static deepschema.Parameters.output;
import static deepschema.Parameters.rdfSerializer;
import static deepschema.Parameters.rdfStream;
import static deepschema.Parameters.separator;
import static deepschema.Parameters.subclassOfRelationsStream;
import static deepschema.Parameters.txtStream;
import static deepschema.Parameters.useCache;
import static deepschema.DumpOperations.enhanceAndFilter;
import static deepschema.DumpOperations.exploreDataset;
import static deepschema.DumpOperations.processEntitiesFromWikidataDump;
import static deepschema.DumpOperations.readDump;
import static deepschema.DumpOperations.structureOutput;

import java.io.BufferedOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import org.openrdf.rio.RDFFormat;
import org.wikidata.wdtk.datamodel.helpers.DatamodelConverter;
import org.wikidata.wdtk.datamodel.interfaces.EntityDocumentProcessor;
import org.wikidata.wdtk.datamodel.interfaces.ItemDocument;
import org.wikidata.wdtk.datamodel.interfaces.PropertyDocument;
import org.wikidata.wdtk.datamodel.json.jackson.JacksonObjectFactory;
import org.wikidata.wdtk.datamodel.json.jackson.JsonSerializer;
import org.wikidata.wdtk.dumpfiles.DumpProcessingController;
import org.wikidata.wdtk.rdf.PropertyRegister;
import org.wikidata.wdtk.rdf.RdfSerializer;

import deepschema.Parameters.Operation;
import deepschema.Parameters.Output;
import deepschema.Parameters.WikidataClassProperties;
import deepschema.Parameters.WikidataInstanceProperties;

/**
 * deepschema.org extracting tool.
 *
 * @author Panayiotis Smeros
 *
 */
public class ExtractingTool implements EntityDocumentProcessor {

    /**
     * Constructor. Opens the output files that we want to write to.
     *
     */
    public ExtractingTool() {
        try {
            if (output == Output.TSV) {
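                // One gzipped TSV file per output table; the instance files are optional.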

                classesStream = new GzipCompressorOutputStream(
                        new BufferedOutputStream(openOutput("classes.tsv.gz")));
                subclassOfRelationsStream = new GzipCompressorOutputStream(
                        new BufferedOutputStream(openOutput("subclassOfRelations.tsv.gz")));
                if (includeInstances) {
                    instancesStream = new GzipCompressorOutputStream(
                            new BufferedOutputStream(openOutput("instances.tsv.gz")));
                    instanceOfRelationsStream = new GzipCompressorOutputStream(
                            new BufferedOutputStream(openOutput("instanceOfRelations.tsv.gz")));
                }
            } else if (output == Output.JSON) {
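                // All filtered classes and instances go into a single gzipped JSON file.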
                jsonStream = new GzipCompressorOutputStream(
                        new BufferedOutputStream(openOutput("classesAndInstances.json.gz")));

                // DataModel
                datamodelConverter = new DatamodelConverter(new JacksonObjectFactory());
                // Do not copy references.
                datamodelConverter.setOptionDeepCopyReferences(false);
                // Only copy English labels, descriptions, and aliases.
                datamodelConverter.setOptionLanguageFilter(Collections.singleton("en"));
                // Copy statements of all the properties.
                datamodelConverter.setOptionPropertyFilter(null);
                // Do not copy sitelinks.
                datamodelConverter.setOptionSiteLinkFilter(Collections.<String>emptySet());

                jsonSerializer = new JsonSerializer(jsonStream);
                jsonSerializer.open();
            } else if (output == Output.RDF) {
                rdfStream = new GzipCompressorOutputStream(
                        new BufferedOutputStream(openOutput("classesAndInstances.nt.gz")));

                // DataModel
                datamodelConverter = new DatamodelConverter(new JacksonObjectFactory());
                // Do not copy references.
                datamodelConverter.setOptionDeepCopyReferences(false);
                // Only copy English labels, descriptions, and aliases.
                datamodelConverter.setOptionLanguageFilter(Collections.singleton("en"));
                // Copy statements of all the properties.
                datamodelConverter.setOptionPropertyFilter(null);
                // Do not copy sitelinks.
                datamodelConverter.setOptionSiteLinkFilter(Collections.<String>emptySet());

                rdfSerializer = new RdfSerializer(RDFFormat.NTRIPLES, rdfStream,
                        new DumpProcessingController("wikidatawiki").getSitesInformation(),
                        PropertyRegister.getWikidataPropertyRegister());
                // Serialize simple statements (and nothing else) for all items
                rdfSerializer.setTasks(RdfSerializer.TASK_ITEMS | RdfSerializer.TASK_SIMPLE_STATEMENTS);
                rdfSerializer.open();
            } else if (output == Output.TXT) {
                txtStream = new GzipCompressorOutputStream(new BufferedOutputStream(openOutput("output.txt.gz")));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Initializes the extracting procedure.
     * 
     * @param args
     */
    public static void main(String[] args) {
        // Create the appender that will write log messages to the console.
        ConsoleAppender consoleAppender = new ConsoleAppender();
        // Define the pattern of log messages.
        // Insert the string "%c{1}:%L" to also show class name and line.
        String pattern = "%d{yyyy-MM-dd HH:mm:ss} %-5p - %m%n";
        consoleAppender.setLayout(new PatternLayout(pattern));
        // Change to Level.ERROR for fewer messages:
        consoleAppender.setThreshold(Level.INFO);
        consoleAppender.activateOptions();
        Logger.getRootLogger().addAppender(consoleAppender);

        ExtractingTool extractingTool = new ExtractingTool();

        if (useCache) {
            extractingTool.cache("read");
            extractingTool.writeOutput();
        } else {
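            // Two passes over the dump: first collect the candidate classes,
            // then enhance and filter them before writing the output.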
            operation = Operation.READ_CLASSES;
            processEntitiesFromWikidataDump(extractingTool);
            operation = Operation.ENHANCE_AND_FILTER;
            processEntitiesFromWikidataDump(extractingTool);
            extractingTool.writeOutput();
            extractingTool.cache("write");
        }
    }

    /**
     * Opens output file.
     * 
     * @param filename
     * @return FileOutputStream
     */
    FileOutputStream openOutput(String filename) throws IOException {
        Path directoryPath = Paths.get("results");

        try {
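            // Create the results directory on first use; the catch below
            // rethrows only if the path exists but is not a directory.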
            Files.createDirectory(directoryPath);
        } catch (FileAlreadyExistsException e) {
            if (!Files.isDirectory(directoryPath)) {
                throw e;
            }
        }

        Path filePath = directoryPath.resolve(filename);
        return new FileOutputStream(filePath.toFile());
    }

    /**
     * Writes output to the corresponding files.
     */
    void writeOutput() {
        try {
            if (output == Output.TSV) {
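                // One row per class (id and label), then one row per subclassOf pair.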
                for (Entry<Integer, WikidataClassProperties> entry : classes.entrySet())
                    classesStream.write((entry.getKey() + separator + entry.getValue().label + "\n").getBytes());

                for (Entry<Integer, WikidataClassProperties> entry : classes.entrySet())
                    for (int value : entry.getValue().subclassOf)
                        subclassOfRelationsStream.write((entry.getKey() + separator + value + "\n").getBytes());

                if (includeInstances) {
                    for (Entry<Integer, WikidataInstanceProperties> entry : instances.entrySet())
                        instancesStream
                                .write((entry.getKey() + separator + entry.getValue().label + "\n").getBytes());

                    for (Entry<Integer, WikidataInstanceProperties> entry : instances.entrySet())
                        for (int value : entry.getValue().instanceOf)
                            instanceOfRelationsStream.write((entry.getKey() + separator + value + "\n").getBytes());
                }
                classesStream.close();
                subclassOfRelationsStream.close();

                if (includeInstances) {
                    instancesStream.close();
                    instanceOfRelationsStream.close();
                }
            } else if (output == Output.JSON) {
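                // A final pass over the dump serializes the filtered entities as JSON.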
                operation = Operation.STRUCTURE_OUTPUT;
                processEntitiesFromWikidataDump(this);
                jsonStream.close();
            } else if (output == Output.RDF) {
                operation = Operation.STRUCTURE_OUTPUT;
                processEntitiesFromWikidataDump(this);
                rdfStream.close();
            } else if (output == Output.TXT) {
                operation = Operation.EXPLORE_DATASET;
                processEntitiesFromWikidataDump(this);
                txtStream.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads or writes the cache of classes and instances.
     * 
     * @param action : "read" or "write"
     */
    @SuppressWarnings("unchecked")
    void cache(String action) {
        final String cacheFile = ".cache";
        try {
            switch (action) {
            case "read": {
                ObjectInputStream objectInputStream = new ObjectInputStream(new FileInputStream(cacheFile));
                classes = (Map<Integer, WikidataClassProperties>) objectInputStream.readObject();
                instances = (Map<Integer, WikidataInstanceProperties>) objectInputStream.readObject();
                objectInputStream.close();
                break;
            }
            case "write": {
                ObjectOutputStream objectOutputStream = new ObjectOutputStream(new FileOutputStream(cacheFile));
                objectOutputStream.writeObject(classes);
                objectOutputStream.writeObject(instances);
                objectOutputStream.flush();
                objectOutputStream.close();
                break;
            }
            }
        } catch (ClassNotFoundException | IOException e) {
            System.err.println("Problem while reading/writing from/to cache.");
            e.printStackTrace();
        }
    }

    @Override
    public void processItemDocument(ItemDocument itemDocument) {
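        // Dispatch each item to the DumpOperations handler for the current pass.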
        if (operation == Operation.READ_CLASSES) {
            readDump(itemDocument);
        } else if (operation == Operation.ENHANCE_AND_FILTER) {
            enhanceAndFilter(itemDocument);
        } else if (operation == Operation.STRUCTURE_OUTPUT) {
            structureOutput(itemDocument);
        } else if (operation == Operation.EXPLORE_DATASET) {
            exploreDataset(itemDocument);
        }
    }

    @Override
    public void processPropertyDocument(PropertyDocument propertyDocument) {
        // Do not serialize any properties.
    }
}
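
Usage

The extractor is driven entirely by the static fields of deepschema.Parameters, a companion class that is not shown on this page. The sketch below is one plausible way to configure and launch it, assuming those fields are plain mutable statics: the field and enum names are taken from the static imports above, but the Parameters class layout and the chosen values are assumptions, not part of the original source.

package deepschema;

public class RunExtraction {

    public static void main(String[] args) {
        // Hypothetical configuration: these static fields are imported by
        // ExtractingTool, so they exist, but the values here are assumptions.
        Parameters.output = Parameters.Output.TSV; // or JSON, RDF, TXT
        Parameters.includeInstances = true;        // also emit the instance files
        Parameters.useCache = false;               // run the full two-pass extraction

        // Delegates to ExtractingTool.main(), which reads the Wikidata dump
        // twice and then writes the gzipped results under ./results.
        ExtractingTool.main(new String[0]);
    }
}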