de.uni_koblenz.west.splendid.tools.NXVoidGenerator.java Source code

Introduction

Here is the source code for de.uni_koblenz.west.splendid.tools.NXVoidGenerator.java, a command-line tool that reads context-ordered N-Quads input and writes VoID statistics (triple counts, property partitions, and class partitions) for each context.

Source

/*
 * This file is part of RDF Federator.
 * Copyright 2011 Olaf Goerlitz
 * 
 * RDF Federator is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * RDF Federator is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with RDF Federator.  If not, see <http://www.gnu.org/licenses/>.
 * 
 * RDF Federator uses libraries from the OpenRDF Sesame Project licensed 
 * under the Aduna BSD-style license. 
 */
package de.uni_koblenz.west.splendid.tools;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.openrdf.model.BNode;
import org.openrdf.model.Literal;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.model.vocabulary.RDFS;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFWriter;
import org.semanticweb.yars.nx.Node;
import org.semanticweb.yars.nx.parser.NxParser;

import de.uni_koblenz.west.splendid.statistics.util.CompactBNodeTurtleWriter;
import de.uni_koblenz.west.splendid.vocabulary.VOID2;

/**
 * Generates VoID statistics for context-ordered N-Quads input.
 * 
 * @author goerlitz@uni-koblenz.de
 */
public class NXVoidGenerator {

    static final String USAGE = "NXVoidGenerator [-h] -o <outfile> -i <infile> [<infile2> ...]";

    static final Options OPTIONS = new Options();
    static final Option HELP = new Option("h", "help", false, "print this message");
    static final Option OUTPUT_FILE = OptionBuilder.hasArg().withArgName("outfile")
            .withDescription(
                    "use given file for output (append .gz for Gzipped output); defaults to console output")
            .create("o");

    static final Option INPUT_FILES = OptionBuilder.hasArgs().withArgName("infiles")
            .withDescription("use given files for input (append .gz for Gzipped input)").create("i");

    static {
        OPTIONS.addOption(HELP);
        OPTIONS.addOption(OUTPUT_FILE);
        OPTIONS.addOption(INPUT_FILES);
    }

    public static void main(String[] args) {

        try {
            // parse the command line arguments
            CommandLineParser parser = new GnuParser();
            CommandLine cmd = parser.parse(OPTIONS, args);

            // print help message
            if (cmd.hasOption("h") || cmd.hasOption("help")) {
                new HelpFormatter().printHelp(USAGE, OPTIONS);
                System.exit(0);
            }

            // get input files (from option -i or all remaining parameters)
            String[] inputFiles = cmd.getOptionValues("i");
            if (inputFiles == null)
                inputFiles = cmd.getArgs();
            if (inputFiles.length == 0) {
                System.out.println("need at least one input file.");
                new HelpFormatter().printUsage(new PrintWriter(System.out, true), 80, USAGE);
                System.exit(1);
            }
            String outputFile = cmd.getOptionValue("o");

            // process all input files
            new NXVoidGenerator().process(outputFile, inputFiles);

        } catch (ParseException exp) {
            // print parse error and display usage message
            System.out.println(exp.getMessage());
            new HelpFormatter().printUsage(new PrintWriter(System.out, true), 80, USAGE, OPTIONS);
        }
    }

    // --------------------------------------------------------------

    private static final ValueFactory vf = ValueFactoryImpl.getInstance();
    private static final URI DATASET = vf.createURI(VOID2.Dataset.toString());
    private static final URI TRIPLES = vf.createURI(VOID2.triples.toString());
    private static final URI CLASSES = vf.createURI(VOID2.classes.toString());
    private static final URI ENTITIES = vf.createURI(VOID2.entities.toString());
    private static final URI PROPERTIES = vf.createURI(VOID2.properties.toString());

    Node lastContext = null;
    Set<Node> contexts = new HashSet<Node>();
    long totalTripleCount = 0;
    long tripleCount = 0;
    long contextCount = 0;
    Map<Node, Integer> pMap = new HashMap<Node, Integer>();
    Counter<Integer> predCount = new Counter<Integer>();
    Counter<Node> typeCount = new Counter<Node>();

    RDFWriter writer = null;

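    /**
     * Processes all input files and writes the collected VoID statistics.
     * 
     * @param outputFile the name of the output file, or null for console output.
     * @param inputFiles the names of the N-Quads input files.
     */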
    public void process(String outputFile, String[] inputFiles) {

        // sanity check, output file should not be listed as input file
        for (String inputFile : inputFiles) {
            if (inputFile.equals(outputFile)) {
                System.err.println("output file must not overwrite input file");
                return;
            }
        }

        long start = System.currentTimeMillis();

        try {
            // prepare output file
            this.writer = new CompactBNodeTurtleWriter(getOutputStream(outputFile));
            writer.startRDF();
            writer.handleNamespace("void", "http://rdfs.org/ns/void#");
            writer.handleNamespace("rdf", RDF.NAMESPACE);
            writer.handleNamespace("rdfs", RDFS.NAMESPACE);

            // process all input files
            for (String input : inputFiles) {
                process(input);
            }

            writer.endRDF();

        } catch (RDFHandlerException e) {
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("cannot write " + e.getMessage());
        }

        System.out.println("time elapsed: " + ((System.currentTimeMillis() - start) / 1000) + " seconds.");
    }

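    /**
     * Parses a single N-Quads input file and updates the statistics counters.
     * The quads must be ordered by context: whenever the context changes, the
     * statistics of the previous context are written out via postProcess().
     * 
     * @param input the name of the input file.
     */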
    private void process(String input) {

        System.out.println("processing " + input);

        try {
            InputStream in = getInputStream(input);
            NxParser parser = new NxParser(in);

            Node[] quad = null;
            while (parser.hasNext()) {
                quad = parser.next();

                totalTripleCount++;

                Node ctx = quad[3];

                // check context order consistency
                if (isUnorderedContext(ctx)) {
                    System.err.println("aborting: " + input + " is not ordered by context (line " + totalTripleCount
                            + ", ctx=" + ctx + ")");
                    System.exit(1);
                }

                // check if context differs from last context
                if (!ctx.equals(lastContext)) {
                    try {
                        postProcess(lastContext);
                    } catch (RDFHandlerException e) {
                        // writing the previous context's statistics failed; report and continue
                        e.printStackTrace();
                    }
                    addContext(ctx);
                }

                handleStatement(quad[0], quad[1], quad[2]);
            }

            in.close();
        } catch (IOException e) {
            System.err.println("cannot read " + e.getMessage());
        }
    }

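    /**
     * Updates the per-context counters for a single triple: counts the
     * predicate occurrence and, for rdf:type triples, the object (class).
     */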
    private void handleStatement(Node s, Node p, Node o) {
        tripleCount++;

        // build predicate map
        Integer pID = getPredicateID(p);
        predCount.add(pID);

        // test if rdf:type
        if (p.toString().equals(RDF.TYPE.toString())) {
            typeCount.add(o);
        }

        // TODO: record the predicate ID per subject and object to enable
        // distinct-subject/object counts (currently always reported as 0)
    }

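    /**
     * Returns the numeric ID for the given predicate, assigning the next
     * free ID (current map size + 1) on first occurrence.
     */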
    private Integer getPredicateID(Node p) {
        Integer pID = pMap.get(p);
        if (pID == null) {
            pID = pMap.size() + 1;
            pMap.put(p, pID);
        }
        return pID;
    }

    // unused stub (never called); presumably intended for the per-subject/object
    // predicate bookkeeping noted in handleStatement()
    private void addPredID() {
    }

    /**
     * Checks if the data is not ordered by context.
     * 
     * @param context the current context.
     * @return true if not ordered by context; false otherwise.
     */
    private boolean isUnorderedContext(Node context) {
        return (!context.equals(lastContext) && contexts.contains(context));
    }

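    /**
     * Writes the VoID statistics collected for the given context (triple
     * count, number of properties, property and class partitions) and
     * resets the per-context counters.
     * 
     * @param context the current context, or null if no context was processed yet.
     */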
    private void postProcess(Node context) throws RDFHandlerException {
        if (context == null)
            return; // nothing to do

        URI dataset = vf.createURI(context.toString());

        // general void information
        writer.handleStatement(vf.createStatement(dataset, RDF.TYPE, DATASET));
        writer.handleStatement(vf.createStatement(dataset, TRIPLES, vf.createLiteral(String.valueOf(tripleCount))));
        writer.handleStatement(
                vf.createStatement(dataset, PROPERTIES, vf.createLiteral(String.valueOf(predCount.size()))));

        List<Node> keys = new ArrayList<Node>(pMap.keySet());
        Collections.sort(keys);
        for (Node n : keys) {
            try {
                URI predicate = vf.createURI(n.toString());
                writePredicateStatToVoid(dataset, predicate, predCount.countMap.get(pMap.get(n)), 0, 0);
            } catch (IllegalArgumentException e) {
                System.err.println("bad predicate: " + e.getMessage());
                continue;
            }
        }

        keys = new ArrayList<Node>(typeCount.countMap.keySet());
        Collections.sort(keys);
        for (Node n : keys) {
            try {
                URI type = vf.createURI(n.toString());
                writeTypeStatToVoid(dataset, type, typeCount.countMap.get(n));
            } catch (IllegalArgumentException e) {
                System.err.println("bad type: " + e.getMessage());
                continue;
            }
        }

        //      writer.handleStatement(vf.createStatement(dataset, vf.createURI(VOID2.classes.toString()), vf.createLiteral(String.valueOf(typeCountMap.size()))));
        //      writer.handleStatement(vf.createStatement(dataset, vf.createURI(VOID2.entities.toString()), vf.createLiteral(String.valueOf(entityCount))));

        //      System.out.println("Context [" + contextCount + "] " + context + " has " + predCount.size() + " distinct predicates, " + tripleCount + " triples.");

        // reset the per-context counters and maps (including the type counter,
        // so class counts do not leak into the next context)
        tripleCount = 0;
        predCount = new Counter<Integer>();
        typeCount = new Counter<Node>();
        pMap = new HashMap<Node, Integer>();
    }

    private void addContext(Node ctx) {
        contexts.add(ctx);
        lastContext = ctx;
        contextCount++;
    }

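    /**
     * Opens the output stream: System.out if no file name is given,
     * wrapped in a GZIP stream if the file name ends with '.gz'.
     */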
    public OutputStream getOutputStream(String file) throws IOException {
        if (file == null)
            return System.out;

        // TODO: check if file already exists and should be overwritten

        OutputStream out = new FileOutputStream(file);
        if (file.endsWith(".gz")) {
            out = new GZIPOutputStream(out);
        }
        return out;
    }

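    /**
     * Opens the input stream: System.in if no file name is given,
     * wrapped in a GZIP stream if the file name ends with '.gz'.
     */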
    public InputStream getInputStream(String file) throws IOException {
        if (file == null)
            return System.in;

        InputStream in = new FileInputStream(file);
        if (file.endsWith(".gz")) {
            in = new GZIPInputStream(in);
        }
        return in;
    }

    // --------------------------------------------------------------

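    /**
     * Writes a void:propertyPartition for one predicate, with its triple
     * count and the distinct subject/object counts (currently always 0,
     * see the TODO in handleStatement()).
     */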
    private void writePredicateStatToVoid(URI dataset, URI predicate, long pCount, int distS, int distO) {
        BNode propPartition = vf.createBNode();
        Literal count = vf.createLiteral(String.valueOf(pCount));
        Literal distinctS = vf.createLiteral(String.valueOf(distS));
        Literal distinctO = vf.createLiteral(String.valueOf(distO));
        try {
            writer.handleStatement(
                    vf.createStatement(dataset, vf.createURI(VOID2.propertyPartition.toString()), propPartition));
            writer.handleStatement(
                    vf.createStatement(propPartition, vf.createURI(VOID2.property.toString()), predicate));
            writer.handleStatement(
                    vf.createStatement(propPartition, vf.createURI(VOID2.triples.toString()), count));
            writer.handleStatement(
                    vf.createStatement(propPartition, vf.createURI(VOID2.distinctSubjects.toString()), distinctS));
            writer.handleStatement(
                    vf.createStatement(propPartition, vf.createURI(VOID2.distinctObjects.toString()), distinctO));
        } catch (RDFHandlerException e) {
            // writing the property partition failed; report and skip it
            e.printStackTrace();
        }
    }

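    /**
     * Writes a void:classPartition for one class (rdf:type object) with
     * the number of its instances (void:entities).
     */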
    private void writeTypeStatToVoid(URI dataset, Value type, long tCount) {
        BNode classPartition = vf.createBNode();
        Literal count = vf.createLiteral(String.valueOf(tCount));
        try {
            writer.handleStatement(
                    vf.createStatement(dataset, vf.createURI(VOID2.classPartition.toString()), classPartition));
            writer.handleStatement(vf.createStatement(classPartition, vf.createURI(VOID2.clazz.toString()), type));
            writer.handleStatement(
                    vf.createStatement(classPartition, vf.createURI(VOID2.entities.toString()), count));
        } catch (RDFHandlerException e) {
            // writing the class partition failed; report and skip it
            e.printStackTrace();
        }
    }

    // --------------------------------------------------------------

    /**
     * Simple occurrence counter.
     * 
     * @param <T> the type of the counted items.
     */
    class Counter<T> {

        Map<T, Integer> countMap = new HashMap<T, Integer>();

        public void add(T item) {
            Integer count = countMap.get(item);
            if (count == null)
                countMap.put(item, 1);
            else
                countMap.put(item, count + 1);
        }

        public int size() {
            return countMap.size();
        }

    }

}
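
Usage

The tool can be run from the command line as described by the USAGE string in the source; the file names below are hypothetical examples:

NXVoidGenerator -o stats.ttl -i dump1.nq.gz dump2.nq.gz

A minimal programmatic sketch of the same run (assuming NXVoidGenerator is on the classpath and the hypothetical input files are N-Quads dumps ordered by context):

import de.uni_koblenz.west.splendid.tools.NXVoidGenerator;

public class NXVoidGeneratorExample {
    public static void main(String[] args) {
        // write VoID statistics for two context-ordered N-Quads dumps to
        // stats.ttl; passing null as the output file prints to the console
        new NXVoidGenerator().process("stats.ttl",
                new String[] { "dump1.nq.gz", "dump2.nq.gz" });
    }
}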