de.uni_koblenz.west.splendid.tools.NQuadSourceAggregator.java Source code

Introduction

Here is the source code for de.uni_koblenz.west.splendid.tools.NQuadSourceAggregator.java
Source

/*
 * This file is part of RDF Federator.
 * Copyright 2010 Olaf Goerlitz
 * 
 * RDF Federator is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * RDF Federator is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with RDF Federator.  If not, see <http://www.gnu.org/licenses/>.
 * 
 * RDF Federator uses libraries from the OpenRDF Sesame Project licensed 
 * under the Aduna BSD-style license. 
 */
package de.uni_koblenz.west.splendid.tools;

import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.semanticweb.yars.nx.Node;
import org.semanticweb.yars.nx.Resource;
import org.semanticweb.yars.nx.parser.NxParser;

/**
 * The source information (context URI) in NQuads, with several subdomains and
 * long path fragments, is reduced to a common minimal host name by completely
 * omitting the path information and truncating the first subdomain.
 * 
 * @author goerlitz@uni-koblenz.de
 */
public class NQuadSourceAggregator {

    static final String USAGE = "NQuadSourceAggregator [-h] -o <outfile> -i <infile> [<infile2> ...]";
    static final String LINE_SEP = System.getProperty("line.separator");

    /** File name suffix that selects GZIP (de)compression for input and output files. */
    private static final String GZIP_SUFFIX = ".gz";

    /** Character encoding used for the generated output. */
    private static final String OUTPUT_CHARSET = "UTF-8";

    /** Size (in chars) of the output buffer. */
    private static final int WRITE_BUFFER_SIZE = 50000;

    static final Options OPTIONS = new Options();
    static final Option HELP = new Option("h", "help", false, "print this message");
    static final Option OUTPUT_FILE = OptionBuilder.hasArg().withArgName("outfile")
            .withDescription(
                    "use given file for output (append .gz for Gzipped output); defaults to console output")
            .create("o");

    static final Option INPUT_FILES = OptionBuilder.hasArg().withArgName("infiles").hasArgs()
            .withDescription("use given files for input (append .gz for Gzipped input); defaults to console input")
            .create("i");

    /** Counts how often each reduced context was written. */
    private final Counter<Resource> ctxCounter = new Counter<Resource>();

    static {
        OPTIONS.addOption(HELP);
        OPTIONS.addOption(OUTPUT_FILE);
        OPTIONS.addOption(INPUT_FILES);
    }

    /**
     * Command line entry point.
     * <p>
     * Input files are taken from option <code>-i</code> or, if that option is
     * absent, from the remaining command line arguments. The output file is
     * taken from option <code>-o</code>; without it the quads are written to
     * the console. Exits with status 1 if no input file is given.
     * 
     * @param args the command line arguments.
     */
    public static void main(String[] args) {

        try {
            // parse the command line arguments
            CommandLineParser parser = new GnuParser();
            CommandLine cmd = parser.parse(OPTIONS, args);

            // print help message
            if (cmd.hasOption("h") || cmd.hasOption("help")) {
                new HelpFormatter().printHelp(USAGE, OPTIONS);
                System.exit(0);
            }

            // get input files (from option -i or all remaining parameters)
            String[] inputFiles = cmd.getOptionValues("i");
            if (inputFiles == null) {
                inputFiles = cmd.getArgs();
            }
            if (inputFiles.length == 0) {
                System.out.println("need at least one input file.");
                new HelpFormatter().printUsage(new PrintWriter(System.out, true), 80, USAGE);
                System.exit(1);
            }
            String outputFile = cmd.getOptionValue("o");

            // process all input files
            new NQuadSourceAggregator().process(outputFile, inputFiles);

        } catch (ParseException exp) {
            // print parse error and display usage message
            System.out.println(exp.getMessage());
            new HelpFormatter().printUsage(new PrintWriter(System.out, true), 80, USAGE, OPTIONS);
        }
    }

    // --------------------------------------------------------------

    /**
     * Rewrites the context of all quads found in the input files and writes
     * the result to the output file.
     * <p>
     * An input file that cannot be opened is reported on the error stream and
     * skipped. A failure while writing the output aborts the run. Finally the
     * elapsed time and the number of distinct contexts are reported.
     * 
     * @param outputFile the file to write to; <code>null</code> selects the
     *                   console.
     * @param inputFiles the files to read the quads from.
     */
    public void process(String outputFile, String[] inputFiles) {

        // sanity check, output file should not be listed as input file
        for (String inputFile : inputFiles) {
            if (inputFile.equals(outputFile)) {
                System.err.println("output file must not overwrite input file");
                return;
            }
        }

        long start = System.currentTimeMillis();
        boolean toConsole = (outputFile == null);

        Writer writer = null;
        try {
            writer = new BufferedWriter(
                    new OutputStreamWriter(getOutputStream(outputFile), OUTPUT_CHARSET), WRITE_BUFFER_SIZE);

            // handle all input streams
            for (String input : inputFiles) {
                InputStream in;
                try {
                    in = getInputStream(input);
                } catch (IOException e) {
                    // a broken input must not prevent processing of the others
                    System.err.println("cannot process " + e.getMessage());
                    continue;
                }
                try {
                    process(in, writer);
                } finally {
                    closeInput(in);
                }
            }
        } catch (IOException e) {
            // output cannot be opened or written: continuing is pointless
            System.err.println("cannot write output: " + e.getMessage());
        } finally {
            finishOutput(writer, toConsole);
        }

        // keep the summary out of the quad data when the quads go to the console
        java.io.PrintStream report = toConsole ? System.err : System.out;
        report.println("time elapsed: " + ((System.currentTimeMillis() - start) / 1000) + " seconds.");
        report.println("reduced to " + ctxCounter.size() + " contexts.");
    }

    /**
     * Reads all quads from the stream, replaces each context by
     * <code>http://&lt;reduced host name&gt;</code> and writes the quad.
     * <p>
     * Quads which cannot be rewritten (fewer than four nodes, context is not a
     * valid URI, context URI has no host name) are reported on the error
     * stream and skipped.
     * 
     * @param in     the stream to read the quads from.
     * @param writer the writer receiving the rewritten quads.
     * @throws IOException if writing a quad fails.
     */
    private void process(InputStream in, Writer writer) throws IOException {

        NxParser parser = new NxParser(in);

        while (parser.hasNext()) {
            Resource context;
            String line;
            try {
                Node[] quad = parser.next();
                if (quad.length < 4) {
                    System.err.println("ERROR: expected a quad but found " + quad.length + " nodes");
                    continue;
                }

                // ignore path information, just consider host name
                String host = new URI(quad[3].toString()).getHost();
                if (host == null) {
                    // URI.getHost() yields null if the URI defines no host
                    System.err.println("Invalid URI: no host name in " + quad[3].toString());
                    continue;
                }

                if (!isIPAddress(host)) {
                    host = truncateHostName(host);
                }

                context = new Resource("http://" + host);
                quad[3] = context;

                StringBuilder buffer = new StringBuilder();
                for (int i = 0; i < 4; i++) {
                    buffer.append(quad[i].toN3()).append(" ");
                }
                line = buffer.append(".").append(LINE_SEP).toString();

            } catch (URISyntaxException e) {
                System.err.println("Invalid URI: " + e.getMessage());
                continue;
            } catch (Exception e) {
                // a single malformed quad must not stop the whole run
                System.err.println("ERROR: " + e.getClass() + " - " + e.getMessage());
                continue;
            }

            // written outside of the try block: an I/O failure must not be
            // mistaken for a malformed quad and repeated for every quad
            ctxCounter.add(context);
            writer.write(line);
        }
    }

    /**
     * Closes an input stream, leaving the console input open. A failure is
     * reported on the error stream only.
     * 
     * @param in the stream to close.
     */
    private void closeInput(InputStream in) {
        if (in == System.in) {
            return;
        }
        try {
            in.close();
        } catch (IOException e) {
            System.err.println("cannot process " + e.getMessage());
        }
    }

    /**
     * Completes the output. A file is closed, whereas the console is only
     * flushed, because closing the writer would close <code>System.out</code>
     * as well.
     * 
     * @param writer    the writer to finish; may be <code>null</code> if it
     *                  could not be created.
     * @param toConsole true if the writer wraps the console output.
     */
    private void finishOutput(Writer writer, boolean toConsole) {
        if (writer == null) {
            return;
        }
        try {
            if (toConsole) {
                writer.flush();
            } else {
                writer.close();
            }
        } catch (IOException e) {
            System.err.println("cannot write output: " + e.getMessage());
        }
    }

    /**
     * Checks if the host name is an IP address, i.e. if it consists of digits
     * and dots only. The empty string is accepted as well.
     * 
     * @param host the host name to check; must not be <code>null</code>.
     * @return true if the host name is an IP address; false otherwise.
     */
    private boolean isIPAddress(String host) {
        for (char c : host.toCharArray()) {
            if (!Character.isDigit(c) && c != '.') {
                return false;
            }
        }
        return true;
    }

    /**
     * Removes the first subdomain from host name. The host name is returned
     * unchanged if it has fewer than three dot separated parts or if it is a
     * domain name directly below <code>.co.uk</code>.
     * 
     * @param host the host name to truncate.
     * @return the updated host name.
     */
    private String truncateHostName(String host) {
        // find dot separator after first subdomain name
        int firstDot = host.indexOf(".");
        if (firstDot > 0) {
            // check for '.co.uk' TLD, don't remove domain name 
            if (host.endsWith("co.uk") && (host.length() - firstDot) == 6) {
                return host;
            }
            // check if there is still a domain name left
            if (host.indexOf(".", firstDot + 1) > -1) {
                host = host.substring(firstDot + 1);
            }
        }
        return host;
    }

    /**
     * Opens the output stream for the given file. An existing file is
     * overwritten.
     * 
     * @param file the file to write to; <code>null</code> selects the console.
     *             A name ending with <code>.gz</code> selects GZIP compression.
     * @return the output stream.
     * @throws IOException if the file cannot be opened for writing.
     */
    public OutputStream getOutputStream(String file) throws IOException {
        if (file == null) {
            return System.out;
        }

        // TODO: check if file already exists and should be overwritten

        OutputStream out = new FileOutputStream(file);
        if (file.endsWith(GZIP_SUFFIX)) {
            try {
                out = new GZIPOutputStream(out);
            } catch (IOException e) {
                // don't leak the file handle if the GZIP header cannot be written
                try {
                    out.close();
                } catch (IOException ignored) {
                    // the original failure is the one worth reporting
                }
                throw e;
            }
        }
        return out;
    }

    /**
     * Opens the input stream for the given file.
     * 
     * @param file the file to read from; <code>null</code> selects the console.
     *             A name ending with <code>.gz</code> selects GZIP
     *             decompression.
     * @return the input stream.
     * @throws IOException if the file cannot be opened for reading or if a
     *                     <code>.gz</code> file is not in GZIP format.
     */
    public InputStream getInputStream(String file) throws IOException {
        if (file == null) {
            return System.in;
        }

        InputStream in = new FileInputStream(file);
        if (file.endsWith(GZIP_SUFFIX)) {
            try {
                in = new GZIPInputStream(in);
            } catch (IOException e) {
                // don't leak the file handle if the GZIP header is invalid
                try {
                    in.close();
                } catch (IOException ignored) {
                    // the original failure is the one worth reporting
                }
                throw e;
            }
        }
        return in;
    }

    // --------------------------------------------------------------

    /**
     * Simple counting class. Keeps the number of occurrences for each distinct
     * item; items are distinguished by the rules of {@link HashMap} keys.
     * 
     * @param <T> the type of the counted items.
     */
    class Counter<T> {

        Map<T, Integer> countMap = new HashMap<T, Integer>();

        /**
         * Increases the count of the item by one.
         * 
         * @param item the item to count.
         */
        public void add(T item) {
            Integer count = countMap.get(item);
            if (count == null) {
                countMap.put(item, 1);
            } else {
                countMap.put(item, count + 1);
            }
        }

        /**
         * @return the number of distinct items counted so far.
         */
        public int size() {
            return countMap.size();
        }

    }

}