com.dumontierlab.pdb2rdf.Pdb2Rdf.java Source code

Java tutorial

Introduction

Here is the source code for com.dumontierlab.pdb2rdf.Pdb2Rdf.java

Source

/**
 * Copyright (c) 2009 Dumontierlab
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package com.dumontierlab.pdb2rdf;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
import org.apache.log4j.Logger;
import org.xml.sax.InputSource;

import com.dumontierlab.pdb2rdf.dao.VirtuosoDaoFactory;
import com.dumontierlab.pdb2rdf.model.PdbRdfModel;
import com.dumontierlab.pdb2rdf.model.VirtPdbRdfModel;
import com.dumontierlab.pdb2rdf.parser.DetailLevel;
import com.dumontierlab.pdb2rdf.parser.PdbXmlParser;
import com.dumontierlab.pdb2rdf.parser.vocabulary.PdbOwlVocabulary;
import com.dumontierlab.pdb2rdf.parser.vocabulary.uri.Bio2RdfPdbUriPattern;
import com.dumontierlab.pdb2rdf.parser.vocabulary.uri.UriBuilder;
import com.dumontierlab.pdb2rdf.util.ClusterIterator;
import com.dumontierlab.pdb2rdf.util.ConsoleProgressMonitorImpl;
import com.dumontierlab.pdb2rdf.util.DirectoryIterator;
import com.dumontierlab.pdb2rdf.util.FileIterator;
import com.dumontierlab.pdb2rdf.util.InputInterator;
import com.dumontierlab.pdb2rdf.util.Pdb2RdfInputIterator;
import com.dumontierlab.pdb2rdf.util.Pdb2RdfInputIteratorAdapter;
import com.dumontierlab.pdb2rdf.util.PdbsIterator;
import com.dumontierlab.pdb2rdf.util.ProgressMonitor;
import com.dumontierlab.pdb2rdf.util.Statistics;
import com.hp.hpl.jena.query.Dataset;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.RDFWriter;
import com.hp.hpl.jena.rdf.model.RDFWriterF;
import com.hp.hpl.jena.rdf.model.impl.RDFWriterFImpl;
import com.hp.hpl.jena.shared.NoWriterForLangException;
import com.hp.hpl.jena.tdb.TDBFactory;

/**
 * @author Jose Cruz-Toledo
 * @author Alexander De Leon
 */
public class Pdb2Rdf {

    static final Logger LOG = Logger.getLogger(Pdb2Rdf.class);

    private static final String STATSFILE_NAME = "pdb2rdf-stats.txt";

    /**
     * Command-line entry point.
     * <p>
     * Parses the arguments and runs exactly one mode, checked in this order:
     * {@code -statsFromRDF}, {@code -load}, {@code -ontology}, and otherwise the
     * RDF conversion. When {@code -stats} is given, a statistics map is passed
     * to the selected mode and written out afterwards.
     * <p>
     * {@code -help} prints the usage text and returns without running any mode.
     * Unparseable arguments print the usage text and exit with status 1.
     *
     * @param args
     *            raw command-line arguments
     */
    public static void main(String[] args) {
        Options options = createOptions();
        CommandLineParser parser = createCliParser();
        try {
            CommandLine cmd = parser.parse(options, args);
            if (cmd.hasOption("help")) {
                // Help was explicitly requested: show it and stop instead of
                // falling through into the processing modes below.
                printUsage();
                return;
            }
            Map<String, Double> stats = null;
            if (cmd.hasOption("stats")) {
                stats = new HashMap<String, Double>();
            }

            if (cmd.hasOption("statsFromRDF")) {
                generateStatsFromRDF(cmd);
            } else if (cmd.hasOption("load")) {
                load(cmd, stats);
            } else if (cmd.hasOption("ontology")) {
                printOntology();
            } else {
                printRdf(cmd, stats);
            }
            if (stats != null) {
                try {
                    outputStats(cmd, stats);
                } catch (FileNotFoundException e) {
                    // Statistics are a by-product; failing to write them must
                    // not turn an otherwise successful run into a failure.
                    LOG.warn("Unable to write statistics file", e);
                }
            }

        } catch (ParseException e) {
            // Keep the cause so the user can see which option was rejected.
            LOG.fatal("Unable understand your command.", e);
            printUsage();
            System.exit(1);
        }

    }

    /**
     * Builds statistics from existing RDF files and writes them to the
     * statistics file.
     * <p>
     * Every input found in the directory given by {@code -dir} is read into a
     * fresh in-memory model and its statistics are merged into one map. An
     * input that cannot be read is logged and skipped. Exits with status 1 when
     * {@code -dir} is missing or when the run fails with an I/O error.
     *
     * @param cmd
     *            parsed command line; {@code -dir} is required, {@code -gzip}
     *            is passed on to the directory iterator
     */
    private static void generateStatsFromRDF(CommandLine cmd) {
        String dir = cmd.getOptionValue("dir");
        if (dir == null) {
            LOG.fatal("Need to specify -dir with -statsFromRDF");
            System.exit(1);
        }
        Map<String, Double> stats = new HashMap<String, Double>();
        Statistics statistics = new Statistics();
        try {
            InputInterator input = new DirectoryIterator(new File(dir), cmd.hasOption("gzip"));
            while (input.hasNext()) {
                try {
                    Model model = ModelFactory.createDefaultModel();
                    model.read(input.next(), "");
                    statistics.mergeStats(statistics.getStatistics(model), stats);
                } catch (Exception e) {
                    // Best effort: one bad file must not abort the whole run.
                    LOG.warn("Fail to read input file", e);
                }
            }
            outputStats(cmd, stats);
        } catch (IOException e) {
            // This handler also covers a failure to write the statistics file
            // (outputStats), so the message must not claim a read error, and
            // the cause is logged so the two cases can be told apart.
            LOG.fatal("Unable to generate statistics from files in " + dir, e);
            System.exit(1);
        }
    }

    /**
     * Merges the statistics of one model into the shared statistics map.
     * <p>
     * This method is called from the worker threads of the conversion and load
     * pools, all of which share the same {@code stats} map. The merge is
     * therefore done while holding the map's monitor so that concurrent workers
     * cannot corrupt it. Failures are logged and never propagated.
     *
     * @param stats
     *            shared accumulator, updated in place
     * @param model
     *            model whose statistics are added; may be {@code null}, in
     *            which case the failure is logged with a {@code null} id
     */
    private static void updateStats(Map<String, Double> stats, PdbRdfModel model) {
        Statistics statsFactory = new Statistics();
        try {
            // The caller creates the map as a plain HashMap; serialize all
            // updates on it. The lock is taken inside the try so that a null
            // map is still reported through the error log below.
            synchronized (stats) {
                statsFactory.mergeStats(statsFactory.getStatistics(model), stats);
            }
        } catch (Exception e) {
            String id = null;
            if (model != null) {
                id = model.getPdbId();
            }
            LOG.error("Unable to count statistics for PDB: " + id, e);
        }
    }

    /**
     * Writes the collected statistics, one {@code key: value} pair per line, to
     * the statistics file. The file is placed in the output directory when one
     * was given on the command line, otherwise it is resolved against the
     * current working directory.
     *
     * @param cmd
     *            parsed command line, used to locate the output directory
     * @param stats
     *            statistics to write
     * @throws FileNotFoundException
     *             if the statistics file cannot be opened for writing
     */
    private static void outputStats(CommandLine cmd, Map<String, Double> stats) throws FileNotFoundException {
        final File targetDir = getOutputDirectory(cmd);
        final File report = (targetDir == null) ? new File(STATSFILE_NAME) : new File(targetDir, STATSFILE_NAME);

        final PrintWriter writer = new PrintWriter(report);
        try {
            for (String name : stats.keySet()) {
                writer.println(name + ": " + stats.get(name));
            }
            writer.flush();
        } finally {
            writer.close();
        }
    }

    /**
     * Writes the ontology returned by {@code PdbOwlVocabulary.getOntology()} to
     * standard output.
     */
    private static void printOntology() {
        final OutputStream stdout = System.out;
        PdbOwlVocabulary.getOntology().write(stdout);
    }

    /**
     * Convenience overload that runs the conversion without collecting
     * statistics.
     *
     * @param cmd
     *            parsed command line
     */
    @SuppressWarnings("unused")
    private static void printRdf(final CommandLine cmd) {
        final Map<String, Double> noStats = null;
        printRdf(cmd, noStats);
    }

    /**
     * Converts every input to RDF and writes the result.
     * <p>
     * With an output directory, each model is written gzip-compressed to
     * {@code <out>/<characters 1-2 of the PDB id>/<PDB id>.rdf.gz} and inputs
     * are processed on the shared thread pool. Without one, everything is
     * written to standard output on a single worker thread so that documents
     * are not interleaved.
     * <p>
     * When the requested format is NQUADS the model is serialized as N-Quads
     * under its dataset resource; otherwise the writer chosen by
     * {@link #getWriter(CommandLine)} is used. A failure on one input is logged
     * and does not stop the remaining inputs.
     *
     * @param cmd
     *            parsed command line
     * @param stats
     *            shared statistics accumulator, or {@code null} to skip
     *            statistics
     */
    private static void printRdf(final CommandLine cmd, final Map<String, Double> stats) {
        final File outDir = getOutputDirectory(cmd);
        final RDFWriter writer = getWriter(cmd);

        // Validate the detail level once, up front, instead of inside every
        // worker task (where a bad value would call System.exit from a pool
        // thread after work had already started).
        DetailLevel requestedLevel = null;
        if (cmd.hasOption("detailLevel")) {
            try {
                requestedLevel = Enum.valueOf(DetailLevel.class, cmd.getOptionValue("detailLevel"));
            } catch (IllegalArgumentException e) {
                LOG.fatal("Invalid argument value for detailLevel option", e);
                System.exit(1);
            }
        }
        final DetailLevel detailLevel = requestedLevel;

        // Null-safe: getOptionValue returns null when -format is absent or was
        // given without a value.
        final boolean nquads = "NQUADS".equalsIgnoreCase(cmd.getOptionValue("format"));

        final ProgressMonitor monitor = getProgressMonitor();
        Pdb2RdfInputIterator i = processInput(cmd);
        final int inputSize = i.size();
        final AtomicInteger progressCount = new AtomicInteger();
        ExecutorService pool = null;
        if (outDir != null) {
            pool = getThreadPool(cmd);
        } else {
            // if output is going to the STDOUT then we need to do process in
            // sequential mode.
            pool = Executors.newSingleThreadExecutor();
        }

        final Object lock = new Object();

        while (i.hasNext()) {
            final InputSource input = i.next();
            pool.execute(new Runnable() {
                @Override
                public void run() {
                    OutputStream out = System.out;
                    PdbXmlParser parser = new PdbXmlParser();
                    PdbRdfModel model = null;
                    try {
                        if (detailLevel != null) {
                            model = parser.parse(input, new PdbRdfModel(), detailLevel);
                        } else {
                            model = parser.parse(input, new PdbRdfModel());
                        }
                        // add the input file information
                        model.addInputFileInformation();
                        // add the outputFile information();
                        model.addRDFFileInformation();
                        if (outDir != null) {
                            File directory = new File(outDir, model.getPdbId().substring(1, 3));
                            // Several workers may need the same sub-directory.
                            synchronized (lock) {
                                if (!directory.exists()) {
                                    directory.mkdir();
                                }
                            }
                            File file = new File(directory, model.getPdbId() + ".rdf.gz");
                            out = new GZIPOutputStream(new FileOutputStream(file));
                        }

                        if (nquads) {
                            Dataset ds = TDBFactory.createDataset();
                            try {
                                ds.addNamedModel(model.getDatasetResource().toString(), model);
                                StringWriter sw = new StringWriter();
                                RDFDataMgr.write(sw, ds, Lang.NQUADS);
                                out.write(sw.toString().getBytes(Charset.forName("UTF-8")));
                            } finally {
                                ds.close();
                            }
                        } else {
                            // Only one serialization per document: writing the
                            // default RDF/XML after the N-Quads would leave a
                            // stream that neither parser can read.
                            writer.write(model, out, null);
                        }

                        if (stats != null) {
                            updateStats(stats, model);
                        }
                        if (monitor != null) {
                            monitor.setProgress(progressCount.incrementAndGet(), inputSize);
                        }

                    } catch (Exception e) {
                        String id = null;
                        if (model != null) {
                            id = model.getPdbId();
                        }
                        LOG.error("Unable to parse input for PDB: " + id, e);
                    } finally {
                        try {
                            if (out == System.out) {
                                // Never close the process-wide stdout: later
                                // documents still have to be written to it.
                                out.flush();
                            } else {
                                out.close();
                            }
                        } catch (IOException e) {
                            LOG.error("Unable to close output stream", e);
                        }
                    }
                }
            });
        }
        pool.shutdown();
        while (!pool.isTerminated()) {
            try {
                pool.awaitTermination(1, TimeUnit.SECONDS);
            } catch (InterruptedException e) {
                // Stop waiting but keep the interrupt visible to the caller.
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

    /**
     * Selects the RDF writer for the {@code -format} option.
     * <p>
     * RDF/XML is used when the option is absent, when it was given without a
     * value, or when the value is NQUADS (N-Quads output does not go through
     * this writer). Any other value is looked up in the writer factory; an
     * unknown format is reported and the program exits with status 1.
     *
     * @param cmd
     *            parsed command line
     * @return the writer to use for model serialization, never {@code null}
     */
    private static RDFWriter getWriter(CommandLine cmd) {
        RDFWriterF writerFactory = new RDFWriterFImpl();
        RDFWriter writer = writerFactory.getWriter("RDF/XML");
        // getOptionValue returns null both when -format is absent and when it
        // was given without a value (its argument is declared optional).
        String format = cmd.getOptionValue("format");
        if (format != null && !"NQUADS".equalsIgnoreCase(format)) {
            try {
                writer = writerFactory.getWriter(format);
            } catch (NoWriterForLangException e) {
                // Report through the logger like every other fatal error in
                // this class, and signal failure with a non-zero exit status.
                LOG.fatal("Invalid format option selected!", e);
                System.exit(1);
            }
        }
        return writer;
    }

    /**
     * Resolves the directory given by the {@code -out} option.
     *
     * @param cmd
     *            parsed command line
     * @return the output directory, or {@code null} when {@code -out} was not
     *         given; the program exits with status 1 when the given path is not
     *         a directory
     */
    private static File getOutputDirectory(CommandLine cmd) {
        if (!cmd.hasOption("out")) {
            return null;
        }
        final File candidate = new File(cmd.getOptionValue("out"));
        if (candidate.isDirectory()) {
            return candidate;
        }
        LOG.fatal("The out paramater must specify a directory");
        System.exit(1);
        return candidate;
    }

    /**
     * Convenience overload that loads the input without collecting statistics.
     *
     * @param cmd
     *            parsed command line
     */
    private static void load(CommandLine cmd) {
        final Map<String, Double> noStats = null;
        load(cmd, noStats);
    }

    /**
     * Parses every input directly into a Virtuoso-backed model.
     * <p>
     * Connection settings come from the {@code username}, {@code password},
     * {@code host} and {@code port} options and default to
     * {@code dba}/{@code dba} on {@code localhost:1111}. Inputs are processed
     * on the shared thread pool; a failure on one input is logged and does not
     * stop the others. Exits with status 1 on an invalid {@code detailLevel} or
     * {@code port} value.
     * <p>
     * NOTE(review): the defaults are hard-coded credentials, and none of the
     * connection options are registered in {@code createOptions()} as far as
     * this file shows - confirm whether this mode is still meant to be
     * reachable from the command line.
     *
     * @param cmd
     *            parsed command line
     * @param stats
     *            shared statistics accumulator, or {@code null} to skip
     *            statistics
     */
    private static void load(CommandLine cmd, final Map<String, Double> stats) {
        DetailLevel detailLevel = null;
        if (cmd.hasOption("detailLevel")) {
            try {
                detailLevel = Enum.valueOf(DetailLevel.class, cmd.getOptionValue("detailLevel"));
            } catch (IllegalArgumentException e) {
                LOG.fatal("Invalid argument value for detailLevel option", e);
                System.exit(1);
            }
        }
        final DetailLevel f_detailLevel = detailLevel;

        // getOptionValue(opt, default) returns the default when the option is
        // absent, which replaces the hasOption/getOptionValue pairs.
        String username = cmd.getOptionValue("username", "dba");
        String password = cmd.getOptionValue("password", "dba");
        String host = cmd.getOptionValue("host", "localhost");
        int port = 1111;
        if (cmd.hasOption("port")) {
            try {
                port = Integer.parseInt(cmd.getOptionValue("port"));
            } catch (NumberFormatException e) {
                LOG.fatal("Invalid port number: " + cmd.getOptionValue("port"));
                System.exit(1);
            }
        }

        final VirtuosoDaoFactory factory = new VirtuosoDaoFactory(host, port, username, password);
        ExecutorService pool = getThreadPool(cmd);

        final ProgressMonitor monitor = getProgressMonitor();
        final Pdb2RdfInputIterator i = processInput(cmd);
        final int inputSize = i.size();
        final AtomicInteger progressCount = new AtomicInteger();

        if (monitor != null) {
            monitor.setProgress(0, inputSize);
        }

        while (i.hasNext()) {
            final InputSource input = i.next();
            pool.execute(new Runnable() {
                @Override
                public void run() {
                    PdbXmlParser parser = new PdbXmlParser();
                    UriBuilder uriBuilder = new UriBuilder();
                    PdbRdfModel model = null;
                    try {
                        model = new VirtPdbRdfModel(factory, Bio2RdfPdbUriPattern.PDB_GRAPH, uriBuilder,
                                factory.getTripleStoreDao());
                        if (f_detailLevel != null) {
                            parser.parse(input, model, f_detailLevel);
                        } else {
                            parser.parse(input, model);
                        }
                        if (stats != null) {
                            updateStats(stats, model);
                        }
                        if (monitor != null) {
                            monitor.setProgress(progressCount.incrementAndGet(), inputSize);
                        }

                    } catch (Exception e) {
                        LOG.error("Uanble to parse input for pdb=" + (model != null ? model.getPdbId() : "null"),
                                e);
                    }
                }
            });
        }
        pool.shutdown();
        while (!pool.isTerminated()) {
            try {
                pool.awaitTermination(1, TimeUnit.SECONDS);
            } catch (InterruptedException e) {
                // Stop waiting but keep the interrupt visible to the caller.
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

    /**
     * Creates the console progress monitor.
     *
     * @return the monitor, or {@code null} when it cannot be created (a warning
     *         is logged in that case)
     */
    private static ProgressMonitor getProgressMonitor() {
        ProgressMonitor consoleMonitor = null;
        try {
            consoleMonitor = new ConsoleProgressMonitorImpl();
        } catch (IOException e) {
            LOG.warn("Unable to create progress monitor");
        }
        return consoleMonitor;
    }

    /**
     * Prints the command-line syntax and the description of every registered
     * option.
     */
    private static void printUsage() {
        new HelpFormatter().printHelp("pdb2rdf [OPTIONS] [[PDB ID 1] [PDB ID 2] ...]", createOptions());
    }

    /**
     * Creates the parser used for the command line.
     *
     * @return a new {@code GnuParser}
     */
    private static CommandLineParser createCliParser() {
        final CommandLineParser gnuStyleParser = new GnuParser();
        return gnuStyleParser;
    }

    /**
     * Builds the input iterator from the command line.
     * <p>
     * The source is chosen in this order: {@code -file}, {@code -dir},
     * {@code -cluster}, and finally the PDB ids listed as plain arguments. The
     * program exits with status 1 when the chosen file or directory is not
     * accessible, when no source is given at all, or when creating the iterator
     * throws.
     *
     * @param cmd
     *            parsed command line
     * @return the iterator over the selected inputs
     */
    private static Pdb2RdfInputIterator processInput(CommandLine cmd) {
        final boolean compressed = cmd.hasOption("gzip");
        try {
            if (cmd.hasOption("file")) {
                final File inputFile = new File(cmd.getOptionValue("file"));
                final boolean readable = inputFile.exists() && inputFile.canRead();
                if (!readable) {
                    LOG.fatal("Cannot access file: " + inputFile);
                    System.exit(1);
                }
                return new Pdb2RdfInputIteratorAdapter(new FileIterator(inputFile, compressed));
            }

            if (cmd.hasOption("dir")) {
                final File inputDir = new File(cmd.getOptionValue("dir"));
                final boolean listable = inputDir.exists() && inputDir.canRead() && inputDir.canExecute();
                if (!listable) {
                    LOG.fatal("Cannot access directory: " + inputDir);
                    System.exit(1);
                }
                return new Pdb2RdfInputIteratorAdapter(new DirectoryIterator(inputDir, compressed));
            }

            if (cmd.hasOption("cluster")) {
                return new ClusterIterator(cmd.getOptionValue("cluster"));
            }

            // No explicit source option: the remaining arguments are PDB ids.
            final String[] pdbIds = cmd.getArgs();
            if (pdbIds.length == 0) {
                LOG.fatal(
                        "You need to specified the file option, the dir option, or explicitly list the pdb ids.");
                printUsage();
                System.exit(1);
            }
            return new PdbsIterator(pdbIds);
        } catch (Exception e) {
            LOG.fatal(e);
            System.exit(1);
            return null;
        }
    }

    /**
     * Declares the command-line options understood by this tool.
     * <p>
     * NOTE(review): {@code main} and {@code load} also query the options
     * {@code load}, {@code username}, {@code password}, {@code host} and
     * {@code port}, none of which are registered here - confirm whether that
     * is intentional.
     *
     * @return the option set used for both parsing and the usage text
     */
    @SuppressWarnings("static-access")
    private static Options createOptions() {
        final Options cli = new Options();

        // General
        cli.addOption("help", false, "Print this message");
        cli.addOption(OptionBuilder.withArgName("RDF/XML|N-TRIPLE|N3|NQUADS").hasOptionalArgs(1)
                .withDescription("RDF output format (default: RDF/XMl)").hasArg(true).create("format"));

        // Input selection
        cli.addOption(OptionBuilder.withArgName("path").withDescription("Directory where input files are located")
                .hasArg(true).create("dir"));
        cli.addOption(OptionBuilder.withArgName("URL")
                .withDescription("URL of the cluster head where input will be acquired").hasArg(true)
                .create("cluster"));
        cli.addOption(OptionBuilder.withArgName("path").withDescription("Input file").hasArg(true).create("file"));
        cli.addOption("gzip", false, "Input is given as gzip file(s)");

        // Output and execution
        cli.addOption(OptionBuilder.withArgName("path")
                .withDescription("Directory where output RDF files will be created").hasArg(true).create("out"));
        cli.addOption("ontology", false, "Prints the ontology for the PDB namespace");
        cli.addOption(OptionBuilder.withArgName("number")
                .withDescription("Number of threads (default: number of processing units * 2)").hasArg(true)
                .create("threads"));

        // Statistics
        cli.addOption("stats", false,
                "Outputs statistics to file pdb2rdf-stats.txt (in output directory, if one is specified, or in the current directory otherwise)");
        cli.addOption("statsFromRDF", false,
                "Generates statistics from RDF files (located in the directory spefied by -dir). The stats are output to the file pdb2rdf-stats.txt (in output directory, if one is specified, or in the current directory otherwise)");

        // Level of detail of the generated RDF
        cli.addOption(OptionBuilder.hasArg(true)
                .withDescription("Specify detail level: COMPLETE | ATOM | RESIDUE | EXPERIMENT | METADATA ")
                .create("detailLevel"));
        return cli;
    }

    /**
     * Determines how many worker threads to use.
     * <p>
     * Defaults to the number of available processors and can be overridden
     * with {@code -threads}. The program exits with status 1 when the value is
     * not an integer or is smaller than 1.
     * <p>
     * NOTE(review): the {@code -threads} help text advertises a default of
     * "number of processing units * 2", but the default computed here is not
     * doubled - confirm which one is intended.
     *
     * @param cmd
     *            parsed command line
     * @return the thread count, always at least 1
     */
    private static int getNumberOfThreads(CommandLine cmd) {
        int numberOfThreads = Runtime.getRuntime().availableProcessors();
        if (cmd.hasOption("threads")) {
            try {
                numberOfThreads = Integer.parseInt(cmd.getOptionValue("threads"));
            } catch (NumberFormatException e) {
                LOG.fatal("Invalid number of threads", e);
                System.exit(1);
            }
        }
        if (numberOfThreads < 1) {
            // A pool cannot be created with zero or negative threads; fail
            // with a clear message instead of an IllegalArgumentException
            // from the executor constructor.
            LOG.fatal("Invalid number of threads: " + numberOfThreads);
            System.exit(1);
        }
        return numberOfThreads;
    }

    /**
     * Creates the fixed-size worker pool used for conversion and loading.
     * <p>
     * The pool has a work queue of capacity 1. When all workers are busy and
     * the queue is full, the submitting thread blocks until a queue slot
     * becomes free, so inputs are not read faster than they can be processed.
     * <p>
     * Submitting to a pool that has been shut down, or being interrupted while
     * waiting for a slot, raises
     * {@link java.util.concurrent.RejectedExecutionException}.
     *
     * @param cmd
     *            parsed command line, used to determine the pool size
     * @return the executor; the caller is responsible for shutting it down
     */
    private static ExecutorService getThreadPool(CommandLine cmd) {
        int numberOfThreads = getNumberOfThreads(cmd);
        LOG.info("Using " + numberOfThreads + " threads.");
        return new ThreadPoolExecutor(numberOfThreads, numberOfThreads, 10, TimeUnit.MINUTES,
                new ArrayBlockingQueue<Runnable>(1), new RejectedExecutionHandler() {
                    @Override
                    public void rejectedExecution(Runnable r, ThreadPoolExecutor executor) {
                        if (executor.isShutdown()) {
                            // A task queued after shutdown might never run.
                            throw new java.util.concurrent.RejectedExecutionException(
                                    "Executor has been shut down");
                        }
                        try {
                            // Block until a worker frees the queue slot. Unlike
                            // a bare wait()/notify() pair this cannot miss a
                            // wake-up that happened before we started waiting.
                            executor.getQueue().put(r);
                        } catch (InterruptedException e) {
                            Thread.currentThread().interrupt();
                            throw new java.util.concurrent.RejectedExecutionException(
                                    "Interrupted while waiting to submit task", e);
                        }
                    }
                });
    }

}