opendap.metacat.NCMLWriter.java Source code

Introduction

Here is the source code for opendap.metacat.NCMLWriter.java
Source

/////////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2010 OPeNDAP, Inc.
// Author: James Gallagher  <jgallagher@opendap.org>
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
/////////////////////////////////////////////////////////////////////////////

package opendap.metacat;

import java.io.FileWriter;
import java.io.PrintStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import opendap.metacat.Equivalence.SortedValues;
import opendap.metacat.URLGroup.Equivalences;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command line tool that reads a saved set of URLGroups, collects some facts
 * about each group (is it multifile, does it have a date equivalence class,
 * where do the server prefix and the dataset root start in its URLs) and then
 * builds an NCML document for every group classified as a time series.
 * 
 * The documents are optionally printed (--very-verbose) and/or written to
 * files in the current directory (--output).
 */
public class NCMLWriter {
    private static final Logger log = LoggerFactory.getLogger(NCMLWriter.class);

    /**
     * Literal URL path components that are treated as part of the server
     * prefix (servlet context and the like) and not as part of the data
     * path. Used by findServerRootPosition().
     */
    private static final HashSet<String> likelyServerNames = new HashSet<String>();

    static {
        // Populated here, and not in main(), so that the helpers that read
        // the set do not depend on main() having run first.
        likelyServerNames.add("opendap");
        likelyServerNames.add("hyrax");
        likelyServerNames.add("dap");
        likelyServerNames.add("data");
    }

    /** Facts gathered about each group; created and filled in by main(). */
    private static Map<URLGroup, URLGroupFacts> factBase = null;

    /** Builder used for the time series NCML documents; created by main(). */
    private static NCMLBuilder multifileNCMLBuilder = null;

    /// These are used to format dates so they are human- and xslt-usable
    final static SimpleDateFormat iso_8601_sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");
    final static SimpleDateFormat infoLogSdf = new SimpleDateFormat("HH:mm:ss");

    /**
     * Program entry point.
     * 
     * Options: -v/--verbose, -V/--very-verbose, -h/--help,
     * -n/--groups-name &lt;name prefix&gt; (required) and -o/--output.
     * 
     * Any exception raised while processing is reported on stderr together
     * with its stack trace and ends the run.
     * 
     * @param args The command line arguments
     */
    public static void main(String[] args) {
        CommandLineParser parser = new PosixParser();
        Options options = buildOptions();

        try {
            CommandLine line = parser.parse(options, args);

            if (line.hasOption("help")) {
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp("NCMLWriter [options] --groups-name <name prefix>", options);
                return;
            }

            boolean verbose = line.hasOption("verbose");
            boolean veryVerbose = line.hasOption("very-verbose");
            PrintStream ps = System.out;

            // Extract options
            String groupsName = line.getOptionValue("groups-name");
            if (groupsName == null || groupsName.isEmpty())
                throw new Exception("The calssifier must have a URLGroups file name.");
            if (verbose)
                ps.println("Groups file: " + groupsName);

            boolean output = line.hasOption("output");

            // Build objects
            DDXRetriever ddxRetriever = new DDXRetriever(true, groupsName);

            multifileNCMLBuilder = new NCMLBuilder(groupsName, "many_ddx2ncml-1.0.xsl");

            if (verbose)
                ps.println(stamp("Reading groups"));

            URLGroups groups = new URLGroups();
            groups.restoreState(groupsName);

            factBase = new HashMap<URLGroup, URLGroupFacts>();

            if (verbose)
                ps.println(stamp("Begining analysis of groups"));

            // Initialize the fact base using the first URL of every group.
            int count = 0;
            for (URLGroup group : groups) {
                String ddxURL = group.getURLs().get(0).getTheURL();
                factBase.put(group, new URLGroupFacts(ddxURL, ddxRetriever.getDDXDoc(ddxURL)));
                count++;
            }

            if (verbose)
                ps.println(stamp(count + " groups."));

            // Find multifile groups: those with more than one URL
            count = 0;
            for (URLGroup group : groups) {
                if (group.getURLs().size() > 1) {
                    factBase.get(group).setIsMultiFile(true);
                    count++;
                }
            }

            if (verbose)
                ps.println(stamp(count + " multifile groups."));

            // Find multifile groups with a date equivalence; those are the
            // time series. Record the first and last date of each.
            count = 0;
            for (URLGroup group : groups) {
                URLGroupFacts facts = factBase.get(group);
                if (facts.getIsMultiFile() && group.getDateEquivalence() != null) {
                    facts.setIsTimeSeries(true);

                    SortedValues sortedDates = group.getDateEquivalence().getSortedValues();

                    DateString first = sortedDates.get(0);
                    DateString last = sortedDates.get(sortedDates.size() - 1);

                    facts.setFirstDate(iso_8601_sdf.format(first.getDate()));
                    facts.setLastDate(iso_8601_sdf.format(last.getDate()));

                    count++;
                }
            }

            if (verbose)
                ps.println(stamp(count + " multifile groups with date equivalence classes."));

            // For every group, find where the server prefix ends and where
            // the dataset root ends in the group's first DDX URL.
            for (URLGroup group : groups) {
                URLGroupFacts facts = factBase.get(group);
                String DDXURL = facts.getFirstDDXURL();

                int serverNameEndPosition = findServerNameEnd(DDXURL);
                log.debug("serverNameEndPosition: {}", serverNameEndPosition);

                int serverRootPosition = findServerRootPosition(group, serverNameEndPosition);
                log.debug("serverRootPosition: {}", serverRootPosition);

                facts.setServerRootPosition(serverRootPosition);

                int dataRootPosition = findDataRootPosition(group, serverNameEndPosition);
                log.debug("dataRootPosition: {}", dataRootPosition);

                int len = DDXURL.length();
                log.debug("DDX (length: {}): {}", len, DDXURL);

                // The clamp guards against the case where a single file
                // dataset's dataRootPosition is past the end of the URL.
                int dataRootEnd = Math.min(dataRootPosition, len);
                String dataRoot = DDXURL.substring(serverRootPosition, dataRootEnd);
                log.debug("DDX: {}; dataset scan: {}", DDXURL, dataRoot);

                facts.setDatasetRoot(dataRoot);
            }

            // Check for things that are unique, like dataRoot - for each group
            // is its value of dataRoot Unique?
            markUniqueDatasetRoots(groups);

            // now build some NCML
            if (verbose)
                ps.println(stamp("Building NCML"));

            int output_counter = 0;
            for (URLGroup group : groups) {
                URLGroupFacts facts = factBase.get(group);
                String ncmlDoc = null;

                if (facts.getIsTimeSeries()) {
                    if (verbose)
                        ps.println(stamp("Start building NCML"));

                    ncmlDoc = buildExplicitNCMLForTimeSeries(group);

                    if (verbose)
                        ps.println(stamp("End building NCML"));

                    if (veryVerbose)
                        ps.println(stamp("NCML: " + ncmlDoc));
                }

                if (output && ncmlDoc != null && !ncmlDoc.isEmpty()) {
                    // The counter counts every document written, not only
                    // those that need it in their name.
                    output_counter++;
                    String fileName = outputFileName(facts.getDatasetRoot(), facts.getIsDatasetRootUnique(),
                            groupsName, output_counter);
                    writeFile(fileName, ncmlDoc);
                }
            }
        } catch (Exception e) {
            System.err.println("Error: " + e.getLocalizedMessage());
            e.printStackTrace();
        }
    }

    /**
     * Build the set of command line options understood by main().
     * 
     * @return The options
     */
    private static Options buildOptions() {
        Options options = new Options();

        options.addOption("v", "verbose", false, "Write info to stdout");
        options.addOption("V", "very-verbose", false, "Write NCML to stdout");
        options.addOption("h", "help", false, "Usage information");

        options.addOption("n", "groups-name", true, "URLGroups name prefix");
        options.addOption("o", "output", false, "Write NCML files using the groupName name and a counter.");

        return options;
    }

    /**
     * Prefix a progress message with the current time of day in parentheses.
     * 
     * @param message The text to show after the time
     * @return "(HH:mm:ss) " followed by the message
     */
    private static String stamp(String message) {
        return "(" + infoLogSdf.format(new Date()) + ") " + message;
    }

    /**
     * For every group, record in the fact base whether its dataset root is
     * used by that group only. Counts the roots in one pass and then looks
     * each one up, which avoids searching a list once per group.
     * 
     * @param groups The groups; each must already have its dataset root set
     * in the fact base.
     */
    private static void markUniqueDatasetRoots(URLGroups groups) {
        Map<String, Integer> rootCounts = new HashMap<String, Integer>();
        for (URLGroup group : groups) {
            String root = factBase.get(group).getDatasetRoot();
            Integer seen = rootCounts.get(root);
            rootCounts.put(root, seen == null ? 1 : seen + 1);
        }

        for (URLGroup group : groups) {
            URLGroupFacts facts = factBase.get(group);
            facts.setIsDatasetRootUnique(rootCounts.get(facts.getDatasetRoot()) == 1);
        }
    }

    /**
     * Make the name of the file used to hold one group's NCML.
     * 
     * The name is the dataset root with every '/' replaced by '_' and one
     * trailing '_' removed. When the root is not unique among the groups the
     * counter is appended so the names do not collide. A root that is empty
     * after this cleanup (for example because the very first path component
     * already varies within the group) would give a nameless file, so the
     * fallback name is used as the base in that case.
     * 
     * @param datasetRoot The group's dataset root
     * @param rootIsUnique True if no other group has the same dataset root
     * @param fallbackName Base name to use when the cleaned up root is empty
     * @param counter Number of the document being written
     * @return The file name, ending in ".ncml"
     */
    private static String outputFileName(String datasetRoot, boolean rootIsUnique, String fallbackName,
            int counter) {
        String base = datasetRoot.replace('/', '_');
        // endsWith() is safe for the empty string; charAt(length - 1) is not.
        if (base.endsWith("_"))
            base = base.substring(0, base.length() - 1);
        if (base.isEmpty())
            base = fallbackName;

        if (rootIsUnique)
            return base + ".ncml";
        else
            return base + "_" + counter + ".ncml";
    }

    /**
     * Write text to a file, closing the file even when the write fails.
     * 
     * @param fileName The file to create or overwrite
     * @param text What to write
     * @throws java.io.IOException if the file cannot be opened, written or
     * closed.
     */
    private static void writeFile(String fileName, String text) throws java.io.IOException {
        FileWriter fw = new FileWriter(fileName);
        try {
            fw.write(text);
        } finally {
            fw.close();
        }
    }

    /**
     * Build the NCML for a group that has been classified as a time series.
     * The builder is passed two parameters: "date_range" (the group's first
     * and last dates separated by a space) and "url_date_file" (see
     * buildURLDateFileTuples()).
     * 
     * @param group The group; its facts must already be in the fact base
     * @return The NCML document returned by the builder
     */
    private static String buildExplicitNCMLForTimeSeries(URLGroup group) throws Exception {
        URLGroupFacts facts = factBase.get(group);

        String urlDateFileTuples = buildURLDateFileTuples(group.getDateEquivalence(),
                facts.getServerRootPosition());

        String[] params = new String[4];
        params[0] = "date_range";
        params[1] = facts.getFirstDate() + " " + facts.getLastDate();
        params[2] = "url_date_file";
        params[3] = urlDateFileTuples;

        // build the NCML
        return multifileNCMLBuilder.getNCML(facts.getFirstDDXURL(), facts.getFirstDDXDoc(), params);
    }

    /**
     * Build a string of "url*date*file " tuples, one for each date of the
     * equivalence class, in sorted date order. Every tuple, including the
     * last, is followed by a single space.
     * 
     * In each tuple the url is the URL for that date with everything from
     * its last '.' on removed, the date is in the iso_8601_sdf format and
     * the file is the part of that trimmed URL that starts at
     * dataRootPosition.
     * 
     * @param dateEquiv The date equivalence class of a group
     * @param dataRootPosition Index into the URL where the file part starts.
     * The only caller in this class passes the group's server root position.
     * @return The tuples
     */
    private static String buildURLDateFileTuples(Equivalence dateEquiv, int dataRootPosition) {
        SortedValues sortedDates = dateEquiv.getSortedValues();
        StringBuilder urlDateFileTuples = new StringBuilder();
        for (DateString d : sortedDates) {
            String date = iso_8601_sdf.format(d.getDate());
            String url = dateEquiv.getParsedURL(d.getDateString()).getTheURL();

            // Trim the trailing extension. A URL without any '.' is left
            // alone instead of failing in substring().
            int lastDot = url.lastIndexOf('.');
            if (lastDot >= 0)
                url = url.substring(0, lastDot);

            String file = url.substring(dataRootPosition);

            urlDateFileTuples.append(url).append("*").append(date).append("*").append(file).append(" ");
        }
        return urlDateFileTuples.toString();
    }

    /**
     * Guess at the place in the URL where the server's prefix ends and the
     * data path starts. Pure hackery... use the position returned by this
     * function with substring to cut away the unwanted part of the URL.
     * 
     * @note Here's how it works:
     * http://machine/tomcat/servlet/data/nc/fnoc1.nc
     *               ^              ^
     *               |              returned position
     *               serverNameEndPosition
     * Starting just past the '/' that follows the machine name, the leading
     * equivalence classes of the group are skipped for as long as they are
     * literals whose pattern is in likelyServerNames. Each one skipped is
     * assumed to take up its pattern's length plus one for the trailing '/'.
     * 
     * @note Uses a canned set of values to determine where the tomcat 
     * context and stuff end and the data root starts. 
     * 
     * @param group The group whose equivalence classes are examined
     * @param serverNameEndPosition Index of the '/' that follows the machine
     * name, as returned by findServerNameEnd()
     * @return Index into the URL of the first character after the server
     * prefix.
     */
    private static int findServerRootPosition(URLGroup group, int serverNameEndPosition) {
        int count = serverNameEndPosition + 1; // Assume every URL has '/' following the machine name

        Equivalences equivs = group.getEquivalences();
        for (Equivalence e : equivs) {
            if (log.isDebugEnabled())
                log.debug("In findDataRoot; e.getPattern(): " + e.getPattern() + "; e.isLitteral(): "
                        + e.isLitteral() + "; likelyServerNames.contains(e.getPattern()): "
                        + likelyServerNames.contains(e.getPattern()));

            // Stop at the first component that is not a known server name
            if (!(e.isLitteral() && likelyServerNames.contains(e.getPattern())))
                return count;

            count += e.getPattern().length() + 1; // +1 includes the tailing '/' for this component in the URL
        }

        return count;
    }

    /**
     * Guess at the place in the URL where the part that is common to all of
     * the group's URLs ends. Pure hackery... use the position returned by
     * this function with substring to cut away the unwanted part of the URL.
     * 
     * @note Uses the idea that the once the groups' URLs' components start
     * to change those components are part of the dataset. Thus if a path is
     * formed of those components that don't change that is the path leading
     * up to the dataset and can serve as the dataset scan path. This is
     * useful both to make the ncml datasetScan element but also to make a
     * readable filename and title for the NCML itself since those directory
     * names often have meaning.
     * 
     * @note If every equivalence class has just one value the position
     * returned can be past the end of the URL; callers must allow for that.
     * 
     * @param group The group whose equivalence classes are examined
     * @param serverNameEndPosition Index of the '/' that follows the machine
     * name, as returned by findServerNameEnd()
     * @return Index into the URL just past the leading run of equivalence
     * classes that have exactly one value, counting each as its pattern's
     * length plus one for the trailing '/'.
     */
    private static int findDataRootPosition(URLGroup group, int serverNameEndPosition) throws Exception {
        int count = serverNameEndPosition + 1; // Assume every URL has '/' following the machine name

        Equivalences equivs = group.getEquivalences();
        for (Equivalence e : equivs) {
            // The first component that varies marks the end of the root
            if (e.getNumberOfValues() != 1)
                return count;

            count += e.getPattern().length() + 1;
        }

        return count;
    }

    /**
     * Where in the string that holds the URL does the machine name end? 
     * 
     * @param url A URL of the form scheme://machine/path
     * @return Index of the first '/' found at or after the position two
     * characters past the first "//" in the URL, or -1 if there is no such
     * '/'.
     */
    private static int findServerNameEnd(String url) {
        int machineNameStart = url.indexOf("//") + 2;
        return url.indexOf('/', machineNameStart);
    }

    /**
     * Look at the list of equivalence classes and find the place where the
     * group's URLs stop being the same. Use this as the root of the group
     * within the server's file system.
     * 
     * @note Assume that the equivalences correspond to slash-separated 
     * pathname components. If the equivalences are the same all the way to
     * the file level, then this must not be a multifile dataset. So we test
     * for that right away and throw an exception.  
     * 
     * @note Not used now, but this could be combined with the scan element
     * to make much smaller NCML files.
     * 
     * @note NOTE(review): the text this replaces said the method finds the
     * last equivalence class that has only one value, but the code has
     * always returned the position of the first class that has more than
     * one value (or of the last class when none does). That behavior is
     * kept; confirm which was intended before using this method.
     * 
     * @param group A group with more than one URL
     * @return The pattern position of the first equivalence class that does
     * not have exactly one value, or of the last equivalence class if they
     * all have exactly one value.
     * @throws Exception if the group has only one URL, if it has no
     * equivalence classes or if its equivalence classes do not all have the
     * same total number of members.
     */
    @SuppressWarnings("unused")
    private int findCommonDirectory(URLGroup group) throws Exception {
        if (group.getURLs().size() == 1)
            throw new Exception("A URLGroup with only one instance was passed to findCommonDirectory");

        Equivalence last = null;
        int expectedMembers = 0;
        Equivalences equivs = group.getEquivalences();
        for (Equivalence e : equivs) {
            last = e;
            if (expectedMembers != 0 && e.getTotalMembers() != expectedMembers)
                throw new Exception(
                        "findCommonDirectory expected that all equivalence classes within a group would have the same number of member elements");
            expectedMembers = e.getTotalMembers();

            if (e.getNumberOfValues() != 1)
                return e.getPatternPosition();
        }

        // Without this check an empty list of equivalence classes would
        // fail with a NullPointerException on the return below.
        if (last == null)
            throw new Exception("A URLGroup with no equivalence classes was passed to findCommonDirectory");

        return last.getPatternPosition();
    }
}