org.emonocot.job.dwc.read.ArchiveFactory.java Source code

Java tutorial

Introduction

Here is the source code for org.emonocot.job.dwc.read.ArchiveFactory.java

Source

/*
 * This is eMonocot, a global online biodiversity information resource.
 *
 * Copyright (c) 2011-2015 The Board of Trustees of the Royal Botanic Gardens, Kew and The University of Oxford
 *
 * eMonocot is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * eMonocot is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * The complete text of the GNU Affero General Public License is in the source repository as the file
 * COPYING.  It is also available from <http://www.gnu.org/licenses/>.
 */
package org.emonocot.job.dwc.read;

import org.emonocot.api.job.TermFactory;
import org.gbif.dwc.terms.Term;
import org.gbif.dwc.terms.DcTerm;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.text.Archive;
import org.gbif.dwc.text.ArchiveField;
import org.gbif.dwc.text.ArchiveField.DataType;
import org.gbif.dwc.text.ArchiveFile;
import org.gbif.dwc.text.UnsupportedArchiveException;
import org.gbif.file.CSVReader;
import org.gbif.file.CSVReaderFactory;
import org.gbif.file.DownloadUtil;
import org.gbif.metadata.handler.BasicMetadataSaxHandler;
import org.gbif.utils.file.BomSafeInputStreamWrapper;
import org.gbif.utils.file.CompressionUtil;

import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

public class ArchiveFactory {

    /**
     * SAX handler to parse a meta.xml descriptor for dwc archives. It populates a given archive instance and ignores
     * namespaces. The parser needs to be namespace aware!
     */
    static class MetaHandler extends BasicMetadataSaxHandler {

        /** Namespace URI tried as a fallback when an attribute is not found without a namespace. */
        private static final String NS_DWCA = "http://rs.tdwg.org/dwc/text/";

        /** Archive populated by this handler; assigned once in the constructor and never replaced. */
        private final Archive archive;

        /** Archive file built for the most recently opened core/extension element; null before the first one. */
        private ArchiveFile af;

        /**
         * Creates a handler that fills the given archive while a descriptor is parsed.
         *
         * @param archive the archive instance to populate
         */
        protected MetaHandler(Archive archive) {
            this.archive = archive;
        }

        /**
         * Interprets a character definition taken from the descriptor.
         *
         * @param x the raw attribute value, may be null
         * @return null for a null or empty value, the character itself for a one-character value, or the tab,
         *         newline or carriage return character for the two-character escapes \t, \n and \r (matched
         *         without regard to letter case)
         * @throws UnsupportedArchiveException if the value is longer than one character and is not one of the
         *         recognised escapes
         */
        private static Character getFirstChar(String x) throws UnsupportedArchiveException {
            final int length = (x == null) ? 0 : x.length();
            if (length == 0) {
                return null;
            }
            if (length == 1) {
                return Character.valueOf(x.charAt(0));
            }

            // recognised two-character escape sequences
            if ("\\t".equalsIgnoreCase(x)) {
                return Character.valueOf('\t');
            }
            if ("\\n".equalsIgnoreCase(x)) {
                return Character.valueOf('\n');
            }
            if ("\\r".equalsIgnoreCase(x)) {
                return Character.valueOf('\r');
            }

            // everything that reaches this point has more than one character and is not a known escape
            throw new UnsupportedArchiveException(
                    "Only darwin core archives with a single quotation character are supported, but found >>>"
                            + x + "<<<");
        }

        /**
         * Replaces every location of the given archive file with its absolute path below the given root folder.
         *
         * @param af   the archive file whose location list is rewritten in place
         * @param root the folder each location is resolved against
         */
        private static void makeLocationPathsAbsolute(ArchiveFile af, File root) {
            // collect the resolved paths first so the live location list is not modified while it is iterated
            List<String> absoluteLocations = new ArrayList<String>();
            for (String loc : af.getLocations()) {
                // resolve the entry being iterated; calling af.getLocation() here would repeat one value for all
                absoluteLocations.add(new File(root, loc).getAbsolutePath());
            }
            af.getLocations().clear();
            af.getLocations().addAll(absoluteLocations);
        }

        /**
         * Expands the two-character escape sequences for tab, newline, carriage return and form feed
         * (a backslash followed by t, n, r or f) into the control characters they stand for.
         *
         * @param x the raw text, may be null
         * @return the text with the escapes expanded, or null when the input is null or empty
         */
        private static String unescapeBackslash(String x) {
            if (x == null || x.isEmpty()) {
                return null;
            }
            // literal replacements, applied in the order tab, newline, carriage return, form feed
            String expanded = x.replace("\\t", "\t");
            expanded = expanded.replace("\\n", "\n");
            expanded = expanded.replace("\\r", "\r");
            expanded = expanded.replace("\\f", "\f");
            return expanded;
        }

        /**
         * Creates an ArchiveFile from the attributes of a core or extension element. The attributes encoding,
         * fieldsTerminatedBy, fieldsEnclosedBy, linesTerminatedBy and rowType are applied only when present;
         * ignoreHeaderLines is applied only when it parses as an integer.
         *
         * @param attr the attributes of the element being opened
         * @return the newly created archive file
         * @throws UnsupportedArchiveException if fieldsEnclosedBy is not a single character or known escape
         */
        private ArchiveFile buildArchiveFile(Attributes attr) throws UnsupportedArchiveException {
            final ArchiveFile file = new ArchiveFile();

            final String encoding = getAttr(attr, "encoding");
            if (encoding != null) {
                file.setEncoding(encoding);
            }

            final String fieldTerminator = getAttr(attr, "fieldsTerminatedBy");
            if (fieldTerminator != null) {
                file.setFieldsTerminatedBy(unescapeBackslash(fieldTerminator));
            }

            final String fieldQuote = getAttr(attr, "fieldsEnclosedBy");
            if (fieldQuote != null) {
                file.setFieldsEnclosedBy(getFirstChar(fieldQuote));
            }

            final String lineTerminator = getAttr(attr, "linesTerminatedBy");
            if (lineTerminator != null) {
                file.setLinesTerminatedBy(unescapeBackslash(lineTerminator));
            }

            final String rowType = getAttr(attr, "rowType");
            if (rowType != null) {
                file.setRowType(rowType);
            }

            try {
                file.setIgnoreHeaderLines(Integer.parseInt(getAttr(attr, "ignoreHeaderLines")));
            } catch (NumberFormatException ignored) {
                // attribute missing or not an integer: nothing is set on the archive file
            }
            return file;
        }

        /**
         * Creates an ArchiveField from the term, default, type and index attributes of an element.
         * A type that cannot be resolved falls back to {@code DataType.string}; a missing index stays null.
         *
         * @param attributes the attributes of the field, id or coreid element
         * @return the field described by the attributes
         * @throws UnsupportedArchiveException if the index attribute is present but not an integer
         */
        private ArchiveField buildField(Attributes attributes) {
            final Term term = TermFactory.findTerm(getAttr(attributes, "term"));
            final String defaultValue = getAttr(attributes, "default");

            final DataType declaredType = DataType.findByXmlSchemaType(getAttr(attributes, "type"));
            final DataType type = (declaredType != null) ? declaredType : DataType.string;

            final String rawIndex = getAttr(attributes, "index");
            Integer index = null;
            if (rawIndex != null) {
                try {
                    index = Integer.valueOf(rawIndex);
                } catch (NumberFormatException e) {
                    // a malformed index makes the descriptor unusable, so report it to the caller
                    throw new UnsupportedArchiveException(e);
                }
            }
            return new ArchiveField(index, term, defaultValue, type);
        }

        /**
         * Handles the end of an element: a finished core file is set on the archive, a finished extension file
         * is added to it, and the text of a location element is recorded as a location of the current file.
         * Locations are stored as given in the descriptor; they are not made absolute here.
         *
         * @throws SAXException if a location element ends before any core or extension element was opened
         */
        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            // calling the super method to stringify the character buffer
            super.endElement(uri, localName, qName);

            if (localName.equalsIgnoreCase("core")) {
                archive.setCore(af);
            } else if (localName.equalsIgnoreCase("extension")) {
                archive.addExtension(af);
            } else if (localName.equalsIgnoreCase("location")) {
                if (af == null) {
                    // fail with a clear message instead of a NullPointerException on a malformed descriptor
                    throw new SAXException("location element found outside of an archive file");
                }
                af.addLocation(content);
            }
        }

        /**
         * Reads an attribute value, looking first for the attribute without a namespace and, if that yields
         * nothing, for the same attribute in the NS_DWCA namespace.
         *
         * @param attributes the attributes to search, may be null
         * @param key        the local name of the attribute
         * @return the attribute value, or null when the attributes are null or the attribute is absent
         */
        private String getAttr(Attributes attributes, String key) {
            if (attributes == null) {
                return null;
            }
            final String unqualified = attributes.getValue("", key);
            return (unqualified != null) ? unqualified : attributes.getValue(NS_DWCA, key);
        }

        /**
         * Handles the start of an element. Element names are matched without regard to letter case:
         * archive/stararchive set the metadata location, core/extension start a new archive file, id/coreid
         * define the id field of the current file and field adds a regular field to it. Id and field elements
         * met while no archive file exists are logged and skipped.
         */
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes)
                throws SAXException {
            super.startElement(uri, localName, qName, attributes);
            if (localName.equalsIgnoreCase("archive") || localName.equalsIgnoreCase("stararchive")) {
                // metadata location
                archive.setMetadataLocation(getAttr(attributes, "metadata"));
            } else if (localName.equalsIgnoreCase("core") || localName.equalsIgnoreCase("extension")) {
                // archive/core or archive/extension: start a fresh file description from the attributes
                af = buildArchiveFile(attributes);
            } else if (localName.equalsIgnoreCase("coreid") || localName.equalsIgnoreCase("id")) {
                ArchiveField field = buildField(attributes);
                if (af != null) {
                    af.setId(field);
                } else {
                    log.warn(localName + " field found outside of an archive file");
                }
            } else if (localName.equalsIgnoreCase("field")) {
                ArchiveField field = buildField(attributes);
                if (af != null) {
                    af.addField(field);
                } else {
                    log.warn("field found outside of an archive file");
                }
            }
        }

    }

    /** Term factory used to resolve term names for this class. */
    private static final TermFactory TERM_FACTORY = new TermFactory();

    private static final Logger LOG = LoggerFactory.getLogger(ArchiveFactory.class);

    /** Factory for the SAX parsers that read descriptors; configured once in the static initializer below. */
    private static final SAXParserFactory SAX_FACTORY = SAXParserFactory.newInstance();

    static {
        SAX_FACTORY.setNamespaceAware(true);
        SAX_FACTORY.setValidating(false);

        // Descriptors may come from downloaded archives, so keep the parser from fetching external entities
        // or DTDs referenced by the document (XXE). Parsers that do not know a feature keep their default.
        final String[] externalAccessFeatures = {
            "http://xml.org/sax/features/external-general-entities",
            "http://xml.org/sax/features/external-parameter-entities",
            "http://apache.org/xml/features/nonvalidating/load-external-dtd"
        };
        for (String feature : externalAccessFeatures) {
            try {
                SAX_FACTORY.setFeature(feature, false);
            } catch (Exception e) {
                // best effort: an unsupported feature must not prevent the class from loading
                LOG.warn("Unable to disable XML parser feature " + feature, e);
            }
        }
    }

    /**
     * Downloads an archive from a URL into the working directory and opens it.
     * The download is stored as "dwca-download" and the archive folder used is "dwca", both below
     * the working directory.
     *
     * @param archiveUrl the location of a compressed archive or single data file
     * @param workingDir writable directory to download to and decompress archive
     * @return the opened archive
     * @throws IOException if downloading or reading fails
     * @throws UnsupportedArchiveException if the downloaded content cannot be opened as an archive
     */
    public static Archive openArchive(URL archiveUrl, File workingDir)
            throws IOException, UnsupportedArchiveException {
        final File downloadedFile = new File(workingDir, "dwca-download");
        DownloadUtil.download(archiveUrl, downloadedFile);

        final File archiveFolder = new File(workingDir, "dwca");
        return openArchive(downloadedFile, archiveFolder);
    }

    /**
     * Opens an archive from a local file and decompresses or copies it into the given archive directory.
     * Make sure the archive directory does not contain files already!
     * <p>
     * When the file is not in a supported compression format it is opened directly instead.
     *
     * @param archiveFile the location of a compressed archive or single data file
     * @param archiveDir  empty, writable directory used to keep decompress archive in
     * @return the opened archive
     * @throws IOException if decompressing or reading fails
     * @throws UnsupportedArchiveException if the content cannot be opened as an archive
     */
    public static Archive openArchive(File archiveFile, File archiveDir)
            throws IOException, UnsupportedArchiveException {
        try {
            // the returned list of extracted files is not needed: the archive is read from the folder itself
            CompressionUtil.decompressFile(archiveDir, archiveFile);
            return openArchive(archiveDir);
        } catch (CompressionUtil.UnsupportedCompressionType e) {
            // If its a text file only we will get this exception - but also for corrupt compressions
            // try to open as text file only
            return openArchive(archiveFile);
        }
    }

    /**
     * Opens an archive from an expanded archive folder, a descriptor file or a single data file.
     * <p>
     * A folder is read through the meta.xml file it contains; a single file whose name ends in .xml is itself
     * read as the descriptor. When no descriptor exists, the core file is derived from the header row of the
     * single data file, or of the only file with a csv or txt suffix found in the folder. An eml.xml file next
     * to the data is used as metadata location when none was declared.
     *
     * @param unzippedFolderLocation the location of an expanded archive directory or just a single dwc text file
     * @return the validated archive
     * @throws IOException if a file cannot be read or the location is neither a file nor a listable directory
     * @throws UnsupportedArchiveException if the content cannot be interpreted as an archive or fails validation
     */
    public static Archive openArchive(File unzippedFolderLocation) throws IOException, UnsupportedArchiveException {
        Archive archive = new Archive();
        archive.setLocation(unzippedFolderLocation);

        File mf = null;
        // see if we can find a meta.xml descriptor file
        if (unzippedFolderLocation.isFile()) {
            String fileName = unzippedFolderLocation.getName();
            // case-insensitive suffix test that is also safe for names without a dot or shorter than the suffix
            if (fileName.regionMatches(true, fileName.length() - 4, ".xml", 0, 4)) {
                // could be a metafile on its own pointing to remote data files...
                mf = unzippedFolderLocation;
            }
        } else {
            mf = new File(unzippedFolderLocation, "meta.xml");
        }

        if (mf != null && mf.exists()) {
            // read metafile, making sure the stream is released whether or not parsing succeeds
            InputStream metaStream = new FileInputStream(mf);
            try {
                readMetaDescriptor(archive, metaStream, true);
            } finally {
                metaStream.close();
            }
            if (archive.getMetadataLocation() == null) {
                // search for known metadata filenames
                File emlFile = new File(mf.getParentFile(), "eml.xml");
                if (emlFile.exists()) {
                    archive.setMetadataLocation("eml.xml");
                }
            }
        } else if (unzippedFolderLocation.isFile()) {
            // no descriptor: a single data file describes itself through its header row
            archive.setCore(readFileHeaders(unzippedFolderLocation));
        } else {
            // no descriptor: a folder is supported when it holds exactly one data file
            archive.setCore(readSingleDataFileInFolder(archive, unzippedFolderLocation));
        }

        // final validation
        validateArchive(archive);
        // report basic stats
        LOG.debug("Archive contains {} described extension files", archive.getExtensions().size());
        LOG.debug("Archive contains {} core properties", archive.getCore().getFields().size());
        return archive;
    }

    /**
     * Derives the core file from the only file with a csv or txt suffix in the given folder and registers an
     * eml.xml file found in that folder as metadata location if the archive has none yet.
     */
    private static ArchiveFile readSingleDataFileInFolder(Archive archive, File folder)
            throws IOException, UnsupportedArchiveException {
        List<File> dataFiles = new ArrayList<File>();
        for (String suffix : new String[] {".csv", ".txt"}) {
            FilenameFilter ff = new SuffixFileFilter(suffix, IOCase.INSENSITIVE);
            File[] matches = folder.listFiles(ff);
            if (matches == null) {
                // listFiles yields null when the path is not a directory or cannot be read
                throw new IOException("Cannot list files in " + folder.getAbsolutePath()
                        + ": it does not exist or is not a readable directory");
            }
            dataFiles.addAll(Arrays.asList(matches));
        }

        if (dataFiles.size() != 1) {
            throw new UnsupportedArchiveException(
                    "The archive given is a folder with more or less than 1 data files having a txt or csv suffix");
        }

        File dataFile = dataFiles.get(0);
        if (archive.getMetadataLocation() == null) {
            // search for known metadata filenames
            File emlFile = new File(folder, "eml.xml");
            if (emlFile.exists()) {
                archive.setMetadataLocation("eml.xml");
            }
        }
        ArchiveFile coreFile = readFileHeaders(dataFile);
        // the location is kept as the bare file name, replacing whatever readFileHeaders registered
        coreFile.getLocations().clear();
        coreFile.addLocation(dataFile.getName());
        return coreFile;
    }

    /**
     * Looks up a term by its qualified name through the internal term factory.
     *
     * @param termName the qualified term name
     *
     * @return the term the factory resolves the name to (per the earlier documentation either one of the
     *         existing term enums or an unknown-term placeholder; confirm against TermFactory)
     */
    public static Term findTerm(String termName) {
        final Term resolved = TERM_FACTORY.findTerm(termName);
        return resolved;
    }

    /**
     * Builds an ArchiveFile for a data file without descriptor by inspecting it with a CSVReader: encoding,
     * delimiter and quote character are copied from the reader and every header column that resolves to a term
     * becomes a field at the column's position. The first occurrenceID, taxonID or identifier column becomes
     * the id field instead of a regular field.
     *
     * @param dataFile the delimited text file to inspect
     * @return the archive file describing the data file, with one header line to ignore
     * @throws UnsupportedArchiveException declared for callers; not thrown directly by this method
     * @throws IOException if the data file cannot be read
     */
    private static ArchiveFile readFileHeaders(File dataFile) throws UnsupportedArchiveException, IOException {
        ArchiveFile dwcFile = new ArchiveFile();
        dwcFile.addLocation(null);
        dwcFile.setIgnoreHeaderLines(1);

        CSVReader reader = CSVReaderFactory.build(dataFile);
        try {
            // copy found delimiters & encoding
            dwcFile.setEncoding(reader.encoding);
            dwcFile.setFieldsTerminatedBy(reader.delimiter);
            dwcFile.setFieldsEnclosedBy(reader.quoteChar);

            // detect dwc terms as good as we can based on header row; a missing header row yields no fields
            String[] headers = reader.header;
            if (headers != null) {
                for (int index = 0; index < headers.length; index++) {
                    String head = headers[index];
                    // missing and single-character column names are skipped
                    if (head == null || head.length() <= 1) {
                        continue;
                    }
                    Term dt = TERM_FACTORY.findTerm(head);
                    if (dt == null) {
                        continue;
                    }
                    ArchiveField field = new ArchiveField(index, dt, null, DataType.string);
                    boolean isIdTerm = dt.equals(DwcTerm.occurrenceID) || dt.equals(DwcTerm.taxonID)
                            || dt.equals(DcTerm.identifier);
                    if (dwcFile.getId() == null && isIdTerm) {
                        dwcFile.setId(field);
                    } else {
                        dwcFile.addField(field);
                    }
                }
            }
        } finally {
            // only the header information is needed, so release the underlying file right away
            reader.close();
        }

        return dwcFile;
    }

    /**
     * Parses a descriptor stream with a SAX parser and a MetaHandler that populates the given archive.
     * The stream is wrapped in a BomSafeInputStreamWrapper before parsing and is not closed explicitly here.
     *
     * @param archive        the archive to populate
     * @param metaDescriptor the descriptor content
     * @param normaliseTerms not used by this implementation
     * @throws UnsupportedArchiveException wrapping any exception raised while creating the parser or parsing
     */
    private static void readMetaDescriptor(Archive archive, InputStream metaDescriptor, boolean normaliseTerms)
            throws UnsupportedArchiveException {
        final MetaHandler handler = new MetaHandler(archive);
        try {
            final SAXParser parser = SAX_FACTORY.newSAXParser();
            LOG.debug("Reading archive metadata file");
            final InputStream bomSafeStream = new BomSafeInputStreamWrapper(metaDescriptor);
            parser.parse(bomSafeStream, handler);
        } catch (Exception failure) {
            LOG.warn("Exception caught", failure);
            throw new UnsupportedArchiveException(failure);
        }
    }

    /**
     * Validates the core file of the archive and then each of its extension files.
     *
     * @param archive the archive to check
     * @throws UnsupportedArchiveException if the core file or any extension file is invalid
     */
    private static void validateArchive(Archive archive) throws UnsupportedArchiveException {
        final ArchiveFile core = archive.getCore();
        final boolean hasExtensions = !archive.getExtensions().isEmpty();
        validateCoreFile(core, hasExtensions);

        for (ArchiveFile extension : archive.getExtensions()) {
            validateExtensionFile(extension);
        }
    }

    /**
     * Validates the core file. A core without id column in an archive that has extensions is only logged as a
     * warning; all other checks are those of validateFile.
     *
     * @param f             the core file, may be null when the descriptor declared none
     * @param hasExtensions whether the archive contains extension files
     * @throws UnsupportedArchiveException if the core file is null or fails the general file checks
     */
    private static void validateCoreFile(ArchiveFile f, boolean hasExtensions) throws UnsupportedArchiveException {
        // a null core must reach validateFile, which reports it properly, instead of failing here on f.getId()
        if (hasExtensions && f != null && f.getId() == null) {
            LOG.warn("DwC-A core data file " + f.getTitle()
                    + " is lacking an id column. No extensions allowed in this case");
        }
        validateFile(f);
    }

    /**
     * Validates an extension file: it must define an id field and pass the general file checks.
     *
     * @param f the extension file to check
     * @throws UnsupportedArchiveException if the id field is missing or the general file checks fail
     */
    private static void validateExtensionFile(ArchiveFile f) throws UnsupportedArchiveException {
        final boolean hasId = f.getId() != null;
        if (!hasId) {
            final String message =
                    "DwC-A data file " + f.getTitle() + " requires an id or foreign key to the core id";
            throw new UnsupportedArchiveException(message);
        }
        validateFile(f);
    }

    /**
     * Performs the checks common to core and extension files: the file must not be null and must have a
     * location file and a character encoding. The checks run in that order and the first failure is reported.
     *
     * @param f the archive file to check, may be null
     * @throws UnsupportedArchiveException describing the first check that failed
     */
    private static void validateFile(ArchiveFile f) throws UnsupportedArchiveException {
        final String problem;
        if (f == null) {
            problem = "DwC-A data file is NULL";
        } else if (f.getLocationFile() == null) {
            problem = "DwC-A data file " + f.getTitle() + " requires a location";
        } else if (f.getEncoding() == null) {
            problem = "DwC-A data file " + f.getTitle() + " requires a character encoding";
        } else {
            // all checks passed
            return;
        }
        throw new UnsupportedArchiveException(problem);
    }

}