org.gbif.dwca.io.ArchiveFactory.java Source code

Java tutorial

Introduction

Here is the source code for org.gbif.dwca.io.ArchiveFactory.java

Source

/*
 * Copyright 2010-2015 Global Biodiversity Informatics Facility.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.gbif.dwca.io;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.io.Files;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.filefilter.HiddenFileFilter;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.io.input.BOMInputStream;
import org.gbif.dwc.terms.DcTerm;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.dwc.terms.TermFactory;
import org.gbif.dwca.io.ArchiveField.DataType;
import org.gbif.io.DownloadUtil;
import org.gbif.utils.file.CompressionUtil;
import org.gbif.utils.file.csv.CSVReader;
import org.gbif.utils.file.csv.CSVReaderFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * Factory used to build {@link Archive} object from a DarwinCore Archive file.
 * 
 * @author mdoering
 *
 */
public class ArchiveFactory {

    private static final TermFactory TERM_FACTORY = TermFactory.instance();

    private static final Logger LOG = LoggerFactory.getLogger(ArchiveFactory.class);

    /**
     * SAX handler to parse a meta.xml descriptor for dwc archives. It populates a given archive instance and ignores
     * namespaces. The parser needs to be namespace aware!
     */
    static class MetaHandler extends SimpleSaxHandler {

        private static final String NS_DWCA = "http://rs.tdwg.org/dwc/text/";
        private Archive archive;
        private ArchiveFile af;

        protected MetaHandler(Archive archive) {
            this.archive = archive;
        }

        private static Character getFirstChar(String x) throws UnsupportedArchiveException {
            if (x == null || x.length() == 0) {
                return null;
            }
            if (x.length() == 1) {
                return x.charAt(0);
            }
            if (x.equalsIgnoreCase("\\t")) {
                return '\t';
            }
            if (x.equalsIgnoreCase("\\n")) {
                return '\n';
            }
            if (x.equalsIgnoreCase("\\r")) {
                return '\r';
            }
            if (x.length() > 1) {
                throw new UnsupportedArchiveException(
                        "Only darwin core archives with a single quotation character are supported, but found >>>"
                                + x + "<<<");
            }
            return ' ';
        }

        private static String unescapeBackslash(String x) {
            if (x == null || x.length() == 0) {
                return null;
            }
            return x.replaceAll("\\\\t", String.valueOf('\t')).replaceAll("\\\\n", String.valueOf('\n'))
                    .replaceAll("\\\\r", String.valueOf('\r')).replaceAll("\\\\f", String.valueOf('\f'));
        }

        private ArchiveFile buildArchiveFile(Attributes attr) throws UnsupportedArchiveException {
            ArchiveFile dwcFile = new ArchiveFile();

            // extract the File attributes
            if (getAttr(attr, "encoding") != null) {
                dwcFile.setEncoding(getAttr(attr, "encoding"));
            }
            if (getAttr(attr, "fieldsTerminatedBy") != null) {
                dwcFile.setFieldsTerminatedBy(unescapeBackslash(getAttr(attr, "fieldsTerminatedBy")));
            }
            if (getAttr(attr, "fieldsEnclosedBy") != null) {
                dwcFile.setFieldsEnclosedBy(getFirstChar(getAttr(attr, "fieldsEnclosedBy")));
            }
            if (getAttr(attr, "linesTerminatedBy") != null) {
                dwcFile.setLinesTerminatedBy(unescapeBackslash(getAttr(attr, "linesTerminatedBy")));
            }
            if (getAttr(attr, "rowType") != null) {
                dwcFile.setRowType(TERM_FACTORY.findTerm(getAttr(attr, "rowType")));
            }
            String ignoreHeaderLines = getAttr(attr, "ignoreHeaderLines");
            try {
                dwcFile.setIgnoreHeaderLines(Integer.parseInt(ignoreHeaderLines));
            } catch (NumberFormatException ignored) { // swallow null or bad value
            }
            return dwcFile;
        }

        /**
         * Build an ArchiveField object based on xml attributes.
         */
        private ArchiveField buildField(Attributes attributes) {
            // build field
            Term term = TERM_FACTORY.findTerm(getAttr(attributes, "term"));
            String defaultValue = getAttr(attributes, "default");
            String vocabulary = getAttr(attributes, "vocabulary");
            DataType type = DataType.findByXmlSchemaType(getAttr(attributes, "type"));
            if (type == null) {
                type = DataType.string;
            }
            String indexAsString = getAttr(attributes, "index");
            Integer index = null;
            if (indexAsString != null) {
                // let bad errors be thrown up
                try {
                    index = Integer.parseInt(indexAsString);
                } catch (NumberFormatException e) {
                    throw new UnsupportedArchiveException(e);
                }
            }
            String delimiter = getAttr(attributes, "delimitedBy");
            return new ArchiveField(index, term, defaultValue, type, delimiter, vocabulary);
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            // calling the super method to stringify the character buffer
            super.endElement(uri, localName, qName);

            if (localName.equalsIgnoreCase("archive")) {
                // archive
            } else if (localName.equalsIgnoreCase("core")) {
                // update location to absolute path incl archive path
                //      makeLocationPathsAbsolute(af, archive.getLocation());
                archive.setCore(af);
            } else if (localName.equalsIgnoreCase("extension")) {
                // update location to absolute path incl archive path
                //      makeLocationPathsAbsolute(af, archive.getLocation());

                if (af.getId() != null && af.getId().getIndex() != null) {
                    archive.addExtension(af);
                } else {
                    log.warn("Skipping extension [" + af.getRowType() + "] with no index attribute");
                }
            } else if (localName.equalsIgnoreCase("location")) {
                // a file location
                af.addLocation(content);
            }

        }

        /**
         * Get attribute from a key
         * 
         * @param attributes
         * @param key
         * @return attributes value or null
         */
        private String getAttr(Attributes attributes, String key) {
            String val = null;
            if (attributes != null) {
                // try without NS
                val = attributes.getValue("", key);
                if (val == null) {
                    // try with dwca NS if nothing found
                    val = attributes.getValue(NS_DWCA, key);
                }
            }
            return Strings.isNullOrEmpty(val) ? null : val;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes)
                throws SAXException {
            super.startElement(uri, localName, qName, attributes);
            if (localName.equalsIgnoreCase("archive") || localName.equalsIgnoreCase("stararchive")) {
                // metadata location
                archive.setMetadataLocation(getAttr(attributes, "metadata"));
            } else if (localName.equalsIgnoreCase("core") || localName.equalsIgnoreCase("extension")) {
                // archive/extension
                af = new ArchiveFile();
                if (localName.equalsIgnoreCase("core") || localName.equalsIgnoreCase("extension")) {
                    // archive/core or archive/extension
                    af = buildArchiveFile(attributes);
                }
            } else if (localName.equalsIgnoreCase("coreid") || localName.equalsIgnoreCase("id")) {
                ArchiveField field = buildField(attributes);
                if (af != null) {
                    af.setId(field);
                } else {
                    log.warn(localName + " field found outside of an archive file");
                }
            } else if (localName.equalsIgnoreCase("field")) {
                ArchiveField field = buildField(attributes);
                if (af != null) {
                    af.addField(field);
                } else {
                    log.warn("field found outside of an archive file");
                }
            }
        }

    }

    private static final SAXParserFactory SAX_FACTORY = SAXParserFactory.newInstance();

    static {
        SAX_FACTORY.setNamespaceAware(true);
        SAX_FACTORY.setValidating(false);
    }

    /**
     * Opens an archive from a URL, downloading and decompressing it.
     *
     * @param archiveUrl the location of a compressed archive or single data file
     * @param workingDir writable directory to download to and decompress archive
     */
    public static Archive openArchive(URL archiveUrl, File workingDir)
            throws IOException, UnsupportedArchiveException {
        File downloadTo = new File(workingDir, "dwca-download");
        File dwca = new File(workingDir, "dwca");
        DownloadUtil.download(archiveUrl, downloadTo);
        return openArchive(downloadTo, dwca);
    }

    /**
     * Opens an archive from a local file and decompresses or copies it into the given archive directory.
     * Make sure the archive directory does not contain files already, any existing files will be removed!
     *
     * If the source archive is an uncompressed, single data file and a valid archive, it will be copied as is
     * to the archiveDir.
     *
     * @param archiveFile the location of a compressed archive or single data file
     * @param archiveDir  empty, writable directory used to keep decompress archive in
     */
    public static Archive openArchive(File archiveFile, File archiveDir)
            throws IOException, UnsupportedArchiveException {
        if (archiveDir.exists()) {
            // clean up any existing folder
            LOG.debug("Deleting existing archive folder [{}]", archiveDir.getAbsolutePath());
            org.gbif.utils.file.FileUtils.deleteDirectoryRecursively(archiveDir);
        }
        FileUtils.forceMkdir(archiveDir);
        // try to decompress archive
        try {
            CompressionUtil.decompressFile(archiveDir, archiveFile, true);
            // we keep subfolder, but often the entire archive is within one subfolder. Remove that root folder if present
            File[] rootFiles = archiveDir.listFiles((FileFilter) HiddenFileFilter.VISIBLE);
            if (rootFiles.length == 1) {
                File root = rootFiles[0];
                if (root.isDirectory()) {
                    // single root dir, flatten structure
                    LOG.debug("Removing single root folder {} found in decompressed archive",
                            root.getAbsoluteFile());
                    for (File f : FileUtils.listFiles(root, TrueFileFilter.TRUE, null)) {
                        File f2 = new File(archiveDir, f.getName());
                        f.renameTo(f2);
                    }
                }
            }
            // continue to read archive from the tmp dir
            return openArchive(archiveDir);

        } catch (CompressionUtil.UnsupportedCompressionType e) {
            LOG.debug("Could not uncompress archive [{}], try to read as single text file", archiveFile, e);
            // If its a text file only we will get this exception - but also for corrupt compressions
            // try to open as text file only and if successful copy file to archive dir
            Archive arch = openArchiveDataFile(archiveFile);
            Files.copy(archiveFile, new File(archiveDir, archiveFile.getName()));
            return arch;
        }
    }

    /**
     * Opens a dwca archive which is just a single decompressed data file with headers, e.g. a csv or tab delimited file
     */
    public static Archive openArchiveDataFile(File dataFile) throws IOException, UnsupportedArchiveException {
        Archive archive = new Archive();
        archive.setLocation(dataFile);

        ArchiveFile coreFile = readFileHeaders(dataFile);
        archive.setCore(coreFile);

        // check if we also have a metadata file next to this data file
        discoverMetadataFile(archive, dataFile.getParentFile());

        // final validation
        return validateArchive(archive);
    }

    /**
     * @param dwcaFolder the location of an expanded dwc archive directory or just a single dwc text file
     */
    public static Archive openArchive(File dwcaFolder) throws IOException, UnsupportedArchiveException {
        if (!dwcaFolder.exists()) {
            throw new FileNotFoundException("Archive folder not existing: " + dwcaFolder.getAbsolutePath());
        }
        // delegate to open data file method if its a single file, not a folder
        if (dwcaFolder.isFile()) {
            return openArchiveDataFile(dwcaFolder);
        }

        Archive archive = new Archive();
        archive.setLocation(dwcaFolder);

        // Accommodate archives coming from legacy IPTs which put a "\" before each filename
        // http://dev.gbif.org/issues/browse/POR-2396
        // https://code.google.com/p/gbif-providertoolkit/issues/detail?id=1015
        Iterator<File> iter = FileUtils.iterateFiles(dwcaFolder, new String[] { "xml", "txt" }, false);
        while (iter.hasNext()) {
            File f = iter.next();
            if (f.getName().startsWith("\\")) {
                String orig = f.getName();
                String replacement = f.getName().replaceFirst("\\\\", "");
                LOG.info("Renaming file from {} to {}", orig, replacement);
                f.renameTo(new File(dwcaFolder, replacement));
            }
        }

        // read metadata
        File mf = new File(dwcaFolder, Archive.META_FN);
        if (mf.exists()) {
            // read metafile
            readMetaDescriptor(archive, new FileInputStream(mf));

        } else {
            // meta.xml lacking.
            // Try to detect data files ourselves as best as we can.
            // look for a single, visible text data file
            List<File> dataFiles = new ArrayList<File>();
            for (String suffix : Lists.newArrayList(".csv", ".txt", ".tab", ".text", ".data")) {
                FileFilter ff = FileFilterUtils.and(FileFilterUtils.suffixFileFilter(suffix, IOCase.INSENSITIVE),
                        HiddenFileFilter.VISIBLE);
                dataFiles.addAll(Arrays.asList(dwcaFolder.listFiles(ff)));
            }

            if (dataFiles.size() == 1) {
                File dataFile = new File(dwcaFolder, dataFiles.get(0).getName());
                ArchiveFile coreFile = readFileHeaders(dataFile);
                coreFile.getLocations().clear();
                coreFile.addLocation(dataFile.getName());
                archive.setCore(coreFile);

            } else {
                throw new UnsupportedArchiveException(
                        "The archive given is a folder with more or less than 1 data files having a csv, txt or tab suffix");
            }
        }

        // check if we also have a metadata file next to this data file
        discoverMetadataFile(archive, mf.getParentFile());

        // final validation
        return validateArchive(archive);
    }

    private static void discoverMetadataFile(Archive archive, File folder) {
        if (archive.getMetadataLocation() == null) {
            // search for popular metadata filenames
            for (String metadataFN : Lists.newArrayList("eml.xml", "metadata.xml")) {
                File emlFile = new File(folder, metadataFN);
                if (emlFile.exists()) {
                    archive.setMetadataLocation(metadataFN);
                    break;
                }
            }
        }
    }

    private static ArchiveFile readFileHeaders(File dataFile) throws UnsupportedArchiveException, IOException {
        ArchiveFile dwcFile = new ArchiveFile();
        dwcFile.addLocation(null);
        dwcFile.setIgnoreHeaderLines(1);

        CSVReader reader = CSVReaderFactory.build(dataFile);

        // copy found delimiters & encoding
        dwcFile.setEncoding(reader.encoding);
        dwcFile.setFieldsTerminatedBy(reader.delimiter);
        dwcFile.setFieldsEnclosedBy(reader.quoteChar);

        // detect dwc terms as good as we can based on header row
        String[] headers = reader.header;
        int index = 0;
        for (String head : headers) {
            // there are never any quotes in term names - remove them just in case the csvreader didnt recognize them
            if (head != null && head.length() > 1) {
                try {
                    Term dt = TERM_FACTORY.findTerm(head);
                    ArchiveField field = new ArchiveField(index, dt, null, DataType.string);
                    if (dwcFile.getId() == null && (dt.equals(DwcTerm.occurrenceID) || dt.equals(DwcTerm.taxonID)
                            || dt.equals(DwcTerm.eventID) || dt.equals(DcTerm.identifier))) {
                        dwcFile.setId(field);

                        // Set the rowType corresponding to the type of id encountered (e.g. dwc:Taxon for taxonID).
                        // Please note the ordering of ids is important, and the first id encountered will be used
                        // unless the generic id (dc:identifier) was encountered first.
                        // Ideally only one id will be used per rowType anyways.
                        if (dwcFile.getRowType() == null && dwcFile.getId().getTerm() != DcTerm.identifier) {
                            if (dt.equals(DwcTerm.occurrenceID)) {
                                dwcFile.setRowType(DwcTerm.Occurrence);
                            } else if (dt.equals(DwcTerm.taxonID)) {
                                dwcFile.setRowType(DwcTerm.Taxon);
                            } else if (dt.equals(DwcTerm.eventID)) {
                                dwcFile.setRowType(DwcTerm.Event);
                            }
                        }
                    }
                    dwcFile.addField(field);
                } catch (IllegalArgumentException e) {
                    LOG.warn("Illegal term name >>{}<< found in header, ignore column {}", head, index);
                }
            }
            index++;
        }

        return dwcFile;
    }

    @VisibleForTesting
    protected static void readMetaDescriptor(Archive archive, InputStream metaDescriptor)
            throws UnsupportedArchiveException {

        try {
            SAXParser p = SAX_FACTORY.newSAXParser();
            MetaHandler mh = new MetaHandler(archive);
            LOG.debug("Reading archive metadata file");
            p.parse(new BOMInputStream(metaDescriptor), mh);
        } catch (Exception e1) {
            LOG.warn("Exception caught", e1);
            throw new UnsupportedArchiveException(e1);
        }
    }

    private static Archive validateArchive(Archive archive) throws UnsupportedArchiveException {
        validateCoreFile(archive.getCore(), !archive.getExtensions().isEmpty());
        for (ArchiveFile af : archive.getExtensions()) {
            validateExtensionFile(af);
        }
        // report basic stats
        LOG.debug("Archive contains " + archive.getExtensions().size() + " described extension files");
        LOG.debug("Archive contains " + archive.getCore().getFields().size() + " core properties");
        return archive;
    }

    private static void validateCoreFile(ArchiveFile f, boolean hasExtensions) throws UnsupportedArchiveException {
        if (hasExtensions) {
            if (f.getId() == null) {
                LOG.warn("DwC-A core data file " + f.getTitle()
                        + " is lacking an id column. No extensions allowed in this case");
            }
        }
        validateFile(f);
    }

    private static void validateExtensionFile(ArchiveFile f) throws UnsupportedArchiveException {
        if (f.getId() == null) {
            throw new UnsupportedArchiveException(
                    "DwC-A data file " + f.getTitle() + " requires an id or foreign key to the core id");
        }
        validateFile(f);
    }

    private static void validateFile(ArchiveFile f) throws UnsupportedArchiveException {
        if (f == null) {
            throw new UnsupportedArchiveException("DwC-A data file is NULL");
        }
        if (f.getLocationFile() == null) {
            throw new UnsupportedArchiveException("DwC-A data file " + f.getTitle() + " requires a location");
        }
        if (f.getEncoding() == null) {
            throw new UnsupportedArchiveException(
                    "DwC-A data file " + f.getTitle() + " requires a character encoding");
        }

    }

}