org.gbif.dwca.action.ValidateAction.java Source code


Introduction

Here is the source code for org.gbif.dwca.action.ValidateAction.java, a Struts 2 action behind the GBIF Darwin Core Archive validator: it downloads or receives an archive, validates its meta.xml against the DwC-A schema, inspects the data files for broken rows, duplicate ids and broken referential integrity, validates the EML metadata and writes an HTML report.
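
The class builds on the GBIF dwca-reader API. As a quick orientation before the full listing, here is a minimal sketch of the central calls (ArchiveFactory.openArchive, Archive.iterator(), StarRecord); it uses only types and methods that appear in the source below, except that the class name and archive path are illustrative placeholders and close() on ClosableIterator is assumed from its contract:

import java.io.File;

import org.gbif.dwc.record.Record;
import org.gbif.dwc.text.Archive;
import org.gbif.dwc.text.ArchiveFactory;
import org.gbif.dwc.text.StarRecord;
import org.gbif.utils.file.ClosableIterator;

public class DwcaReadSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder path: point this at an extracted Darwin Core Archive folder.
        Archive archive = ArchiveFactory.openArchive(new File("/tmp/my-dwca"));
        System.out.println("Core row type: " + archive.getCore().getRowType());

        // Iterate star records: one core row joined with its extension rows.
        ClosableIterator<StarRecord> iter = archive.iterator();
        try {
            while (iter.hasNext()) {
                StarRecord star = iter.next();
                System.out.println("core id: " + star.core().id());
                for (Record ext : star) {
                    System.out.println("  extension row of type " + ext.rowType());
                }
            }
        } finally {
            iter.close();
        }
    }
}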

Source

/***************************************************************************
 * Copyright 2010 Global Biodiversity Information Facility Secretariat
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 ***************************************************************************/

package org.gbif.dwca.action;

import org.gbif.api.model.registry.Dataset;
import org.gbif.dwc.record.Record;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.dwc.text.Archive;
import org.gbif.dwc.text.ArchiveFactory;
import org.gbif.dwc.text.ArchiveField;
import org.gbif.dwc.text.ArchiveFile;
import org.gbif.dwc.text.StarRecord;
import org.gbif.dwca.model.Extension;
import org.gbif.dwca.model.ExtensionProperty;
import org.gbif.dwca.service.ExtensionManager;
import org.gbif.dwca.service.ValidationService;
import org.gbif.dwca.utils.FreemarkerUtils;
import org.gbif.dwca.utils.UrlUtils;
import org.gbif.file.CSVReader;
import org.gbif.registry.metadata.parse.DatasetParser;
import org.gbif.utils.HttpUtil;
import org.gbif.utils.collection.CompactHashSet;
import org.gbif.utils.file.ClosableIterator;
import org.gbif.utils.file.CompressionUtil;
import org.gbif.utils.file.CompressionUtil.UnsupportedCompressionType;
import org.gbif.utils.file.FileUtils;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
import javax.xml.transform.Source;
import javax.xml.transform.stream.StreamSource;

import com.google.inject.Inject;
import freemarker.ext.beans.BeansWrapper;
import freemarker.template.Configuration;
import freemarker.template.TemplateException;
import gnu.trove.map.TObjectByteMap;
import gnu.trove.map.hash.TObjectByteHashMap;
import gnu.trove.map.hash.TObjectLongHashMap;
import org.apache.commons.io.filefilter.HiddenFileFilter;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateFormatUtils;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
import org.apache.struts2.views.freemarker.StrutsBeanWrapper;

/**
 * Struts 2 action that validates a Darwin Core Archive: it checks meta.xml against the
 * DwC-A schema, inspects the data files for structural and referential problems, validates
 * the EML metadata and stores an HTML report of the results.
 *
 * @author markus
 */
public class ValidateAction extends BaseAction {
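    // matches verbatim NULL placeholders in data cells: "null", "\N" or whitespace-only values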
    protected static final Pattern NULL_REPL = Pattern.compile("^\\s*(null|\\\\N|\\s)\\s*$",
            Pattern.CASE_INSENSITIVE);

    private static final String REPORTS_DIR_KEY = "reports.dir";
    private static final String REPORTS_WWW_KEY = "reports.www";

    private File file;
    private String fileContentType;
    private String fileFileName;
    private String meta;
    private String archiveUrl;
    private String ifModifiedSince;
    private Date ifModifiedSinceDate;
    @Inject
    private ValidationService validation;
    @Inject
    private ExtensionManager extensionManager;
    @Inject
    private HttpUtil http;
    @Inject
    private Configuration fm;

    // additional webservice params
    private String reportId;
    private String reportUrl;

    // results

    private Date now = new Date();
    // did archive validation succeed overall?
    private boolean valid = true;
    // is the archive accessible online via the given url?
    private boolean online = true;
    private String offlineReason;
    // meta.xml
    private boolean metaExists = true;
    private boolean metaOnly = true;
    private Exception schemaException;
    private List<StackTraceElement> schemaStackTrace;
    // archive factory
    private Exception dwcaException;
    private List<StackTraceElement> dwcaStackTrace = new ArrayList<StackTraceElement>();
    private Archive archive;
    private Map<String, Extension> extensions;
    private Map<String, List<ArchiveField>> fields = new HashMap<String, List<ArchiveField>>();
    private Map<String, List<ExtensionProperty>> fieldsMissing = new HashMap<String, List<ExtensionProperty>>();
    private Map<String, List<ArchiveField>> fieldsUnknown = new HashMap<String, List<ArchiveField>>();
    // as found in dwca folder
    private Set<String> dwcaFiles = new HashSet<String>(); // all files in the archive except meta.xml and the metadata file
    private String coreFile;
    // limits for how many problem records are reported and how many core ids are stored
    private static final int MAX_RECORDS_REPORTED = 50;
    private static final int MAX_IDS_STORED = 2000000;
    // key=filename
    private Map<String, Map<Integer, String[]>> brokenLines = new HashMap<String, Map<Integer, String[]>>();
    private Map<String, Set<String>> missingIds = new HashMap<String, Set<String>>();
    private Map<String, Set<String>> brokenRefIntegrity = new HashMap<String, Set<String>>();
    private Map<String, Integer> fileLines = new HashMap<String, Integer>();
    private Map<String, Integer> fileColumns = new HashMap<String, Integer>();
    private Set<String> nonUniqueId = new CompactHashSet<String>();
    private TObjectLongHashMap<String> nullValues = new TObjectLongHashMap<String>();
    private Set<String> acceptedSynonyms;
    private Set<String> parentSynonyms;
    private Set<Integer> emptyLines;
    private TObjectByteMap<String> coreIds = new TObjectByteHashMap<String>();
    // metadata
    private Dataset metadata;
    private Exception metadataException;
    private ArrayList<StackTraceElement> metadataStackTrace = new ArrayList<StackTraceElement>();
    private boolean emlSchemaValidated = false;
    private boolean gbifSchemaValidated = false;
    private boolean tooManyCoreIds = false;

    // records
    private int scanSize = 100;
    private List<List<List<String>>> records = new ArrayList<List<List<String>>>();
    private List<String> extensionOrder = new ArrayList<String>();
    private Map<String, List<String>> recordsHeader = new TreeMap<String, List<String>>();
    private Exception recordsException;
    private ArrayList<StackTraceElement> recordsStackTrace = new ArrayList<StackTraceElement>();
    private StatusLine status;

    static class ArchiveLocation {

        public File dwcaFolder;
        public File metaFile;
    }

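    /**
     * Creates a unique temporary directory for an extracted archive by creating a temp file,
     * deleting it and recreating the same path as a directory.
     */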
    public static File createDwcaDirectory() throws IOException {
        final File temp;

        temp = File.createTempFile("dwca-", Long.toString(System.nanoTime()));

        if (!(temp.delete())) {
            throw new IOException("Could not delete temp dwca file: " + temp.getAbsolutePath());
        }

        if (!(temp.mkdir())) {
            throw new IOException("Could not create temp dwca directory: " + temp.getAbsolutePath());
        }

        return (temp);
    }

    public String eml() throws Exception {
        if (file != null || meta != null) {
            validateEml();
            return SUCCESS;
        }
        return INPUT;
    }

    public Map<String, Set<String>> getBrokenRefIntegrity() {
        return brokenRefIntegrity;
    }

    public boolean isTooManyCoreIds() {
        return tooManyCoreIds;
    }

    public Map<String, Integer> getFileColumns() {
        return fileColumns;
    }

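    /**
     * Resolves the archive to validate from one of three inputs: an uploaded file, a URL to
     * download (optionally via a conditional GET when ifModifiedSince is given) or a meta.xml
     * document pasted as plain text.
     */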
    private ArchiveLocation extractArchive() throws IOException {
        ArchiveLocation archLoc = null;
        if (fileFileName != null) {
            archLoc = openArchive(file, fileFileName);
        } else if (!StringUtils.isBlank(archiveUrl)) {
            // url to archive provided - download, extract and validate
            URL url = new URL(UrlUtils.encodeURLWhitespace(archiveUrl.trim()));
            file = File.createTempFile("download-", ".dwca");
            // use conditional get?
            if (ifModifiedSinceDate != null) {
                log.debug("Use conditional get for download if modified since: " + ifModifiedSinceDate.toString());
                status = http.downloadIfModifiedSince(url, ifModifiedSinceDate, file);
            } else {
                status = http.download(url, file);
            }
            if (status.getStatusCode() == HttpStatus.SC_NOT_MODIFIED) {
                // not modified, no need to download and validate
                online = true;
                archLoc = null;
                addActionMessage("The archive hasn't been modified since " + ifModifiedSinceDate.toString());
                addActionMessage("No download and validation done.");
            } else if (http.success(status)) {
                online = true;
                archLoc = openArchive(file, url.getFile());
            } else {
                String reason = "HTTP" + status.getStatusCode();
                if (status.getReasonPhrase() != null) {
                    reason += " " + status.getReasonPhrase();
                }
                setOffline(reason);
            }
        } else if (meta != null) {
            // meta.xml provided as text
            archLoc = new ArchiveLocation();
            archLoc.metaFile = File.createTempFile("meta-", ".xml");
            archLoc.dwcaFolder = archLoc.metaFile;
            Writer w = FileUtils.startNewUtf8File(archLoc.metaFile);
            w.write(meta);
            w.close();
        }
        return archLoc;
    }

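    /**
     * Main entry point: parses the optional ifModifiedSince date, locates and extracts the
     * archive, validates meta.xml against the schema, inspects the archive contents and
     * finally stores an html report.
     */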
    @Override
    public String execute() {
        ArchiveLocation archLoc = null;
        try {
            if (!StringUtils.isBlank(ifModifiedSince)) {
                ifModifiedSinceDate = DateFormatUtils.ISO_DATE_FORMAT.parse(ifModifiedSince);
                if (ifModifiedSinceDate == null) {
                    log.debug("Use conditional get for download if modified since: " + ifModifiedSince);
                    return INPUT;
                }
            }

            archLoc = extractArchive();
            if (archLoc == null && status == null) {
                return INPUT;
            }
            if (archLoc != null) {
                extensions = extensionManager.map();
                validateAgainstSchema(archLoc.metaFile);
                validateArchive(archLoc.dwcaFolder);
            }
        } catch (ParseException e) {
            setOffline("Invalid ISO date " + e.getMessage());
        } catch (MalformedURLException e) {
            setOffline("MalformedURLException " + e.getMessage());
        } catch (SocketException e) {
            setOffline(e.getClass().getSimpleName() + " " + e.getMessage());
        } catch (Exception e) {
            log.error("Unknown error when validating archive", e);
            valid = false;
        } finally {
            // cleanup temp files!
            cleanupTempFile(archLoc);
        }

        // store html report
        if (archLoc != null) {
            storeReport();
        }

        return SUCCESS;
    }

    private void cleanupTempFile(ArchiveLocation archLoc) {
        Collection<File> tempFiles = new ArrayList<File>();
        tempFiles.add(file);
        if (archLoc != null) {
            tempFiles.add(archLoc.metaFile);
            tempFiles.add(archLoc.dwcaFolder);
        }
        for (File f : tempFiles) {
            if (f != null && f.exists()) {
                try {
                    org.apache.commons.io.FileUtils.forceDelete(f);
                } catch (IOException e) {
                    log.warn("Failed to remove temporary file/folder " + f.getAbsolutePath(), e);
                }
            }

        }
    }

    private void setOffline(String reason) {
        valid = false;
        online = false;
        metaExists = false;
        offlineReason = reason;
    }

    /**
     * Copies the html result of this action into a file that is stored for subsequent access.
     */
    private boolean storeReport() {
        if (reportId == null) {
            // create random report id
            Random rnd = new Random();
            reportId = String.format("%Tj-%d", new Date(), Math.abs(rnd.nextLong()));
        }
        File report = new File(cfg.getProperty(REPORTS_DIR_KEY), reportId + ".html");
        reportUrl = cfg.getProperty(REPORTS_WWW_KEY) + "/" + reportId + ".html";
        log.debug("Writing validation report to " + report.getAbsolutePath());
        try {
            BeansWrapper wrapper = new StrutsBeanWrapper(true);
            wrapper.setExposureLevel(0);
            fm.setObjectWrapper(wrapper);
            FreemarkerUtils.writeUtf8File(fm, report, "/WEB-INF/pages/validate_report.ftl", this);
        } catch (TemplateException e) {
            log.error("Cannot find template for validation report", e);
            return false;
        } catch (IOException e) {
            log.error("Cannot write validation report", e);
            return false;
        }
        return true;
    }

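    /**
     * Decompresses the source file into a fresh temp directory. If decompression fails the
     * upload is treated as either a standalone meta.xml (for xml files) or a single data file.
     */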
    private ArchiveLocation openArchive(File sourceFile, String originalFileName) throws IOException {
        ArchiveLocation loc = new ArchiveLocation();
        loc.dwcaFolder = createDwcaDirectory();
        List<File> files;
        try {
            files = CompressionUtil.decompressFile(loc.dwcaFolder, sourceFile);
            metaOnly = false;
            loc.metaFile = new File(loc.dwcaFolder, "meta.xml");
        } catch (UnsupportedCompressionType e) {
            // seems to be a single file
            // move single file into temp dir and rename it to original
            File f = new File(loc.dwcaFolder, originalFileName);
            org.apache.commons.io.FileUtils.moveFile(sourceFile, f);
            if (originalFileName.endsWith(".xml")) {
                // might be a meta.xml on its own?
                loc.metaFile = f;
                addActionMessage("Cannot decompress archive - treat like meta.xml on its own");
            } else {
                // some single text file?
                metaOnly = false;
                loc.metaFile = null;
                loc.dwcaFolder = f;
                addActionMessage("Cannot decompress archive - treat like single data file");
            }
        }
        return loc;
    }

    public Archive getArchive() {
        return archive;
    }

    public Exception getDwcaException() {
        return dwcaException;
    }

    public Set<String> getDwcaFiles() {
        return dwcaFiles;
    }

    public String getDwcaSchema() {
        return cfg.getMetaSchema();
    }

    public List<StackTraceElement> getDwcaStackTrace() {
        return dwcaStackTrace;
    }

    private InputStream getEmlInputStream() throws FileNotFoundException {
        InputStream src = null;
        if (file != null) {
            // file upload
            src = new FileInputStream(file);
        } else {
            // copy & paste in this.meta; read as UTF-8 bytes (StringBufferInputStream is
            // deprecated and drops non-Latin-1 characters)
            src = new ByteArrayInputStream(meta.getBytes(StandardCharsets.UTF_8));
        }
        return src;
    }

    private Source getEmlSource() throws FileNotFoundException {
        Source src = new StreamSource(getEmlInputStream());
        return src;
    }

    public List<String> getExtensionOrder() {
        return extensionOrder;
    }

    public Map<String, Extension> getExtensions() {
        return extensions;
    }

    public Map<String, List<ArchiveField>> getFields() {
        return fields;
    }

    public Map<String, List<ExtensionProperty>> getFieldsMissing() {
        return fieldsMissing;
    }

    public Map<String, List<ArchiveField>> getFieldsUnknown() {
        return fieldsUnknown;
    }

    public String getFileFileName() {
        return fileFileName;
    }

    public Date getLastUpdated() {
        return validation.getLastUpdate();
    }

    public String getMeta() {
        return meta;
    }

    public Dataset getMetadata() {
        return metadata;
    }

    public Exception getMetadataException() {
        return metadataException;
    }

    public ArrayList<StackTraceElement> getMetadataStackTrace() {
        return metadataStackTrace;
    }

    public List<List<List<String>>> getRecords() {
        return records;
    }

    public Exception getRecordsException() {
        return recordsException;
    }

    public Map<String, List<String>> getRecordsHeader() {
        return recordsHeader;
    }

    public ArrayList<StackTraceElement> getRecordsStackTrace() {
        return recordsStackTrace;
    }

    public int getScanSize() {
        return scanSize;
    }

    public Exception getSchemaException() {
        return schemaException;
    }

    public List<StackTraceElement> getSchemaStackTrace() {
        return schemaStackTrace;
    }

    /**
     * Gets the row value or throws an informative exception if the column index does not exist.
     */
    private String getRowValue(String[] row, int column, String columnName) {
        try {
            return StringUtils.trimToNull(row[column]);
        } catch (Exception e) {
            valid = false;
            throw new IllegalArgumentException("Column " + columnName + " with index " + column + " does not exist",
                    e);
        }
    }

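    /**
     * Makes a single pass over one data file recording broken row widths, verbatim NULL values
     * and, for the core, duplicate ids; extension files are checked for ids missing from the
     * core. For cores mapping taxonomic foreign key terms a second pass verifies the
     * referential integrity of acceptedNameUsageID, parentNameUsageID and originalNameUsageID.
     */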
    private void inspectArchiveFile(ArchiveFile af, boolean core) {
        String rowType = af.getRowType();
        String filename = af.getLocation();
        // in case the archive is only a file, this will be null, so we need to use the archive folder name instead!
        if (filename == null) {
            filename = af.getArchive().getLocation().getName();
        }

        fields.put(rowType, new ArrayList<ArchiveField>());
        fieldsUnknown.put(rowType, new ArrayList<ArchiveField>());
        fieldsMissing.put(rowType, new ArrayList<ExtensionProperty>());

        // registered extension?
        Extension ext = extensionManager.get(rowType);

        Collection<ArchiveField> mapped = af.getFields().values();
        if (ext != null) {
            extensions.put(rowType, ext);
            // missing required fields?
            for (ExtensionProperty p : ext.getProperties()) {
                if (p.isRequired() && !af.hasTerm(p.getQualname())) {
                    fieldsMissing.get(rowType).add(p);
                    valid = false;
                }
            }
        }

        // known and unknown mapped fields
        for (ArchiveField f : mapped) {
            if (ext != null) {
                if (ext.hasProperty(f.getTerm())) {
                    fields.get(rowType).add(f);
                } else {
                    fieldsUnknown.get(rowType).add(f);
                }
            } else {
                fields.get(rowType).add(f);
            }
        }

        // test the data file unless already done (the same file can be mapped more than once)
        Map<Integer, String[]> afBrokenLines = new HashMap<Integer, String[]>();
        Set<String> afMissingIds = new CompactHashSet<String>();

        try {
            CSVReader reader = af.getCSVReader();
            int idColumn = -1;
            if (af.getId() != null && af.getId().getIndex() != null) {
                idColumn = af.getId().getIndex();
            }
            int rowSize = -1;
            if (reader.headerRows > 0) {
                rowSize = reader.header.length;
            }
            int acceptedUsageIdx = -1;
            if (af.hasTerm(DwcTerm.acceptedNameUsageID)) {
                acceptedUsageIdx = af.getField(DwcTerm.acceptedNameUsageID).getIndex();
            }
            while (reader.hasNext()) {
                String[] row = reader.next();
                if (rowSize < 0) {
                    rowSize = row.length;
                } else if (afBrokenLines.size() < MAX_RECORDS_REPORTED && row.length != rowSize) {
                    afBrokenLines.put(reader.currLineNumber(), row);
                }
                // check all columns for verbatim NULL strings
                for (String val : row) {
                    if (NULL_REPL.matcher(val).find()) {
                        nullValues.adjustOrPutValue(val, 1, 1);
                    }
                }
                // core id?
                if (idColumn >= 0 && row.length > 0) {
                    String coreID = getRowValue(row, idColumn, "id");
                    if (core) {
                        // if it's a taxon, is it a synonym?
                        byte synonym = 0;
                        if (acceptedUsageIdx > -1) {
                            String acceptedID = getRowValue(row, acceptedUsageIdx,
                                    DwcTerm.acceptedNameUsageID.simpleName());
                            if (acceptedID != null && !acceptedID.equals(coreID)) {
                                synonym = 1;
                            }
                        }
                        // check uniqueness of taxonID
                        if (coreIds.size() < MAX_IDS_STORED) {
                            if (coreIds.containsKey(coreID) && nonUniqueId.size() < MAX_RECORDS_REPORTED) {
                                nonUniqueId.add(coreID);
                            } else {
                                coreIds.put(coreID, synonym);
                            }
                        } else {
                            tooManyCoreIds = true;
                        }
                    } else {
                        if (!tooManyCoreIds && afMissingIds.size() < MAX_RECORDS_REPORTED
                                && !coreIds.containsKey(coreID)) {
                            // we know about all core ids - make sure the extension one exists
                            afMissingIds.add(coreID);
                        }
                    }
                }
            }
            // report empty lines
            emptyLines = reader.getEmptyLines();
            fileLines.put(filename, reader.currLineNumber());
            fileColumns.put(filename, rowSize);

            // potential second pass to verify foreign keys in the core
            if (core && !tooManyCoreIds) {
                Set<String> missingAcceptedUsageIDs = new HashSet<String>();
                this.brokenRefIntegrity.put(DwcTerm.acceptedNameUsageID.simpleName(), missingAcceptedUsageIDs);

                int parentUsageIdx = -1;
                Set<String> missingParentUsageIDs = new HashSet<String>();
                if (af.hasTerm(DwcTerm.parentNameUsageID)) {
                    parentUsageIdx = af.getField(DwcTerm.parentNameUsageID).getIndex();
                    this.brokenRefIntegrity.put(DwcTerm.parentNameUsageID.simpleName(), missingParentUsageIDs);
                }

                int originalNameIdx = -1;
                Set<String> missingOriginalUsageIDs = new HashSet<String>();
                if (af.hasTerm(DwcTerm.originalNameUsageID)) {
                    originalNameIdx = af.getField(DwcTerm.originalNameUsageID).getIndex();
                    this.brokenRefIntegrity.put(DwcTerm.originalNameUsageID.simpleName(), missingOriginalUsageIDs);
                }

                // only check file again if foreign key terms are mapped
                if (acceptedUsageIdx >= 0 || parentUsageIdx >= 0 || originalNameIdx >= 0) {

                    if (acceptedUsageIdx >= 0) {
                        acceptedSynonyms = new CompactHashSet<String>();
                    }
                    if (parentUsageIdx >= 0) {
                        parentSynonyms = new CompactHashSet<String>();
                    }
                    reader.close();
                    reader = af.getCSVReader();
                    while (reader.hasNext()) {
                        String[] row = reader.next();
                        String coreID = getRowValue(row, idColumn, "id");
                        // check foreign key terms
                        if (acceptedUsageIdx >= 0 && row.length > acceptedUsageIdx) {
                            // it is allowed to concat multiple ids with PIPEs - split them
                            String acceptedIds = getRowValue(row, acceptedUsageIdx,
                                    DwcTerm.acceptedNameUsageID.simpleName());
                            if (acceptedIds != null) {
                                for (String accId : StringUtils.split(acceptedIds, '|')) {
                                    if (missingAcceptedUsageIDs.size() < MAX_RECORDS_REPORTED
                                            && !coreIds.containsKey(accId)) {
                                        missingAcceptedUsageIDs.add(accId);
                                    }
                                    // is the referenced accepted record a synonym?
                                    if (acceptedSynonyms.size() < MAX_RECORDS_REPORTED && coreIds.containsKey(accId)
                                            && coreIds.get(accId) != 0) {
                                        acceptedSynonyms.add(coreID);
                                    }
                                }
                            }
                        }
                        if (parentUsageIdx >= 0) {
                            String parentID = getRowValue(row, parentUsageIdx,
                                    DwcTerm.parentNameUsageID.simpleName());
                            if (!StringUtils.isBlank(parentID)) {
                                if (missingParentUsageIDs.size() < MAX_RECORDS_REPORTED
                                        && !coreIds.containsKey(parentID)) {
                                    missingParentUsageIDs.add(parentID);
                                }
                                // is the referenced parent record a synonym?
                                if (parentSynonyms.size() < MAX_RECORDS_REPORTED && coreIds.containsKey(parentID)
                                        && coreIds.get(parentID) != 0) {
                                    parentSynonyms.add(coreID);
                                }
                            }
                        }
                        if (originalNameIdx >= 0) {
                            String originalNameID = getRowValue(row, originalNameIdx,
                                    DwcTerm.originalNameUsageID.simpleName());
                            if (!StringUtils.isBlank(originalNameID)
                                    && missingOriginalUsageIDs.size() < MAX_RECORDS_REPORTED
                                    && !coreIds.containsKey(originalNameID)) {
                                missingOriginalUsageIDs.add(originalNameID);
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            log.error("Error while inspecting data file " + filename, e);
            valid = false;
        } finally {
            if (filename != null) {
                brokenLines.put(filename, afBrokenLines);
                missingIds.put(filename, afMissingIds);
            }
            // is this data file valid?
            if (!nonUniqueId.isEmpty()) {
                valid = false;
            }
            if (!afBrokenLines.isEmpty()) {
                valid = false;
            }
            if (!afMissingIds.isEmpty()) {
                valid = false;
            }
            for (Set<String> integr : this.brokenRefIntegrity.values()) {
                if (!integr.isEmpty()) {
                    valid = false;
                }
            }
        }

    }

    public Map<String, Map<Integer, String[]>> getBrokenLines() {
        return brokenLines;
    }

    public Map<String, Set<String>> getMissingIds() {
        return missingIds;
    }

    public Map<String, Integer> getFileLines() {
        return fileLines;
    }

    public Set<String> getNonUniqueId() {
        return nonUniqueId;
    }

    public Set<Integer> getEmptyLines() {
        return emptyLines;
    }

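    /**
     * Builds one display row for the report: the core id or the simple extension name first,
     * followed by all term values, padded with empty strings to the requested row size.
     */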
    private List<String> interpretRecord(List<Term> concepts, Record rec, boolean isCore, int rowSize) {
        List<String> row = new ArrayList<String>();
        if (isCore) {
            row.add(rec.id());
        } else {
            String name = StringUtils.substringAfterLast(rec.rowType(), "/");
            row.add(name);
        }
        for (Term t : concepts) {
            row.add(rec.value(t));
        }
        // make sure all rows have the same width
        while (row.size() < rowSize) {
            row.add("");
        }
        return row;
    }

    public boolean isEmlSchemaValidated() {
        return emlSchemaValidated;
    }

    public boolean isGbifSchemaValidated() {
        return gbifSchemaValidated;
    }

    public boolean isMetaExists() {
        return metaExists;
    }

    public boolean isMetaOnly() {
        return metaOnly;
    }

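    /**
     * Stores the exception and trims its stack trace at the first frame originating in this class.
     */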
    private void setDwcaException(Exception e) {
        valid = false;
        dwcaException = e;
        for (StackTraceElement el : e.getStackTrace()) {
            if (el.getClassName().equalsIgnoreCase(this.getClass().getName())) {
                dwcaStackTrace.add(el);
                break;
            }
            dwcaStackTrace.add(el);
        }
    }

    public void setFile(File file) {
        this.file = file;
    }

    public void setFileContentType(String fileContentType) {
        this.fileContentType = fileContentType;
    }

    public void setFileFileName(String fileFileName) {
        this.fileFileName = fileFileName;
    }

    public void setMeta(String meta) {
        this.meta = meta;
    }

    public String getArchiveUrl() {
        return archiveUrl;
    }

    public void setArchiveUrl(String archiveUrl) {
        this.archiveUrl = archiveUrl;
    }

    private void setMetadataException(Exception e) {
        metadataException = e;
        for (StackTraceElement el : e.getStackTrace()) {
            if (el.getClassName().equalsIgnoreCase(this.getClass().getName())) {
                metadataStackTrace.add(el);
                break;
            }
            metadataStackTrace.add(el);
        }
    }

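    /**
     * Reads up to scanSize star records from the archive and prepares the interpreted sample
     * rows and per-rowType headers shown in the report.
     */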
    private void setRecords() {
        try {
            // prepare ordered headers
            Map<String, List<Term>> recordsHeaderFull = new HashMap<String, List<Term>>();
            List<Term> terms = new ArrayList<Term>();
            recordsHeaderFull.put(archive.getCore().getRowType(), terms);
            for (Term t : archive.getCore().getFields().keySet()) {
                terms.add(t);
            }
            int maxRecordWidth = terms.size();
            for (ArchiveFile af : archive.getExtensions()) {
                terms = new ArrayList<Term>();
                recordsHeaderFull.put(af.getRowType(), terms);
                for (Term t : af.getFields().keySet()) {
                    terms.add(t);
                }
                if (terms.size() > maxRecordWidth) {
                    maxRecordWidth = terms.size();
                }
            }
            // finally loop through the data
            ClosableIterator<StarRecord> iter = archive.iterator();
            int i = 0;
            while (iter.hasNext() && i < scanSize) {
                StarRecord rec = iter.next();
                List<List<String>> interpretedRecord = new ArrayList<List<String>>();
                records.add(interpretedRecord);
                // first the core
                interpretedRecord.add(interpretRecord(recordsHeaderFull.get(rec.core().rowType()), rec.core(), true,
                        maxRecordWidth + 1));
                for (Record r : rec) {
                    interpretedRecord
                            .add(interpretRecord(recordsHeaderFull.get(r.rowType()), r, false, maxRecordWidth));
                }
                i++;
            }
            // finally use only simple extension names for headers:
            String coreName = StringUtils.substringAfterLast(archive.getCore().getRowType(), "/");
            recordsHeader.put(coreName, null);
            List<String> extensionNames = new ArrayList<String>();
            for (String rt : recordsHeaderFull.keySet()) {
                String name = StringUtils.substringAfterLast(rt, "/");
                List<String> concepts = new ArrayList<String>();
                for (Term ct : recordsHeaderFull.get(rt)) {
                    concepts.add(ct.simpleName());
                }
                while (concepts.size() < maxRecordWidth) {
                    concepts.add("");
                }
                recordsHeader.put(name, concepts);
                extensionNames.add(name);
            }
            extensionOrder.add(coreName);
            extensionNames.remove(coreName);
            Collections.sort(extensionNames);
            extensionOrder.addAll(extensionNames);
        } catch (Exception e) {
            setRecordsException(e);
        }
    }

    private void setRecordsException(Exception e) {
        recordsException = e;
        for (StackTraceElement el : e.getStackTrace()) {
            if (el.getClassName().equalsIgnoreCase(this.getClass().getName())) {
                recordsStackTrace.add(el);
                break;
            }
            recordsStackTrace.add(el);
        }
    }

    private void setSchemaException(Exception e) {
        valid = false;
        schemaException = e;
        schemaStackTrace = new ArrayList<StackTraceElement>();
        for (StackTraceElement el : e.getStackTrace()) {
            if (el.getClassName().equalsIgnoreCase(this.getClass().getName())) {
                schemaStackTrace.add(el);
                break;
            }
            schemaStackTrace.add(el);
        }
    }

    /**
     * Validates the given meta.xml file against the Darwin Core Archive XML schema.
     */
    private void validateAgainstSchema(File metaFile) {
        if (metaFile != null) {
            // perform validation:
            log.info("Validating meta.xml ...");
            try {
                validation.getMetaValidator().validate(new StreamSource(metaFile));
                log.info("XML Schema validation success.");
            } catch (Exception e) {
                setSchemaException(e);
            }
        } else {
            metaExists = false;
        }
    }

    /**
     * Opens the archive, inspects its core and extension files and reads sample records and metadata.
     *
     * @param dwcaFolder the folder (or single data file) holding the extracted archive
     */
    private void validateArchive(File dwcaFolder) {
        if (dwcaFolder == null) {
            return;
        }

        log.info("Inspecting uploaded dwc archive");
        try {
            archive = ArchiveFactory.openArchive(dwcaFolder);

            // inspect dwca folder files
            if (dwcaFolder.isDirectory()) {
                if (archive != null && archive.getCore() != null) {
                    coreFile = archive.getCore().getLocation();
                }
                dwcaFiles = new HashSet<String>(Arrays.asList(dwcaFolder.list(HiddenFileFilter.VISIBLE)));
                dwcaFiles.remove("meta.xml");
                if (archive.getMetadataLocation() != null) {
                    dwcaFiles.remove(new File(archive.getMetadataLocation()).getName());
                }
            } else {
                coreFile = dwcaFolder.getName();
                dwcaFiles.add(coreFile);
            }

            // inspect archive files
            ArchiveFile af = archive.getCore();
            inspectArchiveFile(af, true);
            for (ArchiveFile ext : archive.getExtensions()) {
                inspectArchiveFile(ext, false);
            }

            // read records
            if (!metaOnly) {
                setRecords();
            }
        } catch (Exception e) {
            setDwcaException(e);
        }

        // read metadata
        try {
            metadata = archive.getMetadata();
        } catch (Exception e) {
            setMetadataException(e);
        }
    }

    /**
     * Validates the EML metadata document against the GBIF metadata profile and the official EML schema.
     */
    private void validateEml() {
        // perform validation. EML first
        log.info("Validating against EML ...");

        // try gbif profile
        log.info("Validating against GBIF profile ...");
        try {
            validation.getGbifProfileValidator().validate(getEmlSource());
            gbifSchemaValidated = true;
            log.info("GBIF Profile Schema validation success.");

            // try against the official eml schema
            try {
                validation.getEmlValidator().validate(getEmlSource());
                emlSchemaValidated = true;
                log.info("EML Schema validation success.");
            } catch (Exception e) {
                setMetadataException(e);
            }

        } catch (Exception e) {
            setMetadataException(e);
        }

        // try to parse the EML doc with the registry dataset parser
        try {
            metadata = DatasetParser.build(getEmlInputStream());
        } catch (Exception e) {
            log.info("Cant parse eml document with dataset parser");
            metadataException = e;
        }

    }

    public boolean isValid() {
        return valid;
    }

    public String getReportId() {
        return reportId;
    }

    public void setReportId(String reportId) {
        this.reportId = reportId;
    }

    public boolean isOnline() {
        return online;
    }

    public Date getNow() {
        return now;
    }

    public String getReportUrl() {
        return reportUrl;
    }

    public String getOfflineReason() {
        return offlineReason;
    }

    public String getCoreFile() {
        return coreFile;
    }

    public String getIfModifiedSince() {
        return ifModifiedSince;
    }

    public void setIfModifiedSince(String ifModifiedSince) {
        this.ifModifiedSince = ifModifiedSince;
    }

    public StatusLine getStatus() {
        return status;
    }

    public Set<String> getParentSynonyms() {
        return parentSynonyms;
    }

    public Set<String> getAcceptedSynonyms() {
        return acceptedSynonyms;
    }

    public TObjectLongHashMap<String> getNullValues() {
        return nullValues;
    }

    public void setValidate(Object x) {
        //ignore
    }
}