org.kurator.validation.actors.io.DwCaReader.java Source code

Java tutorial

Introduction

Here is the source code for org.kurator.validation.actors.io.DwCaReader.java

Source

/** 
 * DwCaReader.java 
 * 
 * Copyright 2015 President and Fellows of Harvard College
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kurator.validation.actors.io;

import akka.actor.PoisonPill;
import akka.routing.Broadcast;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.filteredpush.kuration.util.SpecimenRecord;
import org.gbif.dwc.record.StarRecord;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.text.Archive;
import org.gbif.dwc.text.ArchiveFactory;
import org.gbif.dwc.text.UnsupportedArchiveException;
import org.kurator.akka.actors.OneShot;

import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

/**
 * Actor to read occurrence data from DarwinCore Archive files.
 * 
 * @author mole
 *
 */
public class DwCaReader extends OneShot {

    private static final Log logger = LogFactory.getLog(DwCaReader.class);

    public String filePath;
    public Archive dwcArchive = null;

    Iterator<StarRecord> iterator;

    @Override
    public void fireOnce() throws Exception {
        File file = new File(filePath);
        if (!file.exists()) {
            // Error
            logger.error(filePath + " not found.");
        }
        if (!file.canRead()) {
            // error
            logger.error("Unable to read " + filePath);
        }
        if (file.isDirectory()) {
            // check if it is an unzipped dwc archive.
            dwcArchive = openArchive(file);
        }
        if (file.isFile()) {
            // unzip it
            File outputDirectory = new File(file.getName().replace(".", "_") + "_content");
            if (!outputDirectory.exists()) {
                outputDirectory.mkdir();
                try {
                    byte[] buffer = new byte[1024];
                    ZipInputStream inzip = new ZipInputStream(new FileInputStream(file));
                    ZipEntry entry = inzip.getNextEntry();
                    while (entry != null) {
                        String fileName = entry.getName();
                        File expandedFile = new File(outputDirectory.getPath() + File.separator + fileName);
                        new File(expandedFile.getParent()).mkdirs();
                        FileOutputStream expandedfileOutputStream = new FileOutputStream(expandedFile);
                        int len;
                        while ((len = inzip.read(buffer)) > 0) {
                            expandedfileOutputStream.write(buffer, 0, len);
                        }

                        expandedfileOutputStream.close();
                        entry = inzip.getNextEntry();
                    }
                    inzip.closeEntry();
                    inzip.close();
                    System.out.println("Unzipped archive into " + outputDirectory.getPath());
                } catch (FileNotFoundException e) {
                    logger.error(e.getMessage());
                    e.printStackTrace();
                } catch (IOException e) {
                    logger.error(e.getMessage());
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
            // look into the unzipped directory
            dwcArchive = openArchive(outputDirectory);

            iterator = dwcArchive.iterator();
            while (iterator.hasNext()) {
                // read initial set of rows, pass downstream
                StarRecord dwcrecord = iterator.next();
                SpecimenRecord record = new SpecimenRecord(dwcrecord);
                broadcast(record);
            }
        }
        if (dwcArchive != null) {
            if (checkArchive()) {
                // good to go
            }
        } else {
            System.out.println("Problem opening archive.");
        }
    }

    /**
     * Attempt to open a DarwinCore archive directory and return it as an Archive object.  
     * If an UnsupportedArchiveException is thrown, trys again harder by looking for an archive
     * directory inside the provided directory.
     * 
     * @param outputDirectory directory that should represent an unzipped DarwinCore archive.
     * @return an Archive object repsesenting the content of the directory or null if unable
     * to open an archive object.
     */
    protected Archive openArchive(File outputDirectory) {
        Archive result = null;
        try {
            result = ArchiveFactory.openArchive(outputDirectory);
        } catch (UnsupportedArchiveException e) {
            logger.error(e.getMessage());
            File[] containedFiles = outputDirectory.listFiles();
            boolean foundContained = false;
            for (int i = 0; i < containedFiles.length; i++) {
                if (containedFiles[i].isDirectory()) {
                    try {
                        // Try harder, some pathological archives contain a extra level of subdirectory
                        result = ArchiveFactory.openArchive(containedFiles[i]);
                        foundContained = true;
                    } catch (Exception e1) {
                        logger.error(e.getMessage());
                        System.out.println("Unable to open archive directory " + e.getMessage());
                        System.out.println(
                                "Unable to open directory contained within archive directory " + e1.getMessage());
                    }
                }
            }
            if (!foundContained) {
                System.out.println("Unable to open archive directory " + e.getMessage());
            }
        } catch (IOException e) {
            logger.error(e.getMessage());
            System.out.println("Unable to open archive directory " + e.getMessage());
        }
        return result;
    }

    protected boolean checkArchive() {
        boolean result = false;
        if (dwcArchive == null) {
            return result;
        }
        if (dwcArchive.getCore() == null) {
            System.out.println("Cannot locate the core datafile in " + dwcArchive.getLocation().getPath());
            return result;
        }
        System.out.println("Core file found: " + dwcArchive.getCore().getLocations());
        System.out.println("Core row type: " + dwcArchive.getCore().getRowType());
        if (dwcArchive.getCore().getRowType().equals(DwcTerm.Occurrence)) {

            // check expectations 
            List<DwcTerm> expectedTerms = new ArrayList<DwcTerm>();
            expectedTerms.add(DwcTerm.scientificName);
            expectedTerms.add(DwcTerm.scientificNameAuthorship);
            expectedTerms.add(DwcTerm.eventDate);
            expectedTerms.add(DwcTerm.recordedBy);
            expectedTerms.add(DwcTerm.decimalLatitude);
            expectedTerms.add(DwcTerm.decimalLongitude);
            expectedTerms.add(DwcTerm.locality);
            expectedTerms.add(DwcTerm.basisOfRecord);

            for (DwcTerm term : expectedTerms) {
                if (!dwcArchive.getCore().hasTerm(term)) {
                    System.out.println("Cannot find " + term + " in core of input dataset.");
                }
            }

            result = true;
        } else {
            // currently can only process occurrence core
        }

        return result;
    }

}