org.gbif.occurrence.OccurrenceParser.java Source code

Introduction

Here is the source code for org.gbif.occurrence.OccurrenceParser.java
Source

/*
 * Copyright 2011 Global Biodiversity Information Facility (GBIF)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gbif.occurrence;

import org.gbif.api.exception.ServiceUnavailableException;
import org.gbif.occurrence.constants.ExtractionSimpleXPaths;
import org.gbif.occurrence.model.RawOccurrenceRecord;
import org.gbif.occurrence.parsing.RawXmlOccurrence;
import org.gbif.occurrence.parsing.response_file.ParsedSearchResponse;
import org.gbif.occurrence.parsing.xml.XmlFragmentParser;
import org.gbif.occurrence.util.XmlSanitizingReader;
import org.gbif.utils.file.CharsetDetection;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;

import com.sun.org.apache.xerces.internal.impl.io.MalformedByteSequenceException;
import org.apache.commons.digester.Digester;
import org.apache.commons.digester.NodeCreateRule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Entry point into the parsing of raw occurrence records as retrieved from publishers.  Will attempt to determine
 * both XML encodings and schema type.  Parse happens in two steps - first extracts each record element into a
 * RawXmlOccurrence, and then parses each of those into RawOccurrenceRecords.
 */
public class OccurrenceParser {

    private static final Logger LOG = LoggerFactory.getLogger(OccurrenceParser.class);

    public List<RawOccurrenceRecord> parseResponseFileToRor(File inputFile) {
        List<RawXmlOccurrence> raws = parseResponseFileToRawXml(inputFile);
        List<RawOccurrenceRecord> rors = parseRawXmlToRor(raws);

        return rors;
    }

    /**
     * This parses a stream of uncompressed ABCD or DwC Occurrences into {@link RawXmlOccurrence}s.
     *
     * No care is taken to handle wrong encodings or character sets in general. This might be changed later on.
     *
     * @param is stream to parse
     * @return list of parsed occurrences
     * @throws ParsingException if there were any problems during parsing the stream
     */
    // TODO: Optionally handle compressed streams
    public List<RawXmlOccurrence> parseStream(InputStream is) throws ParsingException {
        checkNotNull(is, "is can't be null");
        try {
            ParsedSearchResponse responseBody = new ParsedSearchResponse();
            InputSource inputSource = new InputSource(is);

            Digester digester = new Digester();
            digester.setNamespaceAware(true);
            digester.setValidating(false);
            digester.push(responseBody);

            NodeCreateRule rawAbcd = new NodeCreateRule();
            digester.addRule(ExtractionSimpleXPaths.ABCD_RECORD_XPATH, rawAbcd);
            digester.addSetNext(ExtractionSimpleXPaths.ABCD_RECORD_XPATH, "addRecordAsXml");

            NodeCreateRule rawAbcd1Header = new NodeCreateRule();
            digester.addRule(ExtractionSimpleXPaths.ABCD_HEADER_XPATH, rawAbcd1Header);
            digester.addSetNext(ExtractionSimpleXPaths.ABCD_HEADER_XPATH, "setAbcd1Header");

            NodeCreateRule rawDwc1_0 = new NodeCreateRule();
            digester.addRule(ExtractionSimpleXPaths.DWC_1_0_RECORD_XPATH, rawDwc1_0);
            digester.addSetNext(ExtractionSimpleXPaths.DWC_1_0_RECORD_XPATH, "addRecordAsXml");

            NodeCreateRule rawDwc1_4 = new NodeCreateRule();
            digester.addRule(ExtractionSimpleXPaths.DWC_1_4_RECORD_XPATH, rawDwc1_4);
            digester.addSetNext(ExtractionSimpleXPaths.DWC_1_4_RECORD_XPATH, "addRecordAsXml");

            //      NodeCreateRule rawDwcManis = new NodeCreateRule();
            //      digester.addRule(ExtractionSimpleXPaths.DWC_MANIS_RECORD_XPATH, rawDwcManis);
            //      digester.addSetNext(ExtractionSimpleXPaths.DWC_MANIS_RECORD_XPATH, "addRecordAsXml");

            NodeCreateRule rawDwc2009 = new NodeCreateRule();
            digester.addRule(ExtractionSimpleXPaths.DWC_2009_RECORD_XPATH, rawDwc2009);
            digester.addSetNext(ExtractionSimpleXPaths.DWC_2009_RECORD_XPATH, "addRecordAsXml");

            digester.parse(inputSource);
            return responseBody.getRecords();
        } catch (ParserConfigurationException e) {
            throw new ServiceUnavailableException("Error setting up Commons Digester", e);
        } catch (SAXException e) {
            throw new ParsingException("Parsing failed", e);
        } catch (IOException e) {
            throw new ParsingException("Parsing failed", e);
        } catch (TransformerException e) {
            throw new ServiceUnavailableException("Error setting up Commons Digester", e);
        }
    }

    /**
     * Parses a single response gzipFile and returns a List of the contained RawXmlOccurrences.
     */
    public List<RawXmlOccurrence> parseResponseFileToRawXml(File gzipFile) {
        if (LOG.isDebugEnabled())
            LOG.debug(">> parseResponseFileToRawXml [{}]", gzipFile.getAbsolutePath());
        ParsedSearchResponse responseBody = null;
        InputStreamReader inputStreamReader = null;
        BufferedReader bufferedReader = null;
        try {
            responseBody = new ParsedSearchResponse();

            FileInputStream fis = new FileInputStream(gzipFile);
            GZIPInputStream inputStream = new GZIPInputStream(fis);

            // charsets are a nightmare and users can't be trusted, so strategy
            // is try these encodings in order until one of them (hopefully) works
            // (note the last two could be repeats of the first two):
            // - utf-8
            // - latin1 (iso-8859-1)
            // - the declared encoding from the parsing itself
            // - a guess at detecting the charset from the raw gzipFile bytes

            List<String> charsets = new ArrayList<String>();
            charsets.add("UTF-8");
            charsets.add("ISO-8859-1");

            // read parsing declaration

            inputStreamReader = new InputStreamReader(inputStream);
            bufferedReader = new BufferedReader(inputStreamReader);
            boolean gotEncoding = false;
            String encoding = "";
            int lineCount = 0;
            while (bufferedReader.ready() && !gotEncoding && lineCount < 5) {
                String line = bufferedReader.readLine();
                lineCount++;
                if (line != null && line.contains("encoding=")) {
                    encoding = line.split("encoding=")[1];
                    // drop trailing ?>
                    encoding = encoding.substring(0, encoding.length() - 2);
                    // drop quotes
                    encoding = encoding.replaceAll("\"", "").replaceAll("'", "").trim();
                    LOG.debug("Found encoding [{}] in parsing declaration", encoding);
                    try {
                        Charset.forName(encoding);
                        charsets.add(encoding);
                    } catch (Exception e) {
                        LOG.debug(
                                "Could not find supported charset matching detected encoding of [{}] - trying other guesses instead",
                                encoding);
                    }
                    gotEncoding = true;
                }
            }

            // attempt detection from bytes
            Charset charset = CharsetDetection.detectEncoding(gzipFile);
            charsets.add(charset.name());
            String goodCharset = null;
            boolean encodingError = false;
            for (String charsetName : charsets) {
                LOG.debug("Trying charset [{}]", charsetName);
                try {
                    // reset streams
                    fis = new FileInputStream(gzipFile);
                    inputStream = new GZIPInputStream(fis);

                    BufferedReader inputReader = new BufferedReader(
                            new XmlSanitizingReader(new InputStreamReader(inputStream, charsetName)));
                    InputSource inputSource = new InputSource(inputReader);

                    Digester digester = new Digester();
                    digester.setNamespaceAware(true);
                    digester.setValidating(false);
                    digester.push(responseBody);

                    NodeCreateRule rawAbcd = new NodeCreateRule();
                    digester.addRule(ExtractionSimpleXPaths.ABCD_RECORD_XPATH, rawAbcd);
                    digester.addSetNext(ExtractionSimpleXPaths.ABCD_RECORD_XPATH, "addRecordAsXml");

                    NodeCreateRule rawAbcd1Header = new NodeCreateRule();
                    digester.addRule(ExtractionSimpleXPaths.ABCD_HEADER_XPATH, rawAbcd1Header);
                    digester.addSetNext(ExtractionSimpleXPaths.ABCD_HEADER_XPATH, "setAbcd1Header");

                    NodeCreateRule rawDwc1_0 = new NodeCreateRule();
                    digester.addRule(ExtractionSimpleXPaths.DWC_1_0_RECORD_XPATH, rawDwc1_0);
                    digester.addSetNext(ExtractionSimpleXPaths.DWC_1_0_RECORD_XPATH, "addRecordAsXml");

                    NodeCreateRule rawDwc1_4 = new NodeCreateRule();
                    digester.addRule(ExtractionSimpleXPaths.DWC_1_4_RECORD_XPATH, rawDwc1_4);
                    digester.addSetNext(ExtractionSimpleXPaths.DWC_1_4_RECORD_XPATH, "addRecordAsXml");

                    // TODO: dwc_manis appears to work without a NodeCreateRule here - why?

                    NodeCreateRule rawDwc2009 = new NodeCreateRule();
                    digester.addRule(ExtractionSimpleXPaths.DWC_2009_RECORD_XPATH, rawDwc2009);
                    digester.addSetNext(ExtractionSimpleXPaths.DWC_2009_RECORD_XPATH, "addRecordAsXml");

                    digester.parse(inputSource);

                    LOG.debug("Success with charset [{}] - skipping any others", charsetName);
                    goodCharset = charsetName;
                    break;
                } catch (SAXException e) {
                    String msg = "SAX exception when parsing parsing from response gzipFile ["
                            + gzipFile.getAbsolutePath() + "] using encoding [" + charsetName
                            + "] - trying another charset";
                    LOG.debug(msg, e);
                } catch (IOException e) {
                    if (e instanceof MalformedByteSequenceException) {
                        LOG.debug("Malformed utf-8 byte when parsing with encoding [{}] - trying another charset",
                                charsetName);
                        encodingError = true;
                    }
                }
            }

            if (goodCharset == null) {
                if (encodingError) {
                    LOG.warn(
                            "Could not parse gzipFile - none of the encoding attempts worked (failed with malformed utf8) - skipping gzipFile [{}]",
                            gzipFile.getAbsolutePath());
                } else {
                    LOG.warn("Could not parse gzipFile (malformed parsing) - skipping gzipFile [{}]",
                            gzipFile.getAbsolutePath());
                }
            }

        } catch (FileNotFoundException e) {
            LOG.warn("Could not find response gzipFile [{}] - skipping gzipFile", gzipFile.getAbsolutePath(), e);
        } catch (IOException e) {
            LOG.warn("Could not read response gzipFile [{}] - skipping gzipFile", gzipFile.getAbsolutePath(), e);
        } catch (TransformerException e) {
            LOG.warn("Could not create parsing transformer for [{}] - skipping gzipFile",
                    gzipFile.getAbsolutePath(), e);
        } catch (ParserConfigurationException e) {
            LOG.warn("Failed to pull raw parsing from response gzipFile [{}] - skipping gzipFile",
                    gzipFile.getAbsolutePath(), e);
        } finally {
            try {
                if (bufferedReader != null)
                    bufferedReader.close();
                if (inputStreamReader != null)
                    inputStreamReader.close();
            } catch (IOException e) {
                LOG.debug("Failed to close input files", e);
            }
        }

        if (LOG.isDebugEnabled())
            LOG.debug("<< parseResponseFileToRawXml [{}]", gzipFile.getAbsolutePath());
        return (responseBody == null) ? null : responseBody.getRecords();
    }

    public List<RawOccurrenceRecord> parseRawXmlToRor(List<RawXmlOccurrence> raws) {
        List<RawOccurrenceRecord> rors = new ArrayList<RawOccurrenceRecord>();
        for (RawXmlOccurrence raw : raws) {
            List<RawOccurrenceRecord> innerRors = XmlFragmentParser.parseRecord(raw);
            rors.addAll(innerRors);
        }
        return rors;
    }

}