com.norconex.importer.parser.impl.xfdl.XFDLParser.java Source code

Introduction

Here is the source code for com.norconex.importer.parser.impl.xfdl.XFDLParser.java
Source

/* Copyright 2015 Norconex Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.norconex.importer.parser.impl.xfdl;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Writer;
import java.util.Arrays;
import java.util.List;
import java.util.zip.GZIPInputStream;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.CharEncoding;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.norconex.importer.doc.ImporterDocument;
import com.norconex.importer.doc.ImporterMetadata;
import com.norconex.importer.parser.DocumentParserException;
import com.norconex.importer.parser.IDocumentParser;

/**
 * Parser for PureEdge Extensible Forms Description Language (XFDL).
 * This parser extracts any text found in the XFDL XML, whether that XML
 * is Base64 encoded or just plain XML (two possible format for XFDL).
 * 
 * @author Pascal Essiembre 
 * @since 2.1.0
 */
public class XFDLParser implements IDocumentParser {

    private static final char[] MAGIC_BASE64 = "application/vnd.xfdl;content-encoding=\"base64-gzip\""
            .toCharArray();

    @Override
    public List<ImporterDocument> parseDocument(ImporterDocument doc, Writer output)
            throws DocumentParserException {
        try {
            //TODO have a generic utility method for this?
            BufferedInputStream is = new BufferedInputStream(doc.getContent());
            CharsetDetector detector = new CharsetDetector();
            detector.enableInputFilter(true);
            detector.setText(is);
            CharsetMatch match = detector.detect();
            String charset = CharEncoding.UTF_8;
            if (match != null && CharEncoding.isSupported(match.getName())) {
                charset = match.getName();
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, charset));
            parse(reader, output, doc.getMetadata());
        } catch (IOException | ParserConfigurationException | SAXException e) {
            throw new DocumentParserException("Could not parse " + doc.getReference(), e);
        }
        return null;
    }

    private void parse(BufferedReader reader, Writer out, ImporterMetadata metadata)
            throws IOException, ParserConfigurationException, SAXException {
        reader.mark(MAGIC_BASE64.length);
        char[] signature = new char[MAGIC_BASE64.length];
        int num = reader.read(signature);
        reader.reset();
        if (num == -1) {
            return;
        }

        //--- Create XML DOM ---
        DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document dom = null;
        if (Arrays.equals(signature, MAGIC_BASE64)) {
            // skip first line
            reader.readLine();

            // un-encode first
            byte[] compressedContent = Base64.decodeBase64(IOUtils.toString(reader));
            // deal with compression
            InputStream is = new GZIPInputStream(new ByteArrayInputStream(compressedContent));

            dom = docBuilder.parse(is);
            IOUtils.closeQuietly(is);
        } else {
            dom = docBuilder.parse(new InputSource(reader));
        }
        parseXML(dom, out, metadata);
    }

    //TODO use a SAX parser instead for increased efficiency.
    private void parseXML(Document doc, Writer out, ImporterMetadata metadata) throws IOException {

        // Grab the title
        NodeList xmlTitles = doc.getElementsByTagName("title");
        if (xmlTitles != null && xmlTitles.getLength() > 0) {
            Node titleItem = xmlTitles.item(0);
            if (titleItem instanceof Element) {
                metadata.addString("title", ((Element) titleItem).getTextContent());
            }
        }

        boolean isEmpty = true;
        NodeList xmlFields = doc.getElementsByTagName("field");
        for (int i = 0; i < xmlFields.getLength(); i++) {
            if (xmlFields.item(i) instanceof Element) {
                NodeList children = xmlFields.item(i).getChildNodes();
                for (int j = 0; j < children.getLength(); j++) {
                    Node childItem = children.item(j);
                    if (childItem instanceof Element) {
                        Element tag = ((Element) childItem);
                        String tagName = tag.getTagName();
                        if ("value".equalsIgnoreCase(tagName)) {
                            isEmpty = writeValue(out, tag.getTextContent(), isEmpty);
                        }
                    }
                }
            }
        }
    }

    private boolean writeValue(Writer out, String value, boolean isOuputEmpty) throws IOException {
        boolean stillEmpty = true;
        if (StringUtils.isNotBlank(value)) {
            if (isOuputEmpty) {
                // Add space at the beginning to avoid false
                // document type recognition like MILESTONE
                // for docs that start with MILES
                out.write(" ");
                stillEmpty = false;
            }
            out.append(value).append("\n");
        }
        return stillEmpty;
    }
}