org.sejda.conversion.PdfFileSourceListAdapter.java Source code

Introduction

Here is the source code for org.sejda.conversion.PdfFileSourceListAdapter.java
Source

/*
 * Created on Sep 3, 2011
 * Copyright 2010 by Eduard Weissmann (edi.weissmann@gmail.com).
 * 
 * This file is part of the Sejda source code
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.sejda.conversion;

import static org.apache.commons.io.FilenameUtils.getExtension;
import static org.sejda.common.XMLUtils.nullSafeGetStringAttribute;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathException;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.sejda.conversion.exception.ConversionException;
import org.sejda.model.exception.SejdaRuntimeException;
import org.sejda.model.input.PdfFileSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Adapter for a list of {@link PdfFileSource}s. Provides a filePath based constructor. Will parse xml, csv config file formats, and list a directory contents
 * 
 * @author Eduard Weissmann
 * 
 */
public class PdfFileSourceListAdapter {

    private static final Logger LOG = LoggerFactory.getLogger(PdfFileSourceListAdapter.class);

    private final PdfInputFilesSourceFactory parserFactory = new PdfInputFilesSourceFactory();
    private final List<PdfFileSource> fileSourceList = new ArrayList<>();
    private final File file;
    private Pattern pattern = Pattern.compile(".+");

    public PdfFileSourceListAdapter(String filePath) {
        file = new File(filePath);

        if (!file.exists()) {
            throw new ConversionException("File '" + file.getPath() + "' does not exist");
        }
    }

    public PdfFileSourceListAdapter filter(String filterRegExp) {
        if (StringUtils.isNotBlank(filterRegExp)) {
            LOG.debug("Applying regular expression: {}", filterRegExp);
            pattern = Pattern.compile(filterRegExp);
        }
        return this;
    }

    public List<PdfFileSource> getFileSourceList() {
        fileSourceList.addAll(parserFactory.createSource(file).getInputFiles(file));

        if (fileSourceList.isEmpty()) {
            throw new ConversionException("No input files specified in '" + file.getPath() + "'");
        }
        return fileSourceList;
    }

    /**
     * Factory for {@link PdfInputFilesSource}s. Depending on input {@link File} (folder, csv file, xml file), a different source will be created.
     * 
     * @author Eduard Weissmann
     * 
     */
    class PdfInputFilesSourceFactory {
        private static final String XML_EXTENSION = "xml";
        private static final String CSV_EXTENSION = "csv";

        PdfInputFilesSource createSource(File file) {
            String extension = getExtension(file.getName());

            if (file.isDirectory()) {
                return new FolderFileSourceListParser(PdfFileSourceListAdapter.this.pattern);
            } else if (CSV_EXTENSION.equalsIgnoreCase(extension)) {
                return new CsvFileSourceListParser();
            } else if (XML_EXTENSION.equalsIgnoreCase(extension)) {
                return new XmlFileSourceListParser();
            }

            throw new SejdaRuntimeException("Cannot read input file names from config file '" + file.getName()
                    + "'. Unsupported file format: " + extension);
        }
    }
}

/**
 * Source for {@link PdfFileSource} input files
 * 
 * @author Eduard Weissmann
 * 
 */
interface PdfInputFilesSource {

    List<PdfFileSource> getInputFiles(File file);
}

/**
 * Abstract base class of {@link PdfInputFilesSource}s
 * 
 * @author Eduard Weissmann
 * 
 */
abstract class AbstractPdfInputFilesSource implements PdfInputFilesSource {
    private static final Logger LOG = LoggerFactory.getLogger(AbstractPdfInputFilesSource.class);

    @Override
    public List<PdfFileSource> getInputFiles(File file) {
        List<String> filenames = parseFileNames(file);
        LOG.trace("Input files: '" + StringUtils.join(filenames, "', '") + "'");
        try {
            return PdfFileSourceAdapter.fromStrings(filenames);
        } catch (SejdaRuntimeException e) {
            throw new ConversionException("Invalid filename found: " + e.getMessage(), e);
        }
    }

    protected abstract List<String> parseFileNames(File file);
}

/**
 * Produces the list of input files by listing a directory for files with pdf extension
 * 
 * @author Eduard Weissmann
 * 
 */
class FolderFileSourceListParser extends AbstractPdfInputFilesSource {

    protected static final String PDF_EXTENSION = "pdf";

    private Pattern pattern;

    FolderFileSourceListParser(Pattern pattern) {
        this.pattern = pattern;
    }

    @Override
    protected List<String> parseFileNames(File file) {
        List<File> files = Arrays.asList(file.listFiles((dir, filename) -> {
            return StringUtils.equalsIgnoreCase(getExtension(filename), PDF_EXTENSION)
                    && pattern.matcher(filename).matches();
        }));

        List<String> filenames = new ArrayList<>();
        for (File current : files) {
            filenames.add(current.getAbsolutePath());
        }

        Collections.sort(filenames);

        return filenames;
    }

}

/**
 * Produces the list of input files by parsing a csv file
 * 
 * @author Eduard Weissmann
 * 
 */
class CsvFileSourceListParser extends AbstractPdfInputFilesSource {
    private static final Logger LOG = LoggerFactory.getLogger(CsvFileSourceListParser.class);

    @Override
    protected List<String> parseFileNames(File file) {
        try {
            return doParseFileNames(file);
        } catch (Exception e) {
            LOG.error("Can't extract filesnames", e);
            throw new ConversionException(
                    "Can't extract filenames from '" + file.getName() + "'. Reason:" + e.getMessage(), e);
        }
    }

    protected List<String> doParseFileNames(File file) throws IOException {
        List<String> resultingFileNames = new ArrayList<>();

        List<String> lines = IOUtils.readLines(new FileInputStream(file));
        for (String eachLine : lines) {
            String[] splitLine = StringUtils.split(eachLine.toString(), ",");
            resultingFileNames.addAll(Arrays.asList(splitLine));
        }

        return resultingFileNames;
    }
}

/**
 * Produces the list of input files by parsing a xml file
 * 
 * @author Eduard Weissmann
 * 
 */
class XmlFileSourceListParser extends AbstractPdfInputFilesSource {
    private static final Logger LOG = LoggerFactory.getLogger(XmlFileSourceListParser.class);

    private XPathFactory xpathFactory = XPathFactory.newInstance();

    @Override
    protected List<String> parseFileNames(File file) {
        try {
            return doParseFileNames(file);
        } catch (Exception e) {
            LOG.error("Can't extract filenames", e);
            throw new ConversionException(
                    "Can't extract filenames from '" + file.getName() + "'. Reason:" + e.getMessage(), e);
        }
    }

    protected List<String> doParseFileNames(File file)
            throws IOException, SAXException, ParserConfigurationException, XPathException {
        DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
        domFactory.setNamespaceAware(true);
        DocumentBuilder builder = domFactory.newDocumentBuilder();
        Document doc = builder.parse(file);

        List<String> result = new ArrayList<>();

        result.addAll(parseSingleFiles(doc));
        result.addAll(parseFileSets(doc, file));

        return result;
    }

    /**
     * Parse fileset definitions <filelist><fileset>[...]</fileset></filelist> ignoring the rest of the document
     * 
     * @param doc
     * @return a list of string matching the contents of the <filelist><fileset> tags in the document
     * @throws XPathExpressionException
     */
    private List<String> parseFileSets(Document doc, File configFile) throws XPathExpressionException {
        List<String> result = new ArrayList<>();

        NodeList nodeList = getNodeListMatchingXpath("//filelist/fileset/file", doc);
        for (int i = 0; i < nodeList.getLength(); i++) {
            Node node = nodeList.item(i);
            Node fileSet = node.getParentNode();

            String parentDirPath = nullSafeGetStringAttribute(fileSet, "dir");
            if (parentDirPath == null) {
                parentDirPath = configFile.getAbsoluteFile().getParent();
            }

            String filePath = extractFilePath(node);

            // warn if file in fileset is using absolute path mode
            if (FilenameUtils.getPrefixLength(filePath) > 0) {
                LOG.warn("File " + filePath + " in fileset "
                        + StringUtils.defaultIfBlank(nullSafeGetStringAttribute(fileSet, "dir"), "")
                        + " seems to be an absolute path. Will _not_ be resolved relative to the <fileset>, but as an absolute path. Normally you would want to use relative paths in a //filelist/fileset/file, and absolute paths in a //filelist/file.");
            }

            result.add(FilenameUtils.concat(parentDirPath, filePath));
        }

        return result;
    }

    private String extractFilePath(Node fileNode) {
        String password = nullSafeGetStringAttribute(fileNode, "password");
        String value = nullSafeGetStringAttribute(fileNode, "value");
        return value + (password == null ? "" : PdfFileSourceAdapter.PASSWORD_SEPARATOR_CHARACTER + password);
    }

    /**
     * Parse single file definitions <filelist><file>[...]</file></filelist> ignoring the rest of the document
     * 
     * @param doc
     * @return a list of string matching the contents of the <filelist><file> tags in the document
     * @throws XPathExpressionException
     */
    private List<String> parseSingleFiles(Document doc) throws XPathExpressionException {
        List<String> result = new ArrayList<>();

        NodeList nodeList = getNodeListMatchingXpath("//filelist/file", doc);
        for (int i = 0; i < nodeList.getLength(); i++) {
            result.add(extractFilePath(nodeList.item(i)));
        }

        return result;
    }

    private NodeList getNodeListMatchingXpath(String xpathString, Document doc) throws XPathExpressionException {
        return (NodeList) xpathFactory.newXPath().evaluate(xpathString, doc, XPathConstants.NODESET);
    }
}