com.tekstosense.segmenter.StructurePdf.PdfSections.java Source code

Introduction

Here is the source code for com.tekstosense.segmenter.StructurePdf.PdfSections.java
Source

/*******************************************************************************
 * Copyright (c) 2016, TekstoSense and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package com.tekstosense.segmenter.StructurePdf;

import static com.google.common.base.Preconditions.checkNotNull;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.stream.Collectors;

import org.apache.pdfbox.pdmodel.PDDocument;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Charsets;
import com.google.common.io.Files;
import com.tekstosense.segmenter.TextExtractor;
import com.tekstosense.segmenter.Rule.SectionRules;
import com.tekstosense.segmenter.commandline.Params;
import com.tekstosense.segmenter.model.DoublyLinkedList;
import com.tekstosense.segmenter.model.LineNode;
import com.tekstosense.segmenter.model.Section;
import com.tekstosense.segmenter.model.Structure;
import com.tekstosense.segmenter.util.StructureUtil;

/**
 * Splits PDF files into sections and renders the result as JSON.
 *
 * <p>Each file handed to {@link #processFile(String)} or
 * {@link #processFiles(List)} is loaded with PDFBox, run through a
 * {@link TextExtractor}, and segmented by the default {@link SectionRules}.
 * The per-file sections are kept in processing order and can be read back via
 * {@link #getStructuredPdf()} or serialized with {@link #generateOutput()}.
 *
 * <p>Results accumulate across calls: processing further files adds to (or,
 * for an equal {@link File} key, replaces entries in) the same result map.
 *
 * @author TekstoSense
 */
public class PdfSections implements Pdf2Structure {

    /** Value of {@code Params#getFormat()} that requests printing to standard output. */
    private static final String STDOUT_FORMAT = "STDOUT";

    /** Shared JSON serializer; created once instead of on every {@link #generateOutput()} call. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Files of the most recent {@code processFiles}/{@code processFile} call. */
    private List<File> files;

    /** Sections per processed file, in the order the files were processed. */
    private final Map<File, List<Section>> pdfSections;

    /** Output options (output directory, format) consulted by {@link #generateOutput()}. */
    private final Params params;

    /**
     * Creates a segmenter that uses the given output options.
     *
     * @param params options read by {@link #generateOutput()}
     */
    public PdfSections(Params params) {
        this.files = new ArrayList<>();
        // Insertion-ordered so that generateOutput() emits results in the
        // same order the files were processed rather than in hash order.
        this.pdfSections = new LinkedHashMap<>();
        this.params = params;
    }

    /** Creates a segmenter with a default-constructed {@link Params}. */
    public PdfSections() {
        this(new Params());
    }

    /**
     * Processes every given PDF path and records its sections.
     *
     * @param files paths of the PDF files to process; must not be {@code null}
     * @throws NullPointerException if {@code files} is {@code null}
     * @throws IOException          if a file cannot be loaded or its text extracted
     */
    @Override
    public void processFiles(List<String> files) throws IOException {
        this.files = checkNotNull(files, "Files can not be Null").stream()
                .map(File::new)
                .collect(Collectors.toList());
        processPdf();
    }

    /**
     * Processes a single PDF path; shorthand for a one-element
     * {@link #processFiles(List)} call.
     *
     * @param file path of the PDF file to process
     * @throws IOException if the file cannot be loaded or its text extracted
     */
    @Override
    public void processFile(String file) throws IOException {
        processFiles(Arrays.asList(file));
    }

    /**
     * Runs extraction and the default section rules over each pending file and
     * stores the resulting sections keyed by file.
     */
    private void processPdf() throws IOException {
        for (File file : files) {
            // The document stays open until the sections for this file have
            // been computed, then is always closed, even on failure, so file
            // handles and buffers are not leaked.
            try (PDDocument doc = PDDocument.load(file)) {
                TextExtractor te = extractText(doc);
                DoublyLinkedList<LineNode> sentenceList = te.toLinkedList();
                SectionRules sectionRules = new SectionRules(sentenceList);
                sectionRules.applyDefaultRule();
                pdfSections.put(file, sectionRules.getSections());
            }
        }
    }

    /**
     * Feeds an already opened document through a fresh {@link TextExtractor}.
     *
     * @param doc the open document; the caller remains responsible for closing it
     * @return the extractor after it has consumed the document
     * @throws IOException if the document cannot be prepared or read
     */
    private TextExtractor extractText(PDDocument doc) throws IOException {
        if (doc.isEncrypted()) {
            // No explicit password-based decryption is attempted here; the
            // only action is to leave the document's security settings in place.
            // NOTE(review): presumably this relies on PDFBox having opened the
            // document with the empty password during load - confirm.
            try {
                doc.setAllSecurityToBeRemoved(false);
            } catch (Exception e) {
                throw new IOException("Can't decrypt document: ", e);
            }
        }
        TextExtractor te = new TextExtractor();
        // The written text itself is thrown away (the byte buffer is never
        // read); the extractor is what the caller uses afterwards. The writer
        // is still closed deterministically.
        try (OutputStreamWriter sink = new OutputStreamWriter(new ByteArrayOutputStream(), Charsets.UTF_8)) {
            te.writeText(doc, sink);
        }
        return te;
    }

    /**
     * Serializes the sections of every processed file to JSON.
     *
     * <p>As a side effect, when an output directory is configured the
     * {@code toString()} form of each file's structures is written to
     * {@code <outputDir>/<pdf file name>.txt} (UTF-8); otherwise, when the
     * configured format is {@code STDOUT} (case-insensitive), it is printed to
     * standard output. Files that share the same name map to the same output
     * file, so a later one overwrites an earlier one.
     *
     * @return one JSON document per processed file, in processing order
     * @throws IOException if writing an output file or JSON serialization fails
     */
    public List<String> generateOutput() throws IOException {
        List<String> finalJson = new ArrayList<>(pdfSections.size());
        for (Entry<File, List<Section>> entry : pdfSections.entrySet()) {
            List<Structure> structures = StructureUtil.toStructure(entry.getValue());

            if (params.getOutputDir() != null) {
                File toFile = new File(params.getOutputDir(), entry.getKey().getName() + ".txt");
                Files.write(structures.toString(), toFile, Charsets.UTF_8);
            } else if (STDOUT_FORMAT.equalsIgnoreCase(params.getFormat())) {
                // Constant-first comparison: an unset (null) format simply
                // means "do not print" instead of a NullPointerException.
                System.out.println(structures.toString());
            }
            finalJson.add(MAPPER.writeValueAsString(structures));
        }
        return finalJson;
    }

    /**
     * Returns the sections recorded so far, keyed by source file.
     *
     * @return the live internal map (not a copy); changes made to it are
     *         visible to this object
     */
    @Override
    public Map<File, List<Section>> getStructuredPdf() {
        return pdfSections;
    }
}