com.joowon.returnA.classifier.Classifier.java Source code

Introduction

Here is the source code for com.joowon.returnA.classifier.Classifier.java
Source

package com.joowon.returnA.classifier;

import com.joowon.returnA.classifier.cv.PdfPageDivider;
import com.joowon.returnA.classifier.export.PdfImageExport;
import com.joowon.returnA.classifier.extractor.PdfTextExtractor;
import com.joowon.returnA.classifier.parser.HeadlineParser;
import javafx.beans.DefaultProperty;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;

import java.io.IOException;

/**
 * Copyright (c) 4/10/16 Joowon Ryoo
 * <p>
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 * <p>
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * <p>
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
/**
 * Extracts text from a PDF document by combining page images exported to disk
 * with region-based text extraction on the PDF pages.
 * <p>
 * Constructing an instance exports the document's images under the given
 * destination directory; the other methods then read those images back
 * (as {@code image_<pageNumber>.png}, 1-based) to locate the regions of each
 * page to extract text from.
 * <p>
 * Implements {@link AutoCloseable} so that the wrapped document can be released
 * with try-with-resources.
 */
public class Classifier implements AutoCloseable {
    protected PDDocument document;
    protected String destinationParentPath;

    protected static final String DATA_FROM_EBS_PATH = "/Users/Joowon/Documents/Github/ReturnA/data/DaraFromEBS";

    /**
     * Stores the document and destination directory, then exports the
     * document's images into that directory using the {@code "image"} prefix.
     *
     * @param document the PDF document to read from
     * @param path     directory the page images are exported to and later read from
     */
    public Classifier(PDDocument document, String path) {
        this.document = document;
        this.destinationParentPath = path;
        // Export images from PDF so the page dividers below can analyse them.
        PdfImageExport.export(document, destinationParentPath, "image");
    }

    /**
     * Reads the headline area of the first page and parses the test name
     * (followed by the test type, when one is found) out of it.
     *
     * @return the parsed test name, with {@code " " + testType} appended when
     *         the parsed test type is non-empty
     * @throws IOException if reading the page image or extracting text fails
     */
    @Deprecated
    public String getTestName() throws IOException {
        double[] bodyPosition = new PdfPageDivider(pageImagePath(1)).divide().findHeadLine();
        PDPage page = document.getPage(0);
        int width = (int) page.getMediaBox().getWidth();
        int height = (int) page.getMediaBox().getHeight();
        // bodyPosition[1] is scaled by the page height here, so it is presumably
        // a 0..1 fraction marking where the headline ends -- TODO confirm.
        int endY = (int) (height * bodyPosition[1]);

        String text = new PdfTextExtractor(page).addRegion(0, 0, width, endY).extract().toString();
        String testName = HeadlineParser.parseTestName(text);
        String testType = HeadlineParser.parseTestType(text);
        if (!testType.isEmpty()) {
            testName += " " + testType;
        }
        return testName;
    }

    /**
     * Concatenates the text of every body region found on every page, in page
     * order and, within a page, in the order the regions are reported.
     *
     * @return the concatenated body text; empty when the document has no pages
     * @throws IOException if reading a page image or extracting text fails
     */
    public String getBodyText() throws IOException {
        int numberOfPages = document.getNumberOfPages();

        // A builder avoids re-copying the accumulated text for every region.
        StringBuilder text = new StringBuilder();
        for (int pageNumber = 1; pageNumber <= numberOfPages; ++pageNumber) {
            double[][] bodyPosition = new PdfPageDivider(pageImagePath(pageNumber)).divide().findBody();
            // Exported images are numbered from 1, PDF pages from 0.
            PDPage page = document.getPage(pageNumber - 1);

            // The media box does not change while iterating a page's regions.
            float pageWidth = page.getMediaBox().getWidth();
            float pageHeight = page.getMediaBox().getHeight();

            for (double[] val : bodyPosition) {
                // Each region holds four values scaled by width, height, width,
                // height respectively -- presumably page-relative fractions.
                Object regionText = new PdfTextExtractor(page).addRegion((int) (val[0] * pageWidth),
                        (int) (val[1] * pageHeight),
                        (int) (val[2] * pageWidth),
                        (int) (val[3] * pageHeight)).extract();
                // append(Object) stringifies exactly as string concatenation does.
                text.append(regionText);
            }
        }
        return text.toString();
    }

    /**
     * Closes the underlying document. Failures are reported on standard error
     * rather than propagated, so closing is best-effort and never throws a
     * checked exception.
     */
    @Override
    public void close() {
        try {
            document.close();
        } catch (IOException e) {
            // Deliberately best-effort: callers are not expected to handle this.
            e.printStackTrace();
        }
    }

    /** Builds the path of the exported image for the given 1-based page number. */
    private String pageImagePath(int pageNumber) {
        return destinationParentPath + "/image_" + pageNumber + ".png";
    }
}