pdf2xml.TopElementComparator.java Source code

Java tutorial

Introduction

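Here is the source code for pdf2xml.TopElementComparator.java. The listing defines the package-private comparator TopElementComparator together with the public class FirstClassification.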

Source

/*
Copyright 2005, 2005 Burcu Yildiz
Contact: burcu.yildiz@gmail.com
    
This file is part of pdf2table.
    
pdf2table is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
    
pdf2table is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with pdf2table.  If not, see <http://www.gnu.org/licenses/>.
*/

package pdf2xml;

import java.awt.Button;
import java.awt.Dialog;
import java.awt.Frame;
import java.awt.GraphicsEnvironment;
import java.awt.Label;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;

import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;

import pdf2xml.Text_Element.Type;

/**
 * Comparator that orders JDOM elements by the integer value of their
 * "top" attribute, smallest value first.
 *
 * Both elements are expected to carry a "top" attribute whose value parses
 * as an int; a non-numeric value makes {@link Integer#parseInt(String)}
 * throw a NumberFormatException.
 */
class TopElementComparator implements Comparator<Element> {

    /**
     * Compares two elements by their "top" attribute.
     *
     * @param e1 first element
     * @param e2 second element
     * @return a negative number, zero or a positive number when the top of
     *         e1 is smaller than, equal to or larger than the top of e2
     */
    public int compare(Element e1, Element e2) {
        int top1 = Integer.parseInt(e1.getAttribute("top").getValue());
        int top2 = Integer.parseInt(e2.getAttribute("top").getValue());
        // Compare explicitly rather than returning top1 - top2: the
        // subtraction overflows for operands that are far apart and would
        // then report the wrong order.
        return (top1 < top2) ? -1 : ((top1 == top2) ? 0 : 1);
    }
}

public class FirstClassification {

    /** Fonts collected from the "fontspec" entries of every processed page. */
    List<Font> fonts;
    /** Text lines of all processed pages, in processing order. */
    List<Line> lines;
    /** Multi-line blocks detected so far; they refer to {@link #lines} by index. */
    List<Multiline_Block> mlbs;
    /** When true, run() shows the result in a SemiOutputFrame instead of writing it via XmlOutput. */
    boolean interactive_extraction;
    /** Path handed to the output step; also the directory of "debugged_output.xml". */
    String path;

    /** Not read or written by any method of this class. */
    int distance_sum = 0;

    /**
     * Creates a classifier with empty font, line and block collections.
     *
     * @param interactivity whether the result is presented interactively
     *                      (stored in {@link #interactive_extraction})
     * @param p             path stored in {@link #path}
     */
    public FirstClassification(boolean interactivity, String p) {
        fonts = new ArrayList<Font>();
        lines = new ArrayList<Line>();
        mlbs = new ArrayList<Multiline_Block>();
        interactive_extraction = interactivity;
        path = p;
    }

    /**
     * Parses the given XML document, classifies the text of each "page"
     * element, merges the detected multi-line blocks and hands the resulting
     * tables either to the interactive frame or to the XML writer.
     *
     * A JDOMException is reported on standard output and followed by the
     * error dialog; every other exception is only reported on standard
     * output.
     *
     * @param file_name location of the XML document passed to the SAX builder
     */
    public void run(final String file_name) {

        SAXBuilder builder = new SAXBuilder();
        try {
            Document doc = builder.build(file_name);

            // Number of entries of this.lines that belong to earlier pages.
            int processed_lines = 0;
            List<Element> pages = doc.getRootElement().getChildren("page");
            for (Element page : pages) {
                processed_lines = doPage(processed_lines, page);
            }

            multiline_block_merge(this.mlbs, this.lines);

            List<Table> tables = SecondClassification.decompose_tables(mlbs, lines);

            if (!interactive_extraction) {
                XmlOutput.create(tables, fonts, path);
            } else {
                SemiOutputFrame viewer = new SemiOutputFrame(tables, fonts, path);
                viewer.setVisible(true);
            }
        } catch (JDOMException e) {
            System.out.println(e.getMessage());
            showErrorFrame(file_name);
        } catch (IOException e) {
            System.out.println(e);
        } catch (Exception e) {
            System.out.println("Exception in class: FirstClassification. " + e);
        }
    }

    // TODO: This giant method needs to be split up more
    /**
     * Processes one "page" element: collects its fonts, appends its lines to
     * {@link #lines} and walks over the new lines to open and extend
     * multi-line blocks in {@link #mlbs}.
     *
     * A line with more than one text element opens a block or extends the
     * current one. A line with exactly one text element that follows a block
     * line is either folded into a matching element of the previous line or
     * ends the current block.
     *
     * @param lines_before number of entries already in {@link #lines} before
     *                     this page; processing starts at that index
     * @param page         the "page" element; must carry a numeric "number"
     *                     attribute
     * @return the size of {@link #lines} after this page, i.e. the value to
     *         pass as lines_before for the next page
     */
    private int doPage(int lines_before, Element page) {
        int page_number = Integer.parseInt(page.getAttribute("number").getValue());

        this.fonts.addAll(getFonts(page));

        List<Text_Column> text_columns = generateColumns(page, 1);
        for (Text_Column tc : text_columns) {
            this.lines.addAll(tc.lines);
        }

        boolean multi_modus = false;
        // NOTE(review): d is never assigned after this point, so
        // sum_of_distances stays 0 and avg_distance below is always set to 0.
        int d = 0;
        int sum_of_distances = 0;

        for (int o = lines_before; o < this.lines.size(); o++) {
            Line l = this.lines.get(o);

            Text_Element.processLineTexts(l.texts);

            if (l.texts.size() > 1) {
                // multi-line
                if (multi_modus) {
                    Multiline_Block current_mlb = this.mlbs.get(this.mlbs.size() - 1);
                    sum_of_distances += d;
                    current_mlb.add(l);
                } else {
                    Multiline_Block mlb = new Multiline_Block();
                    sum_of_distances = 0;
                    mlb.init(l, o, page_number);
                    this.mlbs.add(mlb);
                    multi_modus = true;
                }
            } else if (l.texts.size() == 1) {
                // single-line
                if (multi_modus) {

                    Line pl = this.lines.get(o - 1);
                    sum_of_distances += d;
                    Text_Element t = l.texts.get(0);
                    int top_distance = l.first_top - pl.bottom;

                    // Set once the single text has been folded into the previous line.
                    boolean control = false;

                    // Counts intersecting elements; no live code reads it
                    // (the heuristic that used it is disabled).
                    int belongs = 0;

                    for (int k = 0; k < pl.texts.size(); k++) {

                        Text_Element n = pl.texts.get(k);
                        int left_distance = Math.abs(n.left - t.left);
                        int right_distance = Math.abs((n.left + n.width) - (t.left + t.width));

                        // Same type, both plain text, vertically close and
                        // aligned on the left or the right edge.
                        if (top_distance < t.height / 2 && n.typ.equals(t.typ) && n.typ == Type.TEXT
                                && ((left_distance < 3) || (right_distance < 3))) {

                            String s = n.value + "\n" + t.value;
                            n.value = s;

                            n.count_lines++;
                            if (!control) {
                                // Drop the absorbed line exactly once. Removing
                                // again at the already decremented index would
                                // delete the previous line itself when a second
                                // element of that line matches as well.
                                this.lines.remove(o);
                                o--;
                            }
                            n.add(t);
                            pl.add(t);
                            control = true;
                        }
                        if (Text_Element.intersect(t, n)) {
                            belongs++;
                        }
                    } // end of for
                    if (!control) {
                        // The single line was not absorbed: close the current block.
                        Multiline_Block current_mlb = this.mlbs.get(this.mlbs.size() - 1);
                        int mlb_element_count = current_mlb.end - current_mlb.begin;
                        if (mlb_element_count > 0) {
                            current_mlb.avg_distance = sum_of_distances / mlb_element_count;
                        } else {
                            current_mlb.avg_distance = d;
                        }
                        multi_modus = false;
                    }
                }
                // a single-text line outside of a block needs no handling
            }
        }

        return this.lines.size();
    }

    /**
     * Distributes the "text" children of a page over equally wide columns
     * and groups the texts of each column into lines.
     *
     * The texts are visited in ascending order of their "top" attribute. A
     * text joins the last line of its column when that line contains it;
     * otherwise it starts a new line. Texts whose computed column index lies
     * outside the column list are skipped.
     *
     * @param page         the "page" element; must carry a numeric "width"
     *                     attribute
     * @param column_count number of columns the page width is divided into
     * @return the columns with their lines filled in
     */
    private List<Text_Column> generateColumns(Element page, int column_count) {
        List<Text_Column> columns = new ArrayList<Text_Column>();
        int page_width = Integer.parseInt(page.getAttribute("width").getValue());
        int column_width = page_width / column_count;

        for (int c = 0; c < column_count; c++) {
            columns.add(new Text_Column(column_width));
        }

        List<Element> sorted_texts = new LinkedList<Element>(page.getChildren("text"));
        Collections.sort(sorted_texts, new TopElementComparator());

        for (Element text_node : sorted_texts) {
            Text_Element text = Text_Element.getTextElement(text_node, fonts);

            int column_index = Math.abs(text.left / column_width);
            if (column_index >= columns.size()) {
                continue;
            }

            Text_Column column = columns.get(column_index);

            Line last_line = null;
            if (column.lines.size() > 0) {
                last_line = column.lines.get(column.lines.size() - 1);
            }

            if (last_line != null && last_line.contains(text)) {
                // exactly in the boundaries of the line
                last_line.texts.add(text);
                last_line.add(text);
            } else {
                // first text of the column, or outside of the last line
                Line fresh_line = new Line();
                fresh_line.texts.add(text);
                fresh_line.init(text);
                column.lines.add(fresh_line);
            }
        }
        return columns;
    }

    /**
     * Builds one Font per "fontspec" child of the given page.
     *
     * @param page the "page" element; its "number" attribute and the "id"
     *             and "size" attributes of each fontspec must be numeric
     * @return the fonts declared on the page, in document order
     */
    private List<Font> getFonts(Element page) {
        int page_number = Integer.parseInt(page.getAttribute("number").getValue());
        List<Font> page_fonts = new ArrayList<Font>();

        for (Element spec : page.getChildren("fontspec")) {
            page_fonts.add(new Font(
                    page_number,
                    Integer.parseInt(spec.getAttribute("id").getValue()),
                    Integer.parseInt(spec.getAttribute("size").getValue()),
                    spec.getAttribute("family").getValue(),
                    spec.getAttribute("color").getValue()));
        }
        return page_fonts;
    }

    /**
     * Merge contents of multi-line blocks.  This will modify the collections
     * in place.
     *
     * For every block, neighbouring lines are first folded into the block's
     * first or last line via line_merge, which removes the folded lines from
     * linez.  Afterwards a block is fused with its predecessor and/or
     * successor when they are on the same page, at most 3 line indices
     * apart and their max_elements differ by at most 1; fused blocks are
     * removed from blocks.
     * 
     * @param blocks the multi-line blocks; their begin/end fields are used
     *               as indices into linez
     * @param linez  the lines the blocks refer to
     */
    private static void multiline_block_merge(List<Multiline_Block> blocks, List<Line> linez) {
        int steps_backward = 0;
        int steps_forward = 0;
        // Snapshots of the two running totals taken before the current block
        // is processed.
        int before = 0;
        int after = 0;

        // Running totals of the lines line_merge has removed so far.
        int removed_elements_before = 0;
        int removed_elements_after = 0;

        for (int i = 0; i < blocks.size(); i++) {

            Multiline_Block mlb2 = blocks.get(i);

            // Shift the block's indices by the number of lines removed while
            // the earlier blocks were processed.
            mlb2.begin = mlb2.begin - removed_elements_before - removed_elements_after;
            mlb2.end = mlb2.end - removed_elements_before - removed_elements_after;

            before = removed_elements_before;
            after = removed_elements_after;

            // NOTE(review): when blocks holds a single element it is handled
            // by the i == 0 branch only, so no forward merge is attempted
            // for it - confirm this is intended.
            if (i == 0) {
                // first multiline block
                // look at up to 10 lines in front of the block
                if (mlb2.begin - 10 > 0) {
                    steps_backward = 10;
                } else {
                    steps_backward = mlb2.begin - 1;
                }
                steps_forward = 0;
                int[] counts = line_merge(mlb2, linez, steps_backward, steps_forward);
                removed_elements_before += counts[0];
                removed_elements_after += counts[1];
                // account for the lines just removed in front of this block
                mlb2.begin = mlb2.begin - (removed_elements_before - before);
                mlb2.end = mlb2.end - (removed_elements_before - before);
            } else if (i == blocks.size() - 1) {
                // last multiline block
                // look at up to 10 lines behind the block, limited by the
                // end of the line list
                if (mlb2.end + 10 < linez.size()) {
                    steps_forward = 10;
                } else {
                    steps_forward = linez.size() - mlb2.end - 1;
                }
                steps_backward = 0;
                int[] counts = line_merge(mlb2, linez, steps_backward, steps_forward);
                removed_elements_before += counts[0];
                removed_elements_after += counts[1];
            } else {
                // every other multiline block between the first and the last
                Multiline_Block mlb1 = blocks.get(i - 1);
                Multiline_Block mlb3 = blocks.get(i + 1);

                // Gap sizes towards the neighbouring blocks.
                // NOTE(review): mlb3.begin has not yet been shifted by the
                // removal totals (that happens at the top of its own
                // iteration), while mlb2.end already has - verify that
                // steps_forward is not overestimated.
                steps_forward = mlb3.begin - mlb2.end - 1;
                steps_backward = mlb2.begin - mlb1.end - 1;

                // Only merge towards neighbours that are on the same page;
                // when neither neighbour shares the page, no line merge is done.
                if (mlb2.page == mlb3.page && mlb2.page != mlb1.page) {
                    steps_backward = 0;
                    int[] counts = line_merge(mlb2, linez, steps_backward, steps_forward);
                    removed_elements_before += counts[0];
                    removed_elements_after += counts[1];
                } else if (mlb2.page == mlb1.page && mlb2.page != mlb3.page) {
                    steps_forward = 0;
                    int[] counts = line_merge(mlb2, linez, steps_backward, steps_forward);
                    removed_elements_before += counts[0];
                    removed_elements_after += counts[1];
                } else if (mlb2.page == mlb1.page && mlb2.page == mlb3.page) {
                    int[] counts = line_merge(mlb2, linez, steps_backward, steps_forward);
                    removed_elements_before += counts[0];
                    removed_elements_after += counts[1];
                } // if mlbs on the same page

                boolean merge_with_before = false;

                // Fuse the current block into its predecessor when they are
                // close together and similar in width.
                if (mlb2.begin - mlb1.end <= 3 && mlb2.page == mlb1.page
                        && (Math.abs(mlb2.max_elements - mlb1.max_elements) <= 1)) {
                    mlb1.end = mlb2.end - (removed_elements_before - before);
                    blocks.remove(i);
                    merge_with_before = true;
                    mlb1.add(mlb2);
                    // step back so that the loop increment lands on the
                    // block that slid into position i
                    i--;
                }
                // Fuse the successor as well, into whichever block survived
                // the step above.  After the i-- above, i + 1 is the
                // position the successor moved to.
                if (mlb3.begin - mlb2.end <= 3 && mlb3.page == mlb2.page
                        && (Math.abs(mlb2.max_elements - mlb3.max_elements) <= 1)) {
                    if (merge_with_before == false) {
                        mlb2.begin = mlb2.begin - (removed_elements_before - before);
                        mlb2.end = mlb3.end - (removed_elements_before - before) - (removed_elements_after - after);
                        mlb2.add(mlb3);
                        blocks.remove(i + 1);
                    } else {
                        mlb1.end = mlb3.end - (removed_elements_before - before) - (removed_elements_after - after);
                        mlb1.add(mlb3);
                        blocks.remove(i + 1);
                    }
                }
            }
        }
    }

    /**
     * Folds lines directly in front of and directly behind a multi-line
     * block into the block's first and last line and removes the folded
     * lines from the list.
     *
     * A neighbouring line is folded only if the number of matches found
     * equals the number of its text elements. A match is a pair of texts of
     * the same type, both {@code Type.TEXT}, vertically closer than half
     * the height of the block text and aligned within 3 units on the left
     * or the right edge. Each direction stops at the first line that cannot
     * be folded.
     *
     * @param mlb        block whose begin/end are indices into lines
     * @param lines      the line list; folded lines are removed from it
     * @param steps_back maximum number of lines to examine in front of the block
     * @param steps_for  maximum number of lines to examine behind the block
     * @return a two-element array: the number of lines removed in front of
     *         the block, then the number of lines removed behind it
     */
    private static int[] line_merge(Multiline_Block mlb, List<Line> lines, int steps_back, int steps_for) {
        Line first_line = lines.get(mlb.begin);
        Line last_line = lines.get(mlb.end);
        int count = 0;
        int removed_elements_before = 0;
        int removed_elements_after = 0;
        boolean merge_control = true;

        for (int i = 1; i <= steps_back && merge_control; i++) {
            // Every successful pass removes the line in front of the block,
            // which moves first_line one slot towards the start; therefore
            // mlb.begin - i always addresses the line directly in front of it.
            Line pl = lines.get(mlb.begin - i);
            // NOTE(review): storage is a shallow copy, so the text updates
            // below stay in place even when the candidate line is rejected.
            List<Text_Element> storage = new ArrayList<Text_Element>(first_line.texts);

            int top_distance = first_line.first_top - pl.bottom;

            for (int j = 0; j < first_line.texts.size(); j++) {
                Text_Element t = storage.get(j);

                for (int k = 0; k < pl.texts.size(); k++) {

                    Text_Element n = pl.texts.get(k);
                    int left_distance = Math.abs(n.left - t.left);
                    int right_distance = Math.abs((n.left + n.width) - (t.left + t.width));
                    // typ is an enum value: comparing it with the string
                    // "text" was always false and disabled backward merging.
                    if (top_distance < t.height / 2 && t.typ.equals(n.typ) && t.typ == Type.TEXT
                            && ((left_distance < 3) || (right_distance < 3))) {
                        String s = n.value + " " + t.value;
                        t.value = s;
                        t.count_lines++;
                        t.add(n);
                        count++;
                    }
                }
            }
            if (count == pl.texts.size()) {
                List<Text_Element> clone = new ArrayList<Text_Element>(storage);
                first_line.texts = clone;
                for (int p = 0; p < first_line.texts.size(); p++) {
                    Text_Element t = first_line.texts.get(p);
                    first_line.add(t);
                }

                lines.remove(mlb.begin - i);
                removed_elements_before++;
            } else {
                merge_control = false;
            }
            count = 0;

        }
        merge_control = true;

        for (int i = 1; i <= steps_for && merge_control; i++) {
            // last_line has moved towards the start by the number of lines
            // removed in front of the block. Each line folded behind it is
            // removed and its successor slides into the same slot, so the
            // candidate is always the slot directly after last_line.
            // (Indexing with mlb.end + i skipped the line that slid in and
            // could run past the end of the list.)
            int next_index = mlb.end - removed_elements_before + 1;
            if (next_index >= lines.size()) {
                break;
            }
            Line nl = lines.get(next_index);
            List<Text_Element> storage = new ArrayList<Text_Element>(last_line.texts);

            int top_distance = nl.first_top - last_line.bottom;

            for (int j = 0; j < last_line.texts.size(); j++) {
                Text_Element t = last_line.texts.get(j);
                for (int k = 0; k < nl.texts.size(); k++) {
                    Text_Element n = nl.texts.get(k);
                    int left_distance = Math.abs(n.left - t.left);
                    int right_distance = Math.abs((n.left + n.width) - (t.left + t.width));

                    if (top_distance < t.height / 2 && t.typ.equals(n.typ) && t.typ == Type.TEXT
                            && ((left_distance < 3) || (right_distance < 3))) {

                        String s = t.value + " " + n.value;
                        t.value = s;
                        t.count_lines++;
                        t.add(n);
                        count++;
                    }
                }
            }
            if (count == nl.texts.size()) {
                last_line.texts = new ArrayList<Text_Element>(storage);
                for (int p = 0; p < last_line.texts.size(); p++) {
                    Text_Element t = last_line.texts.get(p);
                    last_line.add(t);
                }
                lines.remove(next_index);
                removed_elements_after++;
            } else {
                merge_control = false;
            }
            count = 0;
        }
        return new int[] { removed_elements_before, removed_elements_after };
    }

    /**
     * Shows a modal "Failure" dialog asking whether the input should be
     * pre-processed and parsed again.
     *
     * "Yes" runs debug_pdftohtml_output on the given file and then closes
     * the dialog; "No" and the window's close control only close it. The
     * call returns once the dialog has been closed.
     *
     * @param file_name the file that could not be parsed
     */
    private void showErrorFrame(final String file_name) {
        GraphicsEnvironment ge = GraphicsEnvironment.getLocalGraphicsEnvironment();
        Frame f = new Frame(ge.getDefaultScreenDevice().getDefaultConfiguration());

        try {
            final Dialog d = new Dialog(f, "Failure", true);
            Label l = new Label("pdftohtml was unable to return right data.");
            Label l2 = new Label("Would you like to restart with pre-debugging?");
            d.setLayout(null);
            l.setBounds(60, 50, 300, 20);
            l2.setBounds(60, 70, 300, 20);
            d.add(l);
            d.add(l2);
            d.setSize(420, 150);

            // Without a listener the window's close control does nothing
            // and the modal dialog can only be left through the buttons.
            d.addWindowListener(new java.awt.event.WindowAdapter() {
                public void windowClosing(java.awt.event.WindowEvent evt) {
                    d.dispose();
                }
            });

            Button b = new Button("Yes");
            b.addActionListener(new java.awt.event.ActionListener() {
                public void actionPerformed(java.awt.event.ActionEvent evt) {
                    debug_pdftohtml_output(file_name);
                    d.dispose();
                }
            });
            b.setBounds(180, 100, 60, 20);

            Button b2 = new Button("No");
            b2.addActionListener(new java.awt.event.ActionListener() {
                public void actionPerformed(java.awt.event.ActionEvent evt) {
                    d.dispose();
                }
            });
            b2.setBounds(250, 100, 60, 20);

            d.add(b);
            d.add(b2);
            d.setLocationRelativeTo(null);
            // Blocks until the modal dialog has been closed.
            d.setVisible(true);
        } finally {
            // The owner frame is never shown itself; dispose it so its
            // native resources are released once the dialog is gone.
            f.dispose();
        }
    }

    /**
     * Writes a copy of the given file to "debugged_output.xml" inside
     * {@link #path}, with a handful of upper-case HTML tags rewritten to
     * lower case, and then runs the classification on that copy.
     *
     * Any exception is reported on standard output.
     *
     * @param filename the file to read
     */
    private void debug_pdftohtml_output(final String filename) {
        String debugged_name = this.path + File.separator + "debugged_output.xml";
        try {
            // NOTE(review): if filename already denotes debugged_output.xml
            // (a second retry after run() failed on the copy), opening the
            // output stream truncates the input before it is read - confirm
            // whether that case needs handling.
            BufferedReader br = new BufferedReader(new FileReader(filename));
            try {
                PrintStream dos = new PrintStream(new FileOutputStream(new File(debugged_name)));
                try {
                    String current_line = br.readLine();

                    while (current_line != null) {
                        // The patterns are plain text, so literal replacement
                        // is used instead of regular expressions.
                        current_line = current_line.replace("A href", "a href");
                        current_line = current_line.replace("<B>", "<b>");
                        current_line = current_line.replace("<I>", "<i>");
                        current_line = current_line.replace("</I>", "</i>");
                        current_line = current_line.replace("</B>", "</b>");

                        dos.println(current_line);
                        current_line = br.readLine();
                    }
                } finally {
                    dos.close();
                }
            } finally {
                br.close();
            }

            // Both streams are closed before the copy is parsed again.
            run(debugged_name);
        } catch (Exception e) {
            System.out.println(e);
        }
    }

}