// Java tutorial
/* Copyright 2005, 2005 Burcu Yildiz Contact: burcu.yildiz@gmail.com This file is part of pdf2table. pdf2table is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. pdf2table is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with pdf2table. If not, see <http://www.gnu.org/licenses/>. */ package pdf2xml; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; import org.jdom2.Element; class TopComparator implements Comparator<Text_Element> { public int compare(Text_Element t1, Text_Element t2) { return (t1.top - t2.top); } } class LeftComparator implements Comparator<Text_Element> { public int compare(Text_Element t1, Text_Element t2) { return (t1.left - t2.left); } } public class Text_Element { public enum Type { TEXT, NUMBER }; public enum Style { NORMAL, BOLD, ITALIC, BOLD_ITALIC }; String value; int top; int left; int width; int height; int right; int font; int font_size; Style style = Style.NORMAL; Type typ; int count_lines; List<Text_Element> elements; int last_top; int first_top; int colspan = 1; boolean artificial; public Text_Element(String v, int t, int l, int w, int h, int f, int size, Style f2, Type t2) { this.value = v; this.top = t; this.left = l; this.width = w; this.right = l + w; this.height = h; this.font = f; this.font_size = size; this.style = f2; this.typ = t2; this.last_top = t; // no line merged to this text element this.first_top = t; this.colspan = 1; this.count_lines = 1; this.elements = new ArrayList<Text_Element>(); this.right = this.left + this.width; this.artificial = false; } 
public Text_Element() { this.value = "null"; this.colspan = 1; this.count_lines = 1; this.artificial = true; } public Text_Element(String s) { this.value = s; this.colspan = 1; this.count_lines = 1; this.artificial = false; } public Object clone() { Text_Element t = new Text_Element(this.value, this.top, this.left, this.width, this.height, this.font, this.font_size, this.style, this.typ); return t; } /** * Maximize bounds with those of given TextElement * * @param te * TextElement */ public void add(Text_Element te) { last_top = Math.max(last_top, te.last_top); first_top = Math.min(first_top, te.first_top); int t_right = te.left + te.width; width = t_right - left; } public static Text_Element getTextElement(Element text, List<Font> fonts) { String value = text.getValue().trim(); int top = Integer.parseInt(text.getAttribute("top").getValue()); int left = Integer.parseInt(text.getAttribute("left").getValue()); int width = Integer.parseInt(text.getAttribute("width").getValue()); int height = Integer.parseInt(text.getAttribute("height").getValue()); int font = Integer.parseInt(text.getAttribute("font").getValue()); Type typ = Type.NUMBER; try { Integer.parseInt(value); Float.parseFloat(value); } catch (NumberFormatException nfe) { typ = Type.TEXT; } List<Element> bold_elements = text.getChildren("b"); List<Element> italic_elements = text.getChildren("i"); Style style; if (bold_elements.size() > 0) { if (italic_elements.size() > 0) { style = Style.BOLD_ITALIC; } else { style = Style.BOLD; } } else if (italic_elements.size() > 0) { style = Style.ITALIC; } else { style = Style.NORMAL; } // This is a hack, but we need access to the font specs to know the size int font_size = fonts.get(font).size; return new Text_Element(value, top, left, width, height, font, font_size, style, typ); } /** * Test whether another TextElement is contained by, contains, or overlaps * this one. * * @param te * other Text_Element * @return true if they intersect in any way. 
*/ public boolean intersects(Text_Element te) { return Text_Element.intersect(this, te); } /** * Test whether two Text_Element are contained by or overlap each other this * one. * * @param te1 * firstText_Element * @param te2 * other Text_Element * @return true if they intersect in any way. */ public static boolean intersect(Text_Element te1, Text_Element te2) { int l1 = te1.left; int r1 = te1.left + te1.width; int l2 = te2.left; int r2 = te2.left + te2.width; return ((l1 >= l2 && r1 <= r2) || (l1 >= l2 && l1 <= r2 && r1 > r2) || (l1 < l2 && r1 >= l2 && r1 <= r2) || (l2 >= l1 && r2 <= r1)); } public static int belong_together(Text_Element t, Text_Element n) { int n_letter_width = 0; int t_letter_width = 0; if (n.value.length() != 0) { n_letter_width = n.width / n.value.length(); if (t.value.length() != 0) { t_letter_width = t.width / t.value.length(); } int distance = n.left - (t.left + t.width); int t_right = t.left + t.width; int n_right = n.left + n.width; if (t.left > n.left && t_right < n_right) { return 1; } else if (n.left > t.left && n_right < t_right) { return 1; } else if (n_right > t.left && n_right < t_right) { return 1; } else if (n.left > t.left && n.left < t_right) { return 1; } else if (distance <= n_letter_width && distance <= t_letter_width) { return 0; } } else if (n.value.length() == 0) { return 0; } return -1; } /** * Sort sub elements and concatenate into a new value */ void coalesceSubElements() { if (elements.size() > 0) { Collections.sort(elements, new TopComparator()); StringBuilder sb = new StringBuilder(); for (Text_Element element : elements) { sb.append(element.value).append(" "); } value = sb.toString(); elements.clear(); } } /** * Process all the text elements for a single line of text and reduce/merge * into a simpler list of elements. 
* * @param texts */ static void processLineTexts(List<Text_Element> texts) { leftSort(texts); int p = 0; while (p < texts.size() - 1) { Text_Element t = texts.get(p); Text_Element n = texts.get(p + 1); int result = belong_together(t, n); if (result != -1) { texts.remove(p + 1); if (result == 1) { if (t.elements.size() == 0) { t.elements.add(t); t.add(n); } if (n.value.length() > 0) { t.elements.add(n); t.add(n); } } else if (result == 0) { t.value = t.value + " " + n.value; t.add(n); } p--; } p++; } for (Text_Element t : texts) { t.coalesceSubElements(); } } static void leftSort(List<Text_Element> elements) { Collections.sort(elements, new LeftComparator()); } }