Java tutorial
/* Copyright 2005, 2005 Burcu Yildiz Contact: burcu.yildiz@gmail.com This file is part of pdf2table. pdf2table is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. pdf2table is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with pdf2table. If not, see <http://www.gnu.org/licenses/>. */ package pdf2xml; import java.awt.Button; import java.awt.Dialog; import java.awt.Frame; import java.awt.GraphicsEnvironment; import java.awt.Label; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.LinkedList; import java.util.List; import org.jdom2.Document; import org.jdom2.Element; import org.jdom2.JDOMException; import org.jdom2.input.SAXBuilder; import pdf2xml.Text_Element.Type; /** * Comparator to sort elements by the Y values of the tops. */ class TopElementComparator implements Comparator<Element> { public int compare(Element e1, Element e2) { int top1 = Integer.parseInt(e1.getAttribute("top").getValue()); int top2 = Integer.parseInt(e2.getAttribute("top").getValue()); return (top1 - top2); } } public class FirstClassification { List<Font> fonts; List<Line> lines; List<Multiline_Block> mlbs; boolean interactive_extraction; String path; int distance_sum = 0; public FirstClassification(boolean interactivity, String p) {//, int c) { this.fonts = new ArrayList<Font>(); this.lines = new ArrayList<Line>(); this.mlbs = new ArrayList<Multiline_Block>(); this.interactive_extraction = interactivity; this.path = p; } public void run(final String file_name) { SAXBuilder builder = new SAXBuilder(); try { int lines_before = 0; Document doc = builder.build(file_name); for (Element page : doc.getRootElement().getChildren("page")) { lines_before = doPage(lines_before, page); } // end of while pages multiline_block_merge(this.mlbs, this.lines); List<Table> tables = SecondClassification.decompose_tables(mlbs, lines); if (interactive_extraction == true) { SemiOutputFrame so = new SemiOutputFrame(tables, fonts, path); so.setVisible(true); } else { XmlOutput.create(tables, fonts, path); } } catch (JDOMException e) { System.out.println(e.getMessage()); showErrorFrame(file_name); } catch (IOException e) { System.out.println(e); } catch (Exception e) { System.out.println("Exception in class: FirstClassification. " + e); } } // TODO: This giant method needs to be split up more private int doPage(int lines_before, Element page) { int page_number = Integer.parseInt(page.getAttribute("number").getValue()); this.fonts.addAll(getFonts(page)); List<Text_Column> text_columns = generateColumns(page, 1); for (Text_Column tc : text_columns) { this.lines.addAll(tc.lines); } boolean multi_modus = false; int d = 0; int sum_of_distances = 0; for (int o = lines_before; o < this.lines.size(); o++) { Line l = this.lines.get(o); Text_Element.processLineTexts(l.texts); if (l.texts.size() > 1) { // multi-line if (multi_modus == true) { Multiline_Block current_mlb = this.mlbs.get(this.mlbs.size() - 1); sum_of_distances += d; current_mlb.add(l); } else { Multiline_Block mlb = new Multiline_Block(); sum_of_distances = 0; mlb.init(l, o, page_number); this.mlbs.add(mlb); multi_modus = true; } } else if (l.texts.size() == 1) { // single-line if (multi_modus == true) { Line pl = this.lines.get(o - 1); sum_of_distances += d; Text_Element t = l.texts.get(0); int top_distance = l.first_top - pl.bottom; boolean control = false; int belongs = 0; for (int k = 0; k < pl.texts.size(); k++) { Text_Element n = pl.texts.get(k); int left_distance = Math.abs(n.left - t.left); int right_distance = Math.abs((n.left + n.width) - (t.left + t.width)); if (top_distance < t.height / 2 && n.typ.equals(t.typ) && n.typ == Type.TEXT && ((left_distance < 3) || (right_distance < 3))) { String s = n.value + "\n" + t.value; n.value = s; n.count_lines++; this.lines.remove(o); o--; n.add(t); pl.add(t); control = true; } if (Text_Element.intersect(t, n)) { belongs++; } } // end of for if (control == false) { /* if (belongs == 1) { Multiline_Block current_mlb = (Multiline_Block) this.mlbs.lastElement(); actualize_mlb_values(current_mlb, l); } else {*/ //if (belongs == 0 || count_single_lines > 5) { Multiline_Block current_mlb = this.mlbs.get(this.mlbs.size() - 1); int mlb_element_count = current_mlb.end - current_mlb.begin; if (mlb_element_count > 0) { current_mlb.avg_distance = sum_of_distances / mlb_element_count; } else { current_mlb.avg_distance = d; } multi_modus = false; // } } } else { // do nothing } } } multi_modus = false; lines_before = this.lines.size(); return lines_before; } private List<Text_Column> generateColumns(Element page, int column_count) { List<Text_Column> text_columns = new ArrayList<Text_Column>(); int page_width = Integer.parseInt(page.getAttribute("width").getValue()); int text_columns_width = page_width / column_count; for (int i = 0; i < column_count; i++) { Text_Column tc = new Text_Column(text_columns_width); text_columns.add(tc); } int distance = 0; Text_Column current_tc; List<Element> text_elements = new LinkedList<Element>(page.getChildren("text")); Collections.sort(text_elements, new TopElementComparator()); for (Element e : text_elements) { Text_Element current_t = Text_Element.getTextElement(e, fonts); int right_column = Math.abs(current_t.left / text_columns_width); if (right_column < text_columns.size()) { current_tc = text_columns.get(right_column); if (current_tc.lines.size() > 0) { Line l = current_tc.lines.get(current_tc.lines.size() - 1); if (l.contains(current_t)) { // exactly in the boundaries of the line l.texts.add(current_t); l.add(current_t); } else { Line new_line = new Line(); new_line.texts.add(current_t); new_line.init(current_t); current_tc.lines.add(new_line); distance += new_line.first_top - l.last_top; } } else { Line new_line = new Line(); new_line.texts.add(current_t); new_line.init(current_t); current_tc.lines.add(new_line); } // if current_tc.lines } // if right_column ... } // for e_array.length return text_columns; } /** * Return a list of fonts on a given page. */ private List<Font> getFonts(Element page) { List<Font> fonts = new ArrayList<Font>(); int page_number = Integer.parseInt(page.getAttribute("number").getValue()); for (Element font : page.getChildren("fontspec")) { int id = Integer.parseInt(font.getAttribute("id").getValue()); int size = Integer.parseInt(font.getAttribute("size").getValue()); String family = font.getAttribute("family").getValue(); String color = font.getAttribute("color").getValue(); Font f = new Font(page_number, id, size, family, color); fonts.add(f); } return fonts; } /** * Merge contents of multi-line blocks. This will modify the collections * in place. * * @param blocks * @param linez */ private static void multiline_block_merge(List<Multiline_Block> blocks, List<Line> linez) { int steps_backward = 0; int steps_forward = 0; int before = 0; int after = 0; int removed_elements_before = 0; int removed_elements_after = 0; for (int i = 0; i < blocks.size(); i++) { Multiline_Block mlb2 = blocks.get(i); mlb2.begin = mlb2.begin - removed_elements_before - removed_elements_after; mlb2.end = mlb2.end - removed_elements_before - removed_elements_after; before = removed_elements_before; after = removed_elements_after; if (i == 0) { // first multiline block if (mlb2.begin - 10 > 0) { steps_backward = 10; } else { steps_backward = mlb2.begin - 1; } steps_forward = 0; int[] counts = line_merge(mlb2, linez, steps_backward, steps_forward); removed_elements_before += counts[0]; removed_elements_after += counts[1]; mlb2.begin = mlb2.begin - (removed_elements_before - before); mlb2.end = mlb2.end - (removed_elements_before - before); } else if (i == blocks.size() - 1) { // last multiline block if (mlb2.end + 10 < linez.size()) { steps_forward = 10; } else { steps_forward = linez.size() - mlb2.end - 1; } steps_backward = 0; int[] counts = line_merge(mlb2, linez, steps_backward, steps_forward); removed_elements_before += counts[0]; removed_elements_after += counts[1]; } else { // every other multiline block between the first and the last Multiline_Block mlb1 = blocks.get(i - 1); Multiline_Block mlb3 = blocks.get(i + 1); steps_forward = mlb3.begin - mlb2.end - 1; steps_backward = mlb2.begin - mlb1.end - 1; if (mlb2.page == mlb3.page && mlb2.page != mlb1.page) { steps_backward = 0; int[] counts = line_merge(mlb2, linez, steps_backward, steps_forward); removed_elements_before += counts[0]; removed_elements_after += counts[1]; } else if (mlb2.page == mlb1.page && mlb2.page != mlb3.page) { steps_forward = 0; int[] counts = line_merge(mlb2, linez, steps_backward, steps_forward); removed_elements_before += counts[0]; removed_elements_after += counts[1]; } else if (mlb2.page == mlb1.page && mlb2.page == mlb3.page) { int[] counts = line_merge(mlb2, linez, steps_backward, steps_forward); removed_elements_before += counts[0]; removed_elements_after += counts[1]; } // if mlbs on the same page boolean merge_with_before = false; if (mlb2.begin - mlb1.end <= 3 && mlb2.page == mlb1.page && (Math.abs(mlb2.max_elements - mlb1.max_elements) <= 1)) { mlb1.end = mlb2.end - (removed_elements_before - before); blocks.remove(i); merge_with_before = true; mlb1.add(mlb2); i--; } if (mlb3.begin - mlb2.end <= 3 && mlb3.page == mlb2.page && (Math.abs(mlb2.max_elements - mlb3.max_elements) <= 1)) { if (merge_with_before == false) { mlb2.begin = mlb2.begin - (removed_elements_before - before); mlb2.end = mlb3.end - (removed_elements_before - before) - (removed_elements_after - after); mlb2.add(mlb3); blocks.remove(i + 1); } else { mlb1.end = mlb3.end - (removed_elements_before - before) - (removed_elements_after - after); mlb1.add(mlb3); blocks.remove(i + 1); } } } } } private static int[] line_merge(Multiline_Block mlb, List<Line> lines, int steps_back, int steps_for) { Line first_line = lines.get(mlb.begin); Line last_line = lines.get(mlb.end); int count = 0; int removed_elements_before = 0; int removed_elements_after = 0; boolean merge_control = true; for (int i = 1; i <= steps_back && merge_control == true; i++) { Line pl = lines.get(mlb.begin - i); List<Text_Element> storage = new ArrayList<Text_Element>(first_line.texts); int top_distance = first_line.first_top - pl.bottom; for (int j = 0; j < first_line.texts.size(); j++) { Text_Element t = storage.get(j); for (int k = 0; k < pl.texts.size(); k++) { Text_Element n = pl.texts.get(k); int left_distance = Math.abs(n.left - t.left); int right_distance = Math.abs((n.left + n.width) - (t.left + t.width)); if (top_distance < t.height / 2 && t.typ.equals(n.typ) && t.typ.equals("text") && ((left_distance < 3) || (right_distance < 3))) { String s = n.value + " " + t.value; t.value = s; t.count_lines++; t.add(n); count++; } } } if (count == pl.texts.size()) { List<Text_Element> clone = new ArrayList<Text_Element>(storage); first_line.texts = clone; for (int p = 0; p < first_line.texts.size(); p++) { Text_Element t = first_line.texts.get(p); first_line.add(t); } lines.remove(mlb.begin - i); removed_elements_before++; } else { merge_control = false; } count = 0; } merge_control = true; for (int i = 1; i <= steps_for && merge_control == true; i++) { Line nl = lines.get(mlb.end + i); List<Text_Element> storage = new ArrayList<Text_Element>(last_line.texts); int top_distance = nl.first_top - last_line.bottom; for (int j = 0; j < last_line.texts.size(); j++) { Text_Element t = last_line.texts.get(j); for (int k = 0; k < nl.texts.size(); k++) { Text_Element n = nl.texts.get(k); int left_distance = Math.abs(n.left - t.left); int right_distance = Math.abs((n.left + n.width) - (t.left + t.width)); if (top_distance < t.height / 2 && t.typ.equals(n.typ) && t.typ == Type.TEXT && ((left_distance < 3) || (right_distance < 3))) { String s = t.value + " " + n.value; t.value = s; t.count_lines++; t.add(n); count++; } } } if (count == nl.texts.size()) { last_line.texts = new ArrayList<Text_Element>(storage); for (int p = 0; p < last_line.texts.size(); p++) { Text_Element t = last_line.texts.get(p); last_line.add(t); } lines.remove(mlb.end + i); removed_elements_after++; } else { merge_control = false; } count = 0; } return new int[] { removed_elements_before, removed_elements_after }; } private void showErrorFrame(final String file_name) { GraphicsEnvironment ge = GraphicsEnvironment.getLocalGraphicsEnvironment(); Frame f = new Frame(ge.getDefaultScreenDevice().getDefaultConfiguration()); Dialog d = new Dialog(f, "Failure", true); Label l = new Label("pdftohtml was unable to return right data."); Label l2 = new Label("Would you like to restart with pre-debugging?"); d.setLayout(null); l.setBounds(60, 50, 300, 20); l2.setBounds(60, 70, 300, 20); d.add(l); d.add(l2); d.setSize(420, 150); Button b = new Button("Yes"); b.addActionListener(new java.awt.event.ActionListener() { public void actionPerformed(java.awt.event.ActionEvent evt) { Button b2 = (Button) evt.getSource(); debug_pdftohtml_output(file_name); ((Dialog) b2.getParent()).dispose(); } }); b.setBounds(180, 100, 60, 20); Button b2 = new Button("No"); b2.addActionListener(new java.awt.event.ActionListener() { public void actionPerformed(java.awt.event.ActionEvent evt) { Button b3 = (Button) evt.getSource(); ((Dialog) b3.getParent()).dispose(); } }); b.setBounds(180, 100, 60, 20); b2.setBounds(250, 100, 60, 20); d.add(b); d.add(b2); d.setLocationRelativeTo(null); d.setVisible(true); } private void debug_pdftohtml_output(final String filename) { try { BufferedReader br = new BufferedReader(new FileReader(filename)); PrintStream dos = new PrintStream( new FileOutputStream(new File(this.path + File.separator + "debugged_output.xml"))); String current_line = br.readLine(); while (current_line != null) { current_line = current_line.replaceAll("A href", "a href"); current_line = current_line.replaceAll("<B>", "<b>"); current_line = current_line.replaceAll("<I>", "<i>"); current_line = current_line.replaceAll("</I>", "</i>"); current_line = current_line.replaceAll("</B>", "</b>"); dos.println(current_line); current_line = br.readLine(); } run(this.path + File.separator + "debugged_output.xml"); dos.close(); br.close(); } catch (Exception e) { System.out.println(e); } } }