eu.himeros.hocr.FlatXml.java Source code

Java tutorial

Introduction

Here is the source code for eu.himeros.hocr.FlatXml.java

Source

/*
 * This file is part of eu.himeros_hocraggregator_jar_1.0-SNAPSHOT
 *
 * Copyright (C) 2013 federico[DOT]boschetti[DOT]73[AT]gmail[DOT]com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package eu.himeros.hocr;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.util.List;
import org.jdom2.Comment;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Namespace;
import org.jdom2.filter.Filters;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;
import org.jdom2.xpath.XPathExpression;
import org.jdom2.xpath.XPathFactory;

/**
 *
 * @author federico[DOT]boschetti[DOT]73[AT]gmail[DOT]com
 */
public class FlatXml {
    //static String dirName="/usr/local/hocrinfoaggregator/test/demetrius/Demetrius-De_elocutione.book";

    public FlatXml(File inFile, File outFile) throws Exception {
        init(inFile, outFile);
    }

    private void init(File inFile, File outFile) throws Exception {
        SAXBuilder builder = new SAXBuilder();
        Document doc = builder.build(inFile);
        Element root = doc.getRootElement();
        Namespace oldns = root.getNamespace();
        Element newRoot = new Element("html", "http://www.w3.org/1999/xhtml");
        Namespace xmlns = newRoot.getNamespace();
        Element head = root.getChild("head", oldns);
        head.setNamespace(xmlns);
        for (Element child : head.getChildren())
            child.setNamespace(xmlns);
        Element title = new Element("title", xmlns);
        title.addContent("ocr");
        if (head != null)
            head.addContent(title);
        Element body = root.getChild("body", oldns);
        body.setNamespace(xmlns);
        /*Element oldPage;
        try{
        oldPage=body.getChild("div",xmlns);
        }catch(Exception ex){
        oldPage=new Element("div",xmlns);
        }*/
        Element page = new Element("div", xmlns);
        page.setAttribute("class", "ocr_page");
        page.setAttribute("id", "i" + inFile.getName().substring(1).replace(".html", ".png"));
        XPathExpression<Element> xpath = XPathFactory.instance().compile("//*[@class='ocr_carea']",
                Filters.element(), null, Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
        List<Element> careaElL = xpath.evaluate(body);
        for (Element careaEl : careaElL) {
            page.addContent(new Comment("<div class=\"" + careaEl.getAttributeValue("class") + "\" title=\""
                    + careaEl.getAttributeValue("title") + "\">"));
            for (Element pEl : careaEl.getChildren()) {
                page.addContent(new Comment("<p>"));
                for (Element lineEl : pEl.getChildren()) {
                    lineEl.removeAttribute("id");
                    lineEl.setNamespace(xmlns);
                    for (Element child : lineEl.getChildren()) {
                        child.removeAttribute("id");
                        child.removeAttribute("lang");
                        child.removeAttribute("lang", xmlns);
                        child.setNamespace(xmlns);
                    }
                    page.addContent(lineEl.clone());
                }
                page.addContent(new Comment("</p>"));
            }
            page.addContent(new Comment("</div>"));
        }
        //oldPage.detach();
        if (body != null) {
            body.removeContent();
            body.addContent(page);
        }
        newRoot.addContent(root.removeContent());
        doc.detachRootElement();
        doc.setRootElement(newRoot);
        XMLOutputter xmlOutputter = new XMLOutputter(Format.getPrettyFormat());
        xmlOutputter.output(doc, new BufferedWriter(new FileWriter(outFile)));
    }

    public static void main(String[] args) throws Exception {
        File dir = new File(args[0]);
        for (File file : dir.listFiles()) {
            try {
                if (!file.getName().endsWith(".html"))
                    continue;
                FlatXml fx = new FlatXml(file, new File(file.getAbsolutePath().replaceAll("html$", "1.html")));
            } catch (Exception ex) {
                ex.printStackTrace(System.err);
                System.err.println(file.getName());
                continue;
            }
        }
    }
}