eu.himeros.hocr.NgtMaker.java Source code

Java tutorial

Introduction

Here is the source code for eu.himeros.hocr.NgtMaker.java

Source

/*
 * This file is part of eu.himeros_hocraggregator_jar_1.0-SNAPSHOT
 *
 * Copyright (C) 2012 federico[DOT]boschetti[DOT]73[AT]gmail[DOT]com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package eu.himeros.hocr;

import eu.himeros.transcoder.Transcoder;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Namespace;
import org.jdom2.filter.Filters;
import org.jdom2.input.SAXBuilder;
import org.jdom2.xpath.XPathExpression;
import org.jdom2.xpath.XPathFactory;

/**
 *
 * @author federico[DOT]boschetti[DOT]73[AT]gmail[DOT]com
 */
public class NgtMaker {
    StringBuilder prolog;
    BufferedReader br;
    BufferedWriter bw;
    Document doc;
    Element root;
    Namespace xmlns;
    SAXBuilder builder;
    TreeMap<String, Integer> ngtTm;
    ArrayList<String> ngtAl;
    ArrayList<String> ocrAl;
    Transcoder trans;
    int prevValue = -1;
    int start = -1;
    int end = -1;
    String outFileName;
    XPathExpression<Element> xpath;

    public NgtMaker(File ngtName) throws Exception {
        init(ngtName);
    }

    private void init(File ngtName) throws Exception {
        br = new BufferedReader(new FileReader(ngtName));
        if (RunAll.configpath == "")
            RunAll.configpath = (new File("./src/main/resources")).getCanonicalPath();
        trans = new Transcoder(
                new FileInputStream(RunAll.configpath + "/eu/himeros/resources/transcoders/low2up.txt"));
        ngtTm = new TreeMap<>();
        ngtAl = new ArrayList<>(300000);
        prolog = new StringBuilder("<html xmlns=\"http://www.w3.org/1999/xhtml\">\n");
        prolog.append("<head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n");
        prolog.append("<meta content=\"Rigaudon 0.3\" name=\"ocr-system\" />\n");
        prolog.append("<meta name=\"ocr-nmber-of-pages\" content=\"???\" />\n");
        prolog.append("<meta name=\"ocr-langs\" content=\"grc lat\" />\n");
        prolog.append("<meta content=\"ocr_line ocr_page\" name=\"ocr-capabilities\" />\n");
        prolog.append("<link href=\"hocraggregate.css\" rel=\"stylesheet\" type=\"text/css\"/>\n");
        String line;
        String it1;
        String it2;
        String it3;
        String key;
        while ((line = br.readLine()) != null) {
            ngtAl.add(line);
        }
        ngtAl.add("###");
        ngtAl.add("###");
        for (int i = 0; i < ngtAl.size() - 2; i++) {
            it1 = trans.parse(ngtAl.get(i));
            it2 = trans.parse(ngtAl.get(i + 1));
            it3 = trans.parse(ngtAl.get(i + 2));
            key = (new StringBuilder(it1)).append(it2).append(it3).toString();
            if (ngtTm.containsKey(key)) {
                ngtTm.remove(key);
            } else {
                ngtTm.put(key, i);
            }
        }
        br.close();
    }

    public void parseAll(File dir) throws Exception {
        for (File file : dir.listFiles()) {
            try {
                if (!file.getName().matches("^.+\\.1.html$"))
                    continue;
                parseDoc(file);
            } catch (Exception ex) {
                System.err.println(file.getName());
                continue;
            }
        }
    }

    public void parseDoc(File file) throws Exception {
        adjustFile(file);
        start = -1;
        end = -1;
        prevValue = -1;
        ocrAl = new ArrayList<>(1000);
        outFileName = file.getAbsolutePath().substring(0, file.getAbsolutePath().length() - 4) + "ngt.xml";
        builder = new SAXBuilder();
        doc = builder.build(file);
        root = doc.getRootElement();
        xmlns = root.getNamespace();
        xpath = XPathFactory.instance().compile("//ns:span[@class='ocr_word']", Filters.element(), null,
                Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
        List<Element> elements = xpath.evaluate(root);
        for (Element element : elements) {
            parseOcrWord(element);
        }

        ocrAl.add("%%%");
        ocrAl.add("%%%");
        findAnchors();
        writeFragment(start, end);
    }

    private void parseOcrPage(Element ocrPage) {
        List<Element> ocrLines = ocrPage.getChildren("span", xmlns);
        for (Element ocrLine : ocrLines) {
            parseOcrLine(ocrLine);
        }
    }

    private void parseOcrLine(Element ocrLine) {
        List<Element> ocrWords = ocrLine.getChildren("span", xmlns);
        for (Element ocrWord : ocrWords) {
            parseOcrWord(ocrWord);
        }
    }

    private void parseOcrWord(Element ocrWord) {
        String word = trans.parse(ocrWord.getText()).replaceAll("[^-]*", "").trim();
        if (word.length() == 0)
            return;
        ocrAl.add(word);
    }

    private void findAnchors() {
        for (int i = 0; i < ocrAl.size() - 2; i++) {
            String key = (new StringBuilder(ocrAl.get(i))).append(ocrAl.get(i + 1)).append(ocrAl.get(i + 2))
                    .toString();
            Integer value = ngtTm.get(key);
            if (prevValue == -1 && value != null) {
                start = value;
                prevValue = value;
            }
            if (value != null) {
                if ((value - prevValue) < 100) {
                    prevValue = value;
                    end = value;
                }
            }
        }
    }

    private void writeFragment(int fragStart, int fragEnd) throws Exception {
        bw = new BufferedWriter(new FileWriter(outFileName));
        StringBuilder fragTextSb = new StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        fragTextSb.append("<fragment>\n");
        try {
            for (int i = fragStart; i <= fragEnd; i++) {
                String word = ngtAl.get(i);
                fragTextSb.append(word).append(" ");
            }
        } catch (Exception ex) {
            System.out.println(outFileName);
        }
        fragTextSb.append("\n</fragment>\n");
        bw.write(fragTextSb.toString());
        bw.close();
    }

    private void adjustFile(File file) throws Exception {
        BufferedReader tmpBr = new BufferedReader(new FileReader(file));
        String line;
        StringBuilder fileTextSb = new StringBuilder(10000);
        fileTextSb.append(prolog);
        int lineCounter = 0;
        while ((line = tmpBr.readLine()) != null) {
            lineCounter++;
            if (lineCounter < 8)
                continue;
            fileTextSb.append(line).append("\n");
        }
        tmpBr.close();
        BufferedWriter tmpBw = new BufferedWriter(new FileWriter(file));
        tmpBw.write(fileTextSb.toString());
        tmpBw.close();
    }

    public static void main(String[] args) throws Exception {
        //String ngtName="/usr/local/hocrinfoaggregator/test/demetrius/demetrius-de_elocutione.ngt.csv";
        //String dirName="/usr/local/hocrinfoaggregator/test/demetrius/Demetrius-De_elocutione.book";
        NgtMaker nm = new NgtMaker(new File(args[0]));
        nm.parseAll(new File(args[1]));
    }
}