eu.himeros.ocr.ngt.NearGroundTruthParser.java Source code

Java tutorial

Introduction

Here is the source code for eu.himeros.ocr.ngt.NearGroundTruthParser.java

Source

/*
 * This file is part of eu.himeros_hocraggregator_jar_1.0-SNAPSHOT
 *
 * Copyright (C) 2012 federico[DOT]boschetti[DOT]73[AT]gmail[DOT]com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package eu.himeros.ocr.ngt;

import eu.himeros.transcoder.Transcoder;
import java.util.HashMap;
import java.util.List;
import java.util.Stack;
import java.util.regex.Pattern;
import org.jdom2.Content;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Text;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;

/**
 * Extracts word tokens from an XML document and emits them as a tree of
 * {@code <w>} elements.
 *
 * <p>Text nodes are only processed while the language in scope equals the
 * default language of this parser. The language in scope is taken from the
 * {@code lang} attribute of the enclosing elements; an element without that
 * attribute inherits the language of its parent. Each accepted token becomes
 * a {@code <w>} element carrying the attributes {@code id} (running number),
 * {@code text} (the token), {@code uc} (the token run through the
 * {@code low2up.txt} transcoder) and, after {@link #parse(String)},
 * {@code occ} (how many words in the output share the same {@code uc}).
 *
 * <p>NOTE(review): the transcoder resources are named {@code beta2u.txt} and
 * {@code low2up.txt}; presumably Beta Code to Unicode and lower to upper case
 * respectively -- confirm against the {@code Transcoder} resources.
 *
 * <p>Instances keep mutable parsing state (output tree, id counter, pending
 * hyphenated fragment), so output accumulates across successive
 * {@code parse} calls on the same instance.
 *
 * @author federico[DOT]boschetti[DOT]73[AT]gmail[DOT]com
 */
public class NearGroundTruthParser {

    /** Language used by the no-argument constructor. */
    private static final String DEFAULT_LANG = "grc";

    /** Token separator: any run of whitespace (spaces, tabs, line breaks). */
    private static final Pattern TOKEN_SEPARATOR = Pattern.compile("\\s+");

    /** Characters stripped from every token before it is classified. */
    private static final Pattern PUNCTUATION = Pattern.compile("[\n\t,;.]+");

    /** A token made only of characters in U+0380..U+03FF or U+1F00..U+1FFF. */
    private static final Pattern GREEK_WORD = Pattern.compile("[\u0380-\u03FF\u1F00-\u1FFF]+");

    /** Root of the flat word list that the parse methods fill. */
    private final Element rootOut;
    /** Language in scope for the element currently being visited (top of stack). */
    private final Stack<String> langStack;
    /** Applied to raw text before tokenising. */
    private final Transcoder betaTrans;
    /** Produces the value of the {@code uc} attribute. */
    private final Transcoder upperTrans;
    /** Only text whose language in scope equals this value is tokenised. */
    private final String defaultLang;
    /** Occurrence count per {@code uc} value, rebuilt by {@link #makeOccHm(Element)}. */
    private final HashMap<String, Integer> occHm;
    /** First half of a word split by a trailing hyphen, waiting for its second half. */
    private String hyphenatedFirstPart = null;
    /** Next value for the {@code id} attribute. */
    private int id = 1;

    /**
     * Creates a parser whose default language is {@code "grc"}.
     *
     * @throws Exception if a transcoder cannot be created
     */
    public NearGroundTruthParser() throws Exception {
        this(DEFAULT_LANG);
    }

    /**
     * Creates a parser for the given default language.
     *
     * @param defaultLang value of the {@code lang} attribute whose text is tokenised
     * @throws NullPointerException if {@code defaultLang} is {@code null}
     * @throws Exception if a transcoder cannot be created
     */
    public NearGroundTruthParser(String defaultLang) throws Exception {
        if (defaultLang == null) {
            // Fail here rather than with an anonymous NPE on the first text node.
            throw new NullPointerException("defaultLang");
        }
        this.defaultLang = defaultLang;
        occHm = new HashMap<>(2048);
        langStack = new Stack<>();
        // NOTE(review): the resource streams are handed to Transcoder and not closed
        // here; whether Transcoder closes them is not visible from this file.
        betaTrans = new Transcoder(
                this.getClass().getResourceAsStream("/eu/himeros/resources/transcoders/beta2u.txt"));
        upperTrans = new Transcoder(
                this.getClass().getResourceAsStream("/eu/himeros/resources/transcoders/low2up.txt"));
        // The bottom of the stack is the default language, so top-level text is tokenised.
        langStack.push(defaultLang);
        rootOut = new Element("text");
    }

    /**
     * Walks {@code content} recursively and appends a {@code <w>} element to the
     * output tree for every accepted token.
     *
     * <p>Text and CDATA nodes are tokenised only when the language in scope equals
     * the default language. For an element, each child element is visited with its
     * own {@code lang} attribute in scope or, when it has none, with the language
     * inherited from its parent. Other node types are ignored.
     *
     * @param content node to process
     * @return the output root, which is shared by all calls on this instance
     */
    public Element parse(Content content) {
        switch (content.getCType()) {
        case Text:
        case CDATA:
            if (defaultLang.equals(langStack.peek())) {
                parseTextLine(betaTrans.parse(((Text) content).getText()));
            }
            break;
        case Element:
            for (Content child : ((Element) content).getContent()) {
                switch (child.getCType()) {
                case Text:
                case CDATA:
                    parse(child);
                    break;
                case Element: {
                    String declared = ((Element) child).getAttributeValue("lang");
                    // Inherit from the enclosing element instead of from whichever
                    // element happened to declare a language last.
                    langStack.push(declared == null ? langStack.peek() : declared);
                    try {
                        parse(child);
                    } finally {
                        langStack.pop();
                    }
                    break;
                }
                default:
                    break;
                }
            }
            break;
        default:
            break;
        }
        return rootOut;
    }

    /**
     * Parses the XML document at {@code inFile}, annotates every word with its
     * occurrence count and regroups the words around the words that occur once.
     *
     * @param inFile location of the XML document, as accepted by {@link #getRoot(String)}
     * @return a new {@code <text>} element as built by {@code appendToAnchor}
     * @throws Exception if the document cannot be read or parsed
     */
    public Element parse(String inFile) throws Exception {
        parse(getRoot(inFile));
        injectOcc(rootOut);
        return appendToAnchor(rootOut);
    }

    /**
     * Builds a new tree in which every word whose {@code occ} attribute is
     * {@code "1"} is a direct child of the returned root (an anchor), and every
     * other word is nested inside the most recent anchor. Words that precede the
     * first anchor are nested inside a synthetic {@code <w id="0">} element.
     *
     * @param root flat list of {@code <w>} elements carrying an {@code occ} attribute
     * @return the root of the regrouped tree; {@code root} itself is not modified
     */
    private Element appendToAnchor(Element root) {
        Element anchorRoot = new Element("text");
        Element currAnchor = new Element("w");
        currAnchor.setAttribute("id", "0");
        anchorRoot.addContent(currAnchor);
        for (Element word : root.getChildren()) {
            // Clone so the word can be attached to the new tree without detaching it.
            Element copy = word.clone();
            if ("1".equals(word.getAttributeValue("occ"))) {
                currAnchor = copy;
                anchorRoot.addContent(copy);
            } else {
                currAnchor.addContent(copy);
            }
        }
        return anchorRoot;
    }

    /**
     * Splits {@code textLine} on whitespace, strips {@code , ; .} from each token
     * and appends a {@code <w>} element for every token that consists only of
     * characters in the two accepted Unicode ranges.
     *
     * <p>A token that is not accepted but ends with {@code -} is remembered and
     * prepended to the next accepted token, also across calls. All other tokens
     * are dropped.
     *
     * @param textLine text to tokenise
     */
    private void parseTextLine(String textLine) {
        for (String rawToken : TOKEN_SEPARATOR.split(textLine)) {
            String token = PUNCTUATION.matcher(rawToken).replaceAll("");
            if (GREEK_WORD.matcher(token).matches()) {
                if (hyphenatedFirstPart != null) {
                    token = hyphenatedFirstPart + token;
                    hyphenatedFirstPart = null;
                }
                Element el = new Element("w");
                el.setAttribute("id", String.valueOf(id++));
                el.setAttribute("text", token);
                el.setAttribute("uc", upperTrans.parse(token));
                rootOut.addContent(el);
            } else if (token.endsWith("-")) {
                hyphenatedFirstPart = token.substring(0, token.length() - 1);
            }
        }
    }

    /**
     * Sets the {@code occ} attribute of every child of {@code root} to the number
     * of children that share its {@code uc} value.
     *
     * @param root flat list of {@code <w>} elements carrying a {@code uc} attribute
     */
    private void injectOcc(Element root) {
        makeOccHm(root);
        for (Element word : root.getChildren()) {
            word.setAttribute("occ", occHm.get(word.getAttributeValue("uc")).toString());
        }
    }

    /**
     * Recomputes the occurrence count of every {@code uc} value among the
     * children of {@code root}.
     *
     * @param root flat list of {@code <w>} elements carrying a {@code uc} attribute
     */
    private void makeOccHm(Element root) {
        // Start from scratch: the output tree accumulates across parse calls, so
        // adding to old counts would count earlier words more than once.
        occHm.clear();
        for (Element word : root.getChildren()) {
            String upWord = word.getAttributeValue("uc");
            Integer seen = occHm.get(upWord);
            occHm.put(upWord, seen == null ? 1 : seen + 1);
        }
    }

    /**
     * Reads an XML document with a default {@link SAXBuilder} and returns its root.
     *
     * @param inFileName location of the document, passed to {@code SAXBuilder.build(String)}
     * @return the root element of the parsed document
     * @throws Exception if the document cannot be read or is not well-formed
     */
    public Element getRoot(String inFileName) throws Exception {
        return new SAXBuilder().build(inFileName).getRootElement();
    }

    /**
     * Parses {@code inFileName} with a default parser and prints the resulting
     * tree to standard output as pretty-printed UTF-8 XML.
     *
     * @param inFileName location of the XML document to parse
     * @param outFileName not used by this method; the result is written to
     *        standard output only
     * @throws Exception if the document cannot be read or parsed
     */
    public static void align(String inFileName, String outFileName) throws Exception {
        NearGroundTruthParser ngtp = new NearGroundTruthParser();
        Element rootOut = ngtp.parse(inFileName); //e.g. xxx002_001_ft-xi_frag.xml
        XMLOutputter xop = new XMLOutputter(
                Format.getPrettyFormat().setEncoding("UTF-8").setLineSeparator("\n").setIndent("   "));
        String output = xop.outputString(rootOut);
        System.out.println(output);
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the input document; {@code args[1]}, when
     *        present, is forwarded to {@link #align(String, String)}
     * @throws IllegalArgumentException if no input document is given
     * @throws Exception if the document cannot be read or parsed
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            throw new IllegalArgumentException("Usage: NearGroundTruthParser <inFile> [<outFile>]");
        }
        align(args[0], args.length > 1 ? args[1] : null);
    }
}