org.wandora.application.tools.extractors.ocr.OCRExtractor.java Source code

Introduction

Here is the source code for org.wandora.application.tools.extractors.ocr.OCRExtractor.java
Source

/*
 * WANDORA
 * Knowledge Extraction, Management, and Publishing Application
 * http://wandora.org
 *
 * Copyright (C) 2004-2016 Wandora Team
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.wandora.application.tools.extractors.ocr;

/* Misc */
import java.net.URL;
import javax.swing.Icon;
import java.io.*;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import org.apache.commons.io.IOUtils;

/* wandora */
import org.wandora.application.*;
import org.wandora.application.gui.UIBox;
import org.wandora.application.tools.browserextractors.BrowserExtractRequest;
import org.wandora.application.tools.browserextractors.BrowserPluginExtractor;
import org.wandora.topicmap.*;
import org.wandora.utils.language.LanguageBox;
import org.wandora.application.tools.extractors.ExtractHelper;
import org.wandora.application.tools.extractors.AbstractExtractor;

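/**
 * Extractor that runs the Tesseract OCR command-line tool on an image file or
 * an image URL and stores the recognized text as an occurrence of a document
 * topic.
 *
 * @author Eero Lehtonen
 */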

public class OCRExtractor extends AbstractExtractor {

    /** Subject identifier handed to {@link #getSourceType(TopicMap)}. */
    protected String SOURCE_SI = "http://wandora.org/si/source";
    /** Subject identifier for documents; not referenced by this class itself. */
    protected String DOCUMENT_SI = "http://wandora.org/si/document";

    /** Subject identifier of the occurrence type that holds the recognized text. */
    protected String TEXT_CONTENT_SI = "http://wandora.org/si/text_content";
    /** Subject identifier of the occurrence type that holds the extraction time. */
    protected String DATE_EXTRACTED_SI = "http://wandora.org/si/time_extracted";
    /** Subject identifier of the occurrence type that holds the file modification date. */
    protected String DATE_MODIFIED_SI = "http://wandora.org/si/date_modified";
    /** Subject identifier of the occurrence type that holds the file size. */
    protected String FILE_SIZE_SI = "http://wandora.org/si/file_size";

    /**
     * Path prefix for temporary files. Downloads are stored in
     * TEMP_PATH + "_temp.dat"; the OCR output is read from TEMP_PATH + ".txt".
     */
    protected String TEMP_PATH = "temp" + File.separator + "ocr";
    /** Formatter for date occurrences; re-created with default settings on every extraction. */
    protected SimpleDateFormat dateFormatter;

    /**
     * Returns the human-readable name of this extractor.
     *
     * @return the constant name of the extractor
     */
    @Override
    public String getName() {
        final String extractorName = "OCR Extractor";
        return extractorName;
    }

    /**
     * Returns a short description of what this extractor does.
     *
     * @return the constant description text
     */
    @Override
    public String getDescription() {
        final String description = "Extracts topics and associations from image text data. ";
        return description;
    }

    /**
     * Returns the icon of this extractor.
     *
     * @return the icon that {@code UIBox} provides for code 0xf1c5
     */
    @Override
    public Icon getIcon() {
        final int iconCode = 0xf1c5;
        return UIBox.getIcon(iconCode);
    }

    /** Content types announced by {@link #getContentTypes()}. */
    private final String[] contentTypes = { "image/jpeg", "image/jpg" };

    /**
     * Returns the content types this extractor announces.
     *
     * @return the array held in the {@code contentTypes} field (not a copy)
     */
    @Override
    public String[] getContentTypes() {
        return this.contentTypes;
    }

    /**
     * Tells whether the URL crawler should be used with this extractor.
     *
     * @return always {@code false}
     */
    @Override
    public boolean useURLCrawler() {
        final boolean useCrawler = false;
        return useCrawler;
    }

    /**
     * Returns the kinds of sources this extractor handles.
     *
     * @return the {@code FILE_EXTRACTOR} and {@code URL_EXTRACTOR} flags combined
     */
    @Override
    public int getExtractorType() {
        int supportedSources = FILE_EXTRACTOR;
        supportedSources |= URL_EXTRACTOR;
        return supportedSources;
    }

    // -------------------------------------------------------------------------

    /**
     * Tells whether this extractor offers a configuration.
     *
     * @return always {@code false}
     */
    @Override
    public boolean isConfigurable() {
        final boolean configurable = false;
        return configurable;
    }

    /**
     * Handles an extraction request coming from the browser plugin.
     *
     * The request source is interpreted as a URL. The resource is only
     * processed when its reported content type contains "image".
     *
     * @param request the browser request; its source is used as the image URL
     * @param wandora the application instance whose topic map receives the result
     * @return {@code null} on success, otherwise a message prefixed with
     *         {@code BrowserPluginExtractor.RETURN_ERROR}
     * @throws TopicMapException declared by the overridden method; failures
     *         raised inside this method are reported through the return value
     */
    @Override
    public String doBrowserExtract(BrowserExtractRequest request, Wandora wandora) throws TopicMapException {

        try {
            setWandora(wandora);
            String urlStr = request.getSource();

            URL u = new URL(urlStr);
            String mime = u.openConnection().getContentType();

            // Report an unsuitable resource directly instead of throwing and
            // catching a generic exception just to build the same message.
            if (mime == null || mime.indexOf("image") < 0) {
                return BrowserPluginExtractor.RETURN_ERROR + "incompatible mimetype";
            }

            // The extraction reports failure through its return value; do not
            // tell the browser plugin that everything went fine when it did not.
            if (!_extractTopicsFrom(u, wandora.getTopicMap())) {
                return BrowserPluginExtractor.RETURN_ERROR + "OCR extraction failed for " + urlStr;
            }

        } catch (Exception e) {
            e.printStackTrace();
            // Some exceptions carry no message; fall back to the exception's
            // string form so the user never sees a bare "null".
            String message = e.getMessage();
            if (message == null) {
                message = e.toString();
            }
            return BrowserPluginExtractor.RETURN_ERROR + message;
        }

        return null;
    }

    /**
     * Tells whether this extractor accepts the given browser request.
     *
     * @param request the browser request (not inspected)
     * @param wandora the application instance (not inspected)
     * @return always {@code true}
     * @throws TopicMapException never thrown by this implementation
     */
    @Override
    public boolean acceptBrowserExtractRequest(BrowserExtractRequest request, Wandora wandora)
            throws TopicMapException {
        final boolean accepted = true;
        return accepted;
    }

    /**
     * Returns the name shown for this extractor in the browser plugin.
     *
     * @return the same value as {@link #getName()}
     */
    @Override
    public String getBrowserExtractorName() {
        final String browserName = this.getName();
        return browserName;
    }

    /**
     * Extracts text from a local image file.
     *
     * A document topic identified by the file's URI is looked up or created.
     * It is given the file name as base name and English display name, and
     * the file's modification date as an occurrence. The file is then handed
     * to the OCR step.
     *
     * @param f the image file to process
     * @param t the topic map that receives the topics
     * @return the result of the OCR step for this file
     * @throws Exception if a topic map operation fails
     */
    @Override
    public boolean _extractTopicsFrom(File f, TopicMap t) throws Exception {
        this.dateFormatter = new SimpleDateFormat();

        final String fileUri = f.toURI().toString();

        // OCR language comes from the environment; English is the fallback.
        final String configuredLang = System.getenv("TESSERACT_LANG");
        final String lang = (configuredLang != null) ? configuredLang : "eng";

        final Locator fileLocator = new Locator(fileUri);
        final Topic langTopic = this.getOrCreateLangTopic(t, lang);
        final Topic existingTopic = t.getTopic(fileUri);
        final Topic dateModifiedType = this.getDateModifiedType(t);

        final Topic documentTopic = (existingTopic != null) ? existingTopic : t.createTopic();

        documentTopic.addSubjectIdentifier(fileLocator);
        documentTopic.setSubjectLocator(fileLocator);

        final String fileName = f.getName();
        documentTopic.setBaseName(fileName);
        documentTopic.setDisplayName("en", fileName);

        final Date lastModified = new Date(f.lastModified());
        documentTopic.setData(dateModifiedType, langTopic, dateFormatter.format(lastModified));

        return this.processFile(f, t, documentTopic);
    }

    /**
     * Extracts text from an image behind a URL.
     *
     * The resource is downloaded into the temporary file
     * TEMP_PATH + "_temp.dat". Only after a complete download is a document
     * topic identified by the URL looked up or created and the file handed
     * to the OCR step. The temporary file is always removed afterwards.
     *
     * @param u the URL of the image
     * @param t the topic map that receives the topics
     * @return {@code true} if the OCR step succeeded; {@code false} if the
     *         download, the topic creation or the OCR step failed
     * @throws Exception if the connection cannot be opened or closed
     */
    @Override
    public boolean _extractTopicsFrom(URL u, TopicMap t) throws Exception {
        boolean success = false;
        URLConnection uc;
        File f = new File(TEMP_PATH + "_temp.dat");
        if (getWandora() != null) {
            uc = getWandora().wandoraHttpAuthorizer.getAuthorizedAccess(u);
        } else {
            uc = u.openConnection();
            Wandora.initUrlConnection(uc);
        }

        // The header looks like: attachment; filename="scan.jpg". Only the
        // filename part is a usable topic name.
        String name = fileNameFromContentDisposition(uc.getHeaderField("Content-Disposition"));

        boolean downloaded = false;
        InputStream is = uc.getInputStream();
        try {
            // Make sure the directory of the temporary file exists.
            File parent = f.getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
            FileOutputStream fos = new FileOutputStream(f);
            try {
                byte[] buffer = new byte[4096];
                for (int n; (n = is.read(buffer)) != -1;) {
                    fos.write(buffer, 0, n);
                }
            } finally {
                fos.close();
            }
            downloaded = true;
        } catch (Exception e) {
            e.printStackTrace();
            log("Failed to download " + u + " for OCR: " + e.getMessage());
        } finally {
            is.close();
        }

        // Do not create topics or run OCR on a missing or partial download.
        if (!downloaded) {
            f.delete();
            return false;
        }

        try {
            String si = u.toString();
            Locator l = new Locator(si);
            Topic documentTopic = t.getTopic(si);

            if (documentTopic == null) {
                documentTopic = t.createTopic();
            }

            documentTopic.addSubjectIdentifier(l);
            documentTopic.setSubjectLocator(l);
            if (name != null) {
                documentTopic.setBaseName(name);
                documentTopic.setDisplayName("en", name);
            }

            success = processFile(f, t, documentTopic);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            f.delete();
        }

        return success;
    }

    /**
     * Picks the value of the {@code filename} parameter out of a
     * Content-Disposition header value.
     *
     * Parameters are split on ';', so a quoted file name that itself contains
     * a semicolon is cut short. The extended {@code filename*} form is not
     * decoded.
     *
     * @param header the raw header value, may be {@code null}
     * @return the file name without surrounding quotes, or {@code null} when
     *         the header is absent or carries no non-empty filename parameter
     */
    private static String fileNameFromContentDisposition(String header) {
        if (header == null) {
            return null;
        }
        final String key = "filename=";
        String[] parts = header.split(";");
        for (int i = 0; i < parts.length; i++) {
            String part = parts[i].trim();
            // Parameter names are matched case-insensitively.
            if (part.regionMatches(true, 0, key, 0, key.length())) {
                String value = part.substring(key.length()).trim();
                if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
                    value = value.substring(1, value.length() - 1);
                }
                if (value.length() > 0) {
                    return value;
                }
            }
        }
        return null;
    }

    /**
     * Extraction from a plain string is not implemented.
     *
     * @param str ignored
     * @param t ignored
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public boolean _extractTopicsFrom(String str, TopicMap t) throws Exception {
        final String reason = "TODO";
        throw new UnsupportedOperationException(reason);
    }

    /**
     * Runs Tesseract on the given file and stores the result on the document topic.
     *
     * On success the topic receives the "OCR processed document" type and three
     * occurrences: the recognized text, the extraction time and the file size.
     * Failures are logged and reported through the return value.
     *
     * Environment variables: TESSERACT_PATH is the directory that contains the
     * tesseract executable (when unset, "tesseract" is resolved by the operating
     * system). TESSERACT_LANG is the OCR language passed with -l (when unset,
     * no -l option is passed and "eng" is used for the occurrence language).
     *
     * @param f the image file to run OCR on
     * @param tm the topic map that receives the topics
     * @param documentTopic the topic that represents the image
     * @return {@code true} if OCR succeeded and the topic was updated
     * @throws TopicMapException declared for callers; topic map failures raised
     *         inside this method are caught and logged
     */
    private boolean processFile(File f, TopicMap tm, Topic documentTopic) throws TopicMapException {
        boolean success = false;
        this.dateFormatter = new SimpleDateFormat();
        String text = "";
        File tmp = new File(TEMP_PATH + ".txt");

        // Make sure the directory for the OCR output exists before the
        // external process is asked to write into it.
        File tmpDir = tmp.getParentFile();
        if (tmpDir != null) {
            tmpDir.mkdirs();
        }

        /*
         * Build the command to be executed in the form
         *  <path/to/tesseract> <path/to/input> <path/to/output> -l <lang>
         * where the output file is temporary and is disposed of
         * once its contents are read.
         */

        ArrayList<String> cmd = new ArrayList<String>();
        String tesseractDir = System.getenv("TESSERACT_PATH");
        String pathToTes;
        if (tesseractDir == null || tesseractDir.length() == 0) {
            // No directory configured: let the operating system find the binary.
            pathToTes = "tesseract";
        } else if (tesseractDir.endsWith(File.separator) || tesseractDir.endsWith("/")) {
            pathToTes = tesseractDir + "tesseract";
        } else {
            // Tolerate a directory given without the trailing separator.
            pathToTes = tesseractDir + File.separator + "tesseract";
        }
        String lang = System.getenv("TESSERACT_LANG");
        cmd.add(pathToTes);
        cmd.add(f.getAbsolutePath());
        cmd.add(TEMP_PATH);
        if (lang != null) {
            cmd.add("-l");
            cmd.add(lang);
        }

        ProcessBuilder pb = new ProcessBuilder();
        pb.command(cmd);

        Process p = null;
        try {
            p = pb.start();
            // Both output streams are drained so the child cannot block on a full pipe.
            StreamGobbler gobbler = new StreamGobbler(p.getInputStream());
            StreamGobbler errorGobbler = new StreamGobbler((p.getErrorStream()));
            gobbler.start();
            errorGobbler.start();
            int exitCode = p.waitFor();
            if (exitCode != 0) { // Something got messed up
                // NOTE(review): the gobblers are not waited for here, so the
                // captured message may be incomplete - confirm against StreamGobbler.
                String error = errorGobbler.getMessage();
                if (error == null || error.length() == 0) {
                    error = gobbler.getMessage();
                }
                if (error == null || error.length() == 0) {
                    error = "tesseract exited with code " + exitCode;
                }
                throw new RuntimeException(error);
            }

            // Exited alright. Tesseract writes its text output as UTF-8, so do
            // not decode it with the platform default charset.
            FileInputStream is = new FileInputStream(tmp);
            try {
                text = IOUtils.toString(is, "UTF-8");
            } finally {
                is.close();
            }

            String extracted = dateFormatter.format(new Date());
            long size = f.length();

            if (lang == null) {
                lang = "eng";
            }

            Topic langTopic = getOrCreateLangTopic(tm, lang);
            Topic documentType = createDocumentTypeTopic(tm);
            Topic contentType = getContentType(tm);
            Topic timeExtractedType = getTimeExtractedType(tm);
            Topic fileSizeType = getSizeType(tm);

            documentTopic.addType(documentType);

            documentTopic.setData(contentType, langTopic, text);
            documentTopic.setData(timeExtractedType, langTopic, extracted);
            documentTopic.setData(fileSizeType, langTopic, "" + size);

            success = true;

        } catch (RuntimeException rte) {
            log("The OCR runtime failed for " + f.getPath());
            log(rte.getMessage());
        } catch (TopicMapException tme) { // Adding the topic failed
            log("Failed to add the file topic with the path " + f.getPath());
        } catch (IOException ioe) { // A file operation failed
            log(ioe.getMessage());
        } catch (InterruptedException ie) {
            log("The OCR process failed for the file " + f.getPath());
            // Nobody waits for the child any more, so stop it, and keep the
            // interrupt visible to the caller.
            if (p != null) {
                p.destroy();
            }
            Thread.currentThread().interrupt();
        } finally { // Cleanup
            tmp.delete();
        }

        return success;
    }

    /**
     * Returns the topic used as the occurrence type of the recognized text.
     *
     * @param tm the topic map to look in
     * @return the topic for {@code TEXT_CONTENT_SI} with base name "Text Content"
     * @throws TopicMapException if the topic map operation fails
     */
    public Topic getContentType(TopicMap tm) throws TopicMapException {
        final Topic textContentType = getOrCreateTopic(tm, TEXT_CONTENT_SI, "Text Content");
        return textContentType;
    }

    /**
     * Returns the topic used as the occurrence type of the modification date.
     *
     * @param tm the topic map to look in
     * @return the topic for {@code DATE_MODIFIED_SI} with base name "Date modified"
     * @throws TopicMapException if the topic map operation fails
     */
    public Topic getDateModifiedType(TopicMap tm) throws TopicMapException {
        final Topic dateModifiedType = getOrCreateTopic(tm, DATE_MODIFIED_SI, "Date modified");
        return dateModifiedType;
    }

    /**
     * Returns the topic used as the occurrence type of the extraction time.
     *
     * @param tm the topic map to look in
     * @return the topic for {@code DATE_EXTRACTED_SI} with base name "Time extracted"
     * @throws TopicMapException if the topic map operation fails
     */
    public Topic getTimeExtractedType(TopicMap tm) throws TopicMapException {
        final Topic timeExtractedType = getOrCreateTopic(tm, DATE_EXTRACTED_SI, "Time extracted");
        return timeExtractedType;
    }

    /**
     * Returns the topic used as the occurrence type of the file size.
     *
     * @param tm the topic map to look in
     * @return the topic for {@code FILE_SIZE_SI} with base name "Filesize"
     * @throws TopicMapException if the topic map operation fails
     */
    public Topic getSizeType(TopicMap tm) throws TopicMapException {
        final Topic fileSizeType = getOrCreateTopic(tm, FILE_SIZE_SI, "Filesize");
        return fileSizeType;
    }

    /**
     * Returns the "Source" topic.
     *
     * @param tm the topic map to look in
     * @return the topic for {@code SOURCE_SI} with base name "Source"
     * @throws TopicMapException if the topic map operation fails
     */
    public Topic getSourceType(TopicMap tm) throws TopicMapException {
        final Topic sourceType = getOrCreateTopic(tm, SOURCE_SI, "Source");
        return sourceType;
    }

    /**
     * Returns the type topic for documents handled by this extractor and
     * makes it a subclass of the Wandora class topic.
     *
     * @param tm the topic map to work on
     * @return the topic named "OCR processed document"
     * @throws TopicMapException if a topic map operation fails
     */
    public Topic createDocumentTypeTopic(TopicMap tm) throws TopicMapException {
        final Topic documentType = createTopic(tm, "OCR processed document");
        makeSubclassOf(tm, documentType, getWandoraClass(tm));
        return documentType;
    }

    /**
     * Returns the "Wandora class" topic.
     *
     * @param tm the topic map to work on
     * @return the topic for {@code TMBox.WANDORACLASS_SI} with base name "Wandora class"
     * @throws TopicMapException if the topic map operation fails
     */
    public Topic getWandoraClass(TopicMap tm) throws TopicMapException {
        final Topic wandoraClass = createTopic(tm, TMBox.WANDORACLASS_SI, "Wandora class");
        return wandoraClass;
    }

    /**
     * Convenience variant of the three-argument overload that passes no base name.
     *
     * @param tm the topic map to work on
     * @param si the subject identifier of the topic
     * @return the topic returned by the three-argument overload
     * @throws TopicMapException if the topic map operation fails
     */
    protected Topic getOrCreateTopic(TopicMap tm, String si) throws TopicMapException {
        final String noBaseName = null;
        return getOrCreateTopic(tm, si, noBaseName);
    }

    /**
     * Delegates to {@code ExtractHelper.getOrCreateTopic}.
     *
     * @param tm the topic map to work on
     * @param si the subject identifier of the topic
     * @param bn the base name handed to the helper, may be {@code null}
     * @return the topic returned by the helper
     * @throws TopicMapException if the topic map operation fails
     */
    protected Topic getOrCreateTopic(TopicMap tm, String si, String bn) throws TopicMapException {
        final Topic topic = ExtractHelper.getOrCreateTopic(si, bn, tm);
        return topic;
    }

    /**
     * Delegates to {@code ExtractHelper.makeSubclassOf} to relate the two topics.
     *
     * @param tm the topic map that holds both topics
     * @param t the topic to become the subclass
     * @param superclass the topic to become the superclass
     * @throws TopicMapException if the topic map operation fails
     */
    protected void makeSubclassOf(TopicMap tm, Topic t, Topic superclass) throws TopicMapException {
        final Topic subclass = t;
        ExtractHelper.makeSubclassOf(subclass, superclass, tm);
    }

    /**
     * Returns the language topic for a three-letter language code.
     *
     * The code is translated to a language name and then to a two-letter code
     * with {@code LanguageBox}. The topic is looked up with the identifier from
     * {@code XTMPSI.getLang} and created through {@code LanguageBox} when the
     * lookup finds nothing.
     *
     * NOTE(review): what the LanguageBox lookups return for an unknown code is
     * not visible here - confirm before passing arbitrary TESSERACT_LANG values.
     *
     * @param tm the topic map to work on
     * @param lng6392 the language code in ISO 639-2 (three letters)
     * @return the existing or newly created language topic
     * @throws TopicMapException if a topic map operation fails
     */
    protected Topic getOrCreateLangTopic(TopicMap tm, String lng6392) throws TopicMapException {
        final String languageName = LanguageBox.getNameFor6392Code(lng6392);
        final String twoLetterCode = LanguageBox.get6391ForName(languageName);

        final Topic existing = tm.getTopic(XTMPSI.getLang(twoLetterCode));
        if (existing != null) {
            return existing;
        }
        return LanguageBox.createTopicForLanguageCode(twoLetterCode, tm);
    }

}