de.yaio.services.metaextract.server.extractor.TesseractExtractor.java Source code

Introduction

Here is the source code for de.yaio.services.metaextract.server.extractor.TesseractExtractor.java
Source

/** 
 * software for metadata-extraction
 * 
 * @FeatureDomain                Extractor
 * @author                       Michael Schreiner <michael.schreiner@your-it-fellow.de>
 * @category                     metadata-extraction
 * @copyright                    Copyright (c) 2014, Michael Schreiner
 * @license                      http://mozilla.org/MPL/2.0/ Mozilla Public License 2.0
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */
package de.yaio.services.metaextract.server.extractor;

import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;

import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/** 
 * services to extract data via tika
 */
@Service
public class TesseractExtractor extends AbstractExtractor implements Extractor {

    private static Map<String, String> langMap = new HashMap<>();
    static {
        langMap.put("de", "deu");
        langMap.put("en", "eng");
    }

    @Value("${yaio-metaextract-service.tess.datapath}")
    private String tessDatapath;

    @Override
    public String extractText(final InputStream input, final String fileName, final String lang)
            throws IOException, ExtractorException {
        File tmpFile;
        tmpFile = File.createTempFile("metaextractor", "." + FilenameUtils.getExtension(fileName));
        tmpFile.deleteOnExit();
        Files.copy(input, tmpFile.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING);
        return this.extractText(tmpFile, lang);
    }

    @Override
    public String extractText(final File file, final String lang) throws IOException, ExtractorException {
        if (!isFileSupported(file.getName())) {
            return "";
        }

        Tesseract instance = new Tesseract();
        instance.setDatapath(tessDatapath);

        // init lang
        String tessLang = "deu";
        if (!StringUtils.isEmpty(lang) && langMap.containsKey(lang)) {
            tessLang = langMap.get(lang);
        }
        instance.setLanguage(tessLang);

        String result;
        try {
            result = instance.doOCR(file);
        } catch (TesseractException e) {
            if (RuntimeException.class.isInstance(e.getCause())) {
                throw new ExtractorException("error while using tesseract to extract from not supported file",
                        file.getAbsolutePath(), e);
            }
            throw new ExtractorException("error while using tesseract to extract from file", file.getAbsolutePath(),
                    e);
        }
        return result;
    }

    @Override
    public String identifyLanguage(final String text) {
        return "UNKNOWN";
    }

    @Override
    public String getParserName() {
        return "OCR Tesseract";
    }

    protected boolean isFileSupported(final String imageFileName) {
        String imageFormat = imageFileName.substring(imageFileName.lastIndexOf('.') + 1);
        if (imageFormat.matches("(pbm|pgm|ppm)")) {
            imageFormat = "pnm";
        } else if (imageFormat.matches("(jp2|j2k|jpf|jpx|jpm)")) {
            imageFormat = "jpeg2000";
        }
        Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(imageFormat);

        if (!readers.hasNext()) {
            return false;
        }

        return true;
    }

}