cognition.pipeline.service.DNCPipelineService.java Source code

Java tutorial

Introduction

Here is the source code for cognition.pipeline.service.DNCPipelineService.java

Source

/*
Designed and developed by Ismail E. Kartoglu
Copyright 2015 King's College London
    
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    
    http://www.apache.org/licenses/LICENSE-2.0
    
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
 */

package cognition.pipeline.service;

import cognition.pipeline.service.anonymisation.AnonymisationService;
import com.google.gson.Gson;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import cognition.common.exception.CanNotProcessCoordinateException;
import cognition.common.helper.JsonHelper;
import cognition.common.model.DNCWorkCoordinate;
import cognition.common.service.DocumentConversionService;
import cognition.common.service.FileTypeService;
import cognition.common.utils.StringTools;
import cognition.pipeline.commandline.CommandLineArgHolder;
import cognition.pipeline.data.CoordinatesDao;
import cognition.pipeline.data.DNCWorkUnitDao;
import cognition.pipeline.data.PatientDao;
import cognition.common.model.Individual;

import java.io.File;
import java.io.FileInputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Optional;

/**
 * Pipeline service that processes DNC Work Coordinates (DWC).
 *
 * <p>For every coordinate the service fetches the text or binary content through
 * {@link DNCWorkUnitDao}, converts binary content to text/XHTML, optionally falls back
 * to OCR for PDFs with no extractable content, optionally pseudonymises the result and
 * finally saves it. Coordinates that throw while being fetched, converted or
 * pseudonymised are collected and can be dumped as JSON with
 * {@link #dumpFailedCoordinates()}.
 */
@Service
public class DNCPipelineService {

    private static final Logger logger = Logger.getLogger(DNCPipelineService.class);

    /** File (relative to the working directory) that failed coordinates are dumped to. */
    private static final String FAILED_COORDINATES_FILE = "failedCoordinates.json";

    @Autowired
    private AnonymisationService anonymisationService;

    @Autowired
    private PatientDao patientDao;

    @Autowired
    private DNCWorkUnitDao dncWorkUnitDao;

    @Autowired
    private DocumentConversionService documentConversionService;

    /** Target format of binary conversion; "html"/"xhtml" select XHTML, anything else plain text. */
    @Value("${conversionFormat}")
    private String conversionFormat;

    @Autowired
    private FileTypeService fileTypeService;

    @Autowired
    private CoordinatesDao coordinatesDao;

    @Autowired
    private PostProcessorService postProcessorService;

    // NOTE(review): this value is injected and settable but never read in this class, so the
    // OCR fallback currently runs regardless of it - confirm whether it should gate OCR.
    @Value("${ocrEnabled}")
    private String ocrEnabled;

    /** Property switch for pseudonymisation; "1" or "true" (case-insensitive) enable it. */
    @Value("${pseudonymEnabled}")
    private String pseudonymEnabled;

    private JsonHelper<DNCWorkCoordinate> jsonHelper = new JsonHelper(DNCWorkCoordinate[].class);

    private CommandLineArgHolder commandLineArgHolder = new CommandLineArgHolder();

    /** Coordinates whose processing threw; synchronized because they are added from parallel streams. */
    private List<DNCWorkCoordinate> failedCoordinates = Collections.synchronizedList(new ArrayList<>());

    /**
     * Anonymise the DNC Work Coordinates (DWC) specified in a view/table in the source DB.
     * Failed coordinates are dumped afterwards, even if processing aborts with an exception.
     */
    public void startCreateModeWithDBView() {
        logger.info("Retrieving coordinates from DB.");

        List<DNCWorkCoordinate> dncWorkCoordinates = coordinatesDao.getCoordinates();

        processAllThenDumpFailures(dncWorkCoordinates);
    }

    /**
     * Anonymise the DNC Work Coordinates (DWC) specified in the JSON file
     * whose path is given as argument.
     * Failed coordinates are dumped afterwards, even if processing aborts with an exception.
     *
     * @param filePath File path of the JSON file that contains DNC Work Coordinates.
     */
    public void startCreateModeWithFile(String filePath) {
        logger.info("Loading work units from file.");

        List<DNCWorkCoordinate> workCoordinates = jsonHelper.loadListFromFile(new File(filePath));

        processAllThenDumpFailures(workCoordinates);
    }

    /**
     * Processes every coordinate in parallel and always dumps the failed ones afterwards.
     * The dump sits in {@code finally} so that an exception escaping the stream (for example
     * from saving a result) does not discard the list of coordinates that already failed.
     */
    private void processAllThenDumpFailures(List<DNCWorkCoordinate> coordinates) {
        try {
            coordinates.parallelStream().forEach(this::processSingleCoordinate);
            logger.info("Finished all.");
        } finally {
            dumpFailedCoordinates();
        }
    }

    /**
     * Processes the given coordinates in parallel. Unlike the {@code startCreateMode*}
     * methods this does not dump failed coordinates.
     *
     * @param coordinateQueue coordinates to process.
     */
    public void processCoordinates(List<DNCWorkCoordinate> coordinateQueue) {
        coordinateQueue.parallelStream().forEach(this::processSingleCoordinate);
    }

    /**
     * Loads the text of a coordinate, pseudonymises it as plain text when pseudonymisation
     * is enabled, and stores the outcome as the coordinate's conversion result.
     *
     * @param coordinate coordinate pointing at text content.
     * @return the updated coordinate, or an empty Optional if any step threw; in that case
     *         the coordinate is remembered as failed.
     */
    public Optional<DNCWorkCoordinate> processTextCoordinate(DNCWorkCoordinate coordinate) {
        try {
            logger.info("Anonymising text, coordinates: " + coordinate);
            String text = dncWorkUnitDao.getTextFromCoordinate(coordinate);
            if (pseudonymisationIsEnabled()) {
                Individual individual = patientDao.getPatient(coordinate.getPatientId());
                text = anonymisationService.pseudonymisePersonPlainText(individual, text);
                coordinate.setIndividual(individual);
            }
            coordinate.setConversionResult(text);
            return Optional.of(coordinate);
        } catch (Exception ex) {
            recordFailure(coordinate, ex);
        }

        return Optional.empty();
    }

    /**
     * Loads the bytes of a coordinate, converts them to text/XHTML, falls back to OCR for
     * PDFs without extractable content, pseudonymises the text when pseudonymisation is
     * enabled, and stores the outcome as the coordinate's conversion result.
     *
     * @param coordinate coordinate pointing at binary content.
     * @return the updated coordinate, or an empty Optional if any step threw; in that case
     *         the coordinate is remembered as failed.
     */
    public Optional<DNCWorkCoordinate> processBinaryCoordinate(DNCWorkCoordinate coordinate) {
        try {
            byte[] bytes = dncWorkUnitDao.getByteFromCoordinate(coordinate);
            String text = convertBinary(bytes);
            if (isPDFAndPossiblyOCR(bytes, text)) {
                text = applyOCRToPDF(coordinate, bytes, text);
            }
            if (pseudonymisationIsEnabled()) {
                logger.info("Pseudonymising binary, coordinates: " + coordinate);
                Individual individual = patientDao.getPatient(coordinate.getPatientId());
                text = pseudonymisePersonText(individual, text);
                coordinate.setIndividual(individual);
            }

            coordinate.setConversionResult(text);
            return Optional.of(coordinate);
        } catch (Exception ex) {
            recordFailure(coordinate, ex);
        }

        return Optional.empty();
    }

    /**
     * Logs a processing failure (with its stack trace) and remembers the coordinate so it
     * can be dumped by {@link #dumpFailedCoordinates()}.
     */
    private void recordFailure(DNCWorkCoordinate coordinate, Exception ex) {
        logger.error("Could not process coordinate " + coordinate, ex);
        failedCoordinates.add(coordinate);
    }

    /**
     * Runs OCR on the given PDF bytes and re-attaches the metadata extracted from the
     * original (content-less) conversion result.
     */
    private String applyOCRToPDF(DNCWorkCoordinate coordinate, byte[] bytes, String text)
            throws CanNotProcessCoordinateException {
        // Grab the metadata first: the OCR output replaces the converted text entirely.
        String metaData = StringTools.getMetaDataFromHTML(text);
        text = documentConversionService.tryOCRByConvertingToTiff(coordinate, bytes);
        text = StringTools.addMetaDataToHtml(text, metaData);
        return text;
    }

    /**
     * @return true if the converted text has no content and the bytes are a PDF,
     *         which makes the document a candidate for OCR.
     */
    private boolean isPDFAndPossiblyOCR(byte[] bytes, String text) {
        return StringTools.noContentInHtml(text) && fileTypeService.isPDF(bytes);
    }

    /**
     * Decides whether pseudonymisation should be applied.
     *
     * <p>The command-line "no pseudonym" flag always disables it; otherwise the
     * {@code pseudonymEnabled} property decides ("1" or "true", case-insensitive).
     *
     * @return true if documents should be pseudonymised.
     */
    protected boolean pseudonymisationIsEnabled() {
        if (commandLineArgHolder.isNoPseudonym()) {
            return false;
        }
        return "1".equals(pseudonymEnabled) || "true".equalsIgnoreCase(pseudonymEnabled);
    }

    /**
     * Pseudonymises the text as HTML or plain text, matching the configured conversion format.
     */
    private String pseudonymisePersonText(Individual individual, String text) {
        if (conversionPreferenceIsHTML()) {
            text = anonymisationService.pseudonymisePersonHTML(individual, text);
        } else {
            text = anonymisationService.pseudonymisePersonPlainText(individual, text);
        }
        return text;
    }

    /**
     * Converts binary content to XHTML or plain text, depending on the configured conversion format.
     */
    private String convertBinary(byte[] bytes) {
        if (conversionPreferenceIsHTML()) {
            return documentConversionService.convertToXHTML(bytes);
        }
        return documentConversionService.convertToText(bytes);
    }

    /**
     * @return true if the configured conversion format is "html" or "xhtml" (case-insensitive);
     *         false for any other value, including an unset (null) format.
     */
    private boolean conversionPreferenceIsHTML() {
        // Literal-first comparison keeps this null-safe when the property was never set.
        return "html".equalsIgnoreCase(conversionFormat) || "xhtml".equalsIgnoreCase(conversionFormat);
    }

    private void saveResult(DNCWorkCoordinate coordinate) {
        dncWorkUnitDao.saveConvertedText(coordinate);
    }

    /**
     * Converts a single file from disk (with OCR fallback for content-less PDFs) and saves
     * the result under an empty coordinate whose source table is set to the file path.
     * No pseudonymisation is applied here. Failures are logged and not rethrown.
     *
     * @param absoluteFilePath absolute path of the file to convert.
     */
    public void processFile(String absoluteFilePath) {
        File file = new File(absoluteFilePath);
        try (FileInputStream fileInputStream = new FileInputStream(file)) {
            byte[] bytes = IOUtils.toByteArray(fileInputStream);
            String text = convertBinary(bytes);
            DNCWorkCoordinate coordinate = DNCWorkCoordinate.createEmptyCoordinate();
            coordinate.setSourceTable(absoluteFilePath);
            if (isPDFAndPossiblyOCR(bytes, text)) {
                text = applyOCRToPDF(coordinate, bytes, text);
            }
            coordinate.setConversionResult(text);
            saveResult(coordinate);
        } catch (Exception e) {
            logger.error("Could not process file " + absoluteFilePath + ": " + e.getMessage(), e);
        }
    }

    public void setConversionFormat(String conversionFormat) {
        this.conversionFormat = conversionFormat;
    }

    /**
     * Processes one coordinate end to end: converts/pseudonymises it, skips saving when
     * processing failed or the result has no content, runs post-processing (whose
     * exceptions are logged and ignored) and saves the result.
     *
     * @param coordinate coordinate to process.
     */
    public void processSingleCoordinate(DNCWorkCoordinate coordinate) {
        logger.info("Processing coordinate " + coordinate);

        Optional<DNCWorkCoordinate> result = coordinate.isBinary()
                ? processBinaryCoordinate(coordinate)
                : processTextCoordinate(coordinate);

        if (!result.isPresent()) {
            logger.info("Coordinate could not be processed: " + coordinate);
            return;
        }

        DNCWorkCoordinate dncWorkCoordinate = result.get();
        if (StringTools.noContentInHtml(dncWorkCoordinate.getConversionResult())) {
            logger.warn("Not saving empty document at coordinate: " + coordinate);
            return;
        }

        try {
            postProcessorService.postProcess(dncWorkCoordinate);
        } catch (Exception ex) {
            // Post-processing is best effort: the converted document is still saved.
            logger.warn("Ignoring exception in post-process: " + ex.getMessage(), ex);
        }

        saveResult(dncWorkCoordinate);
    }

    /**
     * Writes all coordinates that failed so far to {@code failedCoordinates.json} as JSON.
     * Does nothing when there are no failed coordinates. Write problems are logged and
     * not rethrown.
     */
    public void dumpFailedCoordinates() {
        // Snapshot first: iterating a synchronized list is not safe while others may add to it.
        List<DNCWorkCoordinate> snapshot = new ArrayList<>(failedCoordinates);
        if (CollectionUtils.isEmpty(snapshot)) {
            return;
        }
        String failedJson = new Gson().toJson(snapshot);
        try (PrintWriter writer = new PrintWriter(FAILED_COORDINATES_FILE, "UTF-8")) {
            writer.println(failedJson);
            // PrintWriter swallows I/O errors; checkError() flushes and reports them.
            if (writer.checkError()) {
                logger.error("Could not write failed coordinates to " + FAILED_COORDINATES_FILE);
                return;
            }
        } catch (Exception e) {
            logger.error("Could not dump failed coordinates to " + FAILED_COORDINATES_FILE, e);
            return;
        }
        logger.info(
                "Dumped all failed coordinates to failedCoordinates.json. You can process them by --createMode --file=failedCoordinates.json");
    }

    public CommandLineArgHolder getCommandLineArgHolder() {
        return commandLineArgHolder;
    }

    public void setOcrEnabled(String ocrEnabled) {
        this.ocrEnabled = ocrEnabled;
    }

    public void setPseudonymEnabled(String pseudonymEnabled) {
        this.pseudonymEnabled = pseudonymEnabled;
    }
}