// Java tutorial
/* Designed and developed by Ismail E. Kartoglu Copyright 2015 King's College London Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package cognition.pipeline.service; import cognition.pipeline.service.anonymisation.AnonymisationService; import com.google.gson.Gson; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.io.IOUtils; import org.apache.log4j.Logger; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; import cognition.common.exception.CanNotProcessCoordinateException; import cognition.common.helper.JsonHelper; import cognition.common.model.DNCWorkCoordinate; import cognition.common.service.DocumentConversionService; import cognition.common.service.FileTypeService; import cognition.common.utils.StringTools; import cognition.pipeline.commandline.CommandLineArgHolder; import cognition.pipeline.data.CoordinatesDao; import cognition.pipeline.data.DNCWorkUnitDao; import cognition.pipeline.data.PatientDao; import cognition.common.model.Individual; import java.io.File; import java.io.FileInputStream; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Optional; @Service public class DNCPipelineService { private static Logger logger = Logger.getLogger(DNCPipelineService.class); @Autowired private AnonymisationService anonymisationService; @Autowired private PatientDao patientDao; 
@Autowired private DNCWorkUnitDao dncWorkUnitDao; @Autowired private DocumentConversionService documentConversionService; @Value("${conversionFormat}") private String conversionFormat; @Autowired private FileTypeService fileTypeService; @Autowired private CoordinatesDao coordinatesDao; @Autowired private PostProcessorService postProcessorService; @Value("${ocrEnabled}") private String ocrEnabled; @Value("${pseudonymEnabled}") private String pseudonymEnabled; private JsonHelper<DNCWorkCoordinate> jsonHelper = new JsonHelper(DNCWorkCoordinate[].class); private CommandLineArgHolder commandLineArgHolder = new CommandLineArgHolder(); private List<DNCWorkCoordinate> failedCoordinates = Collections.synchronizedList(new ArrayList<>()); /** * Anonymise the DNC Work Coordinates (DWC) specified in a view/table in the source DB. * */ public void startCreateModeWithDBView() { logger.info("Retrieving coordinates from DB."); List<DNCWorkCoordinate> dncWorkCoordinates = coordinatesDao.getCoordinates(); dncWorkCoordinates.parallelStream().forEach(this::processSingleCoordinate); logger.info("Finished all."); dumpFailedCoordinates(); } /** * Anonymise the DNC Work Coordinates (DWC) specified in the jSON file * whose path is given as argument. * @param filePath File path of the jSON file that contains DNC Work Coordinates. 
*/ public void startCreateModeWithFile(String filePath) { logger.info("Loading work units from file."); List<DNCWorkCoordinate> workCoordinates = jsonHelper.loadListFromFile(new File(filePath)); workCoordinates.parallelStream().forEach(this::processSingleCoordinate); logger.info("Finished all."); dumpFailedCoordinates(); } public void processCoordinates(List<DNCWorkCoordinate> coordinateQueue) { coordinateQueue.parallelStream().forEach(this::processSingleCoordinate); } public Optional<DNCWorkCoordinate> processTextCoordinate(DNCWorkCoordinate coordinate) { try { logger.info("Anonymising text, coordinates: " + coordinate); String text = dncWorkUnitDao.getTextFromCoordinate(coordinate); if (pseudonymisationIsEnabled()) { Individual individual = patientDao.getPatient(coordinate.getPatientId()); text = anonymisationService.pseudonymisePersonPlainText(individual, text); coordinate.setIndividual(individual); } coordinate.setConversionResult(text); return Optional.of(coordinate); } catch (Exception ex) { logger.info("Could not process coordinate " + coordinate); failedCoordinates.add(coordinate); ex.printStackTrace(); } return Optional.empty(); } public Optional<DNCWorkCoordinate> processBinaryCoordinate(DNCWorkCoordinate coordinate) { try { byte[] bytes = dncWorkUnitDao.getByteFromCoordinate(coordinate); String text = convertBinary(bytes); if (isPDFAndPossiblyOCR(bytes, text)) { text = applyOCRToPDF(coordinate, bytes, text); } if (pseudonymisationIsEnabled()) { logger.info("Pseudonymising binary, coordinates: " + coordinate); Individual individual = patientDao.getPatient(coordinate.getPatientId()); text = pseudonymisePersonText(individual, text); coordinate.setIndividual(individual); } coordinate.setConversionResult(text); return Optional.of(coordinate); } catch (Exception ex) { logger.error("Could not process coordinate " + coordinate); failedCoordinates.add(coordinate); ex.printStackTrace(); } return Optional.empty(); } private String applyOCRToPDF(DNCWorkCoordinate 
coordinate, byte[] bytes, String text) throws CanNotProcessCoordinateException { String metaData = StringTools.getMetaDataFromHTML(text); text = documentConversionService.tryOCRByConvertingToTiff(coordinate, bytes); text = StringTools.addMetaDataToHtml(text, metaData); return text; } private boolean isPDFAndPossiblyOCR(byte[] bytes, String text) { return StringTools.noContentInHtml(text) && fileTypeService.isPDF(bytes); } protected boolean pseudonymisationIsEnabled() { if (!commandLineArgHolder.isNoPseudonym()) { return true; } return "1".equals(pseudonymEnabled) || "true".equalsIgnoreCase(pseudonymEnabled); } private String pseudonymisePersonText(Individual individual, String text) { if (conversionPreferenceIsHTML()) { text = anonymisationService.pseudonymisePersonHTML(individual, text); } else { text = anonymisationService.pseudonymisePersonPlainText(individual, text); } return text; } private String convertBinary(byte[] bytes) { if (conversionPreferenceIsHTML()) { return documentConversionService.convertToXHTML(bytes); } return documentConversionService.convertToText(bytes); } private boolean conversionPreferenceIsHTML() { return conversionFormat.equalsIgnoreCase("html") || conversionFormat.equalsIgnoreCase("xhtml"); } private void saveResult(DNCWorkCoordinate coordinate) { dncWorkUnitDao.saveConvertedText(coordinate); } public void processFile(String absoluteFilePath) { File file = new File(absoluteFilePath); FileInputStream fileInputStream = null; try { fileInputStream = new FileInputStream(file); byte[] bytes = IOUtils.toByteArray(fileInputStream); String text = convertBinary(bytes); DNCWorkCoordinate coordinate = DNCWorkCoordinate.createEmptyCoordinate(); coordinate.setSourceTable(absoluteFilePath); if (isPDFAndPossiblyOCR(bytes, text)) { text = applyOCRToPDF(coordinate, bytes, text); } coordinate.setConversionResult(text); saveResult(coordinate); } catch (Exception e) { logger.error(e.getMessage()); e.printStackTrace(); } finally { 
IOUtils.closeQuietly(fileInputStream); } } public void setConversionFormat(String conversionFormat) { this.conversionFormat = conversionFormat; } public void processSingleCoordinate(DNCWorkCoordinate coordinate) { logger.info("Processing coordinate " + coordinate); Optional<DNCWorkCoordinate> result; if (coordinate.isBinary()) { result = processBinaryCoordinate(coordinate); } else { result = processTextCoordinate(coordinate); } if (!result.isPresent()) { logger.info("Coordinate could not be processed: " + coordinate); return; } DNCWorkCoordinate dncWorkCoordinate = result.get(); if (StringTools.noContentInHtml(dncWorkCoordinate.getConversionResult())) { logger.warn("Not saving empty document at coordinate: " + coordinate); return; } try { postProcessorService.postProcess(dncWorkCoordinate); } catch (Exception ex) { logger.warn("Ignoring exception in post-process: " + ex.getMessage()); } saveResult(dncWorkCoordinate); } public void dumpFailedCoordinates() { if (CollectionUtils.isEmpty(failedCoordinates)) { return; } Gson gson = new Gson(); String failedJson = gson.toJson(failedCoordinates); PrintWriter writer; try { writer = new PrintWriter("failedCoordinates.json", "UTF-8"); writer.println(failedJson); writer.close(); logger.info( "Dumped all failed coordinates to failedCoordinates.json. You can process them by --createMode --file=failedCoordinates.json"); } catch (Exception e) { e.printStackTrace(); } } public CommandLineArgHolder getCommandLineArgHolder() { return commandLineArgHolder; } public void setOcrEnabled(String ocrEnabled) { this.ocrEnabled = ocrEnabled; } public void setPseudonymEnabled(String pseudonymEnabled) { this.pseudonymEnabled = pseudonymEnabled; } }