com.hyperiongray.rcmp.ReportExtractor.java Source code

Introduction

Here is the source code for com.hyperiongray.rcmp.ReportExtractor.java
Source

package com.hyperiongray.rcmp;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.tika.exception.TikaException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.aspose.pdf.TextExtractionOptions;
import com.google.common.io.Files;
import com.hyperiongray.rcmp.extract.DataKey;
import com.hyperiongray.rcmp.extract.OutputColumns;
import com.hyperiongray.rcmp.extract.Type1Extractor;
import com.hyperiongray.rcmp.extract.Type2Extractor;

/**
 * Converts the PDF reports found in an input directory into separator-delimited
 * text files.
 * <p>
 * Two kinds of report are distinguished (see {@link #determineFileType(String)});
 * each kind is written to its own output file. When name obfuscation is enabled,
 * the name fields of a report are replaced by the hash key of its
 * {@code KeyEntry}, and a separate "key" file mapping the real names to the hash
 * keys is written at the end of the run.
 * <p>
 * The class is a singleton; configure it with {@link #setInputDir(String)} and
 * {@link #setOutputFile(String)} before calling {@link #doConvert()}.
 *
 * @author mark
 */
public class ReportExtractor {

    private static final Logger logger = LoggerFactory.getLogger(ReportExtractor.class);
    private static final boolean OBFUSCATE_NAMES = true;

    /** Marker token inserted into the token stream where a new line was detected. */
    public static final String SEPARATOR = "#separator";
    /** Marker token inserted (after {@link #SEPARATOR}) where a new paragraph was detected. */
    public static final String PARAGRAPH = "#paragraph";

    private final String[] key_fileColumns = { "Person name", "Report/Ticket number", "Officer name",
            "Offender name", "Unique key                        " // spaces for Excel cell formatting
    };

    private String outputFile;
    private String inputDir;

    // not lazy initialization, to avoid threading problems
    private static final ReportExtractor instance = new ReportExtractor();

    private int docCount;
    private static final String separator = "|";

    /**
     * @return the single shared instance
     */
    public static ReportExtractor getInstance() {
        return instance;
    }

    private ReportExtractor() {
        // singleton
        initAsposeLicense();
    }

    /**
     * Converts every {@code *.pdf} file directly inside the input directory and
     * then writes the key file.
     * <p>
     * A file whose conversion fails with an {@link IOException} or
     * {@link TikaException} is logged and skipped; the remaining files are still
     * processed.
     *
     * @throws IOException if the input directory cannot be listed or an output
     *                     file cannot be (re)created or written
     * @throws IllegalStateException if the input directory or output file is not set
     */
    public void doConvert() throws IOException {
        logger.info("Preparing to convert here: {}, output results there: {}", inputDir, outputFile);
        if (getInputDir() == null) {
            throw new IllegalStateException("Input directory is not set");
        }
        // listFiles() returns null (not an empty array) when the path is not a
        // directory or cannot be read; check before touching any output file.
        File[] files = new File(getInputDir()).listFiles();
        if (files == null) {
            throw new IOException("Cannot list input directory: " + getInputDir());
        }
        createOutputFiles();
        // for each report, add extracted information to the appropriate target files
        for (File file : files) {
            try {
                if (file.isFile() && isPdf(file.getName())) {
                    convertFile(file);
                }
            } catch (IOException | TikaException e) {
                logger.error("Problem converting file {}", file.getName(), e);
            }
        }
        writeKeyFile();
    }

    /** Tells whether the file name has some base name followed by a ".pdf" extension (any case). */
    private boolean isPdf(String fileName) {
        return fileName.length() > 4
                && ".pdf".equalsIgnoreCase(fileName.substring(fileName.length() - 4));
    }

    /** Extracts one report, registers its key entry, obfuscates names and appends the row to the output. */
    private void convertFile(File file) throws IOException, TikaException {
        ExtractedData data = extractInfo(file);
        String personName = resolvePersonName(data);
        String ticketNumber = data.getData().get(DataKey.REPORT_NO);
        String officerName = data.getData().get(DataKey.OFFICER_NOTES_ISSUING_OFFICER);
        String offenderName = data.getData().get(DataKey.OFFICER_NOTES_OFFENDER);
        KeyEntry keyEntry = null;
        if (!StringUtils.isEmpty(ticketNumber)) {
            keyEntry = new KeyEntry(personName, ticketNumber, officerName, offenderName);
            KeyTable.getInstance().put(keyEntry);
        } else {
            // Only the ticket number decides whether a key entry is created.
            logger.warn("No ticket number found for file {}; names are left as extracted", file.getName());
        }
        if (keyEntry != null && OBFUSCATE_NAMES) {
            obfuscateNames(data, keyEntry);
        }
        saveData(data);
        ++docCount;
    }

    /**
     * Determines the person name: taken from the summary when it contains one,
     * otherwise built from first and last name when both are present.
     */
    private String resolvePersonName(ExtractedData data) {
        String personName = getPersonNameFromSummary(data.getData().get(DataKey.SUMMARY));
        if (StringUtils.isEmpty(personName)) { // type2 case
            String firstName = data.getData().get(DataKey.FIRST_NAME);
            String lastName = data.getData().get(DataKey.LAST_NAME);
            if (!StringUtils.isEmpty(firstName) && !StringUtils.isEmpty(lastName)) {
                personName = firstName + " " + lastName;
            }
        }
        return personName;
    }

    /** Replaces the name fields, and the person name inside the summary, with the entry's hash key. */
    private void obfuscateNames(ExtractedData data, KeyEntry keyEntry) {
        String hash = keyEntry.getHashKey();
        obfuscate(data, DataKey.FIRST_NAME, hash);
        obfuscate(data, DataKey.LAST_NAME, hash);
        obfuscate(data, DataKey.OFFICER_NOTES_ISSUING_OFFICER, hash);
        obfuscate(data, DataKey.OFFICER_NOTES_OFFENDER, hash);
        String summary = data.getData().get(DataKey.SUMMARY);
        String personName = keyEntry.getPersonName();
        // Literal replacement: the name is data, not a regular expression, and an
        // empty name must not match (it would match between every character).
        if (!StringUtils.isEmpty(summary) && !StringUtils.isEmpty(personName)) {
            data.getData().put(DataKey.SUMMARY, summary.replace(personName, hash));
        }
    }

    /** Overwrites the value stored under {@code key} with {@code hash}, but only if a non-empty value exists. */
    private void obfuscate(ExtractedData data, DataKey key, String hash) {
        if (!StringUtils.isEmpty(data.getData().get(key))) {
            data.getData().put(key, hash);
        }
    }

    /** Recreates both typed output files, each starting with its column-header line. */
    private void createOutputFiles() throws IOException {
        startFile(getOutputFile1(), flatten(convertToColumnNames(OutputColumns.TYPE_1), separator));
        startFile(getOutputFile2(), flatten(convertToColumnNames(OutputColumns.TYPE_2), separator));
        logger.info("Will output into two files: {} and {}", getOutputFile1(), getOutputFile2());
    }

    /** Recreates the key file: a header line followed by one line per entry of the key table. */
    private void writeKeyFile() throws IOException {
        logger.info("Writing the key file: {}", getOutputKeyFile());
        StringBuilder content = new StringBuilder(flatten(key_fileColumns, separator));
        Map<String, KeyEntry> keyTable = KeyTable.getInstance().getKeyTable();
        for (KeyEntry entry : keyTable.values()) {
            String[] row = { entry.getPersonName(), entry.getTicketNumber(), entry.getOfficerName(),
                    entry.getOffenderName(), entry.getHashKey() };
            content.append(flatten(row, separator));
        }
        startFile(getOutputKeyFile(), content.toString());
    }

    /**
     * Removes a previous version of the file, if any, and writes {@code content}
     * as its new beginning.
     *
     * @throws IOException if an existing file cannot be deleted (appending to it
     *                     would mix old and new data) or the write fails
     */
    private void startFile(String path, String content) throws IOException {
        File target = new File(path);
        if (target.exists() && !target.delete()) {
            throw new IOException("Cannot delete previous output file " + path);
        }
        Files.append(content, target, Charset.defaultCharset());
    }

    /** Maps each data key to its field name, keeping the order. */
    private String[] convertToColumnNames(DataKey[] dataKeys) {
        String[] names = new String[dataKeys.length];
        for (int i = 0; i < dataKeys.length; i++) {
            names[i] = dataKeys[i].fieldName();
        }
        return names;
    }

    /**
     * Extracts the data of a single PDF report.
     *
     * @param file the PDF file to read
     * @return the data produced by the extractor matching the detected file type
     * @throws IOException   declared for the extraction steps
     * @throws TikaException declared for the extraction steps
     * @throws IllegalStateException if the detected file type is neither 1 nor 2
     */
    public ExtractedData extractInfo(File file) throws IOException, TikaException {
        String pdfText = extractWithAspose(file);
        int fileType = determineFileType(pdfText);
        ExtractedData extractedData;
        List<String> tokens = extractTokensWithAspose(file);
        if (fileType == 1) {
            extractedData = new Type1Extractor().extractData(tokens, pdfText);
        } else if (fileType == 2) {
            extractedData = new Type2Extractor().extractData(tokens);
        } else {
            throw new IllegalStateException("Unknown file type " + fileType);
        }
        logger.debug("File: {}", file.getPath());
        logger.trace(pdfText);
        return extractedData;
    }

    /**
     * Reads the text fragments of the PDF and turns them into a token list.
     * Fragments judged to belong to the same word are merged into one token;
     * {@link #SEPARATOR} and {@link #PARAGRAPH} markers are inserted where a new
     * line or paragraph is detected; ignore words are dropped.
     */
    private List<String> extractTokensWithAspose(File file) {
        com.aspose.pdf.Document pdfDocument = new com.aspose.pdf.Document(file.getPath());
        com.aspose.pdf.TextFragmentAbsorber tfa = new com.aspose.pdf.TextFragmentAbsorber();
        TextExtractionOptions teo = new TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Raw);
        tfa.setExtractionOptions(teo);
        pdfDocument.getPages().accept(tfa);
        // create TextFragment Collection instance
        com.aspose.pdf.TextFragmentCollection tfc = tfa.getTextFragments();
        List<String> tokens = new ArrayList<String>();
        // the fragment collection is addressed with indices 1..size() here
        for (int i = 1; i <= tfc.size(); i++) {
            String trimmed = tfc.get_Item(i).getText().trim();
            // whitespace-only fragments are kept as a single blank
            String token = trimmed.isEmpty() ? " " : trimmed;
            if (i > 1 && Utils.isProbablySameWord(tfc.get_Item(i - 1).getRectangle(),
                    tfc.get_Item(i).getRectangle())) {
                // Continue the previous token. The list can be empty when every
                // earlier fragment was an ignore word; then there is nothing to merge with.
                if (!tokens.isEmpty()) {
                    token = tokens.remove(tokens.size() - 1) + token;
                }
            } else if (i > 1 && Utils.isProbablyNewLine(tfc.get_Item(i - 1).getRectangle(),
                    tfc.get_Item(i).getRectangle())) {
                boolean newParagraph = Utils.isProbablyNewParagraph(tfc.get_Item(i - 1).getRectangle(),
                        tfc.get_Item(i).getRectangle());
                tokens.add(ReportExtractor.SEPARATOR);
                if (newParagraph) {
                    tokens.add(ReportExtractor.PARAGRAPH);
                }
            }
            if (!Utils.isIgnoreWord(token)) {
                tokens.add(token);
            }
        }
        // final clean-up: trim merged tokens, except those starting with a line break
        for (int i = 0; i < tokens.size(); i++) {
            String current = tokens.get(i);
            if (!current.isEmpty() && current.charAt(0) != '\n') {
                tokens.set(i, current.trim());
            }
        }
        return tokens;
    }

    /** Appends one line with the report's values, in column order, to the output file of its type. */
    private void saveData(ExtractedData data) throws IOException {
        boolean typeOne = data.getFileType() == 1;
        String typedOutputFile = typeOne ? getOutputFile1() : getOutputFile2();
        DataKey[] columns = typeOne ? OutputColumns.TYPE_1 : OutputColumns.TYPE_2;
        String[] values = new String[columns.length];
        for (int i = 0; i < columns.length; i++) {
            String value = data.getData().get(columns[i]);
            // missing columns become empty cells so the row stays aligned with the header
            values[i] = value != null ? value : "";
        }
        Files.append(flatten(values, separator), new File(typedOutputFile), Charset.defaultCharset());
    }

    /**
     * Extracts the whole text of the PDF. Failures are logged, not thrown; in
     * that case a placeholder message naming the file is returned instead.
     */
    private String extractWithAspose(File file) {
        String extractedText = "Text from file " + file.getPath() + " could not be extracted";
        try {
            com.aspose.pdf.Document pdfDocument = new com.aspose.pdf.Document(file.getPath());
            com.aspose.pdf.TextAbsorber textAbsorber = new com.aspose.pdf.TextAbsorber();
            pdfDocument.getPages().accept(textAbsorber);
            extractedText = textAbsorber.getText();
        } catch (Exception e) {
            logger.error("Problem extracting PDF from " + file.getPath(), e);
        }
        return extractedText;
    }

    /** Loads the Aspose license from the classpath resource; a failure is only logged. */
    private void initAsposeLicense() {
        com.aspose.pdf.License license = new com.aspose.pdf.License();
        try {
            license.setLicense(ReportExtractor.class.getResourceAsStream("/Aspose.Pdf.lic"));
        } catch (Exception e) {
            logger.error("Aspose license problem", e);
        }
    }

    /**
     * Joins the trimmed values into one line terminated by a line feed.
     *
     * @param values    the cells; null-safety is delegated to {@code Utils.notNull}
     * @param separator text placed between two neighbouring cells (any length)
     * @return the joined line; just {@code "\n"} for an empty array
     */
    private String flatten(String[] values, String separator) {
        logger.debug("Flattening {} keys", values.length);
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < values.length; i++) {
            String cell = Utils.notNull(values[i]).trim();
            logger.debug(cell);
            // write the separator between cells instead of stripping a trailing
            // one, so separators of any length are handled correctly
            if (i > 0) {
                builder.append(separator);
            }
            builder.append(cell);
        }
        return builder.append('\n').toString();
    }

    /**
     * @return the outputFile
     */
    public String getOutputFile() {
        return outputFile;
    }

    /**
     * @return the outputFile of type 1: the output file name with "1" inserted
     *         before its extension
     */
    public String getOutputFile1() {
        return insertBeforeExtension("1");
    }

    /**
     * @return the outputFile of type 2: the output file name with "2" inserted
     *         before its extension
     */
    public String getOutputFile2() {
        return insertBeforeExtension("2");
    }

    /**
     * @return the outputKeyFile: the output file name with "key" inserted before
     *         its extension
     */
    public String getOutputKeyFile() {
        return insertBeforeExtension("key");
    }

    /**
     * Inserts {@code marker} in front of the extension of the output file name.
     * Only a dot inside the last path element counts as the start of an
     * extension; when there is none the marker is appended.
     *
     * @throws IllegalStateException if the output file has not been set
     */
    private String insertBeforeExtension(String marker) {
        if (outputFile == null) {
            throw new IllegalStateException("Output file is not set");
        }
        int nameStart = Math.max(outputFile.lastIndexOf('/'), outputFile.lastIndexOf('\\')) + 1;
        int dot = outputFile.lastIndexOf('.');
        if (dot < nameStart) {
            // no extension (or the dot belongs to a directory name)
            dot = outputFile.length();
        }
        return new StringBuilder(outputFile).insert(dot, marker).toString();
    }

    /**
     * @param outputFile the outputFile to set
     */
    public void setOutputFile(String outputFile) {
        this.outputFile = outputFile;
    }

    /**
     * @return the inputDir
     */
    public String getInputDir() {
        return inputDir;
    }

    /**
     * @param inputDir the inputDir to set
     */
    public void setInputDir(String inputDir) {
        this.inputDir = inputDir;
    }

    /**
     * @return the docCount, incremented once for every report saved by
     *         {@link #doConvert()}
     */
    public int getDocCount() {
        return docCount;
    }

    /**
     * @param docCount the docCount to set
     */
    public void setDocCount(int docCount) {
        this.docCount = docCount;
    }

    /**
     * Looks for the first {@code " -"} in the summary and returns what
     * {@code Utils.getUpperCase} extracts starting right after it.
     *
     * @return the extracted name, or an empty string when the summary is null
     *         or contains no {@code " -"}
     */
    private String getPersonNameFromSummary(String value) {
        if (value == null) {
            return "";
        }
        int marker = value.indexOf(" -");
        if (marker < 0) {
            return "";
        }
        return Utils.getUpperCase(value, marker + 2);
    }

    /**
     * @return 2 when the text contains the literal {@code "TICKET   NO:"},
     *         otherwise 1
     */
    private int determineFileType(String pdfText) {
        return pdfText.contains("TICKET   NO:") ? 2 : 1;
    }
}