com.martinwunderlich.nlp.io.LexisNexisParser.java Source code

Java tutorial

Introduction

Here is the source code for com.martinwunderlich.nlp.io.LexisNexisParser.java

Source

/*************************************************************************************************************
 * LexisNexisParser - A Java wrapper for LexisNexis data.
 * Copyright 2015 Martin Wunderlich
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of the
 * 
 * GNU Lesser General Public License (LGPL)
 * 
 * as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details. You should have received a copy
 * of the GNU Lesser General Public License along with this library. If not, see http://www.gnu.org/licenses/.
 **************************************************************************************************************/

package com.martinwunderlich.nlp.io;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.commons.io.FileUtils;

/**
 * A parser that creates instances of {@link LexisNexisDocument}
 * from the documents found in a given file.
 * Meta data lines are recognised by the regexes in config.properties:
 * each property key names a meta data type and its value is the regex
 * whose first capturing group holds the meta data value.
 * 
 * @author Martin Wunderlich (martin@wunderlich.com)
 *
 */
public class LexisNexisParser {

    /** Meta data type (property key) mapped to its pre-compiled regex; filled by the constructor. */
    Map<String, Pattern> metaDataMapping = null;
    /** Raw configuration as loaded from {@link #propertiesFile}. */
    Properties config = new Properties();
    /** Name of the classpath resource holding the meta data regexes. */
    String propertiesFile = "config.properties";
    /** Matches the marker line that opens each document, e.g. "   Document 1 of 10". */
    Pattern docStartPattern = Pattern.compile("^\\s+(Dokument|Document) [0-9]+ (von|of) [0-9]+$"); // TODO: Support other languages than EN and DE?

    /**
     * Creates a parser, loading the configuration from the classpath and
     * compiling the meta data regexes.
     *
     * @throws RuntimeException if the configuration file is missing or cannot be read
     */
    public LexisNexisParser() {
        loadConfiguration();
        this.metaDataMapping = readMetaMappingFromConfig();
    }

    /**
     * Loads {@link #propertiesFile} from the classpath into {@link #config}.
     * Any I/O problem (including a missing file) is wrapped in a RuntimeException.
     */
    private void loadConfiguration() {
        // try-with-resources closes the stream; a null resource is simply not closed.
        try (InputStream is = getClass().getClassLoader().getResourceAsStream(propertiesFile)) {
            if (is == null) {
                throw new FileNotFoundException("Configuration file is missing. Looked for: " + propertiesFile);
            }
            config.load(is);
        } catch (IOException e) {
            throw new RuntimeException(e); // wrap and rethrow
        }
    }

    /**
     * Parses the file at the given path.
     *
     * @param filePath path of the file to parse
     * @return the parsed documents, or {@code null} if the file could not be read
     */
    public List<LexisNexisDocument> parse(String filePath) {
        File file = new File(filePath);

        return this.parse(file);
    }

    /**
     * Parses the given file line by line into documents.
     * <p>
     * A line matching {@link #docStartPattern} opens a new document. The first
     * non-meta line after it is taken as the headline; the body starts at the
     * first non-meta line that follows two or more empty lines after the headline
     * and ends at the next meta line. Everything from the copyright marker up to
     * the next document start is collected as copyright text. Lines that appear
     * before the first document start are ignored.
     *
     * @param parseFile the file to parse
     * @return the parsed documents (possibly empty), or {@code null} if the file
     *         could not be read (kept for backward compatibility)
     */
    public List<LexisNexisDocument> parse(File parseFile) {
        List<LexisNexisDocument> resultList = new ArrayList<>();
        List<String> lines;
        try {
            lines = FileUtils.readLines(parseFile);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }

        LexisNexisDocument currentDoc = null;

        boolean inBody = false;
        boolean afterBody = false;
        boolean inHeader = false;
        boolean afterHeadline = false;
        boolean inCopyright = false;

        StringBuilder bodyBuilder = null;
        StringBuilder copyRightBuilder = null;

        int emptyHeaderLines = 0; // keeps track of consecutive empty lines in header: two or more of these after the headline mean that the body texts starts (argh).

        for (String line : lines) {
            if (line.isEmpty() && !inBody) {
                emptyHeaderLines++;
                continue; // skip empty lines outside of body
            }

            if (isDocumentStart(line)) {
                if (currentDoc != null) {
                    finishDocument(currentDoc, bodyBuilder, copyRightBuilder, resultList);
                }

                // Init: every piece of per-document state must be reset here,
                // including afterBody, otherwise later documents never enter the body.
                bodyBuilder = new StringBuilder();
                copyRightBuilder = new StringBuilder();
                currentDoc = new LexisNexisDocument();
                inHeader = true;
                afterHeadline = false;
                inBody = false;
                afterBody = false;
                inCopyright = false;
                emptyHeaderLines = 0;

                continue;
            }

            if (currentDoc == null) {
                continue; // preamble before the first document: nothing to attach the line to
            }

            if (inCopyright || isCopyrightStart(line)) {
                appendJoined(copyRightBuilder, line.trim());
                inCopyright = true;
                continue;
            }

            String type = getMetaLineType(line);
            if (type != null) {
                addMetaLine(line, type, currentDoc);
                emptyHeaderLines = 0;
                if (inBody) { // a meta line after body text closes the body
                    inBody = false;
                    afterBody = true;
                }
            } else if (inBody) {
                if (line.isEmpty()) {
                    bodyBuilder.append("\n"); // keep paragraph breaks
                } else {
                    appendJoined(bodyBuilder, line.trim());
                }
            } else if (inHeader && !afterHeadline) { // in header and line is not meta? this must be the headline
                currentDoc.setHeadline(line.trim());
                afterHeadline = true;
                emptyHeaderLines = 0;
            } else if (emptyHeaderLines >= 2 && !afterBody) { // first line of body text found
                inBody = true;
                inHeader = false;
                appendJoined(bodyBuilder, line.trim());
            }
        }

        // Store final doc in list
        if (currentDoc != null) {
            finishDocument(currentDoc, bodyBuilder, copyRightBuilder, resultList);
        }

        return resultList;
    }

    /**
     * Stores the collected body and copyright text on the document and adds it to the result list.
     */
    private static void finishDocument(LexisNexisDocument doc, StringBuilder body, StringBuilder copyright,
            List<LexisNexisDocument> sink) {
        doc.setText(body.toString());
        doc.setCopyright(copyright.toString());
        sink.add(doc);
    }

    /**
     * Appends text to the builder, inserting a single space first when the builder
     * already ends in a non-whitespace character, so that the last word of one line
     * and the first word of the next are not fused together.
     */
    private static void appendJoined(StringBuilder target, String text) {
        if (text.isEmpty()) {
            return;
        }
        int length = target.length();
        if (length > 0 && !Character.isWhitespace(target.charAt(length - 1))) {
            target.append(' ');
        }
        target.append(text);
    }

    /**
     * Tells whether the line opens the copyright section, i.e. contains the
     * indented "Copyright" marker.
     */
    private boolean isCopyrightStart(String line) {
        return line.contains("            Copyright");
    }

    /**
     * Extracts the value of a meta data line (first capturing group of the regex
     * configured for the given type) and stores it on the document.
     * Lines that do not match, regexes without a capturing group, and unknown
     * types are silently ignored.
     *
     * @param line the raw meta data line
     * @param type the meta data type, i.e. a key of {@link #metaDataMapping}
     * @param currentDoc the document to update
     */
    private void addMetaLine(String line, String type, LexisNexisDocument currentDoc) {
        Pattern pattern = metaDataMapping.get(type);
        if (pattern == null || currentDoc == null) {
            return;
        }

        Matcher m = pattern.matcher(line);
        if (!m.matches() || m.groupCount() < 1) {
            return; // no match, or the configured regex has no capturing group
        }

        String captured = m.group(1);
        if (captured == null) {
            return; // group 1 did not take part in the match
        }

        String matchingGroup = captured.trim();
        try {
            switch (type) { // TODO MW: not so nice to have this list of types hard-coded; find a better way
            case "publication":
                currentDoc.setPublication(matchingGroup);
                break;
            case "byline":
                currentDoc.setByline(matchingGroup);
                break;
            case "journalCode":
                currentDoc.setjournalCode(matchingGroup);
                break;
            case "language":
                currentDoc.setLanguage(matchingGroup);
                break;
            case "length":
                currentDoc.setLength(matchingGroup);
                break;
            case "loadDate":
                currentDoc.setLoadDate(matchingGroup);
                break;
            case "documentDate":
                currentDoc.setDocumentDate(matchingGroup);
                break;
            case "publicationType":
                currentDoc.setPublicationType(matchingGroup);
                break;
            default:
                // configured type without a matching document field: nothing to store
                break;
            }
        } catch (Exception ex) {
            // best effort: report the problem on stderr and skip this meta line
            ex.printStackTrace();
        }
    }

    /**
     * Finds the meta data type whose regex matches the whole line.
     *
     * @return the key of the first matching entry of {@link #metaDataMapping},
     *         or {@code null} if the line is not a meta data line
     */
    private String getMetaLineType(String line) {
        for (Map.Entry<String, Pattern> entry : metaDataMapping.entrySet()) {
            if (entry.getValue().matcher(line).matches()) {
                return entry.getKey();
            }
        }

        return null;
    }

    /**
     * Tells whether the line is a document marker as defined by {@link #docStartPattern}.
     */
    private boolean isDocumentStart(String line) {
        return docStartPattern.matcher(line).matches();
    }

    /**
     * Create a map of pre-compiled regexes. Non-regexes in the properties file
     * are simply skipped.
     */
    private Map<String, Pattern> readMetaMappingFromConfig() {
        Map<String, Pattern> mapping = new HashMap<>();

        for (String key : config.stringPropertyNames()) {
            String regex = config.getProperty(key);
            try {
                mapping.put(key, Pattern.compile(regex));
            } catch (PatternSyntaxException ignored) {
                // intentionally skipped: the property value is not a valid regex
            }
        }

        return mapping;
    }
}