com.twentyn.patentTextProcessor.WordCountProcessor.java Source code

Introduction

Here is the source code for com.twentyn.patentTextProcessor.WordCountProcessor.java, a PatentProcessor implementation from the 20n/act project that tokenizes a patent document's title, claims, and body text, filters out bare numbers and stop words, and prints the resulting per-word counts as JSON.

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.twentyn.patentTextProcessor;

import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.twentyn.patentExtractor.PatentCorpusReader;
import com.twentyn.patentExtractor.PatentDocument;
import com.twentyn.patentExtractor.PatentProcessor;
import edu.emory.clir.clearnlp.component.utils.NLPUtils;
import edu.emory.clir.clearnlp.tokenization.AbstractTokenizer;
import edu.emory.clir.clearnlp.util.lang.TLanguage;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPathExpressionException;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

public class WordCountProcessor implements PatentProcessor {

    public static final Logger LOGGER = Logger.getLogger(WordCountProcessor.class);

    public static class PatentDocumentWordCount {
        @JsonProperty("patent_file_path")
        String patentFilePath;
        @JsonProperty("word_count")
        Map<String, Integer> wordCount;

        public PatentDocumentWordCount(String patentFilePath, Map<String, Integer> wordCount) {
            this.patentFilePath = patentFilePath;
            this.wordCount = wordCount;
        }

        public String getPatentFilePath() {
            return patentFilePath;
        }

        public Map<String, Integer> getWordCount() {
            return wordCount;
        }
    }

    ObjectMapper objectMapper = new ObjectMapper();
    {
        objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
        objectMapper.enable(SerializationFeature.ORDER_MAP_ENTRIES_BY_KEYS);
    }

    private AbstractTokenizer tokenizer;
    private Set<String> stopWords = new HashSet<>();

    public WordCountProcessor() {
        this.tokenizer = NLPUtils.getTokenizer(TLanguage.ENGLISH);
    }

    public void readStopWords(File path) throws IOException {
        try (BufferedReader r = new BufferedReader(new FileReader(path))) {
            String line = null;
            while ((line = r.readLine()) != null) {
                this.stopWords.add(line);
            }
        }
    }

    public static final Pattern EXCLUDE_NUMBERS = Pattern.compile("^[+\\-]?[0-9\\.]+$");

    public void processPatentText(File patentFile, Reader patentTextReader, int patentTextLength)
            throws IOException, ParserConfigurationException, SAXException, TransformerConfigurationException,
            TransformerException, XPathExpressionException {

        PatentDocument doc = PatentDocument.patentDocumentFromXMLStream(new ReaderInputStream(patentTextReader));
        if (doc == null) {
            LOGGER.warn(String.format("Got null patent document object for patent at %s",
                    patentFile.getAbsolutePath()));
            return;
        }
        // Put all the text back together into one big string.  Sigh.
        List<String> textFields = new ArrayList<String>(
                1 + doc.getClaimsText().size() + doc.getTextContent().size());
        textFields.add(doc.getTitle());
        textFields.addAll(doc.getClaimsText());
        textFields.addAll(doc.getTextContent());
        String joinedText = StringUtils.join(textFields, "\n");

        // TODO: parallelize this.  Because come on, it's the nineties and Map Reduce is almost a thousand years old.
        Map<String, Integer> wordCount = new HashMap<>();
        List<String> words = this.tokenizer.tokenize(joinedText);
        for (String word : words) {
            if (EXCLUDE_NUMBERS.matcher(word).matches()) {
                continue; // Ignore words that are just numbers (with decimals)
            }

            word = word.toLowerCase();
            if (stopWords.contains(word)) {
                continue; // Ignore stop words.
            }

            Integer count = wordCount.get(word);
            if (count == null) {
                count = 0; // First occurrence of this word.
            }
            wordCount.put(word, count + 1);
        }

        PatentDocumentWordCount pdwc = new PatentDocumentWordCount(patentFile.getAbsolutePath(), wordCount);
        System.out.println(objectMapper.writeValueAsString(pdwc));
    }

    public static void main(String[] args) throws Exception {
        System.out.println("Starting up...");
        System.out.flush();
        Options opts = new Options();
        opts.addOption(Option.builder("i").longOpt("input").hasArg().required()
                .desc("Input file or directory to score").build());
        opts.addOption(Option.builder("h").longOpt("help").desc("Print this help message and exit").build());
        opts.addOption(Option.builder("v").longOpt("verbose").desc("Print verbose log output").build());

        HelpFormatter helpFormatter = new HelpFormatter();
        CommandLineParser cmdLineParser = new DefaultParser();
        CommandLine cmdLine = null;
        try {
            cmdLine = cmdLineParser.parse(opts, args);
        } catch (ParseException e) {
            System.out.println("Caught exception when parsing command line: " + e.getMessage());
            helpFormatter.printHelp("WordCountProcessor", opts);
            System.exit(1);
        }

        if (cmdLine.hasOption("help")) {
            helpFormatter.printHelp("DocumentIndexer", opts);
            System.exit(0);
        }

        String inputFileOrDir = cmdLine.getOptionValue("input");
        File splitFileOrDir = new File(inputFileOrDir);
        if (!(splitFileOrDir.exists())) {
            LOGGER.error("Unable to find input file or directory at " + inputFileOrDir);
            System.exit(1);
        }

        WordCountProcessor wcp = new WordCountProcessor();
        PatentCorpusReader corpusReader = new PatentCorpusReader(wcp, splitFileOrDir);
        corpusReader.readPatentCorpus();
    }
}
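
Usage

The main method above drives the processor through PatentCorpusReader, but the class can also be exercised directly. Below is a minimal sketch of processing a single patent XML document; the stop-word and patent file paths are hypothetical placeholders, and in normal use PatentCorpusReader is what feeds files to processPatentText.

import com.twentyn.patentTextProcessor.WordCountProcessor;

import java.io.File;
import java.io.FileReader;

public class WordCountProcessorExample {
    public static void main(String[] args) throws Exception {
        WordCountProcessor wcp = new WordCountProcessor();

        // Optional: load a newline-delimited stop-word list (hypothetical path).
        wcp.readStopWords(new File("stopwords.txt"));

        // Process one patent XML document (hypothetical path).  The word counts for
        // its title, claims, and body text are printed to stdout as JSON; the length
        // argument is not used by this implementation.
        File patentFile = new File("patent.xml");
        try (FileReader reader = new FileReader(patentFile)) {
            wcp.processPatentText(patentFile, reader, (int) patentFile.length());
        }
    }
}

From the command line, the equivalent of the main method is roughly java com.twentyn.patentTextProcessor.WordCountProcessor -i /path/to/patent/corpus, with the project and its dependencies on the classpath; each patent's counts are emitted as a JSON object with patent_file_path and word_count fields.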