com.ibm.watson.catalyst.corpus.tfidf.ApplyTemplate.java Source code

Java tutorial

Introduction

Here is the source code for com.ibm.watson.catalyst.corpus.tfidf.ApplyTemplate.java

Source

/*******************************************************************************
 * Copyright 2015 IBM Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package com.ibm.watson.catalyst.corpus.tfidf;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.ibm.watson.catalyst.corpus.tfidf.corpus.TermCorpus;
import com.ibm.watson.catalyst.corpus.tfidf.corpus.TermCorpusBuilder;
import com.ibm.watson.catalyst.corpus.tfidf.document.DocumentMatcher;
import com.ibm.watson.catalyst.corpus.tfidf.document.TermDocument;
import com.ibm.watson.catalyst.corpus.tfidf.term.TemplateMatch;

/**
 * Command-line tool that applies a fixed set of question templates to a corpus
 * and writes the resulting questions to a text file.
 *
 * <p>The tool reads the {@code "documents"} array of {@code tfidf-health-1.json},
 * builds a {@link TermCorpus} from {@code health-corpus.json}, and walks both in
 * lock-step: each JSON entry is turned into regular expressions by
 * {@code Template.getTemplatePattern} and those expressions are matched against
 * the paragraphs of the corresponding {@link TermDocument}. Matches that ask the
 * same question are merged, and the merged list is written to
 * {@code health-questions.txt}.
 *
 * <p>NOTE(review): the lock-step walk assumes both files list the same documents
 * in the same order; nothing in this class verifies that - confirm against the
 * tool that generates the tf-idf file.
 */
public final class ApplyTemplate {

    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** JSON file whose {@code "documents"} array supplies the per-document template input. */
    private static final String TFIDF_FILE = "tfidf-health-1.json";

    /** JSON file the {@link TermCorpus} is built from. */
    private static final String CORPUS_FILE = "health-corpus.json";

    /** Text file the condensed matches are written to. */
    private static final String OUTPUT_FILE = "health-questions.txt";

    /**
     * One row per question template, in the order they are applied:
     * {@code { regex prefix, regex suffix, question prefix, question suffix }}.
     * The two regex parts are passed to {@code Template.getTemplatePattern}; the
     * two question parts are passed to {@code DocumentMatcher.getParagraphMatches}.
     */
    private static final String[][] TEMPLATES = {
        { "\\b(an? |the )?(\\w+ ){0,4}", "( \\w+)?(?= is (an?|one|the)\\b)",
            "What is ", "?" },
        { "^(\\w+ ){0,2}", "( \\w+){0,1}?(?=( can| may)? causes?\\b)",
            "What does ", " cause?" },
        { "(?<=the use of )(\\w+ ){0,3}",
            "( \\w+| ){0,2}?(?=( (and|does|in|for|can|is|as|to|of)\\b|\\.))",
            "How is ", " used?" },
        { "^(\\w+ ){0,3}", "( \\w+){0,1}(?=( can| may) leads? to\\b)",
            "What can ", " lead to?" },
        { "(?<=\\bthe risk of )(\\w+ ){0,3}", "( (disease|stroke|attack|cancer))?\\b",
            "What impacts the risk of ", "?" },
        { "(\\w{3,} ){0,3}", "( (disease|stroke|attack|cancer))?(?= is caused by\\b)",
            "What causes ", "?" },
        { "(?<= is caused by )(\\w+ ){0,10}", "",
            "What is caused by ", "?" },
        { "\\b", "( \\w{4,})(?= can be used)",
            "How can ", " be used?" },
        { "(?<= can be used )(\\w+ ){0,10}", "\\b",
            "What can be used ", "?" },
    };

    /**
     * Text of the first template's pattern when the document contributed nothing
     * to it (an empty group between prefix and suffix). A document whose first
     * pattern equals this string is skipped entirely.
     */
    private static final String EMPTY_TEMPLATE = TEMPLATES[0][0] + "()" + TEMPLATES[0][1];

    /**
     * Runs the whole pipeline: load, match, condense, write.
     *
     * @param args ignored; all file names are fixed
     * @throws IllegalStateException if the corpus holds fewer documents than the
     *         {@code "documents"} array of the tf-idf file
     */
    public static void main(String[] args) {

        System.out.println("Loading Corpus.");
        TermCorpus c;
        JsonNode documents;
        try (InputStream in = new FileInputStream(new File(TFIDF_FILE))) {
            JsonNode root = MAPPER.readTree(in);
            // JsonNode.get returns null when the field is absent; checked below.
            documents = (root == null) ? null : root.get("documents");
            TermCorpusBuilder cb = new TermCorpusBuilder();
            cb.setDocumentCombiner(0, 0);
            cb.setJson(new File(CORPUS_FILE));
            c = cb.build();
        } catch (IOException e) {
            // FileNotFoundException and JsonProcessingException are IOExceptions,
            // so one handler covers every failure the original handled separately.
            System.err.println("Could not load " + TFIDF_FILE + " / " + CORPUS_FILE);
            e.printStackTrace();
            return;
        }
        if (documents == null) {
            System.err.println(TFIDF_FILE + " has no \"documents\" field; nothing to do.");
            return;
        }
        System.out.println("Corpus loaded.");

        List<TemplateMatch> matches = new ArrayList<>();
        Iterator<TermDocument> documentIterator = c.getDocuments().iterator();

        int total = documents.size();
        int index = 0;
        for (JsonNode document : documents) {
            if (!documentIterator.hasNext()) {
                throw new IllegalStateException("Corpus ran out of documents after " + index
                        + " of " + total + " entries in " + TFIDF_FILE);
            }
            // Advance the corpus iterator for EVERY JSON entry, including entries
            // that end up skipped, so the two sequences never drift apart.
            TermDocument d = documentIterator.next();
            matches.addAll(matchDocument(document, d));
            System.out.print("Progress: " + ((100 * ++index) / total) + "%\r");
        }
        System.out.println();

        List<TemplateMatch> condensedMatches = condense(matches);

        try (BufferedWriter bw = new BufferedWriter(new FileWriter(OUTPUT_FILE))) {
            for (TemplateMatch match : condensedMatches) {
                bw.write(match.toString());
            }
            bw.write("\n");
        } catch (IOException e) {
            // Do not report success when the output could not be written.
            System.err.println("Could not write " + OUTPUT_FILE);
            e.printStackTrace();
            return;
        }

        System.out.println("Done and generated: " + condensedMatches.size());

    }

    /**
     * Applies every template to one document.
     *
     * @param document the JSON entry the patterns are derived from
     * @param d the corpus document whose paragraphs are searched
     * @return the matches of all templates in template order, or an empty list
     *         when the first template's pattern equals {@link #EMPTY_TEMPLATE}
     */
    private static List<TemplateMatch> matchDocument(JsonNode document, TermDocument d) {
        List<TemplateMatch> found = new ArrayList<>();

        Pattern[] patterns = new Pattern[TEMPLATES.length];
        patterns[0] = Template.getTemplatePattern(document, TEMPLATES[0][0], TEMPLATES[0][1]);
        if (patterns[0].toString().equals(EMPTY_TEMPLATE)) {
            return found;
        }
        // Build the remaining patterns before any matching starts, as before.
        for (int i = 1; i < TEMPLATES.length; i++) {
            patterns[i] = Template.getTemplatePattern(document, TEMPLATES[i][0], TEMPLATES[i][1]);
        }

        DocumentMatcher dm = new DocumentMatcher(d);
        for (int i = 0; i < TEMPLATES.length; i++) {
            found.addAll(dm.getParagraphMatches(patterns[i], TEMPLATES[i][2], TEMPLATES[i][3]));
        }
        return found;
    }

    /**
     * Merges matches that ask the same question.
     *
     * <p>The first match for a question is kept; each later match for which
     * {@code sameQuestion} is true has its answers added to that first match and
     * is itself left out of the result.
     *
     * @param matches all matches, in discovery order
     * @return one entry per distinct question, in first-seen order
     */
    private static List<TemplateMatch> condense(List<TemplateMatch> matches) {
        List<TemplateMatch> condensed = new ArrayList<>();
        for (TemplateMatch match : matches) {
            boolean merged = false;
            for (TemplateMatch baseMatch : condensed) {
                if (match.sameQuestion(baseMatch)) {
                    baseMatch.addAnswers(match);
                    merged = true;
                    break;
                }
            }
            // Only a question not seen before becomes a new entry; the original
            // added the match even after merging it, so nothing was condensed.
            if (!merged) {
                condensed.add(match);
            }
        }
        return condensed;
    }

}