com.ibm.watson.catalyst.corpus.tfidf.SearchTemplate.java Source code

Java tutorial

Introduction

Here is the source code for com.ibm.watson.catalyst.corpus.tfidf.SearchTemplate.java

Source

/*******************************************************************************
 * Copyright 2015 IBM Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package com.ibm.watson.catalyst.corpus.tfidf;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import com.fasterxml.jackson.databind.JsonNode;
import com.ibm.watson.catalyst.corpus.tfidf.corpus.TermCorpus;
import com.ibm.watson.catalyst.corpus.tfidf.corpus.TermCorpusBuilder;
import com.ibm.watson.catalyst.corpus.tfidf.document.DocumentMatcher;
import com.ibm.watson.catalyst.corpus.tfidf.document.TermDocument;
import com.ibm.watson.catalyst.corpus.tfidf.sentences.WordFrequencyHashtable;
import com.ibm.watson.catalyst.corpus.tfidf.term.TemplateMatch;

/**
 * Command-line driver that loads a term corpus from {@code health-corpus.json},
 * runs a template pattern built from {@code verbs-list.words} over every
 * document in the corpus, records each matched string in a
 * {@link WordFrequencyHashtable}, and writes the resulting JSON to
 * {@code health-trigrams.json}.
 *
 * <p>All file names are resolved relative to the current working directory.
 */
public final class SearchTemplate {

    /** Corpus file handed to the {@link TermCorpusBuilder}. */
    private static final String CORPUS_FILE = "health-corpus.json";

    /** Word-list file handed to {@code Template.getTemplatePattern}. */
    private static final String WORD_LIST_FILE = "verbs-list.words";

    /** File that receives the JSON rendering of the frequency table. */
    private static final String OUTPUT_FILE = "health-trigrams.json";

    /**
     * Runs the search end to end: build the corpus, collect paragraph matches
     * from every document, count them, and write the counts as JSON.
     *
     * <p>Progress is printed to standard output as a fraction in (0, 1],
     * terminated with a carriage return so successive updates overwrite each
     * other on a terminal. A failure to write the output file is reported on
     * standard error; it does not abort the JVM with an exception.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {

        System.out.println("Loading Corpus.");
        TermCorpusBuilder cb = new TermCorpusBuilder();
        // NOTE(review): the meaning of the (0, 0) combiner arguments is defined
        // by TermCorpusBuilder and is not visible here - confirm before changing.
        cb.setDocumentCombiner(0, 0);
        cb.setJson(new File(CORPUS_FILE));
        TermCorpus c = cb.build();

        List<TermDocument> termDocuments = c.getDocuments();
        List<TemplateMatch> matches = new ArrayList<>();

        // Presumably yields "<word> <entry from the word list> <word>" matches
        // (the output file name suggests trigrams) - verify against Template.
        Pattern p3 = Template.getTemplatePattern(new File(WORD_LIST_FILE), "\\b(\\w+ )", "( \\w+)\\b");

        // The document count does not change while iterating, so read it once.
        final int total = termDocuments.size();
        int index = 0;
        for (TermDocument termDocument : termDocuments) {
            DocumentMatcher dm = new DocumentMatcher(termDocument);

            matches.addAll(dm.getParagraphMatches(p3, "", ""));

            double progress = (double) ++index / total;
            System.out.print("Progress " + progress + "\r");
        }
        // Move past the carriage-return progress line.
        System.out.println();

        WordFrequencyHashtable f = new WordFrequencyHashtable();

        // Each occurrence is put with a count of 1; presumably the table
        // accumulates counts per distinct string - confirm in WordFrequencyHashtable.
        for (TemplateMatch match : matches) {
            f.put(match.getMatch(), 1);
        }

        // NOTE(review): the argument 5 is interpreted by toJsonNode (possibly a
        // minimum-frequency threshold) - confirm against WordFrequencyHashtable.
        JsonNode jn = f.toJsonNode(5);

        // JSON exchanged between systems must be UTF-8; FileWriter would use the
        // platform default charset on JVMs before 18, so the charset is explicit.
        try (BufferedWriter bw = Files.newBufferedWriter(Paths.get(OUTPUT_FILE), StandardCharsets.UTF_8)) {
            bw.write(jn.toString());
        } catch (IOException e) {
            // Best-effort write: report which file failed and keep the trace.
            System.err.println("Failed to write " + OUTPUT_FILE + ": " + e.getMessage());
            e.printStackTrace();
        }

    }

}