com.ibm.watson.catalyst.corpus.CorpusBuilder.java Source code

Java tutorial

Introduction

Here is the source code for com.ibm.watson.catalyst.corpus.CorpusBuilder.java

Source

/*******************************************************************************
 * Copyright 2015 IBM Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package com.ibm.watson.catalyst.corpus;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.ibm.watson.catalyst.corpus.Corpus;
import com.ibm.watson.catalyst.corpus.CorpusBuilder;
import com.ibm.watson.catalyst.corpus.document.Document;
import com.ibm.watson.catalyst.corpus.document.DocumentReader;

/**
 * Base class for builders that assemble a {@link Corpus} from documents.
 *
 * <p>The builder collects a source directory, a corpus name and/or a list of
 * documents parsed from a corpus JSON file; concrete subclasses decide how a
 * single document is created and how the final corpus is built.
 *
 * @author wabeason
 *
 * @param <E> the Document objects for the corpus
 * @param <C> the type of Corpus to be built
 */
public abstract class CorpusBuilder<E extends Document, C extends Corpus<E>> {

    /**
     * Creates a builder with an empty directory path, an empty name and no documents.
     */
    public CorpusBuilder() {
    }

    /**
     * Creates a builder that is pre-configured with a source directory.
     *
     * @param aDirectory the directory to record as the document source
     */
    public CorpusBuilder(File aDirectory) {
        _directory = aDirectory;
    }

    /**
     * Builds the corpus from the state collected by this builder.
     *
     * @return the built corpus
     */
    public abstract C build();

    /**
     * Sets the source directory from a path string.
     *
     * @param aDirectory path of the directory
     * @return this builder, for chaining
     */
    public CorpusBuilder<E, C> setDirectory(String aDirectory) {
        setDirectory(new File(aDirectory));
        return this;
    }

    /**
     * Sets the name recorded for the corpus.
     *
     * @param aName the corpus name
     * @return this builder, for chaining
     */
    public CorpusBuilder<E, C> setName(String aName) {
        _name = aName;
        return this;
    }

    /**
     * Reads the given corpus JSON file immediately and stores the resulting
     * documents in this builder, replacing any previously stored documents.
     *
     * @param aJson the corpus JSON file
     * @return this builder, for chaining
     * @throws IllegalStateException if the file is missing, unreadable, malformed,
     *         or has no "documents" field
     */
    public CorpusBuilder<E, C> setJson(File aJson) {
        _documents = getDocumentsFromJson(aJson);
        return this;
    }

    /**
     * Convenience overload of {@link #setJson(File)} taking a path string.
     *
     * @param aJson path of the corpus JSON file
     * @return this builder, for chaining
     */
    public CorpusBuilder<E, C> setJson(String aJson) {
        return setJson(new File(aJson));
    }

    /**
     * Sets the source directory.
     *
     * @param aDirectory the directory to record as the document source
     * @return this builder, for chaining
     */
    public CorpusBuilder<E, C> setDirectory(File aDirectory) {
        _directory = aDirectory;
        return this;
    }

    // ------------------------------------------------------------------------------------------ //
    // Protected state and helpers (available to subclasses)
    // ------------------------------------------------------------------------------------------ //
    protected File _directory = new File("");
    protected String _name = "";
    protected List<E> _documents = new ArrayList<E>();
    protected static final ObjectMapper MAPPER = new ObjectMapper();

    protected static final DocumentReader dr = new DocumentReader();

    /**
     * Gets the "documents" node from the corpus JSON.
     *
     * @param aJson the corpus JSON file
     * @return the value of the root's "documents" field; never {@code null}
     * @throws IllegalStateException if the file cannot be found, read or parsed,
     *         or if it has no "documents" field
     */
    private static JsonNode getDocumentsNode(File aJson) {
        JsonNode root;
        try {
            System.out.println("Reading tree.");
            root = MAPPER.readTree(aJson);
            System.out.println("Tree read.");
        } catch (FileNotFoundException e) {
            throw new IllegalStateException("Corpus not found: " + aJson, e);
        } catch (JsonProcessingException e) {
            // Keep the cause so the parse location is not lost.
            throw new IllegalStateException("Malformed corpus JSON: " + aJson, e);
        } catch (IOException e) {
            throw new IllegalStateException("Could not read corpus: " + aJson, e);
        }

        // Fail here with a clear message rather than with a NullPointerException
        // later, when the caller iterates over the result.
        JsonNode documentsNode = (root == null) ? null : root.get("documents");
        if (documentsNode == null) {
            throw new IllegalStateException("No \"documents\" field in corpus: " + aJson);
        }
        return documentsNode;
    }

    /**
     * Gets the documents from the JSON file holding the corpus. Each child of the
     * "documents" node is converted with {@link #getDocumentFromJson(JsonNode)};
     * progress is reported on standard output.
     *
     * @param aJson the corpus JSON file
     * @return the documents, in the order they appear in the file
     * @throws IllegalStateException if the file is missing, unreadable, malformed,
     *         or has no "documents" field
     */
    protected List<E> getDocumentsFromJson(File aJson) {
        JsonNode documentsNode = getDocumentsNode(aJson);

        List<E> result = new ArrayList<E>();
        System.out.println("Converting documents to objects.");
        int numDocs = 0;
        for (JsonNode document : documentsNode) {
            result.add(getDocumentFromJson(document));
            if (++numDocs % 100 == 0) {
                // '\r' keeps the progress counter on a single console line.
                System.out.print(numDocs + " documents read.\r");
            }
        }
        System.out.println(numDocs + " documents read.");
        return result;
    }

    /**
     * Converts one entry of the "documents" node into a document object.
     *
     * @param aNode the JSON node describing a single document
     * @return the document
     */
    protected abstract E getDocumentFromJson(JsonNode aNode);

    /**
     * Creates a document object from a single file.
     *
     * @param aFile the file to read
     * @return the document
     */
    protected abstract E getDocumentFromFile(File aFile);

    /**
     * Reads corpus documents from a directory. Only entries whose name ends with
     * ".xml" are passed to {@link #getDocumentFromFile(File)}; everything else is
     * skipped. Progress is reported on standard output.
     *
     * @param aDirectory the directory to scan (not recursive)
     * @return the documents read from the directory; never empty
     * @throws IllegalArgumentException if {@code aDirectory} is not a directory
     *         or contains no entries
     * @throws IllegalStateException if the directory cannot be listed or yields
     *         no documents
     */
    protected List<E> getDocumentsFromDirectory(File aDirectory) {
        if (!aDirectory.isDirectory()) {
            System.out.println("Full path: " + aDirectory.getAbsolutePath());
            throw new IllegalArgumentException(aDirectory + " is not a directory.");
        }

        // listFiles() returns null on an I/O error even for a real directory;
        // an assert would be skipped when assertions are disabled.
        File[] files = aDirectory.listFiles();
        if (files == null) {
            throw new IllegalStateException("Could not list files in directory: " + aDirectory);
        }

        if (files.length == 0) {
            throw new IllegalArgumentException("No files in directory: " + aDirectory);
        }

        List<E> result = new ArrayList<E>();
        int numFiles = 0;
        // Iterate over the listing that was just validated instead of listing again.
        for (File f : files) {
            if (!f.getName().endsWith(".xml")) {
                continue;
            }
            result.add(getDocumentFromFile(f));
            if (++numFiles % 1000 == 0) {
                System.out.print(numFiles + " documents loaded.\r");
            }
        }
        System.out.println();

        if (result.isEmpty()) {
            throw new IllegalStateException("No valid documents in directory: " + aDirectory);
        }

        return result;
    }

}