ai.susi.mind.SusiReader.java Source code

Java tutorial

Introduction

Here is the source code for ai.susi.mind.SusiReader.java

Source

/**
 *  SusiReader
 *  Copyright 29.06.2016 by Michael Peter Christen, @0rb1t3r
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *  
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *  
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package ai.susi.mind;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

import org.json.JSONArray;
import org.json.JSONObject;

/**
 * Normalizes raw text into {@link Token}s using three learned vocabularies:
 * synonyms (mapped to a canonical expression), categories (mapped to a
 * category name) and filler words (dropped from sentences).
 *
 * <p>All vocabulary keys are stored and looked up in lower case, using
 * {@link Locale#ROOT} so the result does not depend on the JVM default locale.
 * The vocabularies are backed by concurrent collections; note that a single
 * {@link #learn(JSONObject)} call is still not atomic as a whole.
 */
public class SusiReader {

    /** Punctuation marks that get detached from the text preceding them. */
    private static final Pattern PUNCTUATION = Pattern.compile("[?!.,;:]");

    /** Any run of whitespace; used as the token separator. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    private final Map<String, String> synonyms; // a map from a synonym to a canonical expression
    private final Map<String, String> categories; // a map from an expression to an associated category name
    private final Set<String> filler; // a set of words that can be ignored completely

    /** Creates a reader with empty vocabularies. */
    public SusiReader() {
        this.synonyms = new ConcurrentHashMap<>();
        this.categories = new ConcurrentHashMap<>();
        // concurrent set, consistent with the two concurrent maps above
        this.filler = ConcurrentHashMap.newKeySet();
    }

    /**
     * Adds the vocabulary contained in the given JSON object to this reader.
     *
     * <p>Recognized (all optional) keys:
     * <ul>
     * <li>{@code "synonyms"}: object mapping a canonical expression to an array of synonyms</li>
     * <li>{@code "filler"}: array of words to be ignored</li>
     * <li>{@code "categories"}: object mapping a category name to an array of expressions</li>
     * </ul>
     * Array entries are expected to be strings; they are lower-cased before
     * being stored. Later entries overwrite earlier ones with the same key.
     *
     * @param json the vocabulary definition
     * @return this reader, to allow call chaining
     */
    public SusiReader learn(JSONObject json) {

        // missing sections are treated as empty
        JSONObject syn = json.has("synonyms") ? json.getJSONObject("synonyms") : new JSONObject();
        JSONArray fill = json.has("filler") ? json.getJSONArray("filler") : new JSONArray();
        JSONObject cat = json.has("categories") ? json.getJSONObject("categories") : new JSONObject();

        // add synonyms
        for (String canonical : syn.keySet()) {
            JSONArray a = syn.getJSONArray(canonical);
            a.forEach(synonym -> synonyms.put(normalize((String) synonym), canonical));
        }

        // add filler; lower-cased because tokenizeSentence looks them up in lower case
        fill.forEach(word -> filler.add(normalize((String) word)));

        // add categories
        for (String canonical : cat.keySet()) {
            JSONArray a = cat.getJSONArray(canonical);
            a.forEach(synonym -> categories.put(normalize((String) synonym), canonical));
        }

        return this;
    }

    /**
     * The three views of a single term: the lower-cased input, its canonical
     * expression and its category.
     */
    public static class Token {
        public final String original, canonical, categorized;

        /**
         * @param original    the lower-cased input term
         * @param canonical   the canonical expression for the term
         * @param categorized the category of the canonical expression
         */
        public Token(String original, String canonical, String categorized) {
            this.original = original;
            this.canonical = canonical;
            this.categorized = categorized;
        }

        /** Returns a JSON-like rendering of the three fields (values are not escaped). */
        @Override
        public String toString() {
            return "{" + "\"original\"=\"" + original + "\"," + "\"canonical\"=\"" + canonical + "\","
                    + "\"categorized\"=\"" + categorized + "\"" + "}";
        }
    }

    /**
     * Resolves a single term against the synonym and category vocabularies.
     *
     * @param term the term to resolve; must not be {@code null}
     * @return a token whose {@code canonical} falls back to the lower-cased term
     *         when no synonym is known, and whose {@code categorized} falls back
     *         to {@code canonical} when no category is known
     */
    public Token tokenizeTerm(String term) {
        String original = normalize(term);
        String canonical = this.synonyms.getOrDefault(original, original);
        // category keys are stored lower-cased, while a canonical expression keeps
        // the case it had in the learned JSON, so normalize it for the lookup only
        String categorized = this.categories.getOrDefault(normalize(canonical), canonical);
        return new Token(original, canonical, categorized);
    }

    /**
     * Splits a sentence into tokens. Each of the punctuation marks
     * {@code ? ! . , ; :} is separated from the text before it, the sentence is
     * split on whitespace, filler words are dropped and every remaining word is
     * resolved with {@link #tokenizeTerm(String)}.
     *
     * @param term the sentence to tokenize; {@code null} is treated as empty
     * @return the tokens in sentence order; never contains tokens for empty words
     */
    public List<Token> tokenizeSentence(String term) {
        List<Token> tokens = new ArrayList<>();
        if (term == null) {
            return tokens;
        }
        // put a blank in front of every punctuation mark so it becomes its own word start
        String spaced = PUNCTUATION.matcher(term).replaceAll(" $0");
        for (String word : WHITESPACE.split(spaced)) {
            // leading whitespace (or leading punctuation) produces an empty first element
            if (word.isEmpty()) {
                continue;
            }
            String original = normalize(word);
            if (this.filler.contains(original)) {
                continue;
            }
            tokens.add(tokenizeTerm(original));
        }
        return tokens;
    }

    /** Lower-cases a vocabulary key independently of the default locale. */
    private static String normalize(String s) {
        return s.toLowerCase(Locale.ROOT);
    }
}