com.teamhooman.hoomanbot.markov.MarkovGenerator.java Source code

Java tutorial

Introduction

Here is the source code for com.teamhooman.hoomanbot.markov.MarkovGenerator.java

Source

/*
Copyright Paul James Mutton, 2001-2004, http://www.jibble.org/
    
This file is part of JMegaHal.
    
This software is dual-licensed, allowing you to choose between the GNU
General Public License (GPL) and the www.jibble.org Commercial License.
Since the GPL may be too restrictive for use in a proprietary application,
a commercial license is also provided. Full license information can be
found at http://www.jibble.org/licenses/
    
$Author: pjm2 $
$Id: JMegaHal.java,v 1.4 2004/02/01 13:24:06 pjm2 Exp $
    
*/

package com.teamhooman.hoomanbot.markov;

import com.beust.jcommander.internal.Lists;
import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.MoreObjects;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableMap;
import it.unimi.dsi.fastutil.chars.CharOpenHashSet;
import it.unimi.dsi.fastutil.chars.CharSet;
import org.slf4j.Logger;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.net.URL;
import java.security.SecureRandom;
import java.util.*;

/**
 * A word-level Markov chain sentence generator derived from JMegaHal.
 *
 * <p>Sentences fed to {@link #add(CharSequence)} are split into word tokens and
 * recorded as overlapping runs of four consecutive tokens ("quads"). For every
 * quad the generator remembers which tokens were seen immediately before and
 * after it, and whether it was seen at the start or end of a sentence.
 * {@link #getSentence(String)} picks a quad at random and walks those
 * associations outwards in both directions to assemble a new sentence.
 *
 * <p>The internal maps are plain {@link HashMap}s and no synchronization is
 * performed by this class.
 */
public class MarkovGenerator implements Serializable {

    private final static Logger logger = org.slf4j.LoggerFactory.getLogger(MarkovGenerator.class);

    private final static Random rand = new SecureRandom();

    /** Number of consecutive tokens that make up one quad. */
    private static final int QUAD_SIZE = 4;

    /** Every quad that contains a given token, keyed by that token. */
    private final Map<String, Set<ListQuad>> wordToQuads = new HashMap<>();

    /** Canonical instance of every known quad (each quad maps to itself). */
    private final Map<ListQuad, ListQuad> listQuads = new HashMap<>();

    /** Tokens that have been seen directly before a given quad. */
    private final Map<ListQuad, Set<String>> quadToPreviousWord = new HashMap<>();
    /** Tokens that have been seen directly after a given quad. */
    private final Map<ListQuad, Set<String>> quadToNextWord = new HashMap<>();

    /** Characters that terminate a sentence when reading a document. */
    private static final CharSet END_CHARS = new CharOpenHashSet(".!?".toCharArray());
    private final static CharMatcher whitespaceMatcher = CharMatcher.INVISIBLE.or(CharMatcher.WHITESPACE);
    // These are valid chars for words. Anything else is treated as punctuation.
    private final static CharMatcher wordMatcher = CharMatcher.JAVA_LETTER_OR_DIGIT.or(CharMatcher.anyOf("-_/"))
            .precomputed();
    private final static CharMatcher nonWordMatcher = wordMatcher.negate().precomputed();

    /**
     * Splits a sentence into word tokens: breaks on every non-word character,
     * trims invisible/whitespace characters and drops empty pieces. Splitter
     * instances are immutable, so one shared instance is reused for all calls.
     */
    private static final Splitter SENTENCE_SPLITTER = Splitter.on(nonWordMatcher).trimResults(whitespaceMatcher)
            .omitEmptyStrings();

    /**
     * Constructs a MarkovGenerator with an empty brain.
     */
    public MarkovGenerator() {

    }

    /**
     * Adds an entire document to the 'brain'.  Useful for feeding in
     * stray theses, but be careful not to put too much in, or you may
     * run out of memory!
     *
     * <p>The document is read character by character; each time one of
     * {@code . ! ?} is encountered the text collected so far is learned as one
     * sentence. Any text left over after the last terminator is learned too.
     *
     * @param uri the URL of the document to read
     * @throws IOException if the URL is malformed or the document cannot be read
     */
    public void addDocument(String uri) throws IOException {
        // NOTE(review): the stream is decoded with the platform default charset,
        // exactly as before; consider making the charset explicit.
        // try-with-resources guarantees the stream is closed even when reading
        // or learning a sentence throws.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new URL(uri).openStream()))) {
            StringBuilder buffer = new StringBuilder();

            int ch;
            while ((ch = reader.read()) != -1) {
                buffer.append((char) ch);
                if (END_CHARS.contains((char) ch)) {
                    add(whitespaceMatcher.collapseFrom(buffer, ' '));
                    buffer.setLength(0);
                }
            }

            // Learn the trailing fragment, if any, that had no terminator.
            if (buffer.length() > 0) {
                add(whitespaceMatcher.collapseFrom(buffer, ' '));
            }
        }
    }

    /**
     * Adds a new sentence to the 'brain'.
     *
     * <p>The sentence is split into word tokens. Sentences with fewer than four
     * tokens are ignored (a warning is logged). Otherwise every run of four
     * consecutive tokens is recorded as a quad together with its containing
     * words, its predecessor token and its successor token.
     *
     * @param input the sentence to learn from; must not be {@code null}
     */
    public void add(CharSequence input) {

        List<String> tokens = SENTENCE_SPLITTER.splitToList(input);

        logger.info("Processed:\n\tInput: {}\n\tWords: {}", input, tokens);

        if (tokens.size() < QUAD_SIZE) {
            logger.warn("Sentence was not long enough to learn from.");
            return;
        }

        // Index of the last position at which a full quad still fits.
        int lastStart = tokens.size() - QUAD_SIZE;

        for (int n = 0; n <= lastStart; ++n) {
            // Reuse the canonical instance if this quad has been seen before so
            // that the start/end flags accumulate on a single object.
            ListQuad candidate = new ListQuad(tokens.subList(n, n + QUAD_SIZE));
            ListQuad lq = listQuads.computeIfAbsent(candidate, fresh -> fresh);

            if (n == 0) {
                lq.setCanStart(true);
            }

            if (n == lastStart) {
                lq.setCanEnd(true);
            }

            for (int q = 0; q < QUAD_SIZE; ++q) {
                String token = tokens.get(n + q);
                addAssociation(wordToQuads, token, lq);
                logger.info("Added word-to-containing-quad association: {} / {}", token, lq);
            }

            if (n > 0) {
                String previousToken = tokens.get(n - 1);
                addAssociation(quadToPreviousWord, lq, previousToken);
                logger.info("Added quad-to-possible-predecessor association: {} / {}", previousToken, lq);
            }

            if (n < lastStart) {
                String nextToken = tokens.get(n + QUAD_SIZE);
                addAssociation(quadToNextWord, lq, nextToken);
                logger.info("Added quad-to-possible-successor association: {} / {}", nextToken, lq);
            }
        }
    }

    /**
     * Adds {@code value} to the set stored under {@code key}, creating the set
     * on first use.
     */
    private <T, U> void addAssociation(Map<T, Set<U>> source, T key, U value) {
        source.computeIfAbsent(key, unused -> new HashSet<>(1)).add(value);
    }

    /**
     * Returns a uniformly chosen element of a non-empty collection, walking
     * the collection in its iteration order.
     */
    private static <T> T pickRandom(Collection<T> items) {
        int index = rand.nextInt(items.size());
        Iterator<T> it = items.iterator();
        for (int i = 0; i < index; ++i) {
            it.next();
        }
        return it.next();
    }

    /**
     * Generate a random sentence from the brain.
     *
     * @return the generated sentence, or the empty String if nothing has been learned
     */
    public String getSentence() {
        return getSentence(null);
    }

    /**
     * Generate a sentence that includes (if possible) the specified word.
     *
     * <p>A quad containing {@code word} is chosen at random; if the word is
     * unknown (or {@code null}) any known quad may be chosen. The sentence is
     * then grown forwards until a quad that can end a sentence is reached and
     * backwards until a quad that can start one is reached.
     *
     * @param word the word the sentence should contain, or {@code null} for any
     * @return the generated sentence with tokens separated by single spaces, or
     *         the empty String if nothing has been learned
     */
    public String getSentence(String word) {

        Set<ListQuad> quads = MoreObjects.firstNonNull(wordToQuads.get(word), this.listQuads.keySet());

        if (quads.isEmpty()) {
            logger.warn("Not enough information to construct sentence; returning the empty String.");
            return "";
        }

        Deque<String> parts = new ArrayDeque<>();
        ListQuad mid = pickRandom(quads);
        parts.addAll(mid.getTokens());

        // Grow towards the end of the sentence.
        ListQuad quad = mid;
        while (quad != null && !quad.canEnd()) {
            Set<String> nextTokens = quadToNextWord.get(quad);
            if (nextTokens == null || nextTokens.isEmpty()) {
                // Should not happen for a consistent brain; stop rather than fail.
                break;
            }
            String nextToken = pickRandom(nextTokens);
            // Slide the window one token to the right.
            List<String> shifted = new ArrayList<>(quad.getTokens().subList(1, QUAD_SIZE));
            shifted.add(nextToken);
            parts.addLast(nextToken);
            logger.info("Added '{}' to end.", nextToken);
            quad = listQuads.get(new ListQuad(shifted));
        }

        // Grow towards the beginning of the sentence.
        quad = mid;
        while (quad != null && !quad.canStart()) {
            Set<String> previousTokens = quadToPreviousWord.get(quad);
            if (previousTokens == null || previousTokens.isEmpty()) {
                // Should not happen for a consistent brain; stop rather than fail.
                break;
            }
            String previousToken = pickRandom(previousTokens);
            // Slide the window one token to the left.
            List<String> shifted = new ArrayList<>(QUAD_SIZE);
            shifted.add(previousToken);
            shifted.addAll(quad.getTokens().subList(0, QUAD_SIZE - 1));
            parts.addFirst(previousToken);
            logger.info("Added '{}' to beginning.", previousToken);
            // Advance to the preceding quad; without this step the loop would
            // keep prepending tokens for the same quad and never terminate.
            quad = listQuads.get(new ListQuad(shifted));
        }

        return Joiner.on(' ').join(parts);
    }
}