org.aika.experiments.CorpusIndividualWordTrainingTest.java Source code

Java tutorial

Introduction

Here is the source code for org.aika.experiments.CorpusIndividualWordTrainingTest.java

Source

package org.aika.experiments;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.aika.corpus.Document;
import org.aika.network.Network;
import org.aika.network.neuron.Neuron;
import org.aika.network.neuron.lattice.Node;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
import org.aika.predefinedrules.PredefinedRules;
import org.aika.preprocessing.PrepareSimpleCharacterProperties;

import java.io.*;
import java.util.Collections;
import java.util.TreeSet;

/**
 * Experiment that feeds the network with single words taken from the text corpus under
 * {@code src/test/resources/text/maerchen}.
 *
 * <p>Each corpus file is read as UTF-8, stripped of punctuation and split on whitespace.
 * Every resulting word is wrapped in its own {@link Document}; the complete set of
 * documents is then handed to {@code Network.run}.
 *
 * @author Lukas Molzberger
 */
public class CorpusIndividualWordTrainingTest {

    /** Directory holding the corpus text files, relative to the working directory. */
    private static final String CORPUS_DIR = "./src/test/resources/text/maerchen/";

    /**
     * Builds one document per corpus word, registers the predefined rules and runs the
     * network on all documents.
     *
     * @throws IOException if a corpus file cannot be opened or read
     */
    @Test
    public void testTraining() throws IOException {
        Node.minFrequency = 3;

        // Only the first text is active; the remaining corpus files are listed for reference
        // and can be enabled by moving them out of the comment.
        String[] files = new String[] { "Aschenputtel"
                /*                "BruederchenUndSchwesterchen",
                                "DasTapfereSchneiderlein",
                                "DerFroschkoenig",
                                "DerGestiefelteKater",
                                "DerGoldeneSchluessel",
                                "DerSuesseBrei",
                                "DerTeufelMitDenDreiGoldenenHaaren",
                                "DerWolfUndDieSiebenJungenGeisslein",
                                "DieBremerStadtmusikanten",
                                "DieDreiFedern",
                                "DieSterntaler",
                                "DieWeisseSchlange",
                                "DieZwoelfBrueder",
                                "Dornroeschen",
                                "FrauHolle",
                                "HaenselUndGretel",
                                "HansImGlueck",
                                "JorindeUndJoringel",
                                "KatzeUndMausInGesellschaft",
                                "MaerchenVonEinemDerAuszogDasFuerchtenZuLernen",
                                "Marienkind",
                                "Rapunzel",
                                "Rotkaeppchen",
                                "Rumpelstilzchen",
                                "SchneeweisschenUndRosenrot",
                                "Schneewitchen",
                                "TischleinDeckDich",
                                "VonDemFischerUndSeinerFrau"*/
        };

        TreeSet<Document> inputs = new TreeSet<>();

        for (String fn : files) {
            File f = new File(CORPUS_DIR + fn + ".txt");
            String txt = stripPunctuation(readFile(f));

            int i = 0;
            // Split on any whitespace run so that consecutive blanks, tabs and carriage
            // returns never produce empty or whitespace-only "words".
            for (String w : txt.split("\\s+")) {
                if (w.isEmpty()) {
                    // A leading separator (or an empty file) yields one empty token.
                    continue;
                }
                // The document name is the file path plus a running word index.
                Document doc = new Document(f + "-" + (i++));
                doc.setContent(" " + w + " ");
                PrepareSimpleCharacterProperties.run(doc);
                inputs.add(doc);
            }
        }
        PredefinedRules pr = new PredefinedRules();
        pr.addRules();
        Neuron.createOrNeuron(pr.lateSE, Collections.EMPTY_SET);

        Network.run(inputs);
    }

    /**
     * Reads the whole file as UTF-8 text and closes the underlying stream afterwards.
     *
     * @param f the file to read
     * @return the complete file content
     * @throws IOException if the file cannot be opened or read
     */
    private static String readFile(File f) throws IOException {
        try (InputStream is = new FileInputStream(f)) {
            StringWriter writer = new StringWriter();
            IOUtils.copy(is, writer, "UTF-8");
            return writer.toString();
        }
    }

    /**
     * Removes or blanks out punctuation so that only whitespace-separated words remain.
     * ASCII double quotes, commas and full stops are deleted; every other handled
     * punctuation mark and the line feed are replaced by a single space.
     *
     * @param txt the raw text
     * @return the text without the handled punctuation characters
     */
    private static String stripPunctuation(String txt) {
        txt = txt.replace("\"", "");
        txt = txt.replace(",", "");
        txt = txt.replace(".", "");
        txt = txt.replace("\n", " ");

        // NOTE(review): two replacement targets had degraded to empty strings, which made
        // String.replace insert a space between every single character. They were presumably
        // typographic quotation marks lost to an encoding problem; the guillemets and the
        // curly/low quotes are handled here instead - confirm against the corpus files.
        txt = txt.replace("\u00BB", " ");
        txt = txt.replace("\u00AB", " ");
        txt = txt.replace("\u201E", " ");
        txt = txt.replace("\u201C", " ");
        txt = txt.replace("\u201D", " ");

        txt = txt.replace("?", " ");
        txt = txt.replace("!", " ");
        txt = txt.replace(":", " ");
        txt = txt.replace(";", " ");
        txt = txt.replace("-", " ");
        return txt;
    }
}