org.apache.lucene.analysis.kr.test.KoreanAnalyzerTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.analysis.kr.test.KoreanAnalyzerTest.java

Source

/*
 * Copyright 2011-2013 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.kr.test;

import junit.framework.TestCase;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.kr.KoreanAnalyzer;
import org.apache.lucene.analysis.kr.KoreanFilter;
import org.apache.lucene.analysis.kr.utils.HanjaUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

/**
 * Exploratory tests that run sample text through {@link KoreanAnalyzer},
 * {@link StandardAnalyzer}, {@link KoreanFilter} and {@link HanjaUtils} and print the
 * results to standard output.
 *
 * <p>None of the tests assert on the output; they only fail if the code under test throws.
 *
 * <p>NOTE(review): many string literals in this file contain {@code ?} where non-ASCII
 * (presumably Korean/Hanja) characters appear to have been lost to an encoding problem.
 * The literals are kept exactly as they are; restore them from the original sources.
 */
public class KoreanAnalyzerTest extends TestCase {

    /**
     * Tokenizes several sample sentences with {@link KoreanAnalyzer} and prints every term,
     * followed by the elapsed time per sentence.
     *
     * <p>NOTE(review): the original comment here was unreadable (mis-encoded). It referred to
     * {@code getPositionIncrement()} and the values 1 and 0, presumably explaining position
     * increments of the emitted tokens - confirm against the original source.
     *
     * @throws Exception if analysis fails
     */
    public void testKoreanTokenizer() throws Exception {

        List<String> sources = new ArrayList<>();
        sources.add("??? ??? ???");
        sources.add(" ? ? ?.");
        sources.add("?");
        sources.add(
                "    ? . school is a good place  ");
        sources.add(
                " \"ASP.NET  ?? ? Lifecycle, Page? Lifecycle ? ? event  ? , event handler ?,      .\\n\" +\n"
                        + "            \"Spring MVC ?   ? Controller ? Interceptor  ? ??.\\n\" +\n"
                        + "            \" ?...\\n\" +\n"
                        + "            \"org.springframework.web.servlet.HandlerInterceptor ? org.springframework.web.servlet.handler.HandlerInterceptorAdapter  ?? preHandler, postHandler, afterComletion ? ? ? .\\n\" +\n"
                        + "            \"servlet.xml ? ?  Interceptor  ?.\\n\" +\n"
                        + "            \" ?\\n\" +\n"
                        + "            \"    .  Spring Framework 3.2.1.RELEASE  Hibernate 4.1.9 Final  .\\n\" +\n"
                        + "            \"UnitOfWorkInterceptor  ? ?  Start ,  ? ? Close ?? . ? Hibernate  ? Unit Of Work ? , ?  ?  ? ? Transaction ?  ,  ?? Unit Of Work ?   ?,  Lifecycle ? Spring MVC ? ??  .\"");

        KoreanAnalyzer analyzer = new KoreanAnalyzer(Version.LUCENE_36);
        analyzer.setHasOrigin(false);

        for (String source : sources) {
            System.out.println("--------------------------");
            System.out.println("Analyze source : " + source);
            System.out.println("--------------------------");
            TokenStream stream = analyzer.tokenStream("s", new StringReader(source));

            long start = System.currentTimeMillis();
            printTerms(stream);
            System.out.println((System.currentTimeMillis() - start) + "ms");
        }
    }

    /**
     * Tokenizes a sample sentence with {@link StandardAnalyzer}, wraps the result in an extra
     * {@link StandardFilter}, and prints every term followed by the elapsed time.
     *
     * @throws Exception if analysis fails
     */
    public void testStandardTokenizer() throws Exception {

        String source = "    ? . school is a good place  ";

        long start = System.currentTimeMillis();

        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        TokenStream stream = analyzer.tokenStream("s", new StringReader(source));
        TokenStream tok = new StandardFilter(Version.LUCENE_36, stream);

        printTerms(tok);

        System.out.println((System.currentTimeMillis() - start) + "ms");
    }

    /**
     * Prints the result of unescaping an HTML numeric character reference and of
     * Java-escaping a single-character string.
     *
     * @throws Exception if escaping fails
     */
    public void testJavaEscape() throws Exception {

        String str = StringEscapeUtils.unescapeHtml4("&#48085;");
        System.out.println(str);

        // NOTE(review): the literal below was presumably a single non-ASCII character before
        // the file's encoding was damaged; as written it escapes a plain question mark.
        String han = StringEscapeUtils.escapeJava("?");

        System.out.println(han);

    }

    /**
     * Converts each character of a sample string with {@link HanjaUtils#convertToHangul}
     * and prints the resulting characters, one input character per line.
     *
     * <p>NOTE(review): the sample string is empty as it stands, so the loop body never runs;
     * presumably it held Hanja characters that were lost to the encoding problem.
     *
     * @throws Exception if conversion fails
     */
    public void testConvertHanja() throws Exception {

        String han = "";

        for (int jj = 0; jj < han.length(); jj++) {
            char[] result = HanjaUtils.convertToHangul(han.charAt(jj));
            for (char c : result) {
                System.out.print(c);
            }

            System.out.println();
        }
    }

    /**
     * Tokenizes a sample string with a default {@link KoreanAnalyzer}, wraps the result in an
     * extra {@link KoreanFilter}, and prints every term followed by the elapsed time.
     *
     * @throws Exception if analysis fails
     */
    public void testHanjaConvert() throws Exception {

        String source = "  ??? ";

        long start = System.currentTimeMillis();

        KoreanAnalyzer analyzer = new KoreanAnalyzer();
        TokenStream stream = analyzer.tokenStream("s", new StringReader(source));
        TokenStream tok = new KoreanFilter(stream);

        // Terms are printed bounded by their length; printing the raw buffer would also
        // emit stale characters left over from longer, earlier tokens.
        printTerms(tok);

        System.out.println((System.currentTimeMillis() - start) + "ms");
    }

    /**
     * Consumes {@code tokens} to exhaustion and prints each term on its own line.
     *
     * <p>Follows the TokenStream consumer workflow: {@code reset()}, look the attribute up
     * once, {@code incrementToken()} until it returns {@code false}, {@code end()}, and
     * finally {@code close()} (even when consumption fails).
     *
     * @param tokens the stream to drain; it is closed before this method returns
     * @throws Exception if the stream fails while being consumed or closed
     */
    private static void printTerms(TokenStream tokens) throws Exception {
        try {
            tokens.reset();
            CharTermAttribute term = tokens.getAttribute(CharTermAttribute.class);
            while (tokens.incrementToken()) {
                // Only the first length() chars of the shared buffer belong to this token.
                System.out.println(new String(term.buffer(), 0, term.length()));
            }
            tokens.end();
        } finally {
            tokens.close();
        }
    }

}