me.smoe.adar.utils.cam.o.common.SentenceAnalyzer.java Source code

Java tutorial

Introduction

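Here is the source code for me.smoe.adar.utils.cam.o.common.SentenceAnalyzer.java, a small utility that tokenizes a sentence with Lucene's StandardAnalyzer and returns the distinct tokens longer than one character.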

Source

/**
 * Copyright (c) 2016, adar.w (adar.w@outlook.com) 
 * 
 * http://www.smoe.me
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package me.smoe.adar.utils.cam.o.common;

import java.io.StringReader;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * Utility for splitting a sentence into distinct words using Lucene's
 * {@link StandardAnalyzer}.
 */
public class SentenceAnalyzer {

    /**
     * Tokenizes {@code sentence} with a {@link StandardAnalyzer} and collects the
     * resulting terms.
     *
     * <p>Terms are returned exactly as the analyzer emits them (including whatever
     * normalization {@code StandardAnalyzer} applies). Terms of length 0 or 1 are
     * discarded, duplicates are dropped, and the order of first occurrence is kept.
     *
     * @param sentence the text to tokenize; may be {@code null} or empty
     * @return an immutable empty set when {@code sentence} is {@code null} or empty;
     *         otherwise a mutable, insertion-ordered set of the accepted terms
     *         (possibly empty)
     * @throws Exception if the analyzer or its token stream fails while reading
     *         or closing
     */
    public static Set<String> analyzer(String sentence) throws Exception {
        if (StringUtils.isEmpty(sentence)) {
            return Collections.emptySet();
        }

        // Both the analyzer and the token stream hold resources. try-with-resources
        // closes the stream first and the analyzer second, even when tokenization throws.
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence))) {
            // addAttribute returns the (single, reused) attribute instance, so fetch it
            // once instead of looking it up and casting on every token.
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();

            Set<String> words = new LinkedHashSet<>();
            while (tokenStream.incrementToken()) {
                String word = term.toString();

                // Skip empty and single-character terms.
                if (word.length() <= 1) {
                    continue;
                }

                words.add(word);
            }

            // Required by the TokenStream contract: signal end-of-stream before close().
            tokenStream.end();

            return words;
        }
    }
}