org.apache.lucene.pinyin.dwarf.PinyinTokenizer.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.pinyin.dwarf.PinyinTokenizer.java

Source

package org.apache.lucene.pinyin.dwarf;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import com.hp.hpl.sparta.xpath.ThisNodeTest;
import com.luhuiguo.chinese.ChineseUtils;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Tokenizer that consumes the whole input and emits:
 * <ol>
 *   <li>one combined token whose term is a space-separated concatenation of
 *       pinyin strings followed by the lexemes produced by the IK segmenter, and</li>
 *   <li>unless strict mode is on, one further token per input character, with a
 *       {@code "|||"} token after each punctuation/whitespace-delimited segment.</li>
 * </ol>
 * The input is passed through {@code ChineseUtils.toSimplified} before any
 * other processing.
 *
 * <p>NOTE(review): the offsets reported by this tokenizer are not character
 * positions in the input (see {@link #emitCombinedToken()}); confirm before
 * relying on them, e.g. for highlighting.
 */
public class PinyinTokenizer extends Tokenizer {

    private static final int DEFAULT_BUFFER_SIZE = 512;

    /** Separator inserted between the pieces that make up the emitted terms. */
    private static final String PADDING = " ";

    /** Marker added to the per-character token list after every input segment. */
    private static final String SEGMENT_MARKER = "|||";

    /** Type assigned to every token produced by this tokenizer. */
    private static final String TOKEN_TYPE = "<IDEOGRAPHIC>";

    /** Splits the input into segments at punctuation and ASCII whitespace. */
    private static final Pattern SEGMENT_DELIMITER = Pattern.compile("[\\pP\\s]");

    /** Matches a run of whitespace; used to split the space-joined buffers. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * Keywords removed from a segment by {@link #filterkeyWords(String)}.
     * NOTE(review): the two literals look like mis-encoded text; they are kept
     * exactly as found because they are runtime values.
     */
    public static List<String> keywords = new ArrayList<String>();

    static {
        keywords.add("??");
        keywords.add("?");
    }

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();

    // Stored constructor arguments that this class does not currently read.
    private Version luceneMatchVersion;
    private CharArraySet stopwords;
    private String split_delimiter;

    /** Second argument handed to every {@link IKSegmenter} created here. */
    private boolean usedSmart = false;

    /** When true, only the combined token is emitted. */
    private boolean strictMode = false;

    /** Whether each segment is also run through the IK segmenter. */
    private boolean openIK = true;

    /** Longest (filtered) segment length that is still converted to pinyin as a whole. */
    private int pinyinnum = 4;

    /** True once the combined token has been produced. */
    private boolean mainDone = false;

    /** True once no further tokens will be produced. */
    private boolean done = false;

    /** Terms of the follow-up tokens, filled while building the combined token. */
    private String[] cnarray;

    /** Index of the next entry of {@link #cnarray} to emit. */
    private int cnArrIndex = 0;

    /** End offset of the most recently emitted token; also reported by {@link #end()}. */
    private int finalOffset;

    /**
     * Creates a tokenizer with the default term buffer size.
     *
     * @param version         stored, not otherwise used by this class
     * @param usedSmart       passed to the IK segmenter as its second constructor argument
     * @param strictMode      when true, only the combined token is emitted
     * @param stopwords       stored, not otherwise used by this class
     * @param split_delimiter stored, not otherwise used by this class
     */
    public PinyinTokenizer(Version version, boolean usedSmart, boolean strictMode, CharArraySet stopwords,
            String split_delimiter) {
        this(DEFAULT_BUFFER_SIZE, version);
        this.usedSmart = usedSmart;
        this.strictMode = strictMode;
        this.stopwords = stopwords;
        this.split_delimiter = split_delimiter;
    }

    /**
     * Creates a tokenizer whose term buffer initially holds at least
     * {@code bufferSize} characters and configures the pinyin output as
     * lower case, without tone marks, using {@code v} for {@code ü}.
     *
     * @param bufferSize initial capacity requested for the term buffer
     * @param version    stored, not otherwise used by this class
     */
    public PinyinTokenizer(int bufferSize, Version version) {
        luceneMatchVersion = version;
        termAtt.resizeBuffer(bufferSize);
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);
    }

    public boolean isOpenIK() {
        return openIK;
    }

    public void setOpenIK(boolean openIK) {
        this.openIK = openIK;
    }

    public int getPinyinnum() {
        return pinyinnum;
    }

    public void setPinyinnum(int pinyinnum) {
        this.pinyinnum = pinyinnum;
    }

    /**
     * Removes every occurrence of the first entry of {@link #keywords} that
     * occurs in {@code s}. Later keywords are not applied once one has matched.
     * Keywords are treated as literal text, so entries containing regex
     * metacharacters (such as {@code "?"}) are safe.
     *
     * @param s the text to filter
     * @return {@code s} without the first matching keyword, or {@code s} itself
     *         when no keyword occurs in it
     */
    public static String filterkeyWords(String s) {
        for (String keyword : keywords) {
            if (s.contains(keyword)) {
                // String.replace is literal; replaceAll would compile the keyword as a regex.
                return s.replace(keyword, "");
            }
        }
        return s;
    }

    /**
     * Emits the combined token on the first call after {@link #reset()} and
     * then, unless strict mode is on, one token per entry of the
     * per-character list.
     *
     * @return {@code true} if a token was produced, {@code false} when exhausted
     * @throws IOException if reading the input or the IK segmenter fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!mainDone) {
            mainDone = true;
            emitCombinedToken();
            return true;
        }
        if (done) {
            return false;
        }
        emitNextCharToken();
        return true;
    }

    /** Reads the whole input, builds the combined term and prepares the follow-up terms. */
    private void emitCombinedToken() throws IOException {
        clearAttributes();
        String text = ChineseUtils.toSimplified(readAllInput());
        termAtt.setEmpty();

        StringBuilder ikWords = new StringBuilder();
        StringBuilder singleChars = new StringBuilder();
        StringBuilder pinyin = new StringBuilder();

        for (String segment : SEGMENT_DELIMITER.split(text)) {
            String filtered = filterkeyWords(segment);
            // Short segments are additionally converted to pinyin as a whole.
            if (filtered.length() <= pinyinnum) {
                pinyin.append(getpinyin(filtered)).append(PADDING);
            }
            if (openIK) {
                appendIkLexemes(segment, ikWords, pinyin);
            }
            for (int i = 0; i < segment.length(); i++) {
                singleChars.append(segment.charAt(i)).append(PADDING);
            }
            singleChars.append(SEGMENT_MARKER).append(PADDING);
        }

        termAtt.append(pinyin);
        termAtt.append(PADDING);
        termAtt.append(ikWords);
        termAtt.append(PADDING);

        if (strictMode) {
            done = true;
        } else {
            String joined = singleChars.toString().trim();
            if (joined.isEmpty()) {
                // Input consisted only of delimiters: there is nothing to emit
                // afterwards, and splitting "" would yield a single empty term.
                done = true;
            } else {
                cnarray = WHITESPACE_RUN.split(joined);
            }
        }

        posIncrAtt.setPositionIncrement(1);
        // NOTE(review): the end offset is the number of whitespace-separated
        // pieces of the combined term, not the length of the input that was
        // read. Kept as found; confirm whether real character offsets are needed.
        finalOffset = correctOffset(WHITESPACE_RUN.split(termAtt.toString()).length);
        offsetAtt.setOffset(correctOffset(0), finalOffset);
        typeAtt.setType(TOKEN_TYPE);
    }

    /** Emits the next entry of {@link #cnarray} as its own token. */
    private void emitNextCharToken() {
        // Consumers may have modified the attributes of the previous token, so
        // take the previous end offset from our own state and start clean.
        int previousEnd = finalOffset;
        clearAttributes();

        String term = cnarray[cnArrIndex++];
        termAtt.append(term);
        posIncrAtt.setPositionIncrement(1);
        typeAtt.setType(TOKEN_TYPE);
        finalOffset = previousEnd + term.length();
        offsetAtt.setOffset(correctOffset(previousEnd), finalOffset);

        if (cnArrIndex == cnarray.length) {
            done = true;
        }
    }

    /** Drains {@code input} into the term buffer and returns its content as a string. */
    private String readAllInput() throws IOException {
        int upto = 0;
        char[] buffer = termAtt.buffer();
        while (true) {
            final int length = input.read(buffer, upto, buffer.length - upto);
            if (length == -1) {
                break;
            }
            upto += length;
            if (upto == buffer.length) {
                buffer = termAtt.resizeBuffer(1 + buffer.length);
            }
        }
        termAtt.setLength(upto);
        return termAtt.toString();
    }

    /**
     * Runs the IK segmenter over {@code segment}; appends every lexeme to
     * {@code ikWords} and, for lexemes longer than one character, the first
     * pinyin reading of each character to {@code pinyin}.
     */
    private void appendIkLexemes(String segment, StringBuilder ikWords, StringBuilder pinyin)
            throws IOException {
        IKSegmenter segmenter = new IKSegmenter(new StringReader(segment), usedSmart);
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            String lexemeText = lexeme.getLexemeText();
            ikWords.append(lexemeText).append(PADDING);

            // Single-character lexemes contribute no pinyin, only the padding below.
            if (lexemeText.length() > 1) {
                for (int i = 0; i < lexemeText.length(); i++) {
                    String[] readings;
                    try {
                        readings = PinyinHelper.toHanyuPinyinStringArray(lexemeText.charAt(i), format);
                    } catch (BadHanyuPinyinOutputFormatCombination ignored) {
                        // Best effort: skip a character that cannot be formatted.
                        continue;
                    }
                    if (readings == null) {
                        // No reading for this character: stop converting this lexeme.
                        break;
                    }
                    if (readings.length == 0) {
                        continue;
                    }
                    pinyin.append(readings[0]);
                }
            }
            pinyin.append(PADDING);
        }
    }

    /**
     * Performs the end-of-stream attribute handling of the superclass and then
     * reports the end offset of the last emitted token as the final offset.
     */
    @Override
    public final void end() {
        try {
            super.end();
        } catch (IOException e) {
            // Wrapped so that this override keeps its signature without a checked exception.
            throw new IllegalStateException("Failed to finish token stream", e);
        }
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    /** Rewinds the emission state so that the next input is tokenized from the start. */
    @Override
    public void reset() throws IOException {
        super.reset();
        this.mainDone = false;
        this.done = false;
        this.cnArrIndex = 0;
    }

    /**
     * Concatenates the first pinyin reading of every character of {@code s}.
     *
     * @param s the text to convert
     * @return the concatenated pinyin, or {@code ""} when {@code s} has fewer
     *         than two characters or any character has no reading
     */
    public String getpinyin(String s) {
        if (s.length() < 2) {
            return "";
        }
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < s.length(); i++) {
            String[] readings;
            try {
                readings = PinyinHelper.toHanyuPinyinStringArray(s.charAt(i), format);
            } catch (BadHanyuPinyinOutputFormatCombination ignored) {
                // A character that cannot be formatted is treated like one without a reading.
                return "";
            }
            if (readings == null || readings.length == 0) {
                return "";
            }
            joined.append(readings[0]);
        }
        return joined.toString();
    }

    /** Small manual check: splits a sample string into segments and prints each one filtered. */
    public static void main(String[] args) {
        String string = "?";
        String[] arrStrings = SEGMENT_DELIMITER.split(string);
        for (int i = 0; i < arrStrings.length; i++) {
            System.out.println(filterkeyWords(arrStrings[i]));
        }
    }

}