org.alfresco.repo.search.impl.lucene.analysis.VerbatimMLAnalayser.java Source code

Introduction

Here is the source code for org.alfresco.repo.search.impl.lucene.analysis.VerbatimMLAnalayser.java
Source

/*
 * #%L
 * Alfresco Legacy Lucene
 * %%
 * Copyright (C) 2005 - 2016 Alfresco Software Limited
 * %%
 * This file is part of the Alfresco software. 
 * If the software was purchased under a paid Alfresco license, the terms of 
 * the paid license agreement will prevail.  Otherwise, the software is 
 * provided under the following open source license terms:
 * 
 * Alfresco is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Alfresco is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
 * #L%
 */
package org.alfresco.repo.search.impl.lucene.analysis;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Locale;

import org.alfresco.repo.search.MLAnalysisMode;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

/**
 * Analyzer for text that may start with an embedded locale prefix.
 * <p>
 * The prefix has the form <code>\u0000language_country_variant\u0000</code>, where the country
 * and variant parts are optional. When a well-formed prefix is present it is consumed and the
 * remaining text is tokenised by {@link VerbatimAnalyser}, with the resulting stream wrapped in
 * an {@link MLTokenDuplicator} that is given the parsed locale and the configured
 * {@link MLAnalysisMode}. When no well-formed prefix is present the reader is rewound and the
 * whole text is tokenised by {@link VerbatimAnalyser} alone.
 */
public class VerbatimMLAnalayser extends Analyzer {
    private static final Log s_logger = LogFactory.getLog(VerbatimMLAnalayser.class);

    /** Character that opens and closes the optional locale prefix. */
    private static final char LOCALE_DELIMITER = '\u0000';

    /** Separator between the language, country and variant parts of the prefix. */
    private static final String LOCALE_PART_SEPARATOR = "_";

    /** Maximum number of characters allowed between the two delimiters. */
    private static final int MAX_LOCALE_LENGTH = 100;

    /**
     * Read-ahead limit for the mark. Prefix detection reads at most the opening delimiter, the
     * locale characters and one further character; one spare is added because a reset is only
     * guaranteed while fewer characters than the limit have been read.
     */
    private static final int READ_AHEAD_LIMIT = MAX_LOCALE_LENGTH + 3;

    private final MLAnalysisMode mlAnalaysisMode;

    /**
     * @param mlAnalaysisMode the mode handed to the {@link MLTokenDuplicator} for localised text
     */
    public VerbatimMLAnalayser(MLAnalysisMode mlAnalaysisMode) {
        this.mlAnalaysisMode = mlAnalaysisMode;
    }

    /**
     * Builds the token stream for a field, honouring an optional leading locale prefix.
     *
     * @param fieldName the field being analysed
     * @param reader the text to tokenise; wrapped in a {@link BufferedReader} unless it already is one
     * @return a locale-aware stream when a well-formed prefix was found, otherwise the plain
     *         verbatim stream over the complete text
     * @throws AnalysisException if the reader does not support mark/reset, or if it cannot be
     *         rewound after an I/O failure
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // Read ahead to look for the locale prefix; if it is missing or malformed we rewind and
        // tokenise everything with the default analyser, so mark and reset are required.
        BufferedReader breader = (reader instanceof BufferedReader)
                ? (BufferedReader) reader
                : new BufferedReader(reader);
        try {
            if (!breader.markSupported()) {
                throw new AnalysisException(
                        "Multilingual tokenisation requires a reader that supports marks and reset");
            }
            breader.mark(READ_AHEAD_LIMIT);
            Locale locale = readLocalePrefix(breader);
            if (locale == null) {
                breader.reset();
                return getAnalyser().tokenStream(fieldName, breader);
            }
            // Leave the reader positioned just after the prefix.
            return new MLTokenDuplicator(getAnalyser().tokenStream(fieldName, breader), locale, breader,
                    mlAnalaysisMode);
        } catch (IOException io) {
            if (s_logger.isDebugEnabled()) {
                s_logger.debug("Failed to read locale prefix for field " + fieldName
                        + " - falling back to default tokenisation", io);
            }
            try {
                breader.reset();
            } catch (IOException e) {
                throw new AnalysisException("Failed to reset buffered reader - token stream will be invalid",
                        e);
            }
            return getAnalyser().tokenStream(fieldName, breader);
        }
    }

    /**
     * Consumes the locale prefix from the reader, if there is one.
     *
     * @param breader reader positioned at the start of the text
     * @return the parsed locale, or <code>null</code> when the text has no well-formed prefix
     *         (no opening delimiter, end of input before the closing delimiter, more than
     *         {@value #MAX_LOCALE_LENGTH} characters, or too many parts); in that case the
     *         reader position is unspecified and the caller must reset it
     * @throws IOException if reading fails
     */
    private static Locale readLocalePrefix(BufferedReader breader) throws IOException {
        if (breader.read() != LOCALE_DELIMITER) {
            return null;
        }
        StringBuilder prefix = new StringBuilder();
        int next;
        while ((next = breader.read()) != LOCALE_DELIMITER) {
            // read() reports end of input as -1; test it before narrowing to char.
            if (next < 0 || prefix.length() >= MAX_LOCALE_LENGTH) {
                return null;
            }
            prefix.append((char) next);
        }
        return parseLocale(prefix.toString());
    }

    /**
     * Splits <code>language[_country[_variant]]</code> by position, so an empty country
     * (for example <code>en__POSIX</code>) keeps the variant in the variant slot.
     *
     * @param prefix the text found between the two delimiters
     * @return the locale, or <code>null</code> if there are more than three parts
     */
    private static Locale parseLocale(String prefix) {
        String[] parts = prefix.split(LOCALE_PART_SEPARATOR, -1);
        int partCount = parts.length;
        // A single trailing separator does not start a new part.
        if (partCount > 1 && parts[partCount - 1].length() == 0) {
            partCount--;
        }
        if (partCount > 3) {
            return null;
        }
        String language = parts[0];
        String country = partCount > 1 ? parts[1] : "";
        String variant = partCount > 2 ? parts[2] : "";
        return new Locale(language, country, variant);
    }

    /**
     * Creates the analyser that does the actual tokenisation.
     *
     * @return a new {@link VerbatimAnalyser} constructed with <code>false</code>
     */
    private Analyzer getAnalyser() {
        // NOTE(review): the meaning of the boolean flag is defined by VerbatimAnalyser - confirm there.
        return new VerbatimAnalyser(false);
    }

}