de.csw.lucene.ConceptFilter.java Source code

Java tutorial

Introduction

Here is the source code for de.csw.lucene.ConceptFilter.java

Source

/*******************************************************************************
 * This file is part of the Corporate Semantic Web Project.
 * 
 * This work has been partially supported by the ``InnoProfile-Corporate Semantic Web" project funded by the German Federal
 * Ministry of Education and Research (BMBF) and the BMBF Innovation Initiative for the New German Laender - Entrepreneurial Regions.
 * 
 * http://www.corporate-semantic-web.de/
 * 
 * Freie Universitaet Berlin
 * Copyright (c) 2007-2013
 * 
 * Institut fuer Informatik
 * Working Group Corporate Semantic Web
 * Koenigin-Luise-Strasse 24-26
 * 14195 Berlin
 * 
 * http://www.mi.fu-berlin.de/en/inf/groups/ag-csw/
 * 
 * This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
 * You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA or see <http://www.gnu.org/licenses/>
 ******************************************************************************/
package de.csw.lucene;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

import de.csw.ontology.OntologyIndex;

/**
 * A filter that detects concepts from an ontology in the token stream. A
 * concept token is assigned the type {@value #CONCEPT_TYPE}.
 * <p>
 * A concept may consist of several consecutive tokens. To detect such concepts
 * the filter reads ahead for as long as the terms read so far are a prefix
 * according to {@link OntologyIndex}. If the terms read form an exact match
 * they are merged into a single token; otherwise the tokens that were read
 * ahead are pushed back and emitted unchanged by subsequent calls.
 * <p>
 * NOTE(review): only the complete sequence read during look-ahead is tested
 * for an exact match. A shorter concept that is a proper prefix of that
 * sequence is therefore not marked as a concept.
 * 
 * @author rheese
 * 
 */
public final class ConceptFilter extends TokenFilter {
    static final Logger log = Logger.getLogger(ConceptFilter.class);

    /** token type of a concept */
    public static final String CONCEPT_TYPE = "concept";

    /** the ontology index used for prefix and exact-match look-ups */
    private final OntologyIndex index;

    /**
     * Tokens that have been read ahead but not yet emitted, in stream order.
     * Declared as LinkedList so pushed-back tokens can be re-inserted at the
     * head, in front of tokens that are still waiting from an earlier
     * push-back.
     */
    private final LinkedList<AttributeSource.State> queue = new LinkedList<AttributeSource.State>();

    /**
     * Set once the wrapped stream has returned false, so that it is not asked
     * for further tokens while the queue is being drained.
     */
    private boolean inputExhausted = false;

    /** the attributes of the token which the filter is currently reading */
    private final CharTermAttribute charTermAttribute;
    private final OffsetAttribute offsetAttribute;
    private final TypeAttribute typeAttribute;

    /**
     * Build a ConceptFilter that uses a given ontology index
     * 
     * @param in
     *            a token stream
     * @param oi
     *            an ontology index
     */
    public ConceptFilter(TokenStream in, OntologyIndex oi) {
        super(in);
        index = oi;

        // addAttribute returns the existing instance or registers a new one, so
        // the filter also works when the upstream stream has not registered
        // one of these attributes itself (e.g. no TypeAttribute).
        charTermAttribute = addAttribute(CharTermAttribute.class);
        offsetAttribute = addAttribute(OffsetAttribute.class);
        typeAttribute = addAttribute(TypeAttribute.class);
    }

    /**
     * advances to the next token in the stream.
     * Takes into account that terms from the ontology might be constructed
     * out of several consecutive tokens.
     * @return false at EOS, true whenever a token has been produced
     */
    @Override
    public boolean incrementToken() throws IOException {

        if (!innerNextToken()) {
            return false;
        }

        // States of the tokens read during this call, oldest first. The most
        // recently read token is only held in the attributes (not in this
        // list) while currentTokenValid is true.
        LinkedList<AttributeSource.State> lookAhead = new LinkedList<AttributeSource.State>();
        List<String> terms = new ArrayList<String>();
        terms.add(currentTerm());

        // end offset of the last token that was actually read
        int lastEndOffset = offsetAttribute.endOffset();
        boolean currentTokenValid = true;

        while (index.isPrefix(terms)) {
            lookAhead.add(captureState());
            if (!innerNextToken()) {
                // End of stream: the attributes no longer describe a token, so
                // neither their term nor their state may be used below.
                currentTokenValid = false;
                break;
            }
            terms.add(currentTerm());
            lastEndOffset = offsetAttribute.endOffset();
        }

        // if we have a match ...
        if (index.hasExactMatches(StringUtils.join(terms.toArray(), OntologyIndex.PREFIX_SEPARATOR))) {

            // ..then we consume all elements in the look ahead, if present
            if (!lookAhead.isEmpty()) {
                restoreState(lookAhead.poll());
                // the first term is already present in the restored token
                for (String term : terms.subList(1, terms.size())) {
                    charTermAttribute.append(OntologyIndex.PREFIX_SEPARATOR);
                    charTermAttribute.append(term);
                }

                offsetAttribute.setOffset(offsetAttribute.startOffset(), lastEndOffset);
            }
            typeAttribute.setType(CONCEPT_TYPE);
            if (log.isTraceEnabled()) {
                log.trace("Concept token recognized: " + currentTerm());
            }

        } else if (!lookAhead.isEmpty()) {

            // .. else we push back in the queue the tokens already read
            if (currentTokenValid) {
                lookAhead.add(captureState());
            }
            restoreState(lookAhead.poll());
            // Re-insert at the head: these tokens precede anything that is
            // still queued from an earlier push-back.
            queue.addAll(0, lookAhead);
        }

        // A token is available in the attributes in every branch above, even
        // if the look-ahead ran into the end of the stream.
        return true;
    }

    /**
     * Discards all look-ahead state so that the filter can be reused.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        queue.clear();
        inputExhausted = false;
    }

    /**
     * Reads the next token, preferring tokens that were pushed back earlier.
     * 
     * @return false if neither the queue nor the wrapped stream has a token
     */
    private boolean innerNextToken() throws IOException {
        if (!queue.isEmpty()) {
            restoreState(queue.poll());
            return true;
        }
        if (inputExhausted) {
            return false;
        }
        if (input.incrementToken()) {
            return true;
        }
        inputExhausted = true;
        return false;
    }

    /**
     * @return the term text currently held by the term attribute
     */
    private String currentTerm() {
        return String.copyValueOf(charTermAttribute.buffer(), 0, charTermAttribute.length());
    }

}