uk.ac.ed.inf.ace.processors.RemoveStopwords.java Source code

Java tutorial

Introduction

Here is the source code for uk.ac.ed.inf.ace.processors.RemoveStopwords.java

Source

/*
 * Copyright 2012 Daniel Renshaw <d.renshaw@sms.ed.ac.uk>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package uk.ac.ed.inf.ace.processors;

import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSet.Builder;
import com.google.common.collect.Iterators;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.Iterator;
import java.util.Set;
import uk.ac.ed.inf.ace.Engine;
import uk.ac.ed.inf.ace.ProcessorBase;
import uk.ac.ed.inf.ace.ReadWriteableDocument;

/**
 * @author "Daniel Renshaw" <d.renshaw@sms.ed.ac.uk>
 */
public class RemoveStopwords extends ProcessorBase<Engine<?, ?>, uk.ac.ed.inf.ace.config.v1.RemoveStopwords> {

    private final Set<String> stopwords;

    public RemoveStopwords(Engine<?, ?> engine, uk.ac.ed.inf.ace.config.v1.RemoveStopwords config)
            throws Exception {
        super(engine, config);
        File file = new File(config.getPathname());
        Preconditions.checkState(file.exists());

        FileReader fileReader = new FileReader(file);
        BufferedReader bufferedReader = new BufferedReader(fileReader);
        Builder<String> builder = ImmutableSet.builder();
        String line;

        while ((line = bufferedReader.readLine()) != null) {
            line = line.trim();

            if (line.isEmpty()) {
                continue;
            }

            builder.add(line.toLowerCase());
        }

        stopwords = builder.build();
    }

    private final Predicate<String> IS_NOT_STOPWORD = new Predicate<String>() {
        @Override
        public boolean apply(String input) {
            return !stopwords.contains(input);
        }
    };

    @Override
    public ReadWriteableDocument process(ReadWriteableDocument document) {
        @SuppressWarnings("unchecked")
        Iterator<String> tokens = (Iterator<String>) document.getContent();
        document.setContent(Iterators.filter(tokens, IS_NOT_STOPWORD));
        return document;
    }
}