com.digitalpebble.storm.crawler.spout.FileSpout.java Source code


Introduction

Here is the source code for com.digitalpebble.storm.crawler.spout.FileSpout.java. The class reads the lines of one or more UTF-8 files into memory and emits each non-blank line as a tuple, using a configurable Scheme to deserialize the bytes and declare the output fields.

Source

/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.storm.crawler.spout;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import backtype.storm.spout.Scheme;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;

/**
 * Reads the lines from one or more UTF-8 files and uses them as a spout,
 * loading the entire content into memory.
 */
@SuppressWarnings("serial")
public class FileSpout extends BaseRichSpout {

    private SpoutOutputCollector _collector;
    private String[] _inputFiles;
    private Scheme _scheme;

    private LinkedList<byte[]> toPut = new LinkedList<byte[]>();
    private boolean active;

    public static final Logger LOG = LoggerFactory.getLogger(FileSpout.class);

    public FileSpout(String dir, String filter, Scheme scheme) {
        Path pdir = Paths.get(dir);
        List<String> f = new LinkedList<String>();
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(pdir, filter)) {
            for (Path entry : stream) {
                String inputFile = entry.toAbsolutePath().toString();
                f.add(inputFile);
                LOG.info("Input : {}", inputFile);
            }
        } catch (IOException ioe) {
            LOG.error("IOException: %s%n", ioe);
        }
        _inputFiles = f.toArray(new String[f.size()]);
        _scheme = scheme;
    }

    public FileSpout(String file, Scheme scheme) {
        this(scheme, file);
    }

    public FileSpout(Scheme scheme, String... files) {
        if (files.length == 0) {
            throw new IllegalArgumentException("Must configure at least one inputFile");
        }
        _scheme = scheme;
        _inputFiles = files;
    }

    @SuppressWarnings("rawtypes")
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        _collector = collector;

        for (String inputFile : _inputFiles) {
            Path inputPath = Paths.get(inputFile);
            try (BufferedReader reader = Files.newBufferedReader(inputPath, StandardCharsets.UTF_8)) {
                String line = null;
                while ((line = reader.readLine()) != null) {
                    if (StringUtils.isBlank(line))
                        continue;
                    toPut.add(line.getBytes(StandardCharsets.UTF_8));
                }
            } catch (IOException x) {
                System.err.format("IOException: %s%n", x);
            }
        }
    }

    @Override
    public void nextTuple() {
        if (!active)
            return;
        if (toPut.isEmpty())
            return;
        byte[] head = toPut.removeFirst();
        List<Object> fields = this._scheme.deserialize(head);
        // use the first output field (typically the URL) as the message id so
        // downstream bolts can ack or fail individual lines
        this._collector.emit(fields, fields.get(0).toString());
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(_scheme.getOutputFields());
    }

    @Override
    public void close() {
    }

    @Override
    public void activate() {
        super.activate();
        active = true;
    }

    @Override
    public void deactivate() {
        super.deactivate();
        active = false;
    }
}
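
Usage

For context, here is a minimal sketch of how the spout can be wired into a topology. The LineScheme helper, the ./seeds directory, the *.txt glob and the topology and component names are illustrative assumptions, not part of storm-crawler; the Scheme signatures follow the pre-1.0 backtype.storm API used by the class above.

import java.nio.charset.StandardCharsets;
import java.util.List;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.spout.Scheme;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

import com.digitalpebble.storm.crawler.spout.FileSpout;

public class FileSpoutTopology {

    /** Hypothetical scheme: each line becomes a single-field "url" tuple. */
    @SuppressWarnings("serial")
    public static class LineScheme implements Scheme {
        @Override
        public List<Object> deserialize(byte[] ser) {
            return new Values(new String(ser, StandardCharsets.UTF_8));
        }

        @Override
        public Fields getOutputFields() {
            return new Fields("url");
        }
    }

    public static void main(String[] args) throws Exception {
        TopologyBuilder builder = new TopologyBuilder();
        // read every *.txt file under ./seeds and emit one tuple per non-blank line
        builder.setSpout("filespout", new FileSpout("./seeds", "*.txt", new LineScheme()));
        // downstream bolts would be attached here, e.g.
        // builder.setBolt("parse", new MyBolt()).shuffleGrouping("filespout");

        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("file-injection", new Config(), builder.createTopology());
    }
}

Because nextTuple() emits each tuple with its first output field as the message id, a topology built this way can ack or fail individual input lines.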