de.jetwick.tw.TweetProducerViaStream.java Source code

Introduction

Here is the source code for de.jetwick.tw.TweetProducerViaStream.java.

Source

/**
 * Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.jetwick.tw;

import de.jetwick.data.JTag;
import de.jetwick.data.JTweet;
import de.jetwick.es.ElasticUserSearch;
import de.jetwick.es.ElasticTagSearch;
import de.jetwick.util.Helper;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import twitter4j.TwitterStream;

/**
 * Fills the tweet queue via the Twitter streaming API (does not cost API calls).
 * 
 * @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
 */
public class TweetProducerViaStream extends MyThread implements TweetProducer {

    private final Logger logger = LoggerFactory.getLogger(getClass());
    protected BlockingQueue<JTweet> resultTweets = new LinkedBlockingQueue<JTweet>();
    protected TwitterSearch twSearch;
    protected ElasticTagSearch tagSearch;
    // restart the stream with a refreshed tag list after this many milliseconds
    private long newStreamInterval = 3 * 60 * 1000;
    // only tags whose frequency is below this tweets-per-second limit are streamed
    private double tweetsPerSecLimit = 0.5;

    public TweetProducerViaStream() {
        super("tweet-producer-stream");
    }

    @Override
    public void setQueue(BlockingQueue<JTweet> packages) {
        this.resultTweets = packages;
    }

    @Override
    public void run() {
        TwitterStream stream = null;
        TwitterStream oldStream = null;
        // we cannot detect the frequency of all terms, but we can detect and remove highly frequent disturbers
        Map<String, Integer> termFreq = new LinkedHashMap<String, Integer>();

        // counts the tweets indexed in one streaming round (reported when starting over)
        int counter = 0;
        while (true) {
            try {
                // stream only the LESS FREQUENT tags! leave the popular tags for normal search
                Collection<String> input = initTags(termFreq);
                termFreq.clear();
                if (input.isEmpty()) {
                    logger.error("No less frequent tags found! Frequency limit:" + tweetsPerSecLimit);
                    if (!myWait(10))
                        break;
                    continue;
                }
                logger.info("Starting over with " + input.size() + " tags. indexed tweets:" + counter
                        + " tweetsPerSecLimit:" + tweetsPerSecLimit + " " + input);
                counter = 0;
                if (stream != null)
                    oldStream = stream;

                // use a separate queue for each round so that the listener added in the
                // streamingTwitter method can be released during garbage collection
                BlockingQueue<JTweet> queue = new LinkedBlockingQueue<JTweet>(1000);
                stream = twSearch.streamingTwitter(input, queue);

                // shutdown old stream
                if (oldStream != null) {
                    oldStream.shutdown();
                    //                    oldStream.cleanUp();
                }

                long start = System.currentTimeMillis();
                while (true) {
                    JTweet tw = queue.take();
                    String matchingTerm = null;
                    String txt = tw.getLowerCaseText();
                    for (String term : input) {
                        if (txt.contains(term)) {
                            matchingTerm = term;
                            break;
                        }
                    }
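                    // note: matchingTerm stays null if none of the tracked terms occurs in the
                    // lower-cased tweet text (e.g. the stream matched a URL or user name);
                    // such tweets are then counted under the null key in termFreq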
                    resultTweets.put(tw.setFeedSource("from stream:" + matchingTerm));
                    Integer integ = termFreq.put(matchingTerm, 1);
                    if (integ != null)
                        termFreq.put(matchingTerm, integ + 1);

                    counter++;
                    // UPDATE tags after a while
                    if ((System.currentTimeMillis() - start) > newStreamInterval)
                        break;
                }
            } catch (Exception ex) {
                logger.error("!! Error while getting tweets via streaming API. Waiting and trying again.", ex);
                if (!myWait(60 * 5))
                    break;
            }
        }

        logger.info(getName() + " finished");
    }

    @Override
    public void setTwitterSearch(TwitterSearch tws) {
        this.twSearch = tws;
    }

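    /**
     * Queries the tag index for low-frequency tags and filters out terms that turned out to
     * be too frequent in the previous streaming round, that contain too many spaces or that
     * contain " OR ". Returns at most 400 terms to be tracked by the stream.
     */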
    public Collection<String> initTags(Map<String, Integer> termFreq) {
        Map<String, JTag> tags = new LinkedHashMap<String, JTag>();
        try {
            for (JTag tag : tagSearch.findLowFrequent(0, 1000, tweetsPerSecLimit)) {
                if (tag.getTerm() != null) {
                    // the information in the index is based on old search data; check whether the
                    // 'realtime' tweets-per-second rate is also below the limit (e.g. with the default
                    // 3-minute interval, more than 90 counted tweets means a rate above 0.5 tweets/sec)
                    Integer counts = termFreq.get(tag.getTerm());
                    if (counts != null && counts / (newStreamInterval / 1000f) > tweetsPerSecLimit) {
                        logger.info("Detected tag with a too high frequency (based on stream data):" + tag
                                + " stream-counts:" + counts);
                        continue;
                    }
                    int spaces = Helper.countChars(tag.getTerm(), ' ');
                    if (spaces > 7) {
                        logger.info("Skipping term " + tag.getTerm() + " because too many spaces:" + spaces);
                        continue;
                    }

                    if (tag.getTerm().contains(" OR ")) {
                        logger.warn("Hmmh somewhere the OR came into the tag index!?");
                        continue;
                    }

                    tags.put(tag.getTerm(), tag);
                }
            }
        } catch (Exception ex) {
            logger.info("Couldn't query tag index", ex);
        }
        // TODO further remove overlapping tags like 'wicket' and 'apache wicket'
        Set<String> input = new LinkedHashSet<String>();
        int MAX_TAGS = 400;
        MAIN: for (JTag t : tags.values()) {
            String term = t.getTerm();
            if (input.size() >= MAX_TAGS) {
                logger.error("Too many Tags - Cannot further add tags!" + input.size());
                break MAIN;
            }
            // filter by links only does NOT work!!
            input.add(term.trim());// + " " + TwitterSearch.LINK_FILTER);
        }
        return input;
    }

    @Override
    public void setTagSearch(ElasticTagSearch tagSearch) {
        this.tagSearch = tagSearch;
    }

    @Override
    public void setUserSearch(ElasticUserSearch userSearch) {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    public void setNewStreamInterval(long millis) {
        newStreamInterval = millis;
    }

    public void setTweetsPerSecLimit(double tweetsPerSecLimit) {
        this.tweetsPerSecLimit = tweetsPerSecLimit;
    }
}
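
For context, here is a minimal sketch of how this producer could be wired up and started. It assumes that MyThread extends Thread (so start() runs the loop above) and that already configured TwitterSearch and ElasticTagSearch instances are available from the surrounding Jetwick setup; the example class and method names below are hypothetical.

package de.jetwick.tw;

import de.jetwick.data.JTweet;
import de.jetwick.es.ElasticTagSearch;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class TweetProducerViaStreamExample {

    // twSearch and tagSearch are assumed to be already configured by the surrounding application
    public static void startProducer(TwitterSearch twSearch, ElasticTagSearch tagSearch)
            throws InterruptedException {
        TweetProducerViaStream producer = new TweetProducerViaStream();
        producer.setTwitterSearch(twSearch);
        producer.setTagSearch(tagSearch);

        // the consuming side reads streamed tweets from this queue
        BlockingQueue<JTweet> queue = new LinkedBlockingQueue<JTweet>(10000);
        producer.setQueue(queue);

        producer.setNewStreamInterval(5 * 60 * 1000); // refresh the tracked tags every 5 minutes
        producer.setTweetsPerSecLimit(0.5);           // only stream tags below 0.5 tweets/sec

        producer.start(); // assumes MyThread extends Thread

        // drain one tweet as a demonstration; a real consumer would loop
        JTweet tweet = queue.take();
        System.out.println("received: " + tweet.getLowerCaseText());
    }
}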