// Java tutorial
/** * Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.jetwick.bot; import com.google.inject.Guice; import com.google.inject.Injector; import com.google.inject.Module; import de.jetwick.config.Configuration; import de.jetwick.config.DefaultModule; import de.jetwick.es.ElasticTweetSearch; import de.jetwick.data.JTweet; import de.jetwick.data.JUser; import static de.jetwick.es.ElasticTweetSearch.*; import de.jetwick.es.JetwickQuery; import de.jetwick.es.TweetQuery; import de.jetwick.tw.Credits; import de.jetwick.tw.TwitterSearch; import de.jetwick.tw.cmd.TermCreateCommand; import de.jetwick.util.Helper; import de.jetwick.util.MaxBoundSet; import de.jetwick.util.MyDate; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Map; import java.util.Random; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import twitter4j.TwitterException; /** * Idea: either twitterbot or own UI to show trends! 
* * @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net */ public class Jetwot { public static void main(String[] args) { Map<String, String> params = Helper.parseArguments(args); long interval = 10 * 1000L; try { String str = params.get("interval"); char unit = str.charAt(str.length() - 1); str = str.substring(0, str.length() - 1); if (unit == 'h') { // in hours interval = Long.parseLong(str) * 60 * 60 * 1000L; } else if (unit == 'm') { // in minutes interval = Long.parseLong(str) * 60 * 1000L; } } catch (Exception ex) { logger.warn("Cannot parse interval parameter:" + ex.getMessage()); } int minRT = 15; try { minRT = Integer.parseInt(params.get("minRT")); } catch (Exception ex) { logger.warn("Cannot parse interval parameter:" + ex.getMessage()); } new Jetwot().setMinRT(minRT).start(-1, interval); } private static Logger logger = LoggerFactory.getLogger(Jetwot.class); protected ElasticTweetSearch tweetSearch; protected TwitterSearch tw4j; private int minRT = 25; private MaxBoundSet<JTweet> tweetCache = new MaxBoundSet<JTweet>(50, 100).setMaxAge(3 * 24 * 3600 * 1000L); private TermCreateCommand command = new TermCreateCommand(); private Random rand = new Random(); public void init() { Configuration cfg = new Configuration(); Credits credits = cfg.getJetwotCredits(); Module module = new DefaultModule(); Injector injector = Guice.createInjector(module); tweetSearch = injector.getInstance(ElasticTweetSearch.class); tw4j = new TwitterSearch().setConsumer(credits.getConsumerKey(), credits.getConsumerSecret()); tw4j.initTwitter4JInstance(credits.getToken(), credits.getTokenSecret(), true); try { for (JTweet tw : tw4j.getTweets(tw4j.getUser(), new ArrayList<JUser>(), 20)) { command.calcTermsWithoutNoise(tw); addToCaches(tw); } } catch (Exception ex) { logger.error("Couldn't initialize id cache", ex); } } public void start(int cycles, long interval) { init(); for (int i = 0; cycles < 0 || i < cycles; i++) { logger.info("tweet cache:" + tweetCache.size()); 
Collection<JTweet> newSearchedTweets = search(); JTweet selectedTweet = null; for (JTweet newSearchTw : newSearchedTweets) { command.calcTermsWithoutNoise(newSearchTw); if (newSearchTw.getTextTerms().size() >= 4) { float maxJc = -1; for (JTweet twInCache : tweetCache.values()) { float jcIndex = (float) TermCreateCommand.calcJaccardIndex(twInCache.getTextTerms(), newSearchTw.getTextTerms()); if (maxJc < jcIndex) maxJc = jcIndex; } if (maxJc < 0.2 || maxJc == -1) { selectedTweet = newSearchTw; logger.info("new tweet with max jacc index= " + maxJc + ":" + newSearchTw.getText()); break; } logger.info("skip tweet because max jacc index= " + maxJc + ":" + newSearchTw.getText()); } else { logger.info("skip tweet because too less terms= " + newSearchTw.getTextTerms().size() + " :" + newSearchTw.getText()); } } if (selectedTweet != null) { try { tw4j.doRetweet(selectedTweet.getTwitterId()); addToCaches(selectedTweet); logger.info("=> retweeted:" + selectedTweet.getText() + " " + selectedTweet.getTwitterId()); } catch (Exception ex) { logger.error("Couldn't retweet tweet:" + selectedTweet + " " + ex.getMessage()); if (ex instanceof TwitterException) { TwitterException ex2 = ((TwitterException) ex); if (ex2.exceededRateLimitation()) { logger.error("Remaining hits:" + ex2.getRateLimitStatus().getRemainingHits() + " wait some seconds:" + ex2.getRateLimitStatus().getResetTimeInSeconds()); } } } } // Create tweet for Trending URLS? // every 15 minutes check for new trending url. put title + url into cache // or even better facet over dt (every 20 minutes) and pick up the docs! // f.dest_title_1_s.facet.limit=20 // from this, calculate trend -> up coming urls (new tweets per hour that link to this url) // every 2 hours post a new trending url from cache with the highest up rate + over a certain number of tweets // do no overuse ratelimit ! 
// twitter.postTweet("'Title ABOUT XY' short.url/test"); try { // add some noise when waiting to avoid being identified or filtered out as bot ;-) long tmp = (long) (interval + interval * rand.nextDouble() * 0.3); logger.info("wait " + (tmp / 60f / 1000f) + " minutes => next tweet on: " + new MyDate().plusMillis(tmp)); Thread.sleep(tmp); } catch (InterruptedException ex) { logger.warn("Interrupted " + ex.getMessage()); break; } } } public Collection<JTweet> search() { JetwickQuery query = new TweetQuery(). // should be not too old addFilterQuery(DATE, "[" + new MyDate().minusDays(1).toLocalString() + " TO *]"). // should be high quality addFilterQuery(QUALITY, "[90 TO *]"). // should be the first tweet with this content addFilterQuery(DUP_COUNT, 0). // only tweets which were already tweeted minRT-times addFilterQuery(RT_COUNT, "[" + minRT + " TO *]"). // only original tweets addFilterQuery(IS_RT, false). // for english our spam + dup detection works ok addFilterQuery(ElasticTweetSearch.LANG, "(en OR de OR sp)").setSort(RT_COUNT, "desc").setSize(50); logger.info(query.toString()); int TRIALS = 2; for (int trial = 0; trial < TRIALS; trial++) { try { return tweetSearch.collectObjects(tweetSearch.query(query)); } catch (Exception ex) { logger.error(trial + "| Couldn't query twindex: " + ex.getMessage()); } } return Collections.EMPTY_LIST; } public Jetwot setMinRT(int minRT) { this.minRT = minRT; return this; } protected void addToCaches(JTweet selectedTweet) { tweetCache.add(selectedTweet); } }