crawler.CETReCrawler.java Source code

Java tutorial

Introduction

Here is the source code for crawler.CETReCrawler.java

Source

 /* Copyright [2014] [Xinyue Wang]
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
 */
 package crawler;

 import java.io.BufferedReader;
 import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.sql.SQLException;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.Date;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.TreeMap;

 import org.apache.mahout.common.distance.CosineDistanceMeasure;
 import org.apache.mahout.math.Vector;

 import twitter4j.FilterQuery;
 import twitter4j.TwitterStream;
 import twitter4j.TwitterStreamFactory;

 import crawler.util.*;

 public class CETReCrawler {
     public static SimpleDateFormat fm = new SimpleDateFormat("MM|dd-HH:mm");
     private String fileFolder = "KeyWord", fileName;
     private int trackThrsd;
     public static TreeMap<String, Integer> TFFreq;

     private TwitterStream twitterStream;
     private static DataStorage dataStore;
     private BufferedWriter out;
     private FilterQuery query = new FilterQuery();

     public static DataStream dataStream;
     public static TimeCal Timer;
     public static boolean expire;
     public static boolean finishStore = false;

public TwitterCrawler() {
      
   String appendix = getFilenameApd();
   fileName = "./"+fileFolder+"/"+appendix;
   Settings.tableName = appendix+"TF";
   trackThrsd = Settings.keywordMax - Settings.baseKeywords.length;
      
   checkFolder();
   Settings.queryKeywords = Settings.baseKeywords;
   writeKeywordFile(Settings.baseKeywords,false);
   writeKeywordFile(Settings.blackList,true); //blacklist writing

   estabTwitterConn();
   try {startCollection();} 
       catch (SQLException e) {e.printStackTrace();}
}

     private void estabTwitterConn() {
         OAuthUser OA = new OAuthUser();
         query.track(Settings.baseKeywords);
         twitterStream = new TwitterStreamFactory(OA.build(Settings.ConsumerKey.get(0),
                 Settings.ConsumerSecret.get(0), Settings.AccessToken.get(0), Settings.AccessSecret.get(0)))
                         .getInstance();

         dataStream = new DataStream();
         twitterStream.addListener(dataStream);
         twitterStream.filter(query);
     }

     public static String getFilenameApd() {
         String date = fm.format(new Date());
         date = "T" + date.replace("-", "");
         date = date.replace(":", "");
         date = date.replace("|", "");
         return date;
     }

     private void startCollection() throws SQLException {
         Timer = new TimeCal(Settings.timer);
         Timer.start();

         dataStore = new DataStorage();
         dataStore.initialTable();

         dataStore.start();

         while (true) {
             System.out.print("");
             if (TwitterCrawler.finishStore) {
                 TwitterCrawler.finishStore = false;

                 TFFreq = new TreeMap<String, Integer>(Settings.TFHashtagFreq);

                 checkBlackList();
                 checkMinFreq();

                 ArrayList<Entry<String, Integer>> list = sortByValue(TFFreq);
                 /*for(Entry<String, Integer>   entry   :   list){
                    String hash = entry.getKey();
                    int freq = entry.getValue();
                    for(int i = 0; i<Settings.baseKeywords.length; i++){
                       if(hash.replace("#", "").equals(Settings.baseKeywords[i].toLowerCase())){
                  list.remove(hash);
                  System.out.print("{REMOVED}");
                       }
                    }
                    System.out.println("Hashtag <"+hash+"> got frequency "+freq);
                 }*/

                 TFFreq = getTopN(list);
                 String topNList = "";
                 for (String keyword : TFFreq.keySet().toArray(new String[TFFreq.size()]))
                     topNList += keyword + ", ";
                 System.out.printf("************ %3d ************ Top-N Keyword List: %s\n", TFFreq.size(),
                         topNList);

                 //if using grow ratio to get top N
                 //ArrayList<Entry<String,Integer>> listRatio = sortByValue_growRatio();
                 //TFFreq = getTopN_growRatio(listRatio);

                 /*//print sample table
                 Iterator<Entry<String, double[]>> iter= TFFreq_sample.entrySet().iterator(); 
                 while(iter.hasNext()){
                    Entry<String, double[]> ent = iter.next();
                    String hashtag = ent.getKey();
                    double[] freq = ent.getValue();
                       
                    String s="";
                    for(double d:freq)   s+=d+", ";
                    System.out.println("hashtag {"+hashtag+"} got freq {"+s+"}");
                 }*/
                 putLastKeywords();

                 Settings.queryKeywords = distFilter();

                 String queryList = "";
                 for (String keyword : Settings.queryKeywords)
                     queryList += keyword + ", ";
                 System.out.printf("************ %3d ************ Queried Keyword List: %s\n",
                         Settings.queryKeywords.length, queryList);

                 writeKeywordFile(Settings.queryKeywords, false);

                 query.track(Settings.queryKeywords);
                 twitterStream.filter(query);

                 for (String hash : Settings.queryKeywords) {
                     if (Settings.TFHashtagFreq.containsKey(hash)) {
                         Settings.TFHashtagFreq_last.put(hash, Settings.TFHashtagFreq.get(hash));
                     }
                 }
                 Timer.restart();
                 Settings.TFHashtagFreq.clear();
                 TFFreq.clear();
             }
         }
     }

     private void putLastKeywords() {
         //??????not sure to keep this!!!
         //check if there are keywords in last time frame but not in this time frame 
         //(given the condition that there is space remaining)
         int count = 0;
         String temp = "";
         if (TFFreq.size() < Settings.keywordMax) {
             for (String lastKey : Settings.TFHashtagFreq_last.keySet()
                     .toArray(new String[Settings.TFHashtagFreq_last.size()])) {
                 if (!TFFreq.keySet().contains(lastKey)) {
                     TFFreq.put(lastKey, Settings.TFHashtagFreq.get(lastKey));
                     count++;
                     temp += lastKey + ", ";
                 }
             }
         }
         System.out.printf("************ %3d ************ Added Last Frame Keywords: %s\n", count, temp);
         Settings.TFHashtagFreq_last.clear();

     }

     private String[] distFilter() {
         DataSearch keywordsFilter = new DataSearch(TFFreq.keySet().toArray(new String[TFFreq.size()])); //pass potential keywords
         keywordsFilter.collectNewTweets();
         keywordsFilter.calculateTFIDF();

         ArrayList<String> keywordsFreqName = new ArrayList<String>();//
         ArrayList<Vector> keywordsFreq = new ArrayList<Vector>(); //list of keywords vector
         ArrayList<String> keywords = new ArrayList<String>(); //list of keywords

         for (String basehash : Settings.baseKeywords) {
             keywordsFreqName.add(basehash);//
             keywordsFreq.add(keywordsFilter.getSeedVect(basehash));
             keywords.add(basehash);
         }

         int count = 0;
         while (!keywordsFreq.isEmpty()) {
             String seq1Name = keywordsFreqName.get(0);//
             Vector seq1 = keywordsFreq.get(0);
             Vector seq2;
             Iterator<Entry<String, Integer>> iter = TFFreq.entrySet().iterator();
             while (iter.hasNext()) {
                 Entry<String, Integer> ent = iter.next();
                 String hashtag = ent.getKey();
                 seq2 = keywordsFilter.getSeedVect(hashtag);

                 //System.out.print("seq 1 ["+seq1Name+"] count ["+TFFreq.get(seq1Name)+"]: ");//
                 //System.out.println(seq1.asFormatString());
                 //System.out.print("seq 2 ["+hashtag+"] count ["+TFFreq.get(hashtag)+"]: ");
                 if (seq1 != null && seq2 != null) {
                     //System.out.println(seq2.asFormatString());
                     CosineDistanceMeasure cos = new CosineDistanceMeasure();
                     double distVal = cos.distance(seq1, seq2);
                     //System.out.println("***********["+hashtag+"] v.s ["+seq1Name+"]: "+distVal+"***********");

                     //check whether one of the two words is baseline criteria
                     boolean inBase = false;
                     for (String basehash : Settings.baseKeywords) {
                         if (hashtag.replace("#", "").equals(basehash.toLowerCase()) || hashtag.equals(basehash)
                                 || keywords.get(count).replace("#", "").equals(basehash.toLowerCase())
                                 || keywords.get(count).equals(basehash)) {
                             inBase = true;
                             break;
                         }
                     }

                     //base 0.8, others 0.5
                     if (inBase) {
                         if (distVal < 0.8 && distVal > 0.00001 && !keywords.contains(hashtag)) {
                             keywordsFreqName.add(hashtag);//
                             keywordsFreq.add(seq2);
                             keywords.add(hashtag);
                             System.out.println("BASEL***********[" + hashtag + "] v.s [" + seq1Name + "]: "
                                     + distVal + "***********");
                         }
                     } else {
                         if (distVal < 0.5 && distVal > 0.00001 && !keywords.contains(hashtag)) {
                             keywordsFreqName.add(hashtag);//
                             keywordsFreq.add(seq2);
                             keywords.add(hashtag);
                             System.out.println("OTHER***********[" + hashtag + "] v.s [" + seq1Name + "]: "
                                     + distVal + "***********");
                         }
                     }
                 }
             }
             keywordsFreqName.remove(0);//
             keywordsFreq.remove(0);
             count++;
         }

         return keywords.toArray(new String[keywords.size()]);
     }

     /*private TreeMap<String, Integer> getTopN_growRatio(ArrayList<Entry<String, Integer>> listRatio) {
        TreeMap<String, Integer> keywords = new TreeMap<String, Integer>();
        
        //baseline criteria
        for(int i = 0; i<Settings.baseKeywords.length; i++){
      keywords.put(Settings.baseKeywords[i], 10000);
        }
           
        //top list
        int count = 1;
        for(Entry<String, Integer>   entry   :   listRatio){
      String hash = entry.getKey();
      int freq = TFFreq.get(hash);
      boolean inBase = false;
      for(String basehash:Settings.baseKeywords){
         if(hash.replace("#", "").equals(basehash.toLowerCase())){
            inBase = true;
            break;
         }
      }
      if(!inBase){
         keywords.put(hash, freq);
         count ++;
      }
      if(count>trackThrsd) break;
        }
           
        //check if there are keywords in last time frame but not in this time frame 
        //(given the condition that there is space remaining)
        ArrayList<Entry<String,Integer>> listOri = sortByValue(Settings.TFHashtagFreq_last);
        int remain = Settings.keywordMax - keywords.size();
        if(remain > 0){
      for(Entry<String, Integer>   entry   :   listOri){
         String hash = entry.getKey();
         int freq = entry.getValue();
         if(!keywords.containsKey(hash)){
            keywords.put(hash, freq);
            remain--;
            if(remain == 0) break;
         }
      }
        }
           
        //kept for using growing ratio approach
        if(Settings.TFHashtagFreq_last.size()!=0){
      Settings.TFHashtagFreq_last.clear();
        }
        Settings.TFHashtagFreq_last = new TreeMap<String,Integer>(keywords);
        return keywords;
     }*/

     private TreeMap<String, Integer> getTopN(ArrayList<Entry<String, Integer>> list) {
         TreeMap<String, Integer> keywords = new TreeMap<String, Integer>();

         //baseline criteria
         for (int i = 0; i < Settings.baseKeywords.length; i++) {
             keywords.put(Settings.baseKeywords[i], 10000);
         }

         //top list
         int count = 1;
         for (Entry<String, Integer> entry : list) {
             String hash = entry.getKey();
             int freq = entry.getValue();
             boolean inBase = false;
             for (String basehash : Settings.baseKeywords) {
                 if (hash.replace("#", "").equals(basehash.toLowerCase())) {
                     inBase = true;
                     break;
                 }
             }
             if (!inBase) {
                 keywords.put(hash, freq);
                 count++;
             }
             if (count > trackThrsd)
                 break;
         }

         /*//check if there are keywords in last time frame but not in this time frame 
         //(given the condition that there is space remaining)
         if(keywords.size()<Settings.keywordMax){
            for(String lastKey : Settings.TFHashtagFreq_last.keySet().toArray(new String[Settings.TFHashtagFreq_last.size()])){
         if(!keywords.containsKey(lastKey))
            keywords.put(lastKey, Settings.TFHashtagFreq_last.get(lastKey));
            }
         }
            
         //kept for using growing ratio approach
         if(Settings.TFHashtagFreq_last.size()!=0){
            Settings.TFHashtagFreq_last.clear();
         }
         Settings.TFHashtagFreq_last = new TreeMap<String,Integer>(keywords);*/
         return keywords;
     }

     private ArrayList<Entry<String, Integer>> sortByValue(TreeMap<String, Integer> keywords) {
         ArrayList<Entry<String, Integer>> list = new ArrayList<Entry<String, Integer>>(keywords.entrySet());
         Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
             @Override
             public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                 return (o2.getValue() - o1.getValue());
             }
         });
         return list;
     }

     private void checkMinFreq() {
         String minFreqInTFList = "";
         int num = 0;

         TreeMap<String, Integer> TFFreqTemp = new TreeMap<String, Integer>(TFFreq);
         Iterator<Entry<String, Integer>> iter = TFFreqTemp.entrySet().iterator();
         while (iter.hasNext()) {
             Entry<String, Integer> entry = iter.next();
             String hash = entry.getKey();
             int freq = entry.getValue();
             if (freq <= Settings.minNewFreq) {
                 TFFreq.remove(hash);
                 //System.out.println(hash+":"+freq);
                 minFreqInTFList += hash + ", ";
                 num++;
             }
         }
         System.out.printf("************ %3d ********** Removed LowFreq Words: %s\n", num, minFreqInTFList);
     }

     private void checkBlackList() {
         try {
             BufferedReader in = new BufferedReader(new FileReader(fileName + "BlackList.txt"));
             String temp = in.readLine();
             String blackInTFList = "";
             int num = 0;
             if (temp != null) {
                 String[] blackList = temp.split(", ");
                 for (String blackWord : blackList) {
                     if (TFFreq.containsKey(blackWord)) {
                         TFFreq.remove(blackWord);
                         blackInTFList += blackWord + ", ";
                         num++;
                     }
                 }
                 System.out.printf("************ %3d ********** Removed Black Words: %s\n", num, blackInTFList);
                 /*out = new BufferedWriter(new FileWriter(fileName+"BlackWordList.txt",true));
                 out.append(fm.format(new Date())+temp+"\n");
                 out.close();*/
             }
             in.close();
         } catch (IOException e) {
             e.printStackTrace();
         }

     }

     private void writeKeywordFile(String[] keywords, boolean black) {
         String temp = (black ? "" : ": ");
         for (String s : keywords) {
             temp += s + ", ";
         }
         try {
             if (!black) {
                 out = new BufferedWriter(new FileWriter(fileName + "KeywordsList.txt", true));
                 out.append(fm.format(new Date()) + temp + "\n");
             } else {
                 out = new BufferedWriter(new FileWriter(fileName + "BlackList.txt"));
                 out.append(temp);
             }

             out.close();
         } catch (IOException e) {
             e.printStackTrace();
         }
     }

     private void checkFolder() {
         File KeywordFolder;
         boolean stateKeywordFolder;

         KeywordFolder = new File("./KeyWord");
         stateKeywordFolder = KeywordFolder.exists();

         if (stateKeywordFolder == false) {
             System.out.println("The 'KeyWord' folder do not exist,trying to create one...");
             stateKeywordFolder = KeywordFolder.mkdir();
             if (stateKeywordFolder == false) {
                 System.out.println("Unable to create the folder,please check disk ...");
                 System.exit(1);
             }
         }

     }

     /**
      * @param args
      */
     public static void main(String[] args) {
         interactInput();
         new TwitterCrawler();
     }

     private static void interactInput() {
         for (int i = 0; i < Settings.ConsumerKeyArray.length; i++) {
             Settings.ConsumerKey.add(Settings.ConsumerKeyArray[i]);
             Settings.ConsumerSecret.add(Settings.ConsumerSecretArray[i]);
             Settings.AccessToken.add(Settings.AccessTokenArray[i]);
             Settings.AccessSecret.add(Settings.AccessSecretArray[i]);
         }

         try {
             System.out.printf("********************DEFAULT VALUE******************\n");
             for (int i = 0; i < Settings.baseKeywords.length; i++) {
                 System.out.printf("************ Query %1d: %15s *************\n", i + 1, Settings.baseKeywords[i]);
             }
             System.out.printf("****************** Timer: %1d mins ******************\n",
                     Settings.timer / 60 / 1000);
             System.out.printf("***************************************************\n");

             System.out
                     .print("Please enter the query terms(using comma to differentiate, press enter for default): ");
             BufferedReader bufferRead = new BufferedReader(new InputStreamReader(System.in));
             String temp = bufferRead.readLine();
             if (!temp.equals("")) {
                 Settings.baseKeywords = temp.split(",");
             } else {
                 System.out.println("Default value used!");
             }

             System.out.print("Please enter the time interval in minutes(press enter for default): ");
             bufferRead = new BufferedReader(new InputStreamReader(System.in));
             temp = bufferRead.readLine();
             if (!temp.equals("")) {
                 double inNum = Double.parseDouble(temp);
                 Settings.timer = (long) (inNum * 60 * 1000);
             } else {
                 System.out.println("Default value used!");
             }

             System.out.printf("*******************ACTUAL VALUE********************\n");
             for (int i = 0; i < Settings.baseKeywords.length; i++) {
                 System.out.printf("************ Query %2d: %15s ************\n", i + 1, Settings.baseKeywords[i]);
             }
             System.out.printf("**************** Timer: %2.1f mins ******************\n",
                     (double) Settings.timer / 60 / 1000);
             System.out.printf("***************************************************\n");

         } catch (IOException ioe) {
             System.out.println("IO error!");
             System.exit(1);
         }
     }
 }