edu.cmu.cs.lti.discoursedb.io.twitter.converter.TwitterConverter.java Source code

Introduction

Here is the source code for edu.cmu.cs.lti.discoursedb.io.twitter.converter.TwitterConverter.java
Source

/*******************************************************************************
 * Copyright (C)  2015 - 2016  Carnegie Mellon University
 * Author: Oliver Ferschke
 *
 * This file is part of DiscourseDB.
 *
 * DiscourseDB is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * DiscourseDB is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with DiscourseDB.  If not, see <http://www.gnu.org/licenses/> 
 * or write to the Free Software Foundation, Inc., 51 Franklin Street, 
 * Fifth Floor, Boston, MA 02110-1301  USA
 *******************************************************************************/
package edu.cmu.cs.lti.discoursedb.io.twitter.converter;

import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;

import org.bson.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;
import org.springframework.util.Assert;

import com.mongodb.BasicDBObject;
import com.mongodb.Block;
import com.mongodb.MongoClient;
import com.mongodb.util.JSON;

import edu.cmu.cs.lti.discoursedb.io.twitter.model.PemsStationMetaData;
import lombok.extern.log4j.Log4j;
import twitter4j.Status;
import twitter4j.TwitterException;
import twitter4j.TwitterObjectFactory;

@Log4j
@Component
public class TwitterConverter implements CommandLineRunner {

    @Autowired
    TwitterConverterService converterService;

    private final static int MAX_DIST = 5000;

    MongoClient mongoClient;

    @Override
    public void run(String... args) throws ParseException {
        Assert.isTrue(args.length == 5 || args.length == 7,
                "Usage: TwitterConverterApplication <DiscourseName> <DataSetName> <MongoDbHost> <MongoTwitterDatabaseName> <MongoTwitterCollectionName> [ <PemsMetaMongoDataDatabaseName> <PemsMetaDataMongoCollectionName> optional]");

        String discourseName = args[0];
        String datasetName = args[1];
        String dbHost = args[2];
        String twitterDbName = args[3];
        String twitterCollectionName = args[4];
        String pemsMetaDbName = null;
        String pemsMetaCollectionName = null;
        if (args.length == 7) {
            pemsMetaDbName = args[5];
            pemsMetaCollectionName = args[6];
        }

        // start data import
        this.convert(discourseName, datasetName, dbHost, twitterDbName, twitterCollectionName, pemsMetaDbName,
                pemsMetaCollectionName);

        // retrieve timelines for all imported users
        this.retrieveTimelines(discourseName, datasetName, dbHost, twitterDbName, twitterCollectionName);

    }

    /**
     * Opens a connection to a MongoDB instance, reads all Documents in the
     * provided collection of tweets and passes them on to the Tweet mapper.
     * 
     * @param discourseName
     *            the name of the DiscourseDB discourse
     * @param datasetName
     *            the database identifier
     * @param dbHost
     *            the host of the database server (e.g. "localhost")
     * @param dbName
     *            the name of the database containing the tweet collection
     * @param collectionName
     *            the name of the collection containing the tweet documents
     */
    private void convert(String discourseName, String datasetName, String dbHost, String twitterDbName,
            String twitterCollectionName, String pemsDB, String pemsCollection) {
        Assert.hasText(discourseName, "The discourse name has to be specified and cannot be empty.");
        Assert.hasText(datasetName, "The dataset name has to be specified and cannot be empty.");
        Assert.hasText(dbHost, "The MongoDB database host has to be specified and cannot be empty.");
        Assert.hasText(twitterDbName, "The database name has to be specified and cannot be empty.");
        Assert.hasText(twitterCollectionName, "The collection name has to be specified and cannot be empty.");

        mongoClient = new MongoClient(dbHost); // assuming standard port

        log.info("Starting to import tweets from MongoDB database \"" + twitterDbName + "\" collection \""
                + twitterCollectionName + "\"");

        mongoClient.getDatabase(twitterDbName).getCollection(twitterCollectionName).find()
                .forEach((Block<Document>) d -> {
                    if (pemsDB != null && pemsCollection != null) {
                        // add PEMS station meta data
                        converterService.mapTweet(discourseName, datasetName, document2Tweet(d),
                                document2Pems(d, pemsDB, pemsCollection));
                    } else {
                        // do not add PEMS station meta data
                        converterService.mapTweet(discourseName, datasetName, document2Tweet(d), null);
                    }
                });

        log.info("Finished importing tweets.");

        // TODO we need a second pass to create relations between tweets.
        // right now, they are stored as annotations (ids).
        // we might be able to collect information about the relations in the
        // first pass that we can use to
        // create relations without doing a full iteration over the whole mongo
        // database again

        mongoClient.close();
    }

    /**
     * Parses a MongoDB Document that represents a tweet into a Twitter4J Status
     * object
     * 
     * @param tweetDocument
     *            a MongoDB document representing a tweet
     * @return a Twitter4J Status object representing the tweet
     */
    private Status document2Tweet(Document tweetDocument) {
        Assert.notNull(tweetDocument, "The mongodb document representing the tweet to be parsed cannot be null.");

        Status stat = null; // if parsing fails, null will be returns and mapper
        // will skip the tweet
        try {
            stat = TwitterObjectFactory.createStatus(tweetDocument.toJson());
        } catch (TwitterException e) {
            log.warn("Could not parse tweet from document", e);
        }
        return stat;
    }

    /**
     * 
     * @param tweetDocument
     *            a MongoDB document representing a tweet
     * @param pemsMetaDb
     *            database with the pems data
     * @param pemsMetaCollection
     *            collection with the pems meta data
     * @param pemsdatacollection
     *            collection with the pems station data
     * @return an object containing the pems station data to be mapped to the
     *         tweet
     */
    private PemsStationMetaData document2Pems(Document tweetDocument, String pemsMetaDb,
            String pemsMetaCollection) {

        Assert.notNull(tweetDocument, "The mongodb document representing the tweet to be parsed cannot be null.");
        PemsStationMetaData pems = null;

        Document location = (Document) tweetDocument.get("location");
        Document place = (Document) tweetDocument.get("place");
        if (location != null && !location.isEmpty()) {
            // if available, map gps tag to station
            log.trace("Mapping gps location to PEMS station data.");

            BasicDBObject query = (BasicDBObject) JSON.parse("{location:{$near:{$geometry:" + location.toJson()
                    + ",$minDistance:0,$maxDistance: " + MAX_DIST + "}}}");
            Document metaData = (Document) mongoClient.getDatabase(pemsMetaDb).getCollection(pemsMetaCollection)
                    .find(query).first();
            pems = new PemsStationMetaData(metaData);
        } else if (place != null && !place.isEmpty()) {
            // if no gps tag is available, map place to station if available
            log.trace("Mapping place tag to PEMS station data.");
            log.warn("Not yet implemented yet."); // TODO implement place
            // mapping

        } else {
            // no gps tag or place available - do nothing for now
            log.trace("No location or place available for PEMS station data mapping.");
        }

        return pems;
    }

    private void retrieveTimelines(String discourseName, String datasetName, String dbHost, String twitterDbName,
            String twitterCollectionName) {

        List<String> usernames = new ArrayList<>();
        mongoClient = new MongoClient(dbHost); // assuming standard port

        log.debug("Import usernames from MongoDB database \"" + twitterDbName + "\" collection \""
                + twitterCollectionName + "\"");
        mongoClient.getDatabase(twitterDbName).getCollection(twitterCollectionName)
                .distinct("user.screen_name", String.class).into(usernames);

        converterService.importUserTimelines(usernames, discourseName, datasetName);
    }

}