edu.cmu.cs.lti.discoursedb.io.twitter.converter.TwitterConverterService.java Source code

Introduction

Here is the source code for edu.cmu.cs.lti.discoursedb.io.twitter.converter.TwitterConverterService.java

Source

/*******************************************************************************
 * Copyright (C)  2015 - 2016  Carnegie Mellon University
 * Author: Oliver Ferschke
 *
 * This file is part of DiscourseDB.
 *
 * DiscourseDB is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * DiscourseDB is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with DiscourseDB.  If not, see <http://www.gnu.org/licenses/> 
 * or write to the Free Software Foundation, Inc., 51 Franklin Street, 
 * Fifth Floor, Boston, MA 02110-1301  USA
 *******************************************************************************/
package edu.cmu.cs.lti.discoursedb.io.twitter.converter;

import java.util.ArrayList;
import java.util.List;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.Assert;

import edu.cmu.cs.lti.discoursedb.core.model.annotation.AnnotationInstance;
import edu.cmu.cs.lti.discoursedb.core.model.macro.Content;
import edu.cmu.cs.lti.discoursedb.core.model.macro.Contribution;
import edu.cmu.cs.lti.discoursedb.core.model.macro.Discourse;
import edu.cmu.cs.lti.discoursedb.core.model.system.DataSourceInstance;
import edu.cmu.cs.lti.discoursedb.core.model.user.User;
import edu.cmu.cs.lti.discoursedb.core.service.annotation.AnnotationService;
import edu.cmu.cs.lti.discoursedb.core.service.macro.ContentService;
import edu.cmu.cs.lti.discoursedb.core.service.macro.ContributionService;
import edu.cmu.cs.lti.discoursedb.core.service.macro.DiscoursePartService;
import edu.cmu.cs.lti.discoursedb.core.service.macro.DiscourseService;
import edu.cmu.cs.lti.discoursedb.core.service.system.DataSourceService;
import edu.cmu.cs.lti.discoursedb.core.service.user.UserService;
import edu.cmu.cs.lti.discoursedb.core.type.ContributionTypes;
import edu.cmu.cs.lti.discoursedb.io.twitter.model.PemsStationMetaData;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import lombok.extern.log4j.Log4j;
import twitter4j.GeoLocation;
import twitter4j.HashtagEntity;
import twitter4j.MediaEntity;
import twitter4j.Paging;
import twitter4j.Place;
import twitter4j.Status;
import twitter4j.Twitter;
import twitter4j.TwitterException;
import twitter4j.TwitterFactory;

/**
 * Service for mapping data retrieved from the Twitter4j API to DiscourseDB
 * 
 * @author Oliver Ferschke
 *
 */
@Log4j
@Service
@Transactional(propagation = Propagation.REQUIRED, readOnly = false)
@RequiredArgsConstructor(onConstructor = @__(@Autowired))
public class TwitterConverterService {

    private final @NonNull DataSourceService dataSourceService;
    private final @NonNull UserService userService;
    private final @NonNull ContentService contentService;
    private final @NonNull ContributionService contributionService;
    private final @NonNull DiscoursePartService discoursepartService;
    private final @NonNull DiscourseService discourseService;
    private final @NonNull AnnotationService annoService;

    /**
     * Maps a Tweet represented as a Twitter4J Status object to DiscourseDB
     * 
     * @param discourseName the name of the discourse the tweet belongs to
     * @param datasetName the dataset identifier
     * @param tweet the Tweet to store in DiscourseDB
     * @param pemsMetaData optional PEMS station meta data associated with the tweet (may be null; mapping is not implemented yet)
     */
    public void mapTweet(String discourseName, String datasetName, Status tweet, PemsStationMetaData pemsMetaData) {
        if (tweet == null) {
            return;
        }

        Assert.hasText(discourseName, "The discourse name has to be specified and cannot be empty.");
        Assert.hasText(datasetName, "The dataset name has to be specified and cannot be empty.");

        if (dataSourceService.dataSourceExists(String.valueOf(tweet.getId()), TweetSourceMapping.ID_TO_CONTRIBUTION,
                datasetName)) {
            log.trace("Tweet with id " + tweet.getId() + " already exists in database. Skipping");
            return;
        }
        log.trace("Mapping Tweet " + tweet.getId());

        Discourse discourse = discourseService.createOrGetDiscourse(discourseName);

        twitter4j.User tUser = tweet.getUser();
        //Resolve the DiscourseDB user up front so the content author can be set further below;
        //profile details and the twitter_user_info annotation are only added the first time the user is seen.
        boolean newUser = !userService.findUserByDiscourseAndUsername(discourse, tUser.getScreenName()).isPresent();
        User user = userService.createOrGetUser(discourse, tUser.getScreenName());
        if (newUser) {
            user.setRealname(tUser.getName());
            user.setEmail(tUser.getEmail());
            user.setLocation(tUser.getLocation());
            user.setLanguage(tUser.getLang());
            user.setStartTime(tUser.getCreatedAt());

            AnnotationInstance userInfo = annoService.createTypedAnnotation("twitter_user_info");
            annoService.addFeature(userInfo,
                    annoService.createTypedFeature(String.valueOf(tUser.getFavouritesCount()), "favorites_count"));
            annoService.addFeature(userInfo,
                    annoService.createTypedFeature(String.valueOf(tUser.getFollowersCount()), "followers_count"));
            annoService.addFeature(userInfo,
                    annoService.createTypedFeature(String.valueOf(tUser.getFriendsCount()), "friends_count"));
            annoService.addFeature(userInfo,
                    annoService.createTypedFeature(String.valueOf(tUser.getStatusesCount()), "statuses_count"));
            annoService.addFeature(userInfo,
                    annoService.createTypedFeature(String.valueOf(tUser.getListedCount()), "listed_count"));
            if (tUser.getDescription() != null) {
                annoService.addFeature(userInfo,
                        annoService.createTypedFeature(String.valueOf(tUser.getDescription()), "description"));
            }
            annoService.addAnnotation(user, userInfo);
        }

        Contribution curContrib = contributionService.createTypedContribution(ContributionTypes.TWEET);
        DataSourceInstance contribSource = dataSourceService.createIfNotExists(new DataSourceInstance(
                String.valueOf(tweet.getId()), TweetSourceMapping.ID_TO_CONTRIBUTION, datasetName));
        curContrib.setStartTime(tweet.getCreatedAt());
        dataSourceService.addSource(curContrib, contribSource);

        AnnotationInstance tweetInfo = annoService.createTypedAnnotation("twitter_tweet_info");
        if (tweet.getSource() != null) {
            annoService.addFeature(tweetInfo, annoService.createTypedFeature(tweet.getSource(), "tweet_source"));
        }

        annoService.addFeature(tweetInfo,
                annoService.createTypedFeature(String.valueOf(tweet.getFavoriteCount()), "favorites_count"));

        if (tweet.getHashtagEntities() != null) {
            for (HashtagEntity hashtag : tweet.getHashtagEntities()) {
                annoService.addFeature(tweetInfo, annoService.createTypedFeature(hashtag.getText(), "hashtag"));
            }
        }

        if (tweet.getMediaEntities() != null) {
            for (MediaEntity media : tweet.getMediaEntities()) {
                //NOTE: additional info is available for MediaEntities
                annoService.addFeature(tweetInfo, annoService.createTypedFeature(media.getMediaURL(), "media_url"));
            }
        }

        //TODO this should be represented as a relation if the related tweet is part of the dataset
        if (tweet.getInReplyToStatusId() > 0) {
            annoService.addFeature(tweetInfo, annoService
                    .createTypedFeature(String.valueOf(tweet.getInReplyToStatusId()), "in_reply_to_status_id"));
        }

        //TODO this should be represented as a relation if the related tweet is part of the dataset
        if (tweet.getInReplyToScreenName() != null) {
            annoService.addFeature(tweetInfo,
                    annoService.createTypedFeature(tweet.getInReplyToScreenName(), "in_reply_to_screen_name"));
        }
        annoService.addAnnotation(curContrib, tweetInfo);

        GeoLocation geo = tweet.getGeoLocation();
        if (geo != null) {
            AnnotationInstance coord = annoService.createTypedAnnotation("twitter_tweet_geo_location");
            annoService.addFeature(coord,
                    annoService.createTypedFeature(String.valueOf(geo.getLongitude()), "long"));
            annoService.addFeature(coord, annoService.createTypedFeature(String.valueOf(geo.getLatitude()), "lat"));
            annoService.addAnnotation(curContrib, coord);
        }

        Place place = tweet.getPlace();
        if (place != null) {
            AnnotationInstance placeAnno = annoService.createTypedAnnotation("twitter_tweet_place");
            annoService.addFeature(placeAnno,
                    annoService.createTypedFeature(String.valueOf(place.getPlaceType()), "place_type"));
            if (place.getGeometryType() != null) {
                annoService.addFeature(placeAnno,
                        annoService.createTypedFeature(String.valueOf(place.getGeometryType()), "geo_type"));
            }
            annoService.addFeature(placeAnno, annoService
                    .createTypedFeature(String.valueOf(place.getBoundingBoxType()), "bounding_box_type"));
            annoService.addFeature(placeAnno,
                    annoService.createTypedFeature(String.valueOf(place.getFullName()), "place_name"));
            if (place.getStreetAddress() != null) {
                annoService.addFeature(placeAnno,
                        annoService.createTypedFeature(String.valueOf(place.getStreetAddress()), "street_address"));
            }
            annoService.addFeature(placeAnno,
                    annoService.createTypedFeature(String.valueOf(place.getCountry()), "country"));
            if (place.getBoundingBoxCoordinates() != null) {
                annoService.addFeature(placeAnno, annoService.createTypedFeature(
                        convertGeoLocationArray(place.getBoundingBoxCoordinates()), "bounding_box_lat_lon_array"));
            }
            if (place.getGeometryCoordinates() != null) {
                annoService.addFeature(placeAnno, annoService.createTypedFeature(
                        convertGeoLocationArray(place.getGeometryCoordinates()), "geometry_lat_lon_array"));
            }
            annoService.addAnnotation(curContrib, placeAnno);
        }

        Content curContent = contentService.createContent();
        curContent.setText(tweet.getText());
        curContent.setAuthor(user);
        curContent.setStartTime(tweet.getCreatedAt());
        curContrib.setCurrentRevision(curContent);
        curContrib.setFirstRevision(curContent);

        DataSourceInstance contentSource = dataSourceService.createIfNotExists(new DataSourceInstance(
                String.valueOf(tweet.getId()), TweetSourceMapping.ID_TO_CONTENT, datasetName));
        dataSourceService.addSource(curContent, contentSource);

        if (pemsMetaData != null) {
            log.warn("PEMS station meta data mapping not implemented yet");
            //TODO map pems meta data if available         
        }
    }
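
    //The net effect of mapTweet(..): one Contribution of type TWEET with a matching Content
    //revision (text, author, creation time), both registered under the tweet id as data source,
    //plus twitter_user_info, twitter_tweet_info and, where available, geo location and place annotations.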

    /**
     * Converts a 2D array of GeoLocation objects into a comma-separated String of "[latitude,longitude]" pairs.
     * 
     * @param location a 2D array of geolocations representing a point or polygon type bounding box
     * @return a String representation of the 2D GeoLocation array
     */
    private String convertGeoLocationArray(GeoLocation[][] location) {
        StringBuilder str = new StringBuilder();
        for (int row = 0; row < location.length; row++) {
            for (int col = 0; col < location[row].length; col++) {
                if (str.length() > 0) {
                    str.append(",");
                }
                str.append("[").append(location[row][col].getLatitude()).append(",")
                        .append(location[row][col].getLongitude()).append("]");
            }
        }
        return str.toString();
    }
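
    //Illustrative example (not in the original source): for a rectangular bounding box,
    //place.getBoundingBoxCoordinates() yields one row of four GeoLocations, which the method
    //above turns into a single String of the form "[lat1,lon1],[lat2,lon2],[lat3,lon3],[lat4,lon4]".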

    /**
     * For each user in the provided list (e.g. the users in the MongoDB dataset), imports that user's
     * whole timeline (Twitter API limit: the latest 3,200 tweets).
     * 
     * @param users the Twitter screen names of the users whose timelines should be imported
     * @param discourseName the name of the discourse the tweets belong to
     * @param datasetName the dataset identifier
     */
    public void importUserTimelines(List<String> users, String discourseName, String datasetName) {
        Twitter twitter = TwitterFactory.getSingleton();

        log.info("Importing timelines for " + users.size() + " users into DiscourseDB");

        for (String screenname : users) {
            log.info("Retrieving timeline for user " + screenname);
            List<Status> tweets = new ArrayList<>();

            //The API limits timeline retrieval to the latest 3,200 tweets, with at most 200 tweets per request (page).
            //This makes 16 requests with 200 tweets per page (pages 1 to 16).
            //This also works if the user has fewer than 3,200 tweets.
            for (int i = 1; i < 17; i++) {
                try {
                    tweets.addAll(twitter.getUserTimeline(screenname, new Paging(i, 200)));
                } catch (TwitterException e) {
                    log.error("Error retrieving timeline for user " + screenname, e);
                }
            }

            log.info("Retrieved timeline (" + tweets.size() + " Tweets) for user " + screenname);
            log.info("Mapping tweets for user " + screenname);
            for (Status tweet : tweets) {
                log.info("Mapping tweet " + tweet.getId());
                mapTweet(discourseName, datasetName, tweet, null);
            }
        }
    }
}
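
Example usage

The snippet below is a minimal usage sketch, not part of the original source. It assumes a DiscourseDB Spring Boot context in which TwitterConverterService is a registered bean, and Twitter credentials supplied the way twitter4j expects for TwitterFactory.getSingleton(), e.g. via a twitter4j.properties file on the classpath. The class name and the discourse, dataset, and screen name values are placeholders for illustration.

import java.util.Arrays;
import java.util.List;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;

import edu.cmu.cs.lti.discoursedb.io.twitter.converter.TwitterConverterService;

//Hypothetical launcher component; not part of DiscourseDB itself.
@Component
public class TwitterConverterExample implements CommandLineRunner {

    @Autowired
    private TwitterConverterService converterService;

    @Override
    public void run(String... args) throws Exception {
        //Screen names whose timelines should be imported (placeholder values).
        List<String> users = Arrays.asList("example_user_1", "example_user_2");

        //Retrieves up to 3,200 of the most recent tweets per user and maps each
        //one to a DiscourseDB Contribution via mapTweet(..).
        converterService.importUserTimelines(users, "twitter_discourse", "twitter_dataset");
    }
}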