gate.corpora.JSONTweetFormat.java Source code

Java tutorial

Introduction

Here is the source code for gate.corpora.JSONTweetFormat.java

Source

/*
 *  Copyright (c) 1995-2014, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *  
 *  $Id: JSONTweetFormat.java 17968 2014-05-11 16:37:34Z ian_roberts $
 */
package gate.corpora;

import gate.AnnotationSet;
import gate.DocumentContent;
import gate.GateConstants;
import gate.Resource;
import gate.corpora.twitter.PreAnnotation;
import gate.corpora.twitter.Tweet;
import gate.corpora.twitter.TweetUtils;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.AutoInstance;
import gate.creole.metadata.CreoleResource;
import gate.util.DocumentFormatException;
import gate.util.InvalidOffsetException;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;

/**
 * Document format for handling JSON tweets: either one
 * object {...} or a list [{tweet...}, {tweet...}, ...].
 *
 * This format produces one GATE document from one JSON file.
 *
 * The format registers itself under the ad hoc MIME type
 * <code>text/x-json-twitter</code> and the file suffix <code>json</code>
 * when it is initialised, and removes those registrations on cleanup.
 */
@CreoleResource(name = "GATE JSON Tweet Document Format", isPrivate = true, autoinstances = {
        @AutoInstance(hidden = true) }, comment = "Format parser for Twitter JSON files", helpURL = "http://gate.ac.uk/userguide/sec:social:twitter:format")
public class JSONTweetFormat extends TextualDocumentFormat {
    private static final long serialVersionUID = 6878020036304333918L;

    /** Default construction */
    public JSONTweetFormat() {
        super();
    }

    /**
     * Initialise this resource, and return it.
     *
     * Registers this instance as the handler for the
     * <code>text/x-json-twitter</code> MIME type and associates the
     * <code>json</code> file suffix with that type.
     *
     * @return this format instance
     * @throws ResourceInstantiationException declared by the resource
     *         contract; not thrown by the code in this method
     */
    @Override
    public Resource init() throws ResourceInstantiationException {
        // Register ad hoc MIME-type
        // There is an application/json mime type, but I don't think
        // we want everything to be handled this way?
        MimeType mime = new MimeType("text", "x-json-twitter");
        String mimeKey = mime.getType() + "/" + mime.getSubtype();
        // Register the class handler for this MIME-type
        mimeString2ClassHandlerMap.put(mimeKey, this);
        // Register the mime type with string
        mimeString2mimeTypeMap.put(mimeKey, mime);
        // Register file suffixes for this mime type
        suffixes2mimeTypeMap.put("json", mime);
        // No magic numbers are registered for this mime type
        // Set the mimeType for this language resource
        setMimeType(mime);
        return this;
    }

    /**
     * Undoes the registrations made by {@link #init()}: removes the MIME
     * type string from both MIME maps and removes the <code>json</code>
     * suffix mapping.
     */
    @Override
    public void cleanup() {
        super.cleanup();

        MimeType mime = getMimeType();
        if (mime == null) {
            // init() sets the MIME type; with none set there is nothing
            // registered by this instance to remove.
            return;
        }

        String mimeKey = mime.getType() + "/" + mime.getSubtype();
        mimeString2ClassHandlerMap.remove(mimeKey);
        mimeString2mimeTypeMap.remove(mimeKey);
        suffixes2mimeTypeMap.remove("json");
    }

    /**
     * Parses the document's current content as JSON tweets and rewrites the
     * document in place.
     *
     * The new content is the concatenation of <code>getString()</code> of
     * every tweet, each followed by two newlines. Every
     * {@link PreAnnotation} of a tweet is then added to the
     * "Original markups" annotation set, shifted by the offset at which that
     * tweet starts in the new content.
     *
     * @param doc the document whose content holds the JSON to unpack
     * @throws DocumentFormatException if <code>doc</code> or its content is
     *         null, or if reading the tweets or creating an annotation fails
     */
    @Override
    public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
        // Only the content is read below (never the source URL), so a
        // missing content must be rejected on its own; otherwise a document
        // with a URL but no content would fail with a NullPointerException.
        if ((doc == null) || (doc.getContent() == null)) {
            throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
        }

        setNewLineProperty(doc);
        String jsonString = StringUtils.trimToEmpty(doc.getContent().toString());
        try {
            // Parse the String
            List<Tweet> tweets = TweetUtils.readTweets(jsonString);

            // Start offset of each tweet in the new content, indexed by the
            // tweet's position in the list. Keeping offsets by position
            // (rather than in a map keyed on Tweet) means the result does
            // not depend on how Tweet defines equals/hashCode.
            long[] tweetStarts = new long[tweets.size()];

            // Put them all together to make the unpacked document content
            StringBuilder concatenation = new StringBuilder();
            int position = 0;
            for (Tweet tweet : tweets) {
                tweetStarts[position++] = concatenation.length();
                concatenation.append(tweet.getString()).append("\n\n");
            }

            // Set new document content
            DocumentContent newContent = new DocumentContentImpl(concatenation.toString());
            doc.edit(0L, doc.getContent().size(), newContent);

            AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
            // Create Original markups annotations for each tweet
            position = 0;
            for (Tweet tweet : tweets) {
                long tweetStart = tweetStarts[position++];
                for (PreAnnotation preAnn : tweet.getAnnotations()) {
                    preAnn.toAnnotation(originalMarkups, tweetStart);
                }
            }
        } catch (InvalidOffsetException e) {
            throw new DocumentFormatException(e);
        } catch (IOException e) {
            throw new DocumentFormatException(e);
        }
    }

}