edu.gslis.ts.DumpThriftData.java Source code

Introduction

Here is the source code for edu.gslis.ts.DumpThriftData.java
Source

package edu.gslis.ts;
/*******************************************************************************
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Options;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.transport.TIOStreamTransport;
import org.apache.thrift.transport.TTransport;
import org.apache.thrift.transport.TTransportException;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.tukaani.xz.XZInputStream;

import streamcorpus_v3.Sentence;
import streamcorpus_v3.StreamItem;
import streamcorpus_v3.Token;
import edu.gslis.docscoring.QueryDocScorer;
import edu.gslis.docscoring.ScorerDirichlet;
import edu.gslis.docscoring.support.CollectionStats;
import edu.gslis.docscoring.support.IndexBackedCollectionStats;
import edu.gslis.queries.GQuery;
import edu.gslis.searchhits.SearchHit;
import edu.gslis.textrepresentation.FeatureVector;

/**
 * Score documents in thrift files w.r.t. temporal summarization queries.
 */
public class DumpThriftData {

    public static void main(String[] args) {
        try {
            // Get the commandline options
            Options options = createOptions();
            CommandLineParser parser = new GnuParser();
            CommandLine cmd = parser.parse(options, args);

            String in = cmd.getOptionValue("i");
            String sentenceParser = cmd.getOptionValue("p");
            String query = cmd.getOptionValue("q");
            String externalCollection = cmd.getOptionValue("e");

            // Get background statistics
            CollectionStats bgstats = new IndexBackedCollectionStats();
            bgstats.setStatSource(externalCollection);

            // Set query
            GQuery gquery = new GQuery();
            gquery.setText(query);
            gquery.setFeatureVector(new FeatureVector(query, null));

            // Setup the filter
            DumpThriftData f = new DumpThriftData();

            if (in != null) {
                File infile = new File(in);
                if (infile.isDirectory()) {
                    for (File file : infile.listFiles()) {
                        if (file.isDirectory()) {
                            for (File filefile : file.listFiles()) {
                                System.err.println(filefile.getAbsolutePath());
                                f.filter(filefile, sentenceParser, gquery, bgstats);
                            }
                        } else {
                            System.err.println(file.getAbsolutePath());
                            f.filter(file, sentenceParser, gquery, bgstats);
                        }
                    }
                } else
                    System.err.println(infile.getAbsolutePath());
                f.filter(infile, sentenceParser, gquery, bgstats);
            }

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static Options createOptions() {
        Options options = new Options();
        options.addOption("i", true, "Input thrift file");
        options.addOption("p", true, "Parser (lingpipe or serif)");
        options.addOption("q", true, "Query");
        options.addOption("e", true, "External Collection");
        return options;
    }

    /**
     * @param thriftFile
     */
    public void filter(File infile, String parser, GQuery gquery, CollectionStats bgstats) {
        try {
            InputStream in = null;

            if (infile.getName().endsWith(".gz"))
                in = new GZIPInputStream(new FileInputStream(infile));
            else if (infile.getName().endsWith("xz"))
                in = new XZInputStream(new FileInputStream(infile));
            else {
                System.err.println("Regular FileInputStream");
                in = new FileInputStream(infile);
            }

            TTransport inTransport = new TIOStreamTransport(new BufferedInputStream(in));
            TBinaryProtocol inProtocol = new TBinaryProtocol(inTransport);
            inTransport.open();

            try {
                // Run through items in the thrift file
                while (true) {
                    final StreamItem item = new StreamItem();
                    item.read(inProtocol);
                    if (item.body == null || item.body.clean_visible == null) {
                        System.err.println("Body is null.");
                        continue;
                    }

                    String streamId = "";
                    if (item.stream_id != null) {
                        streamId = item.stream_id;
                    }

                    String dateTime = "";
                    long epochTime = 0;
                    if (item.stream_time != null && item.stream_time.zulu_timestamp != null) {
                        dateTime = item.stream_time.zulu_timestamp;
                        DateTimeFormatter dtf = ISODateTimeFormat.dateTime();
                        epochTime = dtf.parseMillis(dateTime);
                    }

                    String source = "";
                    if (item.source != null) {
                        source = item.source;
                    }

                    Map<String, List<Sentence>> parsers = item.body.sentences;
                    List<Sentence> sentenceParser = parsers.get(parser);

                    QueryDocScorer scorer = new ScorerDirichlet();
                    scorer.setCollectionStats(bgstats);
                    scorer.setQuery(gquery);

                    List<Double> sentenceScores = new ArrayList<Double>();
                    List<String> sentences = new ArrayList<String>();
                    String sentencesText = "";
                    if (sentenceParser != null && sentenceParser.size() > 0) {

                        for (Sentence s : sentenceParser) {
                            try {
                                List<Token> tokens = s.tokens;
                                String sentence = "";
                                for (Token token : tokens) {
                                    String tok = token.token;
                                    sentence += tok + " ";
                                }
                                FeatureVector sentenceVector = new FeatureVector(sentence, null);
                                SearchHit sentenceHit = new SearchHit();
                                sentenceHit.setFeatureVector(sentenceVector);
                                sentenceHit.setLength(sentenceVector.getLength());
                                double score = scorer.score(sentenceHit);

                                sentenceScores.add(score);
                                sentences.add(sentence);

                                sentencesText += sentence + "\n";
                            } catch (Exception e) {
                                System.err
                                        .println("Issue with sentence " + sentences.size() + " in doc " + streamId);
                                System.err.println("File: " + infile.getAbsolutePath());
                            }
                        }
                        SearchHit docHit = new SearchHit();
                        docHit.setFeatureVector(new FeatureVector(sentencesText, null));
                        double docscore = scorer.score(docHit);
                        for (int i = 0; i < sentenceScores.size(); i++) {
                            System.out.println(infile.getAbsolutePath() + "\t" + source + "\t" + epochTime + "\t"
                                    + streamId + "\t" + docscore + "\t" + i + "\t" + sentenceScores.get(i) + "\t"
                                    + sentences.get(i));
                        }
                    } else if (sentenceParser == null) {
                        System.err.println("Sentence parser null");
                    } else if (sentenceParser.size() == 0) {
                        System.err.println("Sentence length 0");
                    } else {
                        System.err.println("Other sentence error.");
                    }

                }
            } catch (TTransportException te) {
                if (te.getType() == TTransportException.END_OF_FILE) {
                } else {
                    throw te;
                }
            }

            inTransport.close();

        } catch (Exception e) {
            System.err.println("Error processing " + infile.getAbsolutePath() + " " + infile.getName());
            e.printStackTrace();
        }
    }
}