org.trec.liveqa.GetYAnswersPropertiesFromQid.java Source code

Java tutorial

Introduction

Here is the source code for org.trec.liveqa.GetYAnswersPropertiesFromQid.java

Source

package org.trec.liveqa;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.json.simple.JSONValue;
import org.json.simple.JSONObject;

/**
 * Copyright 2015 Yahoo Inc.<br>
 * Licensed under the terms of the MIT license. Please see LICENSE file at the root of this project for terms.
 * <p>
 * 
 * This class scrapes Yahoo Answers pages by a list of QIDs, obtaining certain properties.
 * 
 * @author yuvalp@yahoo-inc.com
 * 
 */
public class GetYAnswersPropertiesFromQid {

    /**
     * Strategy for locating one property inside a parsed page: decides whether an HTML element is
     * the one carrying the property and extracts the property's text from it.
     */
    public interface ElementPredicate {

        /**
         * Tests whether the given HTML element matches the Yahoo Answers tag pattern of the desired
         * property.
         *
         * @param e the element to inspect
         * @return {@code true} if {@code e} matches the pattern, {@code false} otherwise
         */
        boolean check(Element e);

        /**
         * Reads the property value out of an element.
         *
         * @param e the element to read from
         * @return the property text taken from {@code e}
         */
        String getText(Element e);

    }

    /**
     * Matches a {@code meta} element whose {@code name} attribute is {@code "title"}; the property
     * text is that element's {@code content} attribute.
     */
    public static class CheckTitle implements ElementPredicate {

        @Override
        public boolean check(Element e) {
            // Only <meta> tags are candidates; anything else is rejected before looking at attributes.
            if (!e.nodeName().equals("meta")) {
                return false;
            }
            return e.attr("name").equals("title");
        }

        @Override
        public String getText(Element e) {
            final String content = e.attr("content");
            return content;
        }

    }

    /**
     * Matches a {@code meta} element whose {@code name} attribute is {@code "description"}; the
     * property text is that element's {@code content} attribute.
     */
    public static class CheckBody implements ElementPredicate {

        @Override
        public boolean check(Element e) {
            // Only <meta> tags are candidates; anything else is rejected before looking at attributes.
            if (!e.nodeName().equals("meta")) {
                return false;
            }
            return e.attr("name").equals("description");
        }

        @Override
        public String getText(Element e) {
            final String content = e.attr("content");
            return content;
        }

    }

    /**
     * Matches a {@code meta} element whose {@code name} attribute is {@code "keywords"}; the
     * property text is that element's {@code content} attribute.
     */
    public static class CheckKeywords implements ElementPredicate {

        @Override
        public boolean check(Element e) {
            // Only <meta> tags are candidates; anything else is rejected before looking at attributes.
            if (!e.nodeName().equals("meta")) {
                return false;
            }
            return e.attr("name").equals("keywords");
        }

        @Override
        public String getText(Element e) {
            final String content = e.attr("content");
            return content;
        }

    }

    /**
     * Matches a {@code span} element whose class attribute contains the substring
     * {@code "ya-q-full-text"}; the property text is the element's text content.
     */
    public static class CheckBestAnswer implements ElementPredicate {

        @Override
        public boolean check(Element e) {
            // Only <span> tags are candidates.
            if (!e.nodeName().equals("span")) {
                return false;
            }
            // Substring match on the raw class attribute value.
            return e.className().contains("ya-q-full-text");
        }

        @Override
        public String getText(Element e) {
            final String text = e.text();
            return text;
        }

    }

    /**
     * Matches an {@code a} element whose class attribute contains the substring {@code "Clr-b"};
     * the property text is the element's text content.
     */
    public static class CheckTopLevelCategory implements ElementPredicate {

        @Override
        public boolean check(Element e) {
            // Only anchor tags are candidates.
            if (!e.nodeName().equals("a")) {
                return false;
            }
            // Substring match on the raw class attribute value.
            return e.className().contains("Clr-b");
        }

        @Override
        public String getText(Element e) {
            final String text = e.text();
            return text;
        }

    }

    // ---------------------------------------------------

    /** Question page URL; the QID is appended to form the full address. */
    private static final String URL_PREFIX = "https://answers.yahoo.com/question/index?qid=";

    /**
     * Format of the timestamp parsed from the first 14 characters of a QID.
     * Note that {@link SimpleDateFormat} is not synchronized, so this shared instance must not be
     * used from several threads at once without external locking.
     */
    private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US);

    // Stateless predicate singletons, one per scraped property. They are never reassigned,
    // so they are declared final.
    private static final CheckTitle ct = new CheckTitle();
    private static final CheckBody cb = new CheckBody();
    private static final CheckKeywords ck = new CheckKeywords();
    private static final CheckTopLevelCategory cc = new CheckTopLevelCategory();
    private static final CheckBestAnswer cba = new CheckBestAnswer();

    /**
     * Command-line entry point: reads a list of QIDs, scrapes each question page and writes one
     * JSON object per line to the output file.
     *
     * @param args exactly two arguments: the plain-text QID list file and the output file
     * @throws Exception if the input cannot be read, the output cannot be written, or a QID
     *         cannot be processed by {@link #extractData(String)}
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.println("Usage: GetYAnswersPropertiesFromQid <plaintext-list-of-qids> <out-file>");
            return;
        }

        // Read the input before opening the output, so that a bad input path does not
        // create or truncate the output file.
        String[] qids = getQids(args[0]);

        // try-with-resources guarantees the writer is flushed and closed even when
        // scraping one of the QIDs throws.
        try (BufferedWriter writer = getWriter(args[1])) {
            for (String qid : qids) {
                System.out.println("Getting data for QID " + qid);
                Map<String, String> data = extractData(qid);
                JSONObject jo = new JSONObject();
                for (Map.Entry<String, String> e : data.entrySet()) {
                    // Store the raw value: JSONObject escapes strings itself when serialized, so
                    // pre-escaping here would double-escape quotes, backslashes and slashes.
                    jo.put(e.getKey(), e.getValue());
                }

                writer.append(jo.toString());
                writer.newLine();
                // Flush per record so that completed lines survive a later failure.
                writer.flush();
            }
        }
    }

    /**
     * Reads question IDs from a plain-text file, one QID per line.
     * Surrounding whitespace is stripped and lines that are empty after stripping are skipped,
     * so a blank or trailing empty line is not treated as a QID.
     *
     * @param inFile path of the input file
     * @return the QIDs in file order; empty if the file contains none
     * @throws IOException if the file cannot be opened or read
     */
    private static String[] getQids(String inFile) throws IOException {
        List<String> qids = new ArrayList<>();
        // try-with-resources closes the reader even if readLine() fails part-way through.
        try (BufferedReader reader = new BufferedReader(new FileReader(new File(inFile)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String qid = line.trim();
                if (!qid.isEmpty()) {
                    qids.add(qid);
                }
            }
        }
        return qids.toArray(new String[qids.size()]);
    }

    /**
     * Opens the output file for writing, creating or truncating it.
     * The writer encodes text as UTF-8 regardless of the platform default charset, so the output
     * does not depend on the machine it is produced on.
     *
     * @param fileLocation path of the output file
     * @return a buffered UTF-8 writer over the file; the caller is responsible for closing it
     * @throws FileNotFoundException if the file cannot be created or opened for writing
     */
    private static BufferedWriter getWriter(String fileLocation) throws FileNotFoundException {
        FileOutputStream out = new FileOutputStream(new File(fileLocation));
        // Fully-qualified to avoid touching the file's import list.
        return new BufferedWriter(new OutputStreamWriter(out, java.nio.charset.StandardCharsets.UTF_8));
    }

    /**
     * Fetches the Yahoo Answers page of a question and extracts its properties.
     * <p>
     * The returned map always contains {@code "qid"} and {@code "Date"}; the date is parsed from
     * the first 14 characters of the QID using the pattern {@code yyyyMMddHHmmss}. When the page is
     * retrieved with HTTP status 200 the map additionally contains {@code "Top level Category"},
     * {@code "Title"}, {@code "Body"} and {@code "Keywords"} (each an empty string when no matching
     * element is found) and, if the page has a {@code div#ya-best-answer} element,
     * {@code "Best Answer"}. If the request fails (non-200 status, missing response body, protocol
     * or transport error) the failure is reported on {@code System.err} and only the entries
     * gathered so far are returned.
     * <p>
     * The date is parsed with a shared {@link java.text.SimpleDateFormat}, which is not
     * synchronized; concurrent callers need external locking.
     *
     * @param iQid question ID; must be at least 14 characters long and start with a timestamp
     * @return insertion-ordered map of property name to value, as described above
     * @throws Exception if the leading 14 characters of the QID cannot be parsed as a timestamp
     *         ({@link java.text.ParseException}); a QID shorter than 14 characters causes a
     *         {@link StringIndexOutOfBoundsException}
     */
    public static Map<String, String> extractData(String iQid) throws Exception {

        Map<String, String> res = new LinkedHashMap<>();
        res.put("qid", iQid);

        // parse date from qid
        res.put("Date", DATE_FORMAT.parse(iQid.substring(0, 14)).toString());

        // get and mine html page
        String url = URL_PREFIX + iQid;
        HttpClient client = new HttpClient();
        GetMethod method = new GetMethod(url);
        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler(3, false));
        try {
            int statusCode = client.executeMethod(method);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: " + method.getStatusLine());
                // Do not scrape an error page as if it were the question page; return what we
                // have, exactly as the transport-error paths below do.
                return res;
            }

            // try-with-resources closes the stream even when parsing throws; a null resource
            // is simply skipped on close.
            try (InputStream responseBody = method.getResponseBodyAsStream()) {
                if (responseBody == null) {
                    System.err.println("Method failed: no response body for " + url);
                    return res;
                }

                // strip top levels
                Document doc = Jsoup.parse(responseBody, "UTF8", url);
                Element html = doc.child(0);

                Element head = html.child(0);
                Element body = html.child(1);

                // get category
                res.put("Top level Category", findElementText(body, cc));

                // get title
                res.put("Title", findElementText(head, ct));

                // get body
                res.put("Body", findElementText(head, cb));

                // get keywords
                res.put("Keywords", findElementText(head, ck));

                // get best answer (only present on pages that have the best-answer container)
                Element bestAnswerDiv = html.select("div#ya-best-answer").first();
                if (bestAnswerDiv != null) {
                    res.put("Best Answer", findElementText(bestAnswerDiv, cba));
                }
            }

        } catch (HttpException e) {
            System.err.println("Fatal protocol violation: " + e.getMessage());
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Fatal transport error: " + e.getMessage());
            e.printStackTrace();
        } finally {
            // Always hand the connection back, including on the early returns above.
            method.releaseConnection();
        }

        return res;
    }

    /**
     * Searches the subtree rooted at {@code topElem} for the first element accepted by {@code f}
     * and returns the text {@code f} extracts from it.
     *
     * @param topElem root of the subtree to search
     * @param f predicate that selects the element and extracts its text
     * @return the extracted text, or an empty string when no element matches
     */
    private static String findElementText(Element topElem, ElementPredicate f) {
        Element match = findElement(topElem, f);
        if (match != null) {
            return f.getText(match);
        }
        return "";
    }

    /**
     * Depth-first, pre-order search: tests {@code e} itself first, then each child subtree in
     * order, and stops at the first element accepted by {@code f}.
     *
     * @param e root of the subtree to search
     * @param f predicate deciding whether an element is a match
     * @return the first matching element, or {@code null} if the subtree contains none
     */
    private static Element findElement(Element e, ElementPredicate f) {
        Element match = null;
        if (f.check(e)) {
            match = e;
        } else {
            for (Element child : e.children()) {
                match = findElement(child, f);
                if (match != null) {
                    break;
                }
            }
        }
        return match;
    }

}