Java tutorial: ingesting RSS/Atom feeds into HBase with ROME
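The listing below is reconcile.hbase.ingest.rss.ImportRSS from LLNL's RECONCILE project. It fetches an RSS or Atom feed with the ROME library (SyndFeedInput/SyndFeedOutput), slices the re-serialized feed XML into per-entry fragments, and writes each entry, plus each of its content blocks, as a row in an HBase table through the project's DocSchema helper. Row keys are SHA-1 digests of the raw entry XML, so re-ingesting the same content overwrites rows rather than duplicating them; passing -save=false skips HBase and echoes the would-be rows to stdout instead.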
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National
 * Laboratory. Written by Teresa Cottom, cottom1@llnl.gov CODE-400187 All rights reserved. This file is part of
 * RECONCILE
 *
 * This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public
 * License (as published by the Free Software Foundation) version 2, dated June 1991. This program is distributed in
 * the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the terms and conditions of the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA For full text see license.txt
 */
package reconcile.hbase.ingest.rss;

import reconcile.hbase.mapreduce.JobConfig;
import reconcile.hbase.table.DocSchema;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.TreeSet;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

import com.sun.syndication.feed.synd.*;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.SyndFeedOutput;
import com.sun.syndication.io.XmlReader;

public class ImportRSS {

  public static final String URL_ARG = "-url=";
  public static final String SAVE_ARG = "-save=";
  public static final String TAGS_ARG = "-tags=";

  public static final Log LOG = LogFactory.getLog(ImportRSS.class);

  public static void main(String[] args)
      throws IOException, IllegalArgumentException, FeedException
  {
    if (args.length == 0) {
      String[] myargs = {
          "-save=false",
          //URL_ARG+"https://publicaffairs.llnl.gov/news/rssfeed.xml",
          //URL_ARG+"http://www.paperoftheweek.com/feed/atom/",
          URL_ARG + "http://www.googlelabs.com/rss",
          //URL_ARG+"http://www.cloudera.com/blog/feed/",
          //URL_ARG+"http://rss.cnn.com/rss/money_latest.rss",
          JobConfig.SOURCE_ARG + "ignored" };
      args = myargs;
    }
    ImportRSS importer = new ImportRSS(args);
    importer.ingest();
  }

  private String urlName;
  private String tags;
  private String source;
  private boolean save;

  public ImportRSS(String[] args) throws IOException {
    // Parse arguments
    JobConfig config = new JobConfig(args);

    source = config.getSource();
    if (source == null)
      throw new RuntimeException("Must specify source name for RSS/Atom feed via command-line "
          + JobConfig.SOURCE_ARG + "<source>");

    urlName = config.getFirstArg(URL_ARG);
    if (urlName == null)
      throw new RuntimeException("Must specify url for RSS/Atom feed via command-line " + URL_ARG + "<url>");

    tags = config.getFirstArg(TAGS_ARG);

    save = true;
    String value = config.getFirstArg(SAVE_ARG);
    if (value != null) {
      save = Boolean.parseBoolean(value);
    }
  }

  public long ingest()
      throws MalformedURLException, IOException, IllegalArgumentException, FeedException
  {
    DocSchema docTable = null;
    if (save) {
      docTable = new DocSchema(source);
    }

    LOG.info("Get RSS feed (" + urlName + ")");

    // Process feed XML as stream using ROME
    URL feedUrl = new URL(urlName);
    SyndFeedInput input = new SyndFeedInput();
    SyndFeed sf = input.build(new XmlReader(feedUrl));
    SyndFeedOutput output = new SyndFeedOutput();
    String content = output.outputString(sf);

    // Get the original XML for each item
    ArrayList<String> items = processAtomEntries(content);
    if (items.size() == 0) {
      items = processRSSItems(content);
    }

    // Verify that XML item count equals feed entry count
    if (items.size() != sf.getEntries().size())
      throw new IOException("XML entries length does not match SyndFeed entries length");

    long numSaved = 0;
    for (int i = 0; i < items.size(); ++i) {
      try {
        SyndEntry entry = (SyndEntry) sf.getEntries().get(i);
        String entryData = items.get(i);
        saveEntry(sf, entry, entryData, docTable);
        ++numSaved;
      }
      catch (IOException e) {
        LOG.error("Unable to save entry (" + i + ") as received exception. " + e.getMessage());
      }
    }

    // Flush table writes
    if (docTable != null) {
      docTable.flushCommits();
      docTable.close();
    }
    return numSaved;
  }

  /**
   * Save the given SyndEntry into HBase as a row
   *
   * @param feed      feed the entry came from
   * @param entry     parsed ROME entry
   * @param entryData raw XML fragment for the entry
   * @param docTable  destination table, or null to echo to stdout
   * @throws IOException
   */
  private void saveEntry(SyndFeed feed, SyndEntry entry, String entryData, DocSchema docTable)
      throws IOException
  {
    // Row key is the SHA-1 digest of the raw entry XML
    String id = DigestUtils.shaHex(entryData.getBytes());
    LOG.debug("Saving entry (" + id + ") title(" + entry.getTitle() + ")");

    Put put = new Put(id.getBytes());
    DocSchema.add(put, DocSchema.srcCF, DocSchema.srcId, id);
    DocSchema.add(put, DocSchema.textCF, DocSchema.textOrig, entryData);

    // Save description
    if (entry.getDescription() != null) {
      String desc = entry.getDescription().getValue();
      DocSchema.add(put, DocSchema.metaCF, "description", desc);
    }

    meta(put, feed, entry);

    // Save separate rows for contents
    int numContent = 0;
    for (Object obj : entry.getContents()) {
      SyndContent cnt = (SyndContent) obj;
      if (saveEntryContent(cnt, "content", feed, entry, id, docTable))
        ++numContent;
    }

    // Entry had no real content, store description as a content row
    if (numContent == 0 && entry.getDescription() != null) {
      saveEntryContent(entry.getDescription(), "desc", feed, entry, id, docTable);
    }

    if (docTable != null)
      docTable.put(put);
    else
      echo(put);
  }

  private void add(Put put, String qualifier, Collection<String> vals) {
    if (vals.size() == 0) return;
    String[] values = vals.toArray(new String[vals.size()]);
    String output = Arrays.toString(values);
    // Strip the surrounding '[' and ']' added by Arrays.toString()
    DocSchema.add(put, DocSchema.metaCF, qualifier, output.substring(1, output.length() - 1));
  }

  /**
   * Add feed and entry fields as meta data to put
   *
   * @param put   row being assembled
   * @param feed  feed-level metadata source
   * @param entry entry-level metadata source
   */
  private void meta(Put put, SyndFeed feed, SyndEntry entry) {
    DocSchema.add(put, DocSchema.srcCF, DocSchema.srcName, source);
    DocSchema.add(put, DocSchema.srcCF, "tags", tags);

    // Add feed meta data
    DocSchema.add(put, DocSchema.metaCF, "feed_url", feed.getUri());
    DocSchema.add(put, DocSchema.metaCF, "feed_link", feed.getLink());
    DocSchema.add(put, DocSchema.metaCF, "feed_author", feed.getAuthor());
    DocSchema.add(put, DocSchema.metaCF, "feed_copyright", feed.getCopyright());
    DocSchema.add(put, DocSchema.metaCF, "feed_description", feed.getDescription());
    DocSchema.add(put, DocSchema.metaCF, "feed_language", feed.getLanguage());
    DocSchema.add(put, DocSchema.metaCF, "feed_type", feed.getFeedType());
    DocSchema.add(put, DocSchema.metaCF, "feed_title", feed.getTitle());

    // Add entry meta data
    DocSchema.add(put, DocSchema.metaCF, "title", entry.getTitle());
    DocSchema.add(put, DocSchema.metaCF, "url", entry.getUri());
    DocSchema.add(put, DocSchema.metaCF, "updated_date", DocSchema.formatSolrDate(entry.getUpdatedDate()));
    DocSchema.add(put, DocSchema.metaCF, "published_date", DocSchema.formatSolrDate(entry.getPublishedDate()));

    // Save authors
    TreeSet<String> values = new TreeSet<String>();
    if (entry.getAuthor() != null)
      values.add(entry.getAuthor());
    if (entry.getAuthors() != null)
      for (Object obj : entry.getAuthors()) {
        SyndPerson author = (SyndPerson) obj;
        values.add(author.getName());
      }
    add(put, "authors", values);

    // Save links
    values.clear();
    if (entry.getLink() != null)
      values.add(entry.getLink());
    if (entry.getLinks() != null)
      for (Object obj : entry.getLinks()) {
        SyndLink link = (SyndLink) obj;
        values.add(link.getHref());
      }
    add(put, "links", values);

    // Save categories
    values.clear();
    if (entry.getCategories() != null)
      for (Object obj : entry.getCategories()) {
        SyndCategory cat = (SyndCategory) obj;
        values.add(cat.getName());
      }
    add(put, "categories", values);

    DocSchema.addIngestDate(put);
  }

  /**
   * Save the given SyndContent as a separate row in HBase with a link back to the SyndEntry row
   *
   * @param content  content block to store
   * @param tagType  "content" or "desc", recorded as entry_tag_type
   * @param feed     feed the entry came from
   * @param entry    entry the content belongs to
   * @param entryKey row key of the parent entry
   * @param docTable destination table, or null to echo to stdout
   * @return true if the content was stored as a separate row
   * @throws IOException
   */
  private boolean saveEntryContent(SyndContent content, String tagType, SyndFeed feed, SyndEntry entry,
      String entryKey, DocSchema docTable)
      throws IOException
  {
    if (content.getValue() == null || content.getValue().length() == 0) {
      LOG.error("entry row(" + entryKey + ") SyndContent value is empty. Will not store as separate row.");
      return false;
    }

    String id = DigestUtils.shaHex(content.getValue().getBytes());
    Put put = new Put(id.getBytes());
    meta(put, feed, entry);
    DocSchema.add(put, DocSchema.srcCF, DocSchema.srcId, id);
    DocSchema.add(put, DocSchema.textCF, DocSchema.textType, content.getType());
    DocSchema.add(put, DocSchema.textCF, DocSchema.textOrig, content.getValue());
    DocSchema.add(put, DocSchema.metaCF, "entry_tag_type", tagType);
    DocSchema.add(put, DocSchema.metaCF, "entry_key", entryKey);

    if (docTable != null)
      docTable.put(put);
    else
      echo(put);
    return true;
  }

  private ArrayList<String> processAtomEntries(String content) {
    ArrayList<String> entries = process(content, "<entry", "</entry>");
    LOG.info("There were (" + entries.size() + ") atom entries found");
    return entries;
  }

  private ArrayList<String> processRSSItems(String content) {
    ArrayList<String> items = process(content, "<item", "</item>");
    LOG.info("There were (" + items.size() + ") rss items found");
    return items;
  }

  private ArrayList<String> process(String content, String begin, String end) {
    ArrayList<String> items = new ArrayList<String>();
    String caseInsensitive = content.toLowerCase();
    int ndx = -1;
    int endNdx = 0;
    while ((ndx = caseInsensitive.indexOf(begin, endNdx)) >= 0) {
      endNdx = caseInsensitive.indexOf(end, ndx + 1);
      if (endNdx < 0) {
        // No closing tag for this opening tag; stop rather than re-scan the same
        // position forever (indexOf treats a negative fromIndex as 0)
        break;
      }
      // Found entry: keep the original-case XML fragment, tags included
      endNdx += end.length();
      String item = content.substring(ndx, endNdx);
      items.add(item);
    }
    return items;
  }

  private void echo(Put put) {
    System.out.println("Key(" + Bytes.toString(put.getRow())
        + ")================================================");
    for (byte[] family : put.getFamilyMap().keySet()) {
      String familyCF = Bytes.toString(family);
      for (KeyValue kv : put.getFamilyMap().get(family)) {
        System.out.println("col(" + familyCF + ":" + Bytes.toString(kv.getQualifier()) + ") value("
            + Bytes.toString(kv.getValue()) + ")");
      }
    }
  }
}
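For reference, here is a minimal sketch of invoking the importer programmatically rather than through main(). The feed URL and source name are placeholders, not real endpoints, and the snippet assumes the enclosing method declares throws IOException, FeedException. With -save=false the rows are echoed to stdout instead of being written to HBase:

    // Hypothetical invocation; "http://example.com/feed.xml" and
    // "example_source" are placeholder values.
    String[] args = {
        ImportRSS.SAVE_ARG + "false",                       // echo rows instead of writing to HBase
        ImportRSS.URL_ARG + "http://example.com/feed.xml",  // RSS/Atom feed to ingest
        JobConfig.SOURCE_ARG + "example_source"             // source name (and HBase doc table)
    };
    ImportRSS importer = new ImportRSS(args);
    long numSaved = importer.ingest();
    System.out.println("Saved " + numSaved + " feed entries");

Because row keys are content digests, running the same ingest twice produces identical Puts, so the second run rewrites the same rows instead of creating duplicates.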