org.kitesdk.spring.hbase.example.service.WebPageSnapshotService.java Source code

Java tutorial

Introduction

Here is the source code for org.kitesdk.spring.hbase.example.service.WebPageSnapshotService.java

Source

/**
 * Copyright 2014 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.kitesdk.spring.hbase.example.service;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import java.io.IOException;
import java.security.PrivilegedAction;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;

import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Key;
import org.kitesdk.data.RandomAccessDataset;
import org.kitesdk.data.spi.DefaultConfiguration;
import org.kitesdk.spring.hbase.example.model.WebPageRedirectModel;
import org.kitesdk.spring.hbase.example.model.WebPageSnapshotModel;
import org.kitesdk.spring.hbase.example.model.frontend.WebPageSnapshotContent;
import org.kitesdk.spring.hbase.example.model.frontend.WebPageSnapshotMeta;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.convert.ConversionService;
import org.springframework.stereotype.Component;

/**
 * Service for WebPageSnapshot operations
 */
@Component
public class WebPageSnapshotService {

    // Class-wide SLF4J logger.
    private static final Logger LOG = LoggerFactory.getLogger(WebPageSnapshotService.class);

    // URI handed to Datasets.load(...) when opening the snapshot dataset; injected by Spring.
    @Autowired
    private String webPageSnapshotUri;

    // Cache of loaded snapshot datasets keyed by user name. Within this class it is only
    // read and written inside the synchronized webPageSnapshotModels(String) accessor.
    private final Map<String, RandomAccessDataset<WebPageSnapshotModel>> webPageSnapshotModelMap = Maps
            .newHashMap();

    // URI handed to Datasets.load(...) when opening the redirect dataset; injected by Spring.
    @Autowired
    private String webPageRedirectUri;

    // Cache of loaded redirect datasets keyed by user name. Within this class it is only
    // read and written inside the synchronized webPageRedirectModels(String) accessor.
    private final Map<String, RandomAccessDataset<WebPageRedirectModel>> webPageRedirectModelMap = Maps
            .newHashMap();

    // Converts stored WebPageSnapshotModel records into the frontend Meta/Content types.
    @Autowired
    private ConversionService conversionService;

    /**
     * Take a snapshot of an URL. This WebPageSnapshot is stored in HBase. Returns
     * the WebPageSnapshotMeta
     *
     * If the URL is a redirect, the snapshot is stored under the final URL
     * destination. A WebPageRedirectModel is stored in the redirect table so when
     * fetching snapshots, we can follow the proper redirect path. If the URL does
     * not redirect, any previously stored redirect entry for it is deleted.
     *
     * The fetch and all dataset writes run inside a proxy-user context created
     * for {@code user} on top of the current login user.
     *
     * @param url The URL to take a snapshot of
     * @param contentKey The key used to store the content
     * @param user The user taking a snapshot
     * @return The WebPageSnapshotMeta for the page that we snapshotted.
     * @throws IOException If the proxied action fails with an IOException, or if
     * the calling thread is interrupted while the action runs (the interrupt
     * flag is restored before the exception is thrown).
     */
    public WebPageSnapshotMeta takeSnapshot(final String url, final String contentKey, final String user)
            throws IOException {
        UserGroupInformation ugi = UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser());
        try {
            return ugi.doAs(new PrivilegedExceptionAction<WebPageSnapshotMeta>() {

                @Override
                public WebPageSnapshotMeta run() throws Exception {
                    WebPageSnapshotModel webPageSnapshotModel = fetchWebPage(url, contentKey);
                    // Resolve the per-user redirect dataset once instead of on every use.
                    RandomAccessDataset<WebPageRedirectModel> redirects = webPageRedirectModels(user);
                    if (!webPageSnapshotModel.getUrl().equals(url)) {
                        // Url is different, so must have redirected. Store the redirect model
                        WebPageRedirectModel redirectModel = WebPageRedirectModel.newBuilder().setUrl(url)
                                .setDestinationUrl(webPageSnapshotModel.getUrl()).build();
                        redirects.put(redirectModel);
                    } else {
                        // If redirect exists, remove it since this URL no longer redirects
                        Key key = new Key.Builder(redirects).add("url", url).build();
                        if (redirects.get(key) != null) {
                            redirects.delete(key);
                        }
                    }
                    webPageSnapshotModels(user).put(webPageSnapshotModel);
                    return conversionService.convert(webPageSnapshotModel, WebPageSnapshotMeta.class);
                }
            });
        } catch (InterruptedException ex) {
            // doAs never returned a value on this path, so there is nothing to hand back:
            // restore the interrupt flag for callers and surface the failure.
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted trying to save the snapshot", ex);
        }
    }

    /**
     * Look up the newest stored snapshot of a URL and return its metadata view.
     *
     * @param url The URL whose latest snapshot metadata is wanted.
     * @param user The user retrieving the snapshot
     * @return The converted WebPageSnapshotMeta, or null when no snapshot is
     * stored for this URL.
     * @throws IOException If the underlying snapshot lookup fails.
     */
    public WebPageSnapshotMeta getWebPageSnapshotMeta(final String url, final String user) throws IOException {
        final WebPageSnapshotModel latest = getMostRecentWebPageSnapshot(url, user);
        if (latest == null) {
            return null;
        }
        return conversionService.convert(latest, WebPageSnapshotMeta.class);
    }

    /**
     * Look up the snapshot of a URL taken at an exact timestamp and return its
     * metadata view.
     *
     * @param url The URL whose snapshot metadata is wanted.
     * @param ts The fetch timestamp identifying the snapshot.
     * @param user The user retrieving the snapshot
     * @return The converted WebPageSnapshotMeta, or null when no snapshot exists
     * for this URL at this timestamp.
     * @throws IOException If the underlying snapshot lookup fails.
     */
    public WebPageSnapshotMeta getWebPageSnapshotMeta(final String url, final long ts, final String user)
            throws IOException {
        final WebPageSnapshotModel found = getWebPageSnapshot(url, ts, user);
        return found == null ? null : conversionService.convert(found, WebPageSnapshotMeta.class);
    }

    /**
     * Collect the metadata view of every snapshot of a URL selected by the
     * "since" param.
     *
     * @param url The URL to get WebPageSnapshotMeta instances from
     * @param since The epoch timestamp
     * @param user The user retrieving the snapshots
     * @return The list of WebPageSnapshotMeta instances.
     * @throws IOException If the underlying snapshot lookup fails.
     */
    public List<WebPageSnapshotMeta> getWebPageSnapshotMetaSince(String url, long since, final String user)
            throws IOException {
        final List<WebPageSnapshotModel> models = getWebPageSnapshotsSince(url, since, user);
        return convertList(models, WebPageSnapshotMeta.class);
    }

    /**
     * Look up the newest stored snapshot of a URL and return its content view.
     *
     * @param url The URL whose latest snapshot content is wanted.
     * @param user The user retrieving the snapshot
     * @return The converted WebPageSnapshotContent, or null when no snapshot is
     * stored for this URL.
     * @throws IOException If the underlying snapshot lookup fails.
     */
    public WebPageSnapshotContent getWebPageSnapshotContent(String url, final String user) throws IOException {
        final WebPageSnapshotModel latest = getMostRecentWebPageSnapshot(url, user);
        if (latest == null) {
            return null;
        }
        return conversionService.convert(latest, WebPageSnapshotContent.class);
    }

    /**
     * Look up the snapshot of a URL taken at an exact timestamp and return its
     * content view.
     *
     * @param url The URL whose snapshot content is wanted.
     * @param ts The fetch timestamp identifying the snapshot.
     * @param user The user retrieving the snapshot
     * @return The converted WebPageSnapshotContent, or null when no snapshot
     * exists for this URL at this timestamp.
     * @throws IOException If the underlying snapshot lookup fails.
     */
    public WebPageSnapshotContent getWebPageSnapshotContent(String url, long ts, final String user)
            throws IOException {
        final WebPageSnapshotModel found = getWebPageSnapshot(url, ts, user);
        return found == null ? null : conversionService.convert(found, WebPageSnapshotContent.class);
    }

    /**
     * Collect the content view of every snapshot of a URL selected by the
     * "since" param.
     *
     * @param url The URL to get WebPageSnapshotContent instances from
     * @param since The epoch timestamp
     * @param user The user retrieving the snapshots
     * @return The list of WebPageSnapshotContent instances.
     * @throws IOException If the underlying snapshot lookup fails.
     */
    public List<WebPageSnapshotContent> getWebPageSnapshotContentSince(String url, long since, final String user)
            throws IOException {
        final List<WebPageSnapshotModel> models = getWebPageSnapshotsSince(url, since, user);
        return convertList(models, WebPageSnapshotContent.class);
    }

    /**
     * Get the epoch timestamps for every snapshot time of an URL in HBase.
     *
     * The URL is normalized first (a stored redirect is replaced by its
     * destination), then the scan runs inside a proxy-user context for
     * {@code user}.
     *
     * @param url The URL of the page to get snapshot timestamps for
     * @param user The user retrieving the timestamps
     * @return The list of timestamps, in the order the dataset reader yields
     * them; empty if the URL has no snapshots.
     * @throws IOException If resolving the URL or setting up the proxy user fails.
     */
    public List<Long> getSnapshotTimestamps(String url, final String user) throws IOException {
        final String normalizedUrl = normalizeUrl(url, user);
        // Routine trace information, so it is logged at DEBUG rather than ERROR.
        LOG.debug("Getting snapshot timestamps: url = {}, user = {}, normalized url = {}",
                new Object[] { url, user, normalizedUrl });
        UserGroupInformation ugi = UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser());

        return ugi.doAs(new PrivilegedAction<List<Long>>() {

            @Override
            public List<Long> run() {
                List<Long> timestamps = new ArrayList<Long>();
                DatasetReader<WebPageSnapshotModel> reader = null;
                try {
                    // Scan every row of this URL: the full fetchedAtRevTs range.
                    reader = webPageSnapshotModels(user).from("url", normalizedUrl).from("fetchedAtRevTs", 0L)
                            .to("url", normalizedUrl).to("fetchedAtRevTs", Long.MAX_VALUE).newReader();
                    while (reader.hasNext()) {
                        timestamps.add(reader.next().getFetchedAt());
                    }
                } finally {
                    if (reader != null) {
                        reader.close();
                    }
                }
                return timestamps;
            }
        });
    }

    /**
     * Get the most recent WebPageSnapshotModel from HBase
     *
     * The URL is normalized first (a stored redirect is replaced by its
     * destination), then the read runs inside a proxy-user context for
     * {@code user}.
     *
     * @param url The URL to get the snapshotted page from HBase
     * @param user The user retrieving the snapshot
     * @return The WebPageSnapshotModel, or null if there are no fetches for this
     * URL
     * @throws IOException If resolving the URL or setting up the proxy user fails.
     */
    private WebPageSnapshotModel getMostRecentWebPageSnapshot(String url, final String user) throws IOException {
        final String normalizedUrl = normalizeUrl(url, user);

        UserGroupInformation ugi = UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser());

        // Routine trace information, so it is logged at DEBUG rather than ERROR; parameterized
        // so the message is only built when DEBUG is enabled.
        LOG.debug("Created proxy user {} ugi: {}", ugi.getShortUserName(), ugi);

        return ugi.doAs(new PrivilegedAction<WebPageSnapshotModel>() {

            @Override
            public WebPageSnapshotModel run() {
                DatasetReader<WebPageSnapshotModel> reader = null;
                try {
                    // we don't know the exact timestamp in the key, but we know since keys
                    // are in timestamp descending order that the first row for an URL will be
                    // the most recent.
                    reader = webPageSnapshotModels(user).from("url", normalizedUrl).from("fetchedAtRevTs", 0L)
                            .to("url", normalizedUrl).to("fetchedAtRevTs", Long.MAX_VALUE).newReader();
                    return reader.hasNext() ? reader.next() : null;
                } finally {
                    if (reader != null) {
                        reader.close();
                    }
                }
            }

        });
    }

    /**
     * Fetch a single WebPageSnapshotModel from HBase by exact key.
     *
     * The key is built from the normalized URL and the reverse timestamp
     * {@code Long.MAX_VALUE - ts}; the lookup runs inside a proxy-user context
     * for {@code user}.
     *
     * @param url The URL of the WebPageSnapshotModel
     * @param ts The snapshot timestamp of the WebPageSnapshotModel
     * @param user The user retrieving the snapshot
     * @return The WebPageSnapshotModel, or null if there is no snapshot for the
     * URL at this timestamp.
     * @throws IOException If resolving the URL or setting up the proxy user fails.
     */
    private WebPageSnapshotModel getWebPageSnapshot(String url, final long ts, final String user)
            throws IOException {
        final String normalizedUrl = normalizeUrl(url, user);
        final UserGroupInformation proxy = UserGroupInformation.createProxyUser(user,
                UserGroupInformation.getLoginUser());

        return proxy.doAs(new PrivilegedAction<WebPageSnapshotModel>() {

            @Override
            public WebPageSnapshotModel run() {
                Key.Builder keyBuilder = new Key.Builder(webPageSnapshotModels(user));
                keyBuilder.add("url", normalizedUrl);
                keyBuilder.add("fetchedAtRevTs", Long.MAX_VALUE - ts);
                Key snapshotKey = keyBuilder.build();
                return webPageSnapshotModels(user).get(snapshotKey);
            }
        });
    }

    /**
     * Get WebPageSnapshotModels for an URL from HBase since the since param.
     *
     * Rows are keyed by the reverse timestamp {@code Long.MAX_VALUE - fetchedAt},
     * so "fetched at or after {@code since}" corresponds to the key range
     * {@code [0, Long.MAX_VALUE - since]}. The scan runs inside a proxy-user
     * context for {@code user}.
     *
     * @param url The URL of the page to fetch
     * @param since The epoch timestamp; snapshots fetched at or after it are
     * returned. Negative values are treated as 0.
     * @param user The user retrieving the snapshots
     * @return The list of models that have been fetched for an URL since the
     * since param; empty (never null) if there are none.
     * @throws IOException If resolving the URL or setting up the proxy user fails.
     */
    private List<WebPageSnapshotModel> getWebPageSnapshotsSince(String url, final long since, final String user)
            throws IOException {
        final String normalizedUrl = normalizeUrl(url, user);
        // Translate the epoch bound into the reverse-timestamp key space used when snapshots
        // are stored; clamping at 0 keeps the subtraction from overflowing for negative input.
        final long maxRevTs = Long.MAX_VALUE - Math.max(0L, since);

        UserGroupInformation ugi = UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser());

        // Return the action's result; previously it was discarded and null was returned.
        return ugi.doAs(new PrivilegedAction<List<WebPageSnapshotModel>>() {

            @Override
            public List<WebPageSnapshotModel> run() {
                List<WebPageSnapshotModel> models = new ArrayList<WebPageSnapshotModel>();
                DatasetReader<WebPageSnapshotModel> reader = null;
                try {
                    reader = webPageSnapshotModels(user).from("url", normalizedUrl).from("fetchedAtRevTs", 0L)
                            .to("url", normalizedUrl).to("fetchedAtRevTs", maxRevTs).newReader();
                    while (reader.hasNext()) {
                        models.add(reader.next());
                    }
                } finally {
                    if (reader != null) {
                        reader.close();
                    }
                }
                return models;
            }
        });
    }

    /**
     * Normalize an URL. Currently this only means: if a redirect is stored for
     * the URL, return the redirect's destination; otherwise return the URL as
     * passed in.
     *
     * @param url The url to normalize
     * @param user The user on whose behalf the redirect lookup is made
     * @return The normalized URL
     * @throws IOException If the redirect lookup fails.
     */
    private String normalizeUrl(String url, final String user) throws IOException {
        // Snapshots are stored under the final destination a page lives at, so a
        // redirecting URL has to be swapped for its destination before reading.
        final WebPageRedirectModel redirect = getRedirect(url, user);
        if (redirect == null) {
            return url;
        }
        return redirect.getDestinationUrl();
    }

    /**
     * Look up the redirect entry stored for an URL, inside a proxy-user context
     * for {@code user}.
     *
     * @param url The URL to look up in the redirect dataset
     * @param user The user on whose behalf the lookup is made
     * @return The WebPageRedirectModel keyed by this URL, or whatever the
     * dataset's get returns when no such entry exists (callers treat null as
     * "not a redirect").
     * @throws IOException If setting up the proxy user fails.
     */
    private WebPageRedirectModel getRedirect(final String url, final String user) throws IOException {
        final UserGroupInformation proxy = UserGroupInformation.createProxyUser(user,
                UserGroupInformation.getLoginUser());

        PrivilegedAction<WebPageRedirectModel> lookup = new PrivilegedAction<WebPageRedirectModel>() {

            @Override
            public WebPageRedirectModel run() {
                Key.Builder keyBuilder = new Key.Builder(webPageRedirectModels(user));
                Key redirectKey = keyBuilder.add("url", url).build();
                return webPageRedirectModels(user).get(redirectKey);
            }
        };
        return proxy.doAs(lookup);
    }

    /**
     * Fetch the web page from the URL, parse the HTML to populate the metadata
     * required by WebPageSnapshotModel, and return the constructed
     * WebPageSnapshotModel.
     *
     * The model's URL is the URL reported by the response (which differs from
     * {@code url} when the request was redirected). The fetch time is taken
     * immediately before the request is executed; the stored size is the
     * character length of the parsed document's HTML.
     *
     * @param url The URL to fetch the web page from
     * @param contentKey The key under which the page HTML is stored in the
     * model's content map
     * @return The WebPageSnapshotModel
     * @throws IOException Thrown if there's an issue fetching the web page.
     */
    private WebPageSnapshotModel fetchWebPage(String url, String contentKey) throws IOException {
        long fetchTime = System.currentTimeMillis();
        Connection connection = Jsoup.connect(url);
        Response response = connection.execute();
        long postFetchTime = System.currentTimeMillis();
        int timeToFetch = (int) (postFetchTime - fetchTime);

        Document doc = response.parse();
        // Serialize the document once and reuse it for both the size and the stored content,
        // instead of rendering the whole page to a string twice.
        String html = doc.html();
        String destinationUrl = response.url().toString();
        String title = doc.title();
        String description = getDescriptionFromDocument(doc);
        List<String> keywords = getKeywordsFromDocument(doc);
        List<String> outlinks = getOutlinksFromDocument(doc);

        // Keys use a reverse timestamp so that newer snapshots sort first.
        return WebPageSnapshotModel.newBuilder().setUrl(destinationUrl)
                .setFetchedAtRevTs(Long.MAX_VALUE - fetchTime).setSize(html.length()).setFetchedAt(fetchTime)
                .setFetchTimeMs(timeToFetch).setTitle(title).setDescription(description).setKeywords(keywords)
                .setOutlinks(outlinks).setContentKey(contentKey).setContent(ImmutableMap.of(contentKey, html))
                .build();
    }

    /**
     * Extract the page description from the document's
     * {@code <meta name="description">} tag.
     *
     * @param doc The Document to parse
     * @return The value of the "content" attribute as returned by the selection,
     * or an empty string when the document has no description meta tag.
     */
    private String getDescriptionFromDocument(Document doc) {
        final Elements descriptionTags = doc.select("meta[name=description]");
        if (descriptionTags.isEmpty()) {
            return "";
        }
        return descriptionTags.attr("content");
    }

    /**
     * Parse the keywords out of the meta tag if one exists. Otherwise, return an
     * empty list.
     *
     * The "content" attribute of every {@code <meta name="keywords">} tag is
     * split on commas and each piece is trimmed. Pieces that are empty after
     * trimming (an empty attribute, doubled or trailing commas) are skipped so
     * the result never contains blank keywords.
     *
     * @param doc The Document to parse
     * @return The list of non-empty keywords, in document order.
     */
    private List<String> getKeywordsFromDocument(Document doc) {
        List<String> keywords = new ArrayList<String>();
        Elements keywordsElements = doc.select("meta[name=keywords]");
        for (Element keywordsElement : keywordsElements) {
            for (String keyword : keywordsElement.attr("content").split(",")) {
                String trimmed = keyword.trim();
                // "".split(",") yields a single empty string; don't record it as a keyword.
                if (!trimmed.isEmpty()) {
                    keywords.add(trimmed);
                }
            }
        }
        return keywords;
    }

    /**
     * Collect the link targets of every anchor in the document that carries an
     * href attribute.
     *
     * @param doc The document to parse
     * @return The trimmed href values, in document order, exactly as written in
     * the page.
     */
    private List<String> getOutlinksFromDocument(Document doc) {
        final Elements anchors = doc.select("a[href]");
        final List<String> hrefs = new ArrayList<String>();
        for (Element anchor : anchors) {
            String href = anchor.attr("href");
            hrefs.add(href.trim());
        }
        return hrefs;
    }

    /**
     * Convert every element of a list to the target class through the
     * conversionService.
     *
     * @param list The list of objects to convert
     * @param clazz The class to convert those objects to
     * @return A new list holding the converted objects, in the same order.
     */
    private <T> List<T> convertList(List<?> list, Class<T> clazz) {
        final List<T> converted = new ArrayList<T>();
        final java.util.Iterator<?> it = list.iterator();
        while (it.hasNext()) {
            T target = conversionService.convert(it.next(), clazz);
            converted.add(target);
        }
        return converted;
    }

    /**
     * Return the snapshot dataset for a user, loading and caching it on first
     * use.
     *
     * On a cache miss, a copy of the current default configuration gets
     * "hbase.client.instance.id" set to the user and is installed as the new
     * default before the dataset is loaded from webPageSnapshotUri.
     * NOTE(review): presumably this gives each user its own HBase client
     * instance - confirm against the Kite/HBase configuration docs.
     *
     * @param user The user the dataset is opened for; also the cache key
     * @return The cached or newly loaded dataset
     */
    private synchronized RandomAccessDataset<WebPageSnapshotModel> webPageSnapshotModels(String user) {
        RandomAccessDataset<WebPageSnapshotModel> cached = webPageSnapshotModelMap.get(user);
        if (cached != null) {
            return cached;
        }

        Configuration userConf = new Configuration(DefaultConfiguration.get());
        userConf.set("hbase.client.instance.id", user);
        DefaultConfiguration.set(userConf);

        RandomAccessDataset<WebPageSnapshotModel> loaded = Datasets.load(webPageSnapshotUri,
                WebPageSnapshotModel.class);
        webPageSnapshotModelMap.put(user, loaded);
        return loaded;
    }

    /**
     * Return the redirect dataset for a user, loading and caching it on first
     * use.
     *
     * On a cache miss, a copy of the current default configuration gets
     * "hbase.client.instance.id" set to the user and is installed as the new
     * default before the dataset is loaded from webPageRedirectUri.
     * NOTE(review): presumably this gives each user its own HBase client
     * instance - confirm against the Kite/HBase configuration docs.
     *
     * @param user The user the dataset is opened for; also the cache key
     * @return The cached or newly loaded dataset
     */
    private synchronized RandomAccessDataset<WebPageRedirectModel> webPageRedirectModels(String user) {
        RandomAccessDataset<WebPageRedirectModel> cached = webPageRedirectModelMap.get(user);
        if (cached != null) {
            return cached;
        }

        Configuration userConf = new Configuration(DefaultConfiguration.get());
        userConf.set("hbase.client.instance.id", user);
        DefaultConfiguration.set(userConf);

        RandomAccessDataset<WebPageRedirectModel> loaded = Datasets.load(webPageRedirectUri,
                WebPageRedirectModel.class);
        webPageRedirectModelMap.put(user, loaded);
        return loaded;
    }
}