com.attribyte.relay.HTMLUtil.java Source code

Java tutorial

Introduction

Here is the source code for com.attribyte.relay.HTMLUtil.java

Source

/*
 * Copyright 2016 Attribyte, LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 *
 */

package com.attribyte.relay;

import com.attribyte.client.ClientProtos;
import com.google.common.base.Strings;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.Set;

/**
 * HTML-related utility methods.
 */
public class HTMLUtil {

    /**
     * Extracts links for external citations and images from the entry's HTML content.
     * <p>
     *     Every {@code a[href]} with a non-empty value is added to the entry as an
     *     outbound citation. Every {@code img[src]} whose source is not already recorded
     *     on the entry (as the primary image or in the image list) is appended to the
     *     image list, along with its alt text, title and dimensions when present.
     *     If the entry has no primary image after extraction, the first image in the
     *     list becomes the primary image.
     * </p>
     * <p>
     *     Relative {@code href}/{@code src} values are resolved against {@code baseURI}.
     *     When a value can't be made absolute, the trimmed attribute value is used as-is.
     * </p>
     * @param entry The entry.
     * @param baseURI The URI used to resolve relative links.
     */
    public static void extractLinks(final ClientProtos.WireMessage.Entry.Builder entry, final String baseURI) {
        // Treat a null base URI as "no base" rather than handing null to the parser.
        final Document doc = Jsoup.parse(entry.getContent(), Strings.nullToEmpty(baseURI));

        addCitations(entry, doc);
        addImages(entry, doc);

        if (!entry.hasPrimaryImage() && entry.getImagesCount() > 0) { //Set the primary image as the first image.
            entry.setPrimaryImage(entry.getImages(0));
        }
    }

    /**
     * Adds an outbound citation to the entry for every anchor with a non-empty {@code href}.
     * @param entry The entry.
     * @param doc The parsed entry content.
     */
    private static void addCitations(final ClientProtos.WireMessage.Entry.Builder entry, final Document doc) {
        for (Element link : doc.select("a[href]")) {
            final String rawHref = trimmedAttr(link, "href");
            if (!rawHref.isEmpty()) {
                entry.addCitationsBuilder()
                        .setDirection(ClientProtos.WireMessage.Citation.Direction.OUT)
                        .setLink(resolve(link, "href", rawHref));
            }
        }
    }

    /**
     * Appends every image in the document that is not already recorded on the entry.
     * @param entry The entry.
     * @param doc The parsed entry content.
     */
    private static void addImages(final ClientProtos.WireMessage.Entry.Builder entry, final Document doc) {
        final Set<String> knownSources = Sets.newHashSetWithExpectedSize(8);

        //Don't replace/duplicate any previously added images...

        if (entry.hasPrimaryImage()) {
            knownSources.add(entry.getPrimaryImage().getOriginalSrc());
        }
        entry.getImagesList().forEach(image -> knownSources.add(image.getOriginalSrc()));

        for (Element image : doc.select("img[src]")) {
            final String rawSrc = trimmedAttr(image, "src");
            if (rawSrc.isEmpty()) {
                continue;
            }

            final String src = resolve(image, "src", rawSrc);

            // Compare both spellings: an image recorded earlier may have been stored
            // with either the attribute value as written or its absolute form.
            if (knownSources.contains(rawSrc) || knownSources.contains(src)) {
                continue;
            }
            knownSources.add(rawSrc);
            knownSources.add(src);

            final ClientProtos.WireMessage.Image.Builder imageBuilder = entry.addImagesBuilder().setOriginalSrc(src);

            final String alt = trimmedAttr(image, "alt");
            if (!alt.isEmpty()) {
                imageBuilder.setAltText(alt);
            }

            final String title = trimmedAttr(image, "title");
            if (!title.isEmpty()) {
                imageBuilder.setTitle(title);
            }

            imageBuilder.setSize("original");

            // Dimensions are recorded only when both attributes are present and parse as integers.
            final String heightStr = trimmedAttr(image, "height");
            final String widthStr = trimmedAttr(image, "width");
            if (!heightStr.isEmpty() && !widthStr.isEmpty()) {
                final Integer height = Ints.tryParse(heightStr);
                final Integer width = Ints.tryParse(widthStr);
                if (height != null && width != null) {
                    imageBuilder.setHeight(height).setWidth(width);
                }
            }
        }
    }

    /**
     * Gets an attribute value with surrounding whitespace removed.
     * @param element The element.
     * @param attributeKey The attribute name.
     * @return The trimmed value, or an empty string if the attribute is missing.
     */
    private static String trimmedAttr(final Element element, final String attributeKey) {
        return Strings.nullToEmpty(element.attr(attributeKey)).trim();
    }

    /**
     * Resolves a URL attribute against the element's base URI.
     * @param element The element.
     * @param attributeKey The name of the URL attribute.
     * @param fallback The value to return when no absolute URL could be produced.
     * @return The absolute URL, or {@code fallback} if resolution produced nothing.
     */
    private static String resolve(final Element element, final String attributeKey, final String fallback) {
        final String absolute = Strings.nullToEmpty(element.absUrl(attributeKey)).trim();
        return absolute.isEmpty() ? fallback : absolute;
    }
}