no.kantega.publishing.modules.linkcheck.crawl.LinkExtractor.java Source code

Java tutorial

Introduction

Here is the source code for no.kantega.publishing.modules.linkcheck.crawl.LinkExtractor.java

Source

/*
 * Copyright 2009 Kantega AS
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package no.kantega.publishing.modules.linkcheck.crawl;

import no.kantega.commons.exception.SystemException;
import no.kantega.publishing.api.content.ContentAO;
import no.kantega.publishing.api.content.ContentIdentifier;
import no.kantega.publishing.api.content.attribute.AttributeDataType;
import no.kantega.publishing.common.Aksess;
import no.kantega.publishing.common.data.Content;
import no.kantega.publishing.common.data.attributes.*;
import no.kantega.publishing.eventlog.Event;
import no.kantega.publishing.eventlog.EventLog;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;

import static org.apache.commons.lang3.StringUtils.isNotBlank;

public class LinkExtractor {
    private static final Logger log = LoggerFactory.getLogger(LinkExtractor.class);

    private final EventLog eventLog;
    private final ContentAO contentAO;

    public LinkExtractor(EventLog eventLog, ContentAO contentAO) {
        this.eventLog = eventLog;
        this.contentAO = contentAO;
    }

    public synchronized void extractLinks(Content content, LinkHandler linkHandler) throws SystemException {

        if (content.isExternalLink()) {
            linkHandler.contentLinkFound(content, content.getLocation());
        } else {
            content = contentAO.getContent(ContentIdentifier.fromContentId(content.getId()), true);

            List<Attribute> attributes = content.getAttributes(AttributeDataType.CONTENT_DATA);
            for (Attribute attribute : attributes) {
                handleAttribute(content, linkHandler, attribute);
            }

        }
    }

    private void handleAttribute(Content content, LinkHandler linkHandler, Attribute attribute) {
        String attrName = (isNotBlank(attribute.getTitle())) ? attribute.getTitle() : attribute.getName();
        if (attribute instanceof HtmltextAttribute) {
            String html = attribute.getValue();
            try {
                if (html != null) {
                    Elements links = Jsoup.parse(html).select("a[href]");
                    for (Element link : links) {
                        String href = link.attr("href");
                        linkHandler.attributeLinkFound(content, href, attrName);

                    }
                }
            } catch (Throwable e) {
                eventLog.log("LinkExtractor", "localhost", Event.FAILED_LINK_EXTRACT,
                        String.format("Failed to extract links from %s", content.getUrl()), content);
                log.error("contentId: {}, associationid: {}, attribute: {} {}", content.getId(),
                        content.getAssociation().getId(), attrName, html);
            }
        } else if (attribute instanceof UrlAttribute) {
            String link = attribute.getValue();
            if (link != null && link.length() > 0) {
                if (link.startsWith("/")) {
                    link = Aksess.VAR_WEB + link;
                }
                linkHandler.attributeLinkFound(content, link, attrName);
            }
        } else if (attribute instanceof FileAttribute && isNotBlank(attribute.getValue())) {
            try {
                int attachmentId = Integer.parseInt(attribute.getValue());
                String link = Aksess.VAR_WEB + "/attachment.ap?id=" + attachmentId;
                linkHandler.attributeLinkFound(content, link, attrName);
            } catch (Exception e) {
                log.error("Error getting Content({}) FileAttribute {} with value {}", content.getId(),
                        attribute.getName(), attribute.getValue());
            }
        } else if (attribute instanceof MediaAttribute && isNotBlank(attribute.getValue())) {
            try {
                int mediaId = Integer.parseInt(attribute.getValue());
                String link = Aksess.VAR_WEB + "/multimedia.ap?id=" + mediaId;
                linkHandler.attributeLinkFound(content, link, attrName);
            } catch (Exception e) {
                log.error("Error getting Content({}) FileAttribute {} with value {}", content.getId(),
                        attribute.getName(), attribute.getValue());
            }
        } else if (attribute instanceof RepeaterAttribute) {
            RepeaterAttribute repeaterAttribute = (RepeaterAttribute) attribute;
            for (List<Attribute> attributes : repeaterAttribute) {
                for (Attribute a : attributes) {
                    handleAttribute(content, linkHandler, a);
                }
            }
        }
    }
}