com.crosstreelabs.cognitio.gumshoe.format.HtmlFormatHandler.java Source code

Introduction

Here is the source code for com.crosstreelabs.cognitio.gumshoe.format.HtmlFormatHandler.java
Source

/*
 * Copyright 2015 Crosstree Labs.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.crosstreelabs.cognitio.gumshoe.format;

import com.crosstreelabs.cognitio.api.extension.FormatHandler;
import com.crosstreelabs.cognitio.api.resource.Visit;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import io.mola.galimatias.GalimatiasParseException;
import io.mola.galimatias.URL;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.logging.Level;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.utils.URIBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link FormatHandler} for HTML resources.
 *
 * <p>Link extraction is done with jsoup and main-text extraction with
 * boilerpipe's {@link ArticleExtractor}. Every operation that reads the
 * visit's content stream first makes sure the stream is an in-memory
 * {@link ByteArrayInputStream}, and rewinds it afterwards, so the individual
 * processing steps can be invoked in any order.
 */
public class HtmlFormatHandler implements FormatHandler {
    private static final Logger LOGGER = LoggerFactory.getLogger(HtmlFormatHandler.class);

    /** Content type that is accepted without inspecting the body. */
    private static final String HTML_CONTENT_TYPE = "text/html";

    /** Charset used when the visit does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Number of leading bytes inspected when sniffing for an {@code <html} tag. */
    private static final int SNIFF_LENGTH = 250;

    /**
     * Decides whether this handler can process the given visit.
     *
     * <p>A visit is accepted when its content type is exactly
     * {@code text/html}. Otherwise, if a content stream is present, the first
     * {@value #SNIFF_LENGTH} bytes are searched (case-insensitively) for
     * {@code <html}. Sniffing may replace {@code visit.contentStream} with an
     * in-memory copy; the stream is rewound before returning.
     *
     * @param visit the visit to inspect
     * @return {@code true} if the visit looks like HTML; {@code false}
     *         otherwise, including when the content could not be read
     */
    @Override
    public boolean handles(final Visit visit) {
        if (HTML_CONTENT_TYPE.equals(visit.contentType)) {
            return true;
        }
        if (visit.contentStream == null) {
            return false;
        }
        try {
            ensureResettable(visit);
            byte[] head = new byte[SNIFF_LENGTH];
            int read = IOUtils.read(visit.contentStream, head, 0, SNIFF_LENGTH);
            visit.contentStream.reset();
            // Only the bytes actually read are decoded. ISO-8859-1 maps every
            // byte to a character, so decoding never depends on the platform
            // charset and the ASCII marker is matched reliably.
            String chunk = new String(head, 0, read, StandardCharsets.ISO_8859_1);
            return chunk.toLowerCase(Locale.ROOT).contains("<html");
        } catch (IOException ex) {
            LOGGER.warn("Unable to inspect content while sniffing for HTML", ex);
            return false;
        }
    }

    /**
     * Extracts the links and then the main text of the given visit.
     *
     * @param visit the visit to process
     * @throws UnsupportedOperationException if {@link #handles(Visit)} rejects the visit
     */
    @Override
    public void handle(final Visit visit) {
        if (!handles(visit)) {
            throw new UnsupportedOperationException("Cannot handle resource");
        }

        processLinks(visit);
        processContent(visit);
    }

    /**
     * Parses the visit's content as HTML and adds the target of every usable
     * {@code <a>} element to {@code visit.discoveredLinks}.
     *
     * <p>Links whose raw {@code href} is empty after fragment stripping, or
     * contains {@code javascript:}, {@code mailto:} or {@code @}, are ignored.
     * A link that cannot be parsed as a URL is logged and skipped so that one
     * bad anchor does not discard the rest of the page. The content stream is
     * rewound afterwards.
     *
     * @param visit the visit whose content is scanned for links
     * @throws IllegalStateException if the visit has no content stream
     * @throws RuntimeException wrapping the {@link IOException} if the content cannot be read
     */
    @Override
    public void processLinks(final Visit visit) {
        try {
            ensureResettable(visit);
            String charset = StringUtils.defaultIfBlank(visit.contentCharset, DEFAULT_CHARSET);

            Document doc = Jsoup.parse(visit.contentStream, charset, visit.result.location);
            Elements anchors = doc.getElementsByTag("a");

            for (Element anchor : anchors) {
                String absolute = stripURLFragmentIdentifier(anchor.attr("abs:href"));
                // Locale.ROOT keeps the scheme checks below independent of the
                // default locale (e.g. Turkish dotless-i lowercasing).
                String raw = stripURLFragmentIdentifier(
                        anchor.attr("href").toLowerCase(Locale.ROOT));
                if (raw.isEmpty() || absolute.isEmpty() || raw.contains("javascript:")
                        || raw.contains("mailto:") || raw.contains("@")) {
                    continue;
                }

                try {
                    // TODO Need to add the link text as the title
                    visit.discoveredLinks.add(URL.parse(absolute).toString());
                } catch (GalimatiasParseException ex) {
                    LOGGER.warn("Skipping unparseable link '{}'", absolute, ex);
                }
            }
            visit.contentStream.reset();
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }
    }

    /**
     * Extracts the article text from the visit's content and stores it in
     * {@code visit.result.description}.
     *
     * <p>The content is decoded with {@code visit.contentCharset}, falling
     * back to UTF-8 when that is blank, which matches the decoding used by
     * {@link #processLinks(Visit)}. The content stream is rewound afterwards.
     *
     * @param visit the visit whose content is extracted
     * @throws IllegalStateException if the visit has no content stream
     * @throws RuntimeException wrapping the underlying exception if the
     *         content cannot be read, the charset is unsupported, or the
     *         extraction fails
     */
    @Override
    public void processContent(final Visit visit) {
        try {
            ensureResettable(visit);
            String charset = StringUtils.defaultIfBlank(visit.contentCharset, DEFAULT_CHARSET);
            visit.result.description = ArticleExtractor.getInstance()
                    .getText(new InputStreamReader(visit.contentStream, charset));
            visit.contentStream.reset();
        } catch (BoilerpipeProcessingException | IOException ex) {
            throw new RuntimeException(ex);
        }
    }

    /**
     * Removes the fragment identifier from a URL.
     *
     * <p>Everything from the first {@code #} onwards is dropped, unless that
     * first {@code #} is immediately followed by {@code !}, in which case the
     * URL is returned unchanged.
     *
     * @param url the URL to strip; must not be {@code null}
     * @return the URL without its fragment, or the original URL if it has no
     *         fragment or its first fragment starts with {@code #!}
     */
    protected static String stripURLFragmentIdentifier(final String url) {
        int pos = url.indexOf('#');
        if (pos < 0 || url.startsWith("#!", pos)) {
            return url;
        }
        return url.substring(0, pos);
    }

    /**
     * Ensures {@code visit.contentStream} is a {@link ByteArrayInputStream},
     * copying the current stream fully into memory when it is not, so that
     * later {@code reset()} calls are supported.
     *
     * @param visit the visit whose stream is buffered
     * @throws IllegalStateException if the visit has no content stream
     * @throws IOException if the current stream cannot be read
     */
    private static void ensureResettable(final Visit visit) throws IOException {
        if (visit.contentStream == null) {
            throw new IllegalStateException("Visit has no content stream");
        }
        if (visit.contentStream instanceof ByteArrayInputStream) {
            return;
        }
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        // NOTE(review): the source stream is drained but not closed here;
        // ownership of that stream is not visible in this file - confirm.
        IOUtils.copy(visit.contentStream, buffer);
        visit.contentStream = new ByteArrayInputStream(buffer.toByteArray());
    }

}