biz.webgate.domino.mywebgate.util.URLFetcher.java Source code

Java tutorial

Introduction

Here is the source code for biz.webgate.domino.mywebgate.util.URLFetcher.java

Source

/*
 *  Copyright WebGate Consulting AG, 2012
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at:
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
 * implied. See the License for the specific language governing 
 * permissions and limitations under the License.
 */
package biz.webgate.domino.mywebgate.util;

import java.util.ArrayList;
import java.util.HashMap;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * Downloads an HTML page and extracts preview information from it: a title,
 * a description, candidate thumbnail image URLs and all OpenGraph
 * ({@code og:*}) meta properties.
 *
 * <p>Usage: construct with the page URL, call {@link #fetchURL()} once and then
 * read the results through the getters. Failures are not thrown; they are
 * recorded and exposed through {@link #getException()} and {@link #getError()}.
 */
public class URLFetcher {

    /** Placeholder entry put into the thumbnail list before any image found on the page. */
    private static final String NO_IMAGE = "no image";

    /** The height filter is only applied when a page has more than this many {@code img} tags. */
    private static final int MANY_IMAGES_THRESHOLD = 20;

    /** Images declaring a height above this value are skipped when the height filter is active. */
    private static final int MAX_THUMBNAIL_HEIGHT = 200;

    private String m_URL;
    private String m_Title = "";
    private String m_Description = "";
    private String m_URLContent = "";
    private ArrayList<String> m_ThumbNails = new ArrayList<String>();

    private Exception m_Exception;
    private String m_Error;

    private HashMap<String, String> m_OpenGraph = new HashMap<String, String>();

    /**
     * Creates a fetcher for the given page. Nothing is downloaded until
     * {@link #fetchURL()} is called.
     *
     * @param url the address of the page to analyse
     */
    public URLFetcher(String url) {
        super();
        m_URL = url;
    }

    /**
     * @return the URL passed to the constructor
     */
    public String getURL() {
        return m_URL;
    }

    /**
     * @return the {@code og:title} value if the page declares one, otherwise
     *         the text of the first {@code title} element, otherwise an empty
     *         string
     */
    public String getTitle() {
        return m_Title;
    }

    /**
     * @return the {@code og:description} value if the page declares one,
     *         otherwise the content of the first {@code description} meta tag;
     *         never {@code null}
     */
    public String getDescription() {
        return m_Description == null ? "" : m_Description;
    }

    /**
     * @return the collected thumbnail candidates (the internal list, not a
     *         copy); after a successful fetch the first entry is the
     *         placeholder {@code "no image"}
     */
    public ArrayList<String> getThumbNails() {
        return m_ThumbNails;
    }

    /**
     * Requests the page with HTTP GET and, when the server answers with status
     * 200, parses the HTML and fills title, description, thumbnails and
     * OpenGraph values.
     *
     * <p>Any other status code leaves the results untouched and still returns
     * {@code true}. A parser failure ({@code SAXException} or
     * {@code IllegalStateException}) is recorded but also returns {@code true}.
     * The HTTP client is shut down on every path.
     *
     * @return {@code false} if any other exception occurred (for example an
     *         I/O error or an invalid URL), {@code true} otherwise
     */
    public boolean fetchURL() {
        HttpClient httpClient = null;
        try {
            httpClient = new DefaultHttpClient();

            HttpGet httpGet = new HttpGet(m_URL);
            // NOTE(review): Content-Type describes a request body, which a GET
            // does not have; "Accept" was probably intended. Kept unchanged so
            // the request sent to servers stays the same - confirm before changing.
            httpGet.addHeader("Content-Type", "text/html; charset=utf-8");
            HttpResponse response = httpClient.execute(httpGet);
            int statusCode = response.getStatusLine().getStatusCode();

            if (statusCode == 200) {
                addThumbNail(NO_IMAGE);
                HttpEntity entity = response.getEntity();
                try {
                    DOMParser dpHTML = new DOMParser();
                    dpHTML.setProperty("http://cyberneko.org/html/properties/default-encoding", "utf-8");
                    dpHTML.parse(new InputSource(entity.getContent()));
                    readDocument(dpHTML.getDocument(), buildBaseURL(m_URL));
                } catch (IllegalStateException e) {
                    recordFailure(e);
                } catch (SAXException e) {
                    recordFailure(e);
                }
            }
        } catch (Exception e) {
            recordFailure(e);
            return false;
        } finally {
            // Release the connection for every outcome, not only for status 200.
            if (httpClient != null) {
                httpClient.getConnectionManager().shutdown();
            }
        }
        return true;
    }

    /**
     * Returns the URL up to (excluding) its last slash, which is used as the
     * prefix for relative image paths. The index check keeps the slashes of
     * the {@code http://} / {@code https://} prefix from being treated as a
     * path separator.
     */
    private static String buildBaseURL(String strURL) {
        int nLastSlash = strURL.lastIndexOf("/");
        if (nLastSlash > 8) {
            return strURL.substring(0, nLastSlash);
        }
        return strURL;
    }

    /** Extracts OpenGraph data, description, title and thumbnails from the parsed page. */
    private void readDocument(Document doc, String strBaseURL) {
        NodeList ndlMet = doc.getElementsByTagName("meta");
        NodeList ndlTitle = doc.getElementsByTagName("title");
        NodeList ndlImage = doc.getElementsByTagName("img");

        // OpenGraph values win; the plain HTML tags below are only fallbacks.
        check4OpenGraphTags(ndlMet, strBaseURL);

        if (m_Description.equals("")) {
            String strDescription = findMetaDescription(ndlMet);
            if (strDescription != null) {
                m_Description = strDescription;
            }
        }
        if (ndlTitle.getLength() > 0 && m_Title.equals("")) {
            String strTitle = readTitle((Element) ndlTitle.item(0));
            if (strTitle != null) {
                m_Title = strTitle;
            }
        }
        collectThumbNails(ndlImage, strBaseURL);
    }

    /**
     * Returns the content of the first {@code description} meta tag that has a
     * {@code content} attribute, or {@code null} if there is none.
     */
    private static String findMetaDescription(NodeList ndlMeta) {
        for (int nCounter = 0; nCounter < ndlMeta.getLength(); nCounter++) {
            Element elCurrent = (Element) ndlMeta.item(nCounter);
            if ("description".equalsIgnoreCase(elCurrent.getAttribute("name"))
                    && elCurrent.hasAttribute("content")) {
                return elCurrent.getAttribute("content");
            }
        }
        return null;
    }

    /**
     * Returns the value of the title element's first child node, or
     * {@code null} when the element is empty (an empty {@code <title>} has no
     * child node at all).
     */
    private static String readTitle(Element elTitle) {
        if (elTitle.getFirstChild() == null) {
            return null;
        }
        return elTitle.getFirstChild().getNodeValue();
    }

    /**
     * Adds the {@code src} of every image to the thumbnail list. On pages with
     * more than {@link #MANY_IMAGES_THRESHOLD} images, images that declare a
     * height above {@link #MAX_THUMBNAIL_HEIGHT} are left out.
     */
    private void collectThumbNails(NodeList ndlImage, String strBaseURL) {
        boolean bFilterByHeight = ndlImage.getLength() > MANY_IMAGES_THRESHOLD;
        for (int nCounter = 0; nCounter < ndlImage.getLength(); nCounter++) {
            Element elCurrent = (Element) ndlImage.item(nCounter);
            if (!elCurrent.hasAttribute("src")) {
                continue;
            }
            if (bFilterByHeight && elCurrent.hasAttribute("height")
                    && isTallerThanLimit(elCurrent.getAttribute("height"))) {
                continue;
            }
            addThumbNail(checkIMAGEURL(elCurrent.getAttribute("src"), strBaseURL));
        }
    }

    /**
     * Tells whether a {@code height} attribute value such as {@code "300"} or
     * {@code "300px"} exceeds {@link #MAX_THUMBNAIL_HEIGHT}. Values that are
     * not plain integers (for example percentages) never exceed the limit.
     */
    private static boolean isTallerThanLimit(String strHeight) {
        try {
            // String.replace returns a new string; the result must be used.
            int nHeight = Integer.parseInt(strHeight.replace("px", "").trim());
            return nHeight > MAX_THUMBNAIL_HEIGHT;
        } catch (NumberFormatException ignored) {
            // Unparseable height: keep the image rather than guess its size.
            return false;
        }
    }

    /** Appends the entry to the thumbnail list unless it is already present. */
    private void addThumbNail(String strThumbNail) {
        if (!m_ThumbNails.contains(strThumbNail)) {
            m_ThumbNails.add(strThumbNail);
        }
    }

    /** Stores the failure so that callers can read it through the getters. */
    private void recordFailure(Exception e) {
        m_Error = e.getLocalizedMessage();
        m_Exception = e;
        e.printStackTrace();
    }

    /**
     * Scans the meta tags for OpenGraph properties. Every {@code og:*}
     * property is stored in the OpenGraph map; {@code og:image},
     * {@code og:title}, {@code og:url} and {@code og:description} additionally
     * fill the thumbnail list, title, content URL and description.
     *
     * @param ndlMeta    the {@code meta} elements of the page
     * @param strBaseURL prefix used to complete a relative {@code og:image}
     */
    private void check4OpenGraphTags(NodeList ndlMeta, String strBaseURL) {
        for (int nCounter = 0; nCounter < ndlMeta.getLength(); nCounter++) {
            Element elMeta = (Element) ndlMeta.item(nCounter);
            if (!elMeta.hasAttribute("property")) {
                continue;
            }
            String strProperty = elMeta.getAttribute("property");
            String strContent = elMeta.getAttribute("content");

            if (strProperty.regionMatches(true, 0, "og:", 0, 3)) {
                m_OpenGraph.put(strProperty, strContent);
            }
            if ("og:image".equalsIgnoreCase(strProperty)) {
                // An empty content would turn into a bare base URL; skip it.
                if (strContent.length() > 0) {
                    addThumbNail(checkIMAGEURL(strContent, strBaseURL));
                }
            }
            if ("og:title".equalsIgnoreCase(strProperty)) {
                m_Title = strContent;
            }
            if ("og:url".equalsIgnoreCase(strProperty)) {
                m_URLContent = strContent;
            }
            if ("og:description".equalsIgnoreCase(strProperty)) {
                m_Description = strContent;
            }
        }
    }

    /**
     * @return the last exception recorded by {@link #fetchURL()}, or
     *         {@code null} if none occurred
     */
    public Exception getException() {
        return m_Exception;
    }

    /**
     * @return the localized message of the last recorded exception, or
     *         {@code null} if none occurred
     */
    public String getError() {
        return m_Error;
    }

    /**
     * @return the {@code og:url} value of the page, or an empty string if the
     *         page does not declare one
     */
    public String getURLContent() {
        return m_URLContent;
    }

    /**
     * @return all {@code og:*} meta properties found on the page, keyed by the
     *         property name as written in the page (the internal map, not a
     *         copy)
     */
    public HashMap<String, String> getOpenGraph() {
        return m_OpenGraph;
    }

    /**
     * Turns an image reference found in the page into an absolute URL.
     *
     * <ul>
     * <li>{@code //host/x} (protocol-relative) gets the scheme of the page.</li>
     * <li>{@code http://...} and {@code https://...} are returned unchanged.</li>
     * <li>{@code /x} (root-relative) is appended to scheme and host of the page.</li>
     * <li>Anything else is appended to {@code strBaseURL}.</li>
     * </ul>
     *
     * @param strImage   the image reference as written in the page
     * @param strBaseURL the page URL without its last path segment
     * @return the absolute image URL
     */
    private String checkIMAGEURL(String strImage, String strBaseURL) {
        if (strImage.startsWith("//")) {
            String strScheme = strBaseURL.regionMatches(true, 0, "https://", 0, 8) ? "https:" : "http:";
            return strScheme + strImage;
        }
        if (strImage.regionMatches(true, 0, "http://", 0, 7)
                || strImage.regionMatches(true, 0, "https://", 0, 8)) {
            return strImage;
        }
        if (strImage.startsWith("/")) {
            return originOf(strBaseURL) + strImage;
        }
        return strBaseURL + "/" + strImage;
    }

    /**
     * Returns the {@code scheme://host[:port]} part of the URL, or the URL
     * itself if it has no path or no scheme separator.
     */
    private static String originOf(String strURL) {
        int nScheme = strURL.indexOf("://");
        if (nScheme < 0) {
            return strURL;
        }
        int nPath = strURL.indexOf('/', nScheme + 3);
        return nPath < 0 ? strURL : strURL.substring(0, nPath);
    }
}