de.l3s.boilerpipe.sax.HTMLFetcher.java Source code

Introduction

Here is the source code for de.l3s.boilerpipe.sax.HTMLFetcher.java
Source

/**
 * Copyright (C) 2013 Christian Kohlschütter (ckkohl79@gmail.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.l3s.boilerpipe.sax;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;

import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.entity.ContentType;

import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;

/**
 * A very simple HTTP/HTML fetcher, really just for demo purposes.
 *
 * <p>
 * Three transports are offered: Apache HttpClient ({@link #fetch(String)}),
 * {@link URLConnection} ({@link #fetchHelper(URL)}) and OkHttp
 * ({@link #fetchOk(String)}).
 *
 * @author Christian Kohlschütter
 */
public class HTMLFetcher {
    /** Browser-like User-Agent sent by {@link #fetch(String)} and {@link #fetchHelper(URL)}. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.104 Safari/537.36";

    /** Referer header sent by {@link #fetch(String)}. */
    private static final String REFERER = "http://www.google.com";

    /** Charset assumed when neither the HTTP header nor the markup names a usable one. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /** Byte-transparent charset used to scan undecoded markup for an ASCII charset declaration. */
    private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");

    /**
     * Matches a {@code charset=...} declaration in a Content-Type header or in markup.
     * The value may be unquoted, single-quoted or double-quoted; the captured name
     * stops at the first quote, semicolon, whitespace, slash or {@code >} so that it
     * can never run past the end of the attribute.
     */
    private static final Pattern PAT_CHARSET = Pattern.compile(
            "charset\\s*=\\s*[\"']?([^\"';\\s/>]+)", Pattern.CASE_INSENSITIVE);

    /** Splits text on the same line terminators that {@code BufferedReader.readLine()} recognizes. */
    private static final Pattern PAT_LINE_BREAK = Pattern.compile("\\r\\n|[\\r\\n]");

    private HTMLFetcher() {
    }

    /**
     * Fetches the document at the given URL by delegating to {@link #fetch(String)}.
     *
     * @param url the location of the document
     * @return the document at the given URL
     * @throws IOException if the document cannot be retrieved
     */
    public static HTMLDocument fetch(final URL url) throws IOException {
        return fetch(url.toString());
    }

    /**
     * Fetches the document at the given URL with Apache HttpClient, using a single
     * request.
     *
     * <p>
     * The body is buffered as bytes and decoded with the charset reported for the
     * response entity; if none is reported, the charset declared in the markup is
     * used, and UTF-8 if there is no usable declaration either. Line terminators of
     * the document are preserved. The charset handed to the returned document is the
     * one declared in the markup (UTF-8 by default), which may differ from the charset
     * that was used for decoding.
     *
     * @param url the location of the document
     * @return the document at the given URL
     * @throws IOException if the request fails or the response carries no body
     */
    public static HTMLDocument fetch(final String url) throws IOException {
        final HttpGet request = new HttpGet(url);
        request.setHeader("User-Agent", USER_AGENT);
        request.setHeader("Referer", REFERER);

        // Closing the client also releases the connection used by the response.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            final HttpResponse response = httpclient.execute(request);
            final HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("No response body for " + url);
            }

            final Charset headerCharset = entityCharset(entity);
            final byte[] data = readBody(entity);

            // Without a header charset the bytes are scanned through a byte-transparent
            // decoding first; the declaration itself is plain ASCII.
            final String probe = new String(data, headerCharset != null ? headerCharset : ISO_8859_1);
            final Charset cs = sniffMarkupCharset(probe, UTF_8);
            final String text = headerCharset != null ? probe : new String(data, cs);

            return new HTMLDocument(text, cs);
        }
    }

    /**
     * Fetches the document at the given URL using {@link URLConnection}.
     *
     * <p>
     * The charset is taken from the Content-Type header (UTF-8 if absent or unusable).
     * A {@code gzip} Content-Encoding is decompressed; any other encoding, and any
     * content type other than {@code text/html}, is only reported on standard error.
     *
     * @param url the location of the document
     * @return the document at the given URL
     * @throws IOException if the document cannot be retrieved
     */
    public static HTMLDocument fetchHelper(final URL url) throws IOException {
        final URLConnection conn = url.openConnection();
        conn.setRequestProperty("User-Agent", USER_AGENT);
        final String ct = conn.getContentType();

        if (ct == null || !(ct.equals("text/html") || ct.startsWith("text/html;"))) {
            System.err.println("WARN: unsupported Content-type: " + ct + " (" + url + ")");
        }

        final Charset cs = charsetFromContentType(ct, UTF_8);

        InputStream in = conn.getInputStream();
        try {
            final String encoding = conn.getContentEncoding();
            if (encoding != null) {
                if ("gzip".equalsIgnoreCase(encoding)) {
                    in = new GZIPInputStream(in);
                } else {
                    System.err.println("WARN: unsupported Content-Encoding: " + encoding);
                }
            }
            return new HTMLDocument(readFully(in), cs);
        } finally {
            // Closes the gzip wrapper if one was installed, the raw stream otherwise.
            in.close();
        }
    }

    /**
     * Fetches the document at the given URL using OkHttp.
     *
     * <p>
     * The charset handed to the returned document is the one named by the response's
     * Content-Type, or UTF-8 when the header or its charset parameter is missing
     * (UTF-8 is also what OkHttp's {@code ResponseBody.string()} decodes with in that
     * case).
     *
     * @param url the location of the document
     * @return the document at the given URL
     * @throws IOException if the request fails
     */
    public static HTMLDocument fetchOk(final String url) throws IOException {
        final OkHttpClient client = new OkHttpClient();
        final Request request = new Request.Builder().url(url).build();

        final Response response = client.newCall(request).execute();

        Charset cs = UTF_8;
        if (response.body().contentType() != null) {
            cs = response.body().contentType().charset(UTF_8);
        }
        final String data = response.body().string();
        return new HTMLDocument(data, cs);
    }

    /** Returns the charset reported for the entity, or {@code null} if it names an unsupported charset. */
    private static Charset entityCharset(final HttpEntity entity) {
        try {
            return ContentType.getOrDefault(entity).getCharset();
        } catch (UnsupportedCharsetException e) {
            // The server named a charset this JVM does not know: treat it as not declared.
            return null;
        }
    }

    /** Reads the complete entity body and closes its content stream. */
    private static byte[] readBody(final HttpEntity entity) throws IOException {
        try (InputStream in = entity.getContent()) {
            return readFully(in);
        }
    }

    /** Reads the stream to its end without closing it. */
    private static byte[] readFully(final InputStream in) throws IOException {
        final ByteArrayOutputStream bos = new ByteArrayOutputStream();
        final byte[] buf = new byte[4096];
        int r;
        while ((r = in.read(buf)) != -1) {
            bos.write(buf, 0, r);
        }
        return bos.toByteArray();
    }

    /**
     * Returns the charset of the first charset declaration found in the markup, or
     * the fallback if there is none or it is not usable. Lines that contain
     * {@code src} but not {@code http-equiv} are skipped, because a charset on such a
     * line (for example on a script tag) describes another resource.
     */
    private static Charset sniffMarkupCharset(final String html, final Charset fallback) {
        for (final String line : PAT_LINE_BREAK.split(html)) {
            if (!line.contains("http-equiv") && line.contains("src")) {
                continue;
            }
            final Matcher m = PAT_CHARSET.matcher(line);
            if (m.find()) {
                // The first declaration decides, even when it turns out to be unusable.
                return lookupCharset(m.group(1), fallback);
            }
        }
        return fallback;
    }

    /** Returns the charset named in a Content-Type header value, or the fallback. */
    private static Charset charsetFromContentType(final String contentType, final Charset fallback) {
        if (contentType == null) {
            return fallback;
        }
        final Matcher m = PAT_CHARSET.matcher(contentType);
        if (m.find()) {
            return lookupCharset(m.group(1), fallback);
        }
        return fallback;
    }

    /** Resolves a charset name, returning the fallback for illegal or unsupported names. */
    private static Charset lookupCharset(final String name, final Charset fallback) {
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Covers both IllegalCharsetNameException and UnsupportedCharsetException.
            return fallback;
        }
    }
}