Java Network How to - Handle a Non-UTF-8 HTML Page








Question

We would like to know how to download and correctly decode an HTML page that is not encoded in UTF-8.

Answer

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
// from www.java2s.com
/**
 * Downloads an HTML page over HTTP and decodes it to a {@link String}.
 *
 * <p>The body is decoded with the charset named in the response's {@code Content-Type} header;
 * when the header names no usable charset, GB2312 is assumed. {@code gzip} and {@code deflate}
 * content encodings are decompressed transparently.
 */
public class Main {
  /** Charset assumed when the response's Content-Type header names no usable charset. */
  private static final String FALLBACK_CHARSET = "GB2312";

  /** Size, in chars, of the chunks used to copy the decoded body. */
  private static final int CHUNK_SIZE = 8192;

  public static void main(String[] args) {
    System.out.println(downloadHtml("http://baike.baidu.com"));
  }

  /**
   * Fetches the page at {@code urlString} and returns its decoded text.
   *
   * <p>This method is best-effort and never throws: on any failure (malformed or non-HTTP URL,
   * I/O error, corrupt compressed body) the stack trace is printed and whatever text was read
   * before the failure is returned.
   *
   * @param urlString absolute HTTP or HTTPS URL of the page
   * @return the decoded page text; possibly partial or empty on failure, never {@code null}
   */
  static String downloadHtml(String urlString) {
    StringBuilder page = new StringBuilder();
    HttpURLConnection conn = null;
    try {
      conn = (HttpURLConnection) new URL(urlString).openConnection();
      // Per-connection setting; the static setFollowRedirects would change a JVM-wide default.
      conn.setInstanceFollowRedirects(true);
      conn.setRequestProperty("Accept-Encoding", "gzip, deflate");

      // Reading response headers implicitly sends the request.
      Charset charset = charsetOf(conn.getContentType());
      String contentEncoding = conn.getContentEncoding();

      // try-with-resources closes every layer even when a later layer fails to open.
      try (InputStream raw = conn.getInputStream();
          InputStream body = decompress(raw, contentEncoding);
          Reader reader = new InputStreamReader(body, charset)) {
        char[] chunk = new char[CHUNK_SIZE];
        int count;
        while ((count = reader.read(chunk)) != -1) {
          page.append(chunk, 0, count);
        }
      }
    } catch (IOException | RuntimeException e) {
      // Deliberate best-effort: report the problem and return what was read so far.
      e.printStackTrace();
    } finally {
      if (conn != null) {
        conn.disconnect();
      }
    }
    return page.toString();
  }

  /** Resolves the charset named in a Content-Type header value, or the GB2312 fallback. */
  private static Charset charsetOf(String contentType) {
    if (contentType != null) {
      for (String part : contentType.split(";")) {
        int eq = part.indexOf('=');
        if (eq < 0 || !part.substring(0, eq).trim().equalsIgnoreCase("charset")) {
          continue;
        }
        String name = part.substring(eq + 1).trim();
        if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
          name = name.substring(1, name.length() - 1);
        }
        try {
          return Charset.forName(name);
        } catch (IllegalArgumentException unusable) {
          // Illegal or unsupported charset name: fall through to the fallback.
          break;
        }
      }
    }
    return Charset.forName(FALLBACK_CHARSET);
  }

  /** Wraps {@code raw} in the decompressor matching the Content-Encoding header, if any. */
  private static InputStream decompress(InputStream raw, String contentEncoding)
      throws IOException {
    if ("gzip".equalsIgnoreCase(contentEncoding)) {
      return new GZIPInputStream(raw);
    }
    if ("deflate".equalsIgnoreCase(contentEncoding)) {
      return inflate(raw);
    }
    return raw;
  }

  /**
   * Opens a "deflate"-encoded body, accepting both the zlib-wrapped form required by the HTTP
   * specification and the raw DEFLATE stream that some servers send instead.
   */
  private static InputStream inflate(InputStream raw) throws IOException {
    BufferedInputStream in = new BufferedInputStream(raw);
    in.mark(2);
    int cmf = in.read();
    int flg = in.read();
    in.reset();
    // A zlib header has compression method 8, a window-size code of at most 7, and its two
    // bytes read as a big-endian number are a multiple of 31.
    boolean zlibWrapped = cmf != -1
        && flg != -1
        && (cmf & 0x0F) == 8
        && (cmf >> 4) <= 7
        && ((cmf << 8) | flg) % 31 == 0;
    return new InflaterInputStream(in, new Inflater(!zlibWrapped));
  }

}