We would like to know how to handle a non-UTF-8 HTML page.
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

// From www.java2s.com

/**
 * Downloads an HTML page over HTTP and decodes it with the charset the server
 * declares, falling back to GB2312 when none is declared.
 */
public class Main {

    /** Charset used when the response does not declare a usable one. */
    private static final String FALLBACK_CHARSET = "GB2312";

    /** Prefix of the charset parameter inside a Content-Type header value. */
    private static final String CHARSET_PARAM = "charset=";

    public static void main(String[] args) {
        System.out.println(downloadHtml("http://baike.baidu.com"));
    }

    /**
     * Fetches {@code urlString} and returns the response body as text.
     *
     * <p>The request advertises gzip and deflate support and the body is
     * decompressed according to the response's Content-Encoding. The text is
     * decoded with the charset named in the Content-Type header; if that is
     * missing or not supported by this JVM, GB2312 is used. Charset
     * declarations inside the HTML itself (meta tags) are not inspected.
     *
     * <p>This method is best-effort: any failure (malformed URL, non-HTTP URL,
     * I/O error, error status) is printed to standard error and whatever was
     * read up to that point is returned, which may be the empty string.
     *
     * @param urlString absolute HTTP(S) URL of the page
     * @return the decoded body, or a partial/empty string on failure
     */
    static String downloadHtml(String urlString) {
        StringBuilder html = new StringBuilder();
        HttpURLConnection conn = null;
        try {
            URL url = new URL(urlString);
            conn = (HttpURLConnection) url.openConnection();
            // Per-connection setting; the static setFollowRedirects would change
            // the default for every connection in the JVM.
            conn.setInstanceFollowRedirects(true);
            conn.setRequestProperty("Accept-Encoding", "gzip, deflate");

            Charset charset = resolveCharset(conn.getContentType());
            try (InputStream raw = conn.getInputStream();
                    InputStream body = decodeBody(raw, conn.getContentEncoding());
                    Reader reader = new InputStreamReader(body, charset)) {
                char[] chunk = new char[8192];
                int count;
                while ((count = reader.read(chunk)) != -1) {
                    html.append(chunk, 0, count);
                }
            }
        } catch (Exception e) {
            // Deliberately broad: callers rely on getting a (possibly empty)
            // string back instead of an exception.
            e.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
        return html.toString();
    }

    /**
     * Picks the charset for decoding the body from a Content-Type header value
     * such as {@code text/html; charset=GBK}. Returns the fallback charset when
     * the header is absent, has no charset parameter, or names a charset this
     * JVM does not support.
     */
    private static Charset resolveCharset(String contentType) {
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String param = part.trim();
                if (!param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
                    continue;
                }
                String name = param.substring(CHARSET_PARAM.length()).trim();
                if (name.length() >= 2
                        && (name.charAt(0) == '"' || name.charAt(0) == '\'')
                        && name.charAt(name.length() - 1) == name.charAt(0)) {
                    name = name.substring(1, name.length() - 1);
                }
                try {
                    return Charset.forName(name);
                } catch (IllegalArgumentException unsupported) {
                    // Illegal or unknown charset name: use the fallback below.
                    break;
                }
            }
        }
        return Charset.forName(FALLBACK_CHARSET);
    }

    /** Wraps the raw response stream in the decompressor matching Content-Encoding. */
    private static InputStream decodeBody(InputStream raw, String contentEncoding)
            throws IOException {
        if ("gzip".equalsIgnoreCase(contentEncoding)) {
            return new GZIPInputStream(raw);
        }
        if ("deflate".equalsIgnoreCase(contentEncoding)) {
            return openDeflate(raw);
        }
        return raw;
    }

    /**
     * Opens a "deflate"-encoded body. The HTTP deflate coding is the zlib
     * format, but some servers send a bare DEFLATE stream instead, so the
     * first two bytes are inspected to decide which one this is.
     */
    private static InputStream openDeflate(InputStream raw) throws IOException {
        BufferedInputStream in = new BufferedInputStream(raw);
        in.mark(2);
        int cmf = in.read();
        int flg = in.read();
        in.reset();
        // zlib header: compression method 8 in the low nibble of the first
        // byte, and the two bytes read as a big-endian number divisible by 31.
        boolean zlibWrapped = cmf != -1 && flg != -1
                && (cmf & 0x0F) == 8
                && ((cmf << 8) | flg) % 31 == 0;
        final Inflater inflater = new Inflater(!zlibWrapped);
        return new InflaterInputStream(in, inflater) {
            @Override
            public void close() throws IOException {
                try {
                    super.close();
                } finally {
                    // An explicitly supplied Inflater is not released by
                    // InflaterInputStream.close(); free it here.
                    inflater.end();
                }
            }
        };
    }
}