// Java tutorial
//package com.java2s;

/**
 * Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.Proxy;
import java.net.URL;
import java.util.Arrays;
import java.util.Locale;
import java.util.zip.GZIPInputStream;

/**
 * Static helpers that fetch the beginning of a web page and extract its
 * title and meta description, plus a few small stream/reader utilities.
 */
public class Main {

    /** Default charset assumed when a content type declares none. */
    public static final String ISO = "ISO-8859-1";

    /** Number of bytes fetched from the start of a page (and the read buffer size). */
    public static int K4 = 4096;

    final static String DESCRIPTION = "<meta name=\"description\" content=\"";
    final static String DESCRIPTION2 = "<meta name=\"Description\" content=\"";

    private static final String CHARSET_KEY = "charset=";
    private static final String TITLE_OPEN = "<title>";
    private static final String TITLE_OPEN_WITH_ATTRIBUTES = "<title ";
    private static final String TITLE_CLOSE = "</title>";

    /**
     * Tries to get the title of the specified url.
     *
     * @param urlAsString the address of the page
     * @param timeout connect and read timeout in milliseconds
     * @return the title, or an empty string if it could not be determined
     */
    public static String getUrlTitle(String urlAsString, int timeout) {
        return getUrlInfos(urlAsString, timeout)[0];
    }

    /**
     * Downloads at most {@link #K4} bytes from the start of the given page and
     * extracts title and description from them.
     *
     * @param urlAsString the address of the page (must be an http/https url)
     * @param timeout connect and read timeout in milliseconds
     * @return a two element array {@code {title, description}}; both entries
     *         are empty strings if anything went wrong
     */
    public static String[] getUrlInfos(String urlAsString, int timeout) {
        HttpURLConnection hConn = null;
        InputStream in = null;
        try {
            URL url = new URL(urlAsString);
            // using proxy may increase latency
            hConn = (HttpURLConnection) url.openConnection(Proxy.NO_PROXY);
            hConn.setRequestProperty("User-Agent", "Mozilla/5.0 Gecko/20100915 Firefox/3.6.10");
            // on android we got problems because of this
            // so disable that for now
            // hConn.setRequestProperty("Accept-Encoding", "gzip, deflate");
            hConn.setConnectTimeout(timeout);
            hConn.setReadTimeout(timeout);

            // 'in' always points at the outermost stream created so far, so the
            // finally block releases the connection even if wrapping fails
            in = hConn.getInputStream();
            if ("gzip".equalsIgnoreCase(hConn.getContentEncoding())) {
                in = new GZIPInputStream(in);
            }
            // default length of bufferedinputstream is 8k
            in = new BufferedInputStream(in, K4);

            byte[] arr = new byte[K4];
            // a single read() may return before the buffer is full
            int filled = readFully(in, arr);
            return getUrlInfosFromText(Arrays.copyOf(arr, filled), hConn.getContentType());
        } catch (Exception ignored) {
            // deliberately best effort: malformed urls, non-http urls, timeouts
            // and I/O problems all result in "no information available"
        } finally {
            closeQuietly(in);
            if (hConn != null) {
                hConn.disconnect();
            }
        }
        return emptyInfos();
    }

    /**
     * Reads the whole stream as UTF-8 text. Every line is terminated with a
     * single {@code '\n'} in the result. The stream is closed afterwards, also
     * if reading fails.
     *
     * @param is the stream to read, must not be null
     * @return the content of the stream
     * @throws IllegalArgumentException if {@code is} is null
     * @throws IOException if reading or closing the stream fails
     */
    public static String getInputStream(InputStream is) throws IOException {
        if (is == null) {
            throw new IllegalArgumentException("stream mustn't be null!");
        }

        BufferedReader bufReader = createBuffReader(is);
        try {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = bufReader.readLine()) != null) {
                sb.append(line);
                sb.append('\n');
            }
            return sb.toString();
        } finally {
            bufReader.close();
        }
    }

    /**
     * Returns title and description of a specified string (as byte array).
     *
     * The bytes are first decoded with the charset of {@code contentType}
     * (ISO-8859-1 if none is given). If the text itself declares a charset
     * (for example in a meta tag) the bytes are decoded again with that one.
     * Tag and attribute names are matched case-insensitively.
     *
     * @param arr the first bytes of a html document, may be null
     * @param contentType value of the Content-Type header, may be null
     * @return a two element array {@code {title, description}}; missing parts
     *         are empty strings. If no non-empty title is found both entries
     *         are empty.
     */
    public static String[] getUrlInfosFromText(byte[] arr, String contentType) {
        if (arr == null) {
            return emptyInfos();
        }

        String res = decode(arr, extractEncoding(contentType));
        if (res == null) {
            // unknown charset in the header: fall back to the platform default
            res = new String(arr);
        }

        int index = getStartTitleEndPos(res);
        if (index < 0) {
            return emptyInfos();
        }

        String declaredCharset = findDeclaredCharset(res);
        if (declaredCharset != null) {
            // re-read byte array with different encoding
            String redecoded = decode(arr, declaredCharset);
            if (redecoded != null) {
                res = redecoded;
            }
            index = getStartTitleEndPos(res);
            if (index < 0) {
                return emptyInfos();
            }
        }

        int lastIndex = indexOfIgnoreCase(res, TITLE_CLOSE, index);
        if (lastIndex <= index) {
            return emptyInfos();
        }
        String title = res.substring(index, lastIndex);

        // keep the historical lookup order, then accept any other casing
        int descStart = res.indexOf(DESCRIPTION);
        if (descStart < 0) {
            descStart = res.indexOf(DESCRIPTION2);
        }
        if (descStart < 0) {
            descStart = indexOfIgnoreCase(res, DESCRIPTION, 0);
        }
        if (descStart < 0) {
            return new String[] { title, "" };
        }

        descStart += DESCRIPTION.length();
        int descEnd = res.indexOf('"', descStart);
        if (descEnd < 0) {
            return new String[] { title, "" };
        }
        return new String[] { title, res.substring(descStart, descEnd) };
    }

    /**
     * Creates a buffered UTF-8 reader for the specified file.
     *
     * @throws FileNotFoundException if the file cannot be opened for reading
     */
    public static BufferedReader createBuffReader(File file)
            throws FileNotFoundException, UnsupportedEncodingException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
    }

    /**
     * Creates a buffered UTF-8 reader on top of the specified stream.
     */
    public static BufferedReader createBuffReader(InputStream is)
            throws FileNotFoundException, UnsupportedEncodingException {
        return new BufferedReader(new InputStreamReader(is, "UTF-8"));
    }

    /**
     * Extracts the charset parameter of a Content-Type header value.
     *
     * @param contentType for example {@code text/html; charset=UTF-8}, may be null
     * @return the lower-cased charset name without surrounding quotes, or
     *         {@link #ISO} if none is declared. If several charset parameters
     *         are present the last one wins.
     */
    public static String extractEncoding(String contentType) {
        String charset = "";
        if (contentType != null) {
            for (String value : contentType.split(";")) {
                // Locale.ROOT: the default locale could mangle names, e.g. a
                // Turkish locale lower-cases 'I' to a dotless i
                value = value.trim().toLowerCase(Locale.ROOT);
                if (value.startsWith(CHARSET_KEY)) {
                    charset = stripQuotes(value.substring(CHARSET_KEY.length()).trim());
                }
            }
        }

        // http1.1 says ISO-8859-1 is the default charset
        if (charset.length() == 0) {
            charset = ISO;
        }
        return charset;
    }

    /**
     * Finds the position directly behind the opening title tag. A plain
     * {@code <title>} is preferred over a title tag with attributes; the tag
     * name is matched case-insensitively.
     *
     * @param res the html text, may be null
     * @return the index of the first character of the title text or -1 if no
     *         complete opening title tag exists
     */
    public static int getStartTitleEndPos(String res) {
        if (res == null) {
            return -1;
        }

        int index = indexOfIgnoreCase(res, TITLE_OPEN, 0);
        if (index >= 0) {
            return index + TITLE_OPEN.length();
        }

        index = indexOfIgnoreCase(res, TITLE_OPEN_WITH_ATTRIBUTES, 0);
        if (index < 0) {
            return -1;
        }
        int tagEnd = res.indexOf('>', index);
        return tagEnd < 0 ? -1 : tagEnd + 1;
    }

    /** Returns the result used whenever no information could be extracted. */
    private static String[] emptyInfos() {
        return new String[] { "", "" };
    }

    /** Decodes the bytes with the named charset; returns null if that charset is unusable. */
    private static String decode(byte[] bytes, String charsetName) {
        try {
            return new String(bytes, charsetName);
        } catch (UnsupportedEncodingException ex) {
            return null;
        } catch (IllegalArgumentException ex) {
            // syntactically illegal charset name
            return null;
        }
    }

    /**
     * Returns the value following the first "charset=" in the text, with or
     * without surrounding quotes, or null if there is no terminated, non-empty value.
     */
    private static String findDeclaredCharset(String html) {
        int start = indexOfIgnoreCase(html, CHARSET_KEY, 0);
        if (start < 0) {
            return null;
        }

        start += CHARSET_KEY.length();
        // if we have charset="something"
        if (start < html.length() && isQuote(html.charAt(start))) {
            start++;
        }

        int end = start;
        while (end < html.length() && !isQuote(html.charAt(end))) {
            end++;
        }
        if (end >= html.length()) {
            // value is not terminated within the available text
            return null;
        }

        String name = html.substring(start, end).trim();
        return name.length() == 0 ? null : name;
    }

    /** Removes one pair of matching quotes surrounding the value, if present. */
    private static String stripQuotes(String value) {
        int len = value.length();
        if (len >= 2 && isQuote(value.charAt(0)) && value.charAt(len - 1) == value.charAt(0)) {
            return value.substring(1, len - 1).trim();
        }
        return value;
    }

    private static boolean isQuote(char c) {
        return c == '"' || c == '\'';
    }

    /** Case-insensitive variant of {@link String#indexOf(String, int)} which keeps indices stable. */
    private static int indexOfIgnoreCase(String text, String needle, int from) {
        int last = text.length() - needle.length();
        for (int i = Math.max(from, 0); i <= last; i++) {
            if (text.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /** Reads until the buffer is full or the stream ends; returns the number of bytes read. */
    private static int readFully(InputStream in, byte[] buffer) throws IOException {
        int filled = 0;
        while (filled < buffer.length) {
            int count = in.read(buffer, filled, buffer.length - filled);
            if (count < 0) {
                break;
            }
            filled += count;
        }
        return filled;
    }

    /** Closes the stream, ignoring failures: the data of interest was already read. */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // nothing useful can be done about a failing close
        }
    }
}