// Http Connection
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
/**
 * Implementation of {@link Connection}, backed by {@link HttpURLConnection}.
 * <p>
 * An instance holds one request object and the response produced by the most
 * recent {@link #execute()} call. Instances are created through the static
 * {@code connect} factory methods.
 *
 * @see org.jsoup.Jsoup#connect(String)
 */
public class HttpConnection implements Connection {
    /**
     * Creates a connection for a URL given as a string.
     *
     * @param url URL to connect to; must be non-empty and well formed
     * @return a new connection
     * @throws IllegalArgumentException if the URL is empty or malformed
     */
    public static Connection connect(String url) {
        Connection con = new HttpConnection();
        con.url(url);
        return con;
    }

    /**
     * Creates a connection for a URL.
     *
     * @param url URL to connect to; must not be null
     * @return a new connection
     */
    public static Connection connect(URL url) {
        Connection con = new HttpConnection();
        con.url(url);
        return con;
    }

    private Connection.Request req;
    private Connection.Response res;

    // Instances are only created through the connect(..) factories.
    private HttpConnection() {
        req = new Request();
        res = new Response();
    }

    public Connection url(URL url) {
        req.url(url);
        return this;
    }

    public Connection url(String url) {
        Validate.notEmpty(url, "Must supply a valid URL");
        try {
            req.url(new URL(url));
        } catch (MalformedURLException e) {
            // surface as an unchecked argument error, keeping the cause
            throw new IllegalArgumentException("Malformed URL: " + url, e);
        }
        return this;
    }

    public Connection userAgent(String userAgent) {
        Validate.notNull(userAgent, "User agent must not be null");
        req.header("User-Agent", userAgent);
        return this;
    }

    public Connection timeout(int millis) {
        req.timeout(millis);
        return this;
    }

    public Connection followRedirects(boolean followRedirects) {
        req.followRedirects(followRedirects);
        return this;
    }

    public Connection referrer(String referrer) {
        Validate.notNull(referrer, "Referrer must not be null");
        req.header("Referer", referrer); // header name uses the HTTP spelling
        return this;
    }

    public Connection method(Method method) {
        req.method(method);
        return this;
    }

    public Connection data(String key, String value) {
        req.data(KeyVal.create(key, value));
        return this;
    }

    public Connection data(Map<String, String> data) {
        Validate.notNull(data, "Data map must not be null");
        for (Map.Entry<String, String> entry : data.entrySet()) {
            req.data(KeyVal.create(entry.getKey(), entry.getValue()));
        }
        return this;
    }

    public Connection data(String... keyvals) {
        Validate.notNull(keyvals, "Data key value pairs must not be null");
        Validate.isTrue(keyvals.length % 2 == 0,
                "Must supply an even number of key value pairs");
        // entries alternate key, value, key, value, ...
        for (int i = 0; i < keyvals.length; i += 2) {
            String key = keyvals[i];
            String value = keyvals[i + 1];
            Validate.notEmpty(key, "Data key must not be empty");
            Validate.notNull(value, "Data value must not be null");
            req.data(KeyVal.create(key, value));
        }
        return this;
    }

    public Connection header(String name, String value) {
        req.header(name, value);
        return this;
    }

    public Connection cookie(String name, String value) {
        req.cookie(name, value);
        return this;
    }

    /**
     * Executes the current request and stores the result as this
     * connection's response.
     *
     * @return the response of the execution
     * @throws IOException on connection errors, unsupported status codes or
     *         too many redirects
     */
    public Connection.Response execute() throws IOException {
        res = Response.execute(req);
        return res;
    }

    public Connection.Request request() {
        return req;
    }

    public Connection request(Connection.Request request) {
        req = request;
        return this;
    }

    public Connection.Response response() {
        return res;
    }

    public Connection response(Connection.Response response) {
        res = response;
        return this;
    }

    /**
     * State shared by requests and responses: URL, method, headers and
     * cookies. Header lookups are case insensitive; cookie lookups are not.
     */
    @SuppressWarnings({ "unchecked" })
    private static abstract class Base<T extends Connection.Base> implements
            Connection.Base<T> {
        URL url;
        Method method;
        Map<String, String> headers;
        Map<String, String> cookies;

        private Base() {
            // linked maps keep insertion order
            headers = new LinkedHashMap<String, String>();
            cookies = new LinkedHashMap<String, String>();
        }

        public URL url() {
            return url;
        }

        public T url(URL url) {
            Validate.notNull(url, "URL must not be null");
            this.url = url;
            return (T) this;
        }

        public Method method() {
            return method;
        }

        public T method(Method method) {
            Validate.notNull(method, "Method must not be null");
            this.method = method;
            return (T) this;
        }

        public String header(String name) {
            Validate.notNull(name, "Header name must not be null");
            return getHeaderCaseInsensitive(name);
        }

        public T header(String name, String value) {
            Validate.notEmpty(name, "Header name must not be empty");
            Validate.notNull(value, "Header value must not be null");
            // ensures we don't get an "accept-encoding" and a "Accept-Encoding"
            removeHeader(name);
            headers.put(name, value);
            return (T) this;
        }

        public boolean hasHeader(String name) {
            Validate.notEmpty(name, "Header name must not be empty");
            return getHeaderCaseInsensitive(name) != null;
        }

        public T removeHeader(String name) {
            Validate.notEmpty(name, "Header name must not be empty");
            // remove is case insensitive too
            Map.Entry<String, String> entry = scanHeaders(name);
            if (entry != null)
                headers.remove(entry.getKey()); // ensures correct case
            return (T) this;
        }

        public Map<String, String> headers() {
            return headers;
        }

        // Looks a header up by exact name, then lower-cased name, then by a
        // case-insensitive scan of all headers. Returns null if not present.
        private String getHeaderCaseInsensitive(String name) {
            Validate.notNull(name, "Header name must not be null");
            // quick evals for common case of title case, lower case, then scan
            // for mixed
            String value = headers.get(name);
            if (value == null)
                value = headers.get(name.toLowerCase());
            if (value == null) {
                Map.Entry<String, String> entry = scanHeaders(name);
                if (entry != null)
                    value = entry.getValue();
            }
            return value;
        }

        // Returns the first header entry whose lower-cased key equals the
        // lower-cased name, or null if there is none.
        private Map.Entry<String, String> scanHeaders(String name) {
            String lc = name.toLowerCase();
            for (Map.Entry<String, String> entry : headers.entrySet()) {
                if (entry.getKey().toLowerCase().equals(lc))
                    return entry;
            }
            return null;
        }

        public String cookie(String name) {
            Validate.notNull(name, "Cookie name must not be null");
            return cookies.get(name);
        }

        public T cookie(String name, String value) {
            Validate.notEmpty(name, "Cookie name must not be empty");
            Validate.notNull(value, "Cookie value must not be null");
            cookies.put(name, value);
            return (T) this;
        }

        public boolean hasCookie(String name) {
            // validate the name itself (previously the message literal was
            // validated, so the check could never fail)
            Validate.notEmpty(name, "Cookie name must not be empty");
            return cookies.containsKey(name);
        }

        public T removeCookie(String name) {
            // validate the name itself, as in hasCookie
            Validate.notEmpty(name, "Cookie name must not be empty");
            cookies.remove(name);
            return (T) this;
        }

        public Map<String, String> cookies() {
            return cookies;
        }
    }

    /**
     * Request implementation. Defaults set by the constructor: GET method,
     * 3000 ms timeout, redirects followed, and an "Accept-Encoding: gzip"
     * header.
     */
    public static class Request extends Base<Connection.Request> implements
            Connection.Request {
        private int timeoutMilliseconds;
        private boolean followRedirects;
        private Collection<Connection.KeyVal> data;

        private Request() {
            timeoutMilliseconds = 3000;
            followRedirects = true;
            data = new ArrayList<Connection.KeyVal>();
            method = Connection.Method.GET;
            headers.put("Accept-Encoding", "gzip");
        }

        public int timeout() {
            return timeoutMilliseconds;
        }

        public Request timeout(int millis) {
            Validate.isTrue(millis >= 0,
                    "Timeout milliseconds must be 0 (infinite) or greater");
            timeoutMilliseconds = millis;
            return this;
        }

        public boolean followRedirects() {
            return followRedirects;
        }

        public Connection.Request followRedirects(boolean followRedirects) {
            this.followRedirects = followRedirects;
            return this;
        }

        public Request data(Connection.KeyVal keyval) {
            Validate.notNull(keyval, "Key val must not be null");
            data.add(keyval);
            return this;
        }

        public Collection<Connection.KeyVal> data() {
            return data;
        }
    }

    /**
     * Response implementation. Populated by {@link #execute(Connection.Request)},
     * which performs the HTTP exchange and follows redirects itself.
     */
    public static class Response extends Base<Connection.Response> implements
            Connection.Response {
        private static final int MAX_REDIRECTS = 20;
        private int statusCode;
        private String statusMessage;
        private ByteBuffer byteData;
        private String charset;
        private String contentType;
        private boolean executed = false;
        private int numRedirects = 0;

        Response() {
            super();
        }

        // Creates the response for the next hop of a redirect chain, failing
        // once the chain reaches MAX_REDIRECTS.
        private Response(Response previousResponse) throws IOException {
            super();
            if (previousResponse != null) {
                numRedirects = previousResponse.numRedirects + 1;
                if (numRedirects >= MAX_REDIRECTS)
                    throw new IOException(
                            String.format(
                                    "Too many redirects occurred trying to load URL %s",
                                    previousResponse.url()));
            }
        }

        static Response execute(Connection.Request req) throws IOException {
            return execute(req, null);
        }

        /**
         * Executes the request and reads the response body into memory.
         * <p>
         * For a GET with data, the data is first moved into the request URL's
         * query string. A 301, 302 or 303 status is followed (when the request
         * allows it) by re-executing the same request object against the
         * Location header, carrying the response cookies along. Any other
         * non-200 status raises an IOException.
         *
         * @param req request to execute; its URL and cookies may be modified
         * @param previousResponse response of the previous redirect hop, or null
         * @return the executed response
         * @throws IOException on IO errors, error statuses or too many redirects
         */
        static Response execute(Connection.Request req,
                Response previousResponse) throws IOException {
            Validate.notNull(req, "Request must not be null");
            String protocol = req.url().getProtocol();
            Validate.isTrue(
                    protocol.equals("http") || protocol.equals("https"),
                    "Only http & https protocols supported");

            // set up the request for execution
            if (req.method() == Connection.Method.GET && req.data().size() > 0)
                serialiseRequestUrl(req); // appends query string
            HttpURLConnection conn = createConnection(req);
            conn.connect();
            if (req.method() == Connection.Method.POST)
                writePost(req.data(), conn.getOutputStream());

            int status = conn.getResponseCode();
            boolean needsRedirect = false;
            if (status != HttpURLConnection.HTTP_OK) {
                if (status == HttpURLConnection.HTTP_MOVED_TEMP
                        || status == HttpURLConnection.HTTP_MOVED_PERM
                        || status == HttpURLConnection.HTTP_SEE_OTHER)
                    needsRedirect = true;
                else
                    throw new IOException(status + " error loading URL "
                            + req.url().toString());
            }

            Response res = new Response(previousResponse);
            res.setupFromConnection(conn, previousResponse);
            if (needsRedirect && req.followRedirects()) {
                // Location may be relative, so resolve it against the current URL
                req.url(new URL(req.url(), res.header("Location")));
                // add response cookies to request (for e.g. login posts)
                for (Map.Entry<String, String> cookie : res.cookies.entrySet()) {
                    req.cookie(cookie.getKey(), cookie.getValue());
                }
                return execute(req, res);
            }

            InputStream inStream = null;
            try {
                boolean gzipped = res.hasHeader("Content-Encoding")
                        && res.header("Content-Encoding").equalsIgnoreCase("gzip");
                // assign the raw stream first so that it is still closed by the
                // finally block if the GZIP wrapper fails to initialise
                inStream = conn.getInputStream();
                if (gzipped)
                    inStream = new GZIPInputStream(inStream);
                inStream = new BufferedInputStream(inStream);
                res.byteData = DataUtil.readToByteBuffer(inStream);
                // may be null; body() falls back to the default charset
                res.charset = DataUtil
                        .getCharsetFromContentType(res.contentType);
            } finally {
                if (inStream != null)
                    inStream.close();
            }
            res.executed = true;
            return res;
        }

        public int statusCode() {
            return statusCode;
        }

        public String statusMessage() {
            return statusMessage;
        }

        public String charset() {
            return charset;
        }

        public String contentType() {
            return contentType;
        }

        /**
         * Decodes the response bytes with the response charset, or with
         * {@code DataUtil.defaultCharset} when no charset is known.
         *
         * @return the body as a string
         * @throws IllegalArgumentException if the request has not been executed
         */
        public String body() {
            Validate.isTrue(
                    executed,
                    "Request must be executed (with .execute(), .get(), or .post() before getting response body");
            // charset gets set from header on execute, and from meta-equiv on
            // parse. parse may not have happened yet
            String body;
            if (charset == null)
                body = Charset.forName(DataUtil.defaultCharset)
                        .decode(byteData).toString();
            else
                body = Charset.forName(charset).decode(byteData).toString();
            byteData.rewind(); // decoding consumed the buffer; allow re-reading
            return body;
        }

        /**
         * Returns the array backing the response byte buffer (not a copy).
         *
         * @return the body bytes
         * @throws IllegalArgumentException if the request has not been executed
         */
        public byte[] bodyAsBytes() {
            Validate.isTrue(
                    executed,
                    "Request must be executed (with .execute(), .get(), or .post() before getting response body");
            return byteData.array();
        }

        // set up connection defaults, and details from request
        private static HttpURLConnection createConnection(Connection.Request req)
                throws IOException {
            HttpURLConnection conn = (HttpURLConnection) req.url()
                    .openConnection();
            conn.setRequestMethod(req.method().name());
            // don't rely on native redirection support
            conn.setInstanceFollowRedirects(false);
            conn.setConnectTimeout(req.timeout());
            conn.setReadTimeout(req.timeout());
            if (req.method() == Method.POST)
                conn.setDoOutput(true);
            if (req.cookies().size() > 0)
                conn.addRequestProperty("Cookie", getRequestCookieString(req));
            for (Map.Entry<String, String> header : req.headers().entrySet()) {
                conn.addRequestProperty(header.getKey(), header.getValue());
            }
            return conn;
        }

        // set up url, method, header, cookies
        private void setupFromConnection(HttpURLConnection conn,
                Connection.Response previousResponse) throws IOException {
            method = Connection.Method.valueOf(conn.getRequestMethod());
            url = conn.getURL();
            statusCode = conn.getResponseCode();
            statusMessage = conn.getResponseMessage();
            contentType = conn.getContentType();

            // headers into map
            Map<String, List<String>> resHeaders = conn.getHeaderFields();
            for (Map.Entry<String, List<String>> entry : resHeaders.entrySet()) {
                String name = entry.getKey();
                if (name == null)
                    continue; // http/1.1 line
                List<String> values = entry.getValue();
                if (name.equalsIgnoreCase("Set-Cookie")) {
                    for (String value : values) {
                        if (value == null)
                            continue;
                        TokenQueue cd = new TokenQueue(value);
                        String cookieName = cd.chompTo("=").trim();
                        String cookieVal = cd.consumeTo(";").trim();
                        // ignores path, date, domain, secure et al. req'd?
                        // a nameless (malformed) cookie is skipped rather than
                        // letting cookie() reject it and abort the request
                        if (cookieName.length() > 0)
                            cookie(cookieName, cookieVal);
                    }
                } else { // only take the first instance of each header
                    if (!values.isEmpty())
                        header(name, values.get(0));
                }
            }

            // if from a redirect, map previous response cookies into this
            // response
            if (previousResponse != null) {
                for (Map.Entry<String, String> prevCookie : previousResponse
                        .cookies().entrySet()) {
                    if (!hasCookie(prevCookie.getKey()))
                        cookie(prevCookie.getKey(), prevCookie.getValue());
                }
            }
        }

        // Writes the data as a URL-encoded form body and closes the stream.
        private static void writePost(Collection<Connection.KeyVal> data,
                OutputStream outputStream) throws IOException {
            OutputStreamWriter w = new OutputStreamWriter(outputStream,
                    DataUtil.defaultCharset);
            try {
                boolean first = true;
                for (Connection.KeyVal keyVal : data) {
                    if (!first)
                        w.append('&');
                    else
                        first = false;
                    w.write(URLEncoder.encode(keyVal.key(),
                            DataUtil.defaultCharset));
                    w.write('=');
                    w.write(URLEncoder.encode(keyVal.value(),
                            DataUtil.defaultCharset));
                }
            } finally {
                // close even when a write fails, so the stream is not leaked
                w.close();
            }
        }

        // Joins the request cookies as "name=value; name=value".
        private static String getRequestCookieString(Connection.Request req) {
            StringBuilder sb = new StringBuilder();
            boolean first = true;
            for (Map.Entry<String, String> cookie : req.cookies().entrySet()) {
                if (!first)
                    sb.append("; ");
                else
                    first = false;
                sb.append(cookie.getKey()).append('=')
                        .append(cookie.getValue());
                // todo: spec says only ascii, no escaping / encoding defined.
                // validate on set? or escape somehow here?
            }
            return sb.toString();
        }

        // for get url reqs, serialise the data map into the url
        private static void serialiseRequestUrl(Connection.Request req)
                throws IOException {
            URL in = req.url();
            StringBuilder url = new StringBuilder();
            boolean first = true;
            // reconstitute the query, ready for appends
            url.append(in.getProtocol()).append("://")
                    .append(in.getAuthority()) // includes host, port
                    .append(in.getPath()).append("?");
            if (in.getQuery() != null) {
                url.append(in.getQuery());
                first = false;
            }
            for (Connection.KeyVal keyVal : req.data()) {
                if (!first)
                    url.append('&');
                else
                    first = false;
                url.append(
                        URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset))
                        .append('=')
                        .append(URLEncoder.encode(keyVal.value(),
                                DataUtil.defaultCharset));
            }
            req.url(new URL(url.toString()));
            req.data().clear(); // moved into url as get params
        }
    }

    /**
     * Mutable key/value pair used for request data. Keys must be non-empty
     * and values non-null.
     */
    public static class KeyVal implements Connection.KeyVal {
        private String key;
        private String value;

        /**
         * Creates a validated key/value pair.
         *
         * @param key data key; must not be empty
         * @param value data value; must not be null
         * @return the new pair
         */
        public static KeyVal create(String key, String value) {
            Validate.notEmpty(key, "Data key must not be empty");
            Validate.notNull(value, "Data value must not be null");
            return new KeyVal(key, value);
        }

        private KeyVal(String key, String value) {
            this.key = key;
            this.value = value;
        }

        public KeyVal key(String key) {
            Validate.notEmpty(key, "Data key must not be empty");
            this.key = key;
            return this;
        }

        public String key() {
            return key;
        }

        public KeyVal value(String value) {
            Validate.notNull(value, "Data value must not be null");
            this.value = value;
            return this;
        }

        public String value() {
            return value;
        }

        @Override
        public String toString() {
            return key + "=" + value;
        }
    }
}
/**
 * A Connection is a convenience interface for fetching content from the web
 * and parsing it into Documents.
 * <p>
 * Obtain a new Connection with {@link org.jsoup.Jsoup#connect(String)}. Each
 * Connection contains a {@link Connection.Request} and a
 * {@link Connection.Response} object. Request objects are reusable as
 * prototype requests.
 * <p>
 * A request can be configured either through the shortcut methods on
 * Connection (e.g. {@link #userAgent(String)}) or directly through the methods
 * of the Connection.Request object. All request configuration must be done
 * before the request is executed.
 * <p>
 * The Connection interface is <b>currently in beta</b> and subject to change.
 * Comments, suggestions, and bug reports are welcome.
 */
interface Connection {
    /** The supported HTTP methods: GET and POST. */
    enum Method {
        GET, POST
    }

    /**
     * Set the request URL to fetch. The protocol must be HTTP or HTTPS.
     *
     * @param url URL to connect to
     * @return this Connection, for chaining
     */
    Connection url(URL url);

    /**
     * Set the request URL to fetch. The protocol must be HTTP or HTTPS.
     *
     * @param url URL to connect to
     * @return this Connection, for chaining
     */
    Connection url(String url);

    /**
     * Set the request user-agent header.
     *
     * @param userAgent user-agent to use
     * @return this Connection, for chaining
     */
    Connection userAgent(String userAgent);

    /**
     * Set the request timeouts (connect and read). An IOException is thrown
     * if a timeout occurs. The default timeout is 3 seconds (3000 millis). A
     * timeout of zero is treated as an infinite timeout.
     *
     * @param millis number of milliseconds (thousandths of a second) before
     *        timing out connects or reads
     * @return this Connection, for chaining
     */
    Connection timeout(int millis);

    /**
     * Set the request referrer (aka "referer") header.
     *
     * @param referrer referrer to use
     * @return this Connection, for chaining
     */
    Connection referrer(String referrer);

    /**
     * Configure whether the connection follows server redirects. By default
     * this is <b>true</b>.
     *
     * @param followRedirects true if server redirects should be followed
     * @return this Connection, for chaining
     */
    Connection followRedirects(boolean followRedirects);

    /**
     * Set the request method to use, GET or POST. Default is GET.
     *
     * @param method HTTP request method
     * @return this Connection, for chaining
     */
    Connection method(Method method);

    /**
     * Add a request data parameter. Request parameters are sent in the
     * request query string for GETs, and in the request body for POSTs. A
     * request may have multiple values of the same name.
     *
     * @param key data key
     * @param value data value
     * @return this Connection, for chaining
     */
    Connection data(String key, String value);

    /**
     * Add all of the supplied data to the request data parameters.
     *
     * @param data map of data parameters
     * @return this Connection, for chaining
     */
    Connection data(Map<String, String> data);

    /**
     * Add a number of request data parameters. Multiple parameters may be set
     * at once, e.g.:
     * <code>.data("name", "jsoup", "language", "Java", "language", "English");</code>
     * creates a query string like:
     * <code>?name=jsoup&language=Java&language=English</code>
     *
     * @param keyvals a set of key value pairs
     * @return this Connection, for chaining
     */
    Connection data(String... keyvals);

    /**
     * Set a request header.
     *
     * @param name header name
     * @param value header value
     * @return this Connection, for chaining
     * @see org.jsoup.Connection.Request#headers()
     */
    Connection header(String name, String value);

    /**
     * Set a cookie to be sent in the request.
     *
     * @param name name of cookie
     * @param value value of cookie
     * @return this Connection, for chaining
     */
    Connection cookie(String name, String value);

    /**
     * Execute the request.
     *
     * @return a response object
     * @throws IOException on error
     */
    Response execute() throws IOException;

    /**
     * Get the request object associated with this connection.
     *
     * @return request
     */
    Request request();

    /**
     * Set the connection's request.
     *
     * @param request new request object
     * @return this Connection, for chaining
     */
    Connection request(Request request);

    /**
     * Get the response, once the request has been executed.
     *
     * @return response
     */
    Response response();

    /**
     * Set the connection's response.
     *
     * @param response new response
     * @return this Connection, for chaining
     */
    Connection response(Response response);

    /**
     * Common methods for Requests and Responses.
     *
     * @param <T> Type of Base, either Request or Response
     */
    interface Base<T extends Base> {
        /**
         * Get the URL.
         *
         * @return URL
         */
        URL url();

        /**
         * Set the URL.
         *
         * @param url new URL
         * @return this, for chaining
         */
        T url(URL url);

        /**
         * Get the request method.
         *
         * @return method
         */
        Method method();

        /**
         * Set the request method.
         *
         * @param method new method
         * @return this, for chaining
         */
        T method(Method method);

        /**
         * Get the value of a header. This is a simplified header model, where
         * a header may only have one value.
         * <p>
         * Header names are case insensitive.
         *
         * @param name name of header (case insensitive)
         * @return value of header, or null if not set
         * @see #hasHeader(String)
         * @see #cookie(String)
         */
        String header(String name);

        /**
         * Set a header. Any existing header with the same case insensitive
         * name is overwritten.
         *
         * @param name name of header
         * @param value value of header
         * @return this, for chaining
         */
        T header(String name, String value);

        /**
         * Check if a header is present.
         *
         * @param name name of header (case insensitive)
         * @return if the header is present in this request/response
         */
        boolean hasHeader(String name);

        /**
         * Remove a header by name.
         *
         * @param name name of header to remove (case insensitive)
         * @return this, for chaining
         */
        T removeHeader(String name);

        /**
         * Retrieve all of the request/response headers as a map.
         *
         * @return headers
         */
        Map<String, String> headers();

        /**
         * Get a cookie value by name from this request/response.
         * <p>
         * Response objects have a simplified cookie model. Each cookie set in
         * the response is added to the response object's cookie key=value
         * map. The cookie's path, domain, and expiry date are ignored.
         *
         * @param name name of cookie to retrieve
         * @return value of cookie, or null if not set
         */
        String cookie(String name);

        /**
         * Set a cookie in this request/response.
         *
         * @param name name of cookie
         * @param value value of cookie
         * @return this, for chaining
         */
        T cookie(String name, String value);

        /**
         * Check if a cookie is present.
         *
         * @param name name of cookie
         * @return if the cookie is present in this request/response
         */
        boolean hasCookie(String name);

        /**
         * Remove a cookie by name.
         *
         * @param name name of cookie to remove
         * @return this, for chaining
         */
        T removeCookie(String name);

        /**
         * Retrieve all of the request/response cookies as a map.
         *
         * @return cookies
         */
        Map<String, String> cookies();
    }

    /**
     * Represents a HTTP request.
     */
    interface Request extends Base<Request> {
        /**
         * Get the request timeout, in milliseconds.
         *
         * @return the timeout in milliseconds
         */
        int timeout();

        /**
         * Update the request timeout.
         *
         * @param millis timeout, in milliseconds
         * @return this Request, for chaining
         */
        Request timeout(int millis);

        /**
         * Get the current followRedirects configuration.
         *
         * @return true if followRedirects is enabled
         */
        boolean followRedirects();

        /**
         * Configure whether the request follows server redirects. By default
         * this is <b>true</b>.
         *
         * @param followRedirects true if server redirects should be followed
         * @return this Request, for chaining
         */
        Request followRedirects(boolean followRedirects);

        /**
         * Add a data parameter to the request.
         *
         * @param keyval data to add
         * @return this Request, for chaining
         */
        Request data(KeyVal keyval);

        /**
         * Get all of the request's data parameters.
         *
         * @return collection of keyvals
         */
        Collection<KeyVal> data();
    }

    /**
     * Represents a HTTP response.
     */
    interface Response extends Base<Response> {
        /**
         * Get the status code of the response.
         *
         * @return status code
         */
        int statusCode();

        /**
         * Get the status message of the response.
         *
         * @return status message
         */
        String statusMessage();

        /**
         * Get the character set name of the response.
         *
         * @return character set name
         */
        String charset();

        /**
         * Get the response content type (e.g. "text/html").
         *
         * @return the response content type
         */
        String contentType();

        /**
         * Get the body of the response as a plain string.
         *
         * @return body
         */
        String body();

        /**
         * Get the body of the response as an array of bytes.
         *
         * @return body bytes
         */
        byte[] bodyAsBytes();
    }

    /**
     * A Key Value tuple.
     */
    interface KeyVal {
        /**
         * Update the key of a keyval.
         *
         * @param key new key
         * @return this KeyVal, for chaining
         */
        KeyVal key(String key);

        /**
         * Get the key of a keyval.
         *
         * @return the key
         */
        String key();

        /**
         * Update the value of a keyval.
         *
         * @param value the new value
         * @return this KeyVal, for chaining
         */
        KeyVal value(String value);

        /**
         * Get the value of a keyval.
         *
         * @return the value
         */
        String value();
    }
}
/**
 * Simple argument validation helpers. Every failed check raises an
 * {@link IllegalArgumentException}.
 */
final class Validate {
    // static helpers only
    private Validate() {
    }

    /**
     * Validates that the object is not null.
     *
     * @param obj object to test
     */
    public static void notNull(Object obj) {
        notNull(obj, "Object must not be null");
    }

    /**
     * Validates that the object is not null.
     *
     * @param obj object to test
     * @param msg message to output if validation fails
     */
    public static void notNull(Object obj, String msg) {
        if (obj != null) {
            return;
        }
        throw new IllegalArgumentException(msg);
    }

    /**
     * Validates that the value is true.
     *
     * @param val value to test
     */
    public static void isTrue(boolean val) {
        isTrue(val, "Must be true");
    }

    /**
     * Validates that the value is true.
     *
     * @param val value to test
     * @param msg message to output if validation fails
     */
    public static void isTrue(boolean val, String msg) {
        if (val) {
            return;
        }
        throw new IllegalArgumentException(msg);
    }

    /**
     * Validates that the array contains no null elements.
     *
     * @param objects the array to test
     */
    public static void noNullElements(Object[] objects) {
        noNullElements(objects, "Array must not contain any null objects");
    }

    /**
     * Validates that the array contains no null elements.
     *
     * @param objects the array to test
     * @param msg message to output if validation fails
     */
    public static void noNullElements(Object[] objects, String msg) {
        for (int i = 0; i < objects.length; i++) {
            if (objects[i] == null) {
                throw new IllegalArgumentException(msg);
            }
        }
    }

    /**
     * Validates that the string is neither null nor zero-length.
     *
     * @param string the string to test
     */
    public static void notEmpty(String string) {
        notEmpty(string, "String must not be empty");
    }

    /**
     * Validates that the string is neither null nor zero-length.
     *
     * @param string the string to test
     * @param msg message to output if validation fails
     */
    public static void notEmpty(String string, String msg) {
        if (string != null && string.length() > 0) {
            return;
        }
        throw new IllegalArgumentException(msg);
    }
}
/**
 * Internal static utilities for handling data.
 */
class DataUtil {
    // Captures the token after "charset=" (optionally quoted), case insensitively.
    private static final Pattern charsetPattern = Pattern
            .compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
    static final String defaultCharset = "UTF-8"; // used if not found in header
                                                  // or meta charset
    private static final int bufferSize = 0x20000; // ~130K.

    private DataUtil() {
    }

    /**
     * Reads the stream until end-of-stream into an in-memory byte buffer. The
     * stream is not closed by this method.
     *
     * @param inStream stream to read from
     * @return a buffer wrapping all bytes read
     * @throws IOException on IO error
     */
    static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException {
        byte[] buffer = new byte[bufferSize];
        ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);
        int read;
        while ((read = inStream.read(buffer)) != -1) {
            outStream.write(buffer, 0, read);
        }
        return ByteBuffer.wrap(outStream.toByteArray());
    }

    /**
     * Parse out a charset from a content type header.
     *
     * @param contentType e.g. "text/html; charset=EUC-JP"; may be null
     * @return "EUC-JP", or null if not found or if the charset value is
     *         empty. Charset is trimmed and uppercased.
     */
    static String getCharsetFromContentType(String contentType) {
        if (contentType == null)
            return null;
        Matcher m = charsetPattern.matcher(contentType);
        if (m.find()) {
            String charset = m.group(1).trim();
            // "charset=" with no value matches with an empty group; report it
            // as "not found" instead of handing back an unusable empty name
            if (charset.length() > 0)
                // fixed locale: the default locale may map 'i' to a non-ASCII
                // letter (e.g. Turkish), corrupting names like "iso-8859-1"
                return charset.toUpperCase(Locale.ENGLISH);
        }
        return null;
    }
}
/**
* A character queue with parsing helpers.
*
* @author Jonathan Hedley
*/
class TokenQueue {
private String queue;
private int pos = 0;
private static final char ESC = '\\'; // escape char for chomp balanced.
/**
 * Create a new TokenQueue over the supplied string.
 * @param data string of data to back queue; must not be null.
 */
public TokenQueue(String data) {
    Validate.notNull(data);
    this.queue = data;
}
/**
 * Is the queue empty?
 * @return true if the read position is exactly at the end of the backing string.
 */
public boolean isEmpty() {
    final int left = queue.length() - pos;
    return left == 0;
}
/**
 * Number of characters between the read position and the end of the backing string.
 * @return length of the backing string minus the current position.
 */
private int remainingLength() {
    final int total = queue.length();
    return total - pos;
}
/**
 * Look at the first character of the queue without removing it.
 * @return First character, or 0 if empty.
 */
public char peek() {
    if (isEmpty()) {
        return 0;
    }
    return queue.charAt(pos);
}
/**
 * Add a character to the start of the queue (it will be the next character retrieved).
 * @param c character to add
 */
public void addFirst(Character c) {
    final String asString = c.toString();
    addFirst(asString);
}
/**
 * Add a string to the start of the queue. The backing string is rebuilt from the supplied
 * string plus the unconsumed part, and the read position is reset to 0.
 * @param seq string to add.
 */
public void addFirst(String seq) {
    // not very performant, but an edge case
    final String unconsumed = queue.substring(pos);
    queue = seq + unconsumed;
    pos = 0;
}
/**
 * Tests if the next characters on the queue match the sequence. Case insensitive.
 * @param seq String to check queue for.
 * @return true if the next characters match.
 */
public boolean matches(String seq) {
    final int len = seq.length();
    final boolean ignoreCase = true;
    return queue.regionMatches(ignoreCase, pos, seq, 0, len);
}
/**
 * Case sensitive match test.
 * @param seq string to check the head of the queue for
 * @return true if the queue, from the current position, starts with exactly this string
 */
public boolean matchesCS(String seq) {
    return queue.regionMatches(pos, seq, 0, seq.length());
}
/**
 * Tests if the next characters match any of the sequences. Case insensitive.
 * @param seq candidate sequences, tried in order
 * @return true if at least one candidate matches at the current position
 */
public boolean matchesAny(String... seq) {
    for (int i = 0; i < seq.length; i++) {
        if (matches(seq[i])) {
            return true;
        }
    }
    return false;
}
/**
 * Tests if the next character is one of the supplied characters. Case sensitive.
 * @param seq candidate characters
 * @return true if the queue is not empty and its next character equals one of the candidates
 */
public boolean matchesAny(char... seq) {
    if (isEmpty()) {
        return false;
    }
    for (int i = 0; i < seq.length; i++) {
        if (seq[i] == queue.charAt(pos)) {
            return true;
        }
    }
    return false;
}
/**
 * Tests if the queue starts with '&lt;' immediately followed by a letter.
 * @return true if at least two characters remain and they look like the start of a tag
 */
public boolean matchesStartTag() {
    // micro opt for matching "<x"
    if (remainingLength() < 2) {
        return false;
    }
    if (queue.charAt(pos) != '<') {
        return false;
    }
    return Character.isLetter(queue.charAt(pos + 1));
}
/**
 * Tests if the queue matches the sequence (as with matches, i.e. case insensitively), and if
 * it does, removes the matched string from the queue.
 * @param seq String to search for, and if found, remove from queue.
 * @return true if found and removed, false if not found.
 */
public boolean matchChomp(String seq) {
    if (!matches(seq)) {
        return false;
    }
    pos += seq.length();
    return true;
}
/**
 * Tests if queue starts with a whitespace character.
 * @return true if the queue is not empty and its next character is whitespace
 */
public boolean matchesWhitespace() {
    if (isEmpty()) {
        return false;
    }
    return Character.isWhitespace(queue.charAt(pos));
}
/**
 * Test if the queue starts with a word character (letter or digit).
 * @return true if the queue is not empty and its next character is a letter or digit
 */
public boolean matchesWord() {
    if (isEmpty()) {
        return false;
    }
    return Character.isLetterOrDigit(queue.charAt(pos));
}
/**
 * Drops the next character off the queue. Does nothing if the queue is empty.
 */
public void advance() {
    if (isEmpty()) {
        return;
    }
    pos++;
}
/**
 * Consume one character off queue.
 * @return first character on queue.
 */
public char consume() {
    // the position is advanced before the character is read
    final int at = pos++;
    return queue.charAt(at);
}
/**
 * Consumes the supplied sequence off the head of the queue. If the queue does not start with
 * the supplied sequence, an illegal state exception is thrown -- callers should test with
 * matches() first.
 * <p>
 * Case insensitive.
 * @param seq sequence to remove from head of queue.
 */
public void consume(String seq) {
    final int len = seq.length();
    if (!matches(seq)) {
        throw new IllegalStateException("Queue did not match expected sequence");
    }
    if (remainingLength() < len) {
        throw new IllegalStateException("Queue not long enough to consume sequence");
    }
    pos += len;
}
/**
 * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue
 * running out.
 * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b>
 * @return The matched data consumed from queue.
 */
public String consumeTo(String seq) {
    final int offset = queue.indexOf(seq, pos);
    if (offset == -1) {
        return remainder();
    }
    final String consumed = queue.substring(pos, offset);
    pos = offset; // terminator stays on the queue
    return consumed;
}
/**
 * Pulls a string off the queue up to, but not including, the first position at which {@code matches(seq)}
 * succeeds, or to the end of the queue if that never happens. The terminator is left on the queue.
 * @param seq String to stop at. Must be non-empty: its first character is taken with
 * {@code substring(0, 1)}, which throws on an empty string.
 * @return The data consumed from the queue.
 */
public String consumeToIgnoreCase(String seq) {
    final int begin = pos;
    final String lead = seq.substring(0, 1);
    // when the lead character has no case variants, indexOf can jump straight to candidates
    final boolean leadIsCaseless = lead.toLowerCase().equals(lead.toUpperCase());
    while (!isEmpty() && !matches(seq)) {
        if (!leadIsCaseless) {
            pos++;
            continue;
        }
        final int next = queue.indexOf(lead, pos);
        if (next == pos) {
            // already on the lead char but the full sequence did not match: step over it
            pos++;
        } else if (next < pos) {
            // lead char never occurs again, so nothing can match: take everything
            pos = queue.length();
        } else {
            pos = next;
        }
    }
    return queue.substring(begin, pos);
}
/**
 * Consumes up to the first of the supplied terminators, or to the end of the queue if none is found.
 * The terminator itself is left on the queue.
 * @param seq any number of terminators to consume to. <b>Case insensitive.</b>
 * @return consumed string
 */
// todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for
// this is a case sensitive time...
public String consumeToAny(String... seq) {
    final int begin = pos;
    for (;;) {
        if (isEmpty() || matchesAny(seq)) {
            break;
        }
        pos++;
    }
    return queue.substring(begin, pos);
}
/**
 * Pulls a string off the queue (like consumeTo), then also removes the terminating sequence without
 * returning it.
 * <p>
 * If the queue runs out before the sequence is found, as much as possible is returned (and the queue ends
 * up with isEmpty() == true).
 * @param seq String to read up to; excluded from the return value but pulled off the queue.
 * <b>Case sensitive.</b>
 * @return Data read from the queue ahead of the sequence.
 */
public String chompTo(String seq) {
    final String head = consumeTo(seq);
    // drop the terminator when present; whether it was there does not affect the result
    matchChomp(seq);
    return head;
}
/**
 * Like chompTo, but scans for the terminator with consumeToIgnoreCase. The terminator is then removed
 * from the queue (when present) and is not part of the returned data.
 * @param seq String to read up to and pull off the queue.
 * @return Data read from the queue ahead of the sequence.
 */
public String chompToIgnoreCase(String seq) {
    final String head = consumeToIgnoreCase(seq);
    // drop the terminator when present; whether it was there does not affect the result
    matchChomp(seq);
    return head;
}
/**
 * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return
 * "one (two) three", and leave " four" on the queue. Openers and closers preceded by the escape character
 * are not counted. The escapes are kept in the returned string, which suits regexes (where the escape
 * must survive) but not plain text matching; use unescape for that.
 * <p>
 * The first character consumed is never part of the result, and consumption stops as soon as the nesting
 * depth is no longer positive or the queue runs out.
 * @param open opener
 * @param close closer
 * @return data matched from the queue, without the outer pair
 */
public String chompBalanced(char open, char close) {
    final StringBuilder inner = new StringBuilder();
    int nesting = 0;
    char prev = 0; // 0 doubles as "nothing consumed yet"
    while (!isEmpty()) {
        final char cur = consume();
        final boolean escaped = prev != 0 && prev == ESC;
        if (!escaped) {
            if (cur == open) {
                nesting++;
            } else if (cur == close) {
                nesting--;
            }
        }
        // skip the outer pair: the very first char, and the closer that drops nesting to zero
        if (nesting > 0 && prev != 0) {
            inner.append(cur);
        }
        prev = cur;
        if (nesting <= 0) {
            break;
        }
    }
    return inner.toString();
}
/**
 * Unescapes a backslash-escaped string.
 * <p>
 * Each escape character is dropped and the character after it is kept literally, so an escaped escape
 * yields a single escape character. An escape that has been used to escape another escape does not
 * also escape the character that follows it. A trailing, unpaired escape is dropped.
 * @param in backslash escaped string
 * @return unescaped string
 */
public static String unescape(String in) {
    final StringBuilder out = new StringBuilder(in.length());
    // true while the previous character was an escape that has not yet been applied to anything
    boolean pendingEscape = false;
    for (char c : in.toCharArray()) {
        if (c == ESC && !pendingEscape) {
            pendingEscape = true;
            continue;
        }
        out.append(c);
        // the escape (if any) is now spent, even when c is itself the escape character
        pendingEscape = false;
    }
    return out.toString();
}
/**
 * Pulls the next run of whitespace characters off the queue.
 * @return true if at least one whitespace character was removed
 */
public boolean consumeWhitespace() {
    final int begin = pos;
    while (matchesWhitespace()) {
        pos++;
    }
    return pos > begin;
}
/**
 * Retrieves the next run of word characters (letters or digits) off the queue.
 * @return String of word characters from the queue, or an empty string if there are none.
 */
public String consumeWord() {
    final int begin = pos;
    for (;;) {
        if (!matchesWord()) {
            break;
        }
        pos++;
    }
    return queue.substring(begin, pos);
}
/**
 * Consume a tag name off the queue (word characters or :, _, -).
 *
 * @return tag name, possibly empty
 */
public String consumeTagName() {
    final int begin = pos;
    for (;;) {
        if (isEmpty()) {
            break;
        }
        if (!matchesWord() && !matchesAny(':', '_', '-')) {
            break;
        }
        pos++;
    }
    return queue.substring(begin, pos);
}
/**
 * Consume a CSS element selector: like a tag name, but with | instead of : for namespaces, so it does not
 * conflict with :pseudo selects.
 *
 * @return tag name, possibly empty
 */
public String consumeElementSelector() {
    final int begin = pos;
    for (;;) {
        if (isEmpty()) {
            break;
        }
        if (!matchesWord() && !matchesAny('|', '_', '-')) {
            break;
        }
        pos++;
    }
    return queue.substring(begin, pos);
}
/**
 * Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _).
 * See http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier
 * @return identifier, possibly empty
 */
public String consumeCssIdentifier() {
    final int begin = pos;
    for (;;) {
        if (isEmpty()) {
            break;
        }
        if (!matchesWord() && !matchesAny('-', '_')) {
            break;
        }
        pos++;
    }
    return queue.substring(begin, pos);
}
/**
 * Consume an attribute key off the queue (letter, digit, -, _, :).
 * @return attribute key, possibly empty
 */
public String consumeAttributeKey() {
    final int begin = pos;
    for (;;) {
        if (isEmpty()) {
            break;
        }
        if (!matchesWord() && !matchesAny('-', '_', ':')) {
            break;
        }
        pos++;
    }
    return queue.substring(begin, pos);
}
/**
 * Consume and return whatever is left on the queue. Afterwards the cursor sits at the end of the queue.
 * @return remainder of queue.
 */
public String remainder() {
    // one substring instead of appending character by character
    final String rest = queue.substring(pos);
    pos = queue.length();
    return rest;
}
/**
 * Returns the unconsumed part of the queue without consuming it.
 * @return the queue contents from the current position onwards
 */
@Override
public String toString() {
    return queue.substring(pos);
}
}
Related examples in the same category