// Java tutorial
/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.util; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.Closeable; import java.io.EOFException; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection; import java.text.NumberFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.HashSet; import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.TimeZone; import java.util.logging.Level; import java.util.logging.Logger; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import org.apache.commons.io.IOUtils; import org.archive.format.gzip.GZIPDecoder; import org.archive.format.gzip.GZIPFormatException; /** * Miscellaneous useful methods. * * @author gojomo & others */ public class ArchiveUtils { private static final Logger LOGGER = Logger.getLogger(ArchiveUtils.class.getName()); final public static String VERSION = loadVersion(); /** * Arc-style date stamp in the format yyyyMMddHHmm and UTC time zone. 
*/ private static final ThreadLocal<SimpleDateFormat> TIMESTAMP12 = threadLocalDateFormat("yyyyMMddHHmm");; /** * Arc-style date stamp in the format yyyyMMddHHmmss and UTC time zone. */ private static final ThreadLocal<SimpleDateFormat> TIMESTAMP14 = threadLocalDateFormat("yyyyMMddHHmmss"); /** * Arc-style date stamp in the format yyyyMMddHHmmssSSS and UTC time zone. */ private static final ThreadLocal<SimpleDateFormat> TIMESTAMP17 = threadLocalDateFormat("yyyyMMddHHmmssSSS"); /** * Log-style date stamp in the format yyyy-MM-dd'T'HH:mm:ss.SSS'Z' * UTC time zone is assumed. */ private static final ThreadLocal<SimpleDateFormat> TIMESTAMP17ISO8601Z = threadLocalDateFormat( "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"); /** * Log-style date stamp in the format yyyy-MM-dd'T'HH:mm:ss'Z' * UTC time zone is assumed. */ private static final ThreadLocal<SimpleDateFormat> TIMESTAMP14ISO8601Z = threadLocalDateFormat( "yyyy-MM-dd'T'HH:mm:ss'Z'"); /** * Default character to use padding strings. */ private static final char DEFAULT_PAD_CHAR = ' '; /** milliseconds in an hour */ private static final int HOUR_IN_MS = 60 * 60 * 1000; /** milliseconds in a day */ private static final int DAY_IN_MS = 24 * HOUR_IN_MS; private static ThreadLocal<SimpleDateFormat> threadLocalDateFormat(final String pattern) { ThreadLocal<SimpleDateFormat> tl = new ThreadLocal<SimpleDateFormat>() { protected SimpleDateFormat initialValue() { SimpleDateFormat df = new SimpleDateFormat(pattern, Locale.ENGLISH); df.setTimeZone(TimeZone.getTimeZone("GMT")); return df; } }; return tl; } public static int MAX_INT_CHAR_WIDTH = Integer.toString(Integer.MAX_VALUE).length(); /** * Utility function for creating arc-style date stamps * in the format yyyMMddHHmmssSSS. 
* Date stamps are in the UTC time zone * @return the date stamp */ public static String get17DigitDate() { return TIMESTAMP17.get().format(new Date()); } protected static long LAST_UNIQUE_NOW17 = 0; protected static String LAST_TIMESTAMP17 = ""; /** * Utility function for creating UNIQUE-from-this-class * arc-style date stamps in the format yyyMMddHHmmssSSS. * Rather than giving a duplicate datestamp on a * subsequent call, will increment the milliseconds until a * unique value is returned. * * Date stamps are in the UTC time zone * @return the date stamp */ public synchronized static String getUnique17DigitDate() { long effectiveNow = System.currentTimeMillis(); effectiveNow = Math.max(effectiveNow, LAST_UNIQUE_NOW17 + 1); String candidate = get17DigitDate(effectiveNow); while (candidate.equals(LAST_TIMESTAMP17)) { effectiveNow++; candidate = get17DigitDate(effectiveNow); } LAST_UNIQUE_NOW17 = effectiveNow; LAST_TIMESTAMP17 = candidate; return candidate; } /** * Utility function for creating arc-style date stamps * in the format yyyyMMddHHmmss. * Date stamps are in the UTC time zone * @return the date stamp */ public static String get14DigitDate() { return TIMESTAMP14.get().format(new Date()); } protected static long LAST_UNIQUE_NOW14 = 0; protected static String LAST_TIMESTAMP14 = ""; /** * Utility function for creating UNIQUE-from-this-class * arc-style date stamps in the format yyyMMddHHmmss. * Rather than giving a duplicate datestamp on a * subsequent call, will increment the seconds until a * unique value is returned. 
* * Date stamps are in the UTC time zone * @return the date stamp */ public synchronized static String getUnique14DigitDate() { long effectiveNow = System.currentTimeMillis(); effectiveNow = Math.max(effectiveNow, LAST_UNIQUE_NOW14 + 1); String candidate = get14DigitDate(effectiveNow); while (candidate.equals(LAST_TIMESTAMP14)) { effectiveNow += 1000; candidate = get14DigitDate(effectiveNow); } LAST_UNIQUE_NOW14 = effectiveNow; LAST_TIMESTAMP14 = candidate; return candidate; } /** * Utility function for creating arc-style date stamps * in the format yyyyMMddHHmm. * Date stamps are in the UTC time zone * @return the date stamp */ public static String get12DigitDate() { return TIMESTAMP12.get().format(new Date()); } /** * Utility function for creating log timestamps, in * W3C/ISO8601 format, assuming UTC. Use current time. * * Format is yyyy-MM-dd'T'HH:mm:ss.SSS'Z' * * @return the date stamp */ public static String getLog17Date() { return TIMESTAMP17ISO8601Z.get().format(new Date()); } /** * Utility function for creating log timestamps, in * W3C/ISO8601 format, assuming UTC. * * Format is yyyy-MM-dd'T'HH:mm:ss.SSS'Z' * @param date Date to format. * * @return the date stamp */ public static String getLog17Date(long date) { return TIMESTAMP17ISO8601Z.get().format(new Date(date)); } /** * Utility function for creating log timestamps, in * W3C/ISO8601 format, assuming UTC. Use current time. * * Format is yyyy-MM-dd'T'HH:mm:ss'Z' * * @return the date stamp */ public static String getLog14Date() { return TIMESTAMP14ISO8601Z.get().format(new Date()); } /** * Utility function for creating log timestamps, in * W3C/ISO8601 format, assuming UTC. * * Format is yyyy-MM-dd'T'HH:mm:ss'Z' * @param date long timestamp to format. * * @return the date stamp */ public static String getLog14Date(long date) { return TIMESTAMP14ISO8601Z.get().format(new Date(date)); } /** * Utility function for creating log timestamps, in * W3C/ISO8601 format, assuming UTC. 
* * Format is yyyy-MM-dd'T'HH:mm:ss'Z' * @param date Date to format. * * @return the date stamp */ public static String getLog14Date(Date date) { return TIMESTAMP14ISO8601Z.get().format(date); } public static Date parse14DigitISODate(String datetime, Date defaultVal) { try { return TIMESTAMP14ISO8601Z.get().parse(datetime); } catch (ParseException e) { return defaultVal; } } /** * Utility function for creating arc-style date stamps * in the format yyyyMMddHHmmssSSS. * Date stamps are in the UTC time zone * * @param date milliseconds since epoc * @return the date stamp */ public static String get17DigitDate(long date) { return TIMESTAMP17.get().format(new Date(date)); } public static String get17DigitDate(Date date) { return TIMESTAMP17.get().format(date); } /** * Utility function for creating arc-style date stamps * in the format yyyyMMddHHmmss. * Date stamps are in the UTC time zone * * @param date milliseconds since epoc * @return the date stamp */ public static String get14DigitDate(long date) { return TIMESTAMP14.get().format(new Date(date)); } public static String get14DigitDate(Date d) { return TIMESTAMP14.get().format(d); } /** * Utility function for creating arc-style date stamps * in the format yyyyMMddHHmm. * Date stamps are in the UTC time zone * * @param date milliseconds since epoc * @return the date stamp */ public static String get12DigitDate(long date) { return TIMESTAMP12.get().format(new Date(date)); } public static String get12DigitDate(Date d) { return TIMESTAMP12.get().format(d); } /** * A version of getDate which returns the default instead of throwing an exception if parsing fails * * @param d * @param defaultDate * @return * @throws ParseException */ public static Date getDate(String d, Date defaultDate) { if (d == null) { return defaultDate; } try { return getDate(d); } catch (ParseException pe) { return defaultDate; } } /** * Parses an ARC-style date. If passed String is < 12 characters in length, * we pad. 
At a minimum, String should contain a year (>=4 characters). * Parse will also fail if day or month are incompletely specified. Depends * on the above getXXDigitDate methods. * @param A 4-17 digit date in ARC style (<code>yyyy</code> to * <code>yyyyMMddHHmmssSSS</code>) formatting. * @return A Date object representing the passed String. * @throws ParseException */ public static Date getDate(String d) throws ParseException { Date date = null; if (d == null) { throw new IllegalArgumentException("Passed date is null"); } switch (d.length()) { case 14: date = ArchiveUtils.parse14DigitDate(d); break; case 17: date = ArchiveUtils.parse17DigitDate(d); break; case 12: date = ArchiveUtils.parse12DigitDate(d); break; case 0: case 1: case 2: case 3: throw new ParseException("Date string must at least contain a" + "year: " + d, d.length()); default: if (!(d.startsWith("19") || d.startsWith("20"))) { throw new ParseException("Unrecognized century: " + d, 0); } if (d.length() < 8 && (d.length() % 2) != 0) { throw new ParseException("Incomplete month/date: " + d, d.length()); } StringBuilder sb = new StringBuilder(d); while (sb.length() < 8) { sb.append("01"); } while (sb.length() < 12) { sb.append("0"); } date = ArchiveUtils.parse12DigitDate(sb.toString()); } return date; } final static SimpleDateFormat dateToTimestampFormats[] = { new SimpleDateFormat("MM/dd/yyyy", Locale.ENGLISH), new SimpleDateFormat("MM/yyyy", Locale.ENGLISH), new SimpleDateFormat("yyyy", Locale.ENGLISH) }; /** * Convert a user-entered date into a timestamp * @param input * @return */ public static String dateToTimestamp(String input) { Date date = null; if (input.isEmpty()) { return null; } for (SimpleDateFormat format : dateToTimestampFormats) { try { date = format.parse(input); break; } catch (ParseException e) { continue; } } if (date == null) { return null; } return get14DigitDate(date); } /** * Utility function for parsing arc-style date stamps * in the format yyyMMddHHmmssSSS. 
* Date stamps are in the UTC time zone. The whole string will not be * parsed, only the first 17 digits. * * @param date an arc-style formatted date stamp * @return the Date corresponding to the date stamp string * @throws ParseException if the inputstring was malformed */ public static Date parse17DigitDate(String date) throws ParseException { return TIMESTAMP17.get().parse(date); } /** * Utility function for parsing arc-style date stamps * in the format yyyMMddHHmmss. * Date stamps are in the UTC time zone. The whole string will not be * parsed, only the first 14 digits. * * @param date an arc-style formatted date stamp * @return the Date corresponding to the date stamp string * @throws ParseException if the inputstring was malformed */ public static Date parse14DigitDate(String date) throws ParseException { return TIMESTAMP14.get().parse(date); } /** * Utility function for parsing arc-style date stamps * in the format yyyMMddHHmm. * Date stamps are in the UTC time zone. The whole string will not be * parsed, only the first 12 digits. * * @param date an arc-style formatted date stamp * @return the Date corresponding to the date stamp string * @throws ParseException if the inputstring was malformed */ public static Date parse12DigitDate(String date) throws ParseException { return TIMESTAMP12.get().parse(date); } /** * @param timestamp A 14-digit timestamp or the suffix for a 14-digit * timestamp: E.g. '20010909014640' or '20010101' or '1970'. * @return Seconds since the epoch as a string zero-pre-padded so always * Integer.MAX_VALUE wide (Makes it so sorting of resultant string works * properly). * @throws ParseException */ public static String secondsSinceEpoch(String timestamp) throws ParseException { return zeroPadInteger((int) (getSecondsSinceEpoch(timestamp).getTime() / 1000)); } /** * @param timestamp A 14-digit timestamp or the suffix for a 14-digit * timestamp: E.g. '20010909014640' or '20010101' or '1970'. * @return A date. 
* @see #secondsSinceEpoch(String) * @throws ParseException */ public static Date getSecondsSinceEpoch(String timestamp) throws ParseException { if (timestamp.length() < 14) { if (timestamp.length() < 10 && (timestamp.length() % 2) == 1) { throw new IllegalArgumentException( "Must have year, " + "month, date, hour or second granularity: " + timestamp); } if (timestamp.length() == 4) { // Add first month and first date. timestamp = timestamp + "01010000"; } if (timestamp.length() == 6) { // Add a date of the first. timestamp = timestamp + "010000"; } if (timestamp.length() < 14) { timestamp = timestamp + ArchiveUtils.padTo("", 14 - timestamp.length(), '0'); } } return ArchiveUtils.parse14DigitDate(timestamp); } /** * @param i Integer to add prefix of zeros too. If passed * 2005, will return the String <code>0000002005</code>. String * width is the width of Integer.MAX_VALUE as a string (10 * digits). * @return Padded String version of <code>i</code>. */ public static String zeroPadInteger(int i) { return ArchiveUtils.padTo(Integer.toString(i), MAX_INT_CHAR_WIDTH, '0'); } /** * Convert an <code>int</code> to a <code>String</code>, and pad it to * <code>pad</code> spaces. * @param i the int * @param pad the width to pad to. * @return String w/ padding. */ public static String padTo(final int i, final int pad) { String n = Integer.toString(i); return padTo(n, pad); } /** * Pad the given <code>String</code> to <code>pad</code> characters wide * by pre-pending spaces. <code>s</code> should not be <code>null</code>. * If <code>s</code> is already wider than <code>pad</code> no change is * done. * * @param s the String to pad * @param pad the width to pad to. * @return String w/ padding. */ public static String padTo(final String s, final int pad) { return padTo(s, pad, DEFAULT_PAD_CHAR); } /** * Pad the given <code>String</code> to <code>pad</code> characters wide * by pre-pending <code>padChar</code>. * * <code>s</code> should not be <code>null</code>. 
If <code>s</code> is * already wider than <code>pad</code> no change is done. * * @param s the String to pad * @param pad the width to pad to. * @param padChar The pad character to use. * @return String w/ padding. */ public static String padTo(final String s, final int pad, final char padChar) { String result = s; int l = s.length(); if (l < pad) { StringBuffer sb = new StringBuffer(pad); while (l < pad) { sb.append(padChar); l++; } sb.append(s); result = sb.toString(); } return result; } /** check that two byte arrays are equal. They may be <code>null</code>. * * @param lhs a byte array * @param rhs another byte array. * @return <code>true</code> if they are both equal (or both * <code>null</code>) */ public static boolean byteArrayEquals(final byte[] lhs, final byte[] rhs) { if (lhs == null && rhs != null || lhs != null && rhs == null) { return false; } if (lhs == rhs) { return true; } if (lhs.length != rhs.length) { return false; } for (int i = 0; i < lhs.length; i++) { if (lhs[i] != rhs[i]) { return false; } } return true; } /** * Converts a double to a string. * @param val The double to convert * @param precision How many characters to include after '.' * @return the double as a string. */ public static String doubleToString(double val, int maxFractionDigits) { return doubleToString(val, maxFractionDigits, 0); } public static String doubleToString(double val, int maxFractionDigits, int minFractionDigits) { // NumberFormat returns U+FFFD REPLACEMENT CHARACTER for NaN which looks // like a bug in the UI if (Double.isNaN(val)) { return "NaN"; } NumberFormat f = NumberFormat.getNumberInstance(Locale.US); f.setMaximumFractionDigits(maxFractionDigits); f.setMinimumFractionDigits(minFractionDigits); return f.format(val); } /** * Takes a byte size and formats it for display with 'friendly' units. * <p> * This involves converting it to the largest unit * (of B, KiB, MiB, GiB, TiB) for which the amount will be > 1. 
* <p> * Additionally, at least 2 significant digits are always displayed. * <p> * Negative numbers will be returned as '0 B'. * * @param amount the amount of bytes * @return A string containing the amount, properly formated. */ public static String formatBytesForDisplay(long amount) { double displayAmount = (double) amount; int unitPowerOf1024 = 0; if (amount <= 0) { return "0 B"; } final String[] units = { " B", " KiB", " MiB", " GiB", " TiB" }; while (displayAmount >= 1024 && unitPowerOf1024 < units.length - 1) { displayAmount = displayAmount / 1024; unitPowerOf1024++; } int fractionDigits; if (unitPowerOf1024 == 0 || displayAmount >= 10) { fractionDigits = 0; } else { // ensure at least 2 significant digits (#.#) for small displayValues fractionDigits = 1; } return doubleToString(displayAmount, fractionDigits, fractionDigits) + units[unitPowerOf1024]; } /** * Convert milliseconds value to a human-readable duration * @param time * @return Human readable string version of passed <code>time</code> */ public static String formatMillisecondsToConventional(long time) { return formatMillisecondsToConventional(time, 5); } /** * Convert milliseconds value to a human-readable duration of * mixed units, using units no larger than days. For example, * "5d12h13m12s113ms" or "19h51m". 
* * @param duration * @param unitCount how many significant units to show, at most * for example, a value of 2 would show days+hours or hours+seconds * but not hours+second+milliseconds * @return Human readable string version of passed <code>time</code> */ public static String formatMillisecondsToConventional(long duration, int unitCount) { if (unitCount <= 0) { unitCount = 5; } if (duration == 0) { return "0ms"; } StringBuffer sb = new StringBuffer(); if (duration < 0) { sb.append("-"); } long absTime = Math.abs(duration); long[] thresholds = { DAY_IN_MS, HOUR_IN_MS, 60000, 1000, 1 }; String[] units = { "d", "h", "m", "s", "ms" }; for (int i = 0; i < thresholds.length; i++) { if (absTime >= thresholds[i]) { sb.append(absTime / thresholds[i] + units[i]); absTime = absTime % thresholds[i]; unitCount--; } if (unitCount == 0) { break; } } return sb.toString(); } /** * Copy the raw bytes of a long into a byte array, starting at * the specified offset. * * @param l * @param array * @param offset */ public static void longIntoByteArray(long l, byte[] array, int offset) { int i, shift; for (i = 0, shift = 56; i < 8; i++, shift -= 8) array[offset + i] = (byte) (0xFF & (l >> shift)); } public static long byteArrayIntoLong(byte[] bytearray) { return byteArrayIntoLong(bytearray, 0); } /** * Byte array into long. * @param bytearray Array to convert to a long. * @param offset Offset into array at which we start decoding the long. * @return Long made of the bytes of <code>array</code> beginning at * offset <code>offset</code>. * @see #longIntoByteArray(long, byte[], int) */ public static long byteArrayIntoLong(byte[] bytearray, int offset) { long result = 0; for (int i = offset; i < 8 /*Bytes in long*/; i++) { result = (result << 8 /*Bits in byte*/) | (0xff & (byte) (bytearray[i] & 0xff)); } return result; } /** * Given a string that may be a plain host or host/path (without * URI scheme), add an implied http:// if necessary. 
* * @param u string to evaluate * @return string with http:// added if no scheme already present */ public static String addImpliedHttpIfNecessary(String u) { int colon = u.indexOf(':'); int period = u.indexOf('.'); if (colon == -1 || (period >= 0) && (period < colon)) { // No scheme present; prepend "http://" u = "http://" + u; } return u; } /** * Verify that the array begins with the prefix. * * @param array * @param prefix * @return true if array is identical to prefix for the first prefix.length * positions */ public static boolean startsWith(byte[] array, byte[] prefix) { if (prefix.length > array.length) { return false; } for (int i = 0; i < prefix.length; i++) { if (array[i] != prefix[i]) { return false; } } return true; } /** * Enhance given object's default String display for appearing * nested in a pretty Map String. * * @param obj Object to prettify * @return prettified String */ public static String prettyString(Object obj) { // these things have to checked and casted unfortunately if (obj instanceof Object[]) { return prettyString((Object[]) obj); } else if (obj instanceof Map) { return prettyString((Map<?, ?>) obj); } else { return "<" + obj + ">"; } } /** * Provide a improved String of a Map's entries * * @param Map * @return prettified (in curly brackets) string of Map contents */ public static String prettyString(Map<?, ?> map) { StringBuilder builder = new StringBuilder(); builder.append("{ "); boolean needsComma = false; for (Object key : map.keySet()) { if (needsComma) { builder.append(", "); } builder.append(key); builder.append(": "); builder.append(prettyString(map.get(key))); needsComma = true; } builder.append(" }"); return builder.toString(); } /** * Provide a slightly-improved String of Object[] * * @param Object[] * @return prettified (in square brackets) of Object[] */ public static String prettyString(Object[] array) { StringBuilder builder = new StringBuilder(); builder.append("[ "); boolean needsComma = false; for (Object o : array) 
{ if (o == null) continue; if (needsComma) { builder.append(", "); } builder.append(prettyString(o)); needsComma = true; } builder.append(" ]"); return builder.toString(); } private static String loadVersion() { InputStream input = ArchiveUtils.class.getResourceAsStream("/org/archive/util/version.txt"); if (input == null) { return "UNKNOWN"; } BufferedReader br = null; String version; try { br = new BufferedReader(new InputStreamReader(input)); version = br.readLine(); br.readLine(); } catch (IOException e) { return e.getMessage(); } finally { closeQuietly(br); } version = version.trim(); if (!version.endsWith("SNAPSHOT")) { return version; } input = ArchiveUtils.class.getResourceAsStream("/org/archive/util/timestamp.txt"); if (input == null) { return version; } br = null; String timestamp; try { br = new BufferedReader(new InputStreamReader(input)); timestamp = br.readLine(); } catch (IOException e) { return version; } finally { closeQuietly(br); } if (timestamp.startsWith("timestamp=")) { timestamp = timestamp.substring(10); } return version.trim() + "-" + timestamp.trim(); } public static Set<String> TLDS; static { TLDS = new HashSet<String>(); InputStream is = ArchiveUtils.class.getResourceAsStream("tlds-alpha-by-domain.txt"); try { BufferedReader reader = new BufferedReader(new InputStreamReader(is)); String line; while ((line = reader.readLine()) != null) { if (line.startsWith("#")) { continue; } TLDS.add(line.trim().toLowerCase()); } } catch (Exception e) { LOGGER.log(Level.SEVERE, "TLD list unavailable", e); } finally { IOUtils.closeQuietly(is); } } /** * Return whether the given string represents a known * top-level-domain (like "com", "org", etc.) 
per IANA * as of 20100419 * * @param dom candidate string * @return boolean true if recognized as TLD */ public static boolean isTld(String dom) { return TLDS.contains(dom.toLowerCase()); } public static void closeQuietly(Object input) { if (input == null || !(input instanceof Closeable)) { return; } try { ((Closeable) input).close(); } catch (IOException ioe) { // ignore } } /** * Perform checks as to whether normal execution should proceed. * * If an external interrupt is detected, throw an interrupted exception. * Used before anything that should not be attempted by a 'zombie' thread * that the Frontier/Crawl has given up on. * * @throws InterruptedException */ public static void continueCheck() throws InterruptedException { if (Thread.interrupted()) { throw new InterruptedException("interrupt detected"); } } /** * Read stream into buf until EOF or buf full. * * @param input * @param buf * @throws IOException */ public static int readFully(InputStream input, byte[] buf) throws IOException { int max = buf.length; int ofs = 0; while (ofs < max) { int l = input.read(buf, ofs, max - ofs); if (l == 0) { throw new EOFException(); } ofs += l; } return ofs; } /** suffix to recognize gzipped files */ public static final String GZIP_SUFFIX = ".gz"; /** * Get a BufferedReader on the crawler journal given * * TODO: move to a general utils class * * @param source File journal * @return journal buffered reader. * @throws IOException */ public static BufferedReader getBufferedReader(File source) throws IOException { InputStream is = new BufferedInputStream(new FileInputStream(source)); boolean isGzipped = source.getName().toLowerCase().endsWith(GZIP_SUFFIX); if (isGzipped) { is = new GZIPInputStream(is); } return new BufferedReader(new InputStreamReader(is)); } /** * Get a BufferedReader on the crawler journal given. * * @param source URL journal * @return journal buffered reader. 
* @throws IOException */ public static BufferedReader getBufferedReader(URL source) throws IOException { URLConnection conn = source.openConnection(); boolean isGzipped = conn.getContentType() != null && conn.getContentType().equalsIgnoreCase("application/x-gzip") || conn.getContentEncoding() != null && conn.getContentEncoding().equalsIgnoreCase("gzip"); InputStream uis = conn.getInputStream(); return new BufferedReader( isGzipped ? new InputStreamReader(new GZIPInputStream(uis)) : new InputStreamReader(uis)); } /** * Gzip passed bytes. * Use only when bytes is small. * @param bytes What to gzip. * @return A gzip member of bytes. * @throws IOException */ public static byte[] gzip(byte[] bytes) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); GZIPOutputStream gzipOS = new GZIPOutputStream(baos); gzipOS.write(bytes, 0, bytes.length); gzipOS.close(); return baos.toByteArray(); } /** * Tests passed stream is gzip stream by reading in the HEAD. * Does not mark/reset stream -- so this test actually makes * stream unopenable within GZIP streams, unless reset. * @param is An InputStream. * @return True if compressed stream. * @throws IOException */ public static boolean isGzipped(final InputStream is) throws IOException { try { new GZIPDecoder().parseHeader(is); return true; } catch (GZIPFormatException e) { return false; } } }