Java tutorial
/* $Id$ * $Revision$ * $Author$ * $Date$ * * The Netarchive Suite - Software to harvest and preserve websites * Copyright 2004-2012 The Royal Danish Library, the Danish State and * University Library, the National Library of France and the Austrian * National Library. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ package dk.netarkivet.common.utils; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.PrintWriter; import java.io.RandomAccessFile; import java.nio.channels.FileChannel; import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import dk.netarkivet.common.CommonSettings; import dk.netarkivet.common.Constants; import dk.netarkivet.common.exceptions.ArgumentNotValid; import dk.netarkivet.common.exceptions.IOFailure; import dk.netarkivet.common.exceptions.PermissionDenied; import dk.netarkivet.common.exceptions.UnknownID; 
/**
 * Miscellaneous helpers for working with files and directories.
 */
public class FileUtils {
    /** Extension used for CDX files, including the leading dot. */
    public static final String CDX_EXTENSION = ".cdx";
    /** Extension used for ARC files, including the leading dot. */
    public static final String ARC_EXTENSION = ".arc";
    /** Extension used for gzipped ARC files, including the leading dot. */
    public static final String ARC_GZIPPED_EXTENSION = ".arc.gz";
    /** Extension used for WARC files, including the leading dot. */
    public static final String WARC_EXTENSION = ".warc";
    /** Extension used for gzipped WARC files, including the leading dot. */
    public static final String WARC_GZIPPED_EXTENSION = ".warc.gz";

    /**
     * Pattern matching the ending of ARC file names.
     * (?i) makes the match case insensitive, (\\.gz)? optionally matches
     * a .gz suffix, and $ anchors the match at the end of the input.
     * Matches file.arc.gz, file.ARC and file.aRc.GZ, but not file.ARC.open.
     */
    public static final String ARC_PATTERN = "(?i)\\.arc(\\.gz)?$";

    /**
     * Pattern matching the ending of open ARC file names.
     * (?i) makes the match case insensitive, (\\.gz)? optionally matches
     * a .gz suffix, and $ anchors the match at the end of the input.
     * Matches file.arc.gz.open, file.ARC.open and file.arc.GZ.OpEn, but not
     * file.ARC.open.txt.
     */
    public static final String OPEN_ARC_PATTERN = "(?i)\\.arc(\\.gz)?\\.open$";

    /**
     * Pattern matching the ending of WARC file names.
     * (?i) makes the match case insensitive, (\\.gz)? optionally matches
     * a .gz suffix, and $ anchors the match at the end of the input.
     * Matches file.warc.gz, file.WARC and file.WaRc.GZ, but not
     * file.WARC.open.
     */
    public static final String WARC_PATTERN = "(?i)\\.warc(\\.gz)?$";

    /**
     * Pattern matching the ending of open WARC file names.
     * (?i) makes the match case insensitive, (\\.gz)? optionally matches
     * a .gz suffix, and $ anchors the match at the end of the input.
     * Matches file.warc.gz.open, file.WARC.open and file.warc.GZ.OpEn, but
     * not file.wARC.open.txt.
     */
    public static final String OPEN_WARC_PATTERN = "(?i)\\.warc(\\.gz)?\\.open$";

    /**
     * Pattern matching the ending of both WARC and ARC file names.
     * (?i) makes the match case insensitive, (\\.gz)? optionally matches
     * a .gz suffix, and $ anchors the match at the end of the input.
     * Matches file.warc.gz, file.WARC, file.WaRc.GZ, file.arc.gz, file.ARC
     * and file.aRc.GZ, but neither file.WARC.open nor file.ARC.open.
     */
    public static final String WARC_ARC_PATTERN = "(?i)\\.(w)?arc(\\.gz)?$";

    /** The logger for this class. */
    public static final Log log = LogFactory.getLog(FileUtils.class.getName());

    /**
     * Filter accepting exactly those names that, once lower-cased, end
     * with ".cdx".
     */
    public static final FilenameFilter CDX_FILE_FILTER = new FilenameFilter() {
        public boolean accept(File parent, String name) {
            final String lowerCased = name.toLowerCase();
            return lowerCased.endsWith(CDX_EXTENSION);
        }
    };

    /**
     * Filter matching ARC files left open by a crashed Heritrix process.
     * Don't work on these files while Heritrix is still working on them.
     */
    public static final FilenameFilter OPEN_ARCS_FILTER = new FilenameFilter() {
        public boolean accept(File parent, String candidate) {
            final String regex = ".*" + OPEN_ARC_PATTERN;
            return candidate.matches(regex);
        }
    };

    /**
     * Filter matching WARC files left open by a crashed Heritrix process.
     * Don't work on these files while Heritrix is still working on them.
     */
    public static final FilenameFilter OPEN_WARCS_FILTER = new FilenameFilter() {
        public boolean accept(File parent, String candidate) {
            final String regex = ".*" + OPEN_WARC_PATTERN;
            return candidate.matches(regex);
        }
    };

    /**
     * Filter matching ARC files, i.e. names ending in .arc or .arc.gz
     * regardless of case.
     */
    public static final FilenameFilter ARCS_FILTER = new FilenameFilter() {
        public boolean accept(File parent, String name) {
            final String lowerCased = name.toLowerCase();
            return lowerCased.matches(".*" + ARC_PATTERN);
        }
    };

    /**
     * Filter matching WARC files, i.e. names ending in .warc or .warc.gz
     * regardless of case.
     */
    public static final FilenameFilter WARCS_FILTER = new FilenameFilter() {
        public boolean accept(File parent, String name) {
            final String lowerCased = name.toLowerCase();
            return lowerCased.matches(".*" + WARC_PATTERN);
        }
    };

    /**
     * Filter matching both WARC and ARC files, i.e. names ending in .warc,
     * .warc.gz, .arc or .arc.gz regardless of case.
     */
    public static final FilenameFilter WARCS_ARCS_FILTER = new FilenameFilter() {
        public boolean accept(File parent, String name) {
            final String lowerCased = name.toLowerCase();
            return lowerCased.matches(".*" + WARC_ARC_PATTERN);
        }
    };

    /** How many times we will retry making a unique directory name. */
    private static final int MAX_RETRIES = 10;

    /** How many times we will retry making a directory. */
    private static final int CREATE_DIR_RETRIES = 3;

    /**
     * Maximum number of IDs we will put in a filename. Above this
     * number, a checksum of the ids is generated instead. This protects
     * us from getting filenames too long for the filesystem.
     */
    public static final int MAX_IDS_IN_FILENAME = 4;

    /**
     * Remove a file and any subfiles in case of directories.
     *
     * @param f
     *            A file to completely and utterly remove.
     * @return true if the file existed and was removed, false if it did
     *         not exist or could not be removed.
* @throws SecurityException * If a security manager exists and its <code>{@link * java.lang.SecurityManager#checkDelete}</code> * method denies delete access to the file */ public static boolean removeRecursively(File f) { ArgumentNotValid.checkNotNull(f, "File f"); if (!f.exists()) { return false; } // If the file is a directory, delete all files in this directory, // and its subdirectories if (f.isDirectory()) { File[] subfiles = f.listFiles(); if (subfiles != null) { // Can be null in case of error for (File subfile : subfiles) { removeRecursively(subfile); } } } if (!f.delete()) { boolean isDir = f.isDirectory(); if (!isDir) { log.debug("Try once more deleting file '" + f.getAbsolutePath()); final boolean success = remove(f); if (!success) { log.warn("Unable to remove file: '" + f.getAbsolutePath() + "'"); return false; } } else { String errMsg = "Problem with deletion of directory: '" + f.getAbsolutePath() + "'."; log.warn(errMsg); return false; } } return true; } /** * Remove a file. * @param f * A file to completely and utterly remove. * @return true if the file did exist, false otherwise. * @throws ArgumentNotValid if f is null. * @throws SecurityException * If a security manager exists and its <code>{@link * java.lang.SecurityManager#checkDelete}</code> * method denies delete access to the file */ public static boolean remove(File f) { ArgumentNotValid.checkNotNull(f, "f"); if (!f.exists()) { return false; } if (f.isDirectory()) { return false; //Do not attempt to delete a directory } if (!f.delete()) { // Hack to remove file on windows! Works only sometimes! File delFile = new File(f.getAbsolutePath()); delFile.delete(); if (delFile.exists()) { final String errMsg = "Unable to remove file '" + f.getAbsolutePath() + "'."; log.warn(errMsg); return false; } } return true; } /** * Returns a valid filename for most filesystems. 
Exchanges the following * characters: <p/> " " -> "_" ":" -> "_" "+" -> "_" * * @param filename * the filename to format correctly * @return a new formatted filename */ public static String formatFilename(String filename) { ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename"); String formattedFilename = filename; // remove spaces formattedFilename = formattedFilename.replace(' ', '_'); // remove colons formattedFilename = formattedFilename.replace(':', '_'); // remove add sign formattedFilename = formattedFilename.replace('+', '_'); return formattedFilename; } /** * Retrieves all files whose names ends with 'type' from directory 'dir' and * all its subdirectories. * * @param dir * Path of base directory * @param files * Initially, an empty list (e.g. an ArrayList) * @param type * The extension/ending of the files to retrieve (e.g. ".xml", * ".ARC") * @return A list of files from directory 'dir' and all its subdirectories */ public static List<File> getFilesRecursively(String dir, List<File> files, String type) { ArgumentNotValid.checkNotNullOrEmpty(dir, "String dir"); File theDirectory = new File(dir); ArgumentNotValid.checkTrue(theDirectory.isDirectory(), "File '" + theDirectory.getAbsolutePath() + "' does not represent a directory"); ArgumentNotValid.checkNotNull(files, "files"); ArgumentNotValid.checkNotNull(type, "type"); File[] top = new File(dir).listFiles(); for (File aTop : top) { if (aTop.isDirectory()) { getFilesRecursively(aTop.getAbsolutePath(), files, type); } else if (aTop.isFile() && aTop.getName().endsWith(type)) { files.add(aTop); } } return files; } /** * Load file content into text string. * * @param file The file to load * @return file content loaded into text string * @throws java.io.IOException If any IO trouble occurs while reading * the file, or the file cannot be found. 
*/ public static String readFile(File file) throws IOException { ArgumentNotValid.checkNotNull(file, "File file"); StringBuffer sb = new StringBuffer(); BufferedReader br = new BufferedReader(new FileReader(file)); try { int i; while ((i = br.read()) != -1) { sb.append((char) i); } } finally { br.close(); } return sb.toString(); } /** * Copy file from one location to another. Will silently overwrite an * already existing file. * * @param from * original to copy * @param to * destination of copy * @throws IOFailure if an io error occurs while copying file, * or the original file does not exist. */ public static void copyFile(File from, File to) { ArgumentNotValid.checkNotNull(from, "File from"); ArgumentNotValid.checkNotNull(to, "File to"); if (!from.exists()) { String errMsg = "Original file '" + from.getAbsolutePath() + "' does not exist"; log.warn(errMsg); throw new IOFailure(errMsg); } try { FileInputStream inStream = null; FileOutputStream outStream = null; FileChannel in = null; FileChannel out = null; try { inStream = new FileInputStream(from); outStream = new FileOutputStream(to); in = inStream.getChannel(); out = outStream.getChannel(); long bytesTransferred = 0; do { //Note: in.size() is called every loop, because if it should //change size, we might end up in an infinite loop trying to //copy more bytes than are actually available. bytesTransferred += in.transferTo(bytesTransferred, Math.min(Constants.IO_CHUNK_SIZE, in.size() - bytesTransferred), out); } while (bytesTransferred < in.size()); } finally { if (inStream != null) { inStream.close(); } if (outStream != null) { outStream.close(); } if (in != null) { in.close(); } if (out != null) { out.close(); } } } catch (IOException e) { final String errMsg = "Error copying file '" + from.getAbsolutePath() + "' to '" + to.getAbsolutePath() + "'"; log.warn(errMsg, e); throw new IOFailure(errMsg, e); } } /** * Copy an entire directory from one location to another. 
Note that this * will silently overwrite old files, just like copyFile(). * * @param from * Original directory (or file, for that matter) to copy. * @param to * Destination directory, i.e. the 'new name' of the copy of the * from directory. * @throws IOFailure On IO trouble copying files. */ public static void copyDirectory(File from, File to) throws IOFailure { ArgumentNotValid.checkNotNull(from, "File from"); ArgumentNotValid.checkNotNull(to, "File to"); String errMsg; if (from.isFile()) { try { copyFile(from, to); } catch (Exception e) { errMsg = "Error copying from file '" + from.getAbsolutePath() + "' to file '" + to.getAbsolutePath() + "'."; log.warn(errMsg, e); throw new IOFailure(errMsg, e); } } else { if (!from.exists()) { errMsg = "Can't find directory '" + from.getAbsolutePath() + "'."; log.warn(errMsg); throw new IOFailure(errMsg); } if (!from.isDirectory()) { errMsg = "File '" + from.getAbsolutePath() + "' is not a directory"; log.warn(errMsg); throw new IOFailure(errMsg); } to.mkdir(); if (!to.exists()) { errMsg = "Failed to create destination directory '" + to.getAbsolutePath() + "'."; log.warn(errMsg); throw new IOFailure(errMsg); } File[] subfiles = from.listFiles(); for (File subfile : subfiles) { copyDirectory(subfile, new File(to, subfile.getName())); } } } /** * Read an entire file, byte by byte, into a byte array, ignoring any locale * issues. * * @param file A file to be read. * @return A byte array with the contents of the file. * @throws IOFailure on IO trouble reading the file, * or the file does not exist * @throws IndexOutOfBoundsException If the file is too large to be * in an array. 
*/
    public static byte[] readBinaryFile(File file)
            throws IOFailure, IndexOutOfBoundsException {
        ArgumentNotValid.checkNotNull(file, "File file");
        if (!file.exists()) {
            String errMsg = "File '" + file.getAbsolutePath()
                    + "' does not exist";
            log.warn(errMsg);
            throw new IOFailure(errMsg);
        }

        String errMsg;
        // Take one snapshot of the length so that the size check, the error
        // message and the allocation all agree with each other.
        final long length = file.length();
        if (length > Integer.MAX_VALUE) {
            errMsg = "File '" + file.getAbsolutePath() + "' of size "
                    + length + " (bytes) is too long to fit in an array";
            log.warn(errMsg);
            throw new IndexOutOfBoundsException(errMsg);
        }

        byte[] result = new byte[(int) length];
        FileInputStream in = null;
        try {
            try {
                in = new FileInputStream(file);
                // read() may deliver fewer bytes than requested, so keep
                // reading until the array is full or end-of-file is hit.
                // If the file turns out shorter than its reported length,
                // the remainder of the array stays zero-filled.
                int offset = 0;
                while (offset < result.length) {
                    int bytesRead = in.read(result, offset,
                            result.length - offset);
                    if (bytesRead == -1) {
                        break;
                    }
                    offset += bytesRead;
                }
            } finally {
                if (in != null) {
                    in.close();
                }
            }
        } catch (IOException e) {
            errMsg = "Error reading file '" + file.getAbsolutePath() + "'";
            // Log the cause as well, as the other methods in this class do.
            log.warn(errMsg, e);
            throw new IOFailure(errMsg, e);
        }
        return result;
    }

    /**
     * Write an entire byte array to a file, ignoring any locale issues.
     * An existing file is overwritten.
     *
     * @param file
     *            The file to write the data to
     * @param b
     *            The byte array to write to the file
     * @throws IOFailure If an exception occurs during the writing.
     */
    public static void writeBinaryFile(File file, byte[] b) {
        ArgumentNotValid.checkNotNull(file, "File file");
        ArgumentNotValid.checkNotNull(b, "byte[] b");
        FileOutputStream out = null;
        try {
            try {
                out = new FileOutputStream(file);
                out.write(b);
            } finally {
                if (out != null) {
                    out.close();
                }
            }
        } catch (Exception e) {
            final String errMsg = "writeBinaryFile exception";
            log.warn(errMsg, e);
            throw new IOFailure(errMsg, e);
        }
    }

    /**
     * Return a filter that only accepts XML files (ending with .xml),
     * irrespective of their location.
     *
     * @return A new filter for XML files.
     */
    public static FilenameFilter getXmlFilesFilter() {
        return new FilenameFilter() {
            /**
             * Tests if a specified file should be included in a file list.
             *
             * @param dir
             *            the directory in which the file was found.
             *            Unused in this implementation of accept.
             * @param name
             *            the name of the file.
             * @return <code>true</code> if and only if the name should be
             *         included in the file list; <code>false</code>
             *         otherwise.
             * @see FilenameFilter#accept(java.io.File, java.lang.String)
             */
            public boolean accept(File dir, String name) {
                return name.endsWith(Constants.XML_EXTENSION);
            }
        };
    }

    /**
     * Read all lines from a file into a list of strings.
     *
     * @param file The file to read from.
     * @return The list of lines, in file order, without line terminators.
     * @throws IOFailure on trouble reading the file,
     * or if the file does not exist
     */
    public static List<String> readListFromFile(File file) {
        ArgumentNotValid.checkNotNull(file, "File file");
        if (!file.exists()) {
            String errMsg = "File '" + file.getAbsolutePath()
                    + "' does not exist";
            log.warn(errMsg);
            throw new IOFailure(errMsg);
        }
        List<String> lines = new ArrayList<String>();
        BufferedReader in = null;
        try {
            try {
                in = new BufferedReader(new FileReader(file));
                String line;
                while ((line = in.readLine()) != null) {
                    lines.add(line);
                }
            } finally {
                if (in != null) {
                    in.close();
                }
            }
        } catch (IOException e) {
            String msg = "Could not read data from "
                    + file.getAbsolutePath();
            log.warn(msg, e);
            throw new IOFailure(msg, e);
        }
        return lines;
    }

    /** Writes a collection of strings to a file, each string on one line.
     *
     * @param file A file to write to. The contents of this file will be
     * overwritten.
     * @param collection The collection to write. The strings are written
     * in the iteration order of the collection.
     * @throws IOFailure if any error occurs writing to the file.
     * @throws ArgumentNotValid if file or collection is null.
*/ public static void writeCollectionToFile(File file, Collection<String> collection) { ArgumentNotValid.checkNotNull(file, "file"); ArgumentNotValid.checkNotNull(collection, "collection"); try { PrintWriter writer = null; try { writer = new PrintWriter(new FileWriter(file)); for (String fileName : collection) { writer.println(fileName); } writer.flush(); } finally { if (writer != null) { writer.close(); } } } catch (IOException e) { String msg = "Error writing collection to file '" + file.getAbsolutePath() + "'"; log.warn(msg, e); throw new IOFailure(msg, e); } } /** Sort a file into another. The current implementation slurps all lines * into memory. This will not scale forever. * * @param unsortedFile A file to sort * @param sortedOutput The file to sort into */ public static void makeSortedFile(File unsortedFile, File sortedOutput) { ArgumentNotValid.checkNotNull(unsortedFile, "File unsortedFile"); ArgumentNotValid.checkNotNull(sortedOutput, "File sortedOutput"); List<String> lines; lines = readListFromFile(unsortedFile); Collections.sort(lines); writeCollectionToFile(sortedOutput, lines); } /** Remove a line from a given file. * * @param line The full line to remove * @param file The file to remove the line from. This file will be * rewritten in full, and the entire contents will be kept in memory * @throws UnknownID If the file does not exist */ public static void removeLineFromFile(String line, File file) { ArgumentNotValid.checkNotNull(line, "String line"); ArgumentNotValid.checkNotNull(file, "File file"); if (!file.exists()) { String errMsg = "The file '" + file.getAbsolutePath() + "' does not exist."; log.warn(errMsg); throw new UnknownID(errMsg); } List<String> lines = readListFromFile(file); lines.remove(line); writeCollectionToFile(file, lines); } /** * Check if the directory exists and is writable and create it if needed. * The complete path down to the directory is created. If the directory * creation fails a PermissionDenied exception is thrown. 
* * @param dir The directory to create * @throws ArgumentNotValid If dir is null or its name is the empty string * @throws PermissionDenied If directory cannot be created for any reason, * or is not writable. * @return true if dir created. */ public static boolean createDir(File dir) throws PermissionDenied { ArgumentNotValid.checkNotNull(dir, "File dir"); ArgumentNotValid.checkNotNullOrEmpty(dir.getName(), "File dir"); boolean didCreate = false; if (!dir.exists()) { didCreate = true; int i = 0; //retrying creation due to sun bug (race condition) //See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4742723 while ((i++ < CREATE_DIR_RETRIES) && !(dir.isDirectory() && dir.canWrite())) { dir.mkdirs(); } if (!(dir.isDirectory() && dir.canWrite())) { String msg = "Could not create directory '" + dir.getAbsolutePath() + "'"; log.warn(msg); throw new PermissionDenied(msg); } } else { if (!dir.isDirectory()) { String msg = "Cannot make directory '" + dir.getAbsolutePath() + "' - a file is in the way"; log.warn(msg); throw new PermissionDenied(msg); } } if (!dir.canWrite()) { String msg = "Cannot write to required directory '" + dir.getAbsolutePath() + "'"; log.warn(msg); throw new PermissionDenied(msg); } return didCreate; } /** * Returns the number of bytes free on the file system calling * the FreeSpaceProvider class defined by the setting * CommonSettings.FREESPACE_PROVIDER_CLASS (a.k.a. * settings.common.freespaceprovider.class) * * @param f a given file * @return the number of bytes free defined in the settings.xml */ public static long getBytesFree(File f) { return FreeSpaceProviderFactory.getInstance().getBytesFree(f); } /** * @param theFile * A file to make relative * @param theDir * A directory * @return the filepath of the theFile relative to theDir. null, if * theFile is not relative to theDir. null, if theDir is not a * directory. 
*/ public static String relativeTo(File theFile, File theDir) { ArgumentNotValid.checkNotNull(theFile, "File theFile"); ArgumentNotValid.checkNotNull(theDir, "File theDir"); if (!theDir.isDirectory()) { log.trace("The File '" + theDir.getAbsolutePath() + "' does not represent a directory. Null returned"); return null; } List<String> filePathList = new ArrayList<String>(); List<String> theDirPath = new ArrayList<String>(); File tempFile = theFile.getAbsoluteFile(); filePathList.add(tempFile.getName()); while ((tempFile = tempFile.getParentFile()) != null) { filePathList.add(tempFile.getName()); } tempFile = theDir.getAbsoluteFile(); theDirPath.add(tempFile.getName()); while ((tempFile = tempFile.getParentFile()) != null) { theDirPath.add(tempFile.getName()); } // check, at the path prefix is the same List<String> sublist = filePathList.subList(theDirPath.size() - 2, filePathList.size()); if (!theDirPath.equals(sublist)) { log.trace("The file '" + theFile.getAbsolutePath() + "' is not relative to the directory '" + theDir.getAbsolutePath() + "'. Null returned"); return null; } List<String> relativeList = filePathList.subList(0, theDirPath.size() - 2); StringBuffer sb = new StringBuffer(); Collections.reverse(relativeList); for (String aRelativeList : relativeList) { sb.append(aRelativeList); sb.append(File.separatorChar); } sb.deleteCharAt(sb.length() - 1); // remove last separatorChar return sb.toString(); } /** * Count the number of lines in a file. 
* @param file the file to read * @throws IOFailure If an error occurred while reading the file * @return the number of lines in the file */ public static long countLines(File file) { ArgumentNotValid.checkNotNull(file, "file"); BufferedReader in = null; long count = 0; try { try { in = new BufferedReader(new FileReader(file)); while (in.readLine() != null) { count++; } } finally { if (in != null) { in.close(); } } } catch (IOException e) { String msg = "Could not check number of lines in '" + file.getAbsolutePath() + "'"; log.warn(msg, e); throw new IOFailure(msg, e); } return count; } /** Create an InputStream that reads from a file but removes the file * when all data has been read. * * @param file A file to read. This file will be deleted when the * inputstream is closed, finalized, reaches end-of-file, or when the * VM closes. * @throws IOFailure If an error occurs in creating the ephemeral * input stream * @return An InputStream containing the file's contents. */ public static InputStream getEphemeralInputStream(final File file) { ArgumentNotValid.checkNotNull(file, "file"); // First make sure we remove the file if the VM dies file.deleteOnExit(); try { // Then create an input stream that deletes the file upon exit. // Note that FileInputStream.finalize calls close(). return new FileInputStream(file) { public void close() throws IOException { super.close(); file.delete(); } }; } catch (IOException e) { String msg = "Error creating ephemeral input stream for " + file; log.warn(msg, e); throw new IOFailure(msg, e); } } /** * Makes a valid file from filename passed in String. Ensures that the File * object returned is not null, and that isFile() returns true. * * @param filename The file to create the File object from * @return A valid, non-null File object. * @throws IOFailure if file cannot be created. 
*/
    public static File makeValidFileFromExisting(String filename)
            throws IOFailure {
        ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");

        final File candidate = new File(filename);
        if (candidate.isFile()) {
            return candidate;
        }
        String errMsg = "Error: File object created from filename '"
                + filename + "' is not a proper file, isFile() failed.";
        log.warn(errMsg);
        throw new IOFailure(errMsg);
    }

    /** Copy the complete contents of a file onto an output stream.
     *
     * @param f The file whose contents are written to the stream.
     * @param out The stream to write to. It is not closed by this method.
     * @throws IOFailure If any error occurs while writing the file to a stream
     */
    public static void writeFileToStream(File f, OutputStream out) {
        ArgumentNotValid.checkNotNull(f, "File f");
        ArgumentNotValid.checkNotNull(out, "OutputStream out");

        final byte[] chunk = new byte[Constants.IO_BUFFER_SIZE];
        try {
            FileInputStream source = new FileInputStream(f);
            try {
                int count = source.read(chunk);
                while (count > 0) {
                    out.write(chunk, 0, count);
                    count = source.read(chunk);
                }
            } finally {
                source.close();
            }
        } catch (IOException e) {
            final String errMsg = "Error writing file '"
                    + f.getAbsolutePath() + "' to stream";
            log.warn(errMsg, e);
            throw new IOFailure(errMsg, e);
        }
    }

    /** Write the contents of a stream into a file.
     *
     * @param in A stream to read from. This stream is not closed by this
     * method.
     * @param f The file to write the stream contents into.
* @throws IOFailure If any error occurs while writing the stream to a file */ public static void writeStreamToFile(InputStream in, File f) { ArgumentNotValid.checkNotNull(f, "File f"); ArgumentNotValid.checkNotNull(in, "InputStream in"); byte[] buffer = new byte[Constants.IO_BUFFER_SIZE]; try { FileOutputStream out = new FileOutputStream(f); try { int bytesRead; while ((bytesRead = in.read(buffer)) > 0) { out.write(buffer, 0, bytesRead); } } finally { out.close(); } } catch (IOException e) { final String errMsg = "Error writing stream to file '" + f.getAbsolutePath() + "'."; log.warn(errMsg, e); throw new IOFailure(errMsg, e); } } /** Get the location of the standard temporary directory. * The * existence of this directory should be ensure at the start of every * application. * * @return The directory that should be used for temporary files. */ public static File getTempDir() { return new File(Settings.get(CommonSettings.DIR_COMMONTEMPDIR)); } /** * Attempt to move a file using rename, and if that fails, move the file * by copy-and-delete. * @param fromFile The source * @param toFile The target */ public static void moveFile(File fromFile, File toFile) { ArgumentNotValid.checkNotNull(fromFile, "File fromFile"); ArgumentNotValid.checkNotNull(toFile, "File toFile"); if (!fromFile.renameTo(toFile)) { copyFile(fromFile, toFile); remove(fromFile); } } /** Given a set, generate a reasonable file name from the set. * @param <T> The type of objects, that the Set IDs argument contains. * @param IDs A set of IDs. * @param suffix A suffix. May be empty string. * @return A reasonable file name. 
*/
    public static <T extends Comparable<T>> String generateFileNameFromSet(
            Set<T> IDs, String suffix) {
        ArgumentNotValid.checkNotNull(IDs, "Set<T> IDs");
        ArgumentNotValid.checkNotNull(suffix, "String suffix");

        if (IDs.isEmpty()) {
            return "empty" + suffix;
        }

        final List<T> ordered = new ArrayList<T>(IDs);
        Collections.sort(ordered);
        final String joinedIds = StringUtils.conjoin("-", ordered);

        if (ordered.size() <= MAX_IDS_IN_FILENAME) {
            return joinedIds + suffix;
        }
        // Too many IDs for a file name: keep the first few and add a
        // checksum computed over the complete, sorted ID string.
        final String leadingIds = StringUtils.conjoin("-",
                ordered.subList(0, MAX_IDS_IN_FILENAME));
        return leadingIds + "-"
                + ChecksumCalculator.calculateMd5(joinedIds.getBytes())
                + suffix;
    }

    /** Sort a crawl.log file according to the url.
     *
     * @param file The file containing the unsorted data.
     * @param toFile The file that the sorted data can be put into.
     * @throws IOFailure if there were errors running the sort process, or
     * if the file does not exist.
     */
    public static void sortCrawlLog(File file, File toFile) {
        ArgumentNotValid.checkNotNull(file, "File file");
        ArgumentNotValid.checkNotNull(toFile, "File toFile");
        if (!file.exists()) {
            String errMsg = "The file '" + file.getAbsolutePath()
                    + "' does not exist.";
            log.warn(errMsg);
            throw new IOFailure(errMsg);
        }

        // A null temp dir is passed on unless the common temp dir is
        // configured for sorting.
        final File sortTempDir =
                Settings.getBoolean(CommonSettings.UNIX_SORT_USE_COMMON_TEMP_DIR)
                        ? FileUtils.getTempDir()
                        : null;
        final int exitCode = ProcessUtils.runUnixSort(file, toFile,
                sortTempDir, true);
        if (exitCode != 0) {
            final String errMsg = "Error code " + exitCode
                    + " sorting crawl log '" + file + "'";
            log.warn(errMsg);
            throw new IOFailure(errMsg);
        }
    }

    /** Sort a crawl.log file according to the timestamp.
     *
     * @param file The file containing the unsorted data.
     * @param toFile The file that the sorted data can be put into.
     * @throws IOFailure if there were errors running the sort process, or
     * if the file does not exist.
*/
    public static void sortCrawlLogOnTimestamp(File file, File toFile) {
        ArgumentNotValid.checkNotNull(file, "File file");
        ArgumentNotValid.checkNotNull(toFile, "File toFile");
        if (!file.exists()) {
            String errMsg = "The file '" + file.getAbsolutePath()
                    + "' does not exist.";
            log.warn(errMsg);
            throw new IOFailure(errMsg);
        }

        // A null temp dir is passed on unless the common temp dir is
        // configured for sorting.
        final File sortTempDir =
                Settings.getBoolean(CommonSettings.UNIX_SORT_USE_COMMON_TEMP_DIR)
                        ? FileUtils.getTempDir()
                        : null;
        final int exitCode = ProcessUtils.runUnixSort(file, toFile,
                sortTempDir, false);
        if (exitCode != 0) {
            final String errMsg = "Error code " + exitCode
                    + " sorting crawl log '" + file + "'";
            log.warn(errMsg);
            throw new IOFailure(errMsg);
        }
    }

    /** Sort a CDX file according to our standard for CDX file sorting. This
     * method depends on the Unix sort() command.
     *
     * @param file The raw unsorted CDX file.
     * @param toFile The file that the result will be put into.
     * @throws IOFailure If the file does not exist, or could not be sorted
     */
    public static void sortCDX(File file, File toFile) {
        ArgumentNotValid.checkNotNull(file, "File file");
        ArgumentNotValid.checkNotNull(toFile, "File toFile");
        if (!file.exists()) {
            String errMsg = "The file '" + file.getAbsolutePath()
                    + "' does not exist.";
            log.warn(errMsg);
            throw new IOFailure(errMsg);
        }

        final File sortTempDir =
                Settings.getBoolean(CommonSettings.UNIX_SORT_USE_COMMON_TEMP_DIR)
                        ? FileUtils.getTempDir()
                        : null;
        final int exitCode = ProcessUtils.runUnixSort(file, toFile,
                sortTempDir, false);
        if (exitCode != 0) {
            final String errMsg = "Error code " + exitCode
                    + " sorting cdx file '" + file.getAbsolutePath() + "'";
            log.warn(errMsg);
            throw new IOFailure(errMsg);
        }
    }

    /**
     * Sort a file using UNIX sort.
     * @param file the file that you want to sort.
     * @param toFile The destination file.
*/
    public static void sortFile(File file, File toFile) {
        // Plain sorting is exactly what sortCDX does, so delegate to it.
        sortCDX(file, toFile);
    }

    /** Creates a new temporary directory with a unique name.
     * The directory is registered for deletion at the end of the VM
     * (though behaviour if there are files in it is undefined).
     * A limited number of attempts is made, each with a freshly generated
     * name, before giving up.
     *
     * @param inDir The directory where the temporary directory
     * should be created.
     * @param prefix The prefix of the directory name, for identification
     * purposes.
     * @return A newly created directory that no other calls to createUniqueDir
     * returns.
     * @throws ArgumentNotValid if inDir is not an existing
     * directory that can be written to.
     * @throws IOFailure if a free name couldn't be found within a reasonable
     * number of tries.
     */
    public static File createUniqueTempDir(File inDir, String prefix) {
        ArgumentNotValid.checkNotNull(inDir, "File inDir");
        ArgumentNotValid.checkNotNullOrEmpty(prefix, "String prefix");
        ArgumentNotValid.checkTrue(inDir.isDirectory(),
                inDir + " must be a directory");
        ArgumentNotValid.checkTrue(inDir.canWrite(),
                inDir + " must be writeable");

        int attempt = 0;
        while (attempt < MAX_RETRIES) {
            File candidate;
            try {
                // Let createTempFile pick a unique name for us ...
                candidate = File.createTempFile(prefix, null, inDir);
            } catch (IOException e) {
                final String errMsg = "Couldn't create temporary file in '"
                        + inDir.getAbsolutePath() + "' with prefix '"
                        + prefix + "'";
                log.warn(errMsg, e);
                throw new IOFailure(errMsg, e);
            }
            // ... then swap the placeholder file for a directory.
            candidate.delete();
            if (candidate.mkdir()) {
                candidate.deleteOnExit();
                return candidate;
            }
            attempt++;
        }

        final String errMsg = "Too many similar files around, cannot create "
                + "unique dir with prefix " + prefix + " in '"
                + inDir.getAbsolutePath() + "'.";
        log.warn(errMsg);
        throw new IOFailure(errMsg);
    }

    /**
     * Read the last line in a file. Note this method is not UTF-8 safe.
     *
     * @param file input file to read last line from.
* @return The last line in the file (ending newline is irrelevant), * returns an empty string if file is empty. * @throws ArgumentNotValid on null argument, or file is not a readable * file. * @throws IOFailure on IO trouble reading file. */ public static String readLastLine(File file) { ArgumentNotValid.checkNotNull(file, "File file"); if (!file.isFile() || !file.canRead()) { final String errMsg = "File '" + file.getAbsolutePath() + "' is not a readable file."; log.warn(errMsg); throw new ArgumentNotValid(errMsg); } if (file.length() == 0) { return ""; } RandomAccessFile rafile = null; try { rafile = new RandomAccessFile(file, "r"); //seek to byte one before end of file (remember we know the file is // not empty) - this ensures that an ending newline is not read rafile.seek(rafile.length() - 2); //now search to the last linebreak, or beginning of file while (rafile.getFilePointer() != 0 && rafile.read() != '\n') { //search back two, because we just searched forward one to find //newline rafile.seek(rafile.getFilePointer() - 2); } return rafile.readLine(); } catch (IOException e) { final String errMsg = "Unable to access file '" + file.getAbsolutePath() + "'"; log.warn(errMsg, e); throw new IOFailure(errMsg, e); } finally { try { if (rafile != null) { rafile.close(); } } catch (IOException e) { log.debug("Unable to close file '" + file.getAbsolutePath() + "' after reading", e); } } } /** Append the given lines to a file. Each lines is terminated by a * newline. * * @param file A file to append to. * @param lines The lines to write. */ public static void appendToFile(File file, String... lines) { ArgumentNotValid.checkNotNull(file, "File file"); ArgumentNotValid.checkNotNull(lines, "String... 
lines"); PrintWriter writer = null; int linesAppended = 0; try { boolean appendMode = true; writer = new PrintWriter(new FileWriter(file, appendMode)); for (String line : lines) { writer.println(line); linesAppended++; } } catch (IOException e) { log.warn("Error appending " + lines.length + " lines to file '" + file.getAbsolutePath() + "'. Only appended " + linesAppended + " lines. ", e); } finally { if (writer != null) { writer.close(); } } } /** * Loads an file from the class path (for retrieving a file from '.jar'). * * @param filePath The path of the file. * @return The file from the class path. * @throws IOFailure If resource cannot be retrieved from the * class path. */ public static File getResourceFileFromClassPath(String filePath) throws IOFailure { ArgumentNotValid.checkNotNullOrEmpty(filePath, "String filePath"); try { // retrieve the file as a stream from the classpath. InputStream stream = Thread.currentThread().getContextClassLoader().getResourceAsStream(filePath); if (stream != null) { // Make stream into file, and return it. File tmpFile = File.createTempFile("tmp", "tmp"); StreamUtils.copyInputStreamToOutputStream(stream, new FileOutputStream(tmpFile)); return tmpFile; } else { String msg = "The resource was not retrieved correctly from" + " the class path: '" + filePath + "'"; log.trace(msg); throw new IOFailure(msg); } } catch (IOException e) { String msg = "Problems making stream of resource in class path " + "into a file. Filepath: '" + filePath + "'"; log.warn(msg, e); throw new IOFailure(msg, e); } } /** * Get a humanly readable representation of the file size. * If the file is a directory, the size is the aggregate of the files * in the directory except that subdirectories are ignored. * The number is given with 2 decimals. 
* @param aFile a File object
     * @return a humanly readable representation of the file size (rounded),
     * expressed in bytes, Kbytes, Mbytes or Gbytes using decimal (power of
     * 1000) units. A directory whose contents cannot be listed counts as
     * 0 bytes.
     * @throws ArgumentNotValid if aFile is null.
     */
    public static String getHumanReadableFileSize(File aFile) {
        ArgumentNotValid.checkNotNull(aFile, "File aFile");
        final long bytesPerOneKilobyte = 1000L;
        final long bytesPerOneMegabyte = 1000000L;
        final long bytesPerOneGigabyte = 1000000000L;
        double filesize = 0L;
        if (aFile.isDirectory()) {
            // listFiles() returns null if the directory cannot be listed
            // (e.g. an I/O error); treat that as an empty directory rather
            // than failing with a NullPointerException.
            File[] children = aFile.listFiles();
            if (children != null) {
                for (File f : children) {
                    if (f.isFile()) {
                        filesize = filesize + f.length();
                    }
                }
            }
        } else {
            filesize = aFile.length(); // normal file.
        }

        NumberFormat decFormat = new DecimalFormat("##.##");
        // Each branch is only reached when the size is at least the limit
        // tested by the previous one, so only upper bounds are checked.
        if (filesize < bytesPerOneKilobyte) {
            // represent size in bytes without the ".0"
            return (long) filesize + " bytes";
        } else if (filesize < bytesPerOneMegabyte) {
            // represent size in Kbytes
            return decFormat.format(filesize / bytesPerOneKilobyte)
                    + " Kbytes";
        } else if (filesize < bytesPerOneGigabyte) {
            // represent size in Mbytes
            return decFormat.format(filesize / bytesPerOneMegabyte)
                    + " Mbytes";
        } else {
            // represent in Gbytes
            return decFormat.format(filesize / bytesPerOneGigabyte)
                    + " Gbytes";
        }
    }

    /**
     * Check whether a directory has any entries. Both files and
     * subdirectories count, since the check is based on File.listFiles().
     *
     * @param aDir A directory
     * @return true, if the given directory contains files; else returns
     * false. Also returns false if the directory contents cannot be listed.
     * @throws ArgumentNotValid if aDir fails the
     * ArgumentNotValid.checkExistsDirectory check.
     */
    public static boolean hasFiles(File aDir) {
        ArgumentNotValid.checkExistsDirectory(aDir, "aDir");
        // listFiles() may return null on an I/O error even for an existing
        // directory; guard against dereferencing it.
        File[] entries = aDir.listFiles();
        return entries != null && entries.length > 0;
    }
}