List of usage examples for java.io RandomAccessFile read
public int read() throws IOException
From source file:big.BigZip.java
/** * Version 2 that permits to extract the text from a compressed file without * creating any file on the disk.//from w w w.ja v a2 s. co m * @param startPosition Offset where the file begins * @param endPosition Offset where the file ends * @return The source code of the compressed file */ public String extractBytesToRAM(final long startPosition, final Long endPosition) { String result = null; try { // enable random access to the BIG file (fast as heck) RandomAccessFile dataBIG = new RandomAccessFile(fileMainBIG, "r"); // jump directly to the position where the file is positioned dataBIG.seek(startPosition); // create a byte array ByteArrayOutputStream byteOutput = new ByteArrayOutputStream(); // now we start reading bytes during the mentioned interval while (dataBIG.getFilePointer() < endPosition) { // read a byte from our BIG archive int data = dataBIG.read(); byteOutput.write(data); } // flush data at this point byteOutput.flush(); // now convert the stream from input into an output (to feed the zip stream) ByteArrayInputStream byteInput = new ByteArrayInputStream(byteOutput.toByteArray()); // where we place the decompressed bytes ByteArrayOutputStream textOutput = new ByteArrayOutputStream(); // create the zip streamer final ArchiveInputStream archiveStream; archiveStream = new ArchiveStreamFactory().createArchiveInputStream("zip", byteInput); final ZipArchiveEntry entry = (ZipArchiveEntry) archiveStream.getNextEntry(); // copy all bytes from one location to the other (and decompress the data) IOUtils.copy(archiveStream, textOutput); // flush the results textOutput.flush(); // we've got the result right here! 
result = textOutput.toString(); // now close all the streams that we have open dataBIG.close(); byteOutput.close(); byteInput.close(); textOutput.close(); archiveStream.close(); } catch (FileNotFoundException ex) { Logger.getLogger(BigZip.class.getName()).log(Level.SEVERE, null, ex); return null; } catch (IOException ex) { Logger.getLogger(BigZip.class.getName()).log(Level.SEVERE, null, ex); return null; } catch (ArchiveException ex) { Logger.getLogger(BigZip.class.getName()).log(Level.SEVERE, null, ex); } return result; }
From source file:big.BigZip.java
/** * Version 2 that permits to extract the text from a compressed file without * creating any file on the disk./*from w w w . j a va 2 s . c o m*/ * @param filePosition * @return The source code of the compressed file */ public String extractBytesToRAM(final long filePosition) { String result = null; try { // add the signature bytes to our start position long startPosition = filePosition + magicSignature.length(); // enable random access to the BIG file (fast as heck) RandomAccessFile dataBIG = new RandomAccessFile(fileMainBIG, "r"); // jump directly to the position where the file is positioned dataBIG.seek(startPosition); // create a byte array ByteArrayOutputStream byteOutput = new ByteArrayOutputStream(); // get the end of this file entry (by brute-force) char test = 0; long endPosition = -1; while (test != -1) { test = dataBIG.readChar(); // if the magic devil number was found.. if (test == 66) { // read the next value for confirmation byte value = dataBIG.readByte(); if (value != 73) { continue; } // we found the next entry endPosition = dataBIG.getFilePointer() - 1; break; } } // rewind back to the start position dataBIG.seek(startPosition); // now we start reading bytes during the mentioned interval while (dataBIG.getFilePointer() < endPosition) { // read a byte from our BIG archive int data = dataBIG.read(); byteOutput.write(data); } // flush data at this point byteOutput.flush(); // now convert the stream from input into an output (to feed the zip stream) ByteArrayInputStream byteInput = new ByteArrayInputStream(byteOutput.toByteArray()); // where we place the decompressed bytes ByteArrayOutputStream textOutput = new ByteArrayOutputStream(); // create the zip streamer final ArchiveInputStream archiveStream; archiveStream = new ArchiveStreamFactory().createArchiveInputStream("zip", byteInput); final ZipArchiveEntry entry = (ZipArchiveEntry) archiveStream.getNextEntry(); // copy all bytes from one location to the other (and decompress the data) 
IOUtils.copy(archiveStream, textOutput); // flush the results textOutput.flush(); // we've got the result right here! result = textOutput.toString(); // now close all the streams that we have open dataBIG.close(); byteOutput.close(); byteInput.close(); textOutput.close(); archiveStream.close(); } catch (FileNotFoundException ex) { Logger.getLogger(BigZip.class.getName()).log(Level.SEVERE, null, ex); return null; } catch (IOException ex) { Logger.getLogger(BigZip.class.getName()).log(Level.SEVERE, null, ex); return null; } catch (ArchiveException ex) { Logger.getLogger(BigZip.class.getName()).log(Level.SEVERE, null, ex); } return result; }
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
/**
 * Rewrites every fixed-size subdomain metadata record in the metadata file,
 * clearing all queued counters while preserving the url count, the
 * first/last record offsets, the domain name and the domain hash.
 * Each record is read, cleared, partially restored and written back in place.
 *
 * @throws IOException if the metadata file cannot be opened, read or rewritten
 */
void resetSubDomainCounts() throws IOException {
    LOG.info("*** LIST:" + getListId() + " Reset SubDomain Queued Counts.");
    if (_subDomainMetadataFile.exists()) {
        LOG.info("*** LIST:" + getListId() + " FILE EXISTS .");
        // opened read/write: records are rewritten in place below
        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        // pre-sized to hold exactly one fixed-size record
        DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);
        try {
            // skip version
            file.read();
            // read item count
            int itemCount = file.readInt();
            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);
            CrawlListMetadata newMetadata = new CrawlListMetadata();
            for (int i = 0; i < itemCount; ++i) {
                // remember where this record starts so it can be rewritten in place
                long orignalPos = file.getFilePointer();
                // read the raw record bytes (reusing outputBuffer's backing array as scratch)
                file.readFully(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
                inputBuffer.reset(outputBuffer.getData(), CrawlListMetadata.Constants.FixedDataSize);
                try {
                    newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                } catch (Exception e) {
                    // a corrupt record is logged and still rewritten from whatever
                    // state newMetadata holds — NOTE(review): confirm this is intended
                    LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                            + CCStringUtils.stringifyException(e));
                }
                // ok reset everything except hashes and first/last url pointers
                int urlCount = newMetadata.getUrlCount();
                long firstRecordOffset = newMetadata.getFirstRecordOffset();
                long lastRecordOffset = newMetadata.getLastRecordOffset();
                String domainName = newMetadata.getDomainName();
                long domainHash = newMetadata.getDomainHash();
                // reset
                newMetadata.clear();
                // restore the preserved fields
                newMetadata.setUrlCount(urlCount);
                newMetadata.setFirstRecordOffset(firstRecordOffset);
                newMetadata.setLastRecordOffset(lastRecordOffset);
                newMetadata.setDomainName(domainName);
                newMetadata.setDomainHash(domainHash);
                // serialize it ...
                outputBuffer.reset();
                newMetadata.serialize(outputBuffer, new BinaryProtocol());
                // write it back to disk at the exact position it was read from
                file.seek(orignalPos);
                // and rewrite it ...
                file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
            }
        } finally {
            file.close();
        }
        LOG.info("*** LIST:" + getListId() + " DONE RESETTIGN SUBDOMAIN METADATA QUEUE COUNTS");
    }
}
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
/**
 * Reads one page of per-subdomain summaries from the subdomain metadata file.
 *
 * @param offset index of the first record to return
 * @param count  maximum number of records to return
 * @return up to {@code count} subdomain summaries starting at {@code offset};
 *         empty when the offset is past the end or an I/O error occurs
 */
public ArrayList<CrawlListDomainItem> getSubDomainList(int offset, int count) {
    synchronized (_metadata) {
        ArrayList<CrawlListDomainItem> itemsOut = new ArrayList<CrawlListDomainItem>();
        try {
            // serialize access to the shared metadata file
            synchronized (_subDomainMetadataFile) {
                RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
                DataInputBuffer inputBuffer = new DataInputBuffer();
                byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];
                try {
                    // skip version
                    file.read();
                    // read item count
                    int itemCount = file.readInt();
                    int i = offset;
                    int end = Math.min(i + count, itemCount);
                    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);
                    if (i < itemCount) {
                        // file header is 5 bytes (1 version byte + 4-byte item count);
                        // records are fixed-size, so seek straight to the requested page
                        file.seek(5 + (CrawlListMetadata.Constants.FixedDataSize * offset));
                        CrawlListMetadata newMetadata = new CrawlListMetadata();
                        for (; i < end; ++i) {
                            long orignalPos = file.getFilePointer();
                            file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
                            inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
                            newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                            itemsOut.add(buildSubDomainSummary(newMetadata.getDomainName(), newMetadata));
                        }
                    }
                } finally {
                    file.close();
                }
            }
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
        }
        LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
        return itemsOut;
    }
}
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
/**
 * Loads the per-subdomain metadata into the in-memory offset lookup table.
 * When the metadata file exists, each fixed-size record is read and its
 * (domainHash -> file offset) pair is recorded. When it does not exist, the
 * metadata is rebuilt from scratch by walking the fixed/variable data files
 * and is then persisted back to disk.
 *
 * @throws IOException if the metadata file cannot be opened or read
 */
void loadSubDomainMetadataFromDisk() throws IOException {
    LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ... ");
    if (_subDomainMetadataFile.exists()) {
        LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK.");
        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];
        try {
            // skip version
            file.read();
            // read item count
            int itemCount = file.readInt();
            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);
            CrawlListMetadata newMetadata = new CrawlListMetadata();
            // TreeMap keeps the lookup table sorted by domain hash
            TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
            for (int i = 0; i < itemCount; ++i) {
                // record where this entry starts; that offset goes into the table
                long orignalPos = file.getFilePointer();
                file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
                inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
                try {
                    newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                } catch (Exception e) {
                    LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                            + CCStringUtils.stringifyException(e));
                }
                idToOffsetMap.put(newMetadata.getDomainHash(), (int) orignalPos);
            }
            // write lookup table: fixed-width (long hash, int offset) entries
            _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);
            for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
                _offsetLookupTable.writeLong(entry.getKey());
                _offsetLookupTable.writeInt(entry.getValue());
            }
        } finally {
            file.close();
        }
        LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
    } else {
        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH");
        RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
        RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
        try {
            // ok rebuild top level metadata as well
            _metadata.clear();
            OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
            int processedCount = 0;
            // walk every fixed-size item record until end of file
            while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
                long position = fixedDataReader.getFilePointer();
                // store offset in item
                item._fileOffset = position;
                // load from disk
                item.deserialize(fixedDataReader);
                try {
                    // seek to this item's string data in the variable-size file
                    stringDataReader.seek(item._stringsOffset);
                    // and skip buffer length
                    WritableUtils.readVInt(stringDataReader);
                    // and read primary string
                    String url = stringDataReader.readUTF();
                    // get metadata object for subdomain
                    CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url);
                    // increment url count
                    subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1);
                    // increment top level metadata count
                    _metadata.setUrlCount(_metadata.getUrlCount() + 1);
                    // update top level metadata ..
                    updateMetadata(item, _metadata, 0);
                    // update sub-domain metadata object from item data
                    updateMetadata(item, subDomainMetadata, 0);
                    ++processedCount;
                } catch (IOException e) {
                    // a bad string record is skipped; the scan continues with the next item
                    LOG.error("Exception Reading String Data For Item:" + (processedCount + 1));
                    LOG.error("Exception:" + CCStringUtils.stringifyException(e));
                    LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                            + stringDataReader.getFilePointer());
                }
                if (processedCount % 10000 == 0) {
                    LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items");
                }
            }
            // ok commit top level metadata to disk as well
            writeMetadataToDisk();
        } catch (IOException e) {
            LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
                    + CCStringUtils.stringifyException(e));
            LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                    + stringDataReader.getFilePointer());
            _queueState = QueueState.QUEUED;
        } finally {
            fixedDataReader.close();
            stringDataReader.close();
        }
        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK");
        // write metadata to disk
        writeInitialSubDomainMetadataToDisk();
        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE");
    }
}
From source file:big.BigZip.java
/**
 * Given a position inside our knowledge base, retrieve the data up to
 * the next file indicator and expand it onto disk.
 *
 * The compressed bytes between the two offsets are copied into a temporary
 * zip file, that zip is extracted into the current directory, and the
 * temporary file is removed again.
 *
 * @param targetFile    The new file that will be created
 * @param startPosition The position from where we start to read the data
 * @param endPosition   The position where the data for this file ends
 * @return true on success, false otherwise
 */
public boolean extractBytes(final File targetFile, final long startPosition, final Long endPosition) {
    // temporary holder for the zipped bytes of the requested entry
    final File fileZip = new File("temp.zip");
    // try-with-resources guarantees the archive handle is released on all paths
    try (RandomAccessFile dataBIG = new RandomAccessFile(fileMainBIG, "r")) {
        // if the target file exists, try to delete it
        if (targetFile.exists()) {
            targetFile.delete();
            if (targetFile.exists()) {
                // we failed completely
                System.out.println("BIG405 - Failed to delete: " + targetFile.getAbsolutePath());
                return false;
            }
        }
        // delete the zip file if it already exists (stale from a previous run)
        if (fileZip.exists()) {
            fileZip.delete();
            if (fileZip.exists()) {
                // we failed completely
                System.out.println("BIG416 - Failed to delete: " + fileZip.getAbsolutePath());
                return false;
            }
        }
        // create the temporary zip file and copy the entry's bytes into it
        try (RandomAccessFile dataNew = new RandomAccessFile(fileZip, "rw")) {
            // jump directly to the position where the file is positioned
            dataBIG.seek(startPosition);
            // buffered copy (a byte-at-a-time read()/write() on a
            // RandomAccessFile issues one syscall per byte)
            final byte[] buffer = new byte[8192];
            long remaining = endPosition - startPosition;
            while (remaining > 0) {
                int read = dataBIG.read(buffer, 0, (int) Math.min(buffer.length, remaining));
                if (read == -1) {
                    // archive ended before the expected end position
                    break;
                }
                dataNew.write(buffer, 0, read);
                remaining -= read;
            }
        }
        // extract the file into the current directory
        zip.extract(fileZip, new File("."));
    } catch (FileNotFoundException ex) {
        Logger.getLogger(BigZip.class.getName()).log(Level.SEVERE, null, ex);
        return false;
    } catch (IOException ex) {
        Logger.getLogger(BigZip.class.getName()).log(Level.SEVERE, null, ex);
        return false;
    } finally {
        // always remove the temp zip file, also on the failure paths
        fileZip.delete();
    }
    return true;
}
From source file:org.apache.james.mailrepository.file.MBoxMailRepository.java
/**
 * Parse the mbox file.
 *
 * Scans the file for RFC 4155 style "From sender hh:mm:ss" separator lines,
 * accumulates the text of each message, and hands every complete message to
 * {@code messAct}. Two scanning modes exist: line-based (BUFFERING) and
 * character-based.
 *
 * @param ins
 *            The random access file to load. Note that the file may or may
 *            not start at offset 0 in the file
 * @param messAct
 *            The action to take when a message is found
 * @return the action's result as soon as it reports completion, the result
 *         for the trailing message, or null when nothing matched or an I/O
 *         error occurred
 */
private MimeMessage parseMboxFile(RandomAccessFile ins, MessageAction messAct) {
    if ((getLogger().isDebugEnabled())) {
        String logBuffer = this.getClass().getName() + " Start parsing " + mboxFile;
        getLogger().debug(logBuffer);
    }
    try {
        // mbox message separator: "From <sender> <hh>:<mm>:<ss>"
        // NOTE(review): compiled on every call — could be a static final field
        Pattern sepMatchPattern = Pattern.compile("^From (.*) (.*):(.*):(.*)$");
        int c;
        boolean inMessage = false;
        StringBuffer messageBuffer = new StringBuffer();
        String previousMessageSeparator = null;
        boolean foundSep;
        // offset of the message currently being accumulated
        long prevMessageStart = ins.getFilePointer();
        if (BUFFERING) {
            // line-based scan
            String line;
            while ((line = ins.readLine()) != null) {
                foundSep = sepMatchPattern.matcher(line).matches();
                if (foundSep && inMessage) {
                    // a new separator ends the previous message — dispatch it
                    MimeMessage endResult = messAct.messageAction(previousMessageSeparator,
                            messageBuffer.toString(), prevMessageStart);
                    if (messAct.isComplete()) {
                        // I've got what I want so just exit
                        return endResult;
                    }
                    previousMessageSeparator = line;
                    // NOTE(review): offset math ignores the line terminator
                    // length consumed by readLine() — confirm intended
                    prevMessageStart = ins.getFilePointer() - line.length();
                    messageBuffer = new StringBuffer();
                    inMessage = true;
                }
                // Only done at the start (first header)
                if (foundSep && !inMessage) {
                    previousMessageSeparator = line;
                    inMessage = true;
                }
                if (!foundSep && inMessage) {
                    messageBuffer.append(line).append("\n");
                }
            }
        } else {
            // character-based scan: build each line manually, split on LF
            StringBuffer line = new StringBuffer();
            while ((c = ins.read()) != -1) {
                if (c == 10) {
                    // end of line reached — test it against the separator
                    foundSep = sepMatchPattern.matcher(line).matches();
                    if (foundSep && inMessage) {
                        MimeMessage endResult = messAct.messageAction(previousMessageSeparator,
                                messageBuffer.toString(), prevMessageStart);
                        if (messAct.isComplete()) {
                            // I've got what I want so just exit
                            return endResult;
                        }
                        previousMessageSeparator = line.toString();
                        prevMessageStart = ins.getFilePointer() - line.length();
                        messageBuffer = new StringBuffer();
                        inMessage = true;
                    }
                    // Only done at the start (first header)
                    if (foundSep && !inMessage) {
                        previousMessageSeparator = line.toString();
                        inMessage = true;
                    }
                    // NOTE(review): unlike the BUFFERING branch, this appends
                    // even when !inMessage — confirm the asymmetry is intended
                    if (!foundSep) {
                        messageBuffer.append(line).append((char) c);
                    }
                    line = new StringBuffer(); // Reset buffer
                } else {
                    line.append((char) c);
                }
            }
        }
        if (messageBuffer.length() != 0) {
            // process last message (no trailing separator follows it)
            return messAct.messageAction(previousMessageSeparator, messageBuffer.toString(), prevMessageStart);
        }
    } catch (IOException ioEx) {
        getLogger().error("Unable to write file (General I/O problem) " + mboxFile, ioEx);
    } catch (PatternSyntaxException e) {
        getLogger().error("Bad regex passed " + mboxFile, e);
    } finally {
        if ((getLogger().isDebugEnabled())) {
            String logBuffer = this.getClass().getName() + " Finished parsing " + mboxFile;
            getLogger().debug(logBuffer);
        }
    }
    return null;
}