List of usage examples for java.util.zip CRC32 getValue
@Override public long getValue()

Returns the current checksum value.
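java.util.zip.CRC32 implements java.util.zip.Checksum: bytes are fed in through update(), and getValue() returns the running CRC-32 as an unsigned 32-bit quantity widened to a long (so it is never negative). Before the project excerpts below, here is a minimal self-contained sketch of the call pattern they all share; the class name and sample input are illustrative only:

import java.util.zip.CRC32;

public class Crc32GetValueExample {
    public static void main(String[] args) {
        CRC32 crc = new CRC32();
        byte[] data = "hello crc32".getBytes();
        crc.update(data, 0, data.length);
        // getValue() reports the checksum of all bytes passed to update()
        // so far, as an unsigned 32-bit value in the low bits of a long.
        long checksum = crc.getValue();
        System.out.println("crc32: 0x" + Long.toHexString(checksum));
        // reset() clears the accumulated state so the instance can be reused.
        crc.reset();
    }
}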
From source file:com.redskyit.scriptDriver.RunTests.java
private void info(WebElement element, String selector, boolean verify) throws Exception {
    do {
        try {
            Point loc = element.getLocation();
            Dimension size = element.getSize();
            String tag = element.getTagName();
            System.out.print(
                    null == selector ? "test-id \"" + element.getAttribute("test-id") + "\"" : selector);
            System.out.print(" info");
            System.out.print(" tag " + tag);
            System.out.print((element.isDisplayed() ? "" : " not") + " displayed");
            System.out.print(" at " + loc.x + "," + loc.y);
            System.out.print(" size " + size.width + "," + size.height);
            System.out.print((element.isEnabled() ? "" : " not") + " enabled");
            System.out.print((element.isSelected() ? "" : " not") + " selected");
            if (tag.equals("input") || tag.equals("select")) {
                System.out.print(" check \"" + element.getAttribute("value") + "\"");
            } else {
                String text = tag.equals("textarea") ? element.getAttribute("value") : element.getText();
                if (text.indexOf('\n') != -1) {
                    CRC32 crc = new CRC32();
                    crc.update(text.getBytes());
                    System.out.print(" checksum \"crc32:" + crc.getValue() + "\"");
                } else {
                    System.out.print(" check \"" + element.getText() + "\"");
                }
            }
            System.out.println();
            return;
        } catch (StaleElementReferenceException e) {
            // If element has gone stale during a dump, ignore it
            if (!verify)
                return;
            // element has gone stale, re-select it
            System.out.println("// EXCEPTION : StaleElementReference");
        } catch (Exception e) {
            if (verify)
                throw e;
            return;
        }
        sleepAndReselect(100);
    } while (_waitFor > 0 && (new Date()).getTime() < _waitFor);
}
From source file:org.exist.xquery.modules.compression.AbstractCompressFunction.java
/**
 * Adds a document to an archive
 *
 * @param os
 *            The output stream to add the document to
 * @param doc
 *            The document to add to the archive
 * @param useHierarchy
 *            Whether to use a folder hierarchy in the archive file that
 *            reflects the collection hierarchy
 */
private void compressResource(OutputStream os, DocumentImpl doc, boolean useHierarchy, String stripOffset,
        String method, String name) throws IOException, SAXException {
    // create an entry in the archive for the document
    Object entry = null;
    byte[] value = new byte[0];
    CRC32 chksum = new CRC32();
    ByteArrayOutputStream baos = new ByteArrayOutputStream();

    if (name != null) {
        entry = newEntry(name);
    } else if (useHierarchy) {
        String docCollection = doc.getCollection().getURI().toString();
        XmldbURI collection = XmldbURI.create(removeLeadingOffset(docCollection, stripOffset));
        entry = newEntry(collection.append(doc.getFileURI()).toString());
    } else {
        entry = newEntry(doc.getFileURI().toString());
    }

    if (doc.getResourceType() == DocumentImpl.XML_FILE) {
        // XML file
        Serializer serializer = context.getBroker().getSerializer();
        serializer.setUser(context.getUser());
        serializer.setProperty("omit-xml-declaration", "no");
        getDynamicSerializerOptions(serializer);
        String strDoc = serializer.serialize(doc);
        value = strDoc.getBytes();
    } else if (doc.getResourceType() == DocumentImpl.BINARY_FILE) {
        // binary file
        InputStream is = context.getBroker().getBinaryResource((BinaryDocument) doc);
        byte[] data = new byte[16384];
        int len = 0;
        while ((len = is.read(data, 0, data.length)) > 0) {
            baos.write(data, 0, len);
        }
        is.close();
        value = baos.toByteArray();
    }

    // STORED (uncompressed) zip entries must have the CRC and size set before writing
    if (entry instanceof ZipEntry && "store".equals(method)) {
        ((ZipEntry) entry).setMethod(ZipOutputStream.STORED);
        chksum.update(value);
        ((ZipEntry) entry).setCrc(chksum.getValue());
        ((ZipEntry) entry).setSize(value.length);
    }

    putEntry(os, entry);
    os.write(value);
    closeEntry(os);
}
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
private void cacheCrawlHistoryLog(File localCacheDir, long timestamp) throws IOException {

    SequenceFile.Reader reader = null;
    Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
    Path indexFilePath = new Path(mapFilePath, "index");
    Path dataFilePath = new Path(mapFilePath, "data");
    File cacheFilePath = new File(localCacheDir, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);

    SequenceFile.Reader indexReader = new SequenceFile.Reader(_remoteFileSystem, dataFilePath,
            CrawlEnvironment.getHadoopConfig());

    ValueBytes valueBytes = indexReader.createValueBytes();
    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataInputBuffer keyBuffer = new DataInputBuffer();
    DataOutputBuffer finalOutputStream = new DataOutputBuffer();
    DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer();
    URLFP fp = new URLFP();

    try {
        while (indexReader.nextRaw(keyBytes, valueBytes) != -1) {
            keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength());
            // read fingerprint ...
            fp.readFields(keyBuffer);
            // write hash only
            finalOutputStream.writeLong(fp.getUrlHash());
            uncompressedValueBytes.reset();
            // write value bytes to intermediate buffer ...
            valueBytes.writeUncompressedBytes(uncompressedValueBytes);
            // write out uncompressed length
            WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes.getLength());
            // write out bytes
            finalOutputStream.write(uncompressedValueBytes.getData(), 0, uncompressedValueBytes.getLength());
        }
        // delete existing ...
        cacheFilePath.delete();
        // compute crc ...
        CRC32 crc = new CRC32();
        crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength());
        // open final output stream
        DataOutputStream fileOutputStream = new DataOutputStream(
                new BufferedOutputStream(new FileOutputStream(cacheFilePath)));
        try {
            fileOutputStream.writeLong(crc.getValue());
            fileOutputStream.write(finalOutputStream.getData(), 0, finalOutputStream.getLength());
            fileOutputStream.flush();
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            fileOutputStream.close();
            fileOutputStream = null;
            cacheFilePath.delete();
            throw e;
        } finally {
            if (fileOutputStream != null) {
                fileOutputStream.close();
            }
        }
    } finally {
        if (indexReader != null) {
            indexReader.close();
        }
    }
}
From source file:org.commoncrawl.service.crawler.CrawlList.java
private static void appendTargetsToLogFile(File logFileName, IntrusiveList<CrawlTarget> list)
        throws IOException {

    LogFileHeader header = new LogFileHeader();

    boolean preExistingHeader = logFileName.exists();

    RandomAccessFile file = new RandomAccessFile(logFileName, "rw");

    try {
        long headerOffset = 0;

        if (preExistingHeader) {
            headerOffset = readLogFileHeader(file, header);

            if (header._writePos == 0) {
                file.seek(headerOffset);
            } else {
                // seek to appropriate write position
                file.seek(header._writePos);
            }
        } else {
            headerOffset = writeLogFileHeader(file, header);
        }

        CustomByteArrayOutputStream bufferOutputStream = new CustomByteArrayOutputStream(1 << 17);
        DataOutputStream dataOutputStream = new DataOutputStream(bufferOutputStream);
        CRC32 crc = new CRC32();

        for (CrawlTarget target : list) {
            PersistentCrawlTarget persistentTarget = target.createPersistentTarget();

            bufferOutputStream.reset();
            // write to intermediate stream ...
            persistentTarget.write(dataOutputStream);
            // and crc the data ...
            crc.reset();
            crc.update(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size());
            // write out length first
            file.writeInt(bufferOutputStream.size());
            // crc next
            long computedValue = crc.getValue();
            // TODO: waste of space - write 32 bit values as long because having
            // problems with java sign promotion rules during read...
            file.writeLong(computedValue);
            // and then the data
            file.write(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size());
        }

        // now update header ...
        header._itemCount += list.size();
        header._writePos = file.getFilePointer();

        // now write out header anew ...
        writeLogFileHeader(file, header);

    } finally {
        if (file != null) {
            file.close();
        }
    }
}
From source file:org.commoncrawl.service.listcrawler.CacheManager.java
/**
 * loadCacheItemFromDisk - load a single cache item from disk
 *
 * @param file
 * @param optTargetURL
 * @param location
 * @return
 * @throws IOException
 */
private CacheItem loadCacheItemFromDisk(FileInputStream file, String optTargetURL, long location)
        throws IOException {

    long timeStart = System.currentTimeMillis();

    // and read out the Item Header ...
    CacheItemHeader itemHeader = new CacheItemHeader();
    itemHeader.readHeader(new DataInputStream(file));
    // see if it is valid ...
    if (!Arrays.equals(itemHeader._sync, _header._sync)) {
        LOG.error("### Item Lookup for URL:" + optTargetURL + " Record at:" + location
                + " failed - corrupt sync bytes detected!!!");
    } else {
        CRC32 crc32 = new CRC32();
        // ok deserialize the bytes ...
        CacheItem item = new CacheItem();
        // the CheckedInputStream feeds every byte it reads into the CRC32
        CheckedInputStream checkedStream = new CheckedInputStream(file, crc32);
        DataInputStream itemStream = new DataInputStream(checkedStream);
        item.readFields(itemStream);
        // read the content buffer length
        int contentBufferLen = itemStream.readInt();
        if (contentBufferLen != 0) {
            byte[] data = new byte[contentBufferLen];
            // readFully, since a plain read() may return fewer bytes than requested
            itemStream.readFully(data);
            item.setContent(new Buffer(data));
        }
        // cache the computed crc before reading past the checked region
        long crcValueComputed = crc32.getValue();
        // read disk crc
        long crcValueOnDisk = itemStream.readLong();
        // validate
        if (crcValueComputed == crcValueOnDisk) {
            String canonicalURL = URLUtils.canonicalizeURL(item.getUrl(), true);
            if (optTargetURL.length() == 0 || optTargetURL.equals(canonicalURL)) {
                if (isValidCacheItem(item)) {
                    LOG.info("### Item Lookup for URL:" + optTargetURL + " Record at:" + location
                            + " completed in:" + (System.currentTimeMillis() - timeStart));
                    return item;
                } else {
                    LOG.info("### Item Lookup for URL:" + optTargetURL + " Record at:" + location
                            + " failed with invalid result code");
                }
            } else {
                LOG.info("### Item Lookup for URL:" + optTargetURL + " Record at:" + location
                        + " failed with url mismatch. record url:" + item.getUrl());
            }
        } else {
            LOG.error("### Item Lookup for URL:" + optTargetURL + " Record at:" + location
                    + " failed - crc mismatch!!!");
        }
    }
    return null;
}
From source file:org.commoncrawl.service.crawler.CrawlList.java
private static int readTargetsFromLogFile(CrawlList domain, File logFileName, int desiredReadAmount,
        IntrusiveList<CrawlTarget> targetsOut) throws IOException {

    int itemsRead = 0;

    if (logFileName.exists()) {

        RandomAccessFile file = new RandomAccessFile(logFileName, "rw");

        LogFileHeader header = new LogFileHeader();

        try {
            long headerOffset = readLogFileHeader(file, header);

            // seek to appropriate read position
            if (header._readPos != 0)
                file.seek(header._readPos);

            int itemsToRead = Math.min(desiredReadAmount, header._itemCount);

            PersistentCrawlTarget persistentTarget = new PersistentCrawlTarget();
            CRC32 crc = new CRC32();
            CustomByteArrayOutputStream buffer = new CustomByteArrayOutputStream(1 << 16);

            for (int i = 0; i < itemsToRead; ++i) {
                // read length ...
                int urlDataLen = file.readInt();
                long urlDataCRC = file.readLong();

                buffer.reset();

                if (urlDataLen > buffer.getBuffer().length) {
                    buffer = new CustomByteArrayOutputStream(((urlDataLen / 65536) + 1) * 65536);
                }
                // readFully, since a plain read() may return fewer bytes than requested
                file.readFully(buffer.getBuffer(), 0, urlDataLen);
                crc.reset();
                crc.update(buffer.getBuffer(), 0, urlDataLen);

                long computedValue = crc.getValue();

                // validate crc values ...
                if (computedValue != urlDataCRC) {
                    throw new IOException("Crawl Target Log File Corrupt");
                } else {
                    // populate a persistentTarget from the (in memory) data stream
                    DataInputStream bufferReader = new DataInputStream(
                            new ByteArrayInputStream(buffer.getBuffer(), 0, urlDataLen));

                    persistentTarget.clear();
                    persistentTarget.readFields(bufferReader);

                    // populate a new crawl target structure ...
                    CrawlTarget newTarget = new CrawlTarget(domain, persistentTarget);

                    targetsOut.addTail(newTarget);
                }
            }

            itemsRead = itemsToRead;

            // now update header ...
            header._itemCount -= itemsRead;
            // now if item count is non zero ...
            if (header._itemCount != 0) {
                // set read cursor to next record location
                header._readPos = file.getFilePointer();
            }
            // otherwise ...
            else {
                // reset both cursors ...
                header._readPos = 0;
                header._writePos = 0;
            }

            // now write out header anew ...
            writeLogFileHeader(file, header);

        } finally {
            if (file != null) {
                file.close();
            }
        }
    }

    return itemsRead;
}
From source file:org.commoncrawl.service.crawler.CrawlLog.java
private static void transferLocalCheckpointLog(File crawlLogPath, HDFSCrawlURLWriter writer, long checkpointId)
        throws IOException {

    // and open the crawl log file ...
    RandomAccessFile inputStream = null;

    IOException exception = null;

    CRC32 crc = new CRC32();
    CustomByteArrayOutputStream buffer = new CustomByteArrayOutputStream(1 << 17);
    byte[] syncBytesBuffer = new byte[SYNC_BYTES_SIZE];

    // save position for potential debug output.
    long lastReadPosition = 0;

    try {
        inputStream = new RandomAccessFile(crawlLogPath, "rw");

        // and a data input stream ...
        RandomAccessFile reader = inputStream;
        // seek to zero
        reader.seek(0L);

        // read the header ...
        LogFileHeader header = readLogFileHeader(reader);

        // read a crawl url from the stream...
        while (inputStream.getFilePointer() < header._fileSize) {

            if (seekToNextSyncBytesPos(syncBytesBuffer, reader, header._fileSize)) {

                try {
                    lastReadPosition = inputStream.getFilePointer();

                    // skip sync
                    inputStream.skipBytes(SYNC_BYTES_SIZE);

                    // read length ...
                    int urlDataLen = reader.readInt();
                    long urlDataCRC = reader.readLong();

                    if (urlDataLen > buffer.getBuffer().length) {
                        buffer = new CustomByteArrayOutputStream(((urlDataLen / 65536) + 1) * 65536);
                    }
                    reader.read(buffer.getBuffer(), 0, urlDataLen);
                    crc.reset();
                    crc.update(buffer.getBuffer(), 0, urlDataLen);

                    long computedValue = crc.getValue();

                    // validate crc values ...
                    if (computedValue != urlDataCRC) {
                        LOG.error("CRC Mismatch Detected during HDFS transfer in CrawlLog:"
                                + crawlLogPath.getAbsolutePath() + " Checkpoint Id:" + checkpointId
                                + " FilePosition:" + lastReadPosition);
                        // resync: step past the bad position and scan for the next sync marker
                        inputStream.seek(lastReadPosition + 1);
                    } else {
                        // allocate a crawl url data structure
                        CrawlURL url = new CrawlURL();
                        DataInputStream bufferReader = new DataInputStream(
                                new ByteArrayInputStream(buffer.getBuffer(), 0, urlDataLen));
                        // populate it from the (in memory) data stream
                        url.readFields(bufferReader);

                        try {
                            // and write out appropriate sequence file entries ...
                            writer.writeCrawlURLItem(new Text(url.getUrl()), url);
                        } catch (IOException e) {
                            LOG.error("Failed to write CrawlURL to SequenceFileWriter with Exception:"
                                    + CCStringUtils.stringifyException(e));
                            throw new URLWriterException();
                        }
                    }
                } catch (URLWriterException e) {
                    LOG.error("Caught URLWriter Exception! - Throwing to outer layer!");
                    throw e;
                } catch (Exception e) {
                    LOG.error("Ignoring Error Processing CrawlLog Entry at Position:" + lastReadPosition
                            + " Exception:" + CCStringUtils.stringifyException(e));
                }
            } else {
                break;
            }
        }
    } catch (EOFException e) {
        LOG.error("Caught EOF Exception during read of local CrawlLog:" + crawlLogPath.getAbsolutePath()
                + " Checkpoint Id:" + checkpointId + " FilePosition:" + lastReadPosition);
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        exception = e;
        throw e;
    } finally {
        if (inputStream != null)
            inputStream.close();
    }
}
From source file:ee.sk.digidoc.SignedDoc.java
/**
 * Writes the SignedDoc to an output stream
 * and automatically calculates DataFile sizes
 * and digests
 * @param os output stream
 * @throws DigiDocException for all errors
 */
public void writeToStream(OutputStream os/*, File fTempSdoc*/) throws DigiDocException {
    DigiDocException ex1 = validateFormatAndVersion();
    if (ex1 != null)
        throw ex1;
    try {
        DigiDocXmlGenFactory genFac = new DigiDocXmlGenFactory(this);
        if (m_format.equals(SignedDoc.FORMAT_BDOC)) {
            ZipArchiveOutputStream zos = new ZipArchiveOutputStream(os);
            zos.setEncoding("UTF-8");
            if (m_logger.isDebugEnabled())
                m_logger.debug("OS: " + ((os != null) ? "OK" : "NULL"));
            // write mimetype
            if (m_logger.isDebugEnabled())
                m_logger.debug("Writing: " + MIMET_FILE_NAME);
            ZipArchiveEntry ze = new ZipArchiveEntry(MIMET_FILE_NAME);
            if (m_comment == null)
                m_comment = DigiDocGenFactory.getUserInfo(m_format, m_version);
            ze.setComment(m_comment);
            // the mimetype entry is STORED, so size and CRC must be set before writing
            ze.setMethod(ZipArchiveEntry.STORED);
            java.util.zip.CRC32 crc = new java.util.zip.CRC32();
            if (m_version.equals(BDOC_VERSION_1_0)) {
                ze.setSize(SignedDoc.MIMET_FILE_CONTENT_10.getBytes().length);
                crc.update(SignedDoc.MIMET_FILE_CONTENT_10.getBytes());
            }
            if (m_version.equals(BDOC_VERSION_1_1)) {
                ze.setSize(SignedDoc.MIMET_FILE_CONTENT_11.getBytes().length);
                crc.update(SignedDoc.MIMET_FILE_CONTENT_11.getBytes());
            }
            if (m_version.equals(BDOC_VERSION_2_1)) {
                ze.setSize(SignedDoc.MIMET_FILE_CONTENT_20.getBytes().length);
                crc.update(SignedDoc.MIMET_FILE_CONTENT_20.getBytes());
            }
            ze.setCrc(crc.getValue());
            zos.putArchiveEntry(ze);
            if (m_version.equals(BDOC_VERSION_1_0)) {
                zos.write(SignedDoc.MIMET_FILE_CONTENT_10.getBytes());
            }
            if (m_version.equals(BDOC_VERSION_1_1)) {
                zos.write(SignedDoc.MIMET_FILE_CONTENT_11.getBytes());
            }
            if (m_version.equals(BDOC_VERSION_2_1)) {
                zos.write(SignedDoc.MIMET_FILE_CONTENT_20.getBytes());
            }
            zos.closeArchiveEntry();
            // write manifest.xml
            if (m_logger.isDebugEnabled())
                m_logger.debug("Writing: " + MANIF_FILE_NAME);
            ze = new ZipArchiveEntry(MANIF_DIR_META_INF);
            ze = new ZipArchiveEntry(MANIF_FILE_NAME);
            ze.setComment(DigiDocGenFactory.getUserInfo(m_format, m_version));
            zos.putArchiveEntry(ze);
            //if(m_logger.isDebugEnabled())
            //    m_logger.debug("Writing manif:\n" + m_manifest.toString());
            zos.write(m_manifest.toXML());
            zos.closeArchiveEntry();
            // write data files
            for (int i = 0; i < countDataFiles(); i++) {
                DataFile df = getDataFile(i);
                if (m_logger.isDebugEnabled())
                    m_logger.debug("Writing DF: " + df.getFileName() + " content: " + df.getContentType()
                            + " df-cache: "
                            + ((df.getDfCacheFile() != null) ? df.getDfCacheFile().getAbsolutePath() : "NONE"));
                InputStream is = null;
                if (df.hasAccessToDataFile())
                    is = df.getBodyAsStream();
                else
                    is = findDataFileAsStream(df.getFileName());
                if (is != null) {
                    File dfFile = new File(df.getFileName());
                    String fileName = dfFile.getName();
                    ze = new ZipArchiveEntry(fileName);
                    if (df.getComment() == null)
                        df.setComment(DigiDocGenFactory.getUserInfo(m_format, m_version));
                    ze.setComment(df.getComment());
                    ze.setSize(dfFile.length());
                    ze.setTime((df.getLastModDt() != null) ? df.getLastModDt().getTime() : dfFile.lastModified());
                    zos.putArchiveEntry(ze);
                    byte[] data = new byte[2048];
                    int nRead = 0, nTotal = 0;
                    crc = new java.util.zip.CRC32();
                    while ((nRead = is.read(data)) > 0) {
                        zos.write(data, 0, nRead);
                        nTotal += nRead;
                        crc.update(data, 0, nRead);
                    }
                    ze.setSize(nTotal);
                    ze.setCrc(crc.getValue());
                    zos.closeArchiveEntry();
                    is.close();
                }
            }
            // write signatures
            for (int i = 0; i < countSignatures(); i++) {
                Signature sig = getSignature(i);
                String sFileName = sig.getPath();
                if (sFileName == null) {
                    if (m_version.equals(BDOC_VERSION_2_1))
                        sFileName = SIG_FILE_NAME_20 + (i + 1) + ".xml";
                    else
                        sFileName = SIG_FILE_NAME + (i + 1) + ".xml";
                }
                if (!sFileName.startsWith("META-INF"))
                    sFileName = "META-INF/" + sFileName;
                if (m_logger.isDebugEnabled())
                    m_logger.debug("Writing SIG: " + sFileName + " orig: "
                            + ((sig.getOrigContent() != null) ? "OK" : "NULL"));
                ze = new ZipArchiveEntry(sFileName);
                if (sig.getComment() == null)
                    sig.setComment(DigiDocGenFactory.getUserInfo(m_format, m_version));
                ze.setComment(sig.getComment());
                String sSig = null;
                if (sig.getOrigContent() != null)
                    sSig = new String(sig.getOrigContent(), "UTF-8");
                else
                    sSig = sig.toString();
                if (sSig != null && !sSig.startsWith("<?xml"))
                    sSig = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + sSig;
                byte[] sdata = sSig.getBytes("UTF-8");
                if (m_logger.isDebugEnabled())
                    m_logger.debug("Writing SIG: " + sFileName + " xml:\n---\n "
                            + ((sSig != null) ? sSig : "NULL") + "\n---\n ");
                ze.setSize(sdata.length);
                crc = new java.util.zip.CRC32();
                crc.update(sdata);
                ze.setCrc(crc.getValue());
                zos.putArchiveEntry(ze);
                zos.write(sdata);
                zos.closeArchiveEntry();
            }
            zos.close();
        } else if (m_format.equals(SignedDoc.FORMAT_DIGIDOC_XML)) { // ddoc format
            os.write(xmlHeader().getBytes());
            for (int i = 0; i < countDataFiles(); i++) {
                DataFile df = getDataFile(i);
                df.writeToFile(os);
                os.write("\n".getBytes());
            }
            for (int i = 0; i < countSignatures(); i++) {
                Signature sig = getSignature(i);
                if (sig.getOrigContent() != null)
                    os.write(sig.getOrigContent());
                else
                    os.write(genFac.signatureToXML(sig));
                os.write("\n".getBytes());
            }
            os.write(xmlTrailer().getBytes());
        }
    } catch (DigiDocException ex) {
        throw ex; // already handled
    } catch (Exception ex) {
        DigiDocException.handleException(ex, DigiDocException.ERR_WRITE_FILE);
    }
}
From source file:org.anarres.lzo.LzopInputStream.java
/**
 * Read and verify an lzo header, setting relevant block checksum options
 * and ignoring most everything else.
 */
protected int readHeader() throws IOException {
    byte[] buf = new byte[9];
    readBytes(buf, 0, 9);
    if (!Arrays.equals(buf, LzopConstants.LZOP_MAGIC))
        throw new IOException("Invalid LZO header");
    Arrays.fill(buf, (byte) 0);
    Adler32 adler = new Adler32();
    CRC32 crc32 = new CRC32();
    int hitem = readHeaderItem(buf, 2, adler, crc32); // lzop version
    if (hitem > LzopConstants.LZOP_VERSION) {
        LOG.debug("Compressed with later version of lzop: " + Integer.toHexString(hitem) + " (expected 0x"
                + Integer.toHexString(LzopConstants.LZOP_VERSION) + ")");
    }
    hitem = readHeaderItem(buf, 2, adler, crc32); // lzo library version
    if (hitem > LzoVersion.LZO_LIBRARY_VERSION) {
        throw new IOException("Compressed with incompatible lzo version: 0x" + Integer.toHexString(hitem)
                + " (expected 0x" + Integer.toHexString(LzoVersion.LZO_LIBRARY_VERSION) + ")");
    }
    hitem = readHeaderItem(buf, 2, adler, crc32); // lzop extract version
    if (hitem > LzopConstants.LZOP_VERSION) {
        throw new IOException("Compressed with incompatible lzop version: 0x" + Integer.toHexString(hitem)
                + " (expected 0x" + Integer.toHexString(LzopConstants.LZOP_VERSION) + ")");
    }
    hitem = readHeaderItem(buf, 1, adler, crc32); // method
    switch (hitem) {
    case LzopConstants.M_LZO1X_1:
    case LzopConstants.M_LZO1X_1_15:
    case LzopConstants.M_LZO1X_999:
        break;
    default:
        throw new IOException("Invalid strategy " + Integer.toHexString(hitem));
    }
    readHeaderItem(buf, 1, adler, crc32); // ignore level

    // flags
    int flags = readHeaderItem(buf, 4, adler, crc32);
    boolean useCRC32 = (flags & LzopConstants.F_H_CRC32) != 0;
    boolean extraField = (flags & LzopConstants.F_H_EXTRA_FIELD) != 0;
    if ((flags & LzopConstants.F_MULTIPART) != 0)
        throw new IOException("Multipart lzop not supported");
    if ((flags & LzopConstants.F_H_FILTER) != 0)
        throw new IOException("lzop filter not supported");
    if ((flags & LzopConstants.F_RESERVED) != 0)
        throw new IOException("Unknown flags in header");
    // known !F_H_FILTER, so no optional block

    readHeaderItem(buf, 4, adler, crc32); // ignore mode
    readHeaderItem(buf, 4, adler, crc32); // ignore mtime
    readHeaderItem(buf, 4, adler, crc32); // ignore gmtdiff
    hitem = readHeaderItem(buf, 1, adler, crc32); // fn len
    if (hitem > 0) {
        byte[] tmp = (hitem > buf.length) ? new byte[hitem] : buf;
        readHeaderItem(tmp, hitem, adler, crc32); // skip filename
    }
    int checksum = (int) (useCRC32 ? crc32.getValue() : adler.getValue());
    hitem = readHeaderItem(buf, 4, adler, crc32); // read checksum
    if (hitem != checksum) {
        throw new IOException("Invalid header checksum: " + Long.toHexString(checksum) + " (expected 0x"
                + Integer.toHexString(hitem) + ")");
    }
    if (extraField) { // lzop 1.08 ultimately ignores this
        LOG.debug("Extra header field not processed");
        adler.reset();
        crc32.reset();
        hitem = readHeaderItem(buf, 4, adler, crc32);
        readHeaderItem(new byte[hitem], hitem, adler, crc32);
        checksum = (int) (useCRC32 ? crc32.getValue() : adler.getValue());
        if (checksum != readHeaderItem(buf, 4, adler, crc32)) {
            throw new IOException("Invalid checksum for extra header field");
        }
    }
    return flags;
}
From source file:org.commoncrawl.service.crawler.CrawlLog.java
public static void walkCrawlLogFile(File crawlLogPath, long startOffset) throws IOException {

    // and open the crawl log file ...
    RandomAccessFile inputStream = null;

    IOException exception = null;

    CRC32 crc = new CRC32();
    CustomByteArrayOutputStream buffer = new CustomByteArrayOutputStream(1 << 17);
    byte[] syncBytesBuffer = new byte[SYNC_BYTES_SIZE];

    // save position for potential debug output.
    long lastReadPosition = 0;

    try {
        inputStream = new RandomAccessFile(crawlLogPath, "rw");

        // and a data input stream ...
        RandomAccessFile reader = inputStream;
        // seek to zero
        reader.seek(0L);

        // read the header ...
        LogFileHeader header = readLogFileHeader(reader);

        System.out.println("Header ItemCount:" + header._itemCount + " FileSize:" + header._fileSize);

        if (startOffset != 0L) {
            System.out.println("Preseeking to:" + startOffset);
            reader.seek(startOffset);
        }

        Configuration conf = new Configuration();

        // read a crawl url from the stream...
        long recordCount = 0;
        while (inputStream.getFilePointer() < header._fileSize) {
            // System.out.println("PRE-SYNC SeekPos:" + inputStream.getFilePointer());
            if (seekToNextSyncBytesPos(syncBytesBuffer, reader, header._fileSize)) {
                // System.out.println("POST-SYNC SeekPos:" + inputStream.getFilePointer());
                lastReadPosition = inputStream.getFilePointer();

                // skip sync
                inputStream.skipBytes(SYNC_BYTES_SIZE);

                // read length ...
                int urlDataLen = reader.readInt();
                long urlDataCRC = reader.readLong();

                if (urlDataLen > buffer.getBuffer().length) {
                    buffer = new CustomByteArrayOutputStream(((urlDataLen / 65536) + 1) * 65536);
                }
                reader.read(buffer.getBuffer(), 0, urlDataLen);
                crc.reset();
                crc.update(buffer.getBuffer(), 0, urlDataLen);

                long computedValue = crc.getValue();

                // validate crc values ...
                if (computedValue != urlDataCRC) {
                    LOG.error("CRC Mismatch Detected during HDFS transfer in CrawlLog:"
                            + crawlLogPath.getAbsolutePath() + " FilePosition:" + lastReadPosition);
                    inputStream.seek(lastReadPosition + 1);
                } else {
                    if (recordCount++ % 10000 == 0) {
                        // allocate a crawl url data structure
                        CrawlURL url = new CrawlURL();
                        DataInputStream bufferReader = new DataInputStream(
                                new ByteArrayInputStream(buffer.getBuffer(), 0, urlDataLen));
                        // populate it from the (in memory) data stream
                        url.readFields(bufferReader);

                        System.out.println("Record:" + recordCount + " At:" + lastReadPosition + " URL:"
                                + url.getUrl() + " BuffSize:" + urlDataLen + " ContentLen:"
                                + url.getContentRaw().getCount() + " LastModified:"
                                + new Date(url.getLastAttemptTime()).toString());
                    }
                }
            } else {
                break;
            }
        }
    } catch (EOFException e) {
        LOG.error("Caught EOF Exception during read of local CrawlLog:" + crawlLogPath.getAbsolutePath()
                + " FilePosition:" + lastReadPosition);
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        exception = e;
        throw e;
    } finally {
        if (inputStream != null)
            inputStream.close();
    }
}