Java tutorial: NetFlowReader (com.github.sadikovi.netflowlib)
/*
 * Copyright 2016 sadikovi
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.sadikovi.netflowlib;

import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteOrder;
import java.util.HashMap;

import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.github.sadikovi.netflowlib.ScanPlanner;
import com.github.sadikovi.netflowlib.Strategies.ScanStrategy;
import com.github.sadikovi.netflowlib.Buffers.RecordBuffer;
import com.github.sadikovi.netflowlib.Buffers.EmptyRecordBuffer;
import com.github.sadikovi.netflowlib.Buffers.FilterRecordBuffer;
import com.github.sadikovi.netflowlib.Buffers.ScanRecordBuffer;
import com.github.sadikovi.netflowlib.predicate.Columns.Column;
import com.github.sadikovi.netflowlib.predicate.Operators.FilterPredicate;
import com.github.sadikovi.netflowlib.statistics.Statistics;
import com.github.sadikovi.netflowlib.statistics.StatisticsTypes.LongStatistics;
import com.github.sadikovi.netflowlib.version.NetFlow;
import com.github.sadikovi.netflowlib.version.NetFlowV5;
import com.github.sadikovi.netflowlib.version.NetFlowV7;

/**
 * [[NetFlowReader]] is the main entry point for processing an input stream of a NetFlow file,
 * either from a local file system or HDFS. Provides an API to retrieve the header and other
 * metadata before scanning records. Uses statistics and planning based on [[ScanPlanner]] to
 * decide whether or not the file needs to be scanned.
 * [[FilterPredicate]] support is introduced to filter data on a per-row basis.
 */
public final class NetFlowReader {
  private static Logger log = LoggerFactory.getLogger(NetFlowReader.class);

  // Internal byte offsets
  private static final short METADATA_LENGTH = 4;
  private static final short HEADER_OFFSET_LENGTH = 4;
  // Header check flags
  private static final short HEADER_MAGIC1 = 0xCF;
  private static final short HEADER_MAGIC2 = 0x10; // cisco flow
  // Byte order encoding
  private static final short HEADER_LITTLE_ENDIAN = 1;
  private static final short HEADER_BIG_ENDIAN = 2;

  /**
   * Initialize reader with input stream and provided buffer size, see
   * com.github.sadikovi.netflowlib.Buffers for more information on buffer size constants.
   * @param inputStream input stream, can be Hadoop FSDataInputStream
   * @param buffer buffer size in bytes
   * @param ignoreCorruptFile if true, ignores corrupt file, either when reading header or data
   * @return reader
   */
  public static NetFlowReader prepareReader(
      DataInputStream inputStream,
      int buffer,
      boolean ignoreCorruptFile) throws IOException {
    return new NetFlowReader(inputStream, buffer, ignoreCorruptFile);
  }

  /**
   * Initialize reader with input stream and buffer size. By default, fails if the file is
   * corrupt, e.g. the file is not a NetFlow file, or has a corrupt data block.
   * @param inputStream input stream, can be Hadoop FSDataInputStream
   * @param buffer buffer size in bytes
   * @return reader
   */
  public static NetFlowReader prepareReader(
      DataInputStream inputStream,
      int buffer) throws IOException {
    return prepareReader(inputStream, buffer, false);
  }

  /**
   * Initialize reader with input stream and default buffer size (~1 MB). By default, fails if
   * the file is corrupt, e.g. the file is not a NetFlow file, or has a corrupt data block.
   * @param inputStream input stream, can be Hadoop FSDataInputStream
   * @return reader
   */
  public static NetFlowReader prepareReader(DataInputStream inputStream) throws IOException {
    return prepareReader(inputStream, RecordBuffer.BUFFER_LENGTH_2, false);
  }

  /**
   * [[NetFlowReader]] provides an interface to get the parsed header and a record buffer with a
   * strategy chosen based on columns, predicate and statistics. Metadata and header are parsed
   * as part of initialization.
   */
  private NetFlowReader(
      DataInputStream inputStream,
      int buffer,
      boolean ignoreCorruptFile) throws IOException {
    in = inputStream;
    bufferLength = buffer;
    ignoreCorrupt = ignoreCorruptFile;
    byte[] metadata = null;
    ByteBuf buf = null;

    try {
      metadata = new byte[METADATA_LENGTH];
      in.read(metadata, 0, METADATA_LENGTH);

      // Parse metadata, byte order does not really matter, so we go for big endian. Metadata
      // contains magic numbers to verify consistency of the NetFlow file, byte order encoded as
      // either 1 or 2, and stream version which affects header parsing (currently only 1 and 3
      // are supported).
      buf = Unpooled.wrappedBuffer(metadata).order(ByteOrder.BIG_ENDIAN);
      short magic1 = buf.getUnsignedByte(0);
      short magic2 = buf.getUnsignedByte(1);
      short order = buf.getUnsignedByte(2);
      short stream = buf.getUnsignedByte(3);

      // Verify consistency of the NetFlow file, this also ensures that we are at the beginning
      // of the input stream
      if (magic1 != HEADER_MAGIC1 || magic2 != HEADER_MAGIC2) {
        throw new IOException("Corrupt NetFlow file. Wrong magic number");
      }

      // Resolve byte order, the last case corresponds to an incorrect read from the buffer
      if (order == HEADER_BIG_ENDIAN) {
        byteOrder = ByteOrder.BIG_ENDIAN;
      } else if (order == HEADER_LITTLE_ENDIAN) {
        byteOrder = ByteOrder.LITTLE_ENDIAN;
      } else {
        throw new IOException("Could not recognize byte order " + order);
      }

      streamVersion = stream;
      // Check stream version
      ensureStreamVersion();
      // Read header
      header = getHeader();
    } catch (IOException err) {
      if (ignoreCorrupt) {
        // Subsume the exception and log a warning; set header to a corrupt header stub
        log.warn("Failed to initialize reader, ignoreCorruptFile=" + ignoreCorrupt +
          ", error=" + err);
        header = new CorruptNetFlowHeader();
      } else {
        throw err;
      }
    } finally {
      metadata = null;
      if (buf != null) {
        buf.release();
        buf = null;
      }
    }
  }

  /** Ensure that stream version is either version 1 or version 3 */
  private void ensureStreamVersion() throws UnsupportedOperationException {
    if (streamVersion != 1 && streamVersion != 3) {
      throw new UnsupportedOperationException("Unsupported stream version " + streamVersion);
    }
  }

  /** Prepare header using provided input stream */
  private NetFlowHeader prepareHeader() throws IOException {
    NetFlowHeader internalHeader;
    int numBytesRead = 0;
    int lenRead = 0;
    ByteBuf buf;
    byte[] headerArray;

    // Read header depending on stream version (different from flow version)
    if (streamVersion == 1) {
      // Version 1 has static header
      // TODO: verify header size for stream version 1
      lenRead = NetFlowHeader.S1_HEADER_SIZE - METADATA_LENGTH;
      internalHeader = new NetFlowHeader(streamVersion, byteOrder);
    } else {
      // Version 3 with dynamic header size
      headerArray = new byte[HEADER_OFFSET_LENGTH];
      numBytesRead = in.read(headerArray, 0, HEADER_OFFSET_LENGTH);
      if (numBytesRead != HEADER_OFFSET_LENGTH) {
        throw new UnsupportedOperationException("Short read while loading header offset");
      }
      buf = Unpooled.wrappedBuffer(headerArray).order(byteOrder);
      int headerSize = (int) buf.getUnsignedInt(0);
      if (headerSize <= 0) {
        throw new UnsupportedOperationException("Failed to load header of size " + headerSize);
      }
      // Actual header length, determine how many bytes to read
      lenRead = headerSize - METADATA_LENGTH - HEADER_OFFSET_LENGTH;
      internalHeader = new NetFlowHeader(streamVersion, byteOrder, headerSize);
    }

    // allocate buffer for length to read
    headerArray = new byte[lenRead];
    numBytesRead = in.read(headerArray, 0, lenRead);
    if (numBytesRead != lenRead) {
      throw new UnsupportedOperationException("Short read while loading header data");
    }
    // build buffer
    buf = Unpooled.wrappedBuffer(headerArray).order(byteOrder);

    // resolve stream version (either 1 or 3)
    if (streamVersion == 1) {
      internalHeader.setFlowVersion((short) buf.getUnsignedShort(0));
      internalHeader.setStartCapture(buf.getUnsignedInt(2));
      internalHeader.setEndCapture(buf.getUnsignedInt(6));
      internalHeader.setHeaderFlags(buf.getUnsignedInt(10));
      internalHeader.setRotation(buf.getUnsignedInt(14));
      internalHeader.setNumFlows(buf.getUnsignedInt(18));
      internalHeader.setNumDropped(buf.getUnsignedInt(22));
      internalHeader.setNumMisordered(buf.getUnsignedInt(26));

      // Read hostname fixed bytes
      byte[] hostnameBytes = new byte[NetFlowHeader.S1_HEADER_HN_LEN];
      buf.getBytes(30, hostnameBytes, 0, hostnameBytes.length);
      internalHeader.setHostname(new String(hostnameBytes));

      // Read comments fixed bytes
      byte[] commentsBytes = new byte[NetFlowHeader.S1_HEADER_CMNT_LEN];
      buf.getBytes(30 + hostnameBytes.length, commentsBytes, 0, commentsBytes.length);
      internalHeader.setComments(new String(commentsBytes));

      // Dereference arrays
      hostnameBytes = null;
      commentsBytes = null;
    } else {
      // Resolve TLV (type-length value)
      // Set decode pointer to first tlv
      int dp = 0;
      int left = lenRead;
      // Smallest TLV is 2+2+0 (null TLV)
      // tlv_t - TLV type, tlv_l - TLV length, tlv_v - TLV value
      int tlv_t = 0;
      int tlv_l = 0;
      int tlv_v = 0;
      // Byte array for holding Strings
      byte[] pr;

      while (left >= 4) {
        // Parse type, store in host byte order
        tlv_t = buf.getUnsignedShort(dp);
        dp += 2;
        left -= 2;

        // Parse len, store in host byte order
        tlv_l = buf.getUnsignedShort(dp);
        dp += 2;
        left -= 2;

        // Parse val
        tlv_v = dp;

        // Point decode buffer at next tlv
        dp += tlv_l;
        left -= tlv_l;

        // TLV length check
        if (left < 0) {
          break;
        }

        switch (tlv_t) {
          // FT_TLV_VENDOR
          case 0x1:
            internalHeader.setVendor(buf.getUnsignedShort(tlv_v));
            break;
          // FT_TLV_EX_VER
          case 0x2:
            internalHeader.setFlowVersion((short) buf.getUnsignedShort(tlv_v));
            break;
          // FT_TLV_AGG_VER
          case 0x3:
            internalHeader.setAggVersion(buf.getUnsignedByte(tlv_v));
            break;
          // FT_TLV_AGG_METHOD
          case 0x4:
            internalHeader.setAggMethod(buf.getUnsignedByte(tlv_v));
            break;
          // FT_TLV_EXPORTER_IP
          case 0x5:
            internalHeader.setExporterIP(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_CAP_START
          case 0x6:
            internalHeader.setStartCapture(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_CAP_END
          case 0x7:
            internalHeader.setEndCapture(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_HEADER_FLAGS
          case 0x8:
            internalHeader.setHeaderFlags(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_ROT_SCHEDULE
          case 0x9:
            internalHeader.setRotation(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_FLOW_COUNT
          case 0xA:
            internalHeader.setNumFlows(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_FLOW_LOST
          case 0xB:
            internalHeader.setNumDropped(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_FLOW_MISORDERED
          case 0xC:
            internalHeader.setNumMisordered(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_PKT_CORRUPT
          case 0xD:
            internalHeader.setNumCorrupt(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_SEQ_RESET
          case 0xE:
            internalHeader.setSeqReset(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_CAP_HOSTNAME
          case 0xF:
            pr = new byte[tlv_l];
            buf.getBytes(tlv_v, pr, 0, pr.length);
            // Expected null-terminated string
            if (pr[pr.length - 1] != 0) {
              throw new UnsupportedOperationException("Char sequence is not null-terminated");
            }
            internalHeader.setHostname(new String(pr, 0, pr.length - 1));
            break;
          // FT_TLV_COMMENTS
          case 0x10:
            pr = new byte[tlv_l];
            buf.getBytes(tlv_v, pr, 0, pr.length);
            // Expected null-terminated string
            if (pr[pr.length - 1] != 0) {
              throw new UnsupportedOperationException("Char sequence is not null-terminated");
            }
            internalHeader.setComments(new String(pr, 0, pr.length - 1));
            break;
          // FT_TLV_IF_NAME
          case 0x11:
            // uint32_t, uint16_t, string:
            // - IP address of device
            // - ifIndex of interface
            // - interface name
            long ip = buf.getUnsignedInt(tlv_v);
            int ifIndex = buf.getUnsignedShort(tlv_v + 4);
            pr = new byte[tlv_l - 4 - 2];
            buf.getBytes(tlv_v + 4 + 2, pr, 0, pr.length);
            if (pr[pr.length - 1] != 0) {
              throw new UnsupportedOperationException("Char sequence is not null-terminated");
            }
            internalHeader.setInterfaceName(ip, ifIndex, new String(pr, 0, pr.length - 1));
            break;
          // FT_TLV_IF_ALIAS
          case 0x12:
            // uint32_t, uint16_t, uint16_t, string:
            // - IP address of device
            // - ifIndex count
            // - ifIndex of interface (count times)
            // - alias name
            long aliasIP = buf.getUnsignedInt(tlv_v);
            int aliasIfIndexCnt = buf.getUnsignedShort(tlv_v + 4);
            int aliasIfIndex = buf.getUnsignedShort(tlv_v + 4 + 2);
            pr = new byte[tlv_l - 4 - 2 - 2];
            buf.getBytes(tlv_v + 4 + 2 + 2, pr, 0, pr.length);
            if (pr[pr.length - 1] != 0) {
              throw new UnsupportedOperationException("Char sequence is not null-terminated");
            }
            internalHeader.setInterfaceAlias(aliasIP, aliasIfIndexCnt, aliasIfIndex,
              new String(pr, 0, pr.length - 1));
            break;
          // Case 0x0
          default:
            break;
        }
      }

      if (buf != null && buf.refCnt() > 0) {
        buf.release(buf.refCnt());
      }
      buf = null;
      pr = null;
    }

    return internalHeader;
  }

  /** Return NetFlow header for current input stream */
  public NetFlowHeader getHeader() throws IOException {
    if (header == null) {
      header = prepareHeader();
    }
    return header;
  }

  /** Prepare record buffer for full scan */
  public RecordBuffer prepareRecordBuffer(Column[] columns) {
    return prepareRecordBuffer(columns, null);
  }

  /** Prepare record buffer with default statistics on time */
  public RecordBuffer prepareRecordBuffer(Column[] columns, FilterPredicate predicate) {
    return prepareRecordBuffer(columns, predicate, null);
  }

  /** Prepare record buffer based on input stream */
  public RecordBuffer prepareRecordBuffer(
      Column[] columns,
      FilterPredicate predicate,
      HashMap<Column, Statistics> stats) {
    // Since we are using statistics on a field, we have to make sure that it is initialized
    // properly
    if (stats == null) {
      stats = new HashMap<Column, Statistics>();
    }

    // Find an appropriate strategy for the set of columns and predicate. We also update
    // statistics with start and end capture time of the file.
    NetFlow flowInterface;
    if (header.getFlowVersion() == 5) {
      flowInterface = new NetFlowV5();
      stats.put(NetFlowV5.FIELD_UNIX_SECS,
        new LongStatistics(header.getStartCapture(), header.getEndCapture()));
    } else if (header.getFlowVersion() == 7) {
      flowInterface = new NetFlowV7();
      stats.put(NetFlowV7.FIELD_UNIX_SECS,
        new LongStatistics(header.getStartCapture(), header.getEndCapture()));
    } else {
      throw new UnsupportedOperationException("Version " + header.getFlowVersion() +
        " is not supported");
    }

    ScanStrategy strategy = ScanPlanner.buildStrategy(columns, predicate, stats);
    return prepareRecordBuffer(strategy, flowInterface);
  }

  // Prepare record buffer based on strategy and flow interface. Method is currently private, so
  // there is no option to pass a custom scan strategy.
  private RecordBuffer prepareRecordBuffer(ScanStrategy strategy, NetFlow flowInterface) {
    if (strategy == null) {
      throw new IllegalArgumentException("Expected ScanStrategy instance, got null");
    }

    if (flowInterface == null) {
      throw new IllegalArgumentException("Expected NetFlow instance, got null");
    }

    // Depending on the strategy we either skip the file directly, return a full buffer of
    // records, or return a filtering buffer, if there is a FilterScan.
    boolean isCompressed = header.isCompressed();
    int recordSize = flowInterface.recordSize();

    if (strategy.skipScan()) {
      log.info("Skip scan based on strategy " + strategy);
      return new EmptyRecordBuffer();
    } else if (strategy.fullScan()) {
      log.info("Full scan based on strategy " + strategy + ", ignoreCorrupt=" + ignoreCorrupt);
      // wrap into closeable iterator
      return new ScanRecordBuffer(in, strategy.getRecordMaterializer(), recordSize, byteOrder,
        isCompressed, bufferLength, ignoreCorrupt);
    } else {
      log.info("Filter scan based on strategy " + strategy + ", ignoreCorrupt=" + ignoreCorrupt);
      return new FilterRecordBuffer(in, strategy.getRecordMaterializer(), recordSize, byteOrder,
        isCompressed, bufferLength, ignoreCorrupt);
    }
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "[byte order: " + byteOrder +
      ", stream version: " + streamVersion + ", buffer length: " + bufferLength + "]";
  }

  /** Return size of buffer in bytes that is currently used by reader */
  public int getBufferLength() {
    return this.bufferLength;
  }

  /**
   * Whether or not the reader is valid, currently based on the validity of the header, assuming
   * that the file is of correct format, but might still have corrupt data blocks. See buffers
   * implementation for usage of `ignoreCorrupt`.
   */
  public boolean isValid() {
    return header.isValid();
  }

  // Stream of the NetFlow file
  private final DataInputStream in;
  // Byte order of the file
  private ByteOrder byteOrder;
  // Stream version of the file
  private short streamVersion;
  // Buffer size for record buffer
  private final int bufferLength;
  // NetFlow header
  private NetFlowHeader header = null;
  // Whether or not to ignore corrupt file stream
  private final boolean ignoreCorrupt;
}
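
The class above is the complete reader source. Below is a minimal usage sketch, not part of the
library: it assumes a NetFlow v5 file on the local file system (the path is illustrative) and
that the RecordBuffer returned by prepareRecordBuffer can be iterated as Object[] records, one
value per selected column; consult the Buffers classes for the exact iteration contract and
buffer-size constants.

import java.io.DataInputStream;
import java.io.FileInputStream;

import com.github.sadikovi.netflowlib.Buffers.RecordBuffer;
import com.github.sadikovi.netflowlib.NetFlowReader;
import com.github.sadikovi.netflowlib.predicate.Columns.Column;
import com.github.sadikovi.netflowlib.version.NetFlowV5;

public class NetFlowReaderExample {
  public static void main(String[] args) throws Exception {
    // Illustrative path, replace with a real NetFlow v5 file
    String path = "/tmp/netflow-v5-file";

    try (DataInputStream in = new DataInputStream(new FileInputStream(path))) {
      // Default buffer size (~1 MB); pass ignoreCorruptFile=true to tolerate corrupt files
      NetFlowReader reader = NetFlowReader.prepareReader(in);
      System.out.println("Valid header: " + reader.isValid());
      System.out.println("Flow version: " + reader.getHeader().getFlowVersion());

      // Full scan over a single column, no predicate and no extra statistics
      Column[] columns = new Column[] { NetFlowV5.FIELD_UNIX_SECS };
      RecordBuffer buffer = reader.prepareRecordBuffer(columns);

      // Assumption: RecordBuffer is iterable over Object[] records in column order
      for (Object[] record : buffer) {
        System.out.println("unix_secs = " + record[0]);
      }
    }
  }
}

Since the reader consumes the stream while scanning, a new DataInputStream (and a new reader)
is needed for each pass over the same file.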