nl.sidn.pcap.parquet.AbstractParquetPacketWriter.java Source code


Introduction

Here is the source code for nl.sidn.pcap.parquet.AbstractParquetPacketWriter.java, an abstract base class in ENTRADA (SIDN's big data platform for network data analytics) that writes packet data to Parquet files using the Kite SDK.

Source

/*
 * ENTRADA, a big data platform for network data analytics
 *
 * Copyright (C) 2016 SIDN [https://www.sidn.nl]
 * 
 * This file is part of ENTRADA.
 * 
 * ENTRADA is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * ENTRADA is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with ENTRADA.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package nl.sidn.pcap.parquet;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import nl.sidn.pcap.support.PacketCombination;
import nl.sidn.pcap.util.GeoLookupUtil;
import nl.sidn.pcap.util.Settings;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Formats;
import org.kitesdk.data.PartitionStrategy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public abstract class AbstractParquetPacketWriter {

    protected static final Logger LOGGER = LoggerFactory.getLogger(AbstractParquetPacketWriter.class);

    // show status every 100k rows
    protected static final int STATUS_COUNT = 100000;

    protected int packetCounter;
    //writer vars
    protected DatasetDescriptor descriptor;
    protected DatasetWriter<GenericRecord> writer;
    protected String repoLocation;
    protected String schema;
    protected String repoName;
    //meta info
    protected GeoLookupUtil geoLookup;
    protected Map<String, String> geo_ip_cache = new HashMap<>();
    protected Map<String, String> asn_cache = new HashMap<>();
    //metrics
    protected Set<String> countries = new HashSet<>();

    public AbstractParquetPacketWriter(String repoName, String schema) {
        geoLookup = new GeoLookupUtil();
        this.repoLocation = Settings.getInstance().getSetting(Settings.OUTPUT_LOCATION);
        this.schema = schema;
        this.repoName = repoName;
    }

    /**
     * Use caching for the Maxmind country lookups, otherwise CPU usage
     * will be high and the app will stall.
     * @param lookup the IP address to look up
     * @return the country for the address, or null if no match was found
     */
    protected String getCountry(String lookup) {
        String country = geo_ip_cache.get(lookup);
        if (country == null) {
            country = geoLookup.lookupCountry(lookup);
            if (country != null) {
                geo_ip_cache.put(lookup, country);
                countries.add(country);
            }
        }

        return country;
    }

    /**
     * Use caching for the Maxmind ASN lookups, otherwise CPU usage
     * will be high and the app will stall.
     * @param lookup the IP address to look up
     * @param v4 true if the address is IPv4, false if it is IPv6
     * @return the ASN for the address, or null if no match was found
     */
    protected String getAsn(String lookup, boolean v4) {
        String asn = asn_cache.get(lookup);
        if (asn == null) {
            asn = geoLookup.lookupASN(lookup, v4);
            if (asn != null) {
                asn_cache.put(lookup, asn);
            }
        }

        return asn;
    }

    /**
     * Create a Parquet record which combines values from the query and the response.
     * @param packet the combined query/response packet to write
     */
    public abstract void write(PacketCombination packet);

    /**
     * Create the partition strategy for the data, e.g. year, month, day.
     * @return the partition strategy for the dataset
     */
    protected abstract PartitionStrategy createPartitionStrategy();
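
    // A concrete subclass might build a year/month/day strategy like the
    // sketch below; the "time" field name is an assumption for illustration
    // and must match a timestamp field in the Avro schema:
    //
    //   return new PartitionStrategy.Builder()
    //           .year("time")
    //           .month("time")
    //           .day("time")
    //           .build();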

    public void open() {
        String server = Settings.getInstance().getServer().getFullname();
        String path = repoLocation + System.getProperty("file.separator") + server
                + System.getProperty("file.separator") + repoName;
        //remove any dots from the path, kitesdk does not support this
        //https://issues.cloudera.org/browse/KITE-673
        path = StringUtils.replace(path, ".", "_");

        LOGGER.info("Create new Parquet write with path: " + path);

        /* before opening, make sure there is no (old) .metadata folder in the output dir */
        String metadataLocation = path + System.getProperty("file.separator") + ".metadata";
        try {
            FileUtils.deleteDirectory(new File(metadataLocation));
        } catch (IOException e1) {
            throw new RuntimeException("Could not remove old .metadata directory -> " + metadataLocation);
        }

        /* create a partition for year, month and day.
         * The Parquet writer will create a directory structure with the
         * distinct partition values.
         */
        PartitionStrategy partitionStrategy = createPartitionStrategy();
        //create a descriptor with the Parquet output format and the correct partition strategy
        try {
            descriptor = new DatasetDescriptor.Builder().schemaUri("resource:" + schema).format(Formats.PARQUET)
                    .partitionStrategy(partitionStrategy).build();
        } catch (Exception e) {
            throw new RuntimeException("Error while creating data descriptor", e);
        }
        //create a file dataset for the above descriptor
        Dataset<GenericRecord> dataset = Datasets.create("dataset:file:" + path, descriptor, GenericRecord.class);

        writer = dataset.newWriter();
    }

    /**
     * Create a new record builder for every row.
     * @return a builder based on the dataset schema
     */
    protected GenericRecordBuilder newBuilder() {
        return new GenericRecordBuilder(descriptor.getSchema());
    }

    public void close() {
        showStatus();

        if (writer != null && writer.isOpen()) {
            writer.close();
        }
    }

    protected void showStatus() {
        LOGGER.info("---------- " + this.getClass().getSuperclass().getSimpleName()
                + " Parquet writer status --------------------");
        LOGGER.info(packetCounter + " packets written to parquet file.");
        LOGGER.info("-----------------------------------------------------");
    }

    //   @Deprecated
    //   public String filterExtreme(String str) {
    //       StringBuilder filtered = new StringBuilder(str.length());
    //       for (int i = 0; i < str.length(); i++) {
    //           char current = str.charAt(i);
    //           if ((current >= 0x30 && current <= 0x39) ||
    //                 (current >= 0x41 && current <= 0x5a) ||
    //                 (current >= 0x61 && current <= 0x7a)){
    //               filtered.append(current);
    //           }else{
    //               filtered.append("?");
    //           }
    //       }
    //
    //       return filtered.toString();
    //   }

    /**
     * Replace all non-printable ASCII chars with the hex value of the char,
     * e.g. a BEL control character (0x07) becomes the literal string "0x7".
     * @param str the string to filter
     * @return the filtered string
     */
    public String filter(String str) {
        StringBuilder filtered = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char current = str.charAt(i);
            if (current >= 0x20 && current <= 0x7e) {
                filtered.append(current);
            } else {
                filtered.append("0x" + Integer.toHexString(current));
            }
        }

        return filtered.toString();
    }

    /**
     * Write any metrics collected while writing, e.g. the set of countries seen.
     */
    public abstract void writeMetrics();

    /**
     * Increment the counter for the given key, starting at 1 for new keys.
     */
    protected void updateMetricMap(Map<Integer, Integer> map, Integer key) {
        Integer currentVal = map.get(key);
        if (currentVal != null) {
            map.put(key, currentVal.intValue() + 1);
        } else {
            map.put(key, 1);
        }
    }

}
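
Example

For reference, a minimal concrete subclass might look like the sketch below. The class name, the "example" repository name, the example.avsc schema file and the "time" partition field are all placeholders for illustration; they are not part of ENTRADA itself.

package nl.sidn.pcap.parquet;

import nl.sidn.pcap.support.PacketCombination;

import org.apache.avro.generic.GenericRecordBuilder;

import org.kitesdk.data.PartitionStrategy;

public class ExampleParquetPacketWriter extends AbstractParquetPacketWriter {

    public ExampleParquetPacketWriter() {
        // "example" (repo name) and "example.avsc" (an Avro schema on the
        // classpath) are placeholder values
        super("example", "example.avsc");
    }

    @Override
    protected PartitionStrategy createPartitionStrategy() {
        // partition on year/month/day, derived from a "time" field that is
        // assumed to exist in the Avro schema
        return new PartitionStrategy.Builder().year("time").month("time").day("time").build();
    }

    @Override
    public void write(PacketCombination packet) {
        GenericRecordBuilder builder = newBuilder();
        // ... populate the builder with fields from the query/response pair ...
        writer.write(builder.build());
        packetCounter++;
        if (packetCounter % STATUS_COUNT == 0) {
            showStatus();
        }
    }

    @Override
    public void writeMetrics() {
        // e.g. report this.countries and other counters to a metrics backend
    }
}

A typical lifecycle is open() to create the dataset and the writer, repeated write(...) calls, and finally close(), which logs the status and closes the underlying Parquet writer.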