Java tutorial: FileLoader — ingesting log files into Cassandra via the LMAX Disruptor
package at.ac.ait.ubicity.fileloader;

/**
 * Copyright (C) 2013 AIT / Austrian Institute of Technology
 * http://www.ait.ac.at
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/agpl-3.0.html
 */

import at.ac.ait.ubicity.fileloader.aggregation.AggregationJob;
import at.ac.ait.ubicity.fileloader.aggregation.Aggregator;
import at.ac.ait.ubicity.fileloader.cassandra.AstyanaxInitializer;
import at.ac.ait.ubicity.fileloader.util.Delay;
import at.ac.ait.ubicity.fileloader.util.FileCache;
import at.ac.ait.ubicity.fileloader.util.FileCache.FileInformation;
import at.ac.ait.ubicity.fileloader.util.LogFileCache;
import at.ac.ait.ubicity.fileloader.util.LogFileNameFilter;
import at.ac.ait.ubicity.fileloader.util.StatsTableActualizer;

import com.lmax.disruptor.EventHandler;
import com.lmax.disruptor.RingBuffer;
import com.lmax.disruptor.dsl.Disruptor;
import com.netflix.astyanax.Keyspace;
import com.netflix.astyanax.MutationBatch;
import com.netflix.astyanax.model.ColumnFamily;

import java.io.File;
import java.io.FileNotFoundException;
import java.net.URI;
import java.util.Iterator;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

/**
 * @author Jan van Oort
 */
public final class FileLoader {

    public final static double TWO = 2.0;

    final static Logger logger = Logger.getLogger("FileLoader");

    static Keyspace keySpace;

    public static boolean cassandraInitialized = false;

    static boolean useCache = true;

    // default delay, in milliseconds, between checks of our invigilated directory or file for new updates
    static final long INVIGILANCE_WAITING_DELAY = 5000;

    static {
        logger.setLevel(Level.ALL);
    }
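    /*
     * Added sketch, not part of the original source: SingleLogLineAsStringEventHandler,
     * used in load(...) below, is assumed to follow the standard LMAX Disruptor consumer
     * contract, roughly:
     *
     *   public final class SingleLogLineAsStringEventHandler implements EventHandler<SingleLogLineAsString> {
     *       @Override
     *       public void onEvent(SingleLogLineAsString event, long sequence, boolean endOfBatch) {
     *           // parse the log line, add a row to the shared MutationBatch, and
     *           // execute the batch once batchSize mutations have accumulated
     *       }
     *   }
     */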
    /**
     * @param _fileInfo  a FileInformation object holding usage information on the file we are supposed to load:
     *                   line count already ingested, last access time...
     * @param _keySpace  Cassandra key space into which to ingest
     * @param _host      Cassandra host / server
     * @param _batchSize MutationBatch size
     * @throws Exception shouldn't happen, although the Disruptor may throw an Exception under duress
     */
    @SuppressWarnings("unchecked")
    public final static void load(final FileInformation _fileInfo, final String _keySpace, final String _host, final int _batchSize) throws Exception {

        if (!cassandraInitialized) {
            keySpace = AstyanaxInitializer.doInit("Test Cluster", _host, _keySpace);
            cassandraInitialized = true;
        }

        LongTimeStampSorter tsSorter = new LongTimeStampSorter();
        Thread tTSSorter = new Thread(tsSorter);
        tTSSorter.setPriority(Thread.MAX_PRIORITY - 1);
        tTSSorter.setName("long timestamp sorter");
        tTSSorter.start();

        // get the log id from the file's URI
        final String log_id = _fileInfo.getURI().toString();

        final MutationBatch batch = keySpace.prepareMutationBatch();
        logger.info("got keyspace " + keySpace.getKeyspaceName() + " from Astyanax initializer");

        final LineIterator onLines = FileUtils.lineIterator(new File(_fileInfo.getURI()));
        final ExecutorService exec = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() * 2);

        ColumnFamily crawl_stats = null;
        AggregationJob aggregationJob = new AggregationJob(keySpace, crawl_stats);
        Thread tAggJob = new Thread(aggregationJob);
        tAggJob.setName("Monitrix loader / aggregation job");
        tAggJob.setPriority(Thread.MIN_PRIORITY + 1);
        tAggJob.start();
        logger.info("[FILELOADER] started aggregation job, ring buffer running");

        // ring buffer with 2^17 = 131072 slots
        final Disruptor<SingleLogLineAsString> disruptor = new Disruptor<>(SingleLogLineAsString.EVENT_FACTORY, (int) Math.pow(TWO, 17), exec);
        SingleLogLineAsStringEventHandler.batch = batch;
        SingleLogLineAsStringEventHandler.keySpace = keySpace;
        SingleLogLineAsStringEventHandler.batchSize = _batchSize;
        SingleLogLineAsStringEventHandler.LOG_ID = log_id;
        SingleLogLineAsStringEventHandler.tsSorter = tsSorter;
        SingleLogLineAsStringEventHandler.aggregationJob = aggregationJob;

        // the EventHandler contains the actual logic for ingesting
        final EventHandler<SingleLogLineAsString> handler = new SingleLogLineAsStringEventHandler();
        disruptor.handleEventsWith(handler);

        // our Aggregate job is in place; we are almost ready to start
        final RingBuffer<SingleLogLineAsString> rb = disruptor.start();

        int _lineCount = 0;
        long _start, _lapse;
        _start = System.nanoTime();

        int _linesAlreadyProcessed = _fileInfo.getLineCount();

        // cycle through the lines already processed in an earlier run
        while (_lineCount < _linesAlreadyProcessed) {
            onLines.nextLine();
            _lineCount++;
        }

        // now get down to the work we actually must do, and fill the ring buffer
        logger.info("begin processing of file " + _fileInfo.getURI() + " @line #" + _lineCount);
        while (onLines.hasNext()) {
            final long _seq = rb.next();
            final SingleLogLineAsString event = rb.get(_seq);
            event.setValue(onLines.nextLine());
            rb.publish(_seq);
            _lineCount++;
        }
        _lapse = System.nanoTime() - _start;
        logger.info("ended processing of file " + _fileInfo.getURI() + " @line #" + _lineCount);

        // stop, waiting for the last threads still busy to finish their work
        disruptor.shutdown();

        // update the file info; this will land in the cache
        _fileInfo.setLineCount(_lineCount);
        _fileInfo.setLastAccess(System.currentTimeMillis());
        // note: setUsageCount(_usageCount++) would pass the *old* value to the setter, so increment explicitly
        _fileInfo.setUsageCount(_fileInfo.getUsageCount() + 1);

        // make sure we release resources
        onLines.close();

        logger.info("handled " + (_lineCount - _linesAlreadyProcessed) + " log lines in " + _lapse + " nanoseconds");

        // now go to the aggregation step
        SortedSet<Long> timeStamps = new TreeSet<>(tsSorter.timeStamps);
        long _minTs = timeStamps.first();
        long _maxTs = timeStamps.last();
        logger.info("**** min TimeStamp = " + _minTs);
        logger.info("**** max TimeStamp = " + _maxTs);
        StatsTableActualizer.update(_fileInfo.getURI().toString(), _minTs, _maxTs, _lineCount);

//        AggregationJob aggJob = new AggregationJob( keySpace, _host, _batchSize );
//        Thread tAgg = new Thread( aggJob );
//        tAgg.setName( "aggregation job " );
//        tAgg.setPriority( Thread.MAX_PRIORITY - 1 );
//        tAgg.start();
    }
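    /*
     * Added sketch, not part of the original class: a minimal example of calling load(...)
     * directly for a single file, bypassing the invigilate(...) polling loop. The path,
     * keyspace name, host and batch size are illustrative assumptions; a Cassandra node
     * is assumed to be reachable at the given host.
     */
    static void loadSingleFileExample() throws Exception {
        final File logFile = new File("/data/bl/crawl.log"); // hypothetical log file
        // a fresh FileInformation: last access = now, usage count 1, no lines ingested yet
        final FileInformation info = new FileInformation(logFile.toURI(), System.currentTimeMillis(), 1, 0);
        load(info, "mykeyspace", "localhost", 10000);
    }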
    /**
     * @param _uri         the URI we must "patrol"
     * @param keySpace     the Cassandra keyspace to use
     * @param host         the Cassandra host / node
     * @param batchSize    MutationBatch size for ingests
     * @param millisToWait the number of milliseconds we are supposed to wait before visiting the URI again
     * @throws FileNotFoundException if there is a problem with the given URI
     * @throws Exception if actually loading ( ingesting ) from some file under the URI leads to a problem
     */
    public final static void invigilate(URI _uri, String keySpace, String host, int batchSize, long millisToWait) throws FileNotFoundException, Exception {
        logger.info("[FILELOADER] invigilating URI: " + _uri);
        if (_uri.getScheme().equals("file")) {
            // we don't know yet if the URI is a directory or a file
            File _startingPoint = new File(_uri);
            File[] _files = getLogFilesFor(_startingPoint);
            FileCache cache = useCache ? LogFileCache.get().loadCache() : null;
            for (File file : _files) {
                logger.info("[FILELOADER] found file under " + _uri.toString() + " : " + file.getName());
                doLoad(file, cache, keySpace, host, batchSize);
            }
            return;
        }
        logger.info("[FILELOADER] URI " + _uri.toString() + " is not something FileLoader can currently handle");
    }

    /**
     * Perform a load, and either write to cache or not, according to settings.
     *
     * @param _f         the file we must ingest
     * @param _cache     the cache we are to use for keeping file usage information up to date
     * @param _keySpace  Cassandra key space into which to ingest
     * @param _host      Cassandra host / server
     * @param _batchSize MutationBatch size
     * @throws Exception if actual loading of the file causes a problem
     */
    private final static void doLoad(File _f, FileCache _cache, String _keySpace, String _host, int _batchSize) throws Exception {
        if (_cache != null) {
            FileInformation _fileInfo = _cache.getFileInformationFor(_f.toURI());
            if (_fileInfo == null) {
                _fileInfo = new FileInformation(_f.toURI(), System.currentTimeMillis(), 1, 0);
                _cache.updateCacheFor(_f.toURI(), _fileInfo);
            }
            logger.info("[FILELOADER] " + _fileInfo.toString());
            load(_fileInfo, _keySpace, _host, _batchSize);
            _cache.saveCache();
        } else {
            load(new FileInformation(_f.toURI(), System.currentTimeMillis(), 0, 0), _keySpace, _host, _batchSize);
        }
    }

    /**
     * @param _file a log file, or a directory containing log files
     * @return the log files present under the argument, if it is a directory, or otherwise the argument itself ( if it is a log file )
     * @throws FileNotFoundException if the argument is neither a directory nor a log file
     */
    public static File[] getLogFilesFor(File _file) throws FileNotFoundException {
        File[] _returned = { _file };
        if (_file.isDirectory()) {
            return _file.listFiles(new LogFileNameFilter());
        }
        if (_file.getName().endsWith(LogFileNameFilter.FILE_NAME_SUFFIX)) {
            return _returned;
        }
        throw new FileNotFoundException("[AGGREGATOR] no log file(s) at " + _file.getName());
    }
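    /*
     * Added sketch, not part of the original class: how getLogFilesFor(...) behaves for
     * a directory vs. a single file. The paths are illustrative assumptions, and the
     * single-file case presumes the file name carries the suffix expected by
     * LogFileNameFilter.
     */
    static void listLogFilesExample() throws FileNotFoundException {
        // a directory: returns every file accepted by LogFileNameFilter
        File[] inDirectory = getLogFilesFor(new File("/data/bl/"));
        // a single file whose name ends with LogFileNameFilter.FILE_NAME_SUFFIX: returned as-is
        File[] single = getLogFilesFor(new File("/data/bl/crawl.log"));
        System.out.println(inDirectory.length + " file(s) in directory, " + single.length + " single file");
    }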
    /**
     * This method is here for demo purposes only. It is not part of the required functionality for this class.
     *
     * @param args arg #0 = file or directory, arg #1 = keyspace, arg #2 = server host name, arg #3 = batch size,
     *             arg #4 = number of time units to wait, arg #5 = time unit ( minute, second, hour, ... ).
     *             ( For now, tacitly assume we are on the default Cassandra port, 9160. ) Clustering is not yet supported.
     */
    public final static void main(String[] args) throws Exception {
        if (args.length != 6) {
            usage();
            System.exit(1);
        }
        try {
            final File _f = new File(args[0]);
            URI uri = _f.toURI();
            String keySpaceName = args[1];
            final String host = args[2];
            final int batchSize = Integer.parseInt(args[3]);
            final int timeUnitCount = Integer.parseInt(args[4]);
            Delay timeUnit = timeUnitsFromCmdLine(args[5].toUpperCase());
            if (timeUnit == null) {
                timeUnit = Delay.SECOND;
            }
            long millisToWait = timeUnitCount * timeUnit.getMilliSeconds();
            useCache = true;
            while (true) {
                try {
                    invigilate(uri, keySpaceName, host, batchSize, millisToWait);
                    Thread.sleep(millisToWait);
                } catch (InterruptedException | Error any) {
                    // clear the interrupt flag and keep patrolling
                    Thread.interrupted();
                }
            }
        } catch (Exception e) {
            logger.log(Level.SEVERE, e.toString());
        }
    }

    /**
     * Helper method for converting human-readable, command-line invigilance delays.
     *
     * @param _arg a time unit readable by a human ( minute, second, hour... )
     * @return a Delay known to the system ( Minute, Hour, ... ), or null if the argument matches none
     */
    private static Delay timeUnitsFromCmdLine(String _arg) {
        Iterator<Delay> onKnownDelayOptions = Delay.knownOptions.iterator();
        while (onKnownDelayOptions.hasNext()) {
            Delay _d = onKnownDelayOptions.next();
            if (_d.name().equals(_arg)) {
                return _d;
            }
        }
        return null;
    }

    private static void usage() {
        System.out.println("usage: FileLoader <file | directory> <keyspace> <server> <batch_size> <number> <time_unit ( second | minute | hour ... )>");
        System.out.println("example: FileLoader /data/bl/ mykeyspace localhost 10000 10 minutes");
    }
}
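/*
 * Added usage note, illustrative only: with the class and its dependencies on the
 * classpath, the demo main(...) can be run as, e.g.
 *
 *   java at.ac.ait.ubicity.fileloader.FileLoader /data/bl/ mykeyspace localhost 10000 10 minutes
 *
 * which patrols /data/bl/ and ingests new log lines into the "mykeyspace" keyspace on the
 * Cassandra node at localhost, in mutation batches of 10000 rows. The delay between visits
 * is 10 minutes, assuming "minutes" matches one of the Delay options; otherwise the code
 * falls back to seconds.
 */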