Java tutorial: FileLoader — ingesting log files into Cassandra via the LMAX Disruptor
package at.ac.ait.ubicity.fileloader;

/**
 * Copyright (C) 2013 AIT / Austrian Institute of Technology
 * http://www.ait.ac.at
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/agpl-3.0.html
 */

import at.ac.ait.ubicity.fileloader.aggregation.AggregationJob;
import at.ac.ait.ubicity.fileloader.aggregation.Aggregator;
import at.ac.ait.ubicity.fileloader.cassandra.AstyanaxInitializer;
import at.ac.ait.ubicity.fileloader.util.Delay;
import at.ac.ait.ubicity.fileloader.util.FileCache;
import at.ac.ait.ubicity.fileloader.util.FileCache.FileInformation;
import at.ac.ait.ubicity.fileloader.util.LogFileCache;
import at.ac.ait.ubicity.fileloader.util.LogFileNameFilter;
import at.ac.ait.ubicity.fileloader.util.StatsTableActualizer;

import com.lmax.disruptor.EventHandler;
import com.lmax.disruptor.RingBuffer;
import com.lmax.disruptor.dsl.Disruptor;
import com.netflix.astyanax.Keyspace;
import com.netflix.astyanax.MutationBatch;
import com.netflix.astyanax.model.ColumnFamily;

import java.io.File;
import java.io.FileNotFoundException;
import java.net.URI;
import java.util.Iterator;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

/**
 * @author Jan van Oort
 */
public final class FileLoader {

    public final static double TWO = 2.0;

    final static Logger logger = Logger.getLogger("FileLoader");

    static Keyspace keySpace;

    public static boolean cassandraInitialized = false;

    static boolean useCache = true;

    // default delay, in milliseconds, between checks of our invigilated directory or file for new updates
    static final long INVIGILANCE_WAITING_DELAY = 5000;

    static {
        logger.setLevel(Level.ALL);
    }
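    /*
     * Added sketch, not part of the original source: SingleLogLineAsStringEventHandler,
     * used in load(...) below, is assumed to follow the standard LMAX Disruptor consumer
     * contract, roughly:
     *
     *   public final class SingleLogLineAsStringEventHandler implements EventHandler<SingleLogLineAsString> {
     *       @Override
     *       public void onEvent(SingleLogLineAsString event, long sequence, boolean endOfBatch) {
     *           // parse the log line, add a row to the shared MutationBatch, and
     *           // execute the batch once batchSize mutations have accumulated
     *       }
     *   }
     */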
    /**
     * @param _fileInfo  a FileInformation object holding usage information on the file we are supposed to load:
     *                   line count already ingested, last access time...
     * @param _keySpace  Cassandra key space into which to ingest
     * @param _host      Cassandra host / server
     * @param _batchSize MutationBatch size
     * @throws Exception shouldn't happen, although the Disruptor may throw an Exception under duress
     */
    @SuppressWarnings("unchecked")
    public final static void load(final FileInformation _fileInfo, final String _keySpace, final String _host, final int _batchSize) throws Exception {

        if (!cassandraInitialized) {
            keySpace = AstyanaxInitializer.doInit("Test Cluster", _host, _keySpace);
            cassandraInitialized = true;
        }

        LongTimeStampSorter tsSorter = new LongTimeStampSorter();
        Thread tTSSorter = new Thread(tsSorter);
        tTSSorter.setPriority(Thread.MAX_PRIORITY - 1);
        tTSSorter.setName("long timestamp sorter");
        tTSSorter.start();

        // get the log id from the file's URI
        final String log_id = _fileInfo.getURI().toString();

        final MutationBatch batch = keySpace.prepareMutationBatch();
        logger.info("got keyspace " + keySpace.getKeyspaceName() + " from Astyanax initializer");

        final LineIterator onLines = FileUtils.lineIterator(new File(_fileInfo.getURI()));
        final ExecutorService exec = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() * 2);

        ColumnFamily crawl_stats = null;
        AggregationJob aggregationJob = new AggregationJob(keySpace, crawl_stats);
        Thread tAggJob = new Thread(aggregationJob);
        tAggJob.setName("Monitrix loader / aggregation job");
        tAggJob.setPriority(Thread.MIN_PRIORITY + 1);
        tAggJob.start();
        logger.info("[FILELOADER] started aggregation job, ring buffer running");

        // ring buffer with 2^17 = 131072 slots
        final Disruptor<SingleLogLineAsString> disruptor = new Disruptor<>(SingleLogLineAsString.EVENT_FACTORY, (int) Math.pow(TWO, 17), exec);
        SingleLogLineAsStringEventHandler.batch = batch;
        SingleLogLineAsStringEventHandler.keySpace = keySpace;
        SingleLogLineAsStringEventHandler.batchSize = _batchSize;
        SingleLogLineAsStringEventHandler.LOG_ID = log_id;
        SingleLogLineAsStringEventHandler.tsSorter = tsSorter;
        SingleLogLineAsStringEventHandler.aggregationJob = aggregationJob;

        // the EventHandler contains the actual logic for ingesting
        final EventHandler<SingleLogLineAsString> handler = new SingleLogLineAsStringEventHandler();
        disruptor.handleEventsWith(handler);

        // our Aggregate job is in place; we are almost ready to start
        final RingBuffer<SingleLogLineAsString> rb = disruptor.start();

        int _lineCount = 0;
        long _start, _lapse;
        _start = System.nanoTime();

        int _linesAlreadyProcessed = _fileInfo.getLineCount();

        // cycle through the lines already processed in an earlier run
        while (_lineCount < _linesAlreadyProcessed) {
            onLines.nextLine();
            _lineCount++;
        }

        // now get down to the work we actually must do, and fill the ring buffer
        logger.info("begin processing of file " + _fileInfo.getURI() + " @line #" + _lineCount);
        while (onLines.hasNext()) {
            final long _seq = rb.next();
            final SingleLogLineAsString event = rb.get(_seq);
            event.setValue(onLines.nextLine());
            rb.publish(_seq);
            _lineCount++;
        }
        _lapse = System.nanoTime() - _start;
        logger.info("ended processing of file " + _fileInfo.getURI() + " @line #" + _lineCount);

        // stop, waiting for the last threads still busy to finish their work
        disruptor.shutdown();

        // update the file info; this will land in the cache
        _fileInfo.setLineCount(_lineCount);
        _fileInfo.setLastAccess(System.currentTimeMillis());
        // note: setUsageCount(_usageCount++) would pass the *old* value to the setter, so increment explicitly
        _fileInfo.setUsageCount(_fileInfo.getUsageCount() + 1);

        // make sure we release resources
        onLines.close();

        logger.info("handled " + (_lineCount - _linesAlreadyProcessed) + " log lines in " + _lapse + " nanoseconds");

        // now go to the aggregation step
        SortedSet<Long> timeStamps = new TreeSet<>(tsSorter.timeStamps);
        long _minTs = timeStamps.first();
        long _maxTs = timeStamps.last();
        logger.info("**** min TimeStamp = " + _minTs);
        logger.info("**** max TimeStamp = " + _maxTs);
        StatsTableActualizer.update(_fileInfo.getURI().toString(), _minTs, _maxTs, _lineCount);

//        AggregationJob aggJob = new AggregationJob( keySpace, _host, _batchSize );
//        Thread tAgg = new Thread( aggJob );
//        tAgg.setName( "aggregation job " );
//        tAgg.setPriority( Thread.MAX_PRIORITY - 1 );
//        tAgg.start();
    }
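    /*
     * Added sketch, not part of the original class: a minimal example of calling load(...)
     * directly for a single file, bypassing the invigilate(...) polling loop. The path,
     * keyspace name, host and batch size are illustrative assumptions; a Cassandra node
     * is assumed to be reachable at the given host.
     */
    static void loadSingleFileExample() throws Exception {
        final File logFile = new File("/data/bl/crawl.log"); // hypothetical log file
        // a fresh FileInformation: last access = now, usage count 1, no lines ingested yet
        final FileInformation info = new FileInformation(logFile.toURI(), System.currentTimeMillis(), 1, 0);
        load(info, "mykeyspace", "localhost", 10000);
    }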
    /**
     * @param _uri         the URI we must "patrol"
     * @param keySpace     the Cassandra keyspace to use
     * @param host         the Cassandra host / node
     * @param batchSize    MutationBatch size for ingests
     * @param millisToWait the number of milliseconds we are supposed to wait before visiting the URI again
     * @throws FileNotFoundException if there is a problem with the given URI
     * @throws Exception if actually loading ( ingesting ) from some file under the URI leads to a problem
     */
    public final static void invigilate(URI _uri, String keySpace, String host, int batchSize, long millisToWait) throws FileNotFoundException, Exception {
        logger.info("[FILELOADER] invigilating URI: " + _uri);
        if (_uri.getScheme().equals("file")) {
            // we don't know yet if the URI is a directory or a file
            File _startingPoint = new File(_uri);
            File[] _files = getLogFilesFor(_startingPoint);
            FileCache cache = useCache ? LogFileCache.get().loadCache() : null;
            for (File file : _files) {
                logger.info("[FILELOADER] found file under " + _uri.toString() + " : " + file.getName());
                doLoad(file, cache, keySpace, host, batchSize);
            }
            return;
        }
        logger.info("[FILELOADER] URI " + _uri.toString() + " is not something FileLoader can currently handle");
    }

    /**
     * Perform a load, and either write to cache or not, according to settings.
     *
     * @param _f         the file we must ingest
     * @param _cache     the cache we are to use for keeping file usage information up to date
     * @param _keySpace  Cassandra key space into which to ingest
     * @param _host      Cassandra host / server
     * @param _batchSize MutationBatch size
     * @throws Exception if actual loading of the file causes a problem
     */
    private final static void doLoad(File _f, FileCache _cache, String _keySpace, String _host, int _batchSize) throws Exception {
        if (_cache != null) {
            FileInformation _fileInfo = _cache.getFileInformationFor(_f.toURI());
            if (_fileInfo == null) {
                _fileInfo = new FileInformation(_f.toURI(), System.currentTimeMillis(), 1, 0);
                _cache.updateCacheFor(_f.toURI(), _fileInfo);
            }
            logger.info("[FILELOADER] " + _fileInfo.toString());
            load(_fileInfo, _keySpace, _host, _batchSize);
            _cache.saveCache();
        } else {
            load(new FileInformation(_f.toURI(), System.currentTimeMillis(), 0, 0), _keySpace, _host, _batchSize);
        }
    }

    /**
     * @param _file a log file, or a directory containing log files
     * @return the log files present under the argument, if it is a directory, or otherwise the argument itself ( if it is a log file )
     * @throws FileNotFoundException if the argument is neither a directory nor a log file
     */
    public static File[] getLogFilesFor(File _file) throws FileNotFoundException {
        File[] _returned = { _file };
        if (_file.isDirectory()) {
            return _file.listFiles(new LogFileNameFilter());
        }
        if (_file.getName().endsWith(LogFileNameFilter.FILE_NAME_SUFFIX)) {
            return _returned;
        }
        throw new FileNotFoundException("[AGGREGATOR] no log file(s) at " + _file.getName());
    }
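    /*
     * Added sketch, not part of the original class: how getLogFilesFor(...) behaves for
     * a directory vs. a single file. The paths are illustrative assumptions, and the
     * single-file case presumes the file name carries the suffix expected by
     * LogFileNameFilter.
     */
    static void listLogFilesExample() throws FileNotFoundException {
        // a directory: returns every file accepted by LogFileNameFilter
        File[] inDirectory = getLogFilesFor(new File("/data/bl/"));
        // a single file whose name ends with LogFileNameFilter.FILE_NAME_SUFFIX: returned as-is
        File[] single = getLogFilesFor(new File("/data/bl/crawl.log"));
        System.out.println(inDirectory.length + " file(s) in directory, " + single.length + " single file");
    }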
    /**
     * This method is here for demo purposes only. It is not part of the required functionality for this class.
     *
     * @param args arg #0 = file or directory, arg #1 = keyspace, arg #2 = server host name, arg #3 = batch size,
     *             arg #4 = number of time units to wait, arg #5 = time unit ( minute, second, hour, ... ).
     *             ( For now, tacitly assume we are on the default Cassandra port, 9160. ) Clustering is not yet supported.
     */
    public final static void main(String[] args) throws Exception {
        if (args.length != 6) {
            usage();
            System.exit(1);
        }
        try {
            final File _f = new File(args[0]);
            URI uri = _f.toURI();
            String keySpaceName = args[1];
            final String host = args[2];
            final int batchSize = Integer.parseInt(args[3]);
            final int timeUnitCount = Integer.parseInt(args[4]);
            Delay timeUnit = timeUnitsFromCmdLine(args[5].toUpperCase());
            if (timeUnit == null) {
                timeUnit = Delay.SECOND;
            }
            long millisToWait = timeUnitCount * timeUnit.getMilliSeconds();
            useCache = true;
            while (true) {
                try {
                    invigilate(uri, keySpaceName, host, batchSize, millisToWait);
                    Thread.sleep(millisToWait);
                } catch (InterruptedException | Error any) {
                    // clear the interrupt flag and keep patrolling
                    Thread.interrupted();
                }
            }
        } catch (Exception e) {
            logger.log(Level.SEVERE, e.toString());
        }
    }

    /**
     * Helper method for converting human-readable, command-line invigilance delays.
     *
     * @param _arg a time unit readable by a human ( minute, second, hour... )
     * @return a Delay known to the system ( Minute, Hour, ... ), or null if the argument matches none
     */
    private static Delay timeUnitsFromCmdLine(String _arg) {
        Iterator<Delay> onKnownDelayOptions = Delay.knownOptions.iterator();
        while (onKnownDelayOptions.hasNext()) {
            Delay _d = onKnownDelayOptions.next();
            if (_d.name().equals(_arg)) {
                return _d;
            }
        }
        return null;
    }

    private static void usage() {
        System.out.println("usage: FileLoader <file | directory> <keyspace> <server> <batch_size> <number> <time_unit ( second | minute | hour ... )>");
        System.out.println("example: FileLoader /data/bl/ mykeyspace localhost 10000 10 minutes");
    }
}
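/*
 * Added usage note, illustrative only: with the class and its dependencies on the
 * classpath, the demo main(...) can be run as, e.g.
 *
 *   java at.ac.ait.ubicity.fileloader.FileLoader /data/bl/ mykeyspace localhost 10000 10 minutes
 *
 * which patrols /data/bl/ and ingests new log lines into the "mykeyspace" keyspace on the
 * Cassandra node at localhost, in mutation batches of 10000 rows. The delay between visits
 * is 10 minutes, assuming "minutes" matches one of the Delay options; otherwise the code
 * falls back to seconds.
 */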