Java tutorial: writing key/value data to HDFS SequenceFiles with Bagheera's SequenceFileSink

The class below comes from Mozilla's Bagheera project. It implements the KeyValueSink interface by appending key/value pairs to a block-compressed Hadoop SequenceFile, rolling over to a fresh file whenever a configured maximum size is reached or a new day begins, and optionally stamping a timestamp field into each stored JSON document.
/*
 * Copyright 2012 Mozilla Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.mozilla.bagheera.sink;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.UUID;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;

import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.yammer.metrics.Metrics;
import com.yammer.metrics.core.Meter;
import com.yammer.metrics.core.MetricName;

/**
 * A KeyValueSink that appends key/value pairs to block-compressed HDFS
 * SequenceFiles, rolling to a new file daily and whenever the current file
 * reaches a configured maximum size.
 */
public class SequenceFileSink implements KeyValueSink {

    private static final Logger LOG = Logger.getLogger(SequenceFileSink.class);

    private ObjectMapper jsonMapper = new ObjectMapper();

    protected static final long DAY_IN_MILLIS = 86400000L; // 24 hours

    // HDFS-related member vars
    protected final Semaphore lock = new Semaphore(1, true);
    protected final Configuration conf;
    protected final FileSystem hdfs;
    protected SequenceFile.Writer writer;
    protected Path baseDir;
    protected Path outputPath;
    protected boolean useBytesValue;
    protected boolean addTimestamp;
    protected long nextRolloverMillis = 0L;
    protected AtomicLong bytesWritten = new AtomicLong();
    protected long maxFileSize = 0L;
    protected final SimpleDateFormat sdf;
    protected Meter stored;

    public static final String SINK_TIMESTAMP_FIELD = "BAGHEERA_TS";

    public SequenceFileSink(SinkConfiguration config) throws IOException {
        this(config.getString("namespace"),
             config.getString("hdfssink.hdfs.basedir.path", "/bagheera"),
             config.getString("hdfssink.hdfs.date.format", "yyyy-MM-dd"),
             config.getLong("hdfssink.hdfs.max.filesize", 536870912),
             config.getBoolean("hdfssink.hdfs.usebytes", false),
             config.getBoolean("hdfssink.hdfs.addtimestamp", false));
    }

    public SequenceFileSink(String namespace, String baseDirPath, String dateFormat,
                            long maxFileSize, boolean useBytesValue,
                            boolean addTimestamp) throws IOException {
        LOG.info("Initializing writer for namespace: " + namespace);
        conf = new Configuration();
        conf.setBoolean("fs.automatic.close", false);
        hdfs = FileSystem.newInstance(conf);
        this.useBytesValue = useBytesValue;
        this.maxFileSize = maxFileSize;
        this.addTimestamp = addTimestamp;
        sdf = new SimpleDateFormat(dateFormat);
        // Layout: <basedir>/<namespace>/<formatted date>
        if (!baseDirPath.endsWith(Path.SEPARATOR)) {
            baseDir = new Path(baseDirPath + Path.SEPARATOR + namespace + Path.SEPARATOR
                               + sdf.format(new Date(System.currentTimeMillis())));
        } else {
            baseDir = new Path(baseDirPath + namespace + Path.SEPARATOR
                               + sdf.format(new Date(System.currentTimeMillis())));
        }
        initWriter();
        stored = Metrics.newMeter(new MetricName("bagheera", "sink.hdfs.", namespace + ".stored"),
                                  "messages", TimeUnit.SECONDS);
    }
    private void initWriter() throws IOException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Thread " + Thread.currentThread().getId() + " - initWriter() called");
        }
        if (!hdfs.exists(baseDir)) {
            hdfs.mkdirs(baseDir);
        }
        // Each file gets a random UUID name within the dated base directory.
        outputPath = new Path(baseDir, new Path(UUID.randomUUID().toString()));
        LOG.info("Opening file handle to: " + outputPath.toString());
        if (useBytesValue) {
            writer = SequenceFile.createWriter(hdfs, conf, outputPath, Text.class,
                                               BytesWritable.class, CompressionType.BLOCK);
        } else {
            writer = SequenceFile.createWriter(hdfs, conf, outputPath, Text.class,
                                               Text.class, CompressionType.BLOCK);
        }

        // Get time in millis at a day resolution: the next rollover happens at
        // the upcoming midnight.
        Calendar prev = Calendar.getInstance();
        prev.set(Calendar.HOUR_OF_DAY, 0);
        prev.set(Calendar.MINUTE, 0);
        prev.set(Calendar.SECOND, 0);
        prev.set(Calendar.MILLISECOND, 0);
        nextRolloverMillis = prev.getTimeInMillis() + DAY_IN_MILLIS;
    }

    private void checkRollover() throws IOException {
        boolean getNewFile = false;
        long now = System.currentTimeMillis();
        if (maxFileSize != 0 && bytesWritten.get() >= maxFileSize) {
            // Size-based rollover: the current file reached the configured maximum.
            getNewFile = true;
        } else if (now > nextRolloverMillis) {
            // Time-based rollover: a new day started, so switch to a new dated directory.
            getNewFile = true;
            baseDir = new Path(baseDir.getParent(), new Path(sdf.format(new Date(now))));
        }

        if (writer == null || getNewFile) {
            closeWriter();
            initWriter();
        }
    }

    private void closeWriter() throws IOException {
        if (writer != null) {
            writer.close();
            writer = null;
        }
        bytesWritten.set(0);
    }

    @Override
    public void close() {
        try {
            lock.acquire();
            LOG.info("Closing file handle to: " + outputPath.toString());
            try {
                closeWriter();
            } catch (IOException e) {
                LOG.error("Error closing writer", e);
            }
            if (hdfs != null) {
                try {
                    LOG.info("fs.automatic.close = " + hdfs.getConf().get("fs.automatic.close"));
                    hdfs.close();
                } catch (IOException e) {
                    LOG.error("Error closing HDFS handle", e);
                }
            }
        } catch (InterruptedException ex) {
            LOG.error("Interrupted while closing HDFS handle", ex);
            Thread.currentThread().interrupt(); // restore the interrupt flag
        } finally {
            lock.release();
        }
    }

    @Override
    public void store(String key, byte[] data) {
        try {
            lock.acquire();
            checkRollover();
            if (useBytesValue) {
                writer.append(new Text(key), new BytesWritable(data));
            } else {
                writer.append(new Text(key), new Text(data));
            }
            stored.mark();
            bytesWritten.getAndAdd(key.length() + data.length);
        } catch (IOException e) {
            LOG.error("IOException while writing key/value pair", e);
            throw new RuntimeException(e);
        } catch (InterruptedException e) {
            LOG.error("Interrupted while writing key/value pair", e);
            Thread.currentThread().interrupt(); // restore the interrupt flag
        } finally {
            lock.release();
        }
    }

    @Override
    public void store(String key, byte[] data, long timestamp) throws IOException {
        try {
            lock.acquire();
            checkRollover();
            if (addTimestamp) {
                data = addTimestampToJson(data, timestamp);
            }
            if (useBytesValue) {
                writer.append(new Text(key), new BytesWritable(data));
            } else {
                writer.append(new Text(key), new Text(data));
            }
            stored.mark();
            bytesWritten.getAndAdd(key.length() + data.length);
        } catch (IOException e) {
            LOG.error("IOException while writing key/value pair", e);
            throw new RuntimeException(e);
        } catch (InterruptedException e) {
            LOG.error("Interrupted while writing key/value pair", e);
            Thread.currentThread().interrupt(); // restore the interrupt flag
        } finally {
            lock.release();
        }
    }
    public byte[] addTimestampToJson(byte[] data, long timestamp) throws IOException {
        // TODO: add metrics/counters for failures
        try {
            ObjectNode document = jsonMapper.readValue(data, ObjectNode.class);
            document.put(SINK_TIMESTAMP_FIELD, timestamp);
            return jsonMapper.writeValueAsBytes(document);
        } catch (JsonParseException e) {
            LOG.error("Invalid JSON", e);
            // Decode the payload for logging; debug-logging the raw byte[]
            // would only print its object identity.
            LOG.debug(new String(data, StandardCharsets.UTF_8));
        } catch (JsonMappingException e) {
            LOG.error("Invalid JSON", e);
            LOG.debug(new String(data, StandardCharsets.UTF_8));
        }
        throw new IOException("Invalid JSON");
    }

    @Override
    public void delete(String key) {
        // TODO: Throw error or just ignore?
        // NOOP
    }
}
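A minimal usage sketch follows. It is not part of the original file: it assumes a reachable HDFS reachable through the default Configuration, and the namespace, key, and JSON payload are hypothetical. It uses the six-argument constructor directly with the same default values the SinkConfiguration constructor reads, so there is no need to build a SinkConfiguration.

package com.mozilla.bagheera.sink;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class SequenceFileSinkExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical values; the literals mirror the defaults read from
        // SinkConfiguration in the constructor above.
        SequenceFileSink sink = new SequenceFileSink(
                "telemetry",   // namespace (hypothetical)
                "/bagheera",   // hdfssink.hdfs.basedir.path default
                "yyyy-MM-dd",  // hdfssink.hdfs.date.format default
                536870912L,    // hdfssink.hdfs.max.filesize default (512 MiB)
                false,         // hdfssink.hdfs.usebytes: store values as Text
                true);         // hdfssink.hdfs.addtimestamp: stamp BAGHEERA_TS
        try {
            byte[] json = "{\"ping\":1}".getBytes(StandardCharsets.UTF_8);
            // Because addTimestamp is true, BAGHEERA_TS is injected into the
            // JSON document before it is appended to the SequenceFile.
            sink.store("some-document-id", json, System.currentTimeMillis());
        } finally {
            sink.close();
        }
    }
}

Because the writer is guarded by a fair Semaphore, concurrent callers of store() are serialized; the sketch keeps to a single thread for simplicity.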