Java tutorial
/** Copyright (c) 2014 BlackBerry Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Search logs in a given file set.
 * <p>
 * Usage: [genericOptions] [-Dlogdriver.search.start.time=X] [-Dlogdriver.search.end.time=X] searchStringDirectory input [input ...] output
 * <p>
 *
 */
package com.blackberry.logdriver.util;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.blackberry.logdriver.Schemas;
import com.blackberry.logdriver.avro.AvroFileHeader;
import com.blackberry.logdriver.avro.AvroUtils;
import com.blackberry.logdriver.mapreduce.avro.AvroBlockInputFormat;

/**
 * Map-only job that searches log blocks for a set of search terms.
 * <p>
 * The terms are read from every file in the directory given as the first
 * argument (one term per line). Depending on configuration the search is an OR
 * search (any term) or an AND search (all terms), and case sensitive or case
 * insensitive.
 */
public class MultiSearch extends Configured implements Tool {
  private static final Logger LOG = LoggerFactory.getLogger(MultiSearch.class);
  private static final Charset UTF_8 = Charset.forName("UTF-8");

  // The expected size of the data blocks, after decompressing. Use what we're
  // using for avro sync interval, plus a bit for overrun.
  private static final int BUFFER_SIZE = Math.round(1.2f * 2 * 1024 * 1024);

  private static final String DEFAULT_OUTPUT_SEPARATOR = "\t";
  private static final boolean DEFAULT_WAIT_JOB = true;

  // Search mode flags. They are static because ByteTree reads them directly;
  // SearchMapper.setup() assigns all of them before the tree is built.
  static boolean ANDsearch = false;
  static boolean ORsearch = true;
  static boolean caseSensitive = false;
  static boolean caseInsensitive = true;
  // True once any search term has been seen that needs more UTF-8 bytes than
  // it has characters.
  static boolean unicode = false;

  /**
   * A byte-level prefix tree over the search terms. Each node maps the next
   * byte to the child node for that byte; a node without children (or flagged
   * as an end node) marks the end of a term.
   */
  protected static final class ByteTree {
    protected boolean endNode = false;
    protected Map<Byte, ByteTree> children = null;
    protected ByteTree parent = null;
    protected int totalBranches = 0;
    protected int activeBranches = 0;
    protected boolean dead = false;

    protected ByteTree() {
    }

    /**
     * Adds a search term to the tree.
     *
     * @param bytes
     *          The UTF-8 bytes of the term.
     */
    protected void add(byte[] bytes) {
      add(bytes, 0);
    }

    /**
     * Adds the part of a search term starting at index {@code i} below this
     * node.
     *
     * @param bytes
     *          The UTF-8 bytes of the term.
     * @param i
     *          The index of the first byte to add.
     */
    protected void add(byte[] bytes, int i) {
      if (caseSensitive) {
        add(bytes, bytes, i);
      } else {
        // Compute both casings once, instead of once per byte of the term.
        // Locale.ROOT keeps the result independent of the JVM default locale.
        String term = new String(bytes, UTF_8);
        add(term.toLowerCase(Locale.ROOT).getBytes(UTF_8),
            term.toUpperCase(Locale.ROOT).getBytes(UTF_8), i);
      }
    }

    // Adds one term given as two parallel byte sequences. For case sensitive
    // searching both arrays are the same, so a single branch is added per
    // byte. For case insensitive searching the lower and upper case bytes both
    // point at the same child.
    private void add(byte[] lower, byte[] upper, int i) {
      if (endNode) {
        return;
      }

      // Stop at the end of the shorter array so a casing that changes the
      // encoded length can never index out of bounds.
      if (i >= lower.length || i >= upper.length) {
        if (ORsearch) {
          // For OR searching, anything longer than this term is redundant.
          endNode = true;
          children = null;
        }
        return;
      }

      if (children == null) {
        children = new HashMap<Byte, ByteTree>();
      }

      ByteTree child = children.get(lower[i]);
      if (child == null) {
        child = children.get(upper[i]);
      }
      if (child == null) {
        child = new ByteTree();
        children.put(lower[i], child);
        children.put(upper[i], child);
        totalBranches++;
      }

      child.parent = this;
      child.add(lower, upper, i + 1);
    }

    /**
     * Checks whether a term in the tree starts at the given position.
     * <p>
     * If this is the end of a branch, we've matched the branch. For OR
     * searching return true. For AND searching return true, and remove the
     * matched branch.
     *
     * @param a
     *          The bytes being searched.
     * @param b
     *          The index in {@code a} at which to continue matching.
     * @return true if a branch of the tree was matched starting at {@code b}.
     */
    protected boolean matches(byte[] a, int b) {
      if (children == null || endNode) {
        if (ANDsearch) {
          dead = true;
          // The root has no parent; guard so a tree without terms cannot
          // cause a NullPointerException.
          if (parent != null) {
            parent.removeBranch();
          }
        }
        return true;
      }

      if (b >= a.length) {
        return false;
      }

      ByteTree child = children.get(a[b]);
      if (child == null || child.dead) {
        return false;
      }

      return child.matches(a, b + 1);
    }

    // To remove a branch, reduce the activeBranches counter by one, and mark
    // the branch dead once all active branches have been removed. If the parent
    // exists, remove it as well.
    protected void removeBranch() {
      activeBranches--;
      if (activeBranches == 0) {
        dead = true;
        if (parent != null) {
          parent.removeBranch();
        }
      }
    }

    // To reset the tree, set dead to false, set the number of active branches
    // equal to the total number of branches, and then reset each branch.
    protected void reset() {
      dead = false;
      activeBranches = totalBranches;
      if (children != null) {
        for (ByteTree child : children.values()) {
          child.reset();
        }
      }
    }

    @Override
    public String toString() {
      return toString(0);
    }

    // Renders the subtree below this node, one byte per line, indented by depth.
    private String toString(int i) {
      if (children == null) {
        return "";
      }

      StringBuilder sb = new StringBuilder();
      for (Map.Entry<Byte, ByteTree> entry : children.entrySet()) {
        for (int j = 0; j < i; j++) {
          sb.append(" ");
        }
        sb.append(new String(new byte[] { entry.getKey() }, UTF_8));
        sb.append("\n");
        sb.append(entry.getValue().toString(i + 1));
      }
      return sb.toString();
    }
  }

  /**
   * Mapper that receives one compressed block per call, inflates it, and
   * writes one output line for every log line that matches the search terms
   * and falls inside the configured time range.
   */
  private static final class SearchMapper extends
      Mapper<AvroFileHeader, BytesWritable, Text, NullWritable> {
    private long start;
    private long end;

    private Inflater inflater;

    private ByteTree byteTree = new ByteTree();
    private List<String> searchTerms = new ArrayList<String>();
    // The search terms in lower and upper case, index-aligned with
    // searchTerms. Cached so they are not re-cased for every log line.
    private List<String> lowerTerms = new ArrayList<String>();
    private List<String> upperTerms = new ArrayList<String>();
    private String outputSeparator;

    // Byte scanning is used unless a case insensitive search includes a term
    // that needs more UTF-8 bytes than it has characters.
    private static boolean useByteTree() {
      return !unicode || caseSensitive;
    }

    @Override
    protected void setup(Context context) throws IOException,
        InterruptedException {
      Configuration conf = context.getConfiguration();

      start = conf.getLong("logdriver.search.start.time", Long.MIN_VALUE);
      end = conf.getLong("logdriver.search.end.time", Long.MAX_VALUE);
      ANDsearch = conf.getBoolean("logdriver.search.and", false);
      ORsearch = !ANDsearch;
      caseInsensitive = conf.getBoolean("logdriver.search.case.insensitive",
          false);
      caseSensitive = !caseInsensitive;
      // The flag is static, so clear any value left by an earlier mapper in
      // this JVM before inspecting this job's terms.
      unicode = false;

      String searchStringDir = conf.get("logdriver.search.string.dir");
      try {
        FileSystem fs = FileSystem.get(conf);
        for (FileStatus f : fs.listStatus(new Path(searchStringDir))) {
          readSearchTerms(fs, f.getPath());
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }

      if (searchTerms.isEmpty()) {
        throw new RuntimeException("No search strings read.");
      }

      for (String term : searchTerms) {
        lowerTerms.add(term.toLowerCase(Locale.ROOT));
        upperTerms.add(term.toUpperCase(Locale.ROOT));
      }

      inflater = new Inflater(true);

      // The separator is passed through the configuration as the decimal
      // value of a single byte (see run()).
      outputSeparator = new String(new byte[] { Byte.parseByte(conf
          .get("logdriver.output.field.separator")) }, UTF_8);

      LOG.info("Configuring SearchMapper");
      LOG.info(" start={}", start);
      LOG.info(" end={}", end);
      if (useByteTree()) {
        LOG.info(" tree=\n{}", byteTree);
      } else {
        LOG.info("Unicode case insensitive search detected, not using byte tree.");
      }
    }

    // Reads one search term per line from the given file. Lines are trimmed
    // and empty lines are ignored. The reader is always closed.
    private void readSearchTerms(FileSystem fs, Path file) throws IOException {
      BufferedReader reader = new BufferedReader(new InputStreamReader(
          fs.open(file), UTF_8));
      try {
        String line;
        while ((line = reader.readLine()) != null) {
          line = line.trim();

          // If the current line contains more bytes than characters in either
          // upper or lower case, then set unicode to true.
          if (line.toLowerCase(Locale.ROOT).getBytes(UTF_8).length > line
              .length()
              || line.toUpperCase(Locale.ROOT).getBytes(UTF_8).length > line
                  .length()) {
            unicode = true;
          }

          if ("".equals(line)) {
            continue;
          }

          // If unicode characters have been detected, and we're doing a case
          // insensitive search, then there's no point in continuing to build
          // the byte tree.
          if (useByteTree()) {
            byteTree.add(line.getBytes(UTF_8));
          }
          searchTerms.add(line);
        }
      } finally {
        reader.close();
      }
    }

    @Override
    protected void cleanup(Context context) throws IOException,
        InterruptedException {
      // Release the inflater's resources now rather than waiting for
      // finalization.
      if (inflater != null) {
        inflater.end();
        inflater = null;
      }
    }

    // Resets the tree and scans the first 'length' bytes of 'data'. For an OR
    // search any matched branch is enough. For an AND search the current
    // branch must match and the entire tree must be dead.
    private boolean treeMatches(byte[] data, int length) {
      byteTree.reset();
      for (int i = 0; i < length; i++) {
        if (byteTree.matches(data, i)
            && (ORsearch || (ANDsearch && byteTree.dead))) {
          return true;
        }
      }
      return false;
    }

    // String based, case insensitive comparison. An OR search needs one term
    // to be contained in the message; an AND search needs all of them.
    private boolean termsMatch(String message) {
      String lowerMessage = message.toLowerCase(Locale.ROOT);
      String upperMessage = message.toUpperCase(Locale.ROOT);

      if (ORsearch) {
        for (int t = 0; t < searchTerms.size(); t++) {
          if (lowerMessage.contains(lowerTerms.get(t))
              || upperMessage.contains(upperTerms.get(t))) {
            return true;
          }
        }
        return false;
      }

      if (ANDsearch) {
        for (int t = 0; t < searchTerms.size(); t++) {
          if (!lowerMessage.contains(lowerTerms.get(t))
              && !upperMessage.contains(upperTerms.get(t))) {
            return false;
          }
        }
        return true;
      }

      return false;
    }

    @SuppressWarnings("unchecked")
    @Override
    protected void map(AvroFileHeader key, BytesWritable value, Context context)
        throws IOException, InterruptedException {
      LOG.trace("Got chunk with {} bytes", value.getLength());
      if (value.getLength() == 0) {
        return;
      }

      // First, grab the headers off the block, then decompress the block.
      // Only the first getLength() bytes of the backing array are valid.
      ByteArrayInputStream bytesIn = new ByteArrayInputStream(value.getBytes(),
          0, value.getLength());
      int entries = AvroUtils.readInt(bytesIn);
      int dataLength = AvroUtils.readInt(bytesIn);
      LOG.trace("Entries = {}, Data Length={}", entries, dataLength);
      if (dataLength < 0) {
        throw new IOException("Invalid data length in block header: "
            + dataLength);
      }

      byte[] inBytes = new byte[dataLength];
      int bytesRead = 0;
      int pos = 0;
      while (bytesRead >= 0 && pos < dataLength) {
        bytesRead = bytesIn.read(inBytes, pos, inBytes.length - pos);
        if (bytesRead > 0) {
          pos += bytesRead;
        }
      }
      if (pos != dataLength) {
        throw new IOException(
            "Read a different number of bytes than expected (" + pos + "!="
                + dataLength + ")");
      }

      byte[] buf = new byte[BUFFER_SIZE];
      bytesRead = -1;
      pos = 0;
      inflater.reset();
      inflater.setInput(inBytes);
      while (!inflater.finished() && bytesRead != 0) {
        if (pos == buf.length) {
          LOG.info("Expanding output buffer from {} to {}.", buf.length,
              buf.length * 2);
          byte[] newBuf = new byte[buf.length * 2];
          System.arraycopy(buf, 0, newBuf, 0, buf.length);
          buf = newBuf;
        }
        try {
          bytesRead = inflater.inflate(buf, pos, buf.length - pos);
        } catch (DataFormatException e) {
          throw new IOException("Error inflating data block.", e);
        }
        pos += bytesRead;
        LOG.trace("BytesRead = {}, Position = {}", bytesRead, pos);
      }
      LOG.debug(
          "Read block. Compressed size {}, Expanded size {}, Record count {}",
          new Object[] { dataLength, pos, entries });

      // Find out if the string we're looking for is in the data block
      // somewhere. Only the 'pos' inflated bytes are scanned, not the unused
      // tail of the buffer. If the input lines contained multibyte
      // characters, skip bytescanning and deserialize the block.
      if (useByteTree()) {
        if (!treeMatches(buf, pos)) {
          return;
        }
        LOG.info("{} match in byte block", ORsearch ? "OR" : "AND");
      }

      // If we know there is a match, then we can decode and go line by line.
      LOG.info("There is a match in this block.");
      GenericDatumReader<Record> datumReader = new GenericDatumReader<Record>(
          new Schema.Parser().parse(key.getSchema()),
          Schemas.getSchema("logBlock"));
      Decoder decoder = DecoderFactory.get().binaryDecoder(buf, 0, pos, null);
      Record record = null;

      for (int i = 0; i < entries; i++) {
        record = datumReader.read(record, decoder);
        LOG.trace("Read record {}", record);

        // The "second" field is multiplied by 1000 before being compared
        // with the start and end bounds.
        long recordMillis = (Long) record.get("second") * 1000;
        if (recordMillis < start || recordMillis >= end) {
          LOG.debug("Time out of range: {} < {} || {} >= {}", new Object[] {
              recordMillis, start, recordMillis, end });
          continue;
        }

        long blockNo = (Long) record.get("blockNumber");
        long createTime = (Long) record.get("createTime");
        long lineNumber = 0L;

        for (Record line : (List<Record>) record.get("logLines")) {
          String message = line.get("message").toString();
          ++lineNumber;

          // If we're not searching for multi-byte characters, use byte
          // scanning to determine if there is a match in a given message
          // line. Otherwise do the comparison using strings.
          boolean matchline;
          if (useByteTree()) {
            byte[] bytes = message.getBytes(UTF_8);
            matchline = treeMatches(bytes, bytes.length);
          } else {
            matchline = termsMatch(message);
          }
          if (!matchline) {
            continue;
          }

          LOG.info("Got match!");
          long ms = (Long) line.get("ms");
          StringBuilder sb = new StringBuilder()
              .append(recordMillis + ms).append(outputSeparator)
              .append(StringUtils.chomp(message)).append(outputSeparator)
              .append(line.get("eventId")).append(outputSeparator)
              .append(createTime).append(outputSeparator)
              .append(blockNo).append(outputSeparator)
              .append(lineNumber);
          context.write(new Text(sb.toString()), NullWritable.get());
        }
      }
    }
  }

  /**
   * Configures and runs the search job.
   *
   * @param args
   *          The search string directory, one or more input paths or globs,
   *          and the output directory, in that order.
   * @return 0 on success (or once the job is submitted, when not waiting for
   *         completion), 1 on a usage or configuration error or if the job
   *         fails.
   */
  @Override
  public int run(String[] args) throws Exception {
    Configuration conf = getConf(); // Configuration processed by ToolRunner

    // If run by Oozie, then load the Oozie conf too
    if (System.getProperty("oozie.action.conf.xml") != null) {
      conf.addResource(new URL("file://"
          + System.getProperty("oozie.action.conf.xml")));
    }

    // Load input files from the command line. Return the error status rather
    // than exiting the JVM, so that a caller going through ToolRunner gets it.
    if (args.length < 3) {
      System.out
          .println("usage: [genericOptions] searchStringDirectory input [input ...] output");
      return 1;
    }

    FileSystem fs = FileSystem.get(conf);

    // Get the files we need from the command line.
    String searchStringDir = args[0];
    // We are going to be reading all the files in this directory a lot. So
    // let's up the replication factor by a lot so that they're easy to read.
    for (FileStatus f : fs.listStatus(new Path(searchStringDir))) {
      fs.setReplication(f.getPath(), (short) 16);
    }

    List<Path> paths = new ArrayList<Path>();
    for (int i = 1; i < args.length - 1; i++) {
      // globStatus returns null for a non-glob path that does not exist.
      FileStatus[] matches = fs.globStatus(new Path(args[i]));
      if (matches == null) {
        LOG.warn("Input path does not exist: {}", args[i]);
        continue;
      }
      for (FileStatus f : matches) {
        paths.add(f.getPath());
      }
    }

    Path outputDir = new Path(args[args.length - 1]);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    Configuration jobConf = job.getConfiguration();

    job.setJarByClass(MultiSearch.class);
    jobConf.setIfUnset("mapred.job.name", "MultiSearch");

    // To propagate credentials within Oozie
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
      jobConf.set("mapreduce.job.credentials.binary",
          System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    // Good output separators include things that are unsupported by XML. So we
    // just send the byte value of the character through. The restriction here
    // is that it can't be more than 1 byte when UTF-8 encoded, since it will be
    // read by Pig which only deals with single byte separators.
    {
      String outputSeparator = jobConf.get("logdriver.output.field.separator",
          DEFAULT_OUTPUT_SEPARATOR);
      byte[] bytes = outputSeparator.getBytes(UTF_8);
      if (bytes.length != 1) {
        LOG.error("The output separator must be a single byte in UTF-8.");
        return 1;
      }

      jobConf.set("logdriver.output.field.separator", Byte.toString(bytes[0]));
    }

    jobConf.set("logdriver.search.string.dir", searchStringDir);

    // This search is generally too fast to make good use of 128MB blocks, so
    // let's set the value to 256MB (if it's not set already)
    if (jobConf.get("mapred.max.split.size") == null) {
      jobConf.setLong("mapred.max.split.size", 256L * 1024 * 1024);
    }

    job.setInputFormatClass(AvroBlockInputFormat.class);
    job.setMapperClass(SearchMapper.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setNumReduceTasks(0);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputDir);

    for (Path path : paths) {
      AvroBlockInputFormat.addInputPath(job, path);
    }

    // Run the job.
    if (conf.getBoolean("job.wait", DEFAULT_WAIT_JOB)) {
      return job.waitForCompletion(true) ? 0 : 1;
    } else {
      job.submit();
      return 0;
    }
  }

  public static void main(String[] args) throws Exception {
    // Let ToolRunner handle generic command-line options
    int res = ToolRunner.run(new Configuration(), new MultiSearch(), args);
    System.exit(res);
  }
}