Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package org.apache.beam.examples.oozie; import org.apache.beam.sdk.transforms.*; import org.apache.beam.sdk.values.PCollection; import org.apache.commons.lang.StringUtils; import java.io.Serializable; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.TimeZone; import java.util.regex.Matcher; import java.util.regex.Pattern; public class OozieAuditLogParser implements Serializable { public static final String MESSAGE_SPLIT_FLAG = "( - )"; public static final String ALLOW_ALL_REGEX = "(.*)"; private static final String COMMON_REGEX = "\\s([^\\]]*\\])"; private static final String TIMESTAMP_REGEX = "(\\d\\d\\d\\d-\\d\\d-\\d\\d \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d)"; private static final String WHITE_SPACE_REGEX = "\\s+"; private static final String LOG_LEVEL_REGEX = "(\\w+)"; private static final String OOZIEAUDIT_FLAG = "(\\w+:\\d+)"; private static final String PREFIX_REGEX = TIMESTAMP_REGEX + WHITE_SPACE_REGEX + LOG_LEVEL_REGEX + WHITE_SPACE_REGEX; private static final String IP = "IP"; private static final String USER = "USER"; private static final String GROUP = "GROUP"; private static final String APP = "APP"; private static final String JOBID = "JOBID"; private static final String OPERATION = "OPERATION"; private static final String PARAMETER = "PARAMETER"; private static final String STATUS = "STATUS"; private static final String HTTPCODE = "HTTPCODE"; private static final String ERRORCODE = "ERRORCODE"; private static final String ERRORMESSAGE = "ERRORMESSAGE"; private static final Pattern LOG_PATTERN = constructPattern(); public static long humanDateToMilliseconds(String date) throws ParseException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS"); sdf.setTimeZone(TimeZone.getDefault()); Date d = sdf.parse(date); return d.getTime(); } private static Pattern constructPattern() { List<String> patterns = new ArrayList<String>(11); patterns.add(IP); patterns.add(USER); patterns.add(GROUP); patterns.add(APP); patterns.add(JOBID); patterns.add(OPERATION); patterns.add(PARAMETER); patterns.add(STATUS); patterns.add(HTTPCODE); patterns.add(ERRORCODE); patterns.add(ERRORMESSAGE); StringBuilder sb = new StringBuilder(); sb.append(PREFIX_REGEX + OOZIEAUDIT_FLAG); sb.append(MESSAGE_SPLIT_FLAG); for (int i = 0; i < patterns.size(); i++) { sb.append("("); sb.append(patterns.get(i) + COMMON_REGEX); sb.append(")"); sb.append(ALLOW_ALL_REGEX); } String rs = StringUtils.removeEnd(sb.toString(), ALLOW_ALL_REGEX); return Pattern.compile(rs); } public OozieAuditLogObject parse(String logLine) throws Exception { OozieAuditLogObject oozieAuditLogObject = new OozieAuditLogObject(); Matcher matcher = LOG_PATTERN.matcher(logLine); if (!matcher.matches()) { return null; } applyValueTo(oozieAuditLogObject, matcher); return oozieAuditLogObject; } private void applyValueTo(OozieAuditLogObject oozieAuditLogObject, Matcher matcher) throws ParseException { oozieAuditLogObject.timestamp = humanDateToMilliseconds(matcher.group(1)); oozieAuditLogObject.level = matcher.group(2); oozieAuditLogObject.ip = StringUtils.removeEnd(StringUtils.removeStart(matcher.group(6), "["), "]"); oozieAuditLogObject.user = StringUtils.removeEnd(StringUtils.removeStart(matcher.group(9), "["), "]"); oozieAuditLogObject.group = StringUtils.removeEnd(StringUtils.removeStart(matcher.group(12), "["), "]"); oozieAuditLogObject.app = StringUtils.removeEnd(StringUtils.removeStart(matcher.group(15), "["), "]"); oozieAuditLogObject.jobId = StringUtils.removeEnd(StringUtils.removeStart(matcher.group(18), "["), "]"); oozieAuditLogObject.operation = StringUtils.removeEnd(StringUtils.removeStart(matcher.group(21), "["), "]"); oozieAuditLogObject.parameter = StringUtils.removeEnd(StringUtils.removeStart(matcher.group(24), "["), "]"); oozieAuditLogObject.status = StringUtils.removeEnd(StringUtils.removeStart(matcher.group(27), "["), "]"); oozieAuditLogObject.httpcode = StringUtils.removeEnd(StringUtils.removeStart(matcher.group(30), "["), "]"); oozieAuditLogObject.errorcode = StringUtils.removeEnd(StringUtils.removeStart(matcher.group(33), "["), "]"); oozieAuditLogObject.errormessage = StringUtils.removeEnd(StringUtils.removeStart(matcher.group(36), "["), "]"); } static class ParserFn extends DoFn<String, OozieAuditLogObject> { private final Aggregator<Long, Long> emptyLines = createAggregator("emptyLines", new Sum.SumLongFn()); private OozieAuditLogParser parser = new OozieAuditLogParser(); @ProcessElement public void processElement(ProcessContext c) throws Exception { if (c.element().trim().isEmpty()) { emptyLines.addValue(1L); } OozieAuditLogObject object = parser.parse(c.element()); if (object != null) c.output(object); } } public static class LogParser extends PTransform<PCollection<String>, PCollection<OozieAuditLogObject>> { @Override public PCollection<OozieAuditLogObject> apply(PCollection<String> lines) { return lines.apply(ParDo.of(new ParserFn())); } } /** A SimpleFunction that converts a Word and Count into a printable string. */ public static class FormatAsTextFn extends SimpleFunction<OozieAuditLogObject, String> { @Override public String apply(OozieAuditLogObject input) { return input.toString(); } } }