co.cask.cdap.spark.app.SparkAppUsingGetDataset.java Source code

Introduction

Here is the source code for co.cask.cdap.spark.app.SparkAppUsingGetDataset.java. The application creates a FileSet dataset for raw log files and a KeyValueTable for aggregated statistics, then registers Java and Scala Spark programs that parse Common Log Format (CLF) entries and access the datasets via getDataset().

Source

/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.spark.app;

import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.dataset.lib.FileSet;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import scala.Tuple2;

import java.io.Serializable;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;

/**
 * App to test dataset access/update using getDataset() from a Spark program.
 */
public class SparkAppUsingGetDataset extends AbstractApplication {

    private static final Pattern CLF_LOG_PATTERN = Pattern.compile(
            //   IP                    id    user      date          request     code     size    referrer
            "^([\\d.]+|[:][:][\\d]) (\\S+) (\\S+) \\[([^\\]]+)\\] \"([^\"]+)\" (\\d{3}) ([-\"\\d]+) \"([^\"]+)\" " +
            // user agent
                    "\"([^\"]+)\"");

    @Override
    public void configure() {
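        // Create a FileSet for the raw log files (read and written as plain text)
        // and a KeyValueTable for the aggregated statistics, then register both
        // the Java and the Scala implementations of the log-parsing Spark program.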
        createDataset("logs", FileSet.class, FileSetProperties.builder().setInputFormat(TextInputFormat.class)
                .setOutputFormat(TextOutputFormat.class).build());
        createDataset("logStats", KeyValueTable.class.getName());
        addSpark(new SparkLogParser());
        addSpark(new ScalaSparkLogParser());
    }

    /**
     * Key identifying a unique (ip, user, request, code) combination. equals() and
     * hashCode() are implemented so that records can be grouped and reduced by this key.
     */
    public static final class LogKey implements Serializable {
        private final String ip;
        private final String user;
        private final String request;
        private final int code;

        public LogKey(String ip, String user, String request, int code) {
            this.ip = ip;
            this.user = user;
            this.request = request;
            this.code = code;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }

            LogKey that = (LogKey) o;

            return Objects.equals(ip, that.ip) && Objects.equals(user, that.user)
                    && Objects.equals(request, that.request) && code == that.code;
        }

        @Override
        public int hashCode() {
            return Objects.hash(ip, user, request, code);
        }

        @Override
        public String toString() {
            return "LogKey{" + "ip='" + ip + '\'' + ", user='" + user + '\'' + ", request='" + request + '\''
                    + ", code=" + code + '}';
        }
    }

    /**
     * Statistics for one LogKey: the number of requests and the total response size.
     */
    public static final class LogStats implements Serializable {
        private final int count;
        private final int size;

        public LogStats(int count, int size) {
            this.count = count;
            this.size = size;
        }

        // Combines two partial results, for example when reducing by key in Spark.
        public LogStats aggregate(LogStats that) {
            return new LogStats(count + that.count, size + that.size);
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }

        LogStats that = (LogStats) o;

            return count == that.count && size == that.size;
        }

        @Override
        public int hashCode() {
            return Objects.hash(count, size);
        }

        @Override
        public String toString() {
            return "LogStats{" + "count=" + count + ", size=" + size + '}';
        }
    }

    /**
     * Parses a single CLF log line into a (LogKey, LogStats) pair, or returns
     * {@code null} if the line does not match {@link #CLF_LOG_PATTERN}.
     */
    @Nullable
    static Tuple2<LogKey, LogStats> parse(Text log) {
        Matcher matcher = CLF_LOG_PATTERN.matcher(log.toString());
        if (matcher.find()) {
            String ip = matcher.group(1);
            String user = matcher.group(3);
            String request = matcher.group(5);
            int code = Integer.parseInt(matcher.group(6));
            // The size field of a CLF entry can be "-" when no body was returned;
            // treat that as zero bytes rather than failing in Integer.parseInt().
            String sizeGroup = matcher.group(7);
            int size = "-".equals(sizeGroup) ? 0 : Integer.parseInt(sizeGroup);
            return new Tuple2<>(new LogKey(ip, user, request, code), new LogStats(1, size));
        }
        return null;
    }
}
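
Example

The sketch below is not part of the original file; it is a minimal, self-contained illustration of how the parse() helper and LogStats.aggregate() could be exercised outside of Spark. It assumes hadoop-common and scala-library are on the classpath (the app already depends on both); the ParseDemo class name and the sample log lines are made up for this example. The class must live in the co.cask.cdap.spark.app package because parse() is package-private.

package co.cask.cdap.spark.app;

import co.cask.cdap.spark.app.SparkAppUsingGetDataset.LogKey;
import co.cask.cdap.spark.app.SparkAppUsingGetDataset.LogStats;
import org.apache.hadoop.io.Text;
import scala.Tuple2;

import java.util.HashMap;
import java.util.Map;

public class ParseDemo {

    public static void main(String[] args) {
        // Two hypothetical CLF lines with the same client, user, request, and code.
        String[] lines = {
            "127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] \"GET /index.html HTTP/1.0\" 200 2326 \"-\" \"Mozilla/4.08\"",
            "127.0.0.1 - frank [10/Oct/2000:13:57:01 -0700] \"GET /index.html HTTP/1.0\" 200 1024 \"-\" \"Mozilla/4.08\""
        };

        // Mimic the reduce-by-key step of a Spark program with a plain HashMap:
        // parse each line, then merge the stats of identical keys via aggregate().
        Map<LogKey, LogStats> stats = new HashMap<>();
        for (String line : lines) {
            Tuple2<LogKey, LogStats> parsed = SparkAppUsingGetDataset.parse(new Text(line));
            if (parsed == null) {
                continue; // skip lines that do not match the CLF pattern
            }
            stats.merge(parsed._1(), parsed._2(), LogStats::aggregate);
        }

        // Prints a single entry with count=2 and size=3350 (2326 + 1024).
        stats.forEach((key, value) -> System.out.println(key + " -> " + value));
    }
}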