// MUDROD weblog pre-processing: session statistics
/*
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gov.nasa.jpl.mudrod.weblog.pre;

import gov.nasa.jpl.mudrod.driver.ESDriver;
import gov.nasa.jpl.mudrod.driver.SparkDriver;
import gov.nasa.jpl.mudrod.main.MudrodConstants;
import gov.nasa.jpl.mudrod.weblog.structure.RequestUrl;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
import org.elasticsearch.search.aggregations.metrics.stats.Stats;
import org.elasticsearch.search.aggregations.metrics.stats.StatsAggregationBuilder;
import org.joda.time.DateTime;
import org.joda.time.Seconds;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.ExecutionException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

/**
 * Supports the ability to post-process sessions, including summarizing
 * statistics and filtering out sessions whose activity volume does not look
 * human (bot filtering via the {@code searchf}/{@code viewf}/{@code downloadf}
 * thresholds).
 */
public class SessionStatistic extends LogAbstract {

  private static final long serialVersionUID = 1L;
  private static final Logger LOG = LoggerFactory.getLogger(SessionStatistic.class);

  /**
   * Matches cleaned "get &lt;path&gt; http..." request lines; group(1) is the
   * request path. Compiled once instead of per processed session.
   */
  private static final Pattern REQUEST_PATTERN = Pattern.compile("get (.*?) http/*");

  public SessionStatistic(Properties props, ESDriver es, SparkDriver spark) {
    super(props, es, spark);
  }

  /**
   * Runs session summarization end to end and logs elapsed time.
   *
   * @return always {@code null} (result is written to Elasticsearch)
   */
  @Override
  public Object execute() {
    LOG.info("Starting Session Summarization.");
    startTime = System.currentTimeMillis();
    try {
      processSession();
    } catch (InterruptedException e) {
      // Restore the interrupt flag so callers can observe the interruption.
      Thread.currentThread().interrupt();
      LOG.error("Session summarization was interrupted.", e);
    } catch (IOException | ExecutionException e) {
      LOG.error("Error during session summarization.", e);
    }
    endTime = System.currentTimeMillis();
    es.refreshIndex();
    LOG.info("Session Summarization complete. Time elapsed {} seconds.", (endTime - startTime) / 1000);
    return null;
  }

  /**
   * Dispatches to sequential or Spark-parallel processing based on the
   * {@link MudrodConstants#PROCESS_TYPE} property. A missing or unrecognized
   * value is a no-op (constant-first equals avoids an NPE on a missing property).
   */
  public void processSession() throws InterruptedException, IOException, ExecutionException {
    String processingType = props.getProperty(MudrodConstants.PROCESS_TYPE);
    if ("sequential".equals(processingType)) {
      processSessionInSequential();
    } else if ("parallel".equals(processingType)) {
      processSessionInParallel();
    }
  }

  /**
   * Summarizes every session term bucket on the driver, one at a time.
   */
  public void processSessionInSequential() throws IOException, InterruptedException, ExecutionException {
    es.createBulkProcessor();
    Terms sessions = this.getSessionTerms();
    int sessionCount = 0;
    for (Terms.Bucket entry : sessions.getBuckets()) {
      // Sessions with fewer than 3 requests carry too little signal to summarize.
      if (entry.getDocCount() >= 3 && !entry.getKey().equals("invalid")) {
        String sessionId = entry.getKey().toString();
        sessionCount += processSession(es, sessionId);
      }
    }
    LOG.info("Final Session count: {}", sessionCount);
    es.destroyBulkProcessor();
  }

  /**
   * Extract the dataset ID from a long request.
   *
   * @param request raw log request
   * @return dataset ID, or {@code null} if the request contains none
   */
  public String findDataset(String request) {
    String prefix = "/dataset/";
    // The ID ends at the query string when present, otherwise at the next space.
    String terminator = request.contains("?") ? "?" : " ";
    Pattern p = Pattern.compile(Pattern.quote(prefix) + "(.*?)" + Pattern.quote(terminator));
    Matcher m = p.matcher(request);
    if (m.find()) {
      return m.group(1);
    }
    return null;
  }

  /**
   * Summarizes sessions in parallel with Spark. Each partition opens its own
   * {@link ESDriver}; it is released in a finally block so an exception while
   * processing one session does not leak the client.
   */
  public void processSessionInParallel() throws InterruptedException, IOException {
    List<String> sessions = this.getSessions();
    JavaRDD<String> sessionRDD = spark.sc.parallelize(sessions, partition);
    int sessionCount = sessionRDD.mapPartitions(new FlatMapFunction<Iterator<String>, Integer>() {
      @Override
      public Iterator<Integer> call(Iterator<String> sessionIds) throws Exception {
        ESDriver tmpEs = new ESDriver(props);
        List<Integer> counts = new ArrayList<>();
        // Seed with 0 so empty partitions still contribute a value to reduce().
        counts.add(0);
        try {
          tmpEs.createBulkProcessor();
          while (sessionIds.hasNext()) {
            counts.add(processSession(tmpEs, sessionIds.next()));
          }
        } finally {
          tmpEs.destroyBulkProcessor();
          tmpEs.close();
        }
        return counts.iterator();
      }
    }).reduce(new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer a, Integer b) {
        return a + b;
      }
    });
    LOG.info("Final Session count: {}", sessionCount);
  }

  /**
   * Summarizes a single session and, when it passes the bot-filtering
   * thresholds, indexes a session-statistics document.
   *
   * @param es ES driver used for queries and bulk indexing (may be a
   *     per-partition instance, not the class field)
   * @param sessionId session identifier to summarize
   * @return 1 if a statistics document was indexed, 0 otherwise
   */
  public int processSession(ESDriver es, String sessionId) throws IOException, InterruptedException, ExecutionException {
    String inputType = cleanupType;
    String outputType = sessionStats;
    DateTimeFormatter fmt = ISODateTimeFormat.dateTime();

    // Session duration comes from the min/max of the "Time" field across this
    // session's cleaned log entries.
    StatsAggregationBuilder statsAgg = AggregationBuilders.stats("Stats").field("Time");
    BoolQueryBuilder filterSearch = new BoolQueryBuilder();
    filterSearch.must(QueryBuilders.termQuery("SessionID", sessionId));
    SearchResponse sr = es.getClient().prepareSearch(logIndex).setTypes(inputType).setQuery(filterSearch)
        .addAggregation(statsAgg).execute().actionGet();
    Stats agg = sr.getAggregations().get("Stats");
    String min = agg.getMinAsString();
    String max = agg.getMaxAsString();
    DateTime start = fmt.parseDateTime(min);
    DateTime end = fmt.parseDateTime(max);
    int duration = Seconds.secondsBetween(start, end).getSeconds();

    // NOTE(review): request_rate is indexed but never computed anywhere in this
    // class; kept at 0 for document-schema compatibility.
    float requestRate = 0;
    int searchDataListRequestCount = 0;
    int searchDataRequestCount = 0;
    // NOTE(review): never incremented in this class; indexed as 0 for schema
    // compatibility.
    int searchDataListRequestByKeywordsCount = 0;
    int ftpRequestCount = 0;
    String ip = null;
    String keywords = "";
    String views = "";
    String downloads = "";

    // Scroll through every hit of the session in pages of 100.
    SearchResponse scrollResp = es.getClient().prepareSearch(logIndex).setTypes(inputType)
        .setScroll(new TimeValue(60000)).setQuery(filterSearch).setSize(100).execute().actionGet();
    while (true) {
      for (SearchHit hit : scrollResp.getHits().getHits()) {
        Map<String, Object> result = hit.getSource();
        String request = (String) result.get("Request");
        String logType = (String) result.get("LogType");
        ip = (String) result.get("IP");

        // Strip the "get ... http" envelope so only the request path remains.
        Matcher matcher = REQUEST_PATTERN.matcher(request.trim().toLowerCase());
        while (matcher.find()) {
          request = matcher.group(1);
        }

        if (request.contains("/datasetlist?")) {
          searchDataListRequestCount++;
          RequestUrl requestUrl = new RequestUrl();
          String infoStr = requestUrl.getSearchInfo(request) + ",";
          String info = es.customAnalyzing(props.getProperty("indexName"), infoStr);
          // "," means the analyzer produced no keywords for this request.
          if (!",".equals(info)) {
            if (keywords.isEmpty()) {
              keywords = keywords + info;
            } else {
              String[] items = info.split(",");
              List<String> existing = Arrays.asList(keywords.split(","));
              for (String item : items) {
                if (!existing.contains(item)) {
                  keywords = keywords + item + ",";
                }
              }
            }
          }
        }

        if (request.startsWith("/dataset/")) {
          searchDataRequestCount++;
          String view = findDataset(request);
          if (view != null) {
            views = appendUnique(views, view);
          }
        }

        if ("ftp".equals(logType)) {
          ftpRequestCount++;
          String requestLowercase = request.toLowerCase();
          // Image/text assets are not counted as real data downloads.
          if (!requestLowercase.endsWith(".jpg") && !requestLowercase.endsWith(".pdf")
              && !requestLowercase.endsWith(".txt") && !requestLowercase.endsWith(".gif")) {
            downloads = appendUnique(downloads, request);
          }
        }
      }

      scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId())
          .setScroll(new TimeValue(600000)).execute().actionGet();
      // Break condition: no more hits are returned.
      if (scrollResp.getHits().getHits().length == 0) {
        break;
      }
    }

    int keywordsNum = keywords.isEmpty() ? 0 : keywords.split(",").length;

    int sessionCount = 0;
    // A session is kept only when its activity volume is within the
    // human-plausible bounds configured by searchf/viewf/downloadf.
    if (searchDataListRequestCount != 0
        && searchDataListRequestCount <= Integer.parseInt(props.getProperty("searchf"))
        && searchDataRequestCount != 0
        && searchDataRequestCount <= Integer.parseInt(props.getProperty("viewf"))
        && ftpRequestCount <= Integer.parseInt(props.getProperty("downloadf"))) {
      String sessionUrl = props.getProperty("SessionPort") + props.getProperty("SessionUrl")
          + "?sessionid=" + sessionId + "&sessionType=" + outputType + "&requestType=" + inputType;
      sessionCount = 1;
      IndexRequest ir = new IndexRequest(logIndex, outputType).source(jsonBuilder().startObject()
          .field("SessionID", sessionId).field("SessionURL", sessionUrl).field("Duration", duration)
          .field("Number of Keywords", keywordsNum).field("Time", min).field("End_time", max)
          .field("searchDataListRequest_count", searchDataListRequestCount)
          .field("searchDataListRequest_byKeywords_count", searchDataListRequestByKeywordsCount)
          .field("searchDataRequest_count", searchDataRequestCount)
          .field("keywords", es.customAnalyzing(logIndex, keywords)).field("views", views)
          .field("downloads", downloads).field("request_rate", requestRate).field("Comments", "")
          .field("Validation", 0).field("Produceby", 0).field("Correlation", 0).field("IP", ip)
          .endObject());
      es.getBulkProcessor().add(ir);
    }
    return sessionCount;
  }

  /**
   * Appends {@code value} to a comma-separated list iff it is non-empty and not
   * already present as an exact entry. Exact token comparison fixes the former
   * substring check ({@code csv.contains(value)}), which silently dropped any
   * ID that was a substring of an already-recorded one.
   */
  private static String appendUnique(String csv, String value) {
    if (value == null || value.isEmpty()) {
      return csv;
    }
    if (csv.isEmpty()) {
      return value;
    }
    if (Arrays.asList(csv.split(",")).contains(value)) {
      return csv;
    }
    return csv + "," + value;
  }

  @Override
  public Object execute(Object o) {
    return null;
  }
}