esiptestbed.mudrod.weblog.pre.SessionGenerator.java Source code

Java tutorial

Introduction

Here is the source code for esiptestbed.mudrod.weblog.pre.SessionGenerator.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License"); you 
 * may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package esiptestbed.mudrod.weblog.pre;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.update.UpdateRequest;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
import org.elasticsearch.search.aggregations.metrics.MetricsAggregationBuilder;
import org.elasticsearch.search.aggregations.metrics.stats.Stats;
import org.elasticsearch.search.sort.SortOrder;
import org.joda.time.DateTime;
import org.joda.time.Seconds;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import esiptestbed.mudrod.discoveryengine.DiscoveryStepAbstract;
import esiptestbed.mudrod.driver.ESDriver;
import esiptestbed.mudrod.driver.SparkDriver;
import esiptestbed.mudrod.main.MudrodConstants;
import esiptestbed.mudrod.weblog.structure.Session;

/**
 * Supports ability to generate user session by time threshold and referrer
 */
/**
 * Supports ability to generate user sessions from web logs by combining a
 * referrer chain with a time threshold, then merging adjacent sessions that
 * are close together in time.
 */
public class SessionGenerator extends DiscoveryStepAbstract {

    private static final long serialVersionUID = 1L;
    private static final Logger LOG = LoggerFactory.getLogger(SessionGenerator.class);

    /** Root URL of the site; a referrer outside this host starts a new session. */
    private static final String INDEX_URL = "http://podaac.jpl.nasa.gov/";
    /** Fraction of "-" referrers at/above which all of a user's logs are treated as invalid. */
    private static final double INVALID_RATE_THRESHOLD = 0.8;
    /** Minimum number of requests a user needs for session generation to be meaningful. */
    private static final long MIN_REQUESTS = 3;

    public SessionGenerator(Properties props, ESDriver es, SparkDriver spark) {
        super(props, es, spark);
    }

    /**
     * Runs session generation end-to-end and refreshes the index afterwards.
     *
     * @return always {@code null}
     */
    @Override
    public Object execute() {
        LOG.info("Starting Session Generation.");
        startTime = System.currentTimeMillis();
        generateSession();
        endTime = System.currentTimeMillis();
        es.refreshIndex();
        LOG.info("Session generating complete. Time elapsed {} seconds.", (endTime - startTime) / 1000);
        return null;
    }

    /**
     * Generates sessions in two passes: first by referrer and time threshold,
     * then by combining sessions separated by short gaps. Each pass gets its
     * own bulk processor so the first pass's updates are flushed before the
     * second pass queries them.
     */
    public void generateSession() {
        try {
            // Parse the shared threshold once instead of once per pass.
            int timegap = Integer.parseInt(props.getProperty("timegap"));

            es.createBulkProcessor();
            genSessionByReferer(timegap);
            es.destroyBulkProcessor();

            es.createBulkProcessor();
            combineShortSessions(timegap);
            es.destroyBulkProcessor();
        } catch (ElasticsearchException e) {
            LOG.error("Error whilst executing bulk processor.", e);
        } catch (IOException e) {
            LOG.error("Error whilst reading configuration.", e);
        }
    }

    /**
     * Method to generate session by time threshold and referrer.
     * For each user (IP bucket) the requests are scanned in Time order:
     * PO.DAAC requests are attached to the session that contains their
     * referrer (or start a new one), and ftp requests are split purely by
     * time gap.
     *
     * @param timeThres value of time threshold (s)
     * @throws ElasticsearchException ElasticsearchException
     * @throws IOException IOException
     */
    public void genSessionByReferer(int timeThres) throws ElasticsearchException, IOException {
        String index = props.getProperty("indexName");
        SearchResponse sr = es.getClient().prepareSearch(index).setTypes(this.cleanupType)
                .setQuery(QueryBuilders.matchAllQuery()).setSize(0)
                .addAggregation(AggregationBuilders.terms("Users").field("IP").size(0)).execute().actionGet();
        Terms users = sr.getAggregations().get("Users");

        for (Terms.Bucket entry : users.getBuckets()) {
            int sessionCountIn = 0;

            QueryBuilder filterSearch = QueryBuilders.boolQuery()
                    .must(QueryBuilders.termQuery("IP", entry.getKey()));
            QueryBuilder querySearch = QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(), filterSearch);

            SearchResponse scrollResp = es.getClient().prepareSearch(index)
                    .setTypes(this.cleanupType).setScroll(new TimeValue(60000)).setQuery(querySearch)
                    .addSort("Time", SortOrder.ASC).setSize(100).execute().actionGet();

            // sessionID -> (request URL -> time of that request). LinkedHashMap
            // keeps requests in insertion (i.e. chronological) order, which the
            // "last request" and referrer-rollback lookups below depend on —
            // plain HashMap iteration order made those lookups arbitrary.
            Map<String, Map<String, DateTime>> sessionReqs = new HashMap<>();
            String ip = entry.getKey().toString();
            DateTimeFormatter fmt = ISODateTimeFormat.dateTime();

            while (scrollResp.getHits().getHits().length != 0) {
                for (SearchHit hit : scrollResp.getHits().getHits()) {
                    Map<String, Object> result = hit.getSource();
                    String request = (String) result.get("RequestUrl");
                    String referer = (String) result.get("Referer");
                    String logType = (String) result.get("LogType");
                    DateTime time = fmt.parseDateTime((String) result.get("Time"));
                    String id = hit.getId();

                    if ("PO.DAAC".equals(logType)) {
                        // A missing referrer, the index page itself, or an
                        // external referrer all open a brand-new session.
                        if (referer == null || "-".equals(referer) || INDEX_URL.equals(referer)
                                || !referer.contains(INDEX_URL)) {
                            sessionCountIn++;
                            startNewSession(index, sessionReqs, ip, sessionCountIn, id, request, time);
                        } else {
                            sessionCountIn = assignByReferer(index, sessionReqs, ip, sessionCountIn, id,
                                    request, referer, time, timeThres);
                        }
                    } else if ("ftp".equals(logType)) {
                        // may affect computation efficiency
                        sessionCountIn = assignFtpRequest(index, sessionReqs, ip, sessionCountIn, id,
                                request, time, timeThres);
                    }
                }

                scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId())
                        .setScroll(new TimeValue(600000)).execute().actionGet();
            }
        }
    }

    /**
     * Creates the session {@code ip@num}, records the request in it, and tags
     * the log document with the new session ID.
     */
    private void startNewSession(String index, Map<String, Map<String, DateTime>> sessionReqs,
            String ip, int num, String id, String request, DateTime time) throws IOException {
        String sessionID = ip + "@" + num;
        sessionReqs.put(sessionID, new LinkedHashMap<String, DateTime>());
        sessionReqs.get(sessionID).put(request, time);
        update(index, this.cleanupType, id, "SessionID", sessionID);
    }

    /**
     * Walks back through this user's existing sessions looking for the one that
     * contains the referrer. If found within an acceptable time gap (the gap
     * allowance grows with how many clicks we rolled back), the request joins
     * that session; otherwise a new session is started.
     *
     * @return the (possibly incremented) per-user session counter
     */
    private int assignByReferer(String index, Map<String, Map<String, DateTime>> sessionReqs,
            String ip, int sessionCountIn, String id, String request, String referer, DateTime time,
            int timeThres) throws IOException {
        int count = sessionCountIn;
        int rollbackNum = 0;
        while (true) {
            Map<String, DateTime> requests = sessionReqs.get(ip + "@" + count);
            if (requests == null) {
                // No session exists at this slot yet — create it here and stop.
                startNewSession(index, sessionReqs, ip, count, id, request, time);
                return sessionCountIn;
            }

            List<String> keys = new ArrayList<>(requests.keySet());
            boolean foundReferer = false;
            for (int i = keys.size() - 1; i >= 0; i--) {
                rollbackNum++;
                if (keys.get(i).equalsIgnoreCase(referer)) {
                    foundReferer = true;
                    // Threshold scales with rollback distance: if the gap to the
                    // referring click exceeds timeThres * clicks rolled back,
                    // this is considered a new session.
                    if (Math.abs(Seconds.secondsBetween(requests.get(keys.get(i)), time)
                            .getSeconds()) < timeThres * rollbackNum) {
                        sessionReqs.get(ip + "@" + count).put(request, time);
                        update(index, this.cleanupType, id, "SessionID", ip + "@" + count);
                    } else {
                        sessionCountIn++;
                        startNewSession(index, sessionReqs, ip, sessionCountIn, id, request, time);
                    }
                    break;
                }
            }
            if (foundReferer) {
                return sessionCountIn;
            }

            count--;
            if (count < 0) {
                // Referrer not found in any earlier session — start a new one.
                sessionCountIn++;
                startNewSession(index, sessionReqs, ip, sessionCountIn, id, request, time);
                return sessionCountIn;
            }
        }
    }

    /**
     * Assigns an ftp request to the user's current session, or opens a new
     * session when the gap since the most recent request exceeds the threshold.
     *
     * @return the (possibly incremented) per-user session counter
     */
    private int assignFtpRequest(String index, Map<String, Map<String, DateTime>> sessionReqs,
            String ip, int sessionCountIn, String id, String request, DateTime time,
            int timeThres) throws IOException {
        Map<String, DateTime> requests = sessionReqs.get(ip + "@" + sessionCountIn);
        if (requests == null) {
            sessionReqs.put(ip + "@" + sessionCountIn, new LinkedHashMap<String, DateTime>());
        } else {
            // Compare against the most recently inserted (latest) request.
            List<String> keys = new ArrayList<>(requests.keySet());
            DateTime lastTime = requests.get(keys.get(keys.size() - 1));
            if (Math.abs(Seconds.secondsBetween(lastTime, time).getSeconds()) > timeThres) {
                sessionCountIn++;
                sessionReqs.put(ip + "@" + sessionCountIn, new LinkedHashMap<String, DateTime>());
            }
        }
        sessionReqs.get(ip + "@" + sessionCountIn).put(request, time);
        update(index, this.cleanupType, id, "SessionID", ip + "@" + sessionCountIn);
        return sessionCountIn;
    }

    /**
     * Merges each user's adjacent sessions when the time gap between them is
     * below the threshold. Users whose traffic is mostly referrer-less, or who
     * issued too few requests, are marked invalid instead.
     *
     * @param timeThres time threshold (s) between adjacent sessions
     * @throws ElasticsearchException ElasticsearchException
     * @throws IOException IOException
     */
    public void combineShortSessions(int timeThres) throws ElasticsearchException, IOException {
        String index = props.getProperty("indexName");
        SearchResponse sr = es.getClient().prepareSearch(index).setTypes(this.cleanupType)
                .setQuery(QueryBuilders.matchAllQuery())
                .addAggregation(AggregationBuilders.terms("Users").field("IP").size(0)).execute().actionGet();
        Terms users = sr.getAggregations().get("Users");

        for (Terms.Bucket entry : users.getBuckets()) {
            String user = entry.getKey().toString();

            QueryBuilder filterAll = QueryBuilders.boolQuery().must(QueryBuilders.termQuery("IP", entry.getKey()));
            QueryBuilder queryAll = QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(), filterAll);
            SearchResponse checkAll = es.getClient().prepareSearch(index)
                    .setTypes(this.cleanupType).setScroll(new TimeValue(60000)).setQuery(queryAll).setSize(0)
                    .execute().actionGet();
            long all = checkAll.getHits().getTotalHits();

            // Check the too-few-requests case first; this also guards the
            // division below against all == 0.
            if (all < MIN_REQUESTS) {
                deleteInvalid(user);
                continue;
            }

            QueryBuilder filterCheck = QueryBuilders.boolQuery().must(QueryBuilders.termQuery("IP", entry.getKey()))
                    .must(QueryBuilders.termQuery("Referer", "-"));
            QueryBuilder queryCheck = QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(), filterCheck);
            SearchResponse checkReferer = es.getClient().prepareSearch(index)
                    .setTypes(this.cleanupType).setScroll(new TimeValue(60000)).setQuery(queryCheck).setSize(0)
                    .execute().actionGet();
            long numInvalid = checkReferer.getHits().getTotalHits();

            // BUG FIX: "numInvalid / all" was long integer division, which
            // truncated the rate to 0 unless every request was invalid — the
            // 0.8 filter never fired. Divide as doubles instead.
            double invalidRate = (double) numInvalid / all;
            if (invalidRate >= INVALID_RATE_THRESHOLD) {
                deleteInvalid(user);
                continue;
            }

            QueryBuilder filterSearch = QueryBuilders.boolQuery()
                    .must(QueryBuilders.termQuery("IP", entry.getKey()));
            QueryBuilder querySearch = QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(), filterSearch);

            // Min/max of "Time" per SessionID yields each session's start/end.
            MetricsAggregationBuilder statsAgg = AggregationBuilders.stats("Stats").field("Time");
            SearchResponse srSession = es
                    .getClient().prepareSearch(index).setTypes(this.cleanupType)
                    .setScroll(new TimeValue(60000)).setQuery(querySearch).addAggregation(AggregationBuilders
                            .terms("Sessions").field("SessionID").size(0).subAggregation(statsAgg))
                    .execute().actionGet();
            Terms sessions = srSession.getAggregations().get("Sessions");

            List<Session> sessionList = new ArrayList<>();
            for (Terms.Bucket session : sessions.getBuckets()) {
                Stats agg = session.getAggregations().get("Stats");
                sessionList.add(new Session(props, es, agg.getMinAsString(), agg.getMaxAsString(),
                        session.getKey().toString()));
            }
            Collections.sort(sessionList);

            mergeAdjacentSessions(index, sessionList, timeThres);
        }
    }

    /**
     * Re-tags documents of each session whose gap to the previous session is
     * below the threshold so that both share the earlier session's ID.
     * Assumes {@code sessionList} is sorted chronologically.
     */
    private void mergeAdjacentSessions(String index, List<Session> sessionList, int timeThres)
            throws IOException {
        DateTimeFormatter fmt = ISODateTimeFormat.dateTime();
        String last = null;
        String lastnewID = null;
        String lastoldID = null;
        for (Session s : sessionList) {
            String current = s.getEndTime();
            // NOTE(review): the gap is measured end-of-previous to end-of-current,
            // not to the current session's start — kept to preserve the original
            // behavior; confirm whether start time was intended.
            if (last != null && Seconds.secondsBetween(fmt.parseDateTime(last), fmt.parseDateTime(current))
                    .getSeconds() < timeThres) {
                // Chain merges: if the previous session was itself merged, reuse
                // its new ID so the whole run collapses into one session.
                String targetID = (lastnewID == null) ? lastoldID : lastnewID;
                s.setNewID(targetID);

                QueryBuilder fs = QueryBuilders.boolQuery()
                        .must(QueryBuilders.termQuery("SessionID", s.getID()));
                QueryBuilder qs = QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(), fs);
                SearchResponse scrollResp = es.getClient().prepareSearch(index)
                        .setTypes(this.cleanupType).setScroll(new TimeValue(60000)).setQuery(qs)
                        .setSize(100).execute().actionGet();
                while (true) {
                    for (SearchHit hit : scrollResp.getHits().getHits()) {
                        update(index, this.cleanupType, hit.getId(), "SessionID", targetID);
                    }
                    scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId())
                            .setScroll(new TimeValue(600000)).execute().actionGet();
                    if (scrollResp.getHits().getHits().length == 0) {
                        break;
                    }
                }
            }
            lastoldID = s.getID();
            lastnewID = s.getNewID();
            last = current;
        }
    }

    /**
     * Method to remove invalid logs through IP address: every log document
     * from the given IP gets its SessionID set to "invalid".
     *
     * @param ip invalid IP address
     * @throws IOException IOException
     */
    public void deleteInvalid(String ip) throws IOException {
        String index = props.getProperty("indexName");
        QueryBuilder filterAll = QueryBuilders.boolQuery().must(QueryBuilders.termQuery("IP", ip));
        QueryBuilder queryAll = QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(), filterAll);

        SearchResponse scrollResp = es.getClient().prepareSearch(index)
                .setTypes(this.cleanupType).setScroll(new TimeValue(60000)).setQuery(queryAll).setSize(100)
                .execute().actionGet();
        while (true) {
            for (SearchHit hit : scrollResp.getHits().getHits()) {
                update(index, cleanupType, hit.getId(), "SessionID", "invalid");
            }

            scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId())
                    .setScroll(new TimeValue(600000)).execute().actionGet();
            if (scrollResp.getHits().getHits().length == 0) {
                break;
            }
        }
    }

    /**
     * Method to update a Elasticsearch record/document by id, field, and value.
     * The update is queued on the shared bulk processor, not executed immediately.
     *
     * @param index index name in Elasticsearch
     * @param type type name
     * @param id ID of the document that needs to be updated
     * @param field1 field of the document that needs to be updated
     * @param value1 value of the document that needs to be changed to
     * @throws IOException if the partial JSON document cannot be built
     */
    private void update(String index, String type, String id, String field1, Object value1) throws IOException {
        UpdateRequest ur = new UpdateRequest(index, type, id)
                .doc(jsonBuilder().startObject().field(field1, value1).endObject());
        es.getBulkProcessor().add(ur);
    }

    /**
     * Not used by this step.
     *
     * @param o ignored
     * @return always {@code null}
     */
    @Override
    public Object execute(Object o) {
        return null;
    }

}