esiptestbed.mudrod.weblog.pre.CrawlerDetection.java Source code

Java tutorial

Introduction

Here is the source code for esiptestbed.mudrod.weblog.pre.CrawlerDetection.java, a Mudrod preprocessing step that detects known Web crawlers in the raw web logs before further analysis.

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License"); you 
 * may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package esiptestbed.mudrod.weblog.pre;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.joda.time.DateTime;
import org.joda.time.Seconds;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;
import org.elasticsearch.search.aggregations.bucket.histogram.Histogram;
import org.elasticsearch.search.aggregations.bucket.histogram.Histogram.Order;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;

import esiptestbed.mudrod.discoveryengine.DiscoveryStepAbstract;
import esiptestbed.mudrod.driver.ESDriver;
import esiptestbed.mudrod.driver.SparkDriver;
import esiptestbed.mudrod.main.MudrodConstants;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An {@link esiptestbed.mudrod.discoveryengine.DiscoveryStepAbstract} implementation
 * which detects a known list of Web crawlers that may be present within, and
 * pollute, various logs acting as input to Mudrod.
 */
public class CrawlerDetection extends DiscoveryStepAbstract {
    private static final long serialVersionUID = 1L;
    private static final Logger LOG = LoggerFactory.getLogger(CrawlerDetection.class);

    public static final String CRAWLER = "crawler";
    public static final String GOOGLE_BOT = "googlebot";
    public static final String BING_BOT = "bingbot";
    public static final String YAHOO_BOT = "slurp";
    public static final String YACY_BOT = "yacybot";
    public static final String ROGER_BOT = "rogerbot";
    public static final String YANDEX_BOT = "yandexbot";

    public static final String NO_AGENT_BOT = "-";
    public static final String PERL_BOT = "libwww-perl/";
    public static final String APACHE_HTTP = "apache-httpclient/";
    public static final String JAVA_CLIENT = "java/";
    public static final String CURL = "curl/";
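    // The agent names above are matched case-insensitively as substrings of the logged
    // agent string, except NO_AGENT_BOT ("-"), which marks a missing agent string.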

    /**
     * Parameterized constructor to instantiate a configured instance of 
     * {@link esiptestbed.mudrod.weblog.pre.CrawlerDetection}
     * @param props populated {@link java.util.Properties} object
     * @param es {@link esiptestbed.mudrod.driver.ESDriver} object to use in crawler detection preprocessing.
     * @param spark {@link esiptestbed.mudrod.driver.SparkDriver} object to use in crawler detection preprocessing.
     */
    public CrawlerDetection(Properties props, ESDriver es, SparkDriver spark) {
        super(props, es, spark);
    }

    @Override
    public Object execute() {
        LOG.info("Starting Crawler detection.");
        startTime = System.currentTimeMillis();
        try {
            checkByRate();
        } catch (InterruptedException | IOException e) {
            LOG.error("Encountered an error whilst detecting Web crawlers.", e);
        }
        endTime = System.currentTimeMillis();
        es.refreshIndex();
        LOG.info("Crawler detection complete. Time elapsed {} seconds", (endTime - startTime) / 1000);
        return null;
    }

    /**
     * Check for known crawlers by comparing the agent name against a list of
     * known crawler agent names.
     * @param agent the agent name from a log line
     * @return true if the log line was initiated by a crawler, false otherwise
     */
    public boolean checkKnownCrawler(String agent) {
        agent = agent.toLowerCase();
        // "-" marks a missing agent string, so it is matched exactly rather than as a substring.
        return agent.equals(NO_AGENT_BOT) || agent.contains(CRAWLER) || agent.contains(GOOGLE_BOT)
                || agent.contains(BING_BOT) || agent.contains(YAHOO_BOT) || agent.contains(YACY_BOT)
                || agent.contains(ROGER_BOT) || agent.contains(YANDEX_BOT) || agent.contains(PERL_BOT)
                || agent.contains(APACHE_HTTP) || agent.contains(JAVA_CLIENT) || agent.contains(CURL);
    }
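
    /*
     * Illustrative (hypothetical) agent strings and the result of the check above:
     *
     *   checkKnownCrawler("Mozilla/5.0 (compatible; Googlebot/2.1)") // true, contains "googlebot"
     *   checkKnownCrawler("curl/7.47.0")                             // true, contains "curl/"
     *   checkKnownCrawler("-")                                       // true, missing agent string
     *   checkKnownCrawler("Mozilla/5.0 (Windows NT 10.0; rv:45.0)")  // false
     */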

    /**
     * Check crawlers by request sending rate, which is read from the configuration file
     * @throws InterruptedException InterruptedException
     * @throws IOException IOException
     */
    public void checkByRate() throws InterruptedException, IOException {
        es.createBulkProcessor();

        int rate = Integer.parseInt(props.getProperty("sendingrate"));
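        // Bucket every log entry by client IP so each unique user can be checked individually.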
        SearchResponse sr = es.getClient().prepareSearch(props.getProperty(MudrodConstants.ES_INDEX_NAME))
                .setTypes(httpType).setQuery(QueryBuilders.matchAllQuery()).setSize(0)
                .addAggregation(AggregationBuilders.terms("Users").field("IP").size(0)).execute().actionGet();
        Terms users = sr.getAggregations().get("Users");

        int userCount = 0;

        // Extracts the resource path from a lowercased request line (e.g. "get /path http/1.1").
        Pattern pattern = Pattern.compile("get (.*?) http/*");
        Matcher matcher;
        for (Terms.Bucket entry : users.getBuckets()) {
            QueryBuilder filterSearch = QueryBuilders.boolQuery()
                    .must(QueryBuilders.termQuery("IP", entry.getKey()));
            QueryBuilder querySearch = QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(), filterSearch);

            // Histogram of this user's requests per minute, with the busiest minute first.
            AggregationBuilder aggregation = AggregationBuilders.dateHistogram("by_minute").field("Time")
                    .interval(DateHistogramInterval.MINUTE).order(Order.COUNT_DESC);
            SearchResponse checkRobot = es.getClient().prepareSearch(props.getProperty(MudrodConstants.ES_INDEX_NAME))
                    .setTypes(httpType, ftpType).setQuery(querySearch).setSize(0).addAggregation(aggregation)
                    .execute().actionGet();

            Histogram agg = checkRobot.getAggregations().get("by_minute");

            // Buckets are ordered by count descending, so the first bucket is the busiest minute.
            List<? extends Histogram.Bucket> botList = agg.getBuckets();
            long maxCount = botList.get(0).getDocCount();
            // Users whose busiest minute stays below the configured rate are treated as genuine users.
            if (maxCount < rate) {
                userCount++;
                DateTime dt1 = null;
                int toLast = 0;
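                // Scroll through all of this user's log entries, 100 hits at a time.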
                SearchResponse scrollResp = es.getClient().prepareSearch(props.getProperty(MudrodConstants.ES_INDEX_NAME))
                        .setTypes(httpType, ftpType).setScroll(new TimeValue(60000)).setQuery(querySearch)
                        .setSize(100).execute().actionGet();
                while (true) {
                    for (SearchHit hit : scrollResp.getHits().getHits()) {
                        Map<String, Object> result = hit.getSource();
                        String logtype = (String) result.get("LogType");
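                        // PO.DAAC entries carry a raw request line; rebuild the full request URL from it.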
                        if (logtype.equals("PO.DAAC")) {
                            String request = (String) result.get("Request");
                            matcher = pattern.matcher(request.trim().toLowerCase());
                            boolean find = false;
                            while (matcher.find()) {
                                request = matcher.group(1);
                                result.put("RequestUrl", "http://podaac.jpl.nasa.gov" + request);
                                find = true;
                            }
                            if (!find) {
                                result.put("RequestUrl", request);
                            }
                        } else {
                            result.put("RequestUrl", (String) result.get("Request"));
                        }

                        DateTimeFormatter fmt = ISODateTimeFormat.dateTime();
                        DateTime dt2 = fmt.parseDateTime((String) result.get("Time"));

                        // "ToLast" records the gap in seconds between this request and the user's previous one.
                        if (dt1 == null) {
                            toLast = 0;
                        } else {
                            toLast = Math.abs(Seconds.secondsBetween(dt1, dt2).getSeconds());
                        }
                        result.put("ToLast", toLast);
                        IndexRequest ir = new IndexRequest(props.getProperty(MudrodConstants.ES_INDEX_NAME), cleanupType)
                                .source(result);

                        es.getBulkProcessor().add(ir);
                        dt1 = dt2;
                    }

                    // Fetch the next batch of scroll results; stop once the scroll is exhausted.
                    scrollResp = es.getClient().prepareSearchScroll(scrollResp.getScrollId())
                            .setScroll(new TimeValue(600000)).execute().actionGet();
                    if (scrollResp.getHits().getHits().length == 0) {
                        break;
                    }
                }

            }
        }
        es.destroyBulkProcessor();
        LOG.info("User count: {}", Integer.toString(userCount));
    }

    @Override
    public Object execute(Object o) {
        return null;
    }

}
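
Usage

The snippet below is a minimal sketch of running this step on its own; it is not part of the original source. It assumes ESDriver exposes a Properties-based constructor and SparkDriver a no-arg constructor (in the Mudrod project these drivers are normally created and wired together by the engine driver, so signatures may differ), and that MudrodConstants.ES_INDEX_NAME resolves to the "indexName" property key. The property values are placeholders.

    Properties props = new Properties();
    // Keys read by this class (see checkByRate()).
    props.setProperty("indexName", "mudrod");  // hypothetical Elasticsearch index name
    props.setProperty("sendingrate", "30");    // users reaching this many requests in one minute are treated as crawlers

    ESDriver es = new ESDriver(props);         // assumed constructor signature
    SparkDriver spark = new SparkDriver();     // assumed constructor signature

    CrawlerDetection step = new CrawlerDetection(props, es, spark);
    step.execute();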