org.dspace.statistics.util.SpiderDetectorServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for org.dspace.statistics.util.SpiderDetectorServiceImpl.java

Source

/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.statistics.util;

import org.apache.commons.configuration.ConversionException;
import org.apache.commons.lang.StringUtils;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

import javax.servlet.http.HttpServletRequest;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/**
 * SpiderDetectorServiceImpl decides whether a request comes from a spider
 * (web crawler).  A request is tested, in this order, against:
 * <ol>
 *   <li>User-Agent patterns read from {@code ${dspace.dir}/config/spiders/agents},</li>
 *   <li>IP addresses read from the files directly in
 *       {@code ${dspace.dir}/config/spiders},</li>
 *   <li>host name patterns read from {@code ${dspace.dir}/config/spiders/domains}.</li>
 * </ol>
 * All lists are loaded lazily the first time they are needed.
 *
 * @author kevinvandevelde at atmire.com
 * @author ben at atmire.com
 * @author Mark Diggory (mdiggory at atmire.com)
 * @author frederic at atmire.com
 */
public class SpiderDetectorServiceImpl implements SpiderDetectorService {

    private static final Logger log = LoggerFactory.getLogger(SpiderDetectorServiceImpl.class);

    /** Lazily cached value of the {@code useProxies} property; null until first read. */
    private Boolean useProxies;

    /** Lazily cached value of {@code usage-statistics.bots.case-insensitive}; null until first read. */
    private Boolean useCaseInsensitiveMatching;

    /** Compiled User-Agent patterns; filled on first use by {@link #loadPatterns}. */
    private final List<Pattern> agents = Collections.synchronizedList(new ArrayList<Pattern>());

    /** Compiled host name patterns; filled on first use by {@link #loadPatterns}. */
    private final List<Pattern> domains = Collections.synchronizedList(new ArrayList<Pattern>());

    private final ConfigurationService configurationService;

    /**
     * Sparse HashTable structure to hold IP address ranges.
     * Volatile and only ever assigned a completely built table, so readers
     * outside {@link #loadSpiderIpAddresses()} never see a half-filled one.
     */
    private volatile IPTable table = null;

    @Autowired(required = true)
    public SpiderDetectorServiceImpl(ConfigurationService configurationService) {
        this.configurationService = configurationService;
    }

    /**
     * @return the table of spider IP addresses, or null if it has not been loaded yet.
     */
    public IPTable getTable() {
        return table;
    }

    /**
     * Service Method for testing spiders against existing spider files.
     * <p>
     * In future spiders HashSet may be optimized as byte offset array to
     * improve performance and memory footprint further.
     *
     * @param clientIP address of the client.
     * @param proxyIPs comma-list of X-Forwarded-For addresses, or null.
     * @param hostname domain name of host, or null.
     * @param agent User-Agent header value, or null.
     * @return true if the client matches any spider characteristics list.
     */
    public boolean isSpider(String clientIP, String proxyIPs, String hostname, String agent) {
        // See if any agent patterns match
        if (null != agent && matchesAny("agents", agents, agent)) {
            return true;
        }

        // No.  See if any IP addresses match
        if (isUseProxies() && proxyIPs != null) {
            /* This header is a comma delimited list, commonly written "a, b, c",
             * so each entry is trimmed before it is looked up. */
            for (String xfip : proxyIPs.split(",")) {
                if (StringUtils.isNotBlank(xfip) && isSpider(xfip.trim())) {
                    return true;
                }
            }
        }

        if (isSpider(clientIP)) {
            return true;
        }

        // No.  See if any DNS names match; otherwise not a known spider.
        return null != hostname && matchesAny("domains", domains, hostname);
    }

    /**
     * Test a value against a pattern list, loading the list first if it is empty.
     * Case-insensitivity, when configured, is carried by the compiled patterns
     * themselves, so the value is matched as given.
     *
     * @param directory subdirectory of config/spiders that backs {@code patternList}.
     * @param patternList the list to load (if empty) and match against.
     * @param value the non-null text to test.
     * @return true if any pattern is found in {@code value}.
     */
    private boolean matchesAny(String directory, List<Pattern> patternList, String value) {
        synchronized (patternList) {
            if (patternList.isEmpty()) {
                loadPatterns(directory, patternList);
            }
        }

        // Matching runs outside the lock so that requests are not serialized
        // on it; the list is only appended to by the guarded load above.
        for (Pattern candidate : patternList) {
            // prevent matcher() invocation from a null Pattern object
            if (null != candidate && candidate.matcher(value).find()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Utility method which reads lines from a file & returns them in a Set.
     * Every line is trimmed; blank lines and lines whose first non-blank
     * character is '#' (comments) are skipped.
     *
     * @param patternFile the location of our spider file
     * @return the distinct entries of the file; empty if {@code patternFile}
     *      does not exist or is not a regular file.
     * @throws IOException if the file exists but cannot be read.
     */
    public Set<String> readPatterns(File patternFile) throws IOException {
        Set<String> patterns = new HashSet<>();

        // isFile() is false for a missing path as well as for a directory.
        if (!patternFile.isFile()) {
            return patterns;
        }

        //Read our file & get all them patterns.
        try (BufferedReader in = new BufferedReader(new FileReader(patternFile))) {
            String line;
            while ((line = in.readLine()) != null) {
                // Trim before the comment test so indented comments are not
                // mistaken for patterns.
                String entry = line.trim();
                if (entry.isEmpty() || entry.startsWith("#")) {
                    continue;
                }
                patterns.add(entry);
            }
        }
        return patterns;
    }

    /**
     * Load agent name patterns from all files in a single subdirectory of config/spiders.
     * <p>
     * When case insensitive matching is enabled the patterns are compiled with
     * {@link Pattern#CASE_INSENSITIVE} and {@link Pattern#UNICODE_CASE} rather
     * than by lower-casing the pattern text, which would alter regex escapes
     * such as {@code \S} or {@code \D}.  A pattern that does not compile is
     * logged and skipped so that it cannot block the remaining ones.
     *
     * @param directory simple directory name (e.g. "agents").
     *      "${dspace.dir}/config/spiders" will be prepended to yield the path to
     *      the directory of pattern files.
     * @param patternList patterns read from the files in {@code directory} will
     *      be added to this List.
     */
    private void loadPatterns(String directory, List<Pattern> patternList) {
        String dspaceHome = configurationService.getProperty("dspace.dir");
        File spidersDir = new File(dspaceHome, "config/spiders");
        File patternsDir = new File(spidersDir, directory);

        // listFiles() returns null if the directory cannot be read.
        File[] files = patternsDir.isDirectory() ? patternsDir.listFiles() : null;
        if (files == null) {
            log.info("No patterns loaded from {}", patternsDir.getPath());
            return;
        }

        int flags = isUseCaseInsensitiveMatching()
                ? Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE
                : 0;

        for (File file : files) {
            Set<String> patterns;
            try {
                patterns = readPatterns(file);
            } catch (IOException ex) {
                log.error("Patterns not read from {}:  {}", file.getPath(), ex.getMessage());
                continue;
            }

            for (String pattern : patterns) {
                try {
                    patternList.add(Pattern.compile(pattern, flags));
                } catch (PatternSyntaxException ex) {
                    log.warn("Skipping invalid pattern '{}' in {}:  {}",
                            pattern, file.getPath(), ex.getMessage());
                }
            }

            log.info("Loaded pattern file:  {}", file.getPath());
        }
    }

    /**
     * Service Method for testing spiders against existing spider files.
     * Uses the request's remote address, X-Forwarded-For header, remote host
     * and User-Agent header.
     *
     * @param request the request to examine.
     * @return true|false if the request was detected to be from a spider.
     */
    public boolean isSpider(HttpServletRequest request) {
        return isSpider(request.getRemoteAddr(), request.getHeader("X-Forwarded-For"), request.getRemoteHost(),
                request.getHeader("User-Agent"));
    }

    /**
     * Check individual IP is a spider.  The IP table is loaded on first use.
     *
     * @param ip the address to look up; null is never a spider.
     * @return if is spider IP; false as well when the lookup itself fails.
     */
    public boolean isSpider(String ip) {
        IPTable current = table;
        if (current == null) {
            loadSpiderIpAddresses();
            current = table;
        }

        if (ip == null) {
            return false;
        }

        try {
            return current.contains(ip);
        } catch (Exception e) {
            // Best effort: an address the table cannot handle is treated as
            // "not a spider", but leave a trace for diagnosis.
            log.debug("Could not check {} against the spider IP table:  {}", ip, e.getMessage());
            return false;
        }
    }

    /*
     *  loader to populate the table from files.
     *
     *  The table is filled completely before it is assigned to the field.
     *  A file that cannot be read, or an entry that cannot be resolved or
     *  added, is logged and skipped; the remaining files and entries are
     *  still loaded.
     */
    public synchronized void loadSpiderIpAddresses() {

        if (table != null) {
            return;
        }

        IPTable loaded = new IPTable();

        String filePath = configurationService.getProperty("dspace.dir");
        File spidersDir = new File(filePath, "config/spiders");

        // listFiles() returns null if the directory cannot be read.
        File[] files = spidersDir.isDirectory() ? spidersDir.listFiles() : null;

        if (files == null) {
            log.info("No spider file loaded");
        } else {
            for (File file : files) {
                if (file.isFile()) {
                    loadSpiderIpFile(file, loaded);
                }
            }
        }

        table = loaded;
    }

    /**
     * Add every entry of one spider IP file to a table.  Entries that do not
     * start with a digit are treated as host names and resolved first.
     *
     * @param file the file to read.
     * @param target the table receiving the addresses.
     */
    private void loadSpiderIpFile(File file, IPTable target) {
        Set<String> entries;
        try {
            entries = readPatterns(file);
        } catch (IOException e) {
            log.error("Error Loading Spiders:" + e.getMessage(), e);
            return;
        }

        for (String entry : entries) {
            log.debug("Loading {}", entry);
            try {
                String ip = entry;
                if (!Character.isDigit(entry.charAt(0))) {
                    ip = DnsLookup.forward(entry);
                    log.debug("Resolved to {}", ip);
                }
                target.add(ip);
            } catch (IOException e) {
                log.warn("Not loading {}:  {}", entry, e.getMessage());
            } catch (IPTable.IPFormatException e) {
                log.warn("Not loading {} from {}:  {}", entry, file, e.getMessage());
            }
        }
        log.info("Loaded Spider IP file: {}", file);
    }

    /**
     * checks if case insensitive matching is enabled
     * @return true if it's enabled, false if not (or if the property is not a boolean)
     */
    private boolean isUseCaseInsensitiveMatching() {
        if (useCaseInsensitiveMatching == null) {
            try {
                useCaseInsensitiveMatching = configurationService
                        .getBooleanProperty("usage-statistics.bots.case-insensitive");
            } catch (ConversionException e) {
                useCaseInsensitiveMatching = false;
                log.warn("Please use a boolean value for usage-statistics.bots.case-insensitive");
            }
        }

        return useCaseInsensitiveMatching;
    }

    /**
     * checks if X-Forwarded-For addresses should be examined
     * @return true if it's enabled, false if not (or if the property is not a boolean)
     */
    private boolean isUseProxies() {
        if (useProxies == null) {
            try {
                useProxies = configurationService.getBooleanProperty("useProxies");
            } catch (ConversionException e) {
                useProxies = false;
                log.warn("Please use a boolean value for useProxies");
            }
        }

        return useProxies;
    }

}