com.widowcrawler.terminator.parse.Parser.java Source code

Introduction

Here is the source code for com.widowcrawler.terminator.parse.Parser.java
Source

/**
 * Copyright 2015 Scott Mansfield
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.widowcrawler.terminator.parse;

import com.widowcrawler.terminator.ParseException;
import com.widowcrawler.terminator.model.RobotsTxt;
import com.widowcrawler.terminator.model.Rule;
import com.widowcrawler.terminator.model.RuleType;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;

import java.io.IOException;
import java.io.InputStream;
import java.util.*;

/**
 * Parses robots.txt files.
 *
 * Attempts to follow the draft for a standard, but makes no promises.
 *
 * Draft: http://www.robotstxt.org/norobots-rfc.txt
 *
 * @author Scott Mansfield
 */
public class Parser {

    public static final String USER_AGENT = "User-Agent:";
    public static final String ALLOW = "Allow:";
    public static final String DISALLOW = "Disallow:";
    public static final String SITEMAP = "Sitemap:";

    private static final List<Character> whitespaceChars;
    private static final List<Character> endlineChars;

    static {
        whitespaceChars = Collections.unmodifiableList(Arrays.asList(' ', '\t'));
        endlineChars = Collections.unmodifiableList(Arrays.asList('\r', '\n'));
    }

    private String data;
    private int dataPtr;

    private Map<String, Set<Rule>> ruleSets;
    private Set<String> siteMapRefs;

    public Parser(String data) {
        this.data = data;
        this.dataPtr = 0;
        this.ruleSets = new HashMap<>();
        this.siteMapRefs = new HashSet<>();
    }

    public Parser(InputStream inputStream) throws IOException {
        this(readStreamAlwaysClose(inputStream));
    }

    // ouch, seems hacky but it works
    private static String readStreamAlwaysClose(InputStream inputStream) throws IOException {
        try {
            return IOUtils.toString(inputStream, "UTF-8");
        } finally {
            inputStream.close();
        }
    }

    public RobotsTxt parse() throws ParseException {
        robotsTxt();

        return new RobotsTxt(ruleSets, siteMapRefs);
    }

    //////////////////////////////////
    // Begin recursive descent parser
    //////////////////////////////////

    private void robotsTxt() throws ParseException {
        //System.out.println("robotsTxt()");
        while (!isEndOfFile()) {
            robotsTxtPart();
        }
    }

    private void robotsTxtPart() throws ParseException {
        //System.out.println("robotsTxtPart()");
        whitespace();

        if (isEndline()) {
            endline();
        } else if (isCommentStart()) {
            commentLine();
        } else if (isUserAgentStart()) {
            agentSpec();
        } else if (isSitemapRefStart()) {
            sitemapRef();
        } else {
            throw new ParseException(dataPtr, "Invalid line");
        }
    }

    private void whitespace() {
        //System.out.println("whitespace()");
        while (!isEndOfFile() && isWhitespace()) {
            next();
        }
    }

    private void commentLine() throws ParseException {
        //System.out.println("commentLine()");
        while (!isEndOfFile() && !isEndline()) {
            next();
        }

        endline();
    }

    private void endline() throws ParseException {
        //System.out.println("endline()");
        if (isEndOfFile())
            return;

        if (current() == '\n') {
            next();
        } else if (current() == '\r') {
            next();
            if (current() == '\n') {
                next();
            }
        } else {
            throw new ParseException(dataPtr, "Unexpected character: " + current());
        }
    }

    private void agentSpec() throws ParseException {
        //System.out.println("agentSpec()");
        Set<String> userAgents = userAgents();
        Set<Rule> ruleSet = new HashSet<>();

        whitespace();

        while (isRuleLineStart()) {
            Rule rule = ruleLine();

            if (rule != null) {
                ruleSet.add(rule);
                //System.out.println(rule.getRuleType());
                //System.out.println(rule.getPathMatch());
            }

            whitespace();
        }

        userAgents.forEach(userAgent -> ruleSets.put(userAgent, ruleSet));
    }

    private Set<String> userAgents() throws ParseException {
        Set<String> userAgents = new HashSet<>();

        while (isUserAgentStart()) {
            userAgents.add(userAgent());
        }

        return userAgents;
    }

    private String userAgent() throws ParseException {
        //System.out.println("userAgent()");
        skip(USER_AGENT.length());
        whitespace();

        String userAgent = userAgentIdentifier();

        if (isCommentStart()) {
            commentLine();
        } else if (isEndline()) {
            endline();
        } else {
            throw new ParseException(dataPtr, "Expected a User-agent line.");
        }

        return userAgent;
    }

    private String userAgentIdentifier() {
        //System.out.println("userAgentIdentifier()");
        int start = dataPtr;

        while (!isEndOfFile() && !isCommentStart() && !isEndline()) {
            next();
        }

        return StringUtils.trimToEmpty(data.substring(start, dataPtr));
    }

    private Rule ruleLine() throws ParseException {
        //System.out.println("ruleLine()");
        if (isEndline()) {
            endline();
        } else if (isCommentStart()) {
            commentLine();
        } else if (isRuleStart()) {
            return rule();
        } else {
            throw new ParseException(dataPtr, "Expected either 'Allow:' or 'Disallow:' to start a rule");
        }

        // blank lines return null
        return null;
    }

    private Rule rule() throws ParseException {
        //System.out.println("rule()");
        Rule rule;

        if (isAllowRule()) {
            rule = allowRule();
        } else if (isDisallowRule()) {
            rule = disallowRule();
        } else {
            throw new ParseException(dataPtr, "Expected either 'Allow:' or 'Disallow:' to start a rule");
        }

        if (isEndline()) {
            endline();
        } else if (isCommentStart()) {
            commentLine();
        } else if (!isEndOfFile()) {
            throw new ParseException(dataPtr, "Unexpected text after rule path");
        }

        return rule;
    }

    private Rule allowRule() {
        //System.out.println("allowRule()");
        skip(ALLOW.length());
        whitespace();
        String path = rulePath();

        return new Rule(RuleType.ALLOW, path);
    }

    private Rule disallowRule() {
        //System.out.println("disallowRule()");
        skip(DISALLOW.length());
        whitespace();
        String path = rulePath();

        return new Rule(RuleType.DISALLOW, path);
    }

    private String rulePath() {
        //System.out.println("rulePath()");
        int start = dataPtr;

        while (!isEndOfFile() && !isCommentStart() && !isEndline() && !isWhitespace()) {
            next();
        }

        String path = StringUtils.trimToEmpty(data.substring(start, dataPtr));

        whitespace();

        return path;
    }

    private void sitemapRef() throws ParseException {
        //System.out.println("sitemapRef");
        skip(SITEMAP.length());
        whitespace();

        String url = sitemapRefIdentifier();
        siteMapRefs.add(url);

        endline();
    }

    private String sitemapRefIdentifier() {
        //System.out.println("sitemapRefIdentifier()");
        int start = dataPtr;

        while (!isEndOfFile() && !isCommentStart() && !isEndline() && !isWhitespace()) {
            next();
        }

        String url = StringUtils.trimToEmpty(data.substring(start, dataPtr));

        whitespace();

        return url;
    }

    //////////////////////////
    // Begin helper functions
    //////////////////////////

    private Character current() {
        return data.charAt(dataPtr);
    }

    private void next() {
        dataPtr++;
    }

    private void skip(int numChars) {
        for (int i = 0; i < numChars; i++) {
            next();
        }
    }

    // TODO: Maybe these ought to be case-agnostic
    private boolean isWhitespace() {
        return !isEndOfFile() && whitespaceChars.contains(current());
    }

    private boolean isCommentStart() {
        return !isEndOfFile() && current() == '#';
    }

    private boolean isEndline() {
        return !isEndOfFile() && endlineChars.contains(current());
    }

    private boolean isUserAgentStart() {
        return !isEndOfFile() && matchStringStart(USER_AGENT);
    }

    private boolean isRuleLineStart() {
        return isRuleStart() || isCommentStart() || isEndline();
    }

    private boolean isRuleStart() {
        return isAllowRule() || isDisallowRule();
    }

    private boolean isAllowRule() {
        return !isEndOfFile() && matchStringStart(ALLOW);
    }

    private boolean isDisallowRule() {
        return !isEndOfFile() && matchStringStart(DISALLOW);
    }

    private boolean isEndOfFile() {
        return dataPtr >= data.length();
    }

    private boolean isSitemapRefStart() {
        return !isEndOfFile() && matchStringStart(SITEMAP);
    }

    private boolean matchStringStart(String toMatch) {
        // be permissive with casing
        return StringUtils.equalsIgnoreCase(data.substring(dataPtr, dataPtr + toMatch.length()), toMatch);
    }
}