com.norconex.collector.http.robot.impl.StandardRobotsTxtProviderTest.java Source code

Java tutorial

Introduction

Here is the source code for com.norconex.collector.http.robot.impl.StandardRobotsTxtProviderTest.java

Source

/* Copyright 2010-2014 Norconex Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.norconex.collector.http.robot.impl;

import com.norconex.collector.core.filter.IReferenceFilter;
import java.io.IOException;

import org.apache.commons.io.IOUtils;
import org.junit.Assert;
import org.junit.Test;

/**
 * @author Pascal Essiembre
 */
public class StandardRobotsTxtProviderTest {

    @Test
    public void testGetRobotsTxt() throws IOException {
        String robotTxt1 = "User-agent: *\n" + "Disallow: /dontgo/there/\n" + "User-agent: mister-crawler\n"
                + "Disallow: /bathroom/\n" + "User-agent: miss-crawler\n" + "Disallow: /tvremote/\n";
        String robotTxt2 = "User-agent: mister-crawler\n" + "Disallow: /bathroom/\n" + "User-agent: *\n"
                + "Disallow: /dontgo/there/\n";
        String robotTxt3 = "User-agent: miss-crawler\n" + "Disallow: /tvremote/\n" + "User-agent: *\n"
                + "Disallow: /dontgo/there/\n";
        String robotTxt4 = "User-agent: miss-crawler\n" + "Disallow: /tvremote/\n" + "User-agent: *\n"
                + "Disallow: /dontgo/there/\n" + "User-agent: mister-crawler\n" + "Disallow: /bathroom/\n";
        String robotTxt5 = "# robots.txt\n" + "User-agent: *\n" + "Disallow: /some/fake/ # Spiders, keep out! \n"
                + "Disallow: /spidertrap/\n" + "Allow: /open/\n" + "Allow: /\n";

        Assert.assertEquals("Robots.txt (Disallow:/bathroom/)",
                parseRobotRule("mister-crawler", robotTxt1)[1].toString());
        Assert.assertEquals("Robots.txt (Disallow:/bathroom/)",
                parseRobotRule("mister-crawler", robotTxt2)[0].toString());
        Assert.assertEquals("Robots.txt (Disallow:/dontgo/there/)",
                parseRobotRule("mister-crawler", robotTxt3)[0].toString());
        Assert.assertEquals("Robots.txt (Disallow:/bathroom/)",
                parseRobotRule("mister-crawler", robotTxt4)[1].toString());

        Assert.assertEquals("Robots.txt (Disallow:/some/fake/)",
                parseRobotRule("mister-crawler", robotTxt5)[0].toString());
        Assert.assertEquals("Robots.txt (Disallow:/spidertrap/)",
                parseRobotRule("mister-crawler", robotTxt5)[1].toString());
        Assert.assertEquals("Robots.txt (Allow:/open/)", parseRobotRule("mister-crawler", robotTxt5)[2].toString());

    }

    private IReferenceFilter[] parseRobotRule(String agent, String content, String url) throws IOException {
        StandardRobotsTxtProvider robotProvider = new StandardRobotsTxtProvider();
        return robotProvider.parseRobotsTxt(IOUtils.toInputStream(content), url, "mister-crawler").getFilters();
    }

    private IReferenceFilter[] parseRobotRule(String agent, String content) throws IOException {
        return parseRobotRule(agent, content, "http://www.testinguseragents.test/some/fake/url.html");
    }
}