com.norconex.collector.http.crawler.BasicFeaturesTest.java Source code

Java tutorial

Introduction

Here is the source code for com.norconex.collector.http.crawler.BasicFeaturesTest.java

Source

/* Copyright 2014 Norconex Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.norconex.collector.http.crawler;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.CharEncoding;
import org.apache.commons.lang3.mutable.Mutable;
import org.apache.commons.lang3.mutable.MutableObject;
import org.junit.Assert;
import org.junit.Test;

import com.norconex.collector.http.HttpCollector;
import com.norconex.collector.http.doc.HttpDocument;
import com.norconex.collector.http.doc.HttpMetadata;
import com.norconex.commons.lang.file.FileUtil;
import com.norconex.commons.lang.file.IFileVisitor;

/**
 * Tests exercising basic crawler features: redirect handling, crawl depth,
 * collected metadata, kept downloads, maximum document count and user agent.
 * <p>
 * Each test creates a collector with a single crawler through
 * {@code newHttpCollector1Crawler(String)} and reads the results back with
 * {@code getCommitedDocuments(HttpCrawler)}; neither helper is defined in
 * this class (presumably both are inherited from {@link AbstractHttpTest}).
 * </p>
 *
 * @author Pascal Essiembre
 */
public class BasicFeaturesTest extends AbstractHttpTest {

    /**
     * Pattern the URLs referenced by the redirected page must match.
     * The dot is escaped so that only a literal ".html" suffix is accepted.
     */
    private static final String REDIRECTED_PAGE_REGEX =
            ".*/test/redirected/page[12]\\.html";

    /**
     * Constructor.
     */
    public BasicFeaturesTest() {
    }

    /**
     * Crawls a redirecting start URL with a max depth of 0 and verifies that
     * exactly one document is committed, that its reference is the redirect
     * target, and that its two referenced URLs point under the redirected
     * path.
     *
     * @throws IOException declared for the crawl/fetch helpers
     */
    @Test
    public void testRedirect() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=redirect");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        crawler.getCrawlerConfig().setMaxDepth(0);
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        assertListSize("document", docs, 1);

        HttpDocument doc = docs.get(0);
        String ref = doc.getReference();

        List<String> urls = doc.getMetadata().getStrings(HttpMetadata.COLLECTOR_URL);
        assertListSize("URL", urls, 1);

        Assert.assertTrue("Invalid redirection URL: " + ref,
                ref.contains("/test/redirected/page.html?case=redirect"));

        List<String> inPageUrls = doc.getMetadata().getStrings(
                HttpMetadata.COLLECTOR_REFERNCED_URLS);
        assertListSize("referenced URLs", inPageUrls, 2);

        // Relative links must have been resolved against the redirect target.
        for (String inPageUrl : inPageUrls) {
            Assert.assertTrue("Invalid relative URL: " + inPageUrl,
                    inPageUrl.matches(REDIRECTED_PAGE_REGEX));
        }
    }

    /**
     * Crawls the "basic" test case with a max depth of 10 and verifies both
     * the number of committed documents and the metadata of each one.
     *
     * @throws IOException declared for the crawl/fetch helpers
     */
    @Test
    public void testBasicFeatures() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=basic&depth=0");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        crawler.getCrawlerConfig().setMaxDepth(10);
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        testDepth(docs);
        for (HttpDocument httpDocument : docs) {
            testValidMetadata(httpDocument);
        }
    }

    /**
     * Crawls a single page with "keep downloads" enabled and verifies that a
     * file exists under the crawler's "downloads" work directory holding the
     * expected, unmodified markup.
     *
     * @throws IOException if the downloaded file cannot be read
     */
    @Test
    public void testKeepDownload() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler(
                "/test/a$dir/blah?case=keepDownloads");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        crawler.getCrawlerConfig().setMaxDepth(0);
        crawler.getCrawlerConfig().setKeepDownloads(true);
        collector.start(false);

        File downloadDir = new File(crawler.getCrawlerConfig().getWorkDir(), "downloads");
        final Mutable<File> downloadedFile = new MutableObject<>();
        FileUtil.visitAllFiles(downloadDir, new IFileVisitor() {
            @Override
            public void visit(File file) {
                // Keep only the first matching file that is visited.
                if (downloadedFile.getValue() != null) {
                    return;
                }
                if (file.toString().contains("downloads")) {
                    downloadedFile.setValue(file);
                }
            }
        });

        // Fail with a clear message rather than a NullPointerException when
        // nothing was saved.
        Assert.assertNotNull("No downloaded file found under: " + downloadDir,
                downloadedFile.getValue());

        // Explicit encoding so the result does not depend on the platform
        // default charset (the asserted markup is plain ASCII).
        String content = FileUtils.readFileToString(
                downloadedFile.getValue(), CharEncoding.UTF_8);
        Assert.assertTrue("Invalid or missing download file.", content.contains(
                "<b>This</b> file <i>must</i> be saved as is, with this <span>formatting</span>"));
    }

    /**
     * Crawls the "basic" test case with the maximum number of documents set
     * to 15 and verifies that exactly 15 documents are committed.
     *
     * @throws IOException declared for the crawl/fetch helpers
     */
    @Test
    public void testMaxURLs() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=basic&depth=0");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        crawler.getCrawlerConfig().setMaxDocuments(15);
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        assertListSize("URLs", docs, 15);
    }

    /**
     * Crawls a single page with a custom user agent configured and verifies
     * that the committed document content contains that user agent string.
     *
     * @throws IOException if the document content cannot be read
     */
    @Test
    public void testUserAgent() throws IOException {
        HttpCollector collector = newHttpCollector1Crawler("/test?case=userAgent");
        HttpCrawler crawler = (HttpCrawler) collector.getCrawlers()[0];
        crawler.getCrawlerConfig().setMaxDepth(0);
        crawler.getCrawlerConfig().setUserAgent("Super Secret Agent");
        collector.start(false);

        List<HttpDocument> docs = getCommitedDocuments(crawler);
        assertListSize("document", docs, 1);

        HttpDocument doc = docs.get(0);
        // Explicit encoding avoids relying on the platform default charset
        // (the asserted user agent string is plain ASCII).
        String content = IOUtils.toString(doc.getContent(), CharEncoding.UTF_8);
        Assert.assertTrue("Wrong or undetected User-Agent.",
                content.contains("Super Secret Agent"));
    }

    /**
     * Asserts that the expected number of documents was crawled for the
     * "basic" test case.
     *
     * @param docs committed documents
     */
    private void testDepth(List<HttpDocument> docs) {
        // 0-depth + 10 others == 11 expected files
        Assert.assertEquals("Did not crawl the right depth.", 11, docs.size());
    }

    /**
     * Asserts that the content type and encoding metadata fields of a
     * document are single-valued and hold the expected values.
     *
     * @param doc document whose metadata is checked
     */
    private void testValidMetadata(HttpDocument doc) {
        HttpMetadata meta = doc.getMetadata();

        //Test single value
        assertOneValue(meta, HttpMetadata.HTTP_CONTENT_TYPE,
                HttpMetadata.COLLECTOR_CONTENT_TYPE,
                HttpMetadata.COLLECTOR_CONTENT_ENCODING);

        //Test actual values
        Assert.assertEquals("Bad HTTP content-type", "text/html; charset=UTF-8",
                meta.getString(HttpMetadata.HTTP_CONTENT_TYPE));
        Assert.assertEquals("Bad Collection content-type.", "text/html",
                meta.getString(HttpMetadata.COLLECTOR_CONTENT_TYPE));
        Assert.assertEquals("Bad char-encoding.", CharEncoding.UTF_8,
                meta.getString(HttpMetadata.COLLECTOR_CONTENT_ENCODING));
    }

    /**
     * Asserts that a list is non-null and has the expected size.
     *
     * @param listName name used in the failure message
     * @param list list to check
     * @param size expected number of elements
     */
    private void assertListSize(String listName, List<?> list, int size) {
        Assert.assertNotNull("Missing " + listName + " list.", list);
        Assert.assertEquals("Wrong " + listName + " list size.", size, list.size());
    }

    /**
     * Asserts that each given metadata field holds exactly one value.
     *
     * @param meta metadata to inspect
     * @param fields names of the fields to check
     */
    private void assertOneValue(HttpMetadata meta, String... fields) {
        for (String field : fields) {
            Assert.assertEquals(field + " does not contain strictly 1 value.",
                    1, meta.getStrings(field).size());
        }
    }

}