org.apache.tika.server.TikaResourceTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tika.server.TikaResourceTest.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.server;

import org.apache.commons.codec.binary.Base64;
import org.apache.cxf.attachment.AttachmentUtil;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.server.resource.TikaResource;
import org.junit.Test;

import javax.ws.rs.ProcessingException;
import javax.ws.rs.core.Response;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

public class TikaResourceTest extends CXFTestBase {
    public static final String TEST_DOC = "test.doc";
    public static final String TEST_PASSWORD_PROTECTED = "password.xls";
    private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx";
    private static final String TEST_OOM = "mock/fake_oom.xml";

    private static final String STREAM_CLOSED_FAULT = "java.io.IOException: Stream Closed";

    private static final String TIKA_PATH = "/tika";
    private static final int UNPROCESSEABLE = 422;

    @Override
    protected void setUpResources(JAXRSServerFactoryBean sf) {
        sf.setResourceClasses(TikaResource.class);
        sf.setResourceProvider(TikaResource.class, new SingletonResourceProvider(new TikaResource()));
    }

    @Override
    protected void setUpProviders(JAXRSServerFactoryBean sf) {
        List<Object> providers = new ArrayList<Object>();
        providers.add(new TikaServerParseExceptionMapper(false));
        sf.setProviders(providers);
    }

    @Test
    public void testHelloWorld() throws Exception {
        Response response = WebClient.create(endPoint + TIKA_PATH).type("text/plain").accept("text/plain").get();
        assertEquals(TikaResource.GREETING, getStringFromInputStream((InputStream) response.getEntity()));
    }

    @Test
    public void testSimpleWord() throws Exception {
        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/msword").accept("text/plain")
                .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertTrue(responseMsg.contains("test"));
    }

    @Test
    public void testTextMain() throws Exception {
        //boilerpipe
        Response response = WebClient.create(endPoint + TIKA_PATH + "/main").accept("text/plain")
                .put(ClassLoader.getSystemResourceAsStream("testHTML.html"));
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertTrue(responseMsg.contains("Title : Test Indexation Html"));
        assertFalse(responseMsg.contains("Indexation du fichier"));
    }

    @Test
    public void testTextMainMultipart() throws Exception {
        //boilerpipe
        Attachment attachmentPart = new Attachment("myhtml", "text/html",
                ClassLoader.getSystemResourceAsStream("testHTML.html"));

        Response response = WebClient.create(endPoint + TIKA_PATH + "/form/main").type("multipart/form-data")
                .accept("text/plain").post(attachmentPart);
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertTrue(responseMsg.contains("Title : Test Indexation Html"));
        assertFalse(responseMsg.contains("Indexation du fichier"));
    }

    @Test
    public void testApplicationWadl() throws Exception {
        Response response = WebClient.create(endPoint + TIKA_PATH + "?_wadl").accept("text/plain").get();
        String resp = getStringFromInputStream((InputStream) response.getEntity());
        assertTrue(resp.startsWith("<application"));
    }

    @Test
    public void testPasswordXLS() throws Exception {
        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/vnd.ms-excel")
                .accept("text/plain").put(ClassLoader.getSystemResourceAsStream("password.xls"));

        assertEquals(UNPROCESSEABLE, response.getStatus());
    }

    @Test
    public void testSimpleWordHTML() throws Exception {
        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/msword").accept("text/html")
                .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertTrue(responseMsg.contains("test"));
        assertContains("<meta name=\"X-TIKA:digest:MD5\" content=\"f8be45c34e8919eedba48cc8d207fbf0\"/>",
                responseMsg);
        assertContains("<meta name=\"X-TIKA:digest:SHA1\" content=\"N4EBCE7EGTIGZWETEJ6WD3W4KN32TLPG\"/>",
                responseMsg);
    }

    @Test
    public void testPasswordXLSHTML() throws Exception {
        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/vnd.ms-excel")
                .accept("text/html").put(ClassLoader.getSystemResourceAsStream("password.xls"));

        assertEquals(UNPROCESSEABLE, response.getStatus());
    }

    @Test
    public void testSimpleWordXML() throws Exception {
        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/msword").accept("text/xml")
                .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertTrue(responseMsg.contains("test"));
    }

    @Test
    public void testPasswordXLSXML() throws Exception {
        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/vnd.ms-excel")
                .accept("text/xml").put(ClassLoader.getSystemResourceAsStream("password.xls"));

        assertEquals(UNPROCESSEABLE, response.getStatus());
    }

    @Test
    public void testSimpleWordMultipartXML() throws Exception {
        ClassLoader.getSystemResourceAsStream(TEST_DOC);
        Attachment attachmentPart = new Attachment("myworddoc", "application/msword",
                ClassLoader.getSystemResourceAsStream(TEST_DOC));
        WebClient webClient = WebClient.create(endPoint + TIKA_PATH + "/form");
        Response response = webClient.type("multipart/form-data").accept("text/xml").post(attachmentPart);
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertTrue(responseMsg.contains("test"));
        assertContains("<meta name=\"X-TIKA:digest:MD5\" content=\"f8be45c34e8919eedba48cc8d207fbf0\"/>",
                responseMsg);

    }

    @Test
    public void testJAXBAndActivationDependency() {
        //TIKA-2778
        AttachmentUtil.getCommandMap();
    }

    @Test
    public void testEmbedded() throws Exception {
        //first try text
        Response response = WebClient.create(endPoint + TIKA_PATH).accept("text/plain")
                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertTrue(responseMsg.contains("Course of human events"));

        //now go for xml -- different call than text
        response = WebClient.create(endPoint + TIKA_PATH).accept("text/xml")
                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
        responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertTrue(responseMsg.contains("Course of human events"));
        assertContains("<meta name=\"X-TIKA:digest:MD5\" content=\"59f626e09a8c16ab6dbc2800c685f772\"/>",
                responseMsg);

    }

    //TIKA-1845
    @Test
    public void testWMFInRTF() throws Exception {
        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/rtf").accept("text/plain")
                .put(ClassLoader.getSystemResourceAsStream("testRTF_npeFromWMFInTikaServer.rtf"));
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertTrue(responseMsg.contains("Example text"));
    }

    //TIKA-2638 and TIKA-2816
    @Test
    public void testOCRLanguageConfig() throws Exception {
        if (!new TesseractOCRParser().hasTesseract(new TesseractOCRConfig())) {
            return;
        }

        Response response = WebClient.create(endPoint + TIKA_PATH).accept("text/plain")
                .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX + "OcrStrategy", "ocr_only")
                .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX + "Language", "eng+fra")
                .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX + "MinFileSizeToOcr", "10")
                .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX + "MaxFileSizeToOcr", "1000000000")
                .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertContains("Happy New Year 2003!", responseMsg);
    }

    //TIKA-2290
    @Test
    public void testPDFOCRConfig() throws Exception {
        if (!new TesseractOCRParser().hasTesseract(new TesseractOCRConfig())) {
            return;
        }

        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/pdf").accept("text/plain")
                .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX + "OcrStrategy", "no_ocr")
                .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertTrue(responseMsg.trim().equals(""));

        response = WebClient.create(endPoint + TIKA_PATH).type("application/pdf").accept("text/plain")
                .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX + "OcrStrategy", "ocr_only")
                .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
        responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertContains("Happy New Year 2003!", responseMsg);

        //now try a bad value
        response = WebClient.create(endPoint + TIKA_PATH).type("application/pdf").accept("text/plain")
                .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX + "OcrStrategy", "non-sense-value")
                .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
        assertEquals(500, response.getStatus());
    }

    //TIKA-2669
    @Test
    public void testPDFConfig() throws Exception {

        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/pdf").accept("text/plain")
                .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
        assertEquals("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", responseMsg);

        response = WebClient.create(endPoint + TIKA_PATH).type("application/pdf").accept("text/plain")
                .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX + "sortByPosition", "false")
                .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
        responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
        assertEquals("Left column line 1 Left column line 2 Right column line 1 Right column line 2", responseMsg);

        //make sure that default reverts to initial config option
        response = WebClient.create(endPoint + TIKA_PATH).type("application/pdf").accept("text/plain")
                .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
        responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
        assertEquals("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", responseMsg);

    }

    @Test
    public void testExtractTextAcceptPlainText() throws Exception {
        //TIKA-2384
        Attachment attachmentPart = new Attachment("my-docx-file",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                ClassLoader.getSystemResourceAsStream("2pic.docx"));

        Response response = WebClient.create(endPoint + TIKA_PATH + "/form").type("multipart/form-data")
                .accept("text/plain").post(attachmentPart);

        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertTrue(responseMsg.contains("P1040893.JPG"));
        assertNotFound(STREAM_CLOSED_FAULT, responseMsg);
    }

    @Test
    public void testDataIntegrityCheck() throws Exception {
        Response response = null;
        try {
            response = WebClient.create(endPoint + TIKA_PATH).type("application/pdf").accept("text/plain")
                    .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX + "tesseractPath", "C://tmp//hello.bat\u0000")
                    .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
            assertEquals(400, response.getStatus());
        } catch (ProcessingException e) {
            //can't tell why this intermittently happens. :(
            //started after the upgrade to 3.2.7
        }

        response = WebClient.create(endPoint + TIKA_PATH).type("application/pdf").accept("text/plain")
                .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX + "tesseractPath", "bogus path")
                .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
        assertEquals(200, response.getStatus());
    }

    @Test
    public void testTrustedMethodPrevention() {
        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/pdf").accept("text/plain")
                .header(TikaResource.X_TIKA_OCR_HEADER_PREFIX + "trustedPageSeparator", "\u0020")
                .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
        assertEquals(500, response.getStatus());

    }

    @Test
    public void testFloatInHeader() {
        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/pdf").accept("text/plain")
                .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX + "averageCharTolerance", "2.0")
                .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
        assertEquals(200, response.getStatus());

    }

    @Test
    public void testOOMInLegacyMode() throws Exception {

        Response response = null;
        try {
            response = WebClient.create(endPoint + TIKA_PATH).accept("text/plain")
                    .put(ClassLoader.getSystemResourceAsStream(TEST_OOM));
        } catch (Exception e) {
            //oom may or may not cause an exception depending
            //on the timing
        }

        response = WebClient.create(endPoint + TIKA_PATH).accept("text/plain")
                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());

        assertContains("plundered our seas", responseMsg);
    }

    @Test
    public void testUnicodePasswordProtectedSpaces() throws Exception {
        //TIKA-2858
        final String password = "    ";
        final String encoded = new Base64().encodeAsString(password.getBytes(StandardCharsets.UTF_8));
        Response response = WebClient.create(endPoint + TIKA_PATH).accept("text/plain")
                .header(TikaResource.PASSWORD_BASE64_UTF8, encoded)
                .put(ClassLoader.getSystemResourceAsStream("testPassword4Spaces.pdf"));
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertContains("Just some text.", responseMsg);
    }

    @Test
    public void testUnicodePasswordProtectedUnicode() throws Exception {
        //TIKA-2858
        final String password = "  ! < > \" \\ \u20AC \u0153 \u00A4 \u0031\u2044\u0034 \u0031\u2044\u0032 \uD841\uDF0E \uD867\uDD98 \uD83D\uDE00  ";
        final String encoded = new Base64().encodeAsString(password.getBytes(StandardCharsets.UTF_8));
        Response response = WebClient.create(endPoint + TIKA_PATH).accept("text/plain")
                .header(TikaResource.PASSWORD_BASE64_UTF8, encoded)
                .put(ClassLoader.getSystemResourceAsStream("testUnicodePassword.pdf"));
        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
        assertContains("Just some text.", responseMsg);
    }
}