org.apache.tika.cli.TikaCLIBatchIT.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tika.cli.TikaCLIBatchIT.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.cli;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.Reader;
import java.lang.ProcessBuilder.Redirect;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

public class TikaCLIBatchIT {

    private Path testResourcesDir = Paths.get("src/test/resources");
    private String testInputDirForCommandLine;
    private Path tempOutputDir;
    private String tempOutputDirForCommandLine;
    private OutputStream out = null;
    private OutputStream err = null;
    private ByteArrayOutputStream outContent = null;
    private ProcessBuilder builder;

    @Before
    public void setup() throws Exception {
        builder = new ProcessBuilder();
        builder.directory(new File("target"));
        tempOutputDir = Files.createTempDirectory("tika-cli-test-batch-");
        ByteArrayOutputStream errBuffer = new ByteArrayOutputStream();
        outContent = new ByteArrayOutputStream();
        out = System.out;
        err = System.err;

        PrintStream outWriter = new PrintStream(outContent, true, UTF_8.name());
        PrintStream errWriter = new PrintStream(errBuffer, true, UTF_8.name());

        System.setOut(outWriter);
        System.setErr(errWriter);
        testInputDirForCommandLine = testResourcesDir.toAbsolutePath().toString() + File.separator + "test-data";
        tempOutputDirForCommandLine = tempOutputDir.toAbsolutePath().toString();
    }

    @After
    public void tearDown() throws Exception {
        System.setOut(new PrintStream(out, true, UTF_8.name()));
        System.setErr(new PrintStream(err, true, UTF_8.name()));
        //TODO: refactor to use our deleteDirectory with straight path
        //FileUtils.deleteDirectory(tempOutputDir.toFile());
    }

    @Test
    public void testSimplestBatchIntegration() throws Exception {
        String[] params = { testInputDirForCommandLine, tempOutputDirForCommandLine };
        runFramework(params);

        assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml"));
        assertFileExists(tempOutputDir.resolve("coffee.xls.xml"));
    }

    @Test
    public void testBasicBatchIntegration() throws Exception {
        String[] params = { "-i", testInputDirForCommandLine, "-o", tempOutputDirForCommandLine, "-numConsumers",
                "2" };
        runFramework(params);

        assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml"));
        assertFileExists(tempOutputDir.resolve("coffee.xls.xml"));
    }

    @Test
    public void testJsonRecursiveBatchIntegration() throws Exception {
        String[] params = { "-i", testInputDirForCommandLine, "-o", tempOutputDirForCommandLine, "-numConsumers",
                "10", "-J", //recursive Json
                "-t" //plain text in content
        };
        runFramework(params);

        Path jsonFile = tempOutputDir.resolve("test_recursive_embedded.docx.json");
        try (Reader reader = Files.newBufferedReader(jsonFile, UTF_8)) {
            List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
            assertEquals(12, metadataList.size());
            assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events"));
        }
    }

    @Test
    public void testProcessLogFileConfig() throws Exception {
        File logProps = new File(
                testResourcesDir.toAbsolutePath().toString() + "/log4j_batch_process_test.properties");

        String[] params = { "-i", testInputDirForCommandLine, "-o", tempOutputDirForCommandLine, "-numConsumers",
                "2", "-JDlog4j.configuration=" + logProps.toURI().toString() };
        runFramework(params);

        assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml"));
        assertFileExists(tempOutputDir.resolve("coffee.xls.xml"));
        assertTrue(outContent.toString(UTF_8.name()).contains("MY_CUSTOM_LOG_CONFIG"));
    }

    @Test
    public void testDigester() throws Exception {
        /*
                try {
        String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
                "-o", escape(tempOutputDir.getAbsolutePath()),
                "-numConsumers", "10",
                "-J", //recursive Json
                "-t" //plain text in content
        };
        TikaCLI.main(params);
        reader = new InputStreamReader(
                new FileInputStream(new File(tempOutputDir, "test_recursive_embedded.docx.json")), UTF_8);
        List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
        assertEquals(12, metadataList.size());
        assertEquals("59f626e09a8c16ab6dbc2800c685f772", metadataList.get(0).get("X-TIKA:digest:MD5"));
        assertEquals("22e6e91f408d018417cd452d6de3dede", metadataList.get(5).get("X-TIKA:digest:MD5"));
                } finally {
        IOUtils.closeQuietly(reader);
                }
        */
        String[] params = { "-i", testInputDirForCommandLine, "-o", tempOutputDirForCommandLine, "-numConsumers",
                "10", "-J", //recursive Json
                "-t", //plain text in content
                "-digest", "sha512" };
        runFramework(params);
        Path jsonFile = tempOutputDir.resolve("test_recursive_embedded.docx.json");
        try (Reader reader = Files.newBufferedReader(jsonFile, UTF_8)) {

            List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
            assertEquals(12, metadataList.size());
            assertNotNull(metadataList.get(0).get("X-TIKA:digest:SHA512"));
            assertTrue(metadataList.get(0).get("X-TIKA:digest:SHA512").startsWith("ee46d973ee1852c01858"));
        }
    }

    private void assertFileExists(Path path) {
        assertTrue("File doesn't exist: " + path.toAbsolutePath(), Files.isRegularFile(path));
    }

    public void runFramework(String[] params) throws Exception {

        List<String> forkCommand = new ArrayList<String>();
        forkCommand.add("java");
        forkCommand.add("-cp");
        forkCommand.add(System.getProperty("project.bundle.file"));
        forkCommand.add("org.apache.tika.main.Main");
        forkCommand.addAll(Arrays.asList(params));
        builder.command(forkCommand);
        final Process process = builder.start();

        IOUtils.copy(process.getInputStream(), outContent);

        process.waitFor();
    }

}