Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.cli;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.lang.ProcessBuilder.Redirect;
import java.net.URI;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.main.Main;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * Integration tests for the Tika command line interface.
 * <p>
 * Every test forks a separate {@code java} process running
 * {@code org.apache.tika.main.Main} (see {@link #runFramework(String[], String[])})
 * and asserts on the text that process wrote to its standard output or
 * standard error.
 */
public class TikaCLIIT {

    /* Test members */

    /** Collects the forked process' standard output (and this JVM's System.out). */
    private ByteArrayOutputStream outContent = null;

    /** Collects the forked process' standard error (and this JVM's System.err). */
    private ByteArrayOutputStream errContent = null;

    /** Original System.out, restored in {@link #tearDown()}. */
    private PrintStream stdout = null;

    /** Original System.err, restored in {@link #tearDown()}. */
    private PrintStream errout = null;

    /** Directory holding the test documents, relative to the JVM working directory. */
    private final Path testInputDir = Paths.get("src/test/resources/test-data");

    /** Absolute path of {@link #testInputDir} with a trailing "/". */
    private String resourcePrefix;

    /** Builder used to fork the CLI; its working directory is set to "target". */
    private ProcessBuilder builder = null;

    /**
     * Creates fresh output buffers, prepares the process builder and redirects
     * System.out / System.err of the test JVM into the buffers.
     *
     * @throws Exception if the redirecting print streams cannot be created
     */
    @Before
    public void setUp() throws Exception {
        builder = new ProcessBuilder();
        builder.directory(new File("target"));
        outContent = new ByteArrayOutputStream();
        errContent = new ByteArrayOutputStream();
        resourcePrefix = testInputDir.toAbsolutePath().toString() + "/";
        stdout = System.out;
        errout = System.err;
        System.setOut(new PrintStream(outContent, true, UTF_8.name()));
        System.setErr(new PrintStream(errContent, true, UTF_8.name()));
    }

    /**
     * Tests --list-parser-detail option of the cli
     *
     * @throws Exception if the CLI cannot be run
     */
    @Test
    public void testListParserDetail() throws Exception {
        String[] params = { "--list-parser-detail" };
        runFramework(params);
        assertTrue(outContent.toString(UTF_8.name()).contains("application/vnd.oasis.opendocument.text-web"));
    }

    /**
     * Tests --list-parser option of the cli. Currently only checks that the
     * CLI can be launched; the content assertion is disabled (see below).
     *
     * @throws Exception if the CLI cannot be run
     */
    @Test
    public void testListParsers() throws Exception {
        String[] params = { "--list-parser" };
        runFramework(params);
        //Assert was commented temporarily for finding the problem
        // Assert.assertTrue(outContent != null && outContent.toString("UTF-8").contains("org.apache.tika.parser.iwork.IWorkPackageParser"));
    }

    /**
     * Tests -x option of the cli, with and without a digest
     *
     * @throws Exception if the CLI cannot be run
     */
    @Test
    public void testXMLOutput() throws Exception {
        String[] params = { "-x", resourcePrefix + "alice.cli.test" };
        runFramework(params);
        assertTrue(outContent.toString(UTF_8.name()).contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));

        params = new String[] { "-x", "--digest=SHA256", resourcePrefix + "alice.cli.test" };
        runFramework(params);
        assertTrue(outContent.toString(UTF_8.name())
                .contains("<meta name=\"X-TIKA:digest:SHA256\" content=\"e90779adbac09c4ee"));
    }

    /**
     * Tests a -h option of the cli, with and without a digest
     *
     * @throws Exception if the CLI cannot be run
     */
    @Test
    public void testHTMLOutput() throws Exception {
        String[] params = { "-h", resourcePrefix + "alice.cli.test" };
        runFramework(params);
        assertTrue(outContent.toString(UTF_8.name()).contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
        assertTrue("Expanded <title></title> element should be present",
                outContent.toString(UTF_8.name()).contains("<title></title>"));

        params = new String[] { "-h", "--digest=SHA384", resourcePrefix + "alice.cli.test" };
        runFramework(params);
        assertTrue(outContent.toString(UTF_8.name())
                .contains("<meta name=\"X-TIKA:digest:SHA384\" content=\"c69ea023f5da95a026"));
    }

    /**
     * Tests -t option of the cli
     *
     * @throws Exception if the CLI cannot be run
     */
    @Test
    public void testTextOutput() throws Exception {
        String[] params = { "-t", resourcePrefix + "alice.cli.test" };
        runFramework(params);
        assertTrue(outContent.toString(UTF_8.name()).contains("finished off the cake"));
    }

    /**
     * Tests -m option of the cli, with and without a digest
     *
     * @throws Exception if the CLI cannot be run
     */
    @Test
    public void testMetadataOutput() throws Exception {
        String[] params = { "-m", resourcePrefix + "alice.cli.test" };
        runFramework(params);
        assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));

        params = new String[] { "-m", "--digest=SHA512", resourcePrefix + "alice.cli.test" };
        runFramework(params);
        assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
        assertTrue(
                outContent.toString(UTF_8.name()).contains("X-TIKA:digest:SHA512: dd459d99bc19ff78fd31fbae46e0"));
    }

    /**
     * Basic tests for -json option
     *
     * @throws Exception if the CLI cannot be run
     */
    @Test
    public void testJsonMetadataOutput() throws Exception {
        String[] params = { "--json", "--digest=MD2", resourcePrefix + "testJsonMultipleInts.html" };
        runFramework(params);
        String json = outContent.toString(UTF_8.name());
        //TIKA-1310
        assertTrue(json.contains("\"fb:admins\":\"1,2,3,4\","));

        //test legacy alphabetic sort of keys
        int enc = json.indexOf("\"Content-Encoding\"");
        int fb = json.indexOf("fb:admins");
        int title = json.indexOf("\"title\"");
        assertTrue(enc > -1 && fb > -1 && enc < fb);
        assertTrue(fb > -1 && title > -1 && fb < title);
        assertTrue(json.contains("\"X-TIKA:digest:MD2\":"));
    }

    /**
     * Test for -json with prettyprint option
     *
     * @throws Exception if the CLI cannot be run
     */
    @Test
    public void testJsonMetadataPrettyPrintOutput() throws Exception {
        String[] params = { "--json", "-r", resourcePrefix + "testJsonMultipleInts.html" };
        runFramework(params);
        String json = outContent.toString(UTF_8.name());

        // Whitespace inside these literals is significant for the match.
        assertTrue(json.contains(" \"X-Parsed-By\": [\n"
                + " \"org.apache.tika.parser.DefaultParser\",\n"
                + " \"org.apache.tika.parser.html.HtmlParser\"\n"
                + " ],\n"));

        //test legacy alphabetic sort of keys
        int enc = json.indexOf("\"Content-Encoding\"");
        int fb = json.indexOf("fb:admins");
        int title = json.indexOf("\"title\"");
        assertTrue(enc > -1 && fb > -1 && enc < fb);
        assertTrue(fb > -1 && title > -1 && fb < title);
    }

    /**
     * Tests -l option of the cli
     *
     * @throws Exception if the CLI cannot be run
     */
    @Test
    public void testLanguageOutput() throws Exception {
        String[] params = { "-l", resourcePrefix + "alice.cli.test" };
        runFramework(params);
        assertTrue(outContent.toString(UTF_8.name()).contains("en"));
    }

    /**
     * Tests -d option of the cli
     *
     * @throws Exception if the CLI cannot be run
     */
    @Test
    public void testDetectOutput() throws Exception {
        String[] params = { "-d", resourcePrefix + "alice.cli.test" };
        runFramework(params);
        assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
    }

    /**
     * Tests --list-met-models option of the cli
     *
     * @throws Exception if the CLI cannot be run
     */
    @Test
    public void testListMetModels() throws Exception {
        String[] params = { "--list-met-models", resourcePrefix + "alice.cli.test" };
        runFramework(params);
        assertTrue(outContent.toString(UTF_8.name()).contains("text/plain"));
    }

    /**
     * Tests --list-supported-types option of the cli
     *
     * @throws Exception if the CLI cannot be run
     */
    @Test
    public void testListSupportedTypes() throws Exception {
        String[] params = { "--list-supported-types", resourcePrefix + "alice.cli.test" };
        runFramework(params);
        assertTrue(outContent.toString(UTF_8.name()).contains("supertype: application/octet-stream"));
    }

    /**
     * Tears down the test by restoring the System.out and System.err streams
     * that were replaced in {@link #setUp()}.
     *
     * @throws Exception declared for symmetry with {@link #setUp()}
     */
    @After
    public void tearDown() throws Exception {
        System.setOut(stdout);
        System.setErr(errout);
        //System.out.println("Output: " + outContent.toString(UTF_8.name()));
        //System.out.println("Error: " + errContent.toString(UTF_8.name()));
    }

    /**
     * Tests -z with --extract-dir: the embedded resources of coffee.xls must
     * be written as non-empty regular files into a temporary directory.
     *
     * @throws Exception if the CLI cannot be run or the directory cannot be handled
     */
    @Test
    public void testExtract() throws Exception {
        // Turn a unique temp file into a directory of the same name.
        // not really good method for production usage, but ok for tests
        // google guava library has better solution
        File tempFile = File.createTempFile("tika-test-", "");
        assertTrue("Could not delete temp file " + tempFile, tempFile.delete());
        assertTrue("Could not create temp directory " + tempFile, tempFile.mkdir());

        try {
            String[] params = { "--extract-dir=" + tempFile.getAbsolutePath(), "-z",
                    resourcePrefix + "coffee.xls" };
            runFramework(params);

            // Directory listing, used only to make assertion failures readable.
            StringBuilder allFiles = new StringBuilder();
            String[] extracted = tempFile.list();
            if (extracted != null) { // list() returns null on an I/O error
                for (String f : extracted) {
                    if (allFiles.length() > 0) {
                        allFiles.append(" : ");
                    }
                    allFiles.append(f);
                }
            }

            // ChemDraw file
            File expectedCDX = new File(tempFile, "MBD002B040A.cdx");
            // Image of the ChemDraw molecule
            File expectedIMG = new File(tempFile, "file4.png");
            // OLE10Native
            File expectedOLE10 = new File(tempFile, "MBD002B0FA6_file5.bin");
            // Something that really isnt a text file... Not sure what it is???
            File expected262FE3 = new File(tempFile, "MBD00262FE3.txt");
            // Image of one of the embedded resources
            File expectedEMF = new File(tempFile, "file0.emf");

            assertExtracted(expectedCDX, allFiles.toString());
            assertExtracted(expectedIMG, allFiles.toString());
            assertExtracted(expectedOLE10, allFiles.toString());
            assertExtracted(expected262FE3, allFiles.toString());
            assertExtracted(expectedEMF, allFiles.toString());
        } finally {
            FileUtils.deleteDirectory(tempFile);
        }
    }

    /**
     * Asserts that the given file exists, is not a directory and is not empty.
     *
     * @param f        the file expected to have been extracted
     * @param allFiles listing of the extraction directory, used in the failure message
     */
    protected static void assertExtracted(File f, String allFiles) {
        assertTrue("File " + f.getName() + " not found in " + allFiles, f.exists());
        assertFalse("File " + f.getName() + " is a directory!", f.isDirectory());
        assertTrue("File " + f.getName() + " wasn't extracted with contents", f.length() > 0);
    }

    // TIKA-920
    @Test
    public void testMultiValuedMetadata() throws Exception {
        String[] params = { "-m", resourcePrefix + "testMultipleSheets.numbers" };
        runFramework(params);
        String content = outContent.toString(UTF_8.name());
        assertTrue(content.contains("sheetNames: Checking"));
        assertTrue(content.contains("sheetNames: Secon sheet"));
        assertTrue(content.contains("sheetNames: Logical Sheet 3"));
        assertTrue(content.contains("sheetNames: Sheet 4"));
    }

    // TIKA-1031
    @Test
    public void testZipWithSubdirs() throws Exception {
        String[] params = { "-z", "--extract-dir=target", resourcePrefix + "testWithSubdirs.zip" };
        // NOTE(review): these paths are relative to the test JVM, while the forked
        // process runs with working directory "target" and a relative extract dir;
        // confirm where the files actually end up.
        new File("subdir/foo.txt").delete();
        new File("subdir").delete();
        runFramework(params);
        String content = outContent.toString(UTF_8.name());
        assertTrue(content.contains("Extracting 'subdir/foo.txt'"));
        // clean up. TODO: These should be in target.
        new File("target/subdir/foo.txt").delete();
        new File("target/subdir").delete();
    }

    @Test
    public void testDefaultConfigException() throws Exception {
        //default xml parser will throw TikaException
        //this and TestConfig() are broken into separate tests so that
        //setUp and tearDown() are called each time
        String[] params = { resourcePrefix + "bad_xml.xml" };
        runFramework(params);
        assertTrue(errContent.toString(UTF_8.name()).contains("TikaException"));
    }

    @Test
    public void testConfig() throws Exception {
        String[] params = new String[] {
                "--config=" + testInputDir.toAbsolutePath().toString() + "/tika-config1.xml",
                resourcePrefix + "bad_xml.xml" };
        runFramework(params);
        String content = outContent.toString(UTF_8.name());
        assertTrue(content.contains("apple"));
        assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser"));
    }

    @Test
    public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception {
        String[] params = new String[] { "-m", "-J", "-r", resourcePrefix + "test_recursive_embedded.docx" };
        runFramework(params);
        String content = outContent.toString(UTF_8.name());
        // Whitespace inside these literals is significant for the match.
        assertTrue(content.contains("[\n"
                + " {\n"
                + " \"Application-Name\": \"Microsoft Office Word\",\n"
                + " \"Application-Version\": \"15.0000\",\n"
                + " \"Character Count\": \"28\",\n"
                + " \"Character-Count-With-Spaces\": \"31\","));
        assertTrue(content.contains("\"X-TIKA:embedded_resource_path\": \"/embed1.zip\""));
        assertFalse(content.contains("X-TIKA:content"));
    }

    /**
     * Runs the CLI with the given arguments and no extra JVM options.
     *
     * @param params command line arguments passed to {@code org.apache.tika.main.Main}
     * @throws Exception if the process cannot be started or its output cannot be read
     * @see #runFramework(String[], String[])
     */
    public void runFramework(String[] params) throws Exception {
        runFramework(params, null);
    }

    /**
     * Forks {@code java -cp <project.bundle.file> [systemProps] org.apache.tika.main.Main params...},
     * waits for it to finish and appends its standard output to {@code outContent}
     * and its standard error to {@code errContent}.
     * <p>
     * The buffers are only created in {@link #setUp()}, so when a test calls this
     * method more than once the buffers contain the output of all invocations.
     *
     * @param params      command line arguments passed to the CLI main class
     * @param systemProps JVM options (e.g. {@code -Dkey=value}) placed before the
     *                    main class, or {@code null} for none
     * @throws IllegalStateException if the {@code project.bundle.file} system property is not set
     * @throws Exception             if the process cannot be started, reading its
     *                               output fails or the wait is interrupted
     */
    public void runFramework(String[] params, String[] systemProps) throws Exception {
        String bundleFile = System.getProperty("project.bundle.file");
        if (bundleFile == null) {
            // A null element would otherwise surface as a bare NullPointerException
            // from ProcessBuilder.start().
            throw new IllegalStateException("System property 'project.bundle.file' is not set");
        }

        List<String> forkCommand = new ArrayList<String>();
        forkCommand.add("java");
        forkCommand.add("-cp");
        forkCommand.add(bundleFile);
        if (systemProps != null) {
            forkCommand.addAll(Arrays.asList(systemProps));
        }
        forkCommand.add("org.apache.tika.main.Main");
        forkCommand.addAll(Arrays.asList(params));
        builder.command(forkCommand);

        final Process process = builder.start();
        final ByteArrayOutputStream errSink = errContent;
        final IOException[] stderrFailure = new IOException[1];

        // Drain stderr concurrently with stdout: reading them one after the other
        // can deadlock once the child fills the pipe that is not being read.
        Thread stderrPump = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    IOUtils.copy(process.getErrorStream(), errSink);
                } catch (IOException e) {
                    stderrFailure[0] = e;
                }
            }
        }, "tika-cli-stderr");
        stderrPump.setDaemon(true);
        stderrPump.start();

        try {
            IOUtils.copy(process.getInputStream(), outContent);
            process.waitFor();
            stderrPump.join();
        } finally {
            // Releases the child's resources; also stops it if we bailed out early.
            process.destroy();
        }

        if (stderrFailure[0] != null) {
            throw stderrFailure[0];
        }
    }

    @Test
    public void testJsonRecursiveMetadataParserDefault() throws Exception {
        String[] params = new String[] { "-J", "-r", resourcePrefix + "test_recursive_embedded.docx" };
        runFramework(params);
        String content = outContent.toString(UTF_8.name());
        assertTrue(
                content.contains("\"X-TIKA:content\": \"\\u003chtml xmlns\\u003d\\\"http://www.w3.org/1999/xhtml"));
    }

    @Test
    public void testJsonRecursiveMetadataParserText() throws Exception {
        String[] params = new String[] { "-J", "-r", "-t", resourcePrefix + "test_recursive_embedded.docx" };
        runFramework(params);
        String content = outContent.toString(UTF_8.name());
        assertTrue(content.contains("\\n\\nembed_4\\n"));
        assertTrue(content.contains("\\n\\nembed_0"));
    }

    @Test
    public void testDigestInJson() throws Exception {
        String[] params = new String[] { "-J", "-r", "-t", "--digest=MD5",
                resourcePrefix + "test_recursive_embedded.docx" };
        runFramework(params);
        String content = outContent.toString(UTF_8.name());
        assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"59f626e09a8c16ab6dbc2800c685f772\","));
        assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"f9627095ef86c482e61d99f0cc1cf87d\""));
    }

    @Test
    public void testConfigSerializationStaticAndCurrent() throws Exception {
        String[] params = new String[] { "--dump-static-config" };
        runFramework(params);
        String content = outContent.toString(UTF_8.name());
        //make sure at least one detector is there
        assertTrue(
                content.contains("<detector class=\"org.apache.tika.parser.microsoft.POIFSContainerDetector\"/>"));
        //make sure Executable is there because follow on tests of custom config
        //test that it has been turned off.
        assertTrue(content.contains("<parser class=\"org.apache.tika.parser.executable.ExecutableParser\"/>"));

        params = new String[] { "--dump-current-config" };
        runFramework(params);
        // Note: outContent still holds the output of the first run as well.
        content = outContent.toString(UTF_8.name());
        //make sure at least one detector is there
        assertTrue(
                content.contains("<detector class=\"org.apache.tika.parser.microsoft.POIFSContainerDetector\"/>"));
        //and at least one parser
        assertTrue(content.contains("<parser class=\"org.apache.tika.parser.executable.ExecutableParser\"/>"));
    }

    @Test
    public void testConfigSerializationCustomMinimal() throws Exception {
        String[] params = new String[] {
                "--config=" + testInputDir.toAbsolutePath().toString() + "/tika-config2.xml",
                "--dump-minimal-config" };
        runFramework(params);
        // Collapse every run of whitespace so the comparison ignores formatting.
        String content = outContent.toString(UTF_8.name()).replaceAll("[\r\n\t ]+", " ");

        String expected = "<parser class=\"org.apache.tika.parser.DefaultParser\">"
                + " <mime-exclude>application/pdf</mime-exclude>"
                + " <mime-exclude>image/jpeg</mime-exclude> "
                + "</parser> "
                + "<parser class=\"org.apache.tika.parser.EmptyParser\">"
                + " <mime>application/pdf</mime> "
                + "</parser>";
        assertTrue(content.contains(expected));
    }

    @Test
    public void testPlugins() throws Exception {
        String[] params = { "--list-parser" };
        String[] systemProps = { "-Dorg.osgi.framework.storage.clean=onFirstInit",
                "-D" + Main.PLUGIN_DEPLOY_DIR_PROP + "=" + testInputDir.toAbsolutePath().toString() + "/plugins" };
        runFramework(params, systemProps);
        assertTrue(outContent.toString(UTF_8.name()).contains("DummyParser"));
    }

    @Test
    public void testConfigSerializationCustomStatic() throws Exception {
        String[] params = new String[] {
                "--config=" + testInputDir.toAbsolutePath().toString() + "/tika-config2.xml",
                "--dump-static-config" };
        runFramework(params);
        String content = outContent.toString(UTF_8.name());
        assertFalse(content.contains("org.apache.tika.parser.executable.Executable"));
    }
}