org.apache.tika.parser.DigestingParserTest.java — source code

Java tutorial

Introduction

Here is the source code for org.apache.tika.parser.DigestingParserTest.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.utils.CommonsDigester;
import org.junit.Test;

public class DigestingParserTest extends TikaTest {

    private final static String P = TikaCoreProperties.TIKA_META_PREFIX + "digest"
            + Metadata.NAMESPACE_PREFIX_DELIMITER;

    private final static int UNLIMITED = 1000000;//well, not really, but longer than input file

    private final static long SEED = new Random().nextLong();

    private final Random random = new Random(SEED);
    private final Parser p = new AutoDetectParser();

    @Test
    public void testBasic() throws Exception {
        Map<CommonsDigester.DigestAlgorithm, String> expected = new HashMap<>();

        expected.put(CommonsDigester.DigestAlgorithm.MD2, "d768c8e27b0b52c6eaabfaa7122d1d4f");
        expected.put(CommonsDigester.DigestAlgorithm.MD5, "59f626e09a8c16ab6dbc2800c685f772");
        expected.put(CommonsDigester.DigestAlgorithm.SHA1, "7a1f001d163ac90d8ea54c050faf5a38079788a6");
        expected.put(CommonsDigester.DigestAlgorithm.SHA256,
                "c4b7fab030a8b6a9d6691f6699ac8e6f" + "82bc53764a0f1430d134ae3b70c32654");
        expected.put(CommonsDigester.DigestAlgorithm.SHA384, "ebe368b9326fef44408290724d187553"
                + "8b8a6923fdf251ddab72c6e4b5d54160" + "9db917ba4260d1767995a844d8d654df");
        expected.put(CommonsDigester.DigestAlgorithm.SHA512,
                "ee46d973ee1852c018580c242955974d" + "da4c21f36b54d7acd06fcf68e974663b"
                        + "fed1d256875be58d22beacf178154cc3" + "a1178cb73443deaa53aa0840324708bb");

        //test each one
        for (CommonsDigester.DigestAlgorithm algo : CommonsDigester.DigestAlgorithm.values()) {
            Metadata m = new Metadata();
            XMLResult xml = getXML("test_recursive_embedded.docx",
                    new DigestingParser(p, new CommonsDigester(UNLIMITED, algo)), m);
            assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
        }

        //test comma separated
        CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse("md5,sha256,sha384,sha512");
        Metadata m = new Metadata();
        XMLResult xml = getXML("test_recursive_embedded.docx",
                new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)), m);
        for (CommonsDigester.DigestAlgorithm algo : new CommonsDigester.DigestAlgorithm[] {
                CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256,
                CommonsDigester.DigestAlgorithm.SHA384, CommonsDigester.DigestAlgorithm.SHA512 }) {
            assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
        }

        assertNull(m.get(P + CommonsDigester.DigestAlgorithm.MD2.toString()));
        assertNull(m.get(P + CommonsDigester.DigestAlgorithm.SHA1.toString()));

    }

    @Test
    public void testReset() throws Exception {
        String expectedMD5 = "59f626e09a8c16ab6dbc2800c685f772";
        Metadata m = new Metadata();
        XMLResult xml = getXML("test_recursive_embedded.docx",
                new DigestingParser(p, new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5)), m);
        assertEquals(expectedMD5, m.get(P + "MD5"));
    }

    @Test
    public void testNegativeMaxMarkLength() throws Exception {
        Metadata m = new Metadata();
        boolean ex = false;
        try {
            XMLResult xml = getXML("test_recursive_embedded.docx",
                    new DigestingParser(p, new CommonsDigester(-1, CommonsDigester.DigestAlgorithm.MD5)), m);
        } catch (IllegalArgumentException e) {
            ex = true;
        }
        assertTrue("Exception not thrown", ex);
    }

    @Test
    public void testMultipleCombinations() throws Exception {
        Path tmp = Files.createTempFile("tika-digesting-parser-test", "");

        try {
            //try some random lengths
            for (int i = 0; i < 10; i++) {
                testMulti(tmp, random.nextInt(100000), random.nextInt(100000), random.nextBoolean());
            }
            //try specific lengths
            testMulti(tmp, 1000, 100000, true);
            testMulti(tmp, 1000, 100000, false);
            testMulti(tmp, 10000, 10001, true);
            testMulti(tmp, 10000, 10001, false);
            testMulti(tmp, 10000, 10000, true);
            testMulti(tmp, 10000, 10000, false);
            testMulti(tmp, 10000, 9999, true);
            testMulti(tmp, 10000, 9999, false);

            testMulti(tmp, 1000, 100, true);
            testMulti(tmp, 1000, 100, false);
            testMulti(tmp, 1000, 10, true);
            testMulti(tmp, 1000, 10, false);
            testMulti(tmp, 1000, 0, true);
            testMulti(tmp, 1000, 0, false);

            testMulti(tmp, 0, 100, true);
            testMulti(tmp, 0, 100, false);

        } finally {
            Files.delete(tmp);
        }
    }

    private void testMulti(Path tmp, int fileLength, int markLimit, boolean useTikaInputStream) throws IOException {

        OutputStream os = new BufferedOutputStream(Files.newOutputStream(tmp, StandardOpenOption.CREATE));

        for (int i = 0; i < fileLength; i++) {
            os.write(random.nextInt());
        }
        os.flush();
        os.close();

        Metadata truth = new Metadata();
        addTruth(tmp, CommonsDigester.DigestAlgorithm.MD5, truth);
        addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA1, truth);
        addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA512, truth);

        checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream, CommonsDigester.DigestAlgorithm.SHA512,
                CommonsDigester.DigestAlgorithm.SHA1, CommonsDigester.DigestAlgorithm.MD5);

        checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream, CommonsDigester.DigestAlgorithm.MD5,
                CommonsDigester.DigestAlgorithm.SHA1);

        checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream, CommonsDigester.DigestAlgorithm.SHA1,
                CommonsDigester.DigestAlgorithm.SHA512, CommonsDigester.DigestAlgorithm.MD5);

        checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream, CommonsDigester.DigestAlgorithm.SHA1);

        checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream, CommonsDigester.DigestAlgorithm.MD5);

    }

    private void checkMulti(Metadata truth, Path tmp, int fileLength, int markLimit, boolean useTikaInputStream,
            CommonsDigester.DigestAlgorithm... algos) throws IOException {
        Metadata result = new Metadata();
        CommonsDigester digester = new CommonsDigester(markLimit, algos);
        try (InputStream is = useTikaInputStream ? TikaInputStream.get(tmp)
                : new BufferedInputStream(Files.newInputStream(tmp))) {
            digester.digest(is, result, new ParseContext());
        }

        for (CommonsDigester.DigestAlgorithm algo : algos) {
            String truthValue = truth.get(P + algo.name());
            String resultValue = result.get(P + algo.name());
            assertNotNull("truth", truthValue);
            assertNotNull("result", resultValue);

            assertEquals(
                    "fileLength(" + fileLength + ") markLimit(" + markLimit + ") useTikaInputStream("
                            + useTikaInputStream + ")" + "algorithm(" + algo.name() + ") seed(" + SEED + ")",
                    truthValue, resultValue);
        }

    }

    private void addTruth(Path tmp, CommonsDigester.DigestAlgorithm algo, Metadata truth) throws IOException {
        String digest = null;
        try (InputStream is = Files.newInputStream(tmp)) {
            switch (algo) {
            case MD2:
                digest = DigestUtils.md2Hex(is);
                break;
            case MD5:
                digest = DigestUtils.md5Hex(is);
                break;
            case SHA1:
                digest = DigestUtils.sha1Hex(is);
                break;
            case SHA256:
                digest = DigestUtils.sha256Hex(is);
                break;
            case SHA384:
                digest = DigestUtils.sha384Hex(is);
                break;
            case SHA512:
                digest = DigestUtils.sha512Hex(is);
                break;
            default:
                throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algo.toString());
            }
        }
        truth.set(P + algo.name(), digest);

    }

}