org.apache.tika.eval.ComparerBatchTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tika.eval.ComparerBatchTest.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.eval;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.tika.batch.fs.FSBatchTestBase;
import org.apache.tika.eval.db.Cols;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;

@Ignore("move these tests to TikaEvalCLITest")
public class ComparerBatchTest extends FSBatchTestBase {

    // Fully qualified name of the batch CLI class. Within this file it is referenced
    // only from the commented-out executor code in setUp().
    public final static String COMPARER_PROCESS_CLASS = "org.apache.tika.batch.fs.FSBatchProcessCLI";

    // Temporary directory that holds the test database; created in setUp() and
    // deleted in tearDown().
    private static Path dbDir;
    // Connection shared by all query helpers in this class. Its only assignment lives
    // in the commented-out block at the end of setUp(), so it is currently never set.
    private static Connection conn;

    // Table/join expression used by the commented-out tests further down in this file.
    // It is currently the empty string; the original join expression is kept below
    // for reference.
    private final static String compJoinCont = "";
    /*ExtractComparer.COMPARISONS_TABLE+" cmp " +
        "join "+ExtractComparer.CONTAINERS_TABLE + " cnt "+
        "on cmp."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID+
        " = cnt."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID;*/

    /**
     * Creates a fresh temporary database directory underneath the {@code /test-dirs}
     * test resource and assembles the {@code -db} argument for the batch comparison run.
     * <p>
     * The batch execution and the opening of the shared connection are commented out
     * at the end of this method, so at present only the directory is created.
     *
     * @throws Exception if the resource URL cannot be converted or the directory
     *                   cannot be created
     */
    @BeforeClass
    public static void setUp() throws Exception {
        Path testDirs = new File(ComparerBatchTest.class.getResource("/test-dirs").toURI()).toPath();
        dbDir = Files.createTempDirectory(testDirs, "tika-test-db-dir-");

        // location of the database inside the temp directory
        Path dbPath = dbDir.resolve("comparisons_test");
        Map<String, String> batchArgs = new HashMap<>();
        batchArgs.put("-db", dbPath.toString());

        //for debugging, you can use this to select only one file pair to load
        //batchArgs.put("-includeFilePat", "file8.*");
        /*
        BatchProcessTestExecutor ex = new BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, batchArgs,
                "/tika-batch-comparison-eval-config.xml");
        StreamStrings streamStrings = ex.execute();
        System.out.println(streamStrings.getErrString());
        System.out.println(streamStrings.getOutString());
        H2Util dbUtil = new H2Util(dbPath);
        conn = dbUtil.getConnection();*/
    }

    /**
     * Releases the class-level resources: closes the shared connection if one was
     * opened, then removes the temporary database directory.
     * <p>
     * Both fields are null-checked: the connection may never have been assigned
     * (its initialisation in {@code setUp()} is commented out), and JUnit still
     * invokes {@code @AfterClass} methods when {@code @BeforeClass} fails before the
     * directory was created. The directory is deleted in a {@code finally} block so
     * that a failing {@code close()} cannot leave it behind in the test resources.
     *
     * @throws Exception if closing the connection or deleting the directory fails
     */
    @AfterClass
    public static void tearDown() throws Exception {
        try {
            if (conn != null) {
                conn.close();
            }
        } finally {
            if (dbDir != null) {
                FileUtils.deleteDirectory(dbDir.toFile());
            }
        }
    }

    /**
     * Smoke test for the written database: checks the number of file names in the
     * profiles A table and that the comparison containers table holds ten
     * distinct container ids.
     */
    @Test
    public void testSimpleDBWriteAndRead() throws Exception {
        //filenames
        List<String> fileNames = getColStrings(Cols.FILE_NAME.name(),
                ExtractComparer.PROFILES_A.getName(), "");
        assertEquals(7, fileNames.size());
        assertTrue(fileNames.contains("file1.pdf"));

        //container ids in comparisons table
        List<String> containerIds = getColStrings(Cols.CONTAINER_ID.name(),
                ExtractComparer.COMPARISON_CONTAINERS.getName(), "");
        assertEquals(10, containerIds.size());
        //collapsing into a set must not lose entries, i.e. every id is unique
        Set<String> distinctContainerIds = new HashSet<>(containerIds);
        assertEquals(10, distinctContainerIds.size());
        /*
        //ids in comparisons table
        List<String> ids = getColStrings(AbstractProfiler.HEADERS.ID.name(),
                compTable, "");
        assertEquals(9, ids.size());
        assertEquals(9, new HashSet<>(ids).size());*/
    }

    /*
    @Test
    public void testFile1PDFRow() throws Exception {
        String where = fp+"='file1.pdf'";
        Map<String, String> data = getRow(compJoinCont, where);
        String result = data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS + "_A");
        assertTrue(result.startsWith("over: 1"));
        
        result = data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS + "_B");
        assertTrue(result.startsWith("aardvark: 3 | bear: 2"));
        
        
        assertEquals("aardvark: 3 | bear: 2",
                data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_B.toString()));
        assertEquals("fox: 2 | lazy: 1 | over: 1",
                data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_A.toString()));
        assertEquals("12", data.get(ExtractComparer.HEADERS.NUM_TOKENS+"_A"));
        assertEquals("13", data.get(ExtractComparer.HEADERS.NUM_TOKENS+"_B"));
        assertEquals("8", data.get(ExtractComparer.HEADERS.NUM_UNIQUE_TOKENS+"_A"));
        assertEquals("9", data.get(ExtractComparer.HEADERS.NUM_UNIQUE_TOKENS+"_B"));
        
        assertEquals(ExtractComparer.COMPARISON_HEADERS.OVERLAP.name(),
                0.64f, Float.parseFloat(data.get("OVERLAP")), 0.0001f);
        
        assertEquals(ExtractComparer.COMPARISON_HEADERS.DICE_COEFFICIENT.name(),
                0.8235294f, Float.parseFloat(data.get("DICE_COEFFICIENT")), 0.0001f);
        
        assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_A", 3.83333d,
                Double.parseDouble(
                        data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_A")), 0.0001d);
        
        assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_B", 4.923d,
                Double.parseDouble(
                        data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_B")), 0.0001d);
        
        assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_A", 1.0298d,
                Double.parseDouble(
                        data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_A")), 0.0001d);
        
        assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_B", 1.9774d,
                Double.parseDouble(data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_B")), 0.0001d);
        
        assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_A", 46,
                Integer.parseInt(
                        data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_A")));
        
        assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_B", 64,
                Integer.parseInt(data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_B")));
        
        assertEquals("TOKEN_ENTROPY_RATE_A", 0.237949,
                Double.parseDouble(data.get("TOKEN_ENTROPY_RATE_A")), 0.0001d);
        
        assertEquals("TOKEN_ENTROPY_RATE_B", 0.232845,
                Double.parseDouble(data.get("TOKEN_ENTROPY_RATE_B")), 0.0001d);
        
    }
        
        
    @Test
    public void testEmpty() throws Exception {
        String where = fp+"='file4_emptyB.pdf'";
        Map<String, String> data = getRow(contTable, where);
        assertNull(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX +
                ExtractComparer.aExtension));
        assertTrue(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX +
                ExtractComparer.bExtension).equals(AbstractProfiler.JSON_PARSE_EXCEPTION));
        
        where = fp+"='file5_emptyA.pdf'";
        data = getRow(contTable, where);
        assertNull(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX +
                ExtractComparer.bExtension));
        assertTrue(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX+
                ExtractComparer.aExtension).equals(AbstractProfiler.JSON_PARSE_EXCEPTION));
    }
        
        @Test
        public void testMissingAttachment() throws Exception {
            String where = fp+"='file2_attachANotB.doc' and "+AbstractProfiler.HEADERS.EMBEDDED_FILE_PATH+
                    "='inner.txt'";
            Map<String, String> data = getRow(compJoinCont, where);
            assertContains("attachment: 1", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_A.name()));
            assertNotContained("fox", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_B.name()));
            assertNull(data.get(ExtractComparer.HEADERS.TOP_N_TOKENS +
                    ExtractComparer.bExtension));
            assertNotContained("fox", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS +
                    ExtractComparer.bExtension));
        
            assertEquals("3", data.get("NUM_METADATA_VALUES_A"));
            assertNull(data.get("DIFF_NUM_ATTACHMENTS"));
            assertNull(data.get("NUM_METADATA_VALUES_B"));
            assertEquals("0", data.get("NUM_UNIQUE_TOKENS_B"));
            assertNull(data.get("TOKEN_ENTROPY_RATE_B"));
            assertNull(data.get("NUM_EN_STOPS_TOP_N_B"));
        
            where = fp+"='file3_attachBNotA.doc' and "+AbstractProfiler.HEADERS.EMBEDDED_FILE_PATH+
                    "='inner.txt'";
            data = getRow(compJoinCont, where);
            assertContains("attachment: 1", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_B.name()));
            assertNotContained("fox", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_A.name()));
            assertNull(data.get(ExtractComparer.HEADERS.TOP_N_TOKENS +
                    ExtractComparer.aExtension));
            assertNotContained("fox", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS +
                    ExtractComparer.aExtension));
        
            assertEquals("3", data.get("NUM_METADATA_VALUES_B"));
            assertNull(data.get("DIFF_NUM_ATTACHMENTS"));
            assertNull(data.get("NUM_METADATA_VALUES_A"));
            assertEquals("0", data.get("NUM_UNIQUE_TOKENS_A"));
            assertNull(data.get("TOKEN_ENTROPY_RATE_A"));
            assertNull(data.get("NUM_EN_STOPS_TOP_N_A"));
        
        }
        
        @Test
        public void testBothBadJson() throws Exception {
            debugDumpAll(contTable);
            String where = fp+"='file7_badJson.pdf'";
            Map<String, String> data = getRow(contTable, where);
            assertEquals(AbstractProfiler.JSON_PARSE_EXCEPTION,
                    data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX+ ExtractComparer.aExtension));
            assertEquals(AbstractProfiler.JSON_PARSE_EXCEPTION,
                    data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX+ ExtractComparer.bExtension));
            assertEquals("file7_badJson.pdf",
                    data.get(AbstractProfiler.CONTAINER_HEADERS.FILE_PATH.name()));
            assertEquals("61", data.get("JSON_FILE_LENGTH_A"));
            assertEquals("0", data.get("JSON_FILE_LENGTH_B"));
            assertEquals("pdf", data.get(AbstractProfiler.CONTAINER_HEADERS.FILE_EXTENSION.name()));
        
        }
        
        @Test
        public void testAccessPermissionException() throws Exception {
            String sql = "select "+
                    AbstractProfiler.EXCEPTION_HEADERS.ACCESS_PERMISSION_EXCEPTION.name() +
                    " from " + AbstractProfiler.EXCEPTIONS_TABLE+"_A exA "+
                    " join " + ExtractComparer.COMPARISONS_TABLE + " cmp on cmp.ID=exA.ID "+
                    " join " + ExtractComparer.CONTAINERS_TABLE + " cont on cmp.CONTAINER_ID=cont.CONTAINER_ID "+
                    " where "+fp+"='file6_accessEx.pdf'";
            Statement st = conn.createStatement();
            ResultSet rs = st.executeQuery(sql);
            List<String> results = new ArrayList<String>();
            while (rs.next()) {
                results.add(rs.getString(1));
            }
            assertEquals(1, results.size());
            assertEquals("TRUE", results.get(0));
        
            sql = "select "+
                    AbstractProfiler.EXCEPTION_HEADERS.ACCESS_PERMISSION_EXCEPTION.name() +
                    " from " + AbstractProfiler.EXCEPTIONS_TABLE+"_B exB "+
                    " join " + ExtractComparer.COMPARISONS_TABLE + " cmp on cmp.ID=exB.ID "+
                    " join " + ExtractComparer.CONTAINERS_TABLE + " cont on cmp.CONTAINER_ID=cont.CONTAINER_ID "+
                    " where "+fp+"='file6_accessEx.pdf'";
            st = conn.createStatement();
            rs = st.executeQuery(sql);
            results = new ArrayList<String>();
            while (rs.next()) {
                results.add(rs.getString(1));
            }
            assertEquals(1, results.size());
            assertEquals("TRUE", results.get(0));
        
        }
        
        @Test
        public void testContainerException() throws Exception {
            String sql = "select * "+
                    " from " + AbstractProfiler.EXCEPTIONS_TABLE+"_A exA "+
                    " join " + ExtractComparer.COMPARISONS_TABLE + " cmp on cmp.ID=exA.ID "+
                    " join " + ExtractComparer.CONTAINERS_TABLE + " cont on cmp.CONTAINER_ID=cont.CONTAINER_ID "+
                    "where "+fp+"='file8_IOEx.pdf'";
            Statement st = conn.createStatement();
            ResultSet rs = st.executeQuery(sql);
        
            Map<String, String> data = new HashMap<String,String>();
            ResultSetMetaData rsM = rs.getMetaData();
            while (rs.next()) {
                for (int i = 1; i <= rsM.getColumnCount(); i++)
                data.put(rsM.getColumnName(i), rs.getString(i));
            }
        
            String sortStack = data.get(AbstractProfiler.EXCEPTION_HEADERS.SORT_STACK_TRACE.name());
            sortStack = sortStack.replaceAll("[\r\n]", "<N>");
            assertTrue(sortStack.startsWith("java.lang.RuntimeException<N>"));
        
            String fullStack = data.get(AbstractProfiler.EXCEPTION_HEADERS.ORIG_STACK_TRACE.name());
            assertTrue(
                    fullStack.startsWith("java.lang.RuntimeException: java.io.IOException: Value is not an integer"));
        }
        
    private void debugDumpAll(String table) throws Exception {
        Statement st = conn.createStatement();
        String sql = "select * from "+table;
        ResultSet rs = st.executeQuery(sql);
        ResultSetMetaData m = rs.getMetaData();
        for (int i = 1; i <= m.getColumnCount(); i++) {
            System.out.print(m.getColumnName(i) + ", ");
        }
        System.out.println("\n");
        while (rs.next()) {
            for (int i = 1; i <= m.getColumnCount(); i++) {
                System.out.print(rs.getString(i)+", ");
            }
            System.out.println("\n");
        }
        st.close();
        
    }
    */
    /**
     * Debugging aid: prints the 1-based index and name of every column of
     * {@code table} to standard out, one column per line.
     *
     * @param table name of the table to inspect; it is concatenated verbatim into
     *              the query, so pass only trusted, test-defined names
     * @throws Exception if the statement cannot be created or the query fails
     */
    private void debugShowColumns(String table) throws Exception {
        String sql = "select * from " + table;
        // try-with-resources closes the result set and the statement even when the
        // query or the metadata lookup throws
        try (Statement st = conn.createStatement();
                ResultSet rs = st.executeQuery(sql)) {
            ResultSetMetaData m = rs.getMetaData();
            for (int i = 1; i <= m.getColumnCount(); i++) {
                System.out.println(i + " : " + m.getColumnName(i));
            }
        }
    }

    /**
     * Returns the string value of exactly one cell.
     *
     * @param colName column to select
     * @param table   table to select from
     * @param where   where clause without the {@code where} keyword
     * @return the single value produced by the query
     * @throws RuntimeException if the query yields no value or more than one value
     */
    private String getString(String colName, String table, String where) throws Exception {
        List<String> cells = getColStrings(colName, table, where);
        if (cells.isEmpty()) {
            throw new RuntimeException("no results");
        }
        if (cells.size() > 1) {
            throw new RuntimeException("more than one result");
        }
        return cells.get(0);
    }

    /**
     * Selects all columns of the single row of {@code table} that matches
     * {@code where} and returns them keyed by column name.
     *
     * @param table table (or join expression) to select from
     * @param where where clause without the {@code where} keyword; {@code null} or
     *              empty selects without a filter
     * @return column name to string value for the matching row; empty if no row matches
     * @throws RuntimeException if the query returns more than one row
     * @throws Exception        if the statement cannot be created or the query fails
     */
    private Map<String, String> getRow(String table, String where) throws Exception {
        String sql = getSql("*", table, where);
        Map<String, String> row = new HashMap<>();

        // try-with-resources closes the result set and the statement in reverse order
        // and keeps a failure from close() from masking the original exception
        try (Statement st = conn.createStatement();
                ResultSet rs = st.executeQuery(sql)) {
            ResultSetMetaData m = rs.getMetaData();
            boolean rowSeen = false;
            while (rs.next()) {
                if (rowSeen) {
                    throw new RuntimeException("returned more than one row!");
                }
                int columnCount = m.getColumnCount();
                for (int i = 1; i <= columnCount; i++) {
                    row.put(m.getColumnName(i), rs.getString(i));
                }
                rowSeen = true;
            }
        }
        return row;
    }

    /**
     * Returns the string representations of all values of one column of the
     * content comparisons table, without any where clause.
     *
     * @param colName column to select
     * @return the column values as a list of strings, in result set order
     */
    private List<String> getColStrings(String colName) throws Exception {
        String contentComparisonsTable = ExtractComparer.CONTENT_COMPARISONS.getName();
        return getColStrings(colName, contentComparisonsTable, null);
    }

    /**
     * Runs {@code select colName from table [where ...]} and collects the first
     * column of every returned row as a string. The generated SQL is echoed to
     * standard out before it is executed.
     *
     * @param colName column (or column expression) to select
     * @param table   table to select from
     * @param where   where clause without the {@code where} keyword; {@code null} or
     *                empty selects without a filter
     * @return the selected values in result set order; empty if no row matches
     * @throws Exception if the statement cannot be created or the query fails
     */
    private List<String> getColStrings(String colName, String table, String where) throws Exception {
        String sql = getSql(colName, table, where);
        List<String> results = new ArrayList<>();
        // try-with-resources closes the statement and the result set even on failure
        // and keeps a failure from close() from masking the original exception
        try (Statement st = conn.createStatement()) {
            System.out.println("SQL: " + sql);
            try (ResultSet rs = st.executeQuery(sql)) {
                while (rs.next()) {
                    results.add(rs.getString(1));
                }
            }
        }
        return results;
    }

    /**
     * Builds a simple select statement by plain string concatenation.
     *
     * @param colName column list to select
     * @param table   table to select from
     * @param where   where clause without the {@code where} keyword; when {@code null}
     *                or empty no where clause is appended
     * @return {@code select <colName> from <table>}, optionally followed by
     *         {@code  where <where>}
     */
    private String getSql(String colName, String table, String where) {
        String query = "select " + colName + " from " + table;
        if (where == null || where.isEmpty()) {
            return query;
        }
        return query + " where " + where;
    }

}