it.unimi.di.big.mg4j.document.DocumentCollectionTest.java Source code

Introduction

Here is the source code for it.unimi.di.big.mg4j.document.DocumentCollectionTest.java, a JUnit test that exercises MG4J's document collection and document sequence implementations (file-set, zipped, simple compressed, concatenated, input-stream and subset).

Source

package it.unimi.di.big.mg4j.document;

/*       
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2005-2015 Paolo Boldi 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.fastutil.Swapper;
import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongIterators;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.util.Properties;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.util.StringTokenizer;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.io.FileUtils;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

public class DocumentCollectionTest {

    /* We consider documents abstractly described by two fields each. 
     * 
     * WARNING: the first string MUST be a prefix of the second string. */
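    /* (The prefix requirement comes from getHTMLDocument() below, which emits the first field
     * as the title and appends only the remainder of the second field as the body.) */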
    private final static String[][] document = new String[][] {
            //              0   1   2   3      0   1   2   3   4   5   6   7   8   9   10  11  12  13  14
            new String[] { "xxx yyy zzz xxx", "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" },
            new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" },
            new String[] { "aaa uuu aaa", "aaa uuu aaa xxx xxx xxx aaa xxx" },
            // This tests that zipped collections handle initial spaces properly and
            // that word readers are propagated correctly.
            new String[] { " aaa uuu aaa", " aaa uuu aaa _ __ xxx _ xxx xxx aaa xxx" }, };

    private final static String[][] document2 = new String[][] {
            //              0   1   2   3      0   1   2   3   4   5   6   7   8   9   10  11  12  13  14
            new String[] { "xxx yyy zzz xxx", "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" },
            new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" },
            new String[] { "aaa uuu aaa", "aaa uuu aaa xxx xxx xxx aaa xxx" },
            // This tests that zipped collections handle initial spaces properly and
            // that word readers are propagated correctly.
            new String[] { " aaa uuu aaa", " aaa uuu aaa _ __ xxx _ xxx xxx aaa xxx" },
            new String[] { "xxx yyy zzz xxx", "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" },
            new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" },
            new String[] { "aaa uuu aaa", "aaa uuu aaa xxx xxx xxx aaa xxx" },
            // This tests that zipped collections handle initial spaces properly and
            // that word readers are propagated correctly.
            new String[] { " aaa uuu aaa", " aaa uuu aaa _ __ xxx _ xxx xxx aaa xxx" }, };

    private final static Properties DEFAULT_PROPERTIES = new Properties();
    static {
        DEFAULT_PROPERTIES.setProperty(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "ASCII");
        DEFAULT_PROPERTIES.setProperty(PropertyBasedDocumentFactory.MetadataKeys.WORDREADER,
                it.unimi.dsi.io.FastBufferedReader.class.getName() + "(_)");
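        // The "(_)" argument in the word-reader specification presumably adds '_' to the
        // characters treated as word constituents; this is what lets the word reader return
        // the "_" and "__" tokens appearing in the documents above.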
    }

    /** The number of documents. */
    private final static int ndoc = document.length;
    /** The temporary directory where all tests are run. */
    private static File tempDir;
    /** The set of files in the HTML directory. */
    private static String[] htmlFileSet;

    /** Given a two-field document, produce an HTML document with the first field as title and
     *  the remainder of the second field (past the first-field prefix) as body.
     *  
     *  @param document the document.
     *  @return the HTML version of the document.
     */
    private static String getHTMLDocument(String[] document) {
        MutableString res = new MutableString();
        res.append(
                "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Strict//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n");
        res.append("<HTML>\n<HEAD>\n<TITLE>" + document[0] + "</TITLE>\n");
        // Do NOT append the first part of the body
        res.append("<BODY>\n" + document[1].substring(document[0].length()));
        res.append("\n</BODY>\n");
        res.append("</HTML>");
        return res.toString();
    }

    /** Given a two-field document, produce an mbox document with the first field as subject and
     *  the second field as body.
     *  
     *  @param document the document.
     *  @return the mbox version of the document.
     */
    private static String getMboxDocument(String[] document) {
        MutableString res = new MutableString();
        res.append("From MAILER-DAEMON Fri Apr 15 16:22:32 2005\n");
        res.append("Date: 15 Apr 2005 16:22:32 +0200\n");
        res.append("From: Mail System Internal Data <MAILER-DAEMON@sliver.usr.dsi.unimi.it>\n");
        res.append("Subject: " + document[0] + "\n");
        res.append("Message-ID: <1113574952@sliver.usr.dsi.unimi.it>\n");
        res.append("X-IMAP: 1102967122 0000138458\n");
        res.append("Return-Path: <matteo.xxx@unimi.it>\n");
        res.append("Received: from localhost (localhost.localdomain [127.0.0.1])\n");
        res.append("\tby sliver.usr.dsi.unimi.it (8.12.11/8.12.11) with ESMTP id iAUNtadn007305\n");
        res.append("\tfor <vigna@localhost>; Wed, 1 Dec 2004 00:55:36 +0100\n");
        res.append("Received: from law5.usr.dsi.unimi.it [159.149.146.241]\n");
        res.append("\tby localhost with IMAP (fetchmail-6.2.5)\n");
        res.append("\tfor vigna@localhost (single-drop); Wed, 01 Dec 2004 00:55:36 +0100 (CET)\n");
        res.append("To: vigna@dsi.unimi.it\n");
        res.append("Message-id: <Pine.WNT.4.33.0412010051240.-209505@p233-mmx>\n");
        res.append("Content-type: TEXT/PLAIN; charset=iso-8859-15\n");
        res.append("X-Warning: UNAuthenticated Sender\n");
        res.append("Content-Transfer-Encoding: 8bit\n");
        res.append("Content-Length: " + document[1].length() + "\n");
        res.append("\n");
        res.append(document[1] + "\n");
        return res.toString();
    }

    /** Checks that the tokenizer and the word reader return exactly the same sequence of words. 
     * 
     * @param wordReader the word reader.
     * @param tok the tokenizer.
     * @throws IOException
     */
    private void checkSameWords(WordReader wordReader, StringTokenizer tok) throws IOException {
        MutableString word = new MutableString();
        MutableString nonWord = new MutableString();
        boolean aWordInDocum, aWordInDocument;
        boolean firstTime = true;
        for (;;) {
            aWordInDocum = wordReader.next(word, nonWord);
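            // A document whose content starts with whitespace yields an initial empty word;
            // skip it so that the comparison with the (whitespace-splitting) tokenizer stays aligned.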
            if (firstTime) {
                firstTime = false;
                if (word.equals(""))
                    continue;
            }
            assertFalse(aWordInDocum && word.equals(""));
            aWordInDocument = tok.hasMoreElements();
            assertTrue(aWordInDocum == aWordInDocument);
            if (!aWordInDocum)
                break;
            assertEquals(tok.nextElement(), word.toString());
        }
    }

    /** Checks that the documents in the collection have the same sequence of words as in
     *  <code>document</code>: the names of the fields to be checked are specified in the array.
     *  
     * @param coll the collection.
     * @param fieldName the field names.
     * @param document documents to be checked against.
     * @throws IOException
     */
    private void checkAllDocuments(final DocumentCollection coll, final String[] fieldName,
            final String[][] document) throws IOException {
        final int nfields = fieldName.length;
        final int[] fieldNumber = new int[nfields];
        final int[] arrayIndex = new int[nfields];
        // Look for field indices
        for (int i = 0; i < nfields; i++) {
            arrayIndex[i] = i;
            int j;
            for (j = 0; j < coll.factory().numberOfFields(); j++)
                if (coll.factory().fieldName(j).equals(fieldName[i])) {
                    fieldNumber[i] = j;
                    break;
                }
            assertTrue("Field " + fieldName[i] + " not found", j < coll.factory().numberOfFields());
        }
        // Sort fields to guarantee that they are correctly numbered
        Arrays.quickSort(0, nfields, new AbstractIntComparator() {
            public int compare(int x, int y) {
                return fieldNumber[x] - fieldNumber[y];
            }
        }, new Swapper() {
            public void swap(int x, int y) {
                int t = fieldNumber[x];
                fieldNumber[x] = fieldNumber[y];
                fieldNumber[y] = t;
                t = arrayIndex[x];
                arrayIndex[x] = arrayIndex[y];
                arrayIndex[y] = t;
                String q = fieldName[x];
                fieldName[x] = fieldName[y];
                fieldName[y] = q;
            }
        });
        // Start checking
        for (int doc = 0; doc < coll.size(); doc++) {
            Document docum = coll.document(doc);
            for (int i = 0; i < nfields; i++) {
                int field = fieldNumber[i];
                Reader content = (Reader) docum.content(field);
                WordReader wordReader = docum.wordReader(field);
                wordReader.setReader(content);
                StringTokenizer tok = new StringTokenizer(document[doc][arrayIndex[i]]);
                System.err.println("Checking document " + doc + " field " + fieldName[i] + " (" + field + ")");
                checkSameWords(wordReader, tok);
            }
            docum.close();
        }
    }

    /** Checks that the documents in the sequence have the same sequence of words as in
     *  <code>document</code>: the names of the fields to be checked are specified in the array.
     *  
     * @param seq the sequence.
     * @param fieldName the field names.
     * @param document documents to be checked against.
     * @throws IOException
     */
    private void checkAllDocumentsSeq(final DocumentSequence seq, final String[] fieldName,
            final String[][] document) throws IOException {
        final int nfields = fieldName.length;
        final int[] fieldNumber = new int[nfields];
        final int[] arrayIndex = new int[nfields];
        // Look for field indices
        for (int i = 0; i < nfields; i++) {
            arrayIndex[i] = i;
            int j;
            for (j = 0; j < seq.factory().numberOfFields(); j++)
                if (seq.factory().fieldName(j).equals(fieldName[i])) {
                    fieldNumber[i] = j;
                    break;
                }
            assertTrue("Field " + fieldName[i] + " not found", j < seq.factory().numberOfFields());
        }
        // Sort fields to guarantee that they are correctly numbered
        Arrays.quickSort(0, nfields, new AbstractIntComparator() {
            public int compare(int x, int y) {
                return fieldNumber[x] - fieldNumber[y];
            }
        }, new Swapper() {
            public void swap(int x, int y) {
                int t = fieldNumber[x];
                fieldNumber[x] = fieldNumber[y];
                fieldNumber[y] = t;
                t = arrayIndex[x];
                arrayIndex[x] = arrayIndex[y];
                arrayIndex[y] = t;
                String q = fieldName[x];
                fieldName[x] = fieldName[y];
                fieldName[y] = q;
            }
        });
        // Start checking
        DocumentIterator iterator = seq.iterator();
        Document docum;
        int doc = 0;
        while ((docum = iterator.nextDocument()) != null) {
            for (int i = 0; i < nfields; i++) {
                int field = fieldNumber[i];
                Reader content = (Reader) docum.content(field);
                WordReader wordReader = docum.wordReader(field);
                wordReader.setReader(content);
                StringTokenizer tok = new StringTokenizer(document[doc][arrayIndex[i]]);
                System.err.println(
                        "Checking sequentially document " + doc + " field " + fieldName[i] + " (" + field + ")");
                checkSameWords(wordReader, tok);
            }
            docum.close();
            doc++;
        }
        iterator.close();
    }

    @BeforeClass
    public static void setUp() throws IOException, ConfigurationException {
        // Create a new temporary directory
        tempDir = File.createTempFile("mg4jtest", null);
        tempDir.delete();
        tempDir.mkdir();
        // Now create the hierarchy for HTML files
        File htmlDir = new File(tempDir, "html");
        htmlDir.mkdir();
        System.err.println("Temporary directory: " + tempDir);
        htmlFileSet = new String[ndoc];
        for (int i = 0; i < ndoc; i++) {
            String docFile = new File(htmlDir, "doc" + i + ".html").toString();
            htmlFileSet[i] = docFile;
            Writer docWriter = new OutputStreamWriter(new FileOutputStream(docFile), "ISO-8859-1");
            docWriter.write(getHTMLDocument(document[i]));
            docWriter.close();
        }
        // Now create the mbox file
        Writer mboxWriter = new OutputStreamWriter(new FileOutputStream(new File(tempDir, "mbox")), "ISO-8859-1");
        for (int i = 0; i < ndoc; i++)
            mboxWriter.write(getMboxDocument(document[i]));
        mboxWriter.close();

        // Now create the zip collections
        FileSetDocumentCollection fileSetDocumentCollection = new FileSetDocumentCollection(htmlFileSet,
                new HtmlDocumentFactory(DEFAULT_PROPERTIES));
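        // The boolean flag passed to the builders below appears to select an exact (true) or
        // approximated (false) copy; the approximated variants ("azip", "asimple") are the
        // ones exercised by the *Appr tests.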
        ZipDocumentCollectionBuilder zipCollBuilder = new ZipDocumentCollectionBuilder(
                new File(tempDir, "zip").toString(), fileSetDocumentCollection.factory(), true);
        zipCollBuilder.build(fileSetDocumentCollection);

        ZipDocumentCollectionBuilder apprZipCollBuilder = new ZipDocumentCollectionBuilder(
                new File(tempDir, "azip").toString(), fileSetDocumentCollection.factory(), false);
        apprZipCollBuilder.build(fileSetDocumentCollection);

        // Now create the simple collections
        SimpleCompressedDocumentCollectionBuilder simpleCollBuilder = new SimpleCompressedDocumentCollectionBuilder(
                new File(tempDir, "simple").toString(), fileSetDocumentCollection.factory(), true);
        simpleCollBuilder.build(fileSetDocumentCollection);

        SimpleCompressedDocumentCollectionBuilder apprSimpleCollBuilder = new SimpleCompressedDocumentCollectionBuilder(
                new File(tempDir, "asimple").toString(), fileSetDocumentCollection.factory(), false);
        apprSimpleCollBuilder.build(fileSetDocumentCollection);
        fileSetDocumentCollection.close();
    }

    @AfterClass
    public static void tearDown() throws IOException {
        FileUtils.forceDelete(tempDir);
    }

    @Test
    public void testFileSetDocumentCollection() throws IOException, ConfigurationException {
        System.err.println("Checking fileset collection");
        FileSetDocumentCollection coll = new FileSetDocumentCollection(htmlFileSet,
                new HtmlDocumentFactory(DEFAULT_PROPERTIES));
        assertEquals(ndoc, coll.size());
        checkAllDocuments(coll, new String[] { "title", "text" }, document);
        coll.close();
    }

    @Test
    public void testFileSetDocumentCollectionSeq() throws IOException, ConfigurationException {
        System.err.println("Checking fileset collection sequentially");
        FileSetDocumentCollection coll = new FileSetDocumentCollection(htmlFileSet,
                new HtmlDocumentFactory(DEFAULT_PROPERTIES));
        checkAllDocumentsSeq(coll, new String[] { "title", "text" }, document);
        coll.close();
    }

    @Test
    public void testZipDocumentCollection() throws IOException, ClassNotFoundException {
        System.err.println("Checking zipped collection");
        ZipDocumentCollection coll = (ZipDocumentCollection) BinIO
                .loadObject(new File(tempDir, "zip.collection").toString());
        checkAllDocuments(coll, new String[] { "title", "text" }, document);
        coll.close();
    }

    @Test
    public void testZipDocumentCollectionSeq() throws IOException, ClassNotFoundException {
        System.err.println("Checking zipped collection sequentially");
        ZipDocumentCollection coll = (ZipDocumentCollection) BinIO
                .loadObject(new File(tempDir, "zip.collection").toString());
        checkAllDocumentsSeq(coll, new String[] { "title", "text" }, document);
        coll.close();
    }

    @Test
    public void testZipDocumentCollectionAppr() throws IOException, ClassNotFoundException {
        System.err.println("Checking approximated zipped collection");
        ZipDocumentCollection coll = (ZipDocumentCollection) BinIO
                .loadObject(new File(tempDir, "azip.collection").toString());
        checkAllDocuments(coll, new String[] { "title", "text" }, document);
        coll.close();
    }

    @Test
    public void testZipDocumentCollectionApprSeq() throws IOException, ClassNotFoundException {
        System.err.println("Checking approximated zipped collection sequentially");
        ZipDocumentCollection coll = (ZipDocumentCollection) BinIO
                .loadObject(new File(tempDir, "azip.collection").toString());
        checkAllDocumentsSeq(coll, new String[] { "title", "text" }, document);
        coll.close();
    }

    @Test
    public void testSimpleCompressedDocumentCollection() throws IOException, ClassNotFoundException {
        System.err.println("Checking simple compressed collection");
        SimpleCompressedDocumentCollection coll = (SimpleCompressedDocumentCollection) BinIO
                .loadObject(new File(tempDir, "simple.collection").toString());
        checkAllDocuments(coll, new String[] { "title", "text" }, document);
        coll.close();
    }

    @Test
    public void testSimpleCompressedDocumentCollectionSeq() throws IOException, ClassNotFoundException {
        System.err.println("Checking simple compressed collection sequentially");
        SimpleCompressedDocumentCollection coll = (SimpleCompressedDocumentCollection) BinIO
                .loadObject(new File(tempDir, "simple.collection").toString());
        checkAllDocumentsSeq(coll, new String[] { "title", "text" }, document);
        coll.close();
    }

    @Test
    public void testSimpleCompressedDocumentCollectionAppr() throws IOException, ClassNotFoundException {
        System.err.println("Checking approximated simple compressed collection");
        SimpleCompressedDocumentCollection coll = (SimpleCompressedDocumentCollection) BinIO
                .loadObject(new File(tempDir, "asimple.collection").toString());
        checkAllDocuments(coll, new String[] { "title", "text" }, document);
        coll.close();
    }

    @Test
    public void testSimpleCompressedDocumentCollectionApprSeq() throws IOException, ClassNotFoundException {
        System.err.println("Checking approximated simple compressed collection sequentially");
        SimpleCompressedDocumentCollection coll = (SimpleCompressedDocumentCollection) BinIO
                .loadObject(new File(tempDir, "asimple.collection").toString());
        checkAllDocumentsSeq(coll, new String[] { "title", "text" }, document);
        coll.close();
    }

    @Test
    public void testConcatenated() throws IOException, ClassNotFoundException {
        SimpleCompressedDocumentCollection coll0 = (SimpleCompressedDocumentCollection) BinIO
                .loadObject(new File(tempDir, "asimple.collection").toString());
        SimpleCompressedDocumentCollection coll1 = (SimpleCompressedDocumentCollection) BinIO
                .loadObject(new File(tempDir, "asimple.collection").toString());

        ConcatenatedDocumentCollection concatenatedDocumentCollection = new ConcatenatedDocumentCollection(
                new String[] { new File(tempDir, "asimple.collection").toString(),
                        new File(tempDir, "asimple.collection").toString() });
        ConcatenatedDocumentSequence concatenatedDocumentSequence0 = new ConcatenatedDocumentSequence(coll0, coll1);
        ConcatenatedDocumentSequence concatenatedDocumentSequence1 = new ConcatenatedDocumentSequence(
                new File(tempDir, "asimple.collection").toString(),
                new File(tempDir, "asimple.collection").toString());
        checkAllDocumentsSeq(concatenatedDocumentSequence0, new String[] { "title", "text" }, document2);
        checkAllDocumentsSeq(concatenatedDocumentSequence1, new String[] { "title", "text" }, document2);
        checkAllDocuments(concatenatedDocumentCollection, new String[] { "title", "text" }, document2);
        concatenatedDocumentCollection.close();
        concatenatedDocumentSequence0.close();
        concatenatedDocumentSequence1.close();
    }

    @Test
    public void testInputStreamSequence() throws IOException, ConfigurationException {
        System.err.println("Checking input stream (text field only)");
        // Extract only field number 1, and write it out with separator '\u0000'
        MutableString res = new MutableString();
        String[][] justSecondField = new String[ndoc][1];
        for (int i = 0; i < ndoc; i++) {
            res.append(document[i][1] + "\u0000");
            justSecondField[i][0] = document[i][1];
        }
        String resString = res.toString();
        // Write the sequence on a file (in UTF-8)
        Writer resWriter = new OutputStreamWriter(new FileOutputStream(new File(tempDir, "stream")), "UTF-8");
        resWriter.write(resString);
        resWriter.close();
        // Read it as an input stream document sequence
        InputStream is = new FileInputStream(new File(tempDir, "stream"));
        DocumentSequence seq = new InputStreamDocumentSequence(is, '\u0000',
                new IdentityDocumentFactory(DEFAULT_PROPERTIES));
        checkAllDocumentsSeq(seq, new String[] { "text" }, justSecondField);
        seq.close();
    }

    @Test
    public void testSubsetDocumentSequence() throws IOException, ClassNotFoundException {
        // All documents
        DocumentSequence seq = (SimpleCompressedDocumentCollection) BinIO
                .loadObject(new File(tempDir, "asimple.collection").toString());
        LongSet allDocuments = new LongOpenHashSet(LongIterators.fromTo(0, document.length));
        SubsetDocumentSequence trivialSubsetDocumentSequence = new SubsetDocumentSequence(seq, allDocuments);
        checkAllDocumentsSeq(trivialSubsetDocumentSequence, new String[] { "title", "text" }, document);
        seq.close();
        trivialSubsetDocumentSequence.close();

        // Even documents only
        seq = (SimpleCompressedDocumentCollection) BinIO
                .loadObject(new File(tempDir, "asimple.collection").toString());
        String[][] evenDocuments = new String[document.length / 2][];
        LongSet evenDocumentPointers = new LongOpenHashSet();
        for (int i = 0; i < evenDocuments.length; i++) {
            evenDocuments[i] = document[2 * i];
            evenDocumentPointers.add(2 * i);
        }
        SubsetDocumentSequence evenSubsetDocumentSequence = new SubsetDocumentSequence(seq, evenDocumentPointers);
        checkAllDocumentsSeq(evenSubsetDocumentSequence, new String[] { "title", "text" }, evenDocuments);
        seq.close();
        evenSubsetDocumentSequence.close();

        // All but number 3
        seq = (SimpleCompressedDocumentCollection) BinIO
                .loadObject(new File(tempDir, "asimple.collection").toString());
        String[][] almostAll = new String[document.length - 1][];
        LongSet almostAllDocumentPointers = new LongOpenHashSet();
        for (int i = 0, j = 0; i < document.length; i++)
            if (i != 3) {
                almostAll[j] = document[i];
                almostAllDocumentPointers.add(i);
                j++;
            }
        SubsetDocumentSequence almostAllDocumentSequence = new SubsetDocumentSequence(seq,
                almostAllDocumentPointers);
        checkAllDocumentsSeq(almostAllDocumentSequence, new String[] { "title", "text" }, almostAll);
        seq.close();
        almostAllDocumentSequence.close();

        // None
        seq = (SimpleCompressedDocumentCollection) BinIO
                .loadObject(new File(tempDir, "asimple.collection").toString());
        String[][] none = new String[0][];
        LongSet noneDocumentPointers = new LongOpenHashSet();
        SubsetDocumentSequence noneDocumentSequence = new SubsetDocumentSequence(seq, noneDocumentPointers);
        checkAllDocumentsSeq(noneDocumentSequence, new String[] { "title", "text" }, none);
        seq.close();
        noneDocumentSequence.close();
    }

}
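
The standalone sketch below is not part of the test above; it shows, under the assumption that a collection has already been serialized as in setUp() (the path and class name used here are placeholders), how such a collection might be loaded and scanned using only the calls the test itself exercises (BinIO.loadObject, document(), content(), wordReader(), next()).

package it.unimi.di.big.mg4j.document;

import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;

import java.io.Reader;

/** A minimal usage sketch (hypothetical class): loads a serialized collection and prints
 *  the words of field 0 of every document. */
public class DumpFirstFieldSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder path: a collection serialized by one of the builders above.
        DocumentCollection coll = (DocumentCollection) BinIO.loadObject("/tmp/mg4jtest/zip.collection");
        MutableString word = new MutableString(), nonWord = new MutableString();
        for (int i = 0; i < coll.size(); i++) {
            Document document = coll.document(i);
            // Field 0 is simply the first field declared by the factory; the test above
            // looks fields up by name ("title", "text") instead of relying on their order.
            Reader content = (Reader) document.content(0);
            WordReader wordReader = document.wordReader(0);
            wordReader.setReader(content);
            while (wordReader.next(word, nonWord)) System.out.println(word);
            document.close();
        }
        coll.close();
    }
}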