mitm.common.extractor.impl.UnicodeTextExtractorTest.java Source code

Introduction

Here is the source code for mitm.common.extractor.impl.UnicodeTextExtractorTest.java
Source

/*
 * Copyright (c) 2010-2011, Martijn Brinkers, Djigzo.
 * 
 * This file is part of Djigzo email encryption.
 *
 * Djigzo is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License 
 * version 3, 19 November 2007 as published by the Free Software 
 * Foundation.
 *
 * Djigzo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public 
 * License along with Djigzo. If not, see <http://www.gnu.org/licenses/>
 *
 * Additional permission under GNU AGPL version 3 section 7
 * 
 * If you modify this Program, or any covered work, by linking or 
 * combining it with aspectjrt.jar, aspectjweaver.jar, tyrex-1.0.3.jar, 
 * freemarker.jar, dom4j.jar, mx4j-jmx.jar, mx4j-tools.jar, 
 * spice-classman-1.0.jar, spice-loggerstore-0.5.jar, spice-salt-0.8.jar, 
 * spice-xmlpolicy-1.0.jar, saaj-api-1.3.jar, saaj-impl-1.3.jar, 
 * wsdl4j-1.6.1.jar (or modified versions of these libraries), 
 * containing parts covered by the terms of Eclipse Public License, 
 * tyrex license, freemarker license, dom4j license, mx4j license,
 * Spice Software License, Common Development and Distribution License
 * (CDDL), Common Public License (CPL) the licensors of this Program grant 
 * you additional permission to convey the resulting work.
 */
package mitm.common.extractor.impl;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

import mitm.common.extractor.ExtractedPart;
import mitm.common.extractor.TextExtractorContext;
import mitm.common.extractor.TextExtractorEventHandler;
import mitm.common.util.RewindableInputStream;
import mitm.common.util.FileConstants;
import mitm.common.util.LimitReachedException;
import mitm.common.util.SizeUtils;
import mitm.test.TestUtils;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.CharEncoding;
import org.apache.commons.lang.NotImplementedException;
import org.apache.log4j.BasicConfigurator;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Tests for {@link UnicodeTextExtractor}.
 * <p>
 * Each test feeds a document from <code>test/resources/testdata/documents</code> to the
 * extractor and inspects the text parts reported to the event handler. After every test
 * the collected parts and opened input streams are released and the number of temp files
 * is compared with the number counted before the test to detect temp file leakage.
 *
 * @author Martijn Brinkers
 *
 */
public class UnicodeTextExtractorTest {
    /*
     * Text parts reported by the extractor during the current test. Shared (static) between
     * tests, so it must be emptied after every test.
     */
    private static final List<ExtractedPart> textParts = new LinkedList<ExtractedPart>();

    /*
     * Input streams opened by readDocument which must be closed after the test.
     */
    private static final List<RewindableInputStream> toClose = new LinkedList<RewindableInputStream>();

    /*
     * Handler handed to the extractor; stores every text part in textParts.
     */
    private final TextExtractorEventHandler handler = new TextExtractorEventHandlerImpl();

    /*
     * Number of temp files found just before the current test started.
     */
    private static int tempFileCount;

    /**
     * Event handler which collects text parts and rejects attachments.
     */
    private static class TextExtractorEventHandlerImpl implements TextExtractorEventHandler {
        @Override
        public void textEvent(ExtractedPart textPart) throws IOException {
            textParts.add(textPart);
        }

        @Override
        public void attachmentEvent(ExtractedPart attachmentPart) {
            throw new NotImplementedException("text file should not have attachments.");
        }
    }

    /**
     * Closes every part in the list and leaves the list empty.
     * <p>
     * Each part is removed from the list before it is closed and the remaining parts are
     * handled in a finally block, so a failing close neither prevents the other parts from
     * being closed nor leaves stale parts behind for the next test. If more than one close
     * fails, the failure thrown last is the one that propagates.
     *
     * @param parts the parts to close; empty on return, also when an exception is thrown
     * @throws IOException if closing a part fails
     */
    private void cleanParts(List<ExtractedPart> parts) throws IOException {
        if (parts.isEmpty()) {
            return;
        }

        ExtractedPart part = parts.remove(0);

        try {
            part.close();
        }
        finally {
            // always continue with the remaining parts, even when closing this one failed
            cleanParts(parts);
        }
    }

    /**
     * Closes every stream in the list, ignoring close failures, and empties the list.
     *
     * @param streams the streams to close; empty on return
     */
    private void closeInputStreams(List<RewindableInputStream> streams) throws IOException {
        for (RewindableInputStream stream : streams) {
            IOUtils.closeQuietly(stream);
        }

        streams.clear();
    }

    /**
     * Opens a test document from <code>test/resources/testdata/documents</code> and registers
     * the returned stream so it gets closed after the test.
     *
     * @param filename name of the file inside the test documents directory
     * @return a rewindable stream over the file
     * @throws FileNotFoundException if the file cannot be opened
     */
    private static RewindableInputStream readDocument(String filename) throws FileNotFoundException {
        RewindableInputStream stream = new RewindableInputStream(
                new BufferedInputStream(
                        new FileInputStream(new File("test/resources/testdata/documents", filename))),
                SizeUtils.MB * 1);

        toClose.add(stream);

        return stream;
    }

    @BeforeClass
    public static void beforeClass() throws Exception {
        BasicConfigurator.configure();
    }

    @Before
    public void before() {
        // get the current nr of temp files
        tempFileCount = TestUtils.getTempFileCount(FileConstants.TEMP_FILE_PREFIX, ".tmp");
    }

    @After
    public void after() throws IOException {
        try {
            cleanParts(textParts);
        }
        finally {
            // the input streams must be closed even when closing a part failed
            closeInputStreams(toClose);
        }

        // check if we have any temp file leakage
        assertEquals(tempFileCount, TestUtils.getTempFileCount(FileConstants.TEMP_FILE_PREFIX, ".tmp"));
    }

    /**
     * Asserts that the extracted text equals the expected note document preceded by the
     * given header line.
     *
     * @param text the extracted text
     * @param header the expected first line (without line ending)
     */
    private void checkText(String text, String header) {
        // NOTE(review): the per-language sample text inside the literals below is empty or '?';
        // it looks like the non-ASCII characters were lost in a charset conversion of this
        // source file - confirm against the xmlUTF8.xml fixture.
        String expected = header + "\r\n<note>\r\n" + "   <from>Santa Claus</from>\r\n"
                + "   <to>Donald Duck</to>\r\n" + "\r\n" + "   <body1>Norwegian:  </body1>\r\n"
                + "   <body2>French:  </body2>\r\n" + "   <body3>Finnish:  </body3>\r\n"
                + "   <body4>Greek: </body4>\r\n" + "   <body5>Cyrillic: </body5>\r\n"
                + "   <body6>Arabic: ?</body6>\r\n" + "   <body7>Hebrew: ?</body7>\r\n"
                + "   <body8>Symbols: </body8>\r\n" + "   <body9>Gujarati: ?</body9>\r\n"
                + "   <body10>Tamil: </body10>\r\n" + "   <body11>Malayalam: </body11>\r\n"
                + "   <body12>More symbols: </body12>\r\n"
                + "   <body13>Arrows: ?</body13>\r\n"
                + "   <body14>Circled digits: </body14>\r\n"
                + "   <body15>Circled letters:  </body15>\r\n"
                + "   <body16>Chess: ?</body16>\r\n"
                + "   <body17>Flowers: ???????</body17>\r\n"
                + "   <body18>Units: ??</body18>\r\n"
                + "   <body19>CJK Ideograph: </body19>\r\n"
                + "   <body20>Katakana: </body20>\r\n" + "\r\n" + "</note>\r\n";

        assertEquals(expected, text);
    }

    /**
     * Extracts xmlUTF8.xml and checks that exactly one text part is produced, that the part
     * carries the name set on the context and that its content matches the expected document.
     */
    @Test
    public void testUTF8BOM() throws Exception {
        UnicodeTextExtractor extractor = new UnicodeTextExtractor(1, Integer.MAX_VALUE);

        TextExtractorContext context = new TextExtractorContextImpl();

        context.setName("xmlUTF8.xml");

        extractor.extract(readDocument("xmlUTF8.xml"), context, handler);

        assertEquals(1, textParts.size());

        ExtractedPart part = textParts.get(0);
        assertEquals("xmlUTF8.xml", part.getContext().getName());

        String text = IOUtils.toString(part.getContent(), CharEncoding.UTF_8);

        checkText(text, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
    }

    /**
     * Extracts xmlUTF8_NOBOM.xml without setting a name or encoding on the context and checks
     * that one unnamed text part is produced which contains the expected fragments.
     */
    @Test
    public void testUTF8NoBom() throws IOException {
        UnicodeTextExtractor extractor = new UnicodeTextExtractor(1, Integer.MAX_VALUE);

        TextExtractorContext context = new TextExtractorContextImpl();

        extractor.extract(readDocument("xmlUTF8_NOBOM.xml"), context, handler);

        assertEquals(1, textParts.size());

        ExtractedPart part = textParts.get(0);
        // no name was set on the context
        assertNull(part.getContext().getName());

        String text = IOUtils.toString(part.getContent(), CharEncoding.UTF_8);

        assertTrue(text.contains("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"));
        assertTrue(text.contains("This does not work due to a missing BOM marker"));
        assertTrue(text.contains("French:  "));
        assertTrue(text.contains("Units: ??"));
    }

    /**
     * Extracts xmlUTF8_NOBOM.xml with the context encoding explicitly set to US-ASCII and
     * checks that the ASCII fragments are present while the "Units" fragment is not.
     */
    @Test
    public void testExplicitEncoding() throws IOException {
        UnicodeTextExtractor extractor = new UnicodeTextExtractor(1, Integer.MAX_VALUE);

        TextExtractorContext context = new TextExtractorContextImpl();

        context.setEncoding("US-ASCII");

        extractor.extract(readDocument("xmlUTF8_NOBOM.xml"), context, handler);

        assertEquals(1, textParts.size());

        ExtractedPart part = textParts.get(0);
        assertNull(part.getContext().getName());

        String text = IOUtils.toString(part.getContent(), CharEncoding.UTF_8);

        assertTrue(text.contains("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"));
        assertTrue(text.contains("This does not work due to a missing BOM marker"));
        // this should not be found because encoding is explicitly set to US-ASCII
        assertFalse(text.contains("Units: ??"));
    }

    /**
     * Extracts big-preamble.html with the second constructor argument set to 10 instead of
     * Integer.MAX_VALUE and expects a LimitReachedException.
     * NOTE(review): the second argument is presumably the maximum size the extractor may
     * handle - confirm against UnicodeTextExtractor.
     */
    @Test(expected = LimitReachedException.class)
    public void testLimitExceeded() throws IOException {
        UnicodeTextExtractor extractor = new UnicodeTextExtractor(1, 10);

        TextExtractorContext context = new TextExtractorContextImpl();

        extractor.extract(readDocument("big-preamble.html"), context, handler);
    }
}