Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ import java.io.IOException; import java.io.InputStream; import junit.framework.TestCase; import org.apache.poi.POIDataSamples; import org.apache.poi.POITextExtractor; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFTestDataSamples; import org.apache.poi.hwpf.OldWordFileFormatException; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** * * @author jalvaren */ public class NewEmptyJUnitTest { public static void assertEquals(String expected, String actual) { String newExpected = expected.replaceAll("\r\n", "\n").replaceAll("\r", "\n").trim(); String newActual = actual.replaceAll("\r\n", "\n").replaceAll("\r", "\n").trim(); TestCase.assertEquals(newExpected, newActual); } private String[] p_text1 = new String[] { "This is a simple word document\r\n", "\r\n", "It has a number of paragraphs in it\r\n", "\r\n", "Some of them even feature bold, italic and underlined text\r\n", "\r\n", "\r\n", "This bit is in a different font and size\r\n", "\r\n", "\r\n", "This bit features some red text.\r\n", "\r\n", "\r\n", "It is otherwise very very boring.\r\n" }; private String p_text1_block = ""; // Well behaved document private WordExtractor extractor; // Slightly iffy document private WordExtractor extractor2; // A word doc embeded in an excel file private String filename3; // With header and footer private String filename4; // With unicode header and footer private String filename5; // With footnote private String filename6; protected void setUp() throws Exception { String filename = "test2.doc"; String filename2 = "test.doc"; filename3 = "excel_with_embeded.xls"; filename4 = "ThreeColHeadFoot.doc"; filename5 = "HeaderFooterUnicode.doc"; filename6 = "footnote.doc"; POIDataSamples docTests = POIDataSamples.getDocumentInstance(); extractor = new WordExtractor(docTests.openResourceAsStream(filename)); extractor2 = new WordExtractor(docTests.openResourceAsStream(filename2)); // Build splat'd out text version for (int i = 0; i < p_text1.length; i++) { p_text1_block += p_text1[i]; } } /** * Test paragraph based extraction */ public void testExtractFromParagraphs() { String[] text = extractor.getParagraphText(); assertEquals(p_text1.length, text.length); for (int i = 0; i < p_text1.length; i++) { assertEquals(p_text1[i], text[i]); } // Lots of paragraphs with only a few lines in them assertEquals(24, extractor2.getParagraphText().length); assertEquals("as d\r\n", extractor2.getParagraphText()[16]); assertEquals("as d\r\n", extractor2.getParagraphText()[17]); assertEquals("as d\r\n", extractor2.getParagraphText()[18]); } /** * Test the paragraph -> flat extraction */ public void testGetText() { assertEquals(p_text1_block, extractor.getText()); // For the 2nd, should give similar answers for // the two methods, differing only in line endings // nope, they must have different results, because of garbage // assertEquals( // extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""), // extractor2.getText().replaceAll("[\\r\\n]", "")); } /** * Test textPieces based extraction */ public void testExtractFromTextPieces() { String text = extractor.getTextFromPieces(); assertEquals(p_text1_block, text); } /** * Test that we can get data from two different * embeded word documents * @throws Exception */ public void testExtractFromEmbeded() throws Exception { POIFSFileSystem fs = new POIFSFileSystem( POIDataSamples.getSpreadSheetInstance().openResourceAsStream(filename3)); HWPFDocument doc; WordExtractor extractor3; DirectoryNode dirA = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B7"); DirectoryNode dirB = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B2"); // Should have WordDocument and 1Table assertNotNull(dirA.getEntry("1Table")); assertNotNull(dirA.getEntry("WordDocument")); assertNotNull(dirB.getEntry("1Table")); assertNotNull(dirB.getEntry("WordDocument")); // Check each in turn doc = new HWPFDocument(dirA, fs); extractor3 = new WordExtractor(doc); assertNotNull(extractor3.getText()); assertTrue(extractor3.getText().length() > 20); assertEquals("I am a sample document\r\nNot much on me\r\nI am document 1\r\n", extractor3.getText()); assertEquals("Sample Doc 1", extractor3.getSummaryInformation().getTitle()); assertEquals("Sample Test", extractor3.getSummaryInformation().getSubject()); doc = new HWPFDocument(dirB, fs); extractor3 = new WordExtractor(doc); assertNotNull(extractor3.getText()); assertTrue(extractor3.getText().length() > 20); assertEquals("I am another sample document\r\nNot much on me\r\nI am document 2\r\n", extractor3.getText()); assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle()); assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject()); } public void testWithHeader() { // Non-unicode HWPFDocument doc = HWPFTestDataSamples.openSampleFile(filename4); extractor = new WordExtractor(doc); assertEquals("First header column!\tMid header Right header!\n", extractor.getHeaderText()); String text = extractor.getText(); assertTrue(text.indexOf("First header column!") > -1); // Unicode doc = HWPFTestDataSamples.openSampleFile(filename5); extractor = new WordExtractor(doc); assertEquals("This is a simple header, with a \u20ac euro symbol in it.\n\n", extractor.getHeaderText()); text = extractor.getText(); assertTrue(text.indexOf("This is a simple header") > -1); } public void testWithFooter() { // Non-unicode HWPFDocument doc = HWPFTestDataSamples.openSampleFile(filename4); extractor = new WordExtractor(doc); assertEquals("Footer Left\tFooter Middle Footer Right\n", extractor.getFooterText()); String text = extractor.getText(); assertTrue(text.indexOf("Footer Left") > -1); // Unicode doc = HWPFTestDataSamples.openSampleFile(filename5); extractor = new WordExtractor(doc); assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\n", extractor.getFooterText()); text = extractor.getText(); assertTrue(text.indexOf("The footer, with") > -1); } public void testFootnote() { HWPFDocument doc = HWPFTestDataSamples.openSampleFile(filename6); extractor = new WordExtractor(doc); String[] text = extractor.getFootnoteText(); StringBuffer b = new StringBuffer(); for (int i = 0; i < text.length; i++) { b.append(text[i]); } assertTrue(b.toString().contains("TestFootnote")); } public void testEndnote() { HWPFDocument doc = HWPFTestDataSamples.openSampleFile(filename6); extractor = new WordExtractor(doc); String[] text = extractor.getEndnoteText(); StringBuffer b = new StringBuffer(); for (int i = 0; i < text.length; i++) { b.append(text[i]); } assertTrue(b.toString().contains("TestEndnote")); } public void testComments() { HWPFDocument doc = HWPFTestDataSamples.openSampleFile(filename6); extractor = new WordExtractor(doc); String[] text = extractor.getCommentsText(); StringBuffer b = new StringBuffer(); for (int i = 0; i < text.length; i++) { b.append(text[i]); } assertTrue(b.toString().contains("TestComment")); } public void testWord95() throws Exception { // Too old for the default try { extractor = new WordExtractor(POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc")); fail(); } catch (OldWordFileFormatException e) { } // Can work with the special one Word6Extractor w6e = new Word6Extractor( POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc")); String text = w6e.getText(); assertTrue(text.contains("The quick brown fox jumps over the lazy dog")); assertTrue(text.contains("Paragraph 2")); assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it")); assertTrue(text.contains("Last (4th) paragraph")); String[] tp = w6e.getParagraphText(); assertEquals(7, tp.length); assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]); assertEquals("\r\n", tp[1]); assertEquals("Paragraph 2\r\n", tp[2]); assertEquals("\r\n", tp[3]); assertEquals("Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n", tp[4]); assertEquals("\r\n", tp[5]); assertEquals("Last (4th) paragraph.\r\n", tp[6]); } public void testWord6() throws Exception { // Too old for the default try { extractor = new WordExtractor(POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc")); fail(); } catch (OldWordFileFormatException e) { } Word6Extractor w6e = new Word6Extractor( POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc")); String text = w6e.getText(); assertTrue(text.contains("The quick brown fox jumps over the lazy dog")); String[] tp = w6e.getParagraphText(); assertEquals(1, tp.length); assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]); } public void testFastSaved() throws Exception { extractor = new WordExtractor(POIDataSamples.getDocumentInstance().openResourceAsStream("rasp.doc")); String text = extractor.getText(); assertTrue(text.contains("\u0425\u0425\u0425\u0425\u0425")); assertTrue(text.contains("\u0423\u0423\u0423\u0423\u0423")); } public void testFirstParagraphFix() throws Exception { extractor = new WordExtractor(POIDataSamples.getDocumentInstance().openResourceAsStream("Bug48075.doc")); String text = extractor.getText(); assertTrue(text.startsWith("\u041f\u0440\u0438\u043b\u043e\u0436\u0435\u043d\u0438\u0435")); } /** * Tests that we can work with both {@link POIFSFileSystem} * and {@link NPOIFSFileSystem} */ public void testDifferentPOIFS() throws Exception { POIDataSamples docTests = POIDataSamples.getDocumentInstance(); // Open the two filesystems DirectoryNode[] files = new DirectoryNode[2]; files[0] = (new POIFSFileSystem(docTests.openResourceAsStream("test2.doc"))).getRoot(); NPOIFSFileSystem npoifsFileSystem = new NPOIFSFileSystem(docTests.getFile("test2.doc")); files[1] = npoifsFileSystem.getRoot(); // Open directly for (DirectoryNode dir : files) { WordExtractor extractor = new WordExtractor(dir); assertEquals(p_text1_block, extractor.getText()); } // Open via a HWPFDocument for (DirectoryNode dir : files) { HWPFDocument doc = new HWPFDocument(dir); WordExtractor extractor = new WordExtractor(doc); assertEquals(p_text1_block, extractor.getText()); } npoifsFileSystem.close(); } /** * [RESOLVED FIXED] Bug 51686 - Update to POI 3.8 beta 4 causes * ConcurrentModificationException in Tika's OfficeParser */ public void testBug51686() throws IOException { InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream("Bug51686.doc"); POIFSFileSystem fs = new POIFSFileSystem(is); String text = null; for (Entry entry : fs.getRoot()) { if ("WordDocument".equals(entry.getName())) { WordExtractor ex = new WordExtractor(fs); try { text = ex.getText(); } finally { ex.close(); } } } assertNotNull(text); } public void testExtractorFromWord6Extractor() throws Exception { POIFSFileSystem fs = new POIFSFileSystem( POIDataSamples.getHPSFInstance().openResourceAsStream("TestMickey.doc")); Word6Extractor wExt = new Word6Extractor(fs); try { POITextExtractor ext = wExt.getMetadataTextExtractor(); try { // Now overall String text = ext.getText(); assertTrue(text.indexOf("TEMPLATE = Normal") > -1); assertTrue(text.indexOf("SUBJECT = sample subject") > -1); assertTrue(text.indexOf("MANAGER = sample manager") > -1); assertTrue(text.indexOf("COMPANY = sample company") > -1); } finally { ext.close(); } } finally { wExt.close(); } } }