Java tutorial
/**************************************************************** * Licensed to the Apache Software Foundation (ASF) under one * * or more contributor license agreements. See the NOTICE file * * distributed with this work for additional information * * regarding copyright ownership. The ASF licenses this file * * to you under the Apache License, Version 2.0 (the * * "License"); you may not use this file except in compliance * * with the License. You may obtain a copy of the License at * * * * http://www.apache.org/licenses/LICENSE-2.0 * * * * Unless required by applicable law or agreed to in writing, * * software distributed under the License is distributed on an * * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * * KIND, either express or implied. See the License for the * * specific language governing permissions and limitations * * under the License. * ****************************************************************/ package org.apache.james.jmap.utils; import static org.assertj.core.api.Assertions.assertThat; import java.nio.charset.StandardCharsets; import org.apache.commons.io.IOUtils; import org.junit.Before; import org.junit.Test; public class JsoupHtmlTextExtractorTest { private JsoupHtmlTextExtractor textExtractor; @Before public void setUp() { textExtractor = new JsoupHtmlTextExtractor(); } @Test public void toPlainTextShouldNotModifyPlainText() { String textWithoutHtml = "text without html"; assertThat(textExtractor.toPlainText(textWithoutHtml)).isEqualTo(textWithoutHtml); } @Test public void toPlainTextShouldRemoveSimpleHtmlTag() { String html = "This is an <b>HTML</b> text !"; String expectedPlainText = "This is an HTML text !"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void toPlainTextShouldReplaceSkipLine() { String html = "<p>This is an<br/>HTML text !</p>"; String expectedPlainText = "This is an\nHTML text !\n\n"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void toPlainTextShouldSkipLinesBetweenParagraph() { String html = "<p>para1</p><p>para2</p>"; String expectedPlainText = "para1\n\npara2\n\n"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void toPlainTextShouldConciderUpperCaseLabelsAsLowerCase() { String html = "<P>para1</P><p>para2</p>"; String expectedPlainText = "para1\n\npara2\n\n"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void toPlainTextShouldHandleListsWell() { String html = "<ul>Here is my awesome list:" + " <li>JMAP</li>" + " <li>IMAP</li>" + "</ul>" + "<p>Followed with some text</p>" + "<p>And some other text</p>"; String expectedPlainText = "Here is my awesome list: \n" + " - JMAP \n" + " - IMAP\n" + "\n" + "Followed with some text\n" + "\n" + "And some other text\n" + "\n"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void toPlainTextShouldHandleOrderedListsWell() { String html = "<ol>Here is my awesome list:" + " <li>JMAP</li>" + " <li>IMAP</li>" + "</ol>" + "<p>Followed with some text</p>" + "<p>And some other text</p>"; String expectedPlainText = "Here is my awesome list: \n" + " - JMAP \n" + " - IMAP\n" + "\n" + "Followed with some text\n" + "\n" + "And some other text\n" + "\n"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void tableShouldBeWellHandled() { String html = " <table style=\"width:100%\">\n" + " <tr>\n" + " <th>Firstname</th>\n" + " <th>Lastname</th>\n" + " <th>Age</th>\n" + " </tr>\n" + " <tr>\n" + " <td>Jill</td>\n" + " <td>Smith</td>\n" + " <td>50</td>\n" + " </tr>\n" + " <tr>\n" + " <td>Eve</td>\n" + " <td>Jackson</td>\n" + " <td>94</td>\n" + " </tr>\n" + "</table> "; String expectedPlainText = "\n" + " \n" + " Firstname\n" + " Lastname\n" + " Age\n" + " \n" + " \n" + " Jill\n" + " Smith\n" + " 50\n" + " \n" + " \n" + " Eve\n" + " Jackson\n" + " 94\n" + " \n" + " "; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void imgShouldBeWellHandled() { String html = "<img src=\"whitePoney.png\" alt=\"My wonderfull white poney picture\"/>"; String expectedPlainText = "[My wonderfull white poney picture]"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void imgShouldBeWellInsertedInText() { String html = "Text <img src=\"whitePoney.png\" alt=\"My wonderfull white poney picture\"/> text"; String expectedPlainText = "Text [My wonderfull white poney picture] text"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void imgShouldNotBeDisplayedOnEmptyAlt() { String html = "Text <img src=\"whitePoney.png\" alt=\"\"/> text"; String expectedPlainText = "Text text"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void imgShouldNotBeDisplayedOnWhiteSpaceAlt() { String html = "Text <img src=\"whitePoney.png\" alt=\" \"/> text"; String expectedPlainText = "Text text"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void imgShouldNotBeDisplayedOnTabSpaceAlt() { String html = "Text <img src=\"whitePoney.png\" alt=\"\t\"/> text"; String expectedPlainText = "Text text"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void imgShouldNotBeDisplayedOnLineBreakSpaceAlt() { String html = "Text <img src=\"whitePoney.png\" alt=\"\n\"/> text"; String expectedPlainText = "Text text"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void imgShouldNotBeDisplayedOnMissingAlt() { String html = "Text <img src=\"whitePoney.png\"/> text"; String expectedPlainText = "Text text"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void nestedListsShouldBeWellHandled() { String html = " <ul>" + " <li>Coffee</li>" + " <li>Tea" + " <ul>" + " <li>Black tea</li>" + " <li>Green tea</li>" + " </ul>" + " </li>" + " <li>Milk</li>" + "</ul>"; String expectedPlainText = " \n" + " - Coffee \n" + " - Tea \n" + " - Black tea \n" + " - Green tea \n" + " - Milk\n" + "\n"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void nonClosedHtmlShouldBeTranslated() { String html = "This is an <b>HTML text !"; String expectedPlainText = "This is an HTML text !"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void brokenHtmlShouldBeTranslatedUntilTheBrokenBalise() { String html = "This is an <b>HTML</b missing missing missing !"; String expectedPlainText = "This is an HTML"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void toPlainTextShouldWorkWithMoreComplexHTML() throws Exception { String html = IOUtils.toString(ClassLoader.getSystemResource("example.html"), StandardCharsets.UTF_8); String expectedPlainText = "\n" + " Why a new Logo?\n" + "\n" + "\n" + " We are happy with our current logo, but for the\n" + " upcoming James Server 3.0 release, we would like to\n" + " give our community the opportunity to create a new image for James.\n" + "\n" + "\n" + "\n" + "\n" + " Don't be shy, take your inkscape and gimp, and send us on\n" + " the James Server User mailing list\n" + " your creations. We will publish them on this page.\n" + "\n" + "\n" + "\n" + "\n" + " We need an horizontal logo (100p height) to be show displayed on the upper\n" + " left corner of this page, an avatar (48x48p) to be used on a Twitter stream for example.\n" + " The used fonts should be redistributable (or commonly available on Windows and Linux).\n" + " The chosen logo should be delivered in SVG format.\n" + " We also like the Apache feather.\n" + "\n" + "\n" + "\n"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } }