 * ***** BEGIN LICENSE BLOCK *****
 * Zimbra Collaboration Suite Server
 * Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2016 Synacor, Inc.
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software Foundation,
 * version 2 of the License.
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with this program.
 * If not, see <https://www.gnu.org/licenses/>.
 * ***** END LICENSE BLOCK *****
 * Created on Oct 23, 2005

package com.zimbra.cs.service;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import javax.mail.BodyPart;
import javax.mail.MessagingException;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeBodyPart;
import javax.mail.internet.MimeMessage;
import javax.mail.internet.MimeMultipart;
import javax.mail.internet.MimePart;
import javax.mail.internet.ParseException;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpURL;
import org.apache.commons.httpclient.HttpsURL;
import org.apache.commons.httpclient.UsernamePasswordCredentials;
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.httpclient.util.DateParseException;

import com.zimbra.common.calendar.ZCalendar.ZCalendarBuilder;
import com.zimbra.common.calendar.ZCalendar.ZVCalendar;
import com.zimbra.common.httpclient.HttpClientUtil;
import com.zimbra.common.mime.ContentType;
import com.zimbra.common.mime.MimeConstants;
import com.zimbra.common.mime.shim.JavaMailInternetAddress;
import com.zimbra.common.service.ServiceException;
import com.zimbra.common.soap.Element;
import com.zimbra.common.util.DateUtil;
import com.zimbra.common.util.FileUtil;
import com.zimbra.common.util.ZimbraHttpConnectionManager;
import com.zimbra.common.util.ZimbraLog;
import com.zimbra.common.zmime.ZMimeBodyPart;
import com.zimbra.common.zmime.ZMimeMultipart;
import com.zimbra.cs.account.Account;
import com.zimbra.cs.httpclient.HttpProxyUtil;
import com.zimbra.cs.ldap.LdapUtil;
import com.zimbra.cs.mailbox.Folder;
import com.zimbra.cs.mailbox.calendar.Invite;
import com.zimbra.cs.mime.Mime;
import com.zimbra.cs.mime.ParsedMessage;
import com.zimbra.cs.util.BuildInfo;
import com.zimbra.cs.util.JMSession;
import com.zimbra.cs.util.Zimbra;

public class FeedManager {

    public static final class SubscriptionData<T> {
        private final List<T> items;
        private String lastGuid;
        private long lastDate;
        private boolean notModified;

        static SubscriptionData<Object> NOT_MODIFIED() {
            return new SubscriptionData<Object>(new ArrayList<Object>(0), 0, true);

        SubscriptionData() {
            this(new ArrayList<T>(), 0);

        SubscriptionData(List<T> items, long ldate) {
            this(items, ldate, false);

        SubscriptionData(List<T> items, long lastModifiedDate, boolean notModified) {
            this.items = items;
            this.lastDate = lastModifiedDate;
            this.notModified = notModified;

        void recordItem(T item, String guid, long date) {
            if (date > lastDate) {
                lastGuid = guid;
                lastDate = date;

        void recordFeedModifiedDate(long feedModified) {
            if (feedModified > lastDate) {
                lastDate = feedModified;

        public List<T> getItems() {
            return items;

        // returns the guid of the most recently modified item
        public String getMostRecentGuid() {
            return lastGuid;

        // returns the timestamp of the most recently modified item, or the last modified time of the feed itself,
        // whichever is more recent
        public long getLastModifiedDate() {
            return lastDate;

        // returns true if the feed has no change since the last sync (HTTP 304 Not Modified response)
        public boolean isNotModified() {
            return notModified;

    private static String getBrowserTag() {
        String tag = " Zimbra/" + BuildInfo.MAJORVERSION + "." + BuildInfo.MINORVERSION + "."
                + BuildInfo.MICROVERSION;
        return tag.indexOf("unknown") == -1 ? tag : " Zimbra/8.0";

    /** Upper bound on the redirect counter used when fetching remote data. */
    public static final int MAX_REDIRECTS = 3;

    /** Product/version tag, computed once by {@code getBrowserTag()} when the class is initialized. */
    public static final String BROWSER_TAG = getBrowserTag();
    /** Value sent as the {@code User-Agent} request header; ends with {@link #BROWSER_TAG}. */
    // NOTE(review): the literal stops at "rv:" and leaves "(" unbalanced, so the browser/version part
    // of this string looks truncated -- confirm the intended value.
    public static final String HTTP_USER_AGENT = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:"
            + BROWSER_TAG;
    /** Value sent as the {@code Accept} request header. */
    // NOTE(review): the two bare "application/" entries look like media types that lost their subtype -- confirm.
    public static final String HTTP_ACCEPT = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, "
            + "application/, application/, application/msword, */*";

    public static class RemoteDataInfo {
        public final int statusCode;
        public final int redirects;
        public BufferedInputStream content;
        public final String expectedCharset;
        public final long lastModified;
        private GetMethod getMethod = null;

        public RemoteDataInfo(int statusCode, int redirects, BufferedInputStream content, String expectedCharset,
                long lastModified) {
            this.statusCode = statusCode;
            this.redirects = redirects;
            this.content = content;
            this.expectedCharset = expectedCharset;
            this.lastModified = lastModified;

        public GetMethod getGetMethod() {
            return getMethod;

        public void setGetMethod(GetMethod getMethod) {
            this.getMethod = getMethod;

        public void cleanup() {
            content = null;
            if (getMethod != null) {
                getMethod = null;

    /**
     * Issues an HTTP GET for a subscription URL and packages the response as a {@link RemoteDataInfo}.
     * <p>
     * Each pass of the loop rewrites {@code webcal:} and {@code feed:} URLs to {@code http:}, rejects any
     * other non-http(s) scheme, picks up credentials embedded in the URL, and sends the request with the
     * {@code User-Agent}, {@code Accept} and (when a previous sync date is known) {@code If-Modified-Since}
     * headers. A response carrying a {@code location} header causes another pass with the new URL; the loop
     * ends once the redirect counter exceeds {@link #MAX_REDIRECTS}.
     *
     * @param url  address of the remote data; must not be null or empty (checked by assertion only)
     * @param fsd  sync state of the target folder, or {@code null}; supplies the last sync date
     * @return the fetch outcome; for a 304 response the content is {@code null}
     * @throws ServiceException INVALID_REQUEST for an unsupported scheme or unusable URL,
     *         RESOURCE_UNREACHABLE for any status other than 200 or 304
     * @throws HttpException rethrown from the HTTP client
     * @throws IOException rethrown from the HTTP client or the response stream
     */
    private static RemoteDataInfo retrieveRemoteData(String url, Folder.SyncData fsd)
            throws ServiceException, HttpException, IOException {
        assert !Strings.isNullOrEmpty(url);

        HttpClient client = ZimbraHttpConnectionManager.getExternalHttpConnMgr().newHttpClient();

        // cannot set connection timeout because it'll affect all HttpClients associated with the conn mgr.
        // see comments in ZimbraHttpConnectionManager
        // client.setConnectionTimeout(10000);

        // NOTE(review): params is populated here but no visible line attaches it to the GetMethod -- confirm
        HttpMethodParams params = new HttpMethodParams();
        params.setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, MimeConstants.P_CHARSET_UTF8);

        GetMethod get = null;
        BufferedInputStream content = null;
        long lastModified = 0;
        String expectedCharset = MimeConstants.P_CHARSET_UTF8;
        int redirects = 0;
        // reported if the loop ends without ever reaching a non-redirect response
        int statusCode = HttpServletResponse.SC_NOT_FOUND;
        try {
            do {
                // scheme checks are case-insensitive; the URL itself keeps its original case
                String lcurl = url.toLowerCase();
                if (lcurl.startsWith("webcal:")) {
                    url = "http:" + url.substring(7);
                } else if (lcurl.startsWith("feed:")) {
                    url = "http:" + url.substring(5);
                } else if (!lcurl.startsWith("http:") && !lcurl.startsWith("https:")) {
                    throw ServiceException.INVALID_REQUEST("url must begin with http: or https:", null);

                // username and password are encoded in the URL as http://user:pass@host/...
                if (url.indexOf('@') != -1) {
                    HttpURL httpurl = lcurl.startsWith("https:") ? new HttpsURL(url) : new HttpURL(url);
                    if (httpurl.getUser() != null) {
                        String user = httpurl.getUser();
                        // only decode when the user name actually contains an escape sequence
                        if (user.indexOf('%') != -1) {
                            try {
                                user = URLDecoder.decode(user, "UTF-8");
                            } catch (OutOfMemoryError e) {
                                Zimbra.halt("out of memory", e);
                            } catch (Throwable t) {
                        // NOTE(review): the constructor call below is cut off after "user," -- the second
                        // argument (presumably the password from the URL) is not visible; confirm.
                        UsernamePasswordCredentials creds = new UsernamePasswordCredentials(user,
                        client.getState().setCredentials(AuthScope.ANY, creds);

                try {
                    get = new GetMethod(url);
                } catch (OutOfMemoryError e) {
                    Zimbra.halt("out of memory", e);
                    return null;
                } catch (Throwable t) {
                    throw ServiceException.INVALID_REQUEST("invalid url for feed: " + url, t);
                get.addRequestHeader("User-Agent", HTTP_USER_AGENT);
                get.addRequestHeader("Accept", HTTP_ACCEPT);
                // conditional GET: lets the server answer 304 when nothing changed since the last sync
                if (fsd != null && fsd.getLastSyncDate() > 0) {
                    String lastSyncAt = org.apache.commons.httpclient.util.DateUtil
                            .formatDate(new Date(fsd.getLastSyncDate()));
                    get.addRequestHeader("If-Modified-Since", lastSyncAt);
                HttpClientUtil.executeMethod(client, get);

                Header locationHeader = get.getResponseHeader("location");
                if (locationHeader != null) {
                    // update our target URL and loop again to do another HTTP GET
                    url = locationHeader.getValue();
                } else {
                    statusCode = get.getStatusCode();
                    if (statusCode == HttpServletResponse.SC_OK) {
                        Header contentEncoding = get.getResponseHeader("Content-Encoding");
                        InputStream respInputStream = get.getResponseBodyAsStream();
                        // transparently decompress gzip-encoded bodies
                        if (contentEncoding != null) {
                            if (contentEncoding.getValue().indexOf("gzip") != -1) {
                                respInputStream = new GZIPInputStream(respInputStream);
                        content = new BufferedInputStream(respInputStream);
                        expectedCharset = get.getResponseCharSet();

                        // prefer Last-Modified, fall back to Date, and finally to the current time
                        Header lastModHdr = get.getResponseHeader("Last-Modified");
                        if (lastModHdr == null) {
                            lastModHdr = get.getResponseHeader("Date");
                        if (lastModHdr != null) {
                            try {
                                // NOTE(review): the next statement is cut off before the method call that
                                // produces the Date (presumably a parse of lastModHdr's value) -- confirm.
                                Date d = org.apache.commons.httpclient.util.DateUtil
                                lastModified = d.getTime();
                            } catch (DateParseException e) {
                                // NOTE(review): the start of this statement (apparently a log call) is missing
                                        "unable to parse Last-Modified/Date header: " + lastModHdr.getValue(), e);
                                lastModified = System.currentTimeMillis();
                        } else {
                            lastModified = System.currentTimeMillis();
                    } else if (statusCode == HttpServletResponse.SC_NOT_MODIFIED) {
                        ZimbraLog.misc.debug("Remote data at " + url + " not modified since last sync");
                        return new RemoteDataInfo(statusCode, redirects, null, expectedCharset, lastModified);
                    } else {
                        throw ServiceException.RESOURCE_UNREACHABLE(get.getStatusLine().toString(), null);
            } while (++redirects <= MAX_REDIRECTS);
        // NOTE(review): each handler below tests "get != null" but the guarded statement is not visible
        // (presumably releasing the connection before rethrowing) -- confirm.
        } catch (ServiceException ex) {
            if (get != null) {
            throw ex;
        } catch (HttpException ex) {
            if (get != null) {
            throw ex;
        } catch (IOException ex) {
            if (get != null) {
            throw ex;
        RemoteDataInfo rdi = new RemoteDataInfo(statusCode, redirects, content, expectedCharset, lastModified);
        return rdi;

    /**
     * Parses already-fetched remote content into subscription data, choosing the parser from the
     * first significant character of the body as reported by {@code getLeadingChar}:
     * {@code '<'} is handed to {@code parseRssFeed} as XML, {@code 'B'}/{@code 'b'} is parsed as
     * iCalendar data and converted to invites, and an empty body is a parse error.
     *
     * @param acct  account passed to {@code Invite.createFromCalendar}
     * @param rdi   fetched content, expected charset and last-modified time
     * @param fsd   sync state forwarded to the feed parser
     * @return the parsed items ({@code ParsedMessage}s for feeds, {@code Invite}s for calendars)
     * @throws ServiceException PARSE_ERROR for empty or unrecognized content
     * @throws IOException if reading the content stream fails
     */
    protected static SubscriptionData<?> retrieveRemoteDatasource(Account acct, RemoteDataInfo rdi,
            Folder.SyncData fsd) throws ServiceException, IOException {
        // getLeadingChar receives the builder and may change the charset it holds
        StringBuilder charset = new StringBuilder(rdi.expectedCharset);
        switch (getLeadingChar(rdi.content, charset)) {
        case -1:
            throw ServiceException.PARSE_ERROR("empty body in response when fetching remote subscription", null);
        case '<':
            return parseRssFeed(Element.parseXML(rdi.content), fsd, rdi.lastModified);
        case 'B':
        case 'b':
            List<ZVCalendar> icals = ZCalendarBuilder.buildMulti(rdi.content, charset.toString());
            List<Invite> invites = Invite.createFromCalendar(acct, null, icals, true, true, null);
            // handle missing UIDs on remote calendars by generating them as needed
            for (Invite inv : invites) {
                // NOTE(review): the body of this check (the statement that assigns a generated UID)
                // is not visible -- confirm.
                if (inv.getUid() == null) {
            return new SubscriptionData<Invite>(invites, rdi.lastModified);
            // NOTE(review): no "default:" label is visible ahead of this throw -- confirm.
            throw ServiceException.PARSE_ERROR("unrecognized remote content", null);

    /**
     * Fetches the data behind a subscription URL and parses it.
     * <p>
     * Returns {@code SubscriptionData.NOT_MODIFIED()} when the server answered 304, fails with
     * TOO_MANY_PROXIES when the redirect counter exceeded {@link #MAX_REDIRECTS}, and otherwise
     * delegates to the {@code RemoteDataInfo}-based overload. HTTP and I/O failures are converted
     * to RESOURCE_UNREACHABLE service exceptions.
     *
     * @param acct  account the parsed items are created for
     * @param url   address of the remote data; must not be null or empty (checked by assertion only)
     * @param fsd   sync state of the target folder, or {@code null}
     * @throws ServiceException on fetch or parse failure
     */
    public static SubscriptionData<?> retrieveRemoteDatasource(Account acct, String url, Folder.SyncData fsd)
            throws ServiceException {
        assert !Strings.isNullOrEmpty(url);
        RemoteDataInfo rdi = null;
        try {
            rdi = retrieveRemoteData(url, fsd);

            if (rdi.statusCode == HttpServletResponse.SC_NOT_MODIFIED) {
                return SubscriptionData.NOT_MODIFIED();
            if (rdi.redirects > MAX_REDIRECTS) {
                throw ServiceException.TOO_MANY_PROXIES(url);
            return retrieveRemoteDatasource(acct, rdi, fsd);
        } catch (HttpException e) {
            throw ServiceException.RESOURCE_UNREACHABLE("HttpException: " + e, e);
        } catch (IOException e) {
            throw ServiceException.RESOURCE_UNREACHABLE("IOException: " + e, e);
        } finally {
            // NOTE(review): the guarded statement is not visible (presumably the cleanup of rdi) -- confirm.
            if (rdi != null) {

    private static int getLeadingChar(BufferedInputStream is, StringBuilder charset) throws IOException {
        // check for any BOMs that would override the specified charset
        int ch =;
        switch (ch) {
        case 0xEF:
            if ( == 0xBB && == 0xBF) {
                ch =;
        case 0xFE:
            if ( == 0xFF && == 0x00) {
                ch =;
        case 0xFF:
            if ( == 0xFE) {
                ch =;
        // skip up to 120 bytes of leading whitespace
        for (int index = 0; index < 120 && (ch == '\0' || Character.isWhitespace(ch)); index++)
            ch =;
        // reset to the original state and return the first non-whtespace character
        return ch;

    //    private static org.dom4j.QName QN_CONTENT_ENCODED = org.dom4j.QName.get("encoded", "content", "");

    private static final class Enclosure {
        private final String url, title, cthdr;

        Enclosure(String url, String title, String ctype) {
            this.url = url;
            this.title = title;
            this.cthdr = ctype;

        String getLocation() {
            return url;

        String getDescription() {
            return title;

        String getContentType() {
            ContentType ctype = new ContentType(cthdr == null ? "text/plain" : cthdr).cleanup();
            try {
                ctype.setParameter("name", FileUtil.trimFilename(URLDecoder.decode(url, "utf-8")));
            } catch (UnsupportedEncodingException e) {
                ctype.setParameter("name", FileUtil.trimFilename(url));
            return ctype.toString();

    /**
     * Converts a parsed RSS/RDF document into one {@link ParsedMessage} per item.
     * <p>
     * A root element named {@code feed} is handed to {@code parseAtomFeed}. Otherwise the
     * {@code channel} element supplies defaults (link, title, lastBuildDate) and the items are read
     * from the channel for an {@code rss} root or from the root itself for an {@code RDF} root.
     *
     * @param root          root element of the feed document
     * @param fsd           sync state used to skip items already seen, or {@code null}
     * @param lastModified  forwarded to {@code parseAtomFeed}; not otherwise used in the visible code
     * @return the generated messages together with the most recent guid/date
     * @throws ServiceException PARSE_ERROR for an unknown root element, FAILURE if the channel
     *         name cannot be encoded
     */
    private static SubscriptionData<ParsedMessage> parseRssFeed(Element root, Folder.SyncData fsd,
            long lastModified) throws ServiceException {
        try {
            String rname = root.getName();
            if (rname.equals("feed")) {
                return parseAtomFeed(root, fsd, lastModified);

            // channel-level values double as per-item defaults
            Element channel = root.getElement("channel");
            String hrefChannel = channel.getAttribute("link");
            String subjChannel = channel.getAttribute("title");
            InternetAddress addrChannel = new JavaMailInternetAddress("", subjChannel, "utf-8");
            Date dateChannel = DateUtil.parseRFC2822Date(channel.getAttribute("lastBuildDate", null), new Date());

            List<Enclosure> enclosures = new ArrayList<Enclosure>(3);
            SubscriptionData<ParsedMessage> sdata = new SubscriptionData<ParsedMessage>();

            if (rname.equals("rss")) {
                root = channel;
            } else if (!rname.equals("RDF")) {
                // NOTE(review): the argument list of this call is cut off -- confirm.
                throw ServiceException.PARSE_ERROR("unknown top-level rss element name: " + root.getQualifiedName(),

            for (Element item : root.listElements("item")) {
                // get the item's date
                Date date = DateUtil.parseRFC2822Date(item.getAttribute("pubDate", null), null);
                if (date == null) {
                    date = DateUtil.parseISO8601Date(item.getAttribute("date", null), dateChannel);

                // construct an address for the author
                InternetAddress addr = addrChannel;
                try {
                    addr = parseAuthor(item.getAttribute("author"));
                } catch (Exception e) {
                    // no usable author value: fall back to the "creator" value, then the channel address
                    addr = parseDublinCreator(stripXML(item.getAttribute("creator", null)), addr);

                // get the item's title and link, defaulting to the channel attributes
                String title = stripXML(item.getAttribute("title", subjChannel));
                String href = item.getAttribute("link", hrefChannel);
                String guid = item.getAttribute("guid", href);

                // make sure we haven't already seen this item
                // NOTE(review): "==" compares references here; presumably it detects that the channel
                // default was returned unchanged -- confirm. The body of this check is not visible.
                if (fsd != null
                        && fsd.alreadySeen(guid == hrefChannel ? null : guid, date == dateChannel ? null : date))

                // handle the enclosure (associated media link), if any
                Element enc = item.getOptionalElement("enclosure");
                if (enc != null) {
                    // NOTE(review): the start of this statement (what receives the new Enclosure) is missing
                            new Enclosure(enc.getAttribute("url", null), null, enc.getAttribute("type", null)));

                // get the feed item's content and guess at its type
                String text = item.getAttribute("encoded", null);
                // content taken from "encoded" is treated as HTML outright
                boolean html = text != null;
                if (text == null) {
                    text = item.getAttribute("description", null);
                if (text == null) {
                    text = item.getAttribute("abstract", null);
                // NOTE(review): reference comparison against the channel title -- see the note above
                if (text == null && title != subjChannel) {
                    text = "";
                // NOTE(review): the statement guarded by this check is not visible -- confirm.
                if (text == null)
                html |= text.indexOf("</") != -1 || text.indexOf("/>") != -1 || text.indexOf("<p>") != -1;

                ParsedMessage pm = generateMessage(title, text, href, html, addr, date, enclosures);
                sdata.recordItem(pm, guid, date.getTime());
            return sdata;
        } catch (UnsupportedEncodingException e) {
            throw ServiceException.FAILURE("error encoding rss channel name", e);

    /**
     * Converts a parsed Atom document into one {@link ParsedMessage} per {@code entry}.
     * <p>
     * The {@code feed} element supplies the default author (or, failing that, an address built from
     * the feed title) and the default date. For each entry the date, author, title, alternate link,
     * enclosure links and content/summary are extracted and passed to {@code generateMessage}.
     *
     * @param feed          the {@code feed} root element
     * @param fsd           sync state used to skip entries already seen, or {@code null}
     * @param lastModified  not used in the visible code
     * @return the generated messages together with the most recent guid/date
     * @throws ServiceException PARSE_ERROR for an unsupported content type, FAILURE if the feed
     *         name cannot be encoded
     */
    private static SubscriptionData<ParsedMessage> parseAtomFeed(Element feed, Folder.SyncData fsd,
            long lastModified) throws ServiceException {
        try {
            // get defaults from the <feed> element
            InternetAddress addrFeed = parseAtomAuthor(feed.getOptionalElement("author"), null);
            if (addrFeed == null) {
                addrFeed = new JavaMailInternetAddress("", stripXML(feed.getAttribute("title")), "utf-8");
            Date dateFeed = DateUtil.parseISO8601Date(feed.getAttribute("updated", null), new Date());
            List<Enclosure> enclosures = new ArrayList<Enclosure>();
            SubscriptionData<ParsedMessage> sdata = new SubscriptionData<ParsedMessage>();

            for (Element item : feed.listElements("entry")) {
                // get the item's date
                Date date = DateUtil.parseISO8601Date(item.getAttribute("updated", null), null);
                if (date == null) {
                    date = DateUtil.parseISO8601Date(item.getAttribute("modified", null), dateFeed);

                // construct an address for the author
                InternetAddress addr = parseAtomAuthor(item.getOptionalElement("author"), addrFeed);

                // get the item's title (may be html or xhtml)
                Element tblock = item.getElement("title");
                String type = tblock.getAttribute("type", "text").trim().toLowerCase();
                String title = tblock.getText();
                if (type.equals("html") || type.equals("xhtml") || type.equals("text/html")
                        || type.equals("application/xhtml+xml")) {
                    title = stripXML(title);

                // find the item's link and any enclosures (associated media links)
                String href = "";
                for (Element link : item.listElements("link")) {
                    // a link without a "rel" attribute is treated as the alternate link
                    String relation = link.getAttribute("rel", "alternate");
                    if (relation.equals("alternate")) {
                        href = link.getAttribute("href");
                    } else if (relation.equals("enclosure")) {
                        enclosures.add(new Enclosure(link.getAttribute("href", null),
                                link.getAttribute("title", null), link.getAttribute("type", null)));
                String guid = item.getAttribute("id", href);

                // make sure we haven't already seen this item
                // NOTE(review): the statement guarded by this check is not visible -- confirm.
                if (fsd != null && fsd.alreadySeen(guid == null || guid.equals("") ? null : guid,
                        date == dateFeed ? null : date))

                // get the content/summary and markup
                Element content = item.getOptionalElement("content");
                if (content == null) {
                    content = item.getOptionalElement("summary");
                // NOTE(review): the statement guarded by this check is not visible -- confirm.
                if (content == null)

                type = content.getAttribute("type", "text").trim().toLowerCase();
                boolean html = false;
                if (type.equals("html") || type.equals("xhtml") || type.equals("text/html")
                        || type.equals("application/xhtml+xml")) {
                    html = true;
                } else if (!type.equals("text") && !type.equals("text/plain")) {
                    throw ServiceException.PARSE_ERROR("unsupported atom entry content type: " + type, null);

                String text = content.getText();
                // no direct text: look for a nested <div> holding the content
                if (Strings.isNullOrEmpty(text)) {
                    Element div = content.getElement("div");
                    if (div != null) {
                        // NOTE(review): the three lines below read like the inside of a block comment
                        // whose opening and closing markers are missing -- confirm.
                         * Assume it is this variant:
                         *   atomInlineXHTMLContent = element atom:content { atomCommonAttributes,
                         *           attribute type { "xhtml" }, xhtmlDiv }
                        text = div.getText();
                ParsedMessage pm = generateMessage(title, text, href, html, addr, date, enclosures);
                sdata.recordItem(pm, guid, date.getTime());
            return sdata;
        } catch (UnsupportedEncodingException e) {
            throw ServiceException.FAILURE("error encoding atom feed name", e);

    /** Opening boilerplate of an HTML document (doctype, head declaring utf-8, opening body tag). */
    private static final String HTML_HEADER = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">\n"
            + "<HTML><HEAD><META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=utf-8\"></HEAD><BODY>";
    /** Closing counterpart of {@link #HTML_HEADER}. */
    private static final String HTML_FOOTER = "</BODY></HTML>";

    /**
     * Builds the message for one feed item from its text and link: picks {@code text/html} or
     * {@code text/plain} (both utf-8) from the {@code html} flag, assembles the body content and
     * delegates to the content-type based overload.
     *
     * @param title   subject of the message
     * @param text    item content
     * @param href    link associated with the item
     * @param html    whether the content is to be treated as HTML
     * @param addr    sender address
     * @param date    item date
     * @param attach  enclosures to attach, may be {@code null}
     * @throws ServiceException if the message cannot be built
     */
    private static ParsedMessage generateMessage(String title, String text, String href, boolean html,
            InternetAddress addr, Date date, List<Enclosure> attach) throws ServiceException {
        String ctype = html ? "text/html; charset=\"utf-8\"" : "text/plain; charset=\"utf-8\"";
        StringBuilder content = new StringBuilder();
        // NOTE(review): both branch bodies are missing in this view, so nothing visible ever fills
        // "content" from text/href -- confirm what each branch is meant to append.
        if (html) {
        } else {
        return generateMessage(title, content.toString(), ctype, addr, date, attach);

    /**
     * Wraps feed item content in a MIME message and returns it as a {@link ParsedMessage}.
     * <p>
     * Without enclosures the content becomes the message body directly; with enclosures it becomes
     * the first part of a {@code multipart/mixed} and each enclosure gets its own part carrying
     * Content-Location, Content-Type, optional Content-Description and an attachment disposition.
     *
     * @param title    subject; runs of whitespace are collapsed to a single space
     * @param content  body text
     * @param ctype    value for the body's Content-Type header
     * @param addr     sender address
     * @param date     timestamp passed to the {@code ParsedMessage} constructor
     * @param attach   enclosures, may be {@code null}
     * @throws ServiceException PARSE_ERROR if building the MIME message fails
     */
    private static ParsedMessage generateMessage(String title, String content, String ctype, InternetAddress addr,
            Date date, List<Enclosure> attach) throws ServiceException {
        // cull out invalid enclosures
        if (attach != null) {
            for (Iterator<Enclosure> it = attach.iterator(); it.hasNext();) {
                // NOTE(review): the left-hand side of this comparison and the guarded statement are
                // missing (presumably a null-location check followed by removal) -- confirm.
                if ( == null) {
        boolean hasAttachments = attach != null && !attach.isEmpty();

        // clean up whitespace in the title
        if (title != null) {
            title = title.replaceAll("\\s+", " ");

        // create the MIME message and wrap it
        try {
            MimeMessage mm = new Mime.FixedMimeMessage(JMSession.getSession());
            // with attachments the text lives in its own body part, otherwise in the message itself
            MimePart body = hasAttachments ? new ZMimeBodyPart() : (MimePart) mm;
            body.setText(content, "utf-8");
            body.setHeader("Content-Type", ctype);

            if (hasAttachments) {
                // encode each enclosure as an attachment with Content-Location set
                MimeMultipart mmp = new ZMimeMultipart("mixed");
                mmp.addBodyPart((BodyPart) body);
                for (Enclosure enc : attach) {
                    MimeBodyPart part = new ZMimeBodyPart();
                    part.addHeader("Content-Location", enc.getLocation());
                    part.addHeader("Content-Type", enc.getContentType());
                    if (enc.getDescription() != null) {
                        part.addHeader("Content-Description", enc.getDescription());
                    part.addHeader("Content-Disposition", "attachment");
                // NOTE(review): no visible line adds "part" to "mmp" or installs "mmp" as the content
                // of "mm" -- confirm.

            mm.addFrom(new InternetAddress[] { addr });
            mm.setSubject(title, "utf-8");
            // more stuff here!
            return new ParsedMessage(mm, date.getTime(), false);
        } catch (MessagingException e) {
            throw ServiceException.PARSE_ERROR("error wrapping feed item in MimeMessage", e);

    private static InternetAddress parseDublinCreator(String creator, InternetAddress addrChannel) {
        if (creator == null || creator.equals("")) {
            return addrChannel;

        // check for a mailto: link
        String lc = creator.trim().toLowerCase(), address = "", personal = creator;
        int mailto = lc.indexOf("mailto:");
        if (mailto == 0 && lc.length() <= 7) {
            return addrChannel;
        } else if (mailto == 0) {
            personal = null;
            address = creator = creator.substring(7);
        } else if (mailto != -1) {
            // checking for "...[mailto:...]..." or "...(mailto:...)..."
            char delimit = creator.charAt(mailto - 1), complement = 0;
            if (delimit == '[') {
                complement = ']';
            } else if (delimit == '(') {
                complement = ')';
            int closing = creator.indexOf(complement, mailto + 7);
            if (closing != -1 && closing != mailto + 7) {
                address = creator.substring(mailto + 7, closing);
                personal = (creator.substring(0, mailto - 1) + creator.substring(closing + 1)).trim();

        try {
            return new JavaMailInternetAddress(address, personal, "utf-8");
        } catch (UnsupportedEncodingException e) {
        try {
            return new JavaMailInternetAddress("", creator, "utf-8");
        } catch (UnsupportedEncodingException e) {
        return addrChannel;

    private static InternetAddress parseAuthor(String author) throws IOException, ParseException {
        if (author != null && author.indexOf('@') == -1) {
            return new JavaMailInternetAddress("", stripXML(author), "utf-8");
        } else {
            return new JavaMailInternetAddress(author);

    private static InternetAddress parseAtomAuthor(Element author, InternetAddress addrChannel) {
        if (author == null) {
            return addrChannel;

        String address = stripXML(author.getAttribute("email", ""));
        String personal = stripXML(author.getAttribute("name", ""));
        if (personal.equals("") && address.equals("")) {
            return addrChannel;

        try {
            return new JavaMailInternetAddress(address, personal, "utf-8");
        } catch (UnsupportedEncodingException e) {
        try {
            return new JavaMailInternetAddress("", address + personal, "utf-8");
        } catch (UnsupportedEncodingException e) {
        return addrChannel;

    private static class UnescapedContent extends org.xml.sax.helpers.DefaultHandler {
        private final StringBuilder str = new StringBuilder();

        UnescapedContent() {

        public void startDocument() {

        public void characters(char[] ch, int offset, int length) {
            str.append(ch, offset, length);

        public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) {
            if (str.length() > 0) {
                String name = localName.toUpperCase();
                if (name.equals("P") || name.equals("BR") || name.equals("HR")) {
                    if (!Character.isWhitespace(str.charAt(str.length() - 1))) {
                        str.append(" ");

        public String toString() {
            return str.toString();

    static final String stripXML(String title) {
        if (title == null) {
            return "";
        } else if (title.indexOf('<') == -1 && title.indexOf('&') == -1) {
            return title;

        org.xml.sax.XMLReader parser = new org.cyberneko.html.parsers.SAXParser();
        org.xml.sax.ContentHandler handler = new UnescapedContent();
        try {
            parser.parse(new org.xml.sax.InputSource(new StringReader(title)));
            return handler.toString();
        } catch (Exception e) {
            return title;