Java tutorial
/* * Copyright (C) 2008-2010 Surevine Limited. * * Although intended for deployment and use alongside Alfresco this module should * be considered 'Not a Contribution' as defined in Alfresco'sstandard contribution agreement, see * http://www.alfresco.org/resource/AlfrescoContributionAgreementv2.pdf * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ package com.surevine.alfresco.repo.impl; import org.alfresco.service.cmr.repository.ContentAccessor; import org.alfresco.service.cmr.repository.ContentReader; import org.alfresco.service.cmr.repository.NodeService; import org.alfresco.service.cmr.repository.NodeRef; import org.alfresco.service.namespace.QName; import org.alfresco.service.cmr.repository.ContentService; import com.surevine.alfresco.repo.HTMLIdentifier; import com.surevine.alfresco.repo.action.SanitiseHTMLAction; import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * An implementation of HTMLIdentifier that implements the identification algorithm described by INT-128 * This algorithm identifies a piece of content as HTML if any of the following are true * <ul> * <li> The Mime Type of the content item, as identified by Alfresco, contains the String "htm" </li> * <li> The name of the content item contains one of a configurable series of extensions, such as .html or .shtm</li> * <li> The first (configurable) X characters of the content item contain one of a configurable list of pattens, such as <html&rt; or <script&rt;</li> * </ul> * @author SimonW * */ public class FilenameAndContentHTMLIdentifier implements HTMLIdentifier { private static final QName CONTENT_QNAME = QName .createQName("{http://www.alfresco.org/model/content/1.0}content"); private static final QName NAME_QNAME = QName.createQName("http://www.alfresco.org/model/content/1.0}name"); private static final Log LOGGER = LogFactory.getLog(FilenameAndContentHTMLIdentifier.class); private NodeRef _target = null; private ContentService _contentService; private NodeService _nodeService; private String[] _htmlExtensions; static { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Loaded FilenameAndContentHTMLIdentifier"); } } /** * Constructor for factory and testing containers * @param contentService * @param nodeService * @param exts * @param contentFragments * @param maxChars */ public FilenameAndContentHTMLIdentifier(ContentService contentService, NodeService nodeService, String[] exts, String[] contentFragments, int maxChars) { _contentService = contentService; _nodeService = nodeService; _htmlExtensions = exts; _htmlContent = contentFragments; _maxCharsToSearch = maxChars; if (LOGGER.isInfoEnabled()) { LOGGER.info("Created a FilenameAndContentHTMLIdentifier using [" + _contentService + "|" + _nodeService + "|" + _htmlExtensions + "|" + _htmlContent + "|" + _maxCharsToSearch + "]"); } } /** * Default constructor intended for use by setter injection containers */ public FilenameAndContentHTMLIdentifier() { } /** * Utility method - convert input Strings to lower case to make later matching faster * @param in An Array of Strings * @return Equal to the input array of String, but everything is lower case */ protected static String[] StringArrayToLowerCase(String[] in) { if (in == null) { return null; } String[] out = new String[in.length]; for (int i = 0; i < in.length; i++) { if (in[i] != null) { out[i] = in[i].toLowerCase(); } else { out[i] = null; } } return out; } public void setHTMLExtensions(String[] htmlExtensions) { _htmlExtensions = StringArrayToLowerCase(htmlExtensions); } private String[] _htmlContent; public void setHTMLContent(String[] htmlContent) { _htmlContent = StringArrayToLowerCase(htmlContent); validateHTMLContent(); } private int _maxCharsToSearch; public void setMaxCharsToSearch(int maxCharsToSearch) { _maxCharsToSearch = maxCharsToSearch; } public void setNodeService(NodeService nodeService) { _nodeService = nodeService; } public NodeRef getTargetNodeRef() { return _target; } /** * Only lower-case alphanumerics, <, > and - to prevent accidentally escaping regexes elsewhere in the code. * A more sophisticated approach could be used to ensure that regexes included here are escaped correctly, * but this is not required for our use-case */ private void validateHTMLContent() { if (_htmlContent == null) { return; } for (int i = 0; i < _htmlContent.length; i++) { if (!_htmlContent[i].matches("[a-z0-9<>-]+")) { throw new IllegalArgumentException("HTML Content Array contains " + _htmlContent[i] + " at index " + i + " which is not a valid value"); } } } protected Pattern getHTMLContentRegularExpression() { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Getting HTMLContentRegex"); } StringBuffer sb = new StringBuffer(_htmlContent.length * 6); sb.append(".*("); for (int i = 0; i < _htmlContent.length; i++) { sb.append("(<\\s*?").append(_htmlContent[i]).append("[\\s>])"); if (i < _htmlContent.length - 1) { sb.append("|"); } } sb.append(").*"); String rVal = sb.toString(); if (LOGGER.isInfoEnabled()) { LOGGER.info("Created RE: " + rVal); } return Pattern.compile(rVal, Pattern.DOTALL); } public boolean isTargetHTML() throws IllegalStateException { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Is target HTML?"); } if (_target == null) { throw new IllegalStateException("isTargetHTML called but target not set"); } if (isHTMLByMimeType() || isHTMLByFileName() || isHTMLByContent()) { if (LOGGER.isInfoEnabled()) { LOGGER.info(_target + " is HTML"); } return true; } else if (LOGGER.isInfoEnabled()) { LOGGER.info(_target + " is not HTML"); } return false; } /** * Examine the first X chars of the file, and identify the file as HTML if is contains one of a series of Strings within * these chars * @return True if any of the Strings in _htmlContent are present within the first _maxCharsToSearch chars of the content * of the target node */ protected boolean isHTMLByContent() { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Entering isHTMLByContent"); } ContentReader cr = _contentService.getReader(_target, CONTENT_QNAME); if (cr == null) { if (LOGGER.isWarnEnabled()) { LOGGER.warn("Target " + _target + " has no content reader"); } return false; } String contentString = cr.getContentString(_maxCharsToSearch); //In theory, could have problems with massive content, but given our use-case I'm confident we'll be OK //no content == no HTML content if (contentString == null || _htmlContent == null) { if (LOGGER.isWarnEnabled()) { LOGGER.warn("Target " + _target + " has no content"); } return false; } contentString = contentString.toLowerCase(); Pattern regex = getHTMLContentRegularExpression(); if (regex.matcher(contentString).matches()) { if (LOGGER.isInfoEnabled()) { LOGGER.info(_target + " has HTML content"); } return true; } LOGGER.info(_target + " has no HTML content"); return false; } /** * Does the mimetype of the content item (as recorded by Alfresco) indicate * that the content item is HTML? */ protected boolean isHTMLByMimeType() { ContentAccessor ca = _contentService.getReader(_target, CONTENT_QNAME); if (ca == null) //If we don't have any content, it can't be HTML { return false; } String mimeString = ca.getMimetype(); if (LOGGER.isInfoEnabled()) { LOGGER.info("Mime Type of " + _target + " is " + mimeString); } if (mimeString != null && mimeString.toLowerCase().contains("htm")) { if (LOGGER.isInfoEnabled()) { LOGGER.info("Mime Type indicates that " + _target + " is HTML"); } return true; } if (LOGGER.isDebugEnabled()) { LOGGER.debug("Mime type indicates that " + _target + " is not HTML"); } return false; } /** * Retrieve the name of the given node, or null if the node either doesn't have the name or is itself null * @param node * @return Name of the node */ protected String getNodeName(NodeRef node) { Object rVal = _nodeService.getProperty(_target, NAME_QNAME); if (rVal == null) { return null; } if (LOGGER.isDebugEnabled()) { LOGGER.debug("Name of " + node + " is " + rVal); } return rVal.toString(); } protected boolean isHTMLByFileName() { String name = getNodeName(_target); if (name == null || _htmlExtensions == null) { return false; //If we've got no filename, we can't be HTML } // Case insensitive comparison - if the filename contains .htm, .html etc then it's html // Note we use contains, not ends-with logic, so a file called test.html.doc.tmp will be // identified as HTML, as will test.htmlfish, but we add a . to the front so myhtml.doc will // not be reported as HTML name = name.toLowerCase(); if (LOGGER.isInfoEnabled()) { LOGGER.info("Name of " + _target + " is " + name); } for (int i = 0; i < _htmlExtensions.length; i++) { if (LOGGER.isInfoEnabled()) { LOGGER.info("Looking for " + _htmlExtensions[i] + " in " + name); } if (name.indexOf("." + _htmlExtensions[i]) != -1) { if (LOGGER.isInfoEnabled()) { LOGGER.info("File name indicates that " + _target + " is HTML"); } return true; } } if (LOGGER.isDebugEnabled()) { LOGGER.debug("File name indicates that " + _target + " is not HTML"); } return false; } public void setTargetNodeRef(NodeRef target) { _target = target; } public void setContentService(ContentService contentService) { _contentService = contentService; } }