Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.axiom.om.util; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * This is an XMLStreamWriterFilter that removes illegal characters. * * Valid and invalid character ranges are defined by: * http://www.w3.org/TR/2008/REC-xml-20081126/#NT-Char * * */ public class XMLStreamWriterRemoveIllegalChars extends XMLStreamWriterFilterBase { private static Log log = LogFactory.getLog(XMLStreamWriterRemoveIllegalChars.class); public XMLStreamWriterRemoveIllegalChars() { super(); if (log.isDebugEnabled()) { log.debug("Creating XMLStreamWriterRemoveIllegalChars object " + this); } } // Characters less than 0x20 may be control characters and should be removed // Note the non-initialized bytes in this array are zero private static byte[] REMOVE = new byte[32]; static { REMOVE[0x00] = 1; REMOVE[0x01] = 1; REMOVE[0x02] = 1; REMOVE[0x03] = 1; REMOVE[0x04] = 1; REMOVE[0x05] = 1; REMOVE[0x06] = 1; REMOVE[0x07] = 1; REMOVE[0x08] = 1; // 0x09 is TAB...which is allowed // 0x0A is LINEFEED...which is allowed REMOVE[0x0B] = 1; REMOVE[0x0C] = 1; // 0x0D is CARRIAGE RETURN, which is allowed REMOVE[0x0E] = 1; REMOVE[0x0F] = 1; REMOVE[0x10] = 1; REMOVE[0x11] = 1; REMOVE[0x12] = 1; REMOVE[0x13] = 1; REMOVE[0x14] = 1; REMOVE[0x15] = 1; REMOVE[0x16] = 1; REMOVE[0x17] = 1; REMOVE[0x18] = 1; REMOVE[0x19] = 1; REMOVE[0x1A] = 1; REMOVE[0x1B] = 1; REMOVE[0x1C] = 1; REMOVE[0x1D] = 1; REMOVE[0x1E] = 1; REMOVE[0x1F] = 1; } // These two characters are not allowed private final int FFFE = 0xFFFE; private final char FFFF = 0xFFFF; // Characters in the surrogate range are not allowed // (unless the result is a valid supplemental character) private final char SURROGATE_START = 0xD800; private final char SURROGATE_END = 0xDFFF; /* (non-Javadoc) * @see org.apache.axiom.om.util.XMLStreamWriterFilterBase#xmlData(java.lang.String) */ protected String xmlData(String value) { char[] buffer = null; int len = value.length(); int srcI = 0; int tgtI = 0; int copyLength = 0; int i = 0; // Traverse all of the characters in the input String (value) while (i < len) { // Get the codepoint of the character at the index // Note that the code point may be two characters long (a supplemental character) int cp = value.codePointAt(i); if (cp > FFFF) { // Supplemental Character...Increase index by 2 // Increase the length of good characters to copy by 2 i = i + 2; copyLength = copyLength + 2; } else { // See if the character is invalid if ((cp < 0x20 && (REMOVE[cp] > 0)) || // Control Character (cp >= SURROGATE_START && cp <= SURROGATE_END) || // Bad surrogate (cp == FFFF || cp == FFFE)) { // or illegal character // Flow to here indicates that the character is not allowed. // The good characters (up to this point) are copied into the buffer. // Note that the buffer is initialized with the original characters. // Thus the buffer copy is always done on the same buffer (saving // both time and space). // Make the buffer on demand if (buffer == null) { if (log.isDebugEnabled()) { log.debug("One or more illegal characterss found. Codepoint=" + cp); } buffer = value.toCharArray(); } // Copy the good characters into the buffer System.arraycopy(buffer, srcI, buffer, tgtI, copyLength); tgtI = tgtI + copyLength; // Update the target location in the array srcI = i + 1; // Skip over the current character copyLength = 0; // reset new copy length } else { // Valid character, increase copy length copyLength = copyLength + 1; } // Single bit16 character, increase index by 1 i = i + 1; } } if (buffer == null) { // Normal case, no illegal characters removed..No buffer return value; } else { // Move the final valid characters to the buffer // and return a string representing the value System.arraycopy(buffer, srcI, buffer, tgtI, copyLength); String newValue = new String(buffer, 0, tgtI + copyLength); return newValue; } } }