Determining the validity of various XML names
/*
* JBoss DNA (http://www.jboss.org/dna)
* See the COPYRIGHT.txt file distributed with this work for information
* regarding copyright ownership. Some portions may be licensed
* to Red Hat, Inc. under one or more contributor license agreements.
* See the AUTHORS.txt file in the distribution for a full listing of
* individual contributors.
*
* Unless otherwise indicated, all code in JBoss DNA is licensed
* to you under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* JBoss DNA is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
/**
* A utility class for determining the validity of various XML names, per the
* <a href="http://www.w3.org/TR/REC-xml/">XML 1.0 Specification</a>.
*/
public class XmlCharacters {
private static final int NUMBER_OF_CHARACTERS = 1 << 16; // 65536 or 0x10000
/**
* This implementation uses an array that captures for each character the XML classifications.
* An array is used because it is a fast way of looking up each character.
*/
private static final char[] MASKS = new char[NUMBER_OF_CHARACTERS];
private static final int VALID_CHARACTER = 1;
private static final int CONTENT_CHARACTER = 1 <<1;
private static final int SPACE_CHARACTER = 1 <<2;
private static final int NAME_START_CHARACTER = 1<<3;
private static final int NAME_CHARACTER = 1<<4;
private static final int NCNAME_START_CHARACTER = 1<<5;
private static final int NCNAME_CHARACTER = 1<<6;
private static final int PUBID_CHARACTER = 1<<7;
static {
// ----------------
// Valid Characters
// ----------------
// [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
// See http://www.w3.org/TR/REC-xml/#charsets
MASKS[0x9] |= VALID_CHARACTER | CONTENT_CHARACTER;
MASKS[0xA] |= VALID_CHARACTER | CONTENT_CHARACTER;
MASKS[0xD] |= VALID_CHARACTER | CONTENT_CHARACTER;
for (int i = 0x20; i <= 0xD7FF; ++i) MASKS[i] |= VALID_CHARACTER | CONTENT_CHARACTER;
for (int i = 0xE000; i <= 0xFFFD; ++i) MASKS[i] |= VALID_CHARACTER | CONTENT_CHARACTER;
// Last range is bigger than our character array, so we'll handle in the 'isValid' method ...
// for ( int i=0x10000; i<=0x10FFFF; ++i ) MASKS[i] = VALID_CHARACTER_MASK | CONTENT_CHARACTER;
// Remove the other characters that are not allowed in XML content:
// '<', '&', '\n', '\r', ']'
MASKS['<'] &= ~(CONTENT_CHARACTER);
MASKS['&'] &= ~(CONTENT_CHARACTER);
MASKS['\n'] &= ~(CONTENT_CHARACTER);
MASKS['\r'] &= ~(CONTENT_CHARACTER);
MASKS[']'] &= ~(CONTENT_CHARACTER);
// ---------------------
// Whitespace Characters
// ---------------------
// [3] S ::= (#x20 | #x9 | #xD | #xA)+
// See http://www.w3.org/TR/REC-xml/#sec-common-syn
MASKS[0x20] |= SPACE_CHARACTER;
MASKS[0x9] |= SPACE_CHARACTER;
MASKS[0xA] |= SPACE_CHARACTER;
MASKS[0xD] |= SPACE_CHARACTER;
// ---------------------
// Name Start Characters
// ---------------------
// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
// [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
// [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
// [#x10000-#xEFFFF]
// See http://www.w3.org/TR/REC-xml/#sec-common-syn
//
// Note that all these start characters AND characters are valid for NAME and NCNAME
int nameStartMask = NAME_START_CHARACTER | NCNAME_START_CHARACTER | NAME_CHARACTER | NCNAME_CHARACTER;
MASKS[':'] |= nameStartMask;
MASKS['_'] |= nameStartMask;
for (int i = 'A'; i <= 'Z'; ++i) MASKS[i] |= nameStartMask;
for (int i = 'a'; i <= 'z'; ++i) MASKS[i] |= nameStartMask;
for (int i = 0xC0; i <= 0xD6; ++i) MASKS[i] |= nameStartMask;
for (int i = 0xD8; i <= 0xF6; ++i) MASKS[i] |= nameStartMask;
for (int i = 0xF8; i <= 0x2FF; ++i) MASKS[i] |= nameStartMask;
for (int i = 0x370; i <= 0x37D; ++i) MASKS[i] |= nameStartMask;
for (int i = 0x37F; i <= 0x1FFF; ++i) MASKS[i] |= nameStartMask;
for (int i = 0x200C; i <= 0x200D; ++i) MASKS[i] |= nameStartMask;
for (int i = 0x2070; i <= 0x218F; ++i) MASKS[i] |= nameStartMask;
for (int i = 0x2C00; i <= 0x2FEF; ++i) MASKS[i] |= nameStartMask;
for (int i = 0x3001; i <= 0xD7FF; ++i) MASKS[i] |= nameStartMask;
for (int i = 0xF900; i <= 0xFDCF; ++i) MASKS[i] |= nameStartMask;
for (int i = 0xFDF0; i <= 0xFFFD; ++i) MASKS[i] |= nameStartMask;
// Last range is bigger than our character array ...
//for (int i = 0x10000; i <= 0xEFFFF; ++i) MASKS[i] |= nameStartMask;
// ---------------
// Name Characters
// ---------------
// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
// See http://www.w3.org/TR/REC-xml/#sec-common-syn
//
// Note that all these characters are valid for NAME and NCNAME
int nameMask = NAME_CHARACTER | NCNAME_CHARACTER;
MASKS['-'] |= nameMask;
MASKS['.'] |= nameMask;
MASKS[0xB7] |= nameMask;
for (int i = '0'; i <= '9'; ++i) MASKS[i] |= nameMask;
for (int i = 0x0300; i <= 0x036F; ++i) MASKS[i] |= nameStartMask;
for (int i = 0x203F; i <= 0x2040; ++i) MASKS[i] |= nameStartMask;
// --------
// NC Names
// --------
// [4] NCName ::= NCNameStartChar NCNameChar*
// which is just an XML Name, minus the ":"
// See http://www.w3.org/TR/REC-xml-names/#ns-decl
// So, remove the NCNAME_CHARACTER and NCNAME_START_CHARACTER masks from ':' ...
MASKS[':'] &= ~(NCNAME_START_CHARACTER | NCNAME_CHARACTER);
// --------------------
// Public ID characters
// --------------------
// [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
MASKS[0x20] |= PUBID_CHARACTER;
MASKS[0xA] |= PUBID_CHARACTER;
MASKS[0xD] |= PUBID_CHARACTER;
for (int i = 'A'; i <= 'Z'; ++i) MASKS[i] |= PUBID_CHARACTER;
for (int i = 'a'; i <= 'z'; ++i) MASKS[i] |= PUBID_CHARACTER;
for (int i = '0'; i <= '9'; ++i) MASKS[i] |= PUBID_CHARACTER;
MASKS['-'] |= PUBID_CHARACTER;
MASKS['\''] |= PUBID_CHARACTER;
MASKS['('] |= PUBID_CHARACTER;
MASKS[')'] |= PUBID_CHARACTER;
MASKS['+'] |= PUBID_CHARACTER;
MASKS[','] |= PUBID_CHARACTER;
MASKS['.'] |= PUBID_CHARACTER;
MASKS['/'] |= PUBID_CHARACTER;
MASKS[':'] |= PUBID_CHARACTER;
MASKS['='] |= PUBID_CHARACTER;
MASKS['?'] |= PUBID_CHARACTER;
MASKS[';'] |= PUBID_CHARACTER;
MASKS['!'] |= PUBID_CHARACTER;
MASKS['*'] |= PUBID_CHARACTER;
MASKS['#'] |= PUBID_CHARACTER;
MASKS['@'] |= PUBID_CHARACTER;
MASKS['$'] |= PUBID_CHARACTER;
MASKS['_'] |= PUBID_CHARACTER;
MASKS['%'] |= PUBID_CHARACTER;
}
private XmlCharacters() {
}
/**
* Determine whether the supplied character is a valid first character in an XML Name.
* The first character in an XML name is more restrictive than the {@link #isValidName(int) remaining characters}.
*
* @param c the character
* @return true if the character is valid for an XML Name's first character
*/
public static boolean isValidNameStart( int c ) {
return c < NUMBER_OF_CHARACTERS && ( MASKS[c] & NAME_START_CHARACTER ) != 0;
}
/**
* Determine whether the supplied character is a valid first character in an XML NCName.
* The first character in an XML NCName is more restrictive than the {@link #isValidName(int) remaining characters}.
*
* @param c the character
* @return true if the character is valid for an XML NCName's first character
*/
public static boolean isValidNcNameStart( int c ) {
return c < NUMBER_OF_CHARACTERS && ( MASKS[c] & NCNAME_START_CHARACTER ) != 0;
}
/**
* Determine whether the supplied character is a valid non-first character in an XML Name.
* The {@link #isValidNameStart(int) first character} in an XML name is more restrictive than the remaining characters.
*
* @param c the character
* @return true if the character is valid character in an XML Name
*/
public static boolean isValidName( int c ) {
return c < NUMBER_OF_CHARACTERS && ( MASKS[c] & NAME_CHARACTER ) != 0;
}
/**
* Determine whether the supplied character is a valid non-first character in an XML NCName.
* The {@link #isValidNcNameStart(int) first character} in an XML NCName is more restrictive than the remaining characters.
*
* @param c the character
* @return true if the character is valid character in an XML NCName
*/
public static boolean isValidNcName( int c ) {
return c < NUMBER_OF_CHARACTERS && ( MASKS[c] & NCNAME_CHARACTER ) != 0;
}
/**
* Determine whether the supplied character is a valid character in an XML Pubid.
*
* @param c the character
* @return true if the character is valid character in an XML Pubid
*/
public static boolean isValidPubid( int c ) {
return c < NUMBER_OF_CHARACTERS && ( MASKS[c] & PUBID_CHARACTER ) != 0;
}
/**
* Determine whether the supplied character is a valid character in XML.
*
* @param c the character
* @return true if the character is valid character in XML
*/
public static boolean isValid( int c ) {
return (c < NUMBER_OF_CHARACTERS && ( MASKS[c] & VALID_CHARACTER ) != 0) || ( 0x10000 <= c && c <= 0x10FFFF);
}
/**
* Determine whether the supplied character is a valid character in XML content
*
* @param c the character
* @return true if the character is valid character in XML content
*/
public static boolean isValidContent( int c ) {
return (c < NUMBER_OF_CHARACTERS && ( MASKS[c] & CONTENT_CHARACTER ) != 0) || ( 0x10000 <= c && c <= 0x10FFFF);
}
/**
* Determine whether the supplied character is a valid whitespace character in XML
*
* @param c the character
* @return true if the character is valid whitespace character in XML
*/
public static boolean isValidSpace( int c ) {
return c <= 0x20 && ( MASKS[c] & SPACE_CHARACTER ) != 0;
}
/**
* Determine if the supplied name is a valid XML Name.
*
* @param name the string being checked
* @return true if the supplied name is indeed a valid XML Name, or false otherwise
*/
public static boolean isValidName( String name ) {
if ( name == null || name.length() == 0 ) return false;
CharacterIterator iter = new StringCharacterIterator(name);
char c = iter.first();
if ( !isValidNameStart(c) ) return false;
while ( c != CharacterIterator.DONE ) {
if ( !isValidName(c) ) return false;
c = iter.next();
}
return true;
}
/**
* Determine if the supplied name is a valid XML NCName.
*
* @param name the string being checked
* @return true if the supplied name is indeed a valid XML NCName, or false otherwise
*/
public static boolean isValidNcName( String name ) {
if ( name == null || name.length() == 0 ) return false;
CharacterIterator iter = new StringCharacterIterator(name);
char c = iter.first();
if ( !isValidNcNameStart(c) ) return false;
while ( c != CharacterIterator.DONE ) {
if ( !isValidNcName(c) ) return false;
c = iter.next();
}
return true;
}
}
Related examples in the same category
1. | String Region Match Demo | | |
2. | Palindrome | | |
3. | Look for particular sequences in sentences | | |
4. | Strings -- extract printable strings from binary file | | |
5. | Java Search String | | |
6. | Java String endsWith | | |
7. | Java String startsWith | | |
8. | Search a substring Anywhere | | |
9. | Starts with, ignore case( regular expressions ) | | |
10. | Ends with, ignore case( regular expressions ) | | |
11. | Anywhere, ignore case( regular expressions ) | | |
12. | Searching a String for a Character or a Substring | | |
13. | Not found returns -1 | | |
14. | If a string contains a specific word | | |
15. | Not found | | |
16. | if a String starts with a digit or uppercase letter | | |
17. | Search a String to find the first index of any character in the given set of characters. | | |
18. | Search a String to find the first index of any character not in the given set of characters. | | |
19. | Searches a String for substrings delimited by a start and end tag, returning all matching substrings in an array. | | |
20. | Helper functions to query a strings end portion. The comparison is case insensitive. | | |
21. | Helper functions to query a strings start portion. The comparison is case insensitive. | | |
22. | Wrapper for arrays of ordered strings. This verifies the arrays and supports efficient lookups. | | |
23. | Returns an index into arra (or -1) where the character is not in the charset byte array. | | |
24. | Returns an int[] array of length segments containing the distribution count of the elements in unsorted int[] array with values between min and max (range). | | |
25. | Returns the next index of a character from the chars string | | |
26. | Finds the first index within a String, handling null. | | |
27. | Finds the last index within a String from a start position, handling null. | | |
28. | Finds the n-th index within a String, handling null. | | |
29. | Case insensitive check if a String ends with a specified suffix. | | |
30. | Case insensitive check if a String starts with a specified prefix. | | |
31. | Case insensitive removal of a substring if it is at the begining of a source string, otherwise returns the source string. | | |
32. | Case insensitive removal of a substring if it is at the end of a source string, otherwise returns the source string. | | |
33. | Check if a String ends with a specified suffix. | | |
34. | Check if a String starts with a specified prefix. | | |
35. | Determine if a String is contained in a String Collection | | |
36. | Determine if a String is contained in a String Collection, ignoring case | | |
37. | Determine if a String is contained in a String [], ignoring case | | |
38. | Determine if a String is contained in a String [], ignoring case or not as specified | | |
39. | Determine if a String is contained in a String[] | | |
40. | Determines if the specified string contains only Unicode letters or digits as defined by Character#isLetterOrDigit(char) | | |
41. | Return the nth index of the given token occurring in the given string | | |
42. | Find the earliest index of any of a set of potential substrings. | | |
43. | Find the latest index of any of a set of potential substrings. | | |
44. | Fast String Search | | |
45. | Performs continuous matching of a pattern in a given string. | | |
46. | Count match | | |