Java tutorial
/* * aitools utilities * Copyright (C) 2006 Noel Bush * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ //package org.aitools.util.xml; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.text.CharacterIterator; import java.text.StringCharacterIterator; import java.util.Map; /** * XML utilities that pertain to character handling (markup or character data), * without use of any XML libraries. * * @author <a href="mailto:noel@aitools.org">Noel Bush</a> */ public class Characters { /** The system default file encoding; defaults to UTF-8!!! */ private static final String SYSTEM_ENCODING = System.getProperty("file.encoding", "UTF-8"); /* * XML chars prohibited in some contexts and their escaped equivalents. */ private static final String AMPERSAND = "&"; private static final String XML_AMPERSAND = "&"; private static final String XML_AMPERSAND_REGEX = "&(amp|#0*38|#x0*26);"; private static final String LESS_THAN = "<"; private static final String XML_LESS_THAN = "<"; private static final String XML_LESS_THAN_REGEX = "&(lt|#0*60|#x0*3[cC]);"; private static final String GREATER_THAN = ">"; private static final String XML_GREATER_THAN = ">"; private static final String XML_GREATER_THAN_REGEX = "&(gt|#0*62|#x0*3[eE]);"; private static final String QUOTE = "\""; private static final String XML_QUOTE = """; private static final String XML_QUOTE_REGEX = "&(quot|#0*34|#x0*22);"; private static final String APOSTROPHE = "'"; private static final String XML_APOSTROPHE = "'"; private static final String XML_APOSTROPHE_REGEX = "&(apos|#0*39|#x0*27);"; /** * <p> * Replaces the following characters with their "escaped" equivalents: * </p> * <code> * <ul> * <li>& with &amp;</li> * <li>< with &lt;</li> * <li>> with &gt;</li> * <li>' with &apos;</li> * <li>" with &quot;</li> * </ul> * </code> * * @param input the string on which to perform the replacement * @return the string with entities replaced */ public static String escapeXMLChars(String input) { if (input == null) { return ""; } return input.replace(AMPERSAND, XML_AMPERSAND).replace(LESS_THAN, XML_LESS_THAN) .replace(GREATER_THAN, XML_GREATER_THAN).replace(QUOTE, XML_QUOTE) .replace(APOSTROPHE, XML_APOSTROPHE); } /** * Like {@link #escapeXMLChars(String)}, but takes an array of chars instead of a String. This might be faster (but * should be tested). * * @param ch the array of chars * @param start where to start reading in the array * @param length the length to read from the array * @return the string with XML chars escaped */ public static String escapeXMLChars(char[] ch, int start, int length) { if (ch == null || length < 1 || start >= ch.length || start < 0 || ch.length == 0) { return ""; } StringBuilder result = new StringBuilder(length); int end = start + length; for (int index = start; index < end; index++) { char cha = ch[index]; switch (cha) { case '&': result.append(XML_AMPERSAND); break; case '<': result.append(XML_LESS_THAN); break; case '>': result.append(XML_GREATER_THAN); break; case '"': result.append(XML_QUOTE); break; case '\'': result.append(XML_APOSTROPHE); break; default: result.append(cha); } } return result.toString(); } /** * <p> * Replaces the following "escape" strings with their character equivalents: * </p> * <code> * <ul> * <li>&amp; with &</li> * <li>&lt; with <</li> * <li>&gt; with ></li> * <li>&apos; with '</li> * <li>&quot; with "</li> * </ul> * </code> * * @param input the string on which to perform the replacement * @return the string with entities replaced */ public static String unescapeXMLChars(String input) { return input.replaceAll(XML_LESS_THAN_REGEX, LESS_THAN).replaceAll(XML_GREATER_THAN_REGEX, GREATER_THAN) .replaceAll(XML_AMPERSAND_REGEX, AMPERSAND).replaceAll(XML_QUOTE_REGEX, QUOTE) .replaceAll(XML_APOSTROPHE_REGEX, APOSTROPHE); } /** * Removes all characters that are not considered <a href="http://www.w3.org/TR/2000/REC-xml-20001006#charsets">XML * characters </a> from the input. * * @param input the input to filter * @return the input with all non-XML characters removed */ public static String filterXML(String input) { // Null inputs return an empty string. if (input == null) { return ""; } // trim() removes all whitespace, not only spaces. String _input = input.trim(); // Empty inputs return an empty string. if (_input.equals((""))) { return ""; } // This StringBuilder will hold the result. StringBuilder result = new StringBuilder(_input.length()); // This StringCharacterIterator will iterate over the input. StringCharacterIterator iterator = new StringCharacterIterator(_input); // Iterate over the input. for (char aChar = iterator.first(); aChar != CharacterIterator.DONE; aChar = iterator.next()) { // Determine if this is a valid XML Character. if ((aChar == '\u0009') || (aChar == '\n') || (aChar == '\r') || (('\u0020' <= aChar) && (aChar <= '\uD7FF')) || (('\uE000' <= aChar) && (aChar <= '\uFFFD'))) { result.append(aChar); } } if (result.length() > _input.length()) { return result.toString(); } // (otherwise...) return _input; } /** * <p> * Converts XML Unicode character entities into their character equivalents within a given string. * </p> * <p> * This will handle entities in the form <code>&#<i>xxxx</i>;</code> (decimal character code, where <i>xxxx * </i> is a valid character code), or <code>&#x<i>xxxx</i></code> (hexadecimal character code, where * <i>xxxx </i> is a valid character code). * </p> * * @param input the string to process * @return the input with all XML Unicode character entity codes replaced */ public static String convertXMLUnicodeEntities(String input) { int inputLength = input.length(); int pointer = 0; StringBuilder result = new StringBuilder(inputLength); while (pointer < input.length()) { if (input.charAt(pointer) == '&') { if (input.charAt(pointer + 1) == '#') { // Hexadecimal character code. if (input.charAt(pointer + 2) == 'x') { int semicolon = input.indexOf(';', pointer + 3); // Check that the semicolon is not so far away that it // is likely not part of this entity. if (semicolon < pointer + 7) { try { // Integer.decode from pointer + 2 includes the // "x". result.append( (char) Integer.decode(input.substring(pointer + 2, semicolon)).intValue()); pointer += (semicolon - pointer + 1); } catch (NumberFormatException e) { // drop out } } } // Decimal character code. else { // Check that the semicolon is not so far away that it // is likely not part of this entity. int semicolon = input.indexOf(';', pointer + 2); if (semicolon < pointer + 7) { try { // Integer.parseInt from pointer + 2 excludes // the "&#". result.append((char) Integer.parseInt(input.substring(pointer + 2, semicolon))); pointer += (semicolon - pointer + 1); continue; } catch (NumberFormatException e) { // drop out } } } } } result.append(input.charAt(pointer)); pointer++; } return result.toString(); } /** * Returns the declared encoding string from the XML resource at the given URL. * * @param url the resource to look at * @return the declared encoding * @throws IOException if there was a problem reading the input stream */ public static String getDeclaredXMLEncoding(URL url) throws IOException { // Look at the input stream using the platform default encoding. InputStream stream = url.openStream(); BufferedReader buffReader = new BufferedReader(new InputStreamReader(stream)); // Read the first line. May throw an IOException. String firstLine = buffReader.readLine(); if (firstLine == null) { return SYSTEM_ENCODING; } // Look for the XML processing instruction. int piStart = firstLine.indexOf("<?xml version=\"1.0\""); if (piStart != -1) { int attributeStart = firstLine.indexOf("encoding=\""); if (attributeStart >= 0) { int nextQuote = firstLine.indexOf('"', attributeStart + 10); if (nextQuote >= 0) { String encoding = firstLine.substring(attributeStart + 10, nextQuote); return encoding.trim(); } } } stream.close(); // If encoding was unspecified, return the system encoding. return SYSTEM_ENCODING; } /** * Removes all tags from a string (retains character content of tags, however). * * @param input the string from which to remove markup * @return the input without tags */ public static String removeMarkup(String input) { // Null inputs return an empty string. if (input == null) { return ""; } // Trim all whitespace at beginning and end. String _input = input.trim(); // Empty trimmed inputs return an empty string. if (_input.equals("")) { return _input; } // No tags means no processing necessary. int tagStart = _input.indexOf('<'); if (tagStart == -1) { return _input; } // (otherwise...) // tagEnd indexes the end of a tag. int tagEnd = 0; // lastEnd indexes the previous end of a tag. int lastEnd = 0; // inputLength avoids recalculating input.length(). int inputLength = _input.length(); // Results will be built up in this buffer. StringBuilder result = new StringBuilder(); // Break lines at tags. while ((tagStart > -1) && (tagEnd > -1)) { // Get the end of a tag. tagEnd = _input.indexOf('>', lastEnd); // Add the input until the tag as a line, as long as the tag is not // the beginning. if (tagStart > 0) { result.append(_input.substring(lastEnd, tagStart)); } // Set last end to the character following the end of the tag. lastEnd = tagEnd + 1; // Look for another tag. tagStart = _input.indexOf('<', lastEnd); } // All tags are exhausted; if there is still something left in the // input, if ((lastEnd < inputLength) && (lastEnd > 0)) { // Add the remainder as the final line. result.append(_input.substring(lastEnd)); } return result.toString(); } /** * Renders a set of name-value pairs as attributes. * * @param attributes the name-value pairs * @return the rendered attributes */ public static String renderAttributes(Map<String, String> attributes) { StringBuilder result = new StringBuilder(); if (attributes != null) { for (Map.Entry<String, String> attribute : attributes.entrySet()) { String attributeName = attribute.getKey(); if (attributeName != null && !"xmlns".equals(attributeName)) { result.append(String.format(" %s=\"%s\"", attributeName, attribute.getValue())); } } } return result.toString(); } }