// Java tutorial
package org.apache.maven.doxia.parser; /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.Reader; import java.io.StringReader; import java.net.URL; import java.util.Hashtable; import java.util.LinkedHashMap; import java.util.Locale; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.HttpRequestRetryHandler; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; import org.apache.http.util.EntityUtils; import org.apache.maven.doxia.macro.MacroExecutionException; import org.apache.maven.doxia.markup.XmlMarkup; import org.apache.maven.doxia.sink.Sink; import org.apache.maven.doxia.sink.SinkEventAttributeSet; import org.apache.maven.doxia.util.HtmlTools; import 
org.apache.maven.doxia.util.XmlValidator; import org.codehaus.plexus.util.FileUtils; import org.codehaus.plexus.util.IOUtil; import org.codehaus.plexus.util.StringUtils; import org.codehaus.plexus.util.xml.pull.MXParser; import org.codehaus.plexus.util.xml.pull.XmlPullParser; import org.codehaus.plexus.util.xml.pull.XmlPullParserException; import org.xml.sax.EntityResolver; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * An abstract class that defines some convenience methods for <code>XML</code> parsers. * * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a> * @version $Id: AbstractXmlParser.java 1465336 2013-04-07 07:39:00Z hboutemy $ * @since 1.0 */ public abstract class AbstractXmlParser extends AbstractParser implements XmlMarkup { /** * Entity pattern for HTML entity, i.e. &nbsp; * "<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&[a-zA-Z]{2,6};)(\\s)*\"(\\s)*> * <br/> * see <a href="http://www.w3.org/TR/REC-xml/#NT-EntityDecl">http://www.w3.org/TR/REC-xml/#NT-EntityDecl</a>. */ private static final Pattern PATTERN_ENTITY_1 = Pattern .compile(ENTITY_START + "(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&[a-zA-Z]{2,6};)(\\s)*\"(\\s)*>"); /** * Entity pattern for Unicode entity, i.e. &#38; * "<!ENTITY(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&(#x?[0-9a-fA-F]{1,5};)*)(\\s)*\"(\\s)*>" * <br/> * see <a href="http://www.w3.org/TR/REC-xml/#NT-EntityDecl">http://www.w3.org/TR/REC-xml/#NT-EntityDecl</a>. 
*/ private static final Pattern PATTERN_ENTITY_2 = Pattern .compile(ENTITY_START + "(\\s)+([^>|^\\s]+)(\\s)+\"(\\s)*(&(#x?[0-9a-fA-F]{1,5};)*)(\\s)*\"(\\s)*>"); private boolean ignorableWhitespace; private boolean collapsibleWhitespace; private boolean trimmableWhitespace; private Map<String, String> entities; private boolean validate = false; /** {@inheritDoc} */ public void parse(Reader source, Sink sink) throws ParseException { init(); Reader src = source; // 1 first parsing if validation is required if (isValidate()) { String content; try { content = IOUtil.toString(new BufferedReader(src)); } catch (IOException e) { throw new ParseException("Error reading the model: " + e.getMessage(), e); } new XmlValidator(getLog()).validate(content); src = new StringReader(content); } // 2 second parsing to process try { XmlPullParser parser = new MXParser(); parser.setInput(src); // allow parser initialization, e.g. for additional entities in XHTML // Note: do it after input is set, otherwise values are reset initXmlParser(parser); sink.enableLogging(getLog()); parseXml(parser, sink); } catch (XmlPullParserException ex) { throw new ParseException("Error parsing the model: " + ex.getMessage(), ex, ex.getLineNumber(), ex.getColumnNumber()); } catch (MacroExecutionException ex) { throw new ParseException("Macro execution failed: " + ex.getMessage(), ex); } setSecondParsing(false); init(); } /** * Initializes the parser with custom entities or other options. * * @param parser A parser, not null. * @throws org.codehaus.plexus.util.xml.pull.XmlPullParserException if there's a problem initializing the parser */ protected void initXmlParser(XmlPullParser parser) throws XmlPullParserException { // nop } /** * {@inheritDoc} * * Convenience method to parse an arbitrary string and emit any xml events into the given sink. 
*/ @Override public void parse(String string, Sink sink) throws ParseException { super.parse(string, sink); } /** {@inheritDoc} */ @Override public final int getType() { return XML_TYPE; } /** * Converts the attributes of the current start tag of the given parser to a SinkEventAttributeSet. * * @param parser A parser, not null. * @return a SinkEventAttributeSet or null if the current parser event is not a start tag. * @since 1.1 */ protected SinkEventAttributeSet getAttributesFromParser(XmlPullParser parser) { int count = parser.getAttributeCount(); if (count < 0) { return null; } SinkEventAttributeSet atts = new SinkEventAttributeSet(count); for (int i = 0; i < count; i++) { atts.addAttribute(parser.getAttributeName(i), parser.getAttributeValue(i)); } return atts; } /** * Parse the model from the XmlPullParser into the given sink. * * @param parser A parser, not null. * @param sink the sink to receive the events. * @throws org.codehaus.plexus.util.xml.pull.XmlPullParserException if there's a problem parsing the model * @throws org.apache.maven.doxia.macro.MacroExecutionException if there's a problem executing a macro */ private void parseXml(XmlPullParser parser, Sink sink) throws XmlPullParserException, MacroExecutionException { int eventType = parser.getEventType(); while (eventType != XmlPullParser.END_DOCUMENT) { if (eventType == XmlPullParser.START_TAG) { handleStartTag(parser, sink); } else if (eventType == XmlPullParser.END_TAG) { handleEndTag(parser, sink); } else if (eventType == XmlPullParser.TEXT) { String text = getText(parser); if (isIgnorableWhitespace()) { if (text.trim().length() != 0) { handleText(parser, sink); } } else { handleText(parser, sink); } } else if (eventType == XmlPullParser.CDSECT) { handleCdsect(parser, sink); } else if (eventType == XmlPullParser.COMMENT) { handleComment(parser, sink); } else if (eventType == XmlPullParser.ENTITY_REF) { handleEntity(parser, sink); } else if (eventType == XmlPullParser.IGNORABLE_WHITESPACE) { // nop 
} else if (eventType == XmlPullParser.PROCESSING_INSTRUCTION) { // nop } else if (eventType == XmlPullParser.DOCDECL) { addLocalEntities(parser, parser.getText()); for (byte[] res : CachedFileEntityResolver.ENTITY_CACHE.values()) { addDTDEntities(parser, new String(res)); } } try { eventType = parser.nextToken(); } catch (IOException io) { throw new XmlPullParserException("IOException: " + io.getMessage(), parser, io); } } } /** * Goes through the possible start tags. * * @param parser A parser, not null. * @param sink the sink to receive the events. * @throws org.codehaus.plexus.util.xml.pull.XmlPullParserException if there's a problem parsing the model * @throws org.apache.maven.doxia.macro.MacroExecutionException if there's a problem executing a macro */ protected abstract void handleStartTag(XmlPullParser parser, Sink sink) throws XmlPullParserException, MacroExecutionException; /** * Goes through the possible end tags. * * @param parser A parser, not null. * @param sink the sink to receive the events. * @throws org.codehaus.plexus.util.xml.pull.XmlPullParserException if there's a problem parsing the model * @throws org.apache.maven.doxia.macro.MacroExecutionException if there's a problem executing a macro */ protected abstract void handleEndTag(XmlPullParser parser, Sink sink) throws XmlPullParserException, MacroExecutionException; /** * Handles text events. * * <p>This is a default implementation, if the parser points to a non-empty text element, * it is emitted as a text event into the specified sink.</p> * * @param parser A parser, not null. * @param sink the sink to receive the events. Not null. * @throws org.codehaus.plexus.util.xml.pull.XmlPullParserException if there's a problem parsing the model */ protected void handleText(XmlPullParser parser, Sink sink) throws XmlPullParserException { String text = getText(parser); /* * NOTE: Don't do any whitespace trimming here. 
Whitespace normalization has already been performed by the * parser so any whitespace that makes it here is significant. */ if (StringUtils.isNotEmpty(text)) { sink.text(text); } } /** * Handles CDATA sections. * * <p>This is a default implementation, all data are emitted as text * events into the specified sink.</p> * * @param parser A parser, not null. * @param sink the sink to receive the events. Not null. * @throws org.codehaus.plexus.util.xml.pull.XmlPullParserException if there's a problem parsing the model */ protected void handleCdsect(XmlPullParser parser, Sink sink) throws XmlPullParserException { sink.text(getText(parser)); } /** * Handles comments. * * <p>This is a default implementation, all data are emitted as comment * events into the specified sink.</p> * * @param parser A parser, not null. * @param sink the sink to receive the events. Not null. * @throws org.codehaus.plexus.util.xml.pull.XmlPullParserException if there's a problem parsing the model */ protected void handleComment(XmlPullParser parser, Sink sink) throws XmlPullParserException { sink.comment(getText(parser).trim()); } /** * Handles entities. * * <p>This is a default implementation, all entities are resolved and emitted as text * events into the specified sink, except:</p> * <ul> * <li>the entities with names <code>#160</code>, <code>nbsp</code> and <code>#x00A0</code> * are emitted as <code>nonBreakingSpace()</code> events.</li> * </ul> * * @param parser A parser, not null. * @param sink the sink to receive the events. Not null. 
* @throws org.codehaus.plexus.util.xml.pull.XmlPullParserException if there's a problem parsing the model */ protected void handleEntity(XmlPullParser parser, Sink sink) throws XmlPullParserException { String text = getText(parser); String name = parser.getName(); if ("#160".equals(name) || "nbsp".equals(name) || "#x00A0".equals(name)) { sink.nonBreakingSpace(); } else { String unescaped = HtmlTools.unescapeHTML(text); sink.text(unescaped); } } /** * Handles an unknown event. * * <p>This is a default implementation, all events are emitted as unknown * events into the specified sink.</p> * * @param parser the parser to get the event from. * @param sink the sink to receive the event. * @param type the tag event type. This should be one of HtmlMarkup.TAG_TYPE_SIMPLE, * HtmlMarkup.TAG_TYPE_START, HtmlMarkup.TAG_TYPE_END or HtmlMarkup.ENTITY_TYPE. * It will be passed as the first argument of the required parameters to the Sink * {@link org.apache.maven.doxia.sink.Sink#unknown(String, Object[], org.apache.maven.doxia.sink.SinkEventAttributes)} * method. */ protected void handleUnknown(XmlPullParser parser, Sink sink, int type) { Object[] required = new Object[] { Integer.valueOf(type) }; SinkEventAttributeSet attribs = getAttributesFromParser(parser); sink.unknown(parser.getName(), required, attribs); } /** * <p>isIgnorableWhitespace.</p> * * @return <code>true</code> if whitespace will be ignored, <code>false</code> otherwise. * @see #setIgnorableWhitespace(boolean) * @since 1.1 */ protected boolean isIgnorableWhitespace() { return ignorableWhitespace; } /** * Specify that whitespace will be ignored. I.e.: * <pre><tr> <td/> </tr></pre> * is equivalent to * <pre><tr><td/></tr></pre> * * @param ignorable <code>true</code> to ignore whitespace, <code>false</code> otherwise. 
* @since 1.1 */ protected void setIgnorableWhitespace(boolean ignorable) { this.ignorableWhitespace = ignorable; } /** * <p>isCollapsibleWhitespace.</p> * * @return <code>true</code> if text will collapse, <code>false</code> otherwise. * @see #setCollapsibleWhitespace(boolean) * @since 1.1 */ protected boolean isCollapsibleWhitespace() { return collapsibleWhitespace; } /** * Specify that text will be collapsed. I.e.: * <pre>Text Text</pre> * is equivalent to * <pre>Text Text</pre> * * @param collapsible <code>true</code> to allow collapsible text, <code>false</code> otherwise. * @since 1.1 */ protected void setCollapsibleWhitespace(boolean collapsible) { this.collapsibleWhitespace = collapsible; } /** * <p>isTrimmableWhitespace.</p> * * @return <code>true</code> if text will be trim, <code>false</code> otherwise. * @see #setTrimmableWhitespace(boolean) * @since 1.1 */ protected boolean isTrimmableWhitespace() { return trimmableWhitespace; } /** * Specify that text will be collapsed. I.e.: * <pre><p> Text </p></pre> * is equivalent to * <pre><p>Text</p></pre> * * @param trimmable <code>true</code> to allow trimmable text, <code>false</code> otherwise. * @since 1.1 */ protected void setTrimmableWhitespace(boolean trimmable) { this.trimmableWhitespace = trimmable; } /** * <p>getText.</p> * * @param parser A parser, not null. * @return the {@link XmlPullParser#getText()} taking care of trimmable or collapsible configuration. 
* @see XmlPullParser#getText() * @see #isCollapsibleWhitespace() * @see #isTrimmableWhitespace() * @since 1.1 */ protected String getText(XmlPullParser parser) { String text = parser.getText(); if (isTrimmableWhitespace()) { text = text.trim(); } if (isCollapsibleWhitespace()) { StringBuilder newText = new StringBuilder(); String[] elts = StringUtils.split(text, " \r\n"); for (int i = 0; i < elts.length; i++) { newText.append(elts[i]); if ((i + 1) < elts.length) { newText.append(" "); } } text = newText.toString(); } return text; } /** * Return the defined entities in a local doctype. I.e.: * <pre> * <!DOCTYPE foo [ * <!ENTITY bar "&#x160;"> * <!ENTITY bar1 "&#x161;"> * ]> * </pre> * * @return a map of the defined entities in a local doctype. * @since 1.1 */ protected Map<String, String> getLocalEntities() { if (entities == null) { entities = new LinkedHashMap<String, String>(); } return entities; } /** * <p>isValidate.</p> * * @return <code>true</code> if XML content will be validate, <code>false</code> otherwise. * @since 1.1 */ public boolean isValidate() { return validate; } /** * Specify a flag to validate or not the XML content. * * @param validate the validate to set * @see #parse(Reader, Sink) * @since 1.1 */ public void setValidate(boolean validate) { this.validate = validate; } // ---------------------------------------------------------------------- // Private methods // ---------------------------------------------------------------------- /** * Add an entity given by <code>entityName</code> and <code>entityValue</code> to {@link #entities}. * <br/> * By default, we exclude the default XML entities: &amp;, &lt;, &gt;, &quot; and &apos;. 
* * @param parser not null * @param entityName not null * @param entityValue not null * @throws XmlPullParserException if any * @see {@link XmlPullParser#defineEntityReplacementText(String, String)} */ private void addEntity(XmlPullParser parser, String entityName, String entityValue) throws XmlPullParserException { if (entityName.endsWith("amp") || entityName.endsWith("lt") || entityName.endsWith("gt") || entityName.endsWith("quot") || entityName.endsWith("apos")) { return; } parser.defineEntityReplacementText(entityName, entityValue); getLocalEntities().put(entityName, entityValue); } /** * Handle entities defined in a local doctype as the following: * <pre> * <!DOCTYPE foo [ * <!ENTITY bar "&#x160;"> * <!ENTITY bar1 "&#x161;"> * ]> * </pre> * * @param parser not null * @param text not null * @throws XmlPullParserException if any */ private void addLocalEntities(XmlPullParser parser, String text) throws XmlPullParserException { int entitiesCount = StringUtils.countMatches(text, ENTITY_START); if (entitiesCount > 0) { // text should be foo [...] 
int start = text.indexOf('['); int end = text.lastIndexOf(']'); if (start != -1 && end != -1) { addDTDEntities(parser, text.substring(start + 1, end)); } } } /** * Handle entities defined in external doctypes as the following: * <pre> * <!DOCTYPE foo [ * <!-- These are the entity sets for ISO Latin 1 characters for the XHTML --> * <!ENTITY % HTMLlat1 PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN" * "http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent"> * %HTMLlat1; * ]> * </pre> * * @param parser not null * @param text not null * @throws XmlPullParserException if any */ private void addDTDEntities(XmlPullParser parser, String text) throws XmlPullParserException { int entitiesCount = StringUtils.countMatches(text, ENTITY_START); if (entitiesCount > 0) { final String txt = StringUtils.replace(text, ENTITY_START, "\n" + ENTITY_START); BufferedReader reader = new BufferedReader(new StringReader(txt)); String line; String tmpLine = ""; try { Matcher matcher; while ((line = reader.readLine()) != null) { tmpLine += "\n" + line; matcher = PATTERN_ENTITY_1.matcher(tmpLine); if (matcher.find() && matcher.groupCount() == 7) { String entityName = matcher.group(2); String entityValue = matcher.group(5); addEntity(parser, entityName, entityValue); tmpLine = ""; } else { matcher = PATTERN_ENTITY_2.matcher(tmpLine); if (matcher.find() && matcher.groupCount() == 8) { String entityName = matcher.group(2); String entityValue = matcher.group(5); addEntity(parser, entityName, entityValue); tmpLine = ""; } } } } catch (IOException e) { // nop } finally { IOUtil.close(reader); } } } /** * Implementation of the callback mechanism <code>EntityResolver</code>. * Using a mechanism of cached files in temp dir to improve performance when using the <code>XMLReader</code>. */ public static class CachedFileEntityResolver implements EntityResolver { /** Map with systemId as key and the content of systemId as byte[]. 
*/ protected static final Map<String, byte[]> ENTITY_CACHE = new Hashtable<String, byte[]>(); /** {@inheritDoc} */ public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { byte[] res = ENTITY_CACHE.get(systemId); // already cached? if (res == null) { String systemName = FileUtils.getFile(systemId).getName(); File temp = new File(System.getProperty("java.io.tmpdir"), systemName); // maybe already as a temp file? if (!temp.exists()) { // is systemId a file or an url? if (systemId.toLowerCase(Locale.ENGLISH).startsWith("file")) { // Doxia XSDs are included in the jars, so try to find the resource systemName from // the classpath... String resource = "/" + systemName; URL url = getClass().getResource(resource); if (url != null) { res = toByteArray(url); } else { throw new SAXException("Could not find the SYSTEM entity: " + systemId + " because '" + resource + "' is not available of the classpath."); } } else { res = toByteArray(new URL(systemId)); } // write systemId as temp file copy(res, temp); } else { // TODO How to refresh Doxia XSDs from temp dir? res = toByteArray(temp.toURI().toURL()); } ENTITY_CACHE.put(systemId, res); } InputSource is = new InputSource(new ByteArrayInputStream(res)); is.setPublicId(publicId); is.setSystemId(systemId); return is; } /** * If url is not an http/https urls, call {@link IOUtil#toByteArray(java.io.InputStream)} to get the url * content. * Otherwise, use HttpClient to get the http content. * Wrap all internal exceptions to throw SAXException. 
* * @param url not null * @return return an array of byte * @throws SAXException if any */ private static byte[] toByteArray(URL url) throws SAXException { if (!(url.getProtocol().equalsIgnoreCase("http") || url.getProtocol().equalsIgnoreCase("https"))) { InputStream is = null; try { is = url.openStream(); if (is == null) { throw new SAXException("Cannot open stream from the url: " + url.toString()); } return IOUtil.toByteArray(is); } catch (IOException e) { throw new SAXException("IOException: " + e.getMessage(), e); } finally { IOUtil.close(is); } } // it is an HTTP url, using HttpClient... DefaultHttpClient client = new DefaultHttpClient(); HttpGet method = new HttpGet(url.toString()); // Set a user-agent that doesn't contain the word "java", otherwise it will be blocked by the W3C // The default user-agent is "Apache-HttpClient/4.0.2 (java 1.5)" method.setHeader("user-agent", "Apache-Doxia/" + doxiaVersion()); HttpRequestRetryHandler retryHandler = new DefaultHttpRequestRetryHandler(3, false); client.setHttpRequestRetryHandler(retryHandler); HttpEntity entity = null; try { HttpResponse response = client.execute(method); int statusCode = response.getStatusLine().getStatusCode(); if (statusCode != HttpStatus.SC_OK) { throw new IOException("The status code when accessing the URL '" + url.toString() + "' was " + statusCode + ", which is not allowed. The server gave this reason for the failure '" + response.getStatusLine().getReasonPhrase() + "'."); } entity = response.getEntity(); return EntityUtils.toByteArray(entity); } catch (ClientProtocolException e) { throw new SAXException("ClientProtocolException: Fatal protocol violation: " + e.getMessage(), e); } catch (IOException e) { throw new SAXException("IOException: Fatal transport error: " + e.getMessage(), e); } finally { if (entity != null) { try { entity.consumeContent(); } catch (IOException e) { // Ignore } } } } /** * Wrap {@link IOUtil#copy(byte[], OutputStream)} to throw SAXException. 
* * @param res not null array of byte * @param f the file where to write the bytes * @throws SAXException if any * @see {@link IOUtil#copy(byte[], OutputStream)} */ private void copy(byte[] res, File f) throws SAXException { if (f.isDirectory()) { throw new SAXException("'" + f.getAbsolutePath() + "' is a directory, can not write it."); } OutputStream os = null; try { os = new FileOutputStream(f); IOUtil.copy(res, os); } catch (IOException e) { throw new SAXException("IOException: " + e.getMessage(), e); } finally { IOUtil.close(os); } } } }