HTML Parser
/*******************************************************************************
* Copyright (c) 2004 Actuate Corporation.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Actuate Corporation - initial API and implementation
*******************************************************************************/
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.ArrayList;
public class HTMLParser
{
FileReader reader;
LineNumberReader in;
String token;
ArrayList attribs = new ArrayList( );
int pushC = -1;
private boolean ignoreWhitespace = true;
public static final int EOF = -1;
public static final int TEXT = 1;
public static final int DOCTYPE = 2;
public static final int ELEMENT = 3;
public static final int COMMENT = 4;
public static final int SPECIAL_ELEMENT = 5;
public static final int START_ELEMENT = 0;
public static final int END_ELEMENT = 1;
public static final int SINGLE_ELEMENT = 2;
public HTMLParser( )
{
}
public void open( String fileName ) throws FileNotFoundException
{
reader = new FileReader( fileName );
in = new LineNumberReader( reader );
}
/**
*
*/
public void close( )
{
try
{
in.close( );
reader.close( );
}
catch ( IOException e1 )
{
// Ignore
}
}
public String getTokenText( )
{
return token;
}
public int getElementType( )
{
if ( token.startsWith( "/" ) ) //$NON-NLS-1$
return END_ELEMENT;
if ( token.endsWith( "/" ) ) //$NON-NLS-1$
return SINGLE_ELEMENT;
return START_ELEMENT;
}
public String getElement( )
{
if ( token.startsWith( "/" ) ) //$NON-NLS-1$
return token.substring( 1 );
if ( token.endsWith( "/" ) ) //$NON-NLS-1$
return token.substring( 0, token.length( ) - 1 );
return token;
}
public ArrayList getAttribs( )
{
return attribs;
}
public String getAttrib( String name )
{
for ( int i = 0; i < attribs.size( ); i++ )
{
AttribPair a = (AttribPair) attribs.get( i );
if ( a.attrib.equalsIgnoreCase( name ) )
return a.value;
}
return null;
}
private int getC( )
{
if ( pushC != -1 )
{
int c = pushC;
pushC = -1;
return c;
}
try
{
return in.read( );
}
catch ( IOException e )
{
return EOF;
}
}
private void pushC( int c )
{
pushC = c;
}
public int getToken( )
{
for ( ; ; )
{
int c = getC( );
switch ( c )
{
case -1:
return EOF;
case '<':
return getElement( c );
default:
{
parseText( c );
if ( ! ignoreWhitespace || token.trim( ).length( ) > 0 )
return TEXT;
}
}
}
}
private int parseText( int c )
{
StringBuffer text = new StringBuffer( );
for ( ; ; )
{
if ( c == EOF )
break;
if ( c == '<' )
{
pushC( c );
break;
}
// Convert MS-Word-style quotes.
if ( c == 8220 || c == 8221 )
text.append( """ );
else
text.append( (char) c );
c = getC( );
}
token = text.toString( );
return TEXT;
}
private int skipSpace( int c )
{
while ( c != EOF && Character.isWhitespace( (char)c ) )
{
c = getC( );
}
return c;
}
private int getElement( int c )
{
c = getC( );
// Broken element
if ( c == EOF )
return EOF;
if ( c == '!' )
return getSpecialElement( );
attribs.clear( );
c = skipSpace( c );
if ( c == EOF )
return EOF;
StringBuffer tag = new StringBuffer( );
if ( c == '/' )
{
tag.append( (char) c );
c = skipSpace( getC( ) );
while ( c != EOF && c != '>' && ! Character.isWhitespace( (char)c ) )
{
tag.append( (char) c );
c = getC( );
}
token = tag.toString( );
for ( ; ; )
{
if ( c == '>' || c == -1 )
break;
c = getC( );
}
return ELEMENT;
}
while ( c != EOF && c != '>' && c != '/' && ! Character.isWhitespace( (char)c ) )
{
tag.append( (char) c );
c = getC( );
}
if ( c == EOF )
{
token = tag.toString( );
return ELEMENT;
}
for ( ; ; )
{
c = skipSpace( c );
if ( c == EOF || c == '>' || c == '/' )
break;
c = getAttrib( c );
}
if ( c == '/' )
{
tag.append( (char) c );
for ( ; ; )
{
c = getC( );
if ( c == -1 || c == '>' )
break;
}
}
token = tag.toString( );
return ELEMENT;
}
private int getAttrib( int c )
{
AttribPair a = new AttribPair( );
StringBuffer s = new StringBuffer( );
while ( c != EOF && c != '=' && ! Character.isWhitespace( (char)c ) )
{
s.append( (char) c );
c = getC( );
}
a.attrib = s.toString( );
c = skipSpace( c );
if ( c != '=' )
{
attribs.add( a );
return c;
}
s = new StringBuffer( );
c = skipSpace( getC( ) );
if ( c == '\'' || c == '"' )
{
int quote = c;
for ( ; ; )
{
c = getC( );
if ( c == -1 )
break;
if ( c == quote )
{
c = getC( );
break;
}
if ( c == '\\' )
{
c = getC( );
if ( c == EOF )
break;
s.append( '\\' );
s.append( (char) c );
}
else
{
s.append( (char) c );
}
}
}
else
{
for ( ; ; )
{
c = getC( );
if ( c == -1 )
break;
if ( c == '>' || c == '/' || Character.isWhitespace( (char)c ) )
{
c = getC( );
break;
}
s.append( (char) c );
}
}
a.value = s.toString( );
attribs.add( a );
return c;
}
class AttribPair
{
String attrib;
String value;
}
private int getSpecialElement( )
{
StringBuffer text = new StringBuffer( );
text.append( "<!" ); //$NON-NLS-1$
for ( ; ; )
{
int c = getC( );
if ( c == EOF || c == '>' )
break;
text.append( (char) c );
}
text.append( '>' );
token = text.toString( );
if ( token.startsWith( "<!--" ) ) //$NON-NLS-1$
return COMMENT;
return SPECIAL_ELEMENT;
}
static String formatTags[ ] =
{
"i", "b", //$NON-NLS-1$//$NON-NLS-2$
"strong", "em", //$NON-NLS-1$//$NON-NLS-2$
"code", "span", //$NON-NLS-1$ //$NON-NLS-2$
"a" //$NON-NLS-1$
};
public boolean isFormatTag( )
{
return isFormatTag( getElement( ) );
}
public boolean isFormatTag( String tag )
{
for ( int i = 0; i < formatTags.length; i++ )
{
if ( formatTags[ i ].equalsIgnoreCase( tag ) )
return true;
}
return false;
}
public Object getFullElement( )
{
StringBuffer text = new StringBuffer( );
text.append( '<' );
int elementType = getElementType( );
if ( elementType == END_ELEMENT )
text.append( '/' );
text.append( getElement( ) );
for ( int i = 0; i < attribs.size( ); i++ )
{
text.append( ' ' );
AttribPair a = (AttribPair) attribs.get( i );
text.append( a.attrib );
text.append( "=\"" ); //$NON-NLS-1$
if ( a.value != null )
text.append( a.value );
text.append( "\"" ); //$NON-NLS-1$
}
if ( elementType == SINGLE_ELEMENT )
text.append( '/' );
text.append( '>' );
return text.toString( );
}
public int getLineNo( )
{
return in.getLineNumber( );
}
public void ignoreWhitespace( boolean b )
{
ignoreWhitespace = b;
}
}
Related examples in the same category