// Java tutorial
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Locale;

import javax.swing.text.ChangedCharSetException;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Fetches an HTML page and prints an outline of its headings (H1-H6) to standard output as
 * nested {@code <ul>} lists.
 */
public class MainClass {

  /** Encoding assumed until the document itself declares a different one. */
  private static final String DEFAULT_ENCODING = "ISO-8859-1";

  /**
   * Parses the document at {@code url}, decoding it with {@code encoding}, and writes the heading
   * outline to {@code System.out}.
   *
   * @param url the document to outline
   * @param encoding the character encoding used to decode the document
   * @throws IOException if the document cannot be read or the encoding is not supported
   */
  private static void parse(URL url, String encoding) throws IOException {
    HTMLEditorKit.Parser parser = new ParserGetter().getParser();
    Outliner callback = new Outliner(new OutputStreamWriter(System.out));
    try (InputStream in = url.openStream();
        InputStreamReader r = new InputStreamReader(in, encoding)) {
      // The encoding is already settled, so charset declarations are ignored on this pass.
      parser.parse(r, callback, true);
    }
    // Outliner.flush() is idempotent, so this only closes lists left open when the parser
    // never reported an </html> end tag.
    callback.flush();
  }

  /**
   * Parses the document once with charset checking enabled to find the encoding it declares.
   *
   * @param url the document to inspect
   * @param defaultEncoding encoding used for the probe and returned when none is declared
   * @return the declared encoding, or {@code defaultEncoding} if none was declared or usable
   * @throws IOException if the document cannot be read
   */
  private static String detectEncoding(URL url, String defaultEncoding) throws IOException {
    HTMLEditorKit.Parser parser = new ParserGetter().getParser();
    HTMLEditorKit.ParserCallback doNothing = new HTMLEditorKit.ParserCallback();
    try (InputStream in = url.openStream();
        InputStreamReader r = new InputStreamReader(in, defaultEncoding)) {
      // With ignoreCharSet == false the parser throws ChangedCharSetException as soon as it
      // meets a <meta> element that declares a charset.
      parser.parse(r, doNothing, false);
    } catch (ChangedCharSetException ex) {
      return charsetFrom(ex, defaultEncoding);
    }
    return defaultEncoding;
  }

  /**
   * Extracts a usable charset name from a {@link ChangedCharSetException}.
   *
   * @param ex the exception raised by the parser
   * @param fallback the value returned when no supported charset name can be extracted
   * @return the charset name carried by {@code ex}, or {@code fallback}
   */
  static String charsetFrom(ChangedCharSetException ex, String fallback) {
    String spec = ex.getCharSetSpec();
    if (spec == null) {
      return fallback;
    }
    String name;
    if (ex.keyEqualsCharSet()) {
      // The spec is the charset name itself.
      name = spec;
    } else {
      // The spec is a content-type value such as "text/html; charset=UTF-8".
      String key = "charset=";
      int start = spec.toLowerCase(Locale.ROOT).indexOf(key);
      if (start < 0) {
        return fallback;
      }
      name = spec.substring(start + key.length());
      int end = name.indexOf(';');
      if (end >= 0) {
        name = name.substring(0, end);
      }
    }
    name = name.trim();
    // Drop optional quoting around the value.
    while (!name.isEmpty() && (name.charAt(0) == '"' || name.charAt(0) == '\'')) {
      name = name.substring(1);
    }
    while (!name.isEmpty()
        && (name.charAt(name.length() - 1) == '"' || name.charAt(name.length() - 1) == '\'')) {
      name = name.substring(0, name.length() - 1);
    }
    try {
      return Charset.isSupported(name) ? name : fallback;
    } catch (IllegalArgumentException illegalName) {
      // Empty or syntactically illegal charset name: keep the fallback.
      return fallback;
    }
  }

  /**
   * Outlines the headings of {@code http://www.java2s.com}.
   *
   * @param args ignored
   * @throws Exception if the page cannot be fetched or parsed
   */
  public static void main(String[] args) throws Exception {
    URL url = new URL("http://www.java2s.com");
    // First pass only discovers the encoding; second pass produces the outline.
    String encoding = detectEncoding(url, DEFAULT_ENCODING);
    parse(url, encoding);
  }
}

/**
 * Parser callback that writes the text of H1-H6 headings as list items inside nested
 * {@code <ul>} elements, one nesting level per heading level.
 *
 * <p>I/O errors raised by the writer are reported on {@code System.err} and otherwise ignored,
 * because the callback methods cannot throw checked exceptions.
 */
class Outliner extends HTMLEditorKit.ParserCallback {

  private static final String LINE_SEPARATOR = System.getProperty("line.separator", "\r\n");

  private final Writer out;

  /** Number of {@code <ul>} elements currently open; never negative. */
  private int level = 0;

  /** True while the parser is between a heading's start tag and its end tag. */
  private boolean inHeader = false;

  /**
   * Creates an outliner that writes to {@code out}.
   *
   * @param out destination of the generated outline
   */
  public Outliner(Writer out) {
    this.out = out;
  }

  /**
   * Maps a heading tag to its level.
   *
   * @return 1-6 for H1-H6, or 0 for any other tag
   */
  private static int headingLevel(HTML.Tag tag) {
    if (tag == HTML.Tag.H1) {
      return 1;
    } else if (tag == HTML.Tag.H2) {
      return 2;
    } else if (tag == HTML.Tag.H3) {
      return 3;
    } else if (tag == HTML.Tag.H4) {
      return 4;
    } else if (tag == HTML.Tag.H5) {
      return 5;
    } else if (tag == HTML.Tag.H6) {
      return 6;
    }
    return 0;
  }

  /**
   * On a heading start tag, opens or closes lists until the nesting depth equals the heading
   * level, then starts a new list item. Other tags are ignored.
   */
  @Override
  public void handleStartTag(HTML.Tag tag, MutableAttributeSet attributes, int position) {
    int newLevel = headingLevel(tag);
    if (newLevel == 0) {
      return;
    }
    this.inHeader = true;
    try {
      if (newLevel > this.level) {
        // Deeper heading: open one list (and item) per missing level.
        for (int i = 0; i < newLevel - this.level; i++) {
          out.write("<ul>" + LINE_SEPARATOR + "<li>");
        }
      } else {
        // Same or shallower heading: close the surplus lists, then start a sibling item.
        for (int i = 0; i < this.level - newLevel; i++) {
          out.write(LINE_SEPARATOR + "</ul>" + LINE_SEPARATOR);
        }
        out.write(LINE_SEPARATOR + "<li>");
      }
      this.level = newLevel;
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }

  /** Leaves heading mode on a heading end tag and closes all open lists on {@code </html>}. */
  @Override
  public void handleEndTag(HTML.Tag tag, int position) {
    if (headingLevel(tag) != 0) {
      inHeader = false;
    }
    // The parser does not call flush() itself, so finish the output at the end of the document.
    if (tag == HTML.Tag.HTML) {
      this.flush();
    }
  }

  /** Writes {@code text} to the output when it occurs inside a heading. */
  @Override
  public void handleText(char[] text, int position) {
    if (!inHeader) {
      return;
    }
    try {
      out.write(text);
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }

  /**
   * Closes every list that is still open and flushes the writer. Calling it again without
   * intervening headings writes nothing further.
   */
  @Override
  public void flush() {
    try {
      // Decrement inside the loop so level stops at exactly 0 rather than going negative,
      // which would make a later heading open surplus lists.
      while (this.level > 0) {
        out.write(LINE_SEPARATOR + "</ul>");
        this.level--;
      }
      out.flush();
    } catch (IOException e) {
      System.err.println(e);
    }
  }
}

/** Widens {@link HTMLEditorKit#getParser()} from protected to public so callers can use it. */
class ParserGetter extends HTMLEditorKit {

  /** Returns the parser supplied by {@link HTMLEditorKit}. */
  @Override
  public HTMLEditorKit.Parser getParser() {
    return super.getParser();
  }
}