Java tutorial
/* * Copyright (C) 2006 Jordi Marqus Ferr * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this software; see the file DUROTY.txt. * * Author: Jordi Marqus Ferr * c/Mallorca 295 principal B 08037 Barcelona Spain * Phone: +34 625397324 */ package com.duroty.lucene.parser; import com.duroty.lucene.parser.exception.ParserException; import com.duroty.utils.io.NullWriter; import org.apache.commons.io.IOUtils; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.dom.Text; import org.w3c.tidy.Tidy; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStream; import java.io.PrintWriter; import java.nio.charset.Charset; /** * DOCUMENT ME! * * @author jordi marques */ public class HtmlParser implements Parser { /** DOCUMENT ME! */ private InputStream input; /** DOCUMENT ME! */ private Element element; /** * DOCUMENT ME! */ private String title = null; /** * DOCUMENT ME! */ private String body = null; /** * DOCUMENT ME! */ private String charset = Charset.defaultCharset().displayName(); /** * Creates a new instance of HtmlParser */ public HtmlParser() { } /** * DOCUMENT ME! * * @param file DOCUMENT ME! * * @return DOCUMENT ME! * * @throws ParserException DOCUMENT ME! */ public String parse(File file) throws ParserException { try { input = new FileInputStream(file); } catch (FileNotFoundException e) { throw new ParserException(e); } return parse(); } /** * DOCUMENT ME! * * @param in DOCUMENT ME! * * @return DOCUMENT ME! * * @throws ParserException DOCUMENT ME! */ public String parse(InputStream in) throws ParserException { this.input = in; return parse(); } /** * DOCUMENT ME! * * @return DOCUMENT ME! * * @throws ParserException DOCUMENT ME! */ private String parse() throws ParserException { try { Tidy tidy = new Tidy(); tidy.setUpperCaseTags(true); tidy.setInputEncoding(charset); tidy.setOutputEncoding(Charset.defaultCharset().displayName()); tidy.setMakeBare(true); tidy.setMakeClean(true); tidy.setShowWarnings(false); tidy.setErrout(new PrintWriter(new NullWriter())); tidy.setXmlOut(false); tidy.setWord2000(true); tidy.setDropProprietaryAttributes(true); tidy.setFixBackslash(true); tidy.setXHTML(true); tidy.setWrapSection(true); tidy.setWrapScriptlets(true); tidy.setWrapPhp(true); tidy.setQuiet(true); org.w3c.dom.Document root = tidy.parseDOM(input, null); element = root.getDocumentElement(); StringBuffer buffer = new StringBuffer(); this.title = this.getTitle(); if ((this.title != null) && !this.title.trim().equals("")) { buffer.append(this.title); buffer.append('\n'); } this.body = this.getBody(); if ((this.body != null) && !this.body.trim().equals("")) { buffer.append(this.body); } return buffer.toString(); } catch (Exception ex) { throw new ParserException(ex); } finally { IOUtils.closeQuietly(input); } } /** * DOCUMENT ME! * * @return DOCUMENT ME! */ public String getTitle() { if (this.title != null) { return this.title; } if (element == null) { return null; } String title = null; NodeList nl = element.getElementsByTagName("title"); if (nl.getLength() > 0) { Element titleElement = ((Element) nl.item(0)); Text text = (Text) titleElement.getFirstChild(); if (text != null) { title = text.getData(); } } return title; } /** * DOCUMENT ME! * * @return DOCUMENT ME! */ public String getBody() { if (this.body != null) { return this.body; } if (element == null) { return null; } String body = ""; NodeList nl = element.getElementsByTagName("body"); if (nl.getLength() > 0) { body = getBodyText(nl.item(0)); } return body; } /** * DOCUMENT ME! * * @param node DOCUMENT ME! * * @return DOCUMENT ME! */ private String getBodyText(Node node) { NodeList nl = node.getChildNodes(); StringBuffer buffer = new StringBuffer(); for (int i = 0; i < nl.getLength(); i++) { Node child = nl.item(i); switch (child.getNodeType()) { case Node.ELEMENT_NODE: if (!child.getNodeName().toLowerCase().equals("script")) { buffer.append(getBodyText(child)); buffer.append(" \n"); } break; case Node.TEXT_NODE: buffer.append(((Text) child).getData()); break; } } return buffer.toString(); } /** * DOCUMENT ME! * * @param charset DOCUMENT ME! */ public void setCharset(String charset) { this.charset = charset; } /** * DOCUMENT ME! * * @param sleep DOCUMENT ME! */ public void setSleep(long sleep) { // TODO Auto-generated method stub } }