org.jlibrary.core.search.extraction.html.HTMLRipper.java Source code

Introduction

Here is the source code for org.jlibrary.core.search.extraction.html.HTMLRipper.java, a small crawler from jLibrary that saves an HTML page (and, optionally, the images and stylesheets it references) to the local filesystem, rewriting resource URLs so the saved copy stays browsable.
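
As a quick orientation before the full listing, here is a minimal usage sketch. The page URL and the output directory are hypothetical placeholders; crawlFileWithResources() fetches the page together with its images and stylesheets and returns every file it wrote:

    import java.io.File;

    import org.jlibrary.core.search.extraction.html.HTMLRipper;

    public class HTMLRipperDemo {
        public static void main(String[] args) throws Exception {
            HTMLRipper ripper = new HTMLRipper();
            // Hypothetical URL and target directory; adjust for your setup.
            File[] files = ripper.crawlFileWithResources(
                    new File("/tmp/crawl-output"), "http://example.com/index.html");
            for (File f : files) {
                System.out.println("saved " + f.getAbsolutePath());
            }
        }
    }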

Source

/*
* jLibrary, Open Source Document Management System
* 
* Copyright (c) 2003-2006, Martín Pérez Mariñán, Blandware (represented by
* Andrey Grebnev), and individual contributors as indicated by the
* @authors tag. See copyright.txt in the distribution for a full listing of
* individual contributors. All rights reserved.
* 
* This is free software; you can redistribute it and/or modify it
* under the terms of the Modified BSD License as published by the Free 
* Software Foundation.
* 
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Modified
* BSD License for more details.
* 
* You should have received a copy of the Modified BSD License along with 
* this software; if not, write to the Free Software Foundation, Inc., 
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA, or see the
* FSF site: http://www.fsf.org.
*/
package org.jlibrary.core.search.extraction.html;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.htmlparser.Attribute;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.jlibrary.core.entities.Types;
import org.jlibrary.core.util.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class HTMLRipper {
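    // NOTE: parser and crawl state are kept in static fields, so this class
    // is not safe for concurrent use from multiple threads.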

    static Logger logger = LoggerFactory.getLogger(HTMLRipper.class);

    private static Parser mParser;
    private static String mSource;

    private static HashSet<String> resources;
    private static String resourcesDirectory = "";

    protected static final int TRANSFER_SIZE = 4096;

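    /**
     * Downloads the page at the given URL into the parent directory,
     * lazily initializing the shared parser on first use.
     *
     * @param parent local directory in which the page will be saved
     * @param url URL of the page to download
     * @return the file the page was saved to
     * @throws ParserException if the page cannot be fetched or parsed
     */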
    public File crawlFile(File parent, String url) throws ParserException {

        if (mParser == null) {
            initParser();
        }

        return crawl(parent, url);
    }

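    /**
     * Downloads the page at the given URL together with the resources it
     * references (images and linked stylesheets), which are saved into a
     * subdirectory derived from the page name.
     *
     * @param parent local directory in which the files will be saved
     * @param url URL of the page to download
     * @return the downloaded files, with the page itself first
     * @throws ParserException if the page cannot be fetched or parsed
     */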
    public File[] crawlFileWithResources(File parent, String url) throws ParserException {

        if (mParser == null) {
            initParser();
        }

        ArrayList<File> crawled = new ArrayList<File>();
        resources.clear();
        resourcesDirectory = getResourcesDirectory(url);

        File file = crawl(parent, url);
        crawled.add(file);

        if (resources.size() > 0) {
            File directory = new File(parent, resourcesDirectory);
            if (!directory.exists()) {
                directory.mkdir();
            }
            Iterator<String> it = resources.iterator();
            while (it.hasNext()) {
                String resourceURL = it.next();
                crawled.add(crawlResource(directory, resourceURL));
            }
        }
        return crawled.toArray(new File[0]);
    }

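    /**
     * Derives the name of the resources subdirectory from the page URL by
     * stripping the file extension and keeping the last path segment, so a
     * URL ending in "page.html" typically yields a directory named "page".
     */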
    private String getResourcesDirectory(String url) {

        String extension = FileUtils.getExtension(url);
        url = StringUtils.replace(url, extension, "");
        int i = url.lastIndexOf('/');
        url = url.substring(i + 1);
        int j = url.lastIndexOf('.');
        return url.substring(j + 1);
    }

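    /**
     * Downloads a single resource (for example an image or stylesheet) into
     * the given directory, copying it in TRANSFER_SIZE chunks. Broken links
     * are logged and skipped rather than treated as fatal.
     *
     * @param parent directory in which the resource will be saved
     * @param url URL of the resource to download
     * @return the local file, or null if the URL was malformed
     */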
    protected File crawlResource(File parent, String url) {

        byte[] data;
        InputStream in;
        FileOutputStream out;
        int read;
        File file = null;

        if (logger.isDebugEnabled()) {
            logger.debug("[HTMLRipper] processing " + url);
        }

        try {
            URL source = new URL(url);
            file = new File(parent, makeLocalLink(url, url, ""));
            data = new byte[TRANSFER_SIZE];
            try {
                in = source.openStream();
                try {
                    out = new FileOutputStream(file);
                    try {
                        while (-1 != (read = in.read(data, 0, data.length)))
                            out.write(data, 0, read);
                    } finally {
                        out.close();
                    }
                } catch (FileNotFoundException fnfe) {
                    logger.error(fnfe.getMessage(), fnfe);
                } finally {
                    in.close();
                }
            } catch (FileNotFoundException fnfe) {
                logger.warn("broken link " + fnfe.getMessage() + " ignored");
            }
        } catch (MalformedURLException murle) {
            logger.error(murle.getMessage(), murle);
        } catch (IOException ioe) {
            logger.error(ioe.getMessage(), ioe);
        }
        return file;
    }

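    /**
     * Saves the page at the given URL below the parent directory. Browsable
     * types (HTML and the like) are run through the parser so that resource
     * URLs are rewritten; all other content is copied byte for byte.
     *
     * @param parent local directory in which the page will be saved
     * @param url URL of the page to download
     * @return the file the page was saved to
     * @throws ParserException if fetching, parsing or copying fails
     */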
    private File crawl(File parent, String url) throws ParserException {

        if (logger.isDebugEnabled()) {
            logger.debug("[HTMLRipper] processing " + url);
        }

        mSource = url;
        NodeList list = null;
        File file = null;
        File dir = null;
        PrintWriter out = null;

        try {
            // save the page locally
            file = new File(parent, makeLocalLink(url, url, ""));
            dir = file.getParentFile();
            if (!dir.exists())
                dir.mkdirs();
            else if (!dir.isDirectory()) {
                dir = new File(dir.getParentFile(), dir.getName() + ".content");
                if (!dir.exists())
                    dir.mkdirs();
                file = new File(dir, file.getName());
            }
            FileOutputStream fos = null;
            // If the file is known we will download it with html crawler
            if (Types.isBrowsable(Types.getTypeForFile(file.getAbsolutePath()))) {
                try {
                    // fetch the page and gather the list of nodes
                    mParser.setURL(url);
                    try {
                        list = new NodeList();
                        for (NodeIterator e = mParser.elements(); e.hasMoreNodes();)
                            list.add(e.nextNode()); // URL conversion occurs in the tags
                    } catch (EncodingChangeException ece) {
                        // fix for bug #998195: SiteCapturer just crashed;
                        // try again with the encoding now set correctly
                        mParser.reset();
                        list = new NodeList();
                        for (NodeIterator e = mParser.elements(); e.hasMoreNodes();)
                            list.add(e.nextNode());
                    }
                    fos = new FileOutputStream(file);
                    out = new PrintWriter(fos);
                    for (int i = 0; i < list.size(); i++)
                        out.print(list.elementAt(i).toHtml());
                } catch (FileNotFoundException fnfe) {
                    logger.error(fnfe.getMessage(), fnfe);
                } finally {
                    try {
                        // close the writer first so any buffered output is
                        // flushed before the underlying stream is closed
                        if (out != null) {
                            out.close();
                        }
                        if (fos != null) {
                            fos.close();
                        }
                    } catch (IOException ioe) {
                        logger.error(ioe.getMessage(), ioe);
                        throw new ParserException(ioe);
                    }
                }
            } else {
                InputStream stream = null;
                try {
                    stream = new URL(url).openStream();
                    fos = new FileOutputStream(file);
                    IOUtils.copy(stream, fos);
                } catch (IOException ioe) {
                    logger.error(ioe.getMessage(), ioe);
                    throw new ParserException(ioe);
                } finally {
                    try {
                        if (fos != null) {
                            fos.close();
                        }
                        if (stream != null) {
                            stream.close();
                        }
                    } catch (IOException ioe) {
                        logger.error(ioe.getMessage(), ioe);
                        throw new ParserException(ioe);
                    }
                }
            }
        } catch (ParserException pe) {
            String message;

            // this exception handling is suboptimal,
            // but it recognizes resources that aren't text/html
            message = pe.getMessage();
            if ((null != message) && (message.endsWith("does not contain text"))) {
                // do nothing
            } else
                throw pe;
        }
        return file;
    }

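    /**
     * Creates the shared parser and registers the local tag subclasses so
     * that links, frames, images, base tags and stylesheet links are
     * intercepted during parsing.
     */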
    private void initParser() {

        PrototypicalNodeFactory factory;

        mParser = new Parser();
        factory = new PrototypicalNodeFactory();
        factory.registerTag(new LocalLinkTag());
        factory.registerTag(new LocalFrameTag());
        factory.registerTag(new LocalBaseHrefTag());
        factory.registerTag(new LocalImageTag());
        factory.registerTag(new LocalMetaLinkTag());
        mParser.setNodeFactory(factory);

        resources = new HashSet<String>();
    }

    /**
     * Converts a link to a local one.
     * A relative link can be used to construct both a URL and a file name.
     * Basically, the operation is to strip off the base URL, if any,
     * and then prepend as many dot-dots as necessary to make
     * it relative to the current page.
     * A bit of a kludge handles the root page specially by calling it
     * index.html, even though that probably isn't its real file name.
     * This isn't pretty, but it works for me.
     * @param source The base (source) URL being crawled.
     * @param link The link to make relative.
     * @param current The current page URL, or empty if it's an absolute URL
     * that needs to be converted.
     * @return The URL relative to the current page.
     */
    protected String makeLocalLink(String source, String link, String current) {
        int i;
        int j;
        String ret;

        link = cleanString(link);
        source = cleanString(source);

        if (link.equals(source) || (!source.endsWith("/") && link.equals(source + "/"))) {
            if (source.endsWith("/")) {
                ret = "index.html";
            } else {
                int k = source.lastIndexOf('/');
                int z = source.indexOf('/');
                if (z == k - 1) {
                    // handle root urls without end '/' (http://jlibrary.sourceforge.net)
                    ret = "index.html";
                } else {
                    ret = source.substring(k);
                }
            }
        } else if (link.startsWith(source) && (link.length() > source.length()))
            ret = link.substring(source.length() + 1);
        else
            ret = link; // give up

        // make it relative to the current page by prepending "../" for
        // each '/' in the current local path
        if ((null != current) && link.startsWith(source) && (current.length() > source.length())) {
            current = current.substring(source.length() + 1);
            i = 0;
            while (-1 != (j = current.indexOf('/', i))) {
                ret = "../" + ret;
                i = j + 1;
            }
        }

        int dotIndex = ret.indexOf(".");
        if (dotIndex == -1) {
            ret = ret + ".html";
        }

        return (ret);
    }

    /**
     * Link tag that rewrites the HREF.
     * The HREF is changed to a local target if it matches the source.
     */
    class LocalLinkTag extends LinkTag {
        private static final long serialVersionUID = 1L;

        public void doSemanticAction() throws ParserException {
            String link;

            // get the link
            link = getLink();
            // check if it needs to be captured
            if (isToBeCaptured(link)) {
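                // no action: unlike images and stylesheets, page links are
                // detected but not rewritten here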
            }
        }
    }

    /**
     * Frame tag that rewrites the SRC URLs.
     * The SRC URLs are mapped to local targets if they match the source.
     */
    class LocalFrameTag extends FrameTag {
        private static final long serialVersionUID = 1L;

        public void doSemanticAction() throws ParserException {
            String link;

            // get the link
            link = getFrameLocation();
            // check if it needs to be captured
            if (isToBeCaptured(link)) {
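                // no action: frame locations are detected but not rewritten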
            }
        }
    }

    /**
     * Image tag that rewrites the SRC URL.
     * The image URL is recorded as a resource to download and the SRC is
     * rewritten to point into the local resources directory.
     */
    class LocalImageTag extends ImageTag {
        private static final long serialVersionUID = 1L;

        public void doSemanticAction() throws ParserException {
            String image = getImageURL();
            resources.add(image);

            int i = image.lastIndexOf('/');
            String imageName = image.substring(i);

            setImageURL(resourcesDirectory + imageName);
        }
    }

    /**
     * Base tag that doesn't show.
     * The toHtml() method is overridden to return an empty string,
     * effectively shutting off the base reference.
     */
    class LocalBaseHrefTag extends BaseHrefTag {
        private static final long serialVersionUID = 1L;

        // we don't want to have a base pointing back at the source page
        public String toHtml() {
            return ("");
        }
    }

    /**
     * Returns <code>true</code> if the link is one we are interested in.
     * @param link The link to be checked.
     * @return <code>true</code> if the link has the source URL as a prefix
     * and doesn't contain '?' or '#'; the former because we won't be able to
     * handle server side queries in the static target directory structure and
     * the latter because presumably the full page with that reference has
     * already been captured previously. This performs a case insensitive
     * comparison, which is cheating really, but it's cheap.
     */
    protected boolean isToBeCaptured(String link) {
        return (link.toLowerCase().startsWith(mSource.toLowerCase()) && (-1 == link.indexOf("?"))
                && (-1 == link.indexOf("#")));
    }

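    /**
     * Simple manual test that crawls a page from a locally running jLibrary
     * instance into /temp.
     */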
    public static void main(String[] args) {

        String url = "http://localhost:4277/jlibrary/index.html";
        HTMLRipper ripper = new HTMLRipper();
        try {
            ripper.crawlFileWithResources(new File("/temp"), url);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
    }

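    /**
     * LINK tag handler that captures linked stylesheets.
     * Stylesheet URLs are recorded as resources to download and their HREFs
     * are rewritten to point into the local resources directory.
     */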
    class LocalMetaLinkTag extends TagNode {

        private static final long serialVersionUID = 1L;

        private final String[] mIds = new String[] { "LINK" };

        public LocalMetaLinkTag() {
        }

        public String[] getIds() {
            return (mIds);
        }

        public String getRel() {
            return (getAttribute("REL"));
        }

        public String getHref() {
            return (getAttribute("HREF"));
        }

        public String getType() {
            return (getAttribute("TYPE"));
        }

        public String getMedia() {
            return (getAttribute("MEDIA"));
        }

        public String getMetaLinkTagName() {
            return (getAttribute("NAME"));
        }

        public void setRel(String rel) {
            Attribute att;
            att = getAttributeEx("REL");
            if (null != att)
                att.setValue(rel);
            else
                getAttributesEx().add(new Attribute("REL", rel));
        }

        public void setHref(String href) {
            Attribute att;
            att = getAttributeEx("HREF");
            if (null != att)
                att.setValue(href);
            else
                getAttributesEx().add(new Attribute("HREF", href));
        }

        public void setType(String type) {
            Attribute att;
            att = getAttributeEx("TYPE");
            if (null != att)
                att.setValue(type);
            else
                getAttributesEx().add(new Attribute("TYPE", type));
        }

        public void setMedia(String media) {
            Attribute att;
            att = getAttributeEx("MEDIA");
            if (null != att)
                att.setValue(media);
            else
                getAttributesEx().add(new Attribute("MEDIA", media));
        }

        public void setMetaTagName(String metaTagName) {
            Attribute name;
            name = getAttributeEx("NAME");
            if (null != name)
                name.setValue(metaTagName);
            else
                getAttributesEx().add(new Attribute("NAME", metaTagName));
        }

        public void doSemanticAction() throws ParserException {
            String type = getType();
            // guard against a missing TYPE attribute (getType() may return null)
            if ("text/css".equals(type)) {
                String url = getPage().getAbsoluteURL(getHref());
                resources.add(url);

                int i = url.lastIndexOf('/');
                String hrefName = url.substring(i);
                setHref(resourcesDirectory + hrefName);
            }
        }
    }

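    /**
     * Replaces characters that are unsafe in local file names ('?' and '&')
     * with underscores.
     */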
    private String cleanString(String source) {

        source = StringUtils.replaceChars(source, '?', '_');
        source = StringUtils.replaceChars(source, '&', '_');

        return source;
    }
}