org.archive.wayback.util.htmllex.ParseContext.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.wayback.util.htmllex.ParseContext.java

Source

/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.util.htmllex;

import java.net.URISyntaxException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.url.UsableURI;
import org.archive.url.UsableURIFactory;
import org.archive.wayback.core.CaptureSearchResult;

/**
 * Tracks the context and state involved with parsing an HTML document via
 * SAX events.
 *
 * <p>Also holds the page's base URL and provides URL resolving
 * functionality against it.</p>
 *
 * <p>Lastly, this class exposes a general purpose
 * {@code Map<String,String>} for use by specific applications.</p>
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class ParseContext {
    private static final Logger LOGGER = Logger.getLogger(ParseContext.class.getName());

    /** Base URI used by {@link #resolve(String)}; {@code null} until one is set. */
    protected UsableURI baseUrl = null;

    private boolean inHTML = false;
    private boolean inCSS = false;
    private boolean inJS = false;
    private boolean inScriptText = false;

    /** Application-specific key/value storage; also handed out by {@link #getMap()}. */
    private final Map<String, String> data = new HashMap<String, String>();

    /**
     * Creates a context with no base URL, all state flags {@code false} and
     * an empty data map.
     */
    public ParseContext() {
    }

    /**
     * Stores an arbitrary key/value pair in this ParseContext.
     * @param key for storage
     * @param value for storage
     */
    public void putData(String key, String value) {
        data.put(key, value);
    }

    /**
     * Retrieves previously stored data for {@code key} from this ParseContext.
     * @param key under which value was stored
     * @return previously stored value for key or null, if nothing was stored
     */
    public String getData(String key) {
        return data.get(key);
    }

    /**
     * Returns the live backing map, not a copy: changes made through the
     * returned map are visible via {@link #getData(String)} and vice versa.
     * @return the full Map of String to String for this parsing context.
     */
    public Map<String, String> getMap() {
        return data;
    }

    /**
     * Sets the base URL against which relative URLs are resolved.
     * <p>If {@code baseURL} cannot be parsed, a warning is logged and the
     * previously set base URL (possibly none) is left unchanged.</p>
     * @param baseURL a base URL for relative URLs
     */
    public void setBaseUrl(String baseURL) {
        try {
            baseUrl = UsableURIFactory.getInstance(baseURL);
        } catch (URIException ex) {
            // Best effort: keep the previous base URL, but report the failure
            // through the class logger rather than on stderr.
            LOGGER.log(Level.WARNING, "FAILED SET BASE URL: base(" + baseURL + ")", ex);
        }
    }

    /**
     * Sets the base URL from a {@link URL}, using its external form.
     * @param url against which relative URLs should be resolved for this parse
     */
    public void setBaseUrl(URL url) {
        setBaseUrl(url.toExternalForm());
    }

    /**
     * Resolves a possibly-relative {@code url} against the {@code baseUrl}
     * set on this object.
     * <p>Any fragment (the first {@code '#'} and everything after it) is
     * split off before resolution and re-appended verbatim to the result.
     * If no base URL is set, or if resolution fails, the input is returned
     * unchanged; a failure is logged as a warning.</p>
     * <p>Caveat: this method no longer unescapes HTML entities in
     * {@code url}. HTML entities must all be unescaped before calling this
     * method.</p>
     * @param url which should be resolved
     * @return absolute URL, or the input itself when it cannot be resolved
     * @throws URISyntaxException declared in the signature; resolution
     *         failures reported as {@code URIException} are handled here
     *         and not propagated
     */
    public String resolve(String url) throws URISyntaxException {
        int hashIdx = url.indexOf('#');
        String frag = "";
        if (hashIdx != -1) {
            frag = url.substring(hashIdx);
            url = url.substring(0, hashIdx);
        }

        if (baseUrl == null) {
            // Nothing to resolve against: hand the input back as it came in.
            return url + frag;
        }

        try {
            url = UsableURIFactory.getInstance(baseUrl, url).toString() + frag;
        } catch (URIException e) {
            LOGGER.warning("FAILED RESOLVE: base(" + baseUrl + ") frag(" + url + ") error(" + e.getMessage() + ")");
            url = url + frag;
        }
        return url;
    }

    /**
     * Resolves {@code url} via {@link #resolve(String)}, except for URLs that
     * start with {@code javascript:} or {@code #}, which are returned as is.
     * @param url which should be resolved.
     * @return absolute form of input url, or url itself if it is a
     *         {@code javascript:} URL, a fragment-only reference, or
     *         {@code resolve} throws
     */
    public String contextualizeUrl(String url) {
        if (url.startsWith("javascript:") || url.startsWith("#")) {
            return url;
        }
        try {
            return resolve(url);
        } catch (URISyntaxException e) {
            // Fall back to the unresolved input, reporting through the class
            // logger rather than on stderr.
            LOGGER.log(Level.WARNING, "FAILED CONTEXTUALIZE: url(" + url + ")", e);
            return url;
        }
    }

    /**
     * Set to {@code true} when any HTML open tag is found.
     * <p>Used for checking if the content really looks like an HTML
     * document.</p>
     * @param inHTML the inHTML to set
     */
    public void setInHTML(boolean inHTML) {
        this.inHTML = inHTML;
    }

    /**
     * @return the inHTML
     */
    public boolean isInHTML() {
        return inHTML;
    }

    /**
     * @return the inCSS
     */
    public boolean isInCSS() {
        return inCSS;
    }

    /**
     * @param inCSS the inCSS to set
     */
    public void setInCSS(boolean inCSS) {
        this.inCSS = inCSS;
    }

    /**
     * @return the inJS
     */
    public boolean isInJS() {
        return inJS;
    }

    /**
     * @param inJS the inJS to set
     */
    public void setInJS(boolean inJS) {
        this.inJS = inJS;
    }

    /**
     * @return the inScriptText
     */
    public boolean isInScriptText() {
        return inScriptText;
    }

    /**
     * @param inScriptText the inScriptText to set
     */
    public void setInScriptText(boolean inScriptText) {
        this.inScriptText = inScriptText;
    }

    /**
     * @return the value stored in this context's data map under
     *         {@code CaptureSearchResult.CAPTURE_ORACLE_POLICY}, or null if
     *         nothing was stored
     */
    public String getOraclePolicy() {
        return getData(CaptureSearchResult.CAPTURE_ORACLE_POLICY);
    }

    /**
     * @param policy value to store in this context's data map under
     *        {@code CaptureSearchResult.CAPTURE_ORACLE_POLICY}
     */
    public void setOraclePolicy(String policy) {
        putData(CaptureSearchResult.CAPTURE_ORACLE_POLICY, policy);
    }
}