babel.content.pages.Page.java Source code

Java tutorial

Introduction

Here is the source code for babel.content.pages.Page.java

Source

/**
 * This file is licensed to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package babel.content.pages;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;

import babel.prep.extract.NutchChunk;

import babel.util.language.Language;
import babel.util.persistence.XMLPersistable;

/**
 * A page identified by its URL, carrying page-level properties and a list of
 * page versions. A page can be written to / read from XML (see
 * {@link #persist(XMLStreamWriter)} and {@link #unpersist(XMLStreamReader)})
 * and serialized via {@link #write(DataOutput)} / {@link #readFields(DataInput)}.
 */
public class Page implements XMLPersistable, Writable {
    public static final Log LOG = LogFactory.getLog(Page.class);

    private static final String XML_TAG_PAGE = "Page";
    private static final String XML_TAG_PROPS = "MetaData";
    private static final String XML_TAG_VERSION = "PageVersion";
    private static final String XML_ATTRIB_URL = "URL";

    private static final String PROP_LANG = "Language";
    private static final String PROPS_NAME = "PageProperties";

    /**
     * Creates a page with an empty URL, no properties and no versions.
     */
    public Page() {
        this(null);
    }

    /**
     * Creates a page with no properties and no versions.
     *
     * @param url page URL; {@code null} is stored as the empty string
     */
    public Page(String url) {
        m_pageProps = new MetaData(PROPS_NAME);
        m_pageURL = (url == null) ? "" : url;
        m_versions = new ArrayList<PageVersion>();
    }

    /**
     * Creates a page from chunks, grouping them by segment id into page
     * versions. Only versions reporting {@code isNutchComplete()} are kept.
     *
     * @param url page URL; {@code null} is stored as the empty string
     * @param values chunks belonging to this page; {@code null} yields a page
     *        without versions
     */
    public Page(String url, Iterator<NutchChunk> values) {
        // Initialise every field before calling the overridable helpers below,
        // so they never observe a half-constructed page.
        this(url); // TODO: Is URL already normalized?

        HashMap<String, List<NutchChunk>> verChunks = splitIntoVersions(values);
        m_versions.ensureCapacity(verChunks.size());

        for (String segId : verChunks.keySet()) {
            PageVersion curVer = new PageVersion(segId, verChunks.get(segId), this);

            if (curVer.isNutchComplete()) {
                addVersion(curVer);
            }
        }

        m_versions.trimToSize();
    }

    /**
     * Adds page properties and versions from the given page. If a page property
     * already exists, the values are ignored. If a version already exists, keeps
     * whichever was fetched earlier.
     *
     * @param other page to merge in; must be non-null and have the same URL
     * @throws IllegalArgumentException if {@code other} is null or its URL differs
     */
    public void merge(Page other) {
        if (other == null || !m_pageURL.equals(other.m_pageURL)) {
            throw new IllegalArgumentException("null ref or different URL");
        }

        addProperties(other.m_pageProps);

        for (PageVersion ver : other.m_versions) {
            addVersion(ver);
        }
    }

    /**
     * @return page URL
     */
    public String pageURL() {
        return m_pageURL;
    }

    /**
     * @return page properties (the internal object, not a copy)
     */
    public MetaData pageProperties() {
        return m_pageProps;
    }

    /**
     * @return the result of {@code Language.fromString} applied to the first
     *         value of the language property
     */
    public Language getLanguage() {
        return Language.fromString(m_pageProps.getFirst(PROP_LANG));
    }

    /**
     * Replaces the language property. Logs a warning when a previously set
     * language is changed or removed.
     *
     * @param lang new language, or {@code null} to remove the property
     */
    public void setLanguage(Language lang) {
        String oldLang = m_pageProps.getFirst(PROP_LANG);
        String newLang = (lang != null) ? lang.toString() : null;

        m_pageProps.remove(PROP_LANG);

        if (newLang != null) {
            m_pageProps.set(PROP_LANG, newLang);
        }

        if (LOG.isWarnEnabled() && (oldLang != null) && !oldLang.equals(newLang)) {
            LOG.warn("Changing language of " + m_pageURL + " from " + oldLang + " to "
                    + (newLang == null ? "nothing." : newLang + "."));
        }
    }

    /**
     * Adds page properties. If a key is already contained, new values are
     * ignored.
     *
     * @param props properties to add; {@code null} is a no-op
     */
    public void addProperties(MetaData props) {
        if (props == null) {
            return;
        }

        for (String key : props.keys()) {
            if (!m_pageProps.hasKey(key)) {
                m_pageProps.add(key, props.get(key));
            }
        }
    }

    /**
     * @return number of versions held by this page
     */
    public int numVersions() {
        return (m_versions == null) ? 0 : m_versions.size();
    }

    /**
     * @return page versions (the live internal list; empty if the page has no
     *         versions)
     */
    public List<PageVersion> pageVersions() {
        return m_versions;
    }

    /**
     * Adds a page version to a page. If same version already exists (as
     * determined by {@code List.indexOf}, i.e. {@code PageVersion.equals}),
     * keeps whichever was fetched earlier.
     *
     * @param ver version to add; {@code null} is rejected
     * @return {@code true} if {@code ver} was stored, {@code false} if it was
     *         null or an existing equal version was kept instead
     */
    public boolean addVersion(PageVersion ver) {
        // A null entry would only fail later, in toString/persist/write.
        if (ver == null) {
            return false;
        }

        int idx = m_versions.indexOf(ver);

        if (idx < 0) {
            return m_versions.add(ver);
        }

        // If same page exists - keep the version that was fetched earlier
        Long fetchCur = m_versions.get(idx).getFetchTime();
        Long fetchOther = ver.getFetchTime();

        if (fetchCur != null && fetchOther != null && fetchCur > fetchOther) {
            m_versions.remove(idx);
            return m_versions.add(ver);
        }

        return false;
    }

    /**
     * @return the page URL followed by the string form of every version
     */
    @Override
    public String toString() {
        StringBuilder strBld = new StringBuilder();

        strBld.append("Page URL: ").append(m_pageURL).append("\n\n");

        for (PageVersion ver : m_versions) {
            strBld.append(ver.toString()).append("\n");
        }

        return strBld.toString();
    }

    /**
     * Writes this page as a {@code Page} element with a {@code URL} attribute,
     * followed by the properties (only when there is at least one key) and
     * every version.
     */
    public void persist(XMLStreamWriter writer) throws XMLStreamException {
        writer.writeStartElement(XML_TAG_PAGE);
        writer.writeAttribute(XML_ATTRIB_URL, m_pageURL);

        if (m_pageProps.numKeys() > 0) {
            m_pageProps.persist(writer);
        }

        for (PageVersion ver : m_versions) {
            ver.persist(writer);
        }

        writer.writeEndElement();
    }

    /**
     * Replaces the state of this page with the content of a {@code Page}
     * element. The reader must be positioned on that element's start tag
     * (its attributes are read from the current event); reading stops at the
     * matching {@code Page} end tag.
     */
    public void unpersist(XMLStreamReader reader) throws XMLStreamException {
        // persist() writes the URL by name, so read it by name; fall back to
        // the first attribute to keep accepting whatever the positional
        // lookup used to accept.
        String url = reader.getAttributeValue(null, XML_ATTRIB_URL);

        if (url == null && reader.getAttributeCount() > 0) {
            url = reader.getAttributeValue(0);
        }

        // Same normalisation as the constructors: the URL is never null.
        m_pageURL = (url == null) ? "" : url;

        // persist() omits the properties element when there are no keys, so
        // properties left over from a previous use must be dropped here.
        for (String key : m_pageProps.keys()) {
            m_pageProps.remove(key);
        }

        m_versions.clear();

        while (true) {
            int event = reader.next();

            if (event == XMLStreamConstants.END_ELEMENT && XML_TAG_PAGE.equals(reader.getName().toString())) {
                break;
            }

            if (event == XMLStreamConstants.START_ELEMENT) {
                String elemTag = reader.getName().toString();

                if (XML_TAG_PROPS.equals(elemTag)) {
                    m_pageProps.unpersist(reader);
                } else if (XML_TAG_VERSION.equals(elemTag)) {
                    PageVersion ver = new PageVersion();
                    ver.unpersist(reader);

                    m_versions.add(ver);
                }
            }
        }
    }

    /**
     * Reads the URL, the properties, a version count and then that many
     * versions, in the order produced by {@link #write(DataOutput)}.
     */
    public void readFields(DataInput in) throws IOException {
        m_pageURL = Text.readString(in);
        m_pageProps.readFields(in);

        int numVersions = WritableUtils.readVInt(in);
        m_versions = new ArrayList<PageVersion>(numVersions);

        for (int i = 0; i < numVersions; i++) {
            PageVersion curVer = new PageVersion();
            curVer.readFields(in);
            m_versions.add(curVer);
        }
    }

    /**
     * Writes the URL, the properties, the version count and every version.
     */
    public void write(DataOutput out) throws IOException {
        Text.writeString(out, m_pageURL);
        m_pageProps.write(out);

        WritableUtils.writeVInt(out, m_versions.size());

        for (PageVersion ver : m_versions) {
            ver.write(out);
        }
    }

    /**
     * Groups chunks by their segment id.
     *
     * @param values chunks to group; may be {@code null}
     * @return map from segment id to the chunks of that segment (empty if
     *         {@code values} is null or has no elements)
     */
    protected HashMap<String, List<NutchChunk>> splitIntoVersions(Iterator<NutchChunk> values) {
        HashMap<String, List<NutchChunk>> verChunks = new HashMap<String, List<NutchChunk>>();

        if (values == null) {
            return verChunks;
        }

        while (values.hasNext()) {
            // Each element is copied before being stored.
            // NOTE(review): presumably because the iterator may reuse its value object - confirm.
            NutchChunk curChunk = new NutchChunk(values.next());
            String curSegId = curChunk.getSegmentId();
            List<NutchChunk> curList = verChunks.get(curSegId);

            if (curList == null) {
                curList = new LinkedList<NutchChunk>();
                verChunks.put(curSegId, curList);
            }

            curList.add(curChunk);
        }

        return verChunks;
    }

    protected MetaData m_pageProps;
    protected String m_pageURL;
    protected ArrayList<PageVersion> m_versions;
}