babel.content.pages.PageVersion.java Source code

Introduction

Here is the source code for babel.content.pages.PageVersion.java
Source

/**
 * This file is licensed to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package babel.content.pages;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;

import babel.prep.extract.NutchChunk;
import babel.util.language.Language;
import babel.util.persistence.XMLPersistable;

import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;

/**
 * Encapsulates all information extracted per page version by a nutch crawl.
 */
public class PageVersion implements XMLPersistable, Writable {
    public static final Log LOG = LogFactory.getLog(PageVersion.class);

    private static final String DEFAULT_CHARSET = "utf-8";

    private static final String XML_TAG_PAGEVERSION = "PageVersion";
    private static final String XML_TAG_OUT_LINKS = "OutgoingLinks";
    private static final String XML_TAG_LINK = "Link";
    private static final String XML_ATTRIB_ANCHOR = "Anchor";
    private static final String XML_TAG_CONTENT = "ParsedContent";

    private static final String PROP_TITLE = "Title";
    private static final String PROP_FETCH_TIME = "Fetched";
    private static final String PROP_MODIFIED_TIME = "Modified";
    private static final String PROP_SEGMENT_ID = "NutchSegment";

    public PageVersion() {
        m_verProps = new MetaData("VersionProperties");
        m_contentMeta = new MetaData("ContentMetadata");
        m_parseMeta = new MetaData("ParseMetadata");
        m_outLinks = null;
        m_content = new String();
    }

    /**
     * Used to construct a page version from a set of nutch page related objects.
     * Should only be used by NutchPageExtractor.
     */
    public PageVersion(String segmentId, List<NutchChunk> chunks, Page page) {
        this();

        Writable curVal;
        CrawlDatum curCD;
        Content curCT;
        ParseData curPD;
        ParseText curPT;

        // Store Segment ID
        m_verProps.set(PROP_SEGMENT_ID, segmentId);

        // Unwrap all of the page related information
        for (NutchChunk chunk : chunks) {
            curVal = chunk.get();

            if (curVal instanceof CrawlDatum) {
                // Get fetch information
                curCD = (CrawlDatum) curVal;

                if (curCD.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
                    m_verProps.set(PROP_FETCH_TIME, Long.toString(curCD.getFetchTime()));
                    m_verProps.set(PROP_MODIFIED_TIME, Long.toString(curCD.getModifiedTime()));
                }
            } else if (curVal instanceof Content) {
                // Get the original unparsed content
                curCT = (Content) curVal;

                try {
                    String str = new String(curCT.getContent(), DEFAULT_CHARSET);
                    Matcher m = Pattern.compile("<html[^>]*>", Pattern.CASE_INSENSITIVE).matcher(str);

                    if (m.find()) {
                        str = str.substring(m.start(), m.end());
                        m = Pattern.compile("\\slang\\s*=\\s*\"*([^\\s=\"]+)\"*", Pattern.CASE_INSENSITIVE)
                                .matcher(str);

                        if (m.find()) {
                            str = str.substring(m.start(1), m.end(1)).trim().toLowerCase();
                            if (str.length() > 0) {
                                page.setLanguage(Language.fromString(str));
                            }
                        }
                    }
                } catch (Exception e) {
                }
            } else if (curVal instanceof ParseData) {
                // Get data extracted from page content
                curPD = (ParseData) curVal;
                if (curPD.getStatus().isSuccess()) {
                    m_verProps.set(PROP_TITLE, curPD.getTitle());
                    m_parseMeta.setAll(curPD.getParseMeta());
                    m_contentMeta.setAll(curPD.getContentMeta());
                    m_outLinks = curPD.getOutlinks();
                }
            } else if (curVal instanceof ParseText) {
                // Get parsed content
                curPT = (ParseText) curVal;
                m_content = setStr(curPT.getText());
            } else if (LOG.isWarnEnabled()) {
                LOG.warn("Unrecognized type: " + curVal.getClass());
            }
        }
    }

    /**
     * Checks if the page version is complete (i.e. contains at least a nutch 
     * segment ID and parsed content).
     */
    public boolean isNutchComplete() {
        return ((m_content.length() > 0) && m_verProps.hasKey(PROP_SEGMENT_ID)
                && m_verProps.hasKey(PROP_FETCH_TIME));
    }

    public String getContent() {
        return m_content;
    }

    public Long getFetchTime() {
        String prop = m_verProps.getFirst(PROP_FETCH_TIME);
        return (prop != null ? Long.parseLong(prop) : null);
    }

    public Long getModificationTime() {
        String prop = m_verProps.getFirst(PROP_MODIFIED_TIME);
        return (prop != null ? Long.parseLong(prop) : null);
    }

    public void setModificationTime(Long time) {
        m_verProps.set(PROP_MODIFIED_TIME, Long.toString(time));
    }

    public String toString() {
        StringBuilder strBld = new StringBuilder();
        SimpleDateFormat dft = new SimpleDateFormat();
        String prop;

        prop = m_verProps.getFirst(PROP_SEGMENT_ID);
        strBld.append("PageVersion from Segment: " + (prop != null ? prop : "-") + "\n");
        prop = m_verProps.getFirst(PROP_TITLE);
        strBld.append("  Title: " + (prop != null ? prop : "-") + "\n");
        prop = m_verProps.getFirst(PROP_FETCH_TIME);
        strBld.append("  Fetched: " + (prop != null ? dft.format(new Date(Long.parseLong(prop))) : "-") + "\n");
        strBld.append("  Content Metadata: " + ((m_contentMeta.numKeys() > 0) ? "present" : "absent") + "\n");
        strBld.append("  Parse Metadata: " + ((m_parseMeta.numKeys() > 0) ? "present" : "absent") + "\n");
        strBld.append("  Outgoing Links: " + ((m_outLinks == null) ? "0" : m_outLinks.length) + "\n");
        strBld.append("  Parsed Content: " + ((m_content.length() > 0) ? "present" : "absent") + "\n");

        return strBld.toString();
    }

    /**
     * Considers two version equal if content is the same.
     */
    public boolean equals(Object obj) {
        return (obj instanceof PageVersion) && (m_content.equals(((PageVersion) obj).m_content));
    }

    /**
     * To be called by ObjectWriter only.
     * @throws XMLStreamException 
     */
    public void persist(XMLStreamWriter writer) throws XMLStreamException {
        writer.writeStartElement(XML_TAG_PAGEVERSION);

        if (m_verProps.numKeys() > 0) {
            m_verProps.persist(writer);
        }

        if (m_parseMeta.numKeys() > 0) {
            m_parseMeta.persist(writer);
        }

        if (m_contentMeta.numKeys() > 0) {
            m_contentMeta.persist(writer);
        }

        if (m_outLinks != null) {
            String anchor;

            writer.writeStartElement(XML_TAG_OUT_LINKS);

            for (Outlink outlink : m_outLinks) {
                writer.writeStartElement(XML_TAG_LINK);
                anchor = outlink.getAnchor();

                if ((anchor != null) && (anchor.length() != 0)) {
                    writer.writeAttribute(XML_ATTRIB_ANCHOR, anchor);
                }

                writer.writeCharacters(outlink.getToUrl());
                writer.writeEndElement();
            }

            writer.writeEndElement();
        }

        if (m_content.length() > 0) {
            writer.writeStartElement(XML_TAG_CONTENT);
            writer.writeCharacters(new String(Base64.encodeBase64(m_content.getBytes())));
            //writer.writeCharacters(m_content);
            writer.writeEndElement();
        }

        writer.writeEndElement();
    }

    public void readFields(DataInput in) throws IOException {
        m_verProps.readFields(in);
        m_contentMeta.readFields(in);
        m_parseMeta.readFields(in);

        int numLinks = WritableUtils.readVInt(in);

        m_outLinks = (numLinks == 0) ? null : new Outlink[numLinks];

        for (int i = 0; i < numLinks; i++) {
            (m_outLinks[i] = new Outlink()).readFields(in);
        }

        m_content = Text.readString(in);
    }

    public void write(DataOutput out) throws IOException {
        m_verProps.write(out);
        m_contentMeta.write(out);
        m_parseMeta.write(out);

        int numLinks = (m_outLinks == null) ? 0 : m_outLinks.length;

        WritableUtils.writeVInt(out, numLinks);

        for (int i = 0; i < numLinks; i++) {
            m_outLinks[i].write(out);
        }

        Text.writeString(out, m_content);
    }

    public void unpersist(XMLStreamReader reader) throws XMLStreamException {
        String elemTag;
        String elemAttrib;

        m_verProps.clear();
        ;
        m_contentMeta.clear();
        m_parseMeta.clear();
        m_outLinks = null;
        m_content = new String();

        while (true) {
            int event = reader.next();

            if (event == XMLStreamConstants.END_ELEMENT
                    && XML_TAG_PAGEVERSION.equals(reader.getName().toString())) {
                break;
            }

            if (event == XMLStreamConstants.START_ELEMENT) {
                elemTag = reader.getName().toString();
                elemAttrib = reader.getAttributeValue(0);

                if ("MetaData".equals(elemTag)) {
                    if ("VersionProperties".equals(elemAttrib)) {
                        m_verProps.unpersist(reader);
                    } else if ("ContentMetadata".equals(elemAttrib)) {
                        m_contentMeta.unpersist(reader);
                    } else if ("ParseMetadata".equals(elemAttrib)) {
                        m_parseMeta.unpersist(reader);
                    }
                } else if (XML_TAG_CONTENT.equals(elemTag)) {
                    //m_content = new String(Base64.decodeBase64(reader.getElementText().getBytes()));
                    m_content = reader.getElementText();
                }

                //TODO: Not reading the out links
            }
        }
    }

    protected String setStr(String val) {
        return (val == null) ? new String() : val;
    }

    protected MetaData m_verProps;
    protected MetaData m_contentMeta;
    protected MetaData m_parseMeta;
    protected Outlink[] m_outLinks;
    protected String m_content;
}