org.apache.nutch.parse.oo.OOParser.java Source code

Introduction

Here is the source code for org.apache.nutch.parse.oo.OOParser.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.oo;

import java.io.*;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.*;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.jaxen.*;
import org.jaxen.jdom.JDOMXPath;
import org.jdom.*;
import org.jdom.input.*;

/**
 * Parser for OpenOffice and OpenDocument formats. This should handle
 * the following formats: Text, Spreadsheet, Presentation, and
 * corresponding templates and "master" documents.
 * 
 * @author Andrzej Bialecki
 */
public class OOParser implements Parser {
    public static final Log LOG = LogFactory.getLog(OOParser.class);

    /** Name of the archive entry that holds the document body. */
    private static final String CONTENT_ENTRY = "content.xml";

    /** Name of the archive entry that holds the document metadata. */
    private static final String META_ENTRY = "meta.xml";

    /** Namespace URI used to look up the <code>href</code> attribute of links. */
    private static final String XLINK_URI = "http://www.w3.org/1999/xlink";

    private Configuration conf;

    public OOParser() {
    }

    /** Stores the configuration that is later handed to empty (failed) parse results. */
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /** Returns the configuration previously set with {@link #setConf(Configuration)}. */
    public Configuration getConf() {
        return conf;
    }

    /**
     * Parses the raw bytes of <code>content</code> as a zip archive, extracting
     * plain text and outlinks from <code>content.xml</code> and metadata from
     * <code>meta.xml</code>. All other archive entries are ignored.
     *
     * @param content the fetched content to parse
     * @return a successful parse result keyed by the content URL; or an empty
     *         result with a FAILED status when the content is shorter/longer than
     *         its "Content-Length" metadata value, or when any exception is raised
     *         while reading the archive
     */
    public ParseResult getParse(Content content) {
        String text = null;
        Metadata metadata = new Metadata();
        List<Outlink> outlinks = new ArrayList<Outlink>();

        try {
            byte[] raw = content.getContent();
            String contentLength = content.getMetadata().get("Content-Length");
            // A malformed Content-Length raises NumberFormatException, which is
            // handled by the catch block below like any other parse failure.
            if (contentLength != null && raw.length != Integer.parseInt(contentLength.trim())) {
                return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                        "Content truncated at " + raw.length + " bytes. Parser can't handle incomplete files.")
                                .getEmptyParseResult(content.getUrl(), conf);
            }
            ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(raw));
            try {
                ZipEntry ze;
                while ((ze = zis.getNextEntry()) != null) {
                    String entryName = ze.getName();
                    if (CONTENT_ENTRY.equals(entryName)) {
                        text = parseContent(zis, outlinks);
                    } else if (META_ENTRY.equals(entryName)) {
                        parseMeta(zis, metadata);
                    }
                }
            } finally {
                // Close even when one of the entries fails to parse.
                zis.close();
            }
        } catch (Exception e) { // includes runtime exceptions raised by the XML libraries
            LOG.warn("Can't be handled as OO document: " + content.getUrl(), e);
            return new ParseStatus(ParseStatus.FAILED, "Can't be handled as OO document. " + e)
                    .getEmptyParseResult(content.getUrl(), conf);
        }

        String title = metadata.get(Metadata.TITLE);
        if (text == null) {
            text = "";
        }
        if (title == null) {
            title = "";
        }

        Outlink[] links = outlinks.toArray(new Outlink[outlinks.size()]);
        ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, links, metadata);
        return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    }

    /**
     * Wraps <code>in</code> so that <code>close()</code> becomes a no-op. The XML
     * builder is given this wrapper so that the underlying zip stream stays open
     * for reading the remaining archive entries.
     */
    private static InputStream nonClosing(InputStream in) {
        return new FilterInputStream(in) {
            public void close() {
                // intentionally empty: the caller owns and closes the wrapped stream
            }
        };
    }

    /**
     * Extracts as much plain text as possible from the current zip entry
     * (expected to be <code>content.xml</code>) and collects the links it contains.
     *
     * @param zis zip stream positioned at the entry to read; it is not closed
     * @param outlinks receives one {@link Outlink} per link that has a usable URL
     * @return the extracted text, possibly empty
     * @throws Exception if the XML cannot be read or the root element does not
     *         declare the <code>text</code> namespace prefix
     */
    private String parseContent(ZipInputStream zis, List<Outlink> outlinks) throws Exception {
        StringBuilder res = new StringBuilder();
        // NOTE(review): the builder runs with default settings. If input is untrusted,
        // consider restricting DTD / external entity resolution - but confirm first
        // that documents carrying a DOCTYPE still parse.
        SAXBuilder builder = new SAXBuilder();
        Document doc = builder.build(nonClosing(zis));
        Element root = doc.getRootElement();
        Namespace textNs = root.getNamespace("text");
        if (textNs == null) {
            // Fail with a readable message instead of a bare NullPointerException.
            throw new IOException(CONTENT_ENTRY + " does not declare the 'text' namespace prefix");
        }
        // XXX this is expensive for very large documents. In those cases another
        // XXX method (direct processing of SAX events, or XMLPull) should be used.
        XPath path = new JDOMXPath("//text:span | //text:p | //text:tab | //text:tab-stop | //text:a");
        path.addNamespace("text", textNs.getURI());
        Namespace xlink = Namespace.getNamespace("xlink", XLINK_URI);
        List list = path.selectNodes(doc);
        // true while the last thing appended was a paragraph (or nothing yet), i.e.
        // no separator is needed before the next piece of text
        boolean lastp = true;
        for (int i = 0; i < list.size(); i++) {
            Element el = (Element) list.get(i);
            String name = el.getName();
            String text = el.getText();
            if (name.equals("p")) {
                // skip empty paragraphs
                if (!text.equals("")) {
                    if (!lastp) {
                        res.append('\n');
                    }
                    res.append(text).append('\n');
                    lastp = true;
                }
            } else if (name.startsWith("tab")) {
                res.append('\t');
                lastp = false;
            } else if (name.equals("a")) {
                String a = firstSpanText(el);
                String u = el.getAttributeValue("href", xlink);
                if (u == null) {
                    u = a; // often anchors are URLs
                }
                if (u != null) {
                    // Without an href and without anchor text there is no URL at all,
                    // so no outlink is created in that case.
                    try {
                        outlinks.add(new Outlink(u, a));
                    } catch (MalformedURLException mue) {
                        // best effort: an unusable link must not fail the whole document
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Skipping malformed link '" + u + "': " + mue);
                        }
                    }
                }
                if (a != null && !a.equals("")) {
                    if (!lastp) {
                        res.append(' ');
                    }
                    res.append(a);
                    lastp = false;
                }
            } else {
                if (!text.equals("")) {
                    if (!lastp) {
                        res.append(' ');
                    }
                    res.append(text);
                }
                lastp = false;
            }
        }
        return res.toString();
    }

    /**
     * Returns the text of the first direct child of <code>link</code> whose
     * prefixed name is <code>text:span</code>, or <code>null</code> if there is none.
     */
    private static String firstSpanText(Element link) {
        List children = link.getChildren();
        for (int k = 0; k < children.size(); k++) {
            Element child = (Element) children.get(k);
            String nsName = child.getNamespacePrefix() + ":" + child.getName();
            if (nsName.equals("text:span")) {
                return child.getText();
            }
        }
        return null;
    }

    /**
     * Extracts metadata from the current zip entry (expected to be
     * <code>meta.xml</code>) and adds the recognised, non-blank values to
     * <code>metadata</code> under the corresponding {@link Metadata} keys.
     *
     * @param zis zip stream positioned at the entry to read; it is not closed
     * @param metadata receives the converted values
     * @throws Exception if the XML cannot be read or the root element does not
     *         declare the <code>office</code> namespace prefix
     */
    private void parseMeta(ZipInputStream zis, Metadata metadata) throws Exception {
        SAXBuilder builder = new SAXBuilder();
        Document doc = builder.build(nonClosing(zis));
        Element root = doc.getRootElement();
        Namespace officeNs = root.getNamespace("office");
        if (officeNs == null) {
            // Fail with a readable message instead of a bare NullPointerException.
            throw new IOException(META_ENTRY + " does not declare the 'office' namespace prefix");
        }
        XPath path = new JDOMXPath("/office:document-meta/office:meta/*");
        path.addNamespace("office", officeNs.getURI());
        List list = path.selectNodes(doc);
        for (int i = 0; i < list.size(); i++) {
            Element n = (Element) list.get(i);
            String text = n.getText();
            if (text.trim().equals("")) {
                continue;
            }
            String name = n.getName();
            if (name.equals("title")) {
                metadata.add(Metadata.TITLE, text);
            } else if (name.equals("language")) {
                metadata.add(Metadata.LANGUAGE, text);
            } else if (name.equals("creation-date")) {
                metadata.add(Metadata.DATE, text);
            } else if (name.equals("print-date")) {
                metadata.add(Metadata.LAST_PRINTED, text);
            } else if (name.equals("generator")) {
                metadata.add(Metadata.APPLICATION_NAME, text);
            } else if (name.equals("creator")) {
                metadata.add(Metadata.CREATOR, text);
            }
        }
    }

    /**
     * Command-line helper: parses the file named by <code>args[0]</code> as an
     * OpenDocument text file and prints the parse data and extracted text.
     *
     * @param args a single element, the path of the file to parse
     * @throws Exception if the file cannot be read
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: OOParser <file>");
            System.exit(-1);
        }
        OOParser oo = new OOParser();
        Configuration conf = NutchConfiguration.create();
        oo.setConf(conf);
        File file = new File(args[0]);
        byte[] bytes = new byte[(int) file.length()];
        // readFully keeps reading until the buffer is filled; a single read() call
        // is allowed to return fewer bytes than requested.
        DataInputStream in = new DataInputStream(new FileInputStream(file));
        try {
            in.readFully(bytes);
        } finally {
            in.close();
        }
        Content c = new Content("local", "local", bytes, "application/vnd.oasis.opendocument.text", new Metadata(),
                conf);
        Parse p = oo.getParse(c).get(c.getUrl());
        System.out.println(p.getData());
        System.out.println("Text: '" + p.getText() + "'");
        /*
        // create the test output file
        OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream("e:\\ootest.txt"), "UTF-8");
        osw.write(p.getText());
        osw.flush();
        osw.close();
        */
    }
}