com.armorize.hackalert.extractor.msword.MSExtractor.java Source code

Java tutorial

Introduction

Here is the source code for com.armorize.hackalert.extractor.msword.MSExtractor.java, an abstract base class for extracting text and summary properties from Microsoft documents.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.armorize.hackalert.extractor.msword;

// JDK imports
import java.io.InputStream;
import java.util.Date;
import java.util.Properties;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.nutch.metadata.DublinCore;
import org.apache.nutch.metadata.HttpHeaders;
import org.apache.nutch.metadata.Office;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;

/**
 * Defines a Microsoft document content extractor.
 *
 * <p>{@link #extract(InputStream)} makes two passes over the input stream: one
 * through a POIFS event reader that collects the summary-information
 * properties, and one through the subclass-supplied
 * {@link #extractText(InputStream)} that collects the text. The results are
 * then available from {@link #getProperties()} and {@link #getText()}.
 *
 * @author Jérôme Charron
 */
public abstract class MSExtractor {

    protected static final Log LOG = LogFactory.getLog(MSExtractor.class);

    /** Text returned by the last {@link #extractText(InputStream)} call; null before any extraction. */
    private String text = null;
    /** Event reader created by the last {@link #extract(InputStream)} call. */
    private POIFSReader reader = null;
    /** Hand-off point for the properties collected by the reader listener; null before any extraction. */
    private PropertiesBroker properties = null;

    /** Constructs a new Microsoft document extractor. */
    protected MSExtractor() {
    }

    /**
     * Extracts properties and text from an MS Document input stream.
     *
     * <p>{@code input.reset()} is invoked before each of the two passes, so the
     * caller must supply a stream on which {@code reset()} succeeds.
     *
     * @param input the document stream
     * @throws Exception if resetting or reading the stream fails, or if
     *         {@link #extractText(InputStream)} fails
     */
    protected void extract(InputStream input) throws Exception {
        // First, extract properties
        this.reader = new POIFSReader();
        this.properties = new PropertiesBroker();
        this.reader.registerListener(new PropertiesReaderListener(this.properties),
                SummaryInformation.DEFAULT_STREAM_NAME);
        input.reset();
        // Skip the property pass entirely when the stream reports no data.
        if (input.available() > 0) {
            reader.read(input);
        }
        // Then, extract text
        input.reset();
        this.text = extractText(input);
    }

    /**
     * Extracts the text content from a Microsoft document input stream.
     *
     * @param input the document stream, already reset by {@link #extract(InputStream)}
     * @return the text content of the document
     * @throws Exception if the text cannot be extracted
     */
    protected abstract String extractText(InputStream input) throws Exception;

    /**
     * Get the content text of the Microsoft document.
     * @return the content text of the document, or <code>null</code> if
     *         {@link #extract(InputStream)} has not produced any
     */
    protected String getText() {
        return this.text;
    }

    /**
     * Get the <code>Properties</code> of the Microsoft document.
     *
     * <p>Must be called after {@link #extract(InputStream)}; the broker it
     * reads from is only created there. May block for up to
     * {@link PropertiesBroker#TIMEOUT} milliseconds while waiting for the
     * listener to publish the properties.
     *
     * @return the properties of the document, or <code>null</code> if none
     *         were published within the timeout
     */
    protected Properties getProperties() {
        return properties.getProperties();
    }

    /**
     * Monitor-based hand-off of a {@link Properties} object from the reader
     * listener to whoever calls {@link #getProperties()}.
     */
    private static final class PropertiesBroker {

        /** Maximum time, in milliseconds, that {@link #getProperties()} waits. */
        private static final int TIMEOUT = 2 * 1000;
        private Properties properties = null;

        /**
         * Returns the published properties, waiting up to {@link #TIMEOUT}
         * milliseconds (in slices of a tenth of that) for them to arrive.
         *
         * @return the properties, or <code>null</code> if the timeout elapsed
         *         before {@link #setProperties(Properties)} was called
         */
        public synchronized Properties getProperties() {

            final long start = System.currentTimeMillis();
            long now = start;
            boolean interrupted = false;

            while (this.properties == null && now - start < TIMEOUT) {
                try {
                    wait(TIMEOUT / 10);
                } catch (InterruptedException e) {
                    // Keep waiting out the timeout as before, but remember the
                    // interruption so it can be re-asserted for the caller.
                    interrupted = true;
                }
                now = System.currentTimeMillis();
            }
            if (interrupted) {
                // wait() cleared the flag when it threw; restore it.
                Thread.currentThread().interrupt();
            }
            return this.properties;
        }

        /**
         * Publishes the properties and wakes every thread waiting in
         * {@link #getProperties()}.
         *
         * @param properties the properties to publish
         */
        public synchronized void setProperties(Properties properties) {
            this.properties = properties;
            notifyAll();
        }
    }

    /**
     * POIFS listener that copies the fields of the summary-information stream
     * into a {@link Properties} object and publishes it through a
     * {@link PropertiesBroker}.
     */
    private static final class PropertiesReaderListener implements POIFSReaderListener {

        private final PropertiesBroker propertiesBroker;
        private final Properties metadata = new Properties();

        PropertiesReaderListener(PropertiesBroker propertiesBroker) {
            this.propertiesBroker = propertiesBroker;
        }

        /**
         * Handles a reader event: ignores streams whose name does not start
         * with the summary-information stream name, otherwise records every
         * non-empty summary field and publishes the result.
         *
         * @param event the POIFS reader event
         */
        public void processPOIFSReaderEvent(POIFSReaderEvent event) {
            if (!event.getName().startsWith(SummaryInformation.DEFAULT_STREAM_NAME)) {
                return;
            }

            try {
                SummaryInformation si = (SummaryInformation) PropertySetFactory.create(event.getStream());
                setProperty(DublinCore.TITLE, si.getTitle());
                setProperty(Office.APPLICATION_NAME, si.getApplicationName());
                setProperty(Office.AUTHOR, si.getAuthor());
                setProperty(Office.CHARACTER_COUNT, si.getCharCount());
                setProperty(Office.COMMENTS, si.getComments());
                setProperty(DublinCore.DATE, si.getCreateDateTime());
                //        setProperty(Office.EDIT_TIME, si.getEditTime());
                setProperty(HttpHeaders.LAST_MODIFIED, si.getLastSaveDateTime());
                setProperty(Office.KEYWORDS, si.getKeywords());
                setProperty(Office.LAST_AUTHOR, si.getLastAuthor());
                setProperty(Office.LAST_PRINTED, si.getLastPrinted());
                setProperty(Office.LAST_SAVED, si.getLastSaveDateTime());
                setProperty(Office.PAGE_COUNT, si.getPageCount());
                setProperty(Office.REVISION_NUMBER, si.getRevNumber());
                setProperty(DublinCore.RIGHTS, si.getSecurity());
                setProperty(DublinCore.SUBJECT, si.getSubject());
                setProperty(Office.TEMPLATE, si.getTemplate());
                setProperty(Office.WORD_COUNT, si.getWordCount());
            } catch (Exception ex) {
                // Property extraction is best-effort: report the failure and
                // still publish whatever was collected before it.
                LOG.warn("Unable to fully extract summary properties from stream "
                        + event.getName(), ex);
            }
            propertiesBroker.setProperties(metadata);
        }

        /** Stores the value under the name unless either one is null or empty. */
        private void setProperty(String name, String value) {
            if (!StringUtils.isEmpty(name) && !StringUtils.isEmpty(value)) {
                metadata.setProperty(name, value);
            }
        }

        /** Stores the decimal form of the value; zero is treated as "not set". */
        private void setProperty(String name, int value) {
            if (value != 0) {
                setProperty(name, String.valueOf(value));
            }
        }

        /** Stores the decimal form of the value; zero is treated as "not set". */
        private void setProperty(String name, long value) {
            if (value != 0) {
                setProperty(name, String.valueOf(value));
            }
        }

        /** Stores the date formatted by {@link HttpDateFormat}; null is skipped. */
        private void setProperty(String name, Date date) {
            if (date != null) {
                setProperty(name, HttpDateFormat.toString(date));
            }
        }

    }

}