org.jlibrary.core.search.extraction.XMLExtractor.java Source code

Introduction

Here is the source code for org.jlibrary.core.search.extraction.XMLExtractor.java
Source

/*
* jLibrary, Open Source Document Management System
* 
* Copyright (c) 2003-2006, Martn Prez Marin, Blandware (represented by
* Andrey Grebnev), and individual contributors as indicated by the
* @authors tag. See copyright.txt in the distribution for a full listing of
* individual contributors. All rights reserved.
* 
* This is free software; you can redistribute it and/or modify it
* under the terms of the Modified BSD License as published by the Free 
* Software Foundation.
* 
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Modified
* BSD License for more details.
* 
* You should have received a copy of the Modified BSD License along with 
* this software; if not, write to the Free Software Foundation, Inc., 
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA, or see the
* FSF site: http://www.fsf.org.
*/
package org.jlibrary.core.search.extraction;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.commons.io.IOUtils;
import org.jlibrary.core.search.extraction.xml.XMLParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.XMLReader;

/**
 * @author Martn
 *
 * XML text extraction class
 */
public class XMLExtractor implements Extractor {

    static Logger logger = LoggerFactory.getLogger(XMLExtractor.class);

    private XMLParser parser;

    public XMLExtractor() throws ExtractionException {

        try {
            SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
            saxParserFactory.setValidating(false);
            SAXParser saxParser = saxParserFactory.newSAXParser();
            XMLReader xmlReader = saxParser.getXMLReader();
            xmlReader.setFeature("http://xml.org/sax/features/validation", false);
            xmlReader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            parser = new XMLParser(xmlReader);
        } catch (Exception e) {
            throw new ExtractionException(e);
        }
    }

    /**
     * @see org.jlibrary.core.indexer.extractors.Extractor#extractText(java.io.File)
     */
    public String extractText(File f) throws ExtractionException {

        FileInputStream fis = null;
        try {
            fis = new FileInputStream(f);
            return extractText(fis);
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
            throw new ExtractionException(e);
        } catch (ExtractionException ee) {
            throw ee;
        } finally {
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException e) {
                    throw new ExtractionException(e);
                }
            }
        }
    }

    /**
     * @see org.jlibrary.core.indexer.extractors.Extractor#extractText(java.io.InputStream)
     */
    public String extractText(InputStream is) throws ExtractionException {

        try {
            byte[] content = IOUtils.toByteArray(is);
            parser.parse(content);
            return parser.getContents();
        } catch (Exception e) {
            throw new ExtractionException(e);
        }
    }

    /**
     * @see org.jlibrary.core.indexer.extractors.Extractor#getReader(java.io.File)
     */
    public Reader getReader(File f) throws ExtractionException {

        throw new ExtractionException("XMLExtractor can't use getReader() method");
    }

    /**
     * @see org.jlibrary.core.search.extraction.Extractor#extractHeader(java.io.File)
     */
    public HeaderMetaData extractHeader(File f) throws ExtractionException {

        HeaderMetaData metadata = new HeaderMetaData();

        return metadata;
    }

    /**
     * @see org.jlibrary.core.search.extraction.Extractor#extractHeader(java.io.InputStream)
     */
    public HeaderMetaData extractHeader(InputStream is) throws ExtractionException {

        HeaderMetaData metadata = new HeaderMetaData();

        return metadata;
    }
}