org.encuestame.business.search.SearchUtils.java Source code

Introduction

Here is the source code for org.encuestame.business.search.SearchUtils.java
Source

/*
 ************************************************************************************
 * Copyright (C) 2001-2011 encuestame: system online surveys Copyright (C) 2009
 * encuestame Development Team.
 * Licensed under the Apache Software License version 2.0
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to  in writing,  software  distributed
 * under the License is distributed  on  an  "AS IS"  BASIS,  WITHOUT  WARRANTIES  OR
 * CONDITIONS OF ANY KIND, either  express  or  implied.  See  the  License  for  the
 * specific language governing permissions and limitations under the License.
 ************************************************************************************
 */
package org.encuestame.business.search;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIXMLException;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRichTextString;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.encuestame.core.search.DirectoryIndexStore;
import org.springframework.util.Assert;

/**
 * Search Utils.
 * @author Morales, Diana Paola paolaATencuestame.org
 * @since Mar 23, 2011
 */
public class SearchUtils {

    /****/
    protected static final String CONTENT = "content";

    /****/
    protected static final String FULLPATH = "fullpath";

    /****/
    protected static final String FILENAME = "filename";

    /** Lucene Version. **/
    public static final Version LUCENE_VERSION = Version.LUCENE_30;

    /**
    * Log
    */
    private static final Log log = LogFactory.getLog(SearchUtils.class);

    /**
    * Get Filename extension.
    * @param path fullname file
    * @return
    */
    public static String getExtension(final String path) {
        final String ext = path.substring(path.lastIndexOf('.') + 1);
        log.debug("Path file " + path);
        log.debug("Ext file " + ext);
        return ext;
    }

    /**
     * PDF Document content parser.
     * @param is Document content
     * @return
     * @throws IOException
     */
    public static COSDocument parseDocument(final InputStream is) throws IOException {
        PDFParser parser = null;
        parser = new PDFParser(is);
        parser.parse();
        return parser.getDocument();
    }

    /**
     * Add Lucene Document fields.
     * @param file
     * @param docText
     * @return
     * @throws IOException
     */
    public static Document addFields(final File file, final String docText) throws IOException {
        final String fullpath = file.getCanonicalPath();
        final String filename = file.getName();
        final Document doc = new Document();
        if (StringUtils.isNotEmpty(docText)) {
            doc.add(new Field(CONTENT, docText, Field.Store.NO, Field.Index.ANALYZED));
            doc.add(new Field(FULLPATH, fullpath, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field(FILENAME, filename, Field.Store.YES, Field.Index.NOT_ANALYZED));
        }
        return doc;
    }

    /**
     * Create PDF Document.
     * @param file {@link File}
     * @param Long attachmentId.
     * @return {@link Document}
     * @throws Exception
     */
    public static Document createPdfDocument(final File file) throws Exception {
        InputStream is = new FileInputStream(file);
        COSDocument cosDoc = null;
        String docText = "";
        PDDocument pdDoc = null;
        try {
            cosDoc = parseDocument(is);
            pdDoc = new PDDocument(cosDoc);
            PDFTextStripper stripper = new PDFTextStripper();
            docText = stripper.getText(pdDoc);
            log.debug("PDF Doc Text " + docText.length());
        } finally {
            if (pdDoc == null) {
                log.error("PdDocument is null");
            } else {
                pdDoc.close();
            }
        }
        final Document doc = SearchUtils.addFields(file, docText);
        return doc;
    }

    /**
    * Create Document Word.
    * @param file {@link File}
    * @param Long attachmentId.
    * @return {@link Document}
    * @throws POIXMLException
    * @throws Exception
    */
    public static Document createWordDocument(final File file) throws POIXMLException, Exception {
        InputStream is = new FileInputStream(file);
        String bodyText = null;
        try {
            XWPFDocument wd = new XWPFDocument(is);
            XWPFWordExtractor wde = new XWPFWordExtractor(wd);
            bodyText = wde.getText();
        } catch (Exception e) {
            log.debug(e);
        }
        Document doc = SearchUtils.addFields(file, bodyText);
        return doc;
    }

    /**
    * Create Spreadsheets Document.
    * @param file Spreadsheet {@link File}.
    * @param Long attachmentId.
    * @return {@link Document}
    * @throws FileNotFoundException
    */
    public static Document createSpreadsheetsDocument(final File file) throws Exception {
        InputStream is = new FileInputStream(file);
        StringBuilder contents = new StringBuilder();
        POIFSFileSystem fileSystem = new POIFSFileSystem(is);
        HSSFWorkbook workBook = new HSSFWorkbook(fileSystem);
        for (int i = 0; i < workBook.getNumberOfSheets(); i++) {
            HSSFSheet sheet = workBook.getSheetAt(i);
            Iterator<Row> rows = sheet.rowIterator();
            while (rows.hasNext()) {
                HSSFRow row = (HSSFRow) rows.next();
                // Display the row number
                log.debug(row.getRowNum());
                Iterator<Cell> cells = row.cellIterator();
                while (cells.hasNext()) {
                    HSSFCell cell = (HSSFCell) cells.next();
                    // Display the cell number of the current Row
                    switch (cell.getCellType()) {
                    case HSSFCell.CELL_TYPE_NUMERIC: {
                        log.debug(String.valueOf(cell.getNumericCellValue()));
                        contents.append(String.valueOf(cell.getNumericCellValue())).append(" ");
                        break;
                    }

                    case HSSFCell.CELL_TYPE_STRING: {
                        HSSFRichTextString richTextString = cell.getRichStringCellValue();
                        log.debug(richTextString.toString());
                        contents.append(richTextString.toString()).append(" ");
                        break;
                    }

                    case HSSFCell.CELL_TYPE_BOOLEAN: {
                        contents.append(String.valueOf(cell.getBooleanCellValue())).append(" ");
                        break;
                    }
                    }
                }
            }
        }
        Document doc = SearchUtils.addFields(file, contents.toString());
        return doc;
    }

    /**
    * Create Text Document.
    * @param file Text File.
    * @param Long attachmentId.
    * @return {@link Document}
    * @throws Exception
    */
    public static Document createTextDocument(final File file) throws Exception {
        //FIXME: 'FileReader' is never closed
        final String docText = new FileReader(file).toString();
        final Document doc = SearchUtils.addFields(file, docText);
        return doc;
    }

    /**
     * Open Index Writer
     * @param directoryStore
     * @param indexWriter
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    public static IndexWriter openIndexWriter(final DirectoryIndexStore directoryStore, IndexWriter indexWriter)
            throws CorruptIndexException, LockObtainFailedException, IOException {
        final Directory directory = directoryStore.getDirectory();
        log.debug("Get Directory ----------" + directory.toString());
        if (indexWriter != null) {
            indexWriter.close();
        }
        log.debug("Index Directory is locked?  ----------> " + indexWriter.isLocked(directory));
        indexWriter = new IndexWriter(directory, new StandardAnalyzer(SearchUtils.LUCENE_VERSION), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        Assert.notNull(indexWriter);
        return indexWriter;
    }

    /**
     * Close Index writer.
     * @param indexWriter
     * @throws CorruptIndexException
     * @throws IOException
     */
    public static void closeIndexWriter(final IndexWriter indexWriter) throws CorruptIndexException, IOException {
        Assert.notNull(indexWriter);
        if (indexWriter == null) {
            log.error("Index writer is null");
        } else {
            indexWriter.close();
            log.debug("Index writer was closed");
        }
    }
}