Java tutorial
/* ************************************************************************************ * Copyright (C) 2001-2011 encuestame: system online surveys Copyright (C) 2009 * encuestame Development Team. * Licensed under the Apache Software License version 2.0 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software distributed * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. ************************************************************************************ */ package org.encuestame.business.search; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.util.Iterator; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.poi.POIXMLException; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRichTextString; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.encuestame.core.search.DirectoryIndexStore; import org.springframework.util.Assert; /** * Search Utils. * @author Morales, Diana Paola paolaATencuestame.org * @since Mar 23, 2011 */ public class SearchUtils { /****/ protected static final String CONTENT = "content"; /****/ protected static final String FULLPATH = "fullpath"; /****/ protected static final String FILENAME = "filename"; /** Lucene Version. **/ public static final Version LUCENE_VERSION = Version.LUCENE_30; /** * Log */ private static final Log log = LogFactory.getLog(SearchUtils.class); /** * Get Filename extension. * @param path fullname file * @return */ public static String getExtension(final String path) { final String ext = path.substring(path.lastIndexOf('.') + 1); log.debug("Path file " + path); log.debug("Ext file " + ext); return ext; } /** * PDF Document content parser. * @param is Document content * @return * @throws IOException */ public static COSDocument parseDocument(final InputStream is) throws IOException { PDFParser parser = null; parser = new PDFParser(is); parser.parse(); return parser.getDocument(); } /** * Add Lucene Document fields. * @param file * @param docText * @return * @throws IOException */ public static Document addFields(final File file, final String docText) throws IOException { final String fullpath = file.getCanonicalPath(); final String filename = file.getName(); final Document doc = new Document(); if (StringUtils.isNotEmpty(docText)) { doc.add(new Field(CONTENT, docText, Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field(FULLPATH, fullpath, Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field(FILENAME, filename, Field.Store.YES, Field.Index.NOT_ANALYZED)); } return doc; } /** * Create PDF Document. * @param file {@link File} * @param Long attachmentId. * @return {@link Document} * @throws Exception */ public static Document createPdfDocument(final File file) throws Exception { InputStream is = new FileInputStream(file); COSDocument cosDoc = null; String docText = ""; PDDocument pdDoc = null; try { cosDoc = parseDocument(is); pdDoc = new PDDocument(cosDoc); PDFTextStripper stripper = new PDFTextStripper(); docText = stripper.getText(pdDoc); log.debug("PDF Doc Text " + docText.length()); } finally { if (pdDoc == null) { log.error("PdDocument is null"); } else { pdDoc.close(); } } final Document doc = SearchUtils.addFields(file, docText); return doc; } /** * Create Document Word. * @param file {@link File} * @param Long attachmentId. * @return {@link Document} * @throws POIXMLException * @throws Exception */ public static Document createWordDocument(final File file) throws POIXMLException, Exception { InputStream is = new FileInputStream(file); String bodyText = null; try { XWPFDocument wd = new XWPFDocument(is); XWPFWordExtractor wde = new XWPFWordExtractor(wd); bodyText = wde.getText(); } catch (Exception e) { log.debug(e); } Document doc = SearchUtils.addFields(file, bodyText); return doc; } /** * Create Spreadsheets Document. * @param file Spreadsheet {@link File}. * @param Long attachmentId. * @return {@link Document} * @throws FileNotFoundException */ public static Document createSpreadsheetsDocument(final File file) throws Exception { InputStream is = new FileInputStream(file); StringBuilder contents = new StringBuilder(); POIFSFileSystem fileSystem = new POIFSFileSystem(is); HSSFWorkbook workBook = new HSSFWorkbook(fileSystem); for (int i = 0; i < workBook.getNumberOfSheets(); i++) { HSSFSheet sheet = workBook.getSheetAt(i); Iterator<Row> rows = sheet.rowIterator(); while (rows.hasNext()) { HSSFRow row = (HSSFRow) rows.next(); // Display the row number log.debug(row.getRowNum()); Iterator<Cell> cells = row.cellIterator(); while (cells.hasNext()) { HSSFCell cell = (HSSFCell) cells.next(); // Display the cell number of the current Row switch (cell.getCellType()) { case HSSFCell.CELL_TYPE_NUMERIC: { log.debug(String.valueOf(cell.getNumericCellValue())); contents.append(String.valueOf(cell.getNumericCellValue())).append(" "); break; } case HSSFCell.CELL_TYPE_STRING: { HSSFRichTextString richTextString = cell.getRichStringCellValue(); log.debug(richTextString.toString()); contents.append(richTextString.toString()).append(" "); break; } case HSSFCell.CELL_TYPE_BOOLEAN: { contents.append(String.valueOf(cell.getBooleanCellValue())).append(" "); break; } } } } } Document doc = SearchUtils.addFields(file, contents.toString()); return doc; } /** * Create Text Document. * @param file Text File. * @param Long attachmentId. * @return {@link Document} * @throws Exception */ public static Document createTextDocument(final File file) throws Exception { //FIXME: 'FileReader' is never closed final String docText = new FileReader(file).toString(); final Document doc = SearchUtils.addFields(file, docText); return doc; } /** * Open Index Writer * @param directoryStore * @param indexWriter * @throws CorruptIndexException * @throws LockObtainFailedException * @throws IOException */ public static IndexWriter openIndexWriter(final DirectoryIndexStore directoryStore, IndexWriter indexWriter) throws CorruptIndexException, LockObtainFailedException, IOException { final Directory directory = directoryStore.getDirectory(); log.debug("Get Directory ----------" + directory.toString()); if (indexWriter != null) { indexWriter.close(); } log.debug("Index Directory is locked? ----------> " + indexWriter.isLocked(directory)); indexWriter = new IndexWriter(directory, new StandardAnalyzer(SearchUtils.LUCENE_VERSION), true, IndexWriter.MaxFieldLength.UNLIMITED); Assert.notNull(indexWriter); return indexWriter; } /** * Close Index writer. * @param indexWriter * @throws CorruptIndexException * @throws IOException */ public static void closeIndexWriter(final IndexWriter indexWriter) throws CorruptIndexException, IOException { Assert.notNull(indexWriter); if (indexWriter == null) { log.error("Index writer is null"); } else { indexWriter.close(); log.debug("Index writer was closed"); } } }