Java tutorial
/* ************************************************************************************ * Copyright (C) 2001-2011 encuestame: system online surveys Copyright (C) 2009 * encuestame Development Team. * Licensed under the Apache Software License version 2.0 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software distributed * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. ************************************************************************************ */ package org.encuestame.business.search; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.Date; import java.util.Iterator; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.Term; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.poi.POIXMLException; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRichTextString; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; /** * Indexer File description. * @author Morales, Diana Paola paolaATencuestame.org * @since Apr 5, 2011 */ public class IndexerFile { /** Attachment text content. **/ protected static final String CONTENT = "content"; /** Attachment full path. **/ protected static final String FULLPATH = "fullpath"; /** Attachment file name. **/ protected static final String FILENAME = "filename"; /** Attachment Id. **/ protected static final String DOCUMENTID = "documentId"; /** Attachment upload date. **/ protected static final String UPLOAD_DATE = "uploadDate"; /** Attachment type. **/ protected static final String DOCUMENT_TYPE = "documentType"; /** Attachment title. **/ protected static final String ATTACHMENT_TITLE = "title"; /** Log. **/ private static final Log log = LogFactory.getLog(IndexerFile.class); /** Auto commit option. **/ private boolean autoCommit = true; /** * Create standard lucene document * @param attachFile * @return {@link Document} doc */ public static Document createStandardLuceneDocument(AttachmentIndex attachFile) { Document doc = new Document(); doc.add(new Field(CONTENT, attachFile.getContent(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field(FULLPATH, attachFile.getFilepath(), Field.Store.YES, Field.Index.NO)); doc.add(new Field(FILENAME, attachFile.getFilename(), Field.Store.YES, Field.Index.NO)); doc.add(new Field(DOCUMENTID, attachFile.getDocumentId().toString(), Field.Store.YES, Field.Index.NO)); doc.add(new Field(UPLOAD_DATE, attachFile.getUploadDate().toString(), Field.Store.YES, Field.Index.NO)); doc.add(new Field(DOCUMENT_TYPE, attachFile.getDocumentType(), Field.Store.YES, Field.Index.NO)); doc.add(new Field(ATTACHMENT_TITLE, attachFile.getTitle(), Field.Store.YES, Field.Index.NO)); return doc; } /** * Add files to index * @param attachment */ public static void addToIndex(final AttachmentIndex attachment, final IndexWriterManager indexWriter) { try { long start = System.currentTimeMillis(); indexWriter.openIndexWriter(); IndexerFile.addDocumentToIndex(attachment, indexWriter); log.debug("Add to search index for topic " + attachment.getFilename() + " in " + ((System.currentTimeMillis() - start) / 1000.000) + " s."); } catch (Exception e) { log.error("Exception while adding topic " + attachment.getFilename(), e); } } /** * Add document to index. * @param documentAttachment * @throws IOException */ private static void addDocumentToIndex(final AttachmentIndex documentAttachment, final IndexWriterManager indexWriter) throws IOException { Document standardLuceneDocument = createStandardLuceneDocument(documentAttachment); indexWriter.getIndexWriter().addDocument(standardLuceneDocument); } /** * Delete attachment from index. * @param topic */ public void deleteAttachmentFromIndex(AttachmentIndex attachmentIndex, final IndexWriterManager indexWriter) { try { long start = System.currentTimeMillis(); // delete the current document indexWriter.getIndexWriter(); this.deleteFromIndex(attachmentIndex, indexWriter); log.debug("Delete from search index for topic " + attachmentIndex.getFilename() + " in " + ((System.currentTimeMillis() - start) / 1000.000) + " s."); } catch (Exception e) { log.error("Exception while adding topic " + attachmentIndex.getFilename(), e); } } /** * Delete Document from index * @param topic * @throws IOException */ private void deleteFromIndex(AttachmentIndex attachmentIndex, final IndexWriterManager indexWriter) throws IOException { indexWriter.getIndexWriter().deleteDocuments(new Term(FILENAME, attachmentIndex.getFilename())); } /** * Commit into lucene index. * @param commitNow * @throws IOException */ private void commit(final boolean commitNow, final IndexWriterManager indexWriter) throws IOException { if (commitNow) { indexWriter.getIndexWriter().commit(); } } /** * Create Attachment Document. * @param file * @return * @throws IOException */ public static AttachmentIndex createAttachmentDocument(final File file, final Long attachmentId) throws IOException { final String path = file.getCanonicalPath(); final String fileExtension = SearchUtils.getExtension(path); final String filename = file.getName(); String contentText = ""; AttachmentIndex attachmentIndexBean = new AttachmentIndex(); log.debug("Creating attachment document type --> " + fileExtension); if ("docx".equals(fileExtension)) { XWPFWordExtractor parserDoc; try { //1- Parsear word Document parserDoc = IndexerFile.parseWordDocument(file); //2- Extract word document content contentText = IndexerFile.extractContentWordDocument(parserDoc); //3- Set values to Attachment Index } catch (POIXMLException e) { log.error("Fail createAttachmentDocument POIXMLException --> " + e); } catch (Exception e) { log.error("Fail createAttachmentDocument Exception --> " + e); } } else if ("pdf".equals(fileExtension)) { PDDocument parsePdf; parsePdf = IndexerFile.parsePdfDocument(file); try { contentText = IndexerFile.extractContentPdfDocument(parsePdf); } catch (Exception e) { log.error("Fail createAttachmentDocument PDF Exception --> " + e); } } else if ("xls".equals(fileExtension)) { HSSFWorkbook parseSpreadsheets; try { parseSpreadsheets = IndexerFile.parseSpreadsheetsDocument(file); contentText = extractContentSpreadsheetsDocument(parseSpreadsheets); } catch (Exception e) { log.error("Fail createAttachmentDocument spreadsheets Exception --> " + e); } } else if ("txt".equals(fileExtension)) { contentText = "Document text file"; } attachmentIndexBean.setContent(contentText); attachmentIndexBean.setFilepath(path); attachmentIndexBean.setFilename(filename); attachmentIndexBean.setDocumentId(attachmentId); attachmentIndexBean.setUploadDate(new Date()); attachmentIndexBean.setDocumentType(fileExtension); attachmentIndexBean.setTitle("ENCUESTAME - TITLE"); return attachmentIndexBean; } /** * Parse Word Document. * @param file * @return * @throws POIXMLException * @throws Exception */ public static XWPFWordExtractor parseWordDocument(final File file) throws POIXMLException, Exception { InputStream is = new FileInputStream(file); XWPFWordExtractor wde = null; try { XWPFDocument wd = new XWPFDocument(is); wde = new XWPFWordExtractor(wd); log.debug("Parse Word Document --------------------------> "); } catch (Exception e) { log.error("ERROR parse Word Document-------->" + e); } return wde; } /** * Extract word document content. * @param wde * @return */ public static String extractContentWordDocument(final XWPFWordExtractor wde) { String bodyText = null; try { bodyText = wde.getText(); } catch (Exception e) { log.error("ERROR extracting content Word Document-------->" + e); } return bodyText; } /** * Parse pdf Document. * @param file * @return * @throws IOException */ public static PDDocument parsePdfDocument(final File file) throws IOException { InputStream is = new FileInputStream(file); COSDocument cosDoc = null; PDDocument pdDoc = null; try { cosDoc = SearchUtils.parseDocument(is); pdDoc = new PDDocument(cosDoc); } catch (IOException e) { // TODO Auto-generated catch block log.error(e); } finally { if (pdDoc == null) { log.error("PdDocument is null"); } else { pdDoc.close(); } } return pdDoc; } /** * Extract content in PDF Document. * @param pdfDoc * @return * @throws Exception */ public static String extractContentPdfDocument(final PDDocument pdfDoc) throws Exception { String docText = null; try { PDFTextStripper stripper = new PDFTextStripper(); docText = stripper.getText(pdfDoc); log.debug("Extract content pdf document leng ----> " + docText.length()); } finally { if (docText == null) { log.error("**************** PDF content is null *********************"); } } return docText; } /** * * @param author * @param title * @param producer * @param subject * @return */ public static AttachmentIndex addMetadatatoBean(final String author, final String title, final String producer, final String subject) { AttachmentIndex attachmentPdfMetadata = new AttachmentIndex(); if (StringUtils.isNotEmpty(author)) { attachmentPdfMetadata.setAuthor(author); } if (StringUtils.isNotEmpty(title)) { attachmentPdfMetadata.setTitle(title); } if (StringUtils.isNotEmpty(producer)) { attachmentPdfMetadata.setProducer(producer); } if (StringUtils.isNotEmpty(subject)) { attachmentPdfMetadata.setSubject(subject); } return attachmentPdfMetadata; } /** * Parse spreadsheets documents. * @param file * @return * @throws Exception */ public static HSSFWorkbook parseSpreadsheetsDocument(final File file) throws Exception { InputStream is = new FileInputStream(file); POIFSFileSystem fileSystem = new POIFSFileSystem(is); HSSFWorkbook workBook = new HSSFWorkbook(fileSystem); return workBook; } /** * Extract spreadsheets content. * @param workBook * @return * @throws Exception */ public static String extractContentSpreadsheetsDocument(final HSSFWorkbook workBook) throws Exception { StringBuilder contents = new StringBuilder(); for (int i = 0; i < workBook.getNumberOfSheets(); i++) { HSSFSheet sheet = workBook.getSheetAt(i); Iterator<Row> rows = sheet.rowIterator(); while (rows.hasNext()) { HSSFRow row = (HSSFRow) rows.next(); // Display the row number log.debug(row.getRowNum()); Iterator<Cell> cells = row.cellIterator(); while (cells.hasNext()) { HSSFCell cell = (HSSFCell) cells.next(); // Display the cell number of the current Row switch (cell.getCellType()) { case HSSFCell.CELL_TYPE_NUMERIC: { log.debug(String.valueOf(cell.getNumericCellValue())); contents.append(String.valueOf(cell.getNumericCellValue())).append(" "); break; } case HSSFCell.CELL_TYPE_STRING: { HSSFRichTextString richTextString = cell.getRichStringCellValue(); log.debug(richTextString.toString()); contents.append(richTextString.toString()).append(" "); break; } case HSSFCell.CELL_TYPE_BOOLEAN: { contents.append(String.valueOf(cell.getBooleanCellValue())).append(" "); break; } } } } } return contents.toString(); } }