Usage examples for the org.apache.poi.hwpf.HWPFDocument constructor
public HWPFDocument(DirectoryNode directory) throws IOException
From source file:org.dspace.submit.step.UploadStep.java
License:BSD License
/** * Process the upload of a new file!//from w w w . j av a 2 s . com * * @param context * current DSpace context * @param request * current servlet request object * @param response * current servlet response object * @param subInfo * submission info object * * @return Status or error flag which will be processed by * UI-related code! (if STATUS_COMPLETE or 0 is returned, * no errors occurred!) */ public int processUploadFile(Context context, HttpServletRequest request, HttpServletResponse response, SubmissionInfo subInfo) throws ServletException, IOException, SQLException, AuthorizeException { boolean formatKnown = true; boolean fileOK = false; BitstreamFormat bf = null; Bitstream b = null; //NOTE: File should already be uploaded. //Manakin does this automatically via Cocoon. //For JSP-UI, the SubmissionController.uploadFiles() does the actual upload Enumeration attNames = request.getAttributeNames(); //loop through our request attributes while (attNames.hasMoreElements()) { String attr = (String) attNames.nextElement(); //if this ends with "-path", this attribute //represents a newly uploaded file if (attr.endsWith("-path")) { //strip off the -path to get the actual parameter //that the file was uploaded as String param = attr.replace("-path", ""); String exten = param.substring(param.length() - 3); // Load the file's path and input stream and description String filePath = (String) request.getAttribute(param + "-path"); InputStream fileInputStreamTest = (InputStream) request.getAttribute(param + "-inputstream"); ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte[] buf = new byte[1024]; int n = 0; while ((n = fileInputStreamTest.read(buf)) >= 0) baos.write(buf, 0, n); byte[] content = baos.toByteArray(); InputStream fileInputStream = new ByteArrayInputStream(content); InputStream fileInputStreamPdf = new ByteArrayInputStream(content); InputStream ifAnsi = new ByteArrayInputStream(content); //InputStream fss = fileInputStream.cl //attempt to get 
description from attribute first, then direct from a parameter String fileDescription = (String) request.getAttribute(param + "-description"); if (fileDescription == null || fileDescription.length() == 0) { fileDescription = request.getParameter("description"); } // if information wasn't passed by User Interface, we had a problem // with the upload if (filePath == null || fileInputStream == null) { return STATUS_UPLOAD_ERROR; } if (subInfo == null) { // In any event, if we don't have the submission info, the request // was malformed return STATUS_INTEGRITY_ERROR; } // Create the bitstream Item item = subInfo.getSubmissionItem().getItem(); // do we already have a bundle? Bundle[] bundles = item.getBundles("ORIGINAL"); if (bundles.length < 1) { // set bundle's name to ORIGINAL b = item.createSingleBitstream(fileInputStream, "ORIGINAL"); } else { // we have a bundle already, just add bitstream b = bundles[0].createBitstream(fileInputStream); } //fileDescription.op if (exten.toLowerCase().equals("pdf")) { try { PDFTextStripper pdfStripper = null; PDDocument docum = null; PDFParser parser = new PDFParser(fileInputStreamPdf); COSDocument cosDoc = null; parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); docum = new PDDocument(cosDoc); //pdfStripper.getText(docum); String parsedText = pdfStripper.getText(docum); Integer fifty = (Integer) Math.round(parsedText.length() / 2); if (fifty < 0) { fifty = fifty * (-1); } Integer toCut = 500; if ((parsedText.length() - fifty) < 500) { toCut = parsedText.length(); } log.info("FUCKTHISSHIT: " + fifty + " " + toCut); String subText = parsedText.substring(fifty, fifty + toCut - 1); try { subText = subText.substring(subText.indexOf(".") + 1); } catch (Exception e) { } item.addMetadata("dc", "textpart", null, null, subText + "..."); item.update(); context.commit(); log.info(parsedText); } catch (Exception e) { log.info("omgerror: " + e.toString()); } } if (exten.toLowerCase().equals("txt")) { 
StringWriter writer = new StringWriter(); IOUtils.copy(fileInputStreamPdf, writer, "UTF-8"); String theString = writer.toString(); if (theString.startsWith("\uFEFF")) { } else { StringWriter writerAnsi = new StringWriter(); IOUtils.copy(ifAnsi, writerAnsi, "Cp1252"); theString = writerAnsi.toString(); } Integer fifty = (Integer) Math.round(theString.length() * (50 / 100.0f)); Integer toCut = 500; if ((theString.length() - fifty) < 500) { toCut = theString.length(); } String subText = theString.substring(fifty, toCut - 1); item.addMetadata("dc", "textpart", null, null, subText + "..."); item.update(); context.commit(); log.info(subText); } log.info("OMGTEST: " + exten); if (exten.toLowerCase().equals("doc")) { WordExtractor extractor = null; try { HWPFDocument document = new HWPFDocument(fileInputStreamPdf); extractor = new WordExtractor(document); String fileData = extractor.getText(); Integer fifty = (Integer) Math.round(50 * 100 / fileData.length()); Integer toCut = 500; if ((fileData.length() - fifty) < 500) { toCut = fileData.length(); } String subText = fileData.substring(fifty, toCut - 1); item.addMetadata("dc", "textpart", null, null, subText + "..."); item.update(); context.commit(); } catch (Exception exep) { log.info("OMGTESTIK:" + exep); } } if ((exten.toLowerCase().equals("ocx"))) { XWPFDocument document = new XWPFDocument(fileInputStreamPdf); XWPFWordExtractor extractor = null; extractor = new XWPFWordExtractor(document); String text = extractor.getText(); Integer fifty = (Integer) Math.round(50 * 100 / text.length()); Integer toCut = 500; if ((text.length() - fifty) < 500) { toCut = text.length(); } String subText = text.substring(fifty, toCut - 1); item.addMetadata("dc", "textpart", null, null, subText + "..."); item.update(); context.commit(); } // Strip all but the last filename. It would be nice // to know which OS the file came from. 
String noPath = filePath; while (noPath.indexOf('/') > -1) { noPath = noPath.substring(noPath.indexOf('/') + 1); } while (noPath.indexOf('\\') > -1) { noPath = noPath.substring(noPath.indexOf('\\') + 1); } b.setName(noPath); b.setSource(filePath); b.setDescription(fileDescription); // Identify the format bf = FormatIdentifier.guessFormat(context, b); b.setFormat(bf); // Update to DB b.update(); item.update(); if ((bf != null) && (bf.isInternal())) { log.warn("Attempt to upload file format marked as internal system use only"); backoutBitstream(subInfo, b, item); return STATUS_UPLOAD_ERROR; } // Check for virus if (ConfigurationManager.getBooleanProperty("submission-curation", "virus-scan")) { Curator curator = new Curator(); curator.addTask("vscan").curate(item); int status = curator.getStatus("vscan"); if (status == Curator.CURATE_ERROR) { backoutBitstream(subInfo, b, item); return STATUS_VIRUS_CHECKER_UNAVAILABLE; } else if (status == Curator.CURATE_FAIL) { backoutBitstream(subInfo, b, item); return STATUS_CONTAINS_VIRUS; } } // If we got this far then everything is more or less ok. // Comment - not sure if this is the right place for a commit here // but I'm not brave enough to remove it - Robin. context.commit(); // save this bitstream to the submission info, as the // bitstream we're currently working with subInfo.setBitstream(b); //if format was not identified if (bf == null) { return STATUS_UNKNOWN_FORMAT; } } //end if attribute ends with "-path" } //end while return STATUS_COMPLETE; }
From source file:org.esmerilprogramming.pdfcake.DocumentReplace.java
License:Open Source License
/**
 * Changes the real document file: the template's keys are replaced in the
 * Word document read from the stream, and the converted result is stored
 * on the template as its content bytes.
 *
 * @param in
 *            InputStream
 * @param template
 *            DocumentTemplate
 * @throws Exception
 */
public static void changeDocFile(InputStream in, DocumentTemplate template) throws Exception {
    HWPFDocument replaced = replaceKeys(new HWPFDocument(in), template);
    ByteArrayInputStream docStream = getDocumentAsByteArrayIS(replaced);
    template.setContentBytes(convertToPdfByteArray(docStream));
}
From source file:org.exoplatform.services.document.impl.MSWordDocumentReader.java
License:Open Source License
/**
 * Returns only a text from .doc file content. The stream is always closed
 * before this method returns.
 *
 * @param is an input stream with .doc file content.
 * @return The string only with text from file content.
 */
public String getContentAsText(final InputStream is) throws IOException, DocumentReadException {
    if (is == null) {
        throw new IllegalArgumentException("InputStream is null.");
    }

    try {
        // nothing reported as available: treat as an empty document
        if (is.available() == 0) {
            return "";
        }

        HWPFDocument wordDoc;
        try {
            wordDoc = SecurityHelper.doPrivilegedIOExceptionAction(new PrivilegedExceptionAction<HWPFDocument>() {
                public HWPFDocument run() throws Exception {
                    return new HWPFDocument(is);
                }
            });
        } catch (IOException e) {
            throw new DocumentReadException("Can't open document.", e);
        }

        return wordDoc.getRange().text().trim();
    } finally {
        try {
            is.close();
        } catch (IOException e) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("An exception occurred: " + e.getMessage());
            }
        }
    }
}
From source file:org.infoglue.cms.controllers.kernel.impl.simple.LuceneController.java
License:Open Source License
private String extractTextToIndex(DigitalAssetVO digitalAssetVO, File file) { String text = ""; if (logger.isInfoEnabled()) logger.info("Asset content type:" + digitalAssetVO.getAssetContentType()); if (digitalAssetVO.getAssetContentType().equalsIgnoreCase("application/pdf")) { try {/* www . j av a 2s .co m*/ Writer output = null; PDDocument document = null; try { document = PDDocument.load(file); ByteArrayOutputStream baos = new ByteArrayOutputStream(); if (!document.isEncrypted()) { output = new OutputStreamWriter(baos, "UTF-8"); PDFTextStripper stripper = new PDFTextStripper(); //stripper.setSortByPosition( sort ); //stripper.setStartPage( startPage ); //stripper.setEndPage( endPage ); stripper.writeText(document, output); text = baos.toString("UTF-8"); if (logger.isInfoEnabled()) logger.info("PDF Document has " + text.length() + " chars\n\n" + text); } } catch (Exception e) { logger.warn("Error indexing file: " + file + "\nMessage: " + e.getMessage()); } finally { if (output != null) { output.close(); } if (document != null) { document.close(); } } } catch (Exception e) { logger.warn("Error indexing:" + e.getMessage()); } } else if (digitalAssetVO.getAssetContentType().equalsIgnoreCase("application/msword")) { try { POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(file)); // Create a document for this file HWPFDocument doc = new HWPFDocument(fs); // Create a WordExtractor to read the text of the word document WordExtractor we = new WordExtractor(doc); // Extract all paragraphs in the document as strings text = we.getText(); // Output the document if (logger.isInfoEnabled()) logger.info("Word Document has " + text.length() + " chars\n\n" + text); } catch (Exception e) { logger.warn("Error indexing file: " + file + "\nMessage: " + e.getMessage()); } } return text; }
From source file:org.knime.ext.textprocessing.nodes.source.parser.word.WordDocumentParser.java
License:Open Source License
private Document parseInternal(final InputStream is) throws Exception { m_currentDoc = new DocumentBuilder(m_tokenizerName); m_currentDoc.setDocumentFile(new File(m_docPath)); m_currentDoc.setDocumentType(m_type); m_currentDoc.addDocumentCategory(m_category); m_currentDoc.addDocumentSource(m_source); POIFSFileSystem poifs = null;/*from w ww . ja va 2 s. c o m*/ HWPFDocument hdoc = null; XWPFDocument hdoc2 = null; WordExtractor extractor = null; try { // doc files if (m_docPath.endsWith(".doc")) { // copy content of input stream into byte array since content have to be red twice unfortunately. final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final byte[] buf = new byte[1024]; int i = 0; while ((i = is.read(buf)) >= 0) { baos.write(buf, 0, i); } final byte[] content = baos.toByteArray(); // open stream with copied content to read text InputStream copiedInput = new ByteArrayInputStream(content); hdoc = new HWPFDocument(copiedInput); extractor = new WordExtractor(hdoc); for (String p : extractor.getParagraphText()) { p = p.trim(); if (!onlyWhitepscaes(p)) { m_currentDoc.addParagraph(p); } } // open stream again with copied content to read meta info copiedInput = new ByteArrayInputStream(content); poifs = new POIFSFileSystem(copiedInput); final DirectoryEntry dir = poifs.getRoot(); final DocumentEntry siEntry = (DocumentEntry) dir.getEntry(SummaryInformation.DEFAULT_STREAM_NAME); final PropertySet ps = new PropertySet(new DocumentInputStream(siEntry)); final SummaryInformation si = new SummaryInformation(ps); setAuthor(si.getAuthor()); setPublicationDate(si.getCreateDateTime()); // docx files } else if (m_docPath.endsWith(".docx") || m_docPath.endsWith(".docm")) { hdoc2 = new XWPFDocument(is); final List<XWPFParagraph> paragraphs = hdoc2.getParagraphs(); for (final XWPFParagraph paragraph : paragraphs) { final String text = paragraph.getText(); if (!onlyWhitepscaes(text)) { m_currentDoc.addParagraph(text); } } 
setAuthor(hdoc2.getProperties().getCoreProperties().getCreator()); setPublicationDate(hdoc2.getProperties().getCoreProperties().getCreated()); } m_currentDoc.createNewSection(SectionAnnotation.CHAPTER); // find title String title = null; if (m_filenameAsTitle) { title = m_docPath.trim(); } else { final List<Section> sections = m_currentDoc.getSections(); if (sections.size() > 0) { try { title = sections.get(0).getParagraphs().get(0).getSentences().get(0).getText().trim(); } catch (IndexOutOfBoundsException e) { LOGGER.debug("Parsed word document " + m_docPath + " is empty."); title = ""; } } } if (!checkTitle(title)) { title = m_docPath.toString(); } m_currentDoc.addTitle(title); return m_currentDoc.createDocument(); } finally { is.close(); if (poifs != null) { poifs.close(); } if (hdoc != null) { hdoc.close(); } if (hdoc2 != null) { hdoc2.close(); } if (extractor != null) { extractor.close(); } } }
From source file:org.luwrain.app.preview.FilterPoi.java
License:Open Source License
public void open(String fileName) throws Exception { File docFile = new File(fileName); FileInputStream finStream = new FileInputStream(docFile.getAbsolutePath()); HWPFDocument doc = new HWPFDocument(finStream); WordExtractor wordExtract = new WordExtractor(doc); lines = wordExtract.getParagraphText(); finStream.close(); //closing fileinputstream this.fileName = fileName; }
From source file:org.modeshape.sequencer.msoffice.word.WordMetadataReader.java
License:Apache License
public static WordMetadata instance(InputStream stream) throws IOException { WordMetadata metadata = new WordMetadata(); List<WordMetadata.WordHeading> headings = new ArrayList<WordMetadata.WordHeading>(); HWPFDocument document = new HWPFDocument(stream); Range range = document.getRange();//w w w . j a va2 s . c o m StyleSheet stylesheet = document.getStyleSheet(); for (int i = 0; i < range.numParagraphs(); i++) { Paragraph paragraph = range.getParagraph(i); String styleName = stylesheet.getStyleDescription(paragraph.getStyleIndex()).getName(); if (styleName.startsWith(HEADER_PREFIX)) { String rawLevelNum = styleName.substring(HEADER_PREFIX.length() + 1).trim(); int levelNum = 0; try { levelNum = Integer.parseInt(rawLevelNum); } catch (NumberFormatException nfe) { log.debug("Could not parse heading level from: " + styleName); } String text = Paragraph.stripFields(paragraph.text()); if ('\r' == text.charAt(text.length() - 1)) { text = text.substring(0, text.length() - 1); } headings.add(new WordMetadata.WordHeading(text, levelNum)); } } metadata.setHeadings(headings); metadata.setMetadata(document.getSummaryInformation()); return metadata; }
From source file:org.nuxeo.ecm.platform.template.tests.TestOOoConvert.java
License:Apache License
@Test public void testOfficeConverter5() throws Exception { ConversionService cs = Framework.getService(ConversionService.class); BlobHolder bh = getBlobFromPath("data/testMe.html", "text/html"); String converterName = cs.getConverterName(bh.getBlob().getMimeType(), "application/msword"); assertEquals("any2doc", converterName); boolean isAvailable = cs.isConverterAvailable(converterName).isAvailable(); assumeTrue(isAvailable);// w w w . j ava 2 s .co m BlobHolder result = cs.convert(converterName, bh, null); File docFile = Framework.createTempFile("docfile", "doc"); result.getBlob().transferTo(docFile); HWPFDocument doc = new HWPFDocument(new FileInputStream(docFile)); WordExtractor extractor = new WordExtractor(doc); String text = extractor.getText(); assertTrue(text.length() > 0); assertTrue(text.contains("Titre 1")); docFile.delete(); }
From source file:org.nuxeo.typeDocPkg.WordDoc.java
License:Apache License
private HWPFDocument getHWPFDocument(String filename) { POIFSFileSystem fs = null;//from w w w. j a va 2 s. c o m try { fs = new POIFSFileSystem(new FileInputStream(filename)); return new HWPFDocument(fs); } catch (Exception e) { log.error("Error during the getHWPFDocument method: ", e); return null; } }
From source file:org.opencrx.kernel.text.WordToText.java
License:BSD License
/**
 * Gets the text from a Word document by concatenating the texts of all its
 * paragraphs.
 *
 * @param in The InputStream representing the Word file.
 * @return a reader over the concatenated paragraph text
 * @throws ServiceException wrapping any failure while reading the document
 */
public Reader parse(InputStream in) throws ServiceException {
    try {
        HWPFDocument wordDocument = new HWPFDocument(HWPFDocument.verifyAndBuildPOIFS(in));
        String[] paragraphTexts = this.getParagraphText(wordDocument);

        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < paragraphTexts.length; i++) {
            joined.append(paragraphTexts[i]);
        }

        return new StringReader(joined.toString());
    } catch (Exception e) {
        throw new ServiceException(e);
    }
}