Example usage for the org.apache.poi.hwpf.HWPFDocument constructor

Introduction

On this page you can find example usages of the constructor of org.apache.poi.hwpf.HWPFDocument.

Prototype

public HWPFDocument(DirectoryNode directory) throws IOException

Source Link

Document

This constructor loads a Word document from a specific point (directory node) in a POIFSFileSystem, which is probably not the default (root) location.

Usage

From source file:org.dspace.submit.step.UploadStep.java

License:BSD License

/**
 * Process the upload of a new file!
 * <p>
 * Every request attribute whose name ends in "-path" is treated as one newly
 * uploaded file. For each such file the upload is buffered in memory, a
 * bitstream is created in the item's ORIGINAL bundle, a short text excerpt is
 * stored as "dc.textpart" metadata when text can be extracted (PDF, plain
 * text, Word .doc and .docx), the bitstream format is identified and, if
 * configured, a virus scan is run.
 * <p>
 * Text extraction is best effort: a file that cannot be parsed is logged and
 * uploaded without an excerpt. Database and authorization errors raised while
 * storing the excerpt are propagated to the caller.
 * 
 * @param context
 *            current DSpace context
 * @param request
 *            current servlet request object
 * @param response
 *            current servlet response object
 * @param subInfo
 *            submission info object
 * 
 * @return Status or error flag which will be processed by
 *         UI-related code! (if STATUS_COMPLETE or 0 is returned,
 *         no errors occurred!)
 */
public int processUploadFile(Context context, HttpServletRequest request, HttpServletResponse response,
        SubmissionInfo subInfo) throws ServletException, IOException, SQLException, AuthorizeException {
    final String pathSuffix = "-path";
    BitstreamFormat bf = null;
    Bitstream b = null;

    //NOTE: File should already be uploaded. 
    //Manakin does this automatically via Cocoon.
    //For JSP-UI, the SubmissionController.uploadFiles() does the actual upload

    Enumeration<?> attNames = request.getAttributeNames();

    //loop through our request attributes
    while (attNames.hasMoreElements()) {
        String attr = (String) attNames.nextElement();

        //only attributes ending with "-path" represent a newly uploaded file
        if (!attr.endsWith(pathSuffix)) {
            continue;
        }

        //strip off only the trailing "-path" to get the actual parameter
        //that the file was uploaded as
        String param = attr.substring(0, attr.length() - pathSuffix.length());

        // NOTE(review): the "extension" is taken from the last three characters
        // of the upload parameter name, exactly as before - confirm that the
        // parameter name really carries the file name in this deployment.
        String exten = param.length() >= 3 ? param.substring(param.length() - 3) : "";

        // Load the file's path and input stream and description
        String filePath = (String) request.getAttribute(attr);
        InputStream uploadedStream = (InputStream) request.getAttribute(param + "-inputstream");

        //attempt to get description from attribute first, then direct from a parameter
        String fileDescription = (String) request.getAttribute(param + "-description");
        if (fileDescription == null || fileDescription.length() == 0) {
            fileDescription = request.getParameter("description");
        }

        // if information wasn't passed by User Interface, we had a problem
        // with the upload (checked before the stream is touched, so a missing
        // stream yields an error status instead of a NullPointerException)
        if (filePath == null || uploadedStream == null) {
            return STATUS_UPLOAD_ERROR;
        }

        if (subInfo == null) {
            // In any event, if we don't have the submission info, the request
            // was malformed
            return STATUS_INTEGRITY_ERROR;
        }

        // Buffer the upload once; it is needed for the bitstream and for
        // text extraction.
        byte[] content = readUploadFully(uploadedStream);

        // Create the bitstream
        Item item = subInfo.getSubmissionItem().getItem();

        // do we already have a bundle?
        Bundle[] bundles = item.getBundles("ORIGINAL");

        InputStream bitstreamSource = new ByteArrayInputStream(content);
        if (bundles.length < 1) {
            // set bundle's name to ORIGINAL
            b = item.createSingleBitstream(bitstreamSource, "ORIGINAL");
        } else {
            // we have a bundle already, just add bitstream
            b = bundles[0].createBitstream(bitstreamSource);
        }

        log.debug("Upload parameter suffix used as file extension: " + exten);

        // Store a short excerpt of the document text, when one can be extracted
        String excerpt = buildTextExcerpt(exten, content);
        if (excerpt != null) {
            item.addMetadata("dc", "textpart", null, null, excerpt + "...");
            item.update();
            context.commit();
        }

        // Strip all but the last filename. It would be nice
        // to know which OS the file came from.
        String noPath = filePath.substring(filePath.lastIndexOf('/') + 1);
        noPath = noPath.substring(noPath.lastIndexOf('\\') + 1);

        b.setName(noPath);
        b.setSource(filePath);
        b.setDescription(fileDescription);

        // Identify the format
        bf = FormatIdentifier.guessFormat(context, b);
        b.setFormat(bf);

        // Update to DB
        b.update();
        item.update();

        if ((bf != null) && (bf.isInternal())) {
            log.warn("Attempt to upload file format marked as internal system use only");
            backoutBitstream(subInfo, b, item);
            return STATUS_UPLOAD_ERROR;
        }

        // Check for virus
        if (ConfigurationManager.getBooleanProperty("submission-curation", "virus-scan")) {
            Curator curator = new Curator();
            curator.addTask("vscan").curate(item);
            int status = curator.getStatus("vscan");
            if (status == Curator.CURATE_ERROR) {
                backoutBitstream(subInfo, b, item);
                return STATUS_VIRUS_CHECKER_UNAVAILABLE;
            } else if (status == Curator.CURATE_FAIL) {
                backoutBitstream(subInfo, b, item);
                return STATUS_CONTAINS_VIRUS;
            }
        }

        // If we got this far then everything is more or less ok.

        // Comment - not sure if this is the right place for a commit here
        // but I'm not brave enough to remove it - Robin.
        context.commit();

        // save this bitstream to the submission info, as the
        // bitstream we're currently working with
        subInfo.setBitstream(b);

        //if format was not identified
        if (bf == null) {
            return STATUS_UNKNOWN_FORMAT;
        }
    } //end while

    return STATUS_COMPLETE;

}

/**
 * Reads the given stream to its end into a byte array and closes it.
 */
private static byte[] readUploadFully(InputStream in) throws IOException {
    try {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int read;
        while ((read = in.read(chunk)) >= 0) {
            buffer.write(chunk, 0, read);
        }
        return buffer.toByteArray();
    } finally {
        in.close();
    }
}

/**
 * Builds the excerpt stored as "dc.textpart": up to 500 characters starting
 * at the midpoint of the extracted text. For PDF files everything up to and
 * including the first '.' of that window is dropped, as before.
 * 
 * @return the excerpt, or null when no text could be extracted
 */
private String buildTextExcerpt(String exten, byte[] content) {
    String text;
    try {
        text = extractUploadText(exten, content);
    } catch (Exception e) {
        // best effort: an unparsable document must not abort the upload
        log.warn("Could not extract text for the dc.textpart excerpt (extension '" + exten + "')", e);
        return null;
    }

    if (text == null || text.length() == 0) {
        return null;
    }

    int start = text.length() / 2;
    int end = Math.min(text.length(), start + 500);
    String excerpt = text.substring(start, end);

    if ("pdf".equalsIgnoreCase(exten)) {
        // indexOf returns -1 when there is no '.', which keeps the whole window
        excerpt = excerpt.substring(excerpt.indexOf('.') + 1);
    }
    return excerpt;
}

/**
 * Extracts the plain text of the uploaded content according to its
 * three-letter extension ("ocx" matches the tail of ".docx").
 * 
 * @return the extracted text, or null for an unsupported extension
 */
private static String extractUploadText(String exten, byte[] content) throws IOException {
    if ("pdf".equalsIgnoreCase(exten)) {
        PDDocument pdf = null;
        try {
            PDFParser parser = new PDFParser(new ByteArrayInputStream(content));
            parser.parse();
            pdf = new PDDocument(parser.getDocument());
            return new PDFTextStripper().getText(pdf);
        } finally {
            if (pdf != null) {
                pdf.close();
            }
        }
    }

    if ("txt".equalsIgnoreCase(exten)) {
        // Treat the file as UTF-8 only when it carries a byte order mark;
        // otherwise fall back to Windows-1252 ("ANSI").
        String utf8 = new String(content, "UTF-8");
        if (utf8.startsWith("\uFEFF")) {
            return utf8;
        }
        return new String(content, "Cp1252");
    }

    if ("doc".equalsIgnoreCase(exten)) {
        HWPFDocument document = new HWPFDocument(new ByteArrayInputStream(content));
        return new WordExtractor(document).getText();
    }

    if ("ocx".equalsIgnoreCase(exten)) {
        XWPFDocument document = new XWPFDocument(new ByteArrayInputStream(content));
        return new XWPFWordExtractor(document).getText();
    }

    return null;
}

From source file:org.esmerilprogramming.pdfcake.DocumentReplace.java

License:Open Source License

/**
 * Changes the real document file.
 * <p>
 * Builds an {@code HWPFDocument} from the stream, passes it through
 * {@code replaceKeys} together with the template, and stores the result of
 * {@code convertToPdfByteArray} on the template via {@code setContentBytes}.
 * 
 * @param in
 *            InputStream the Word document is read from
 * @param template
 *            DocumentTemplate handed to {@code replaceKeys} and updated with
 *            the converted bytes
 * @throws Exception
 *             if reading, key replacement or conversion fails
 */
public static void changeDocFile(InputStream in, DocumentTemplate template) throws Exception {
    final HWPFDocument replaced = replaceKeys(new HWPFDocument(in), template);
    final ByteArrayInputStream replacedBytes = getDocumentAsByteArrayIS(replaced);
    template.setContentBytes(convertToPdfByteArray(replacedBytes));
}

From source file:org.exoplatform.services.document.impl.MSWordDocumentReader.java

License:Open Source License

/**
 * Returns only a text from .doc file content.
 * <p>
 * The stream is always closed before this method returns; a failure to
 * close it is only logged at trace level.
 * 
 * @param is an input stream with .doc file content.
 * @return The trimmed text of the document range, or an empty string when
 *         the stream reports no available bytes.
 * @throws IllegalArgumentException if the stream is null.
 * @throws DocumentReadException if the document cannot be opened.
 */
public String getContentAsText(final InputStream is) throws IOException, DocumentReadException {
    if (is == null) {
        throw new IllegalArgumentException("InputStream is null.");
    }

    try {
        if (is.available() == 0) {
            return "";
        }

        final HWPFDocument wordDocument;
        try {
            // the document is opened inside a privileged action
            wordDocument = SecurityHelper
                    .doPrivilegedIOExceptionAction(new PrivilegedExceptionAction<HWPFDocument>() {
                        public HWPFDocument run() throws Exception {
                            return new HWPFDocument(is);
                        }
                    });
        } catch (IOException e) {
            throw new DocumentReadException("Can't open document.", e);
        }

        return wordDocument.getRange().text().trim();
    } finally {
        try {
            is.close();
        } catch (IOException e) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("An exception occurred: " + e.getMessage());
            }
        }
    }
}

From source file:org.infoglue.cms.controllers.kernel.impl.simple.LuceneController.java

License:Open Source License

/**
 * Extracts the plain text of a digital asset so that it can be indexed.
 * <p>
 * Only "application/pdf" (unencrypted) and "application/msword" assets are
 * handled. Extraction failures are logged as warnings and never propagated.
 * 
 * @param digitalAssetVO the asset whose content type selects the extractor
 * @param file the file holding the asset's content
 * @return the extracted text, or an empty string when the content type is
 *         missing or unsupported, the PDF is encrypted, or extraction fails
 */
private String extractTextToIndex(DigitalAssetVO digitalAssetVO, File file) {
    String text = "";
    String contentType = digitalAssetVO.getAssetContentType();

    if (logger.isInfoEnabled())
        logger.info("Asset content type:" + contentType);

    // literal-first comparison: an asset without a content type is simply not indexed
    if ("application/pdf".equalsIgnoreCase(contentType)) {
        try {
            PDDocument document = null;
            try {
                document = PDDocument.load(file);

                if (!document.isEncrypted()) {
                    // getText collects the complete output in memory; the former
                    // writer/byte-buffer pair was read before the writer had been
                    // flushed, which could truncate the text.
                    PDFTextStripper stripper = new PDFTextStripper();
                    text = stripper.getText(document);
                    if (logger.isInfoEnabled())
                        logger.info("PDF Document has " + text.length() + " chars\n\n" + text);
                }
            } catch (Exception e) {
                logger.warn("Error indexing file: " + file + "\nMessage: " + e.getMessage());
            } finally {
                if (document != null) {
                    document.close();
                }
            }
        } catch (Exception e) {
            // reached only when closing the document fails
            logger.warn("Error indexing:" + e.getMessage());
        }
    } else if ("application/msword".equalsIgnoreCase(contentType)) {
        FileInputStream in = null;
        try {
            in = new FileInputStream(file);
            POIFSFileSystem fs = new POIFSFileSystem(in);

            // Create a document for this file
            HWPFDocument doc = new HWPFDocument(fs);

            // Create a WordExtractor to read the text of the word document
            WordExtractor we = new WordExtractor(doc);

            // Extract all paragraphs in the document as strings
            text = we.getText();

            // Output the document
            if (logger.isInfoEnabled())
                logger.info("Word Document has " + text.length() + " chars\n\n" + text);
        } catch (Exception e) {
            logger.warn("Error indexing file: " + file + "\nMessage: " + e.getMessage());
        } finally {
            // NOTE(review): POIFSFileSystem is expected to close the stream itself;
            // this explicit close is a harmless safety net for the failure paths.
            if (in != null) {
                try {
                    in.close();
                } catch (Exception ignored) {
                    // nothing useful can be done when closing a read-only stream fails
                }
            }
        }
    }

    return text;
}

From source file:org.knime.ext.textprocessing.nodes.source.parser.word.WordDocumentParser.java

License:Open Source License

/**
 * Parses the Word document provided by the given stream into a
 * {@code Document}.
 * <p>
 * Paths ending in ".doc" are read with HWPF, paths ending in ".docx" or
 * ".docm" with XWPF. Paragraphs that are not whitespace-only are added to the
 * document builder, author and creation date are taken from the document
 * properties, and the title is either the file path or the first sentence of
 * the first paragraph. The stream and all opened POI objects are closed
 * before returning.
 */
private Document parseInternal(final InputStream is) throws Exception {
    m_currentDoc = new DocumentBuilder(m_tokenizerName);
    m_currentDoc.setDocumentFile(new File(m_docPath));
    m_currentDoc.setDocumentType(m_type);
    m_currentDoc.addDocumentCategory(m_category);
    m_currentDoc.addDocumentSource(m_source);

    POIFSFileSystem fileSystem = null;
    HWPFDocument binaryDoc = null;
    XWPFDocument ooxmlDoc = null;
    WordExtractor textExtractor = null;

    try {
        if (m_docPath.endsWith(".doc")) {
            // doc files: buffer the whole stream, because the content is read twice
            final ByteArrayOutputStream buffered = new ByteArrayOutputStream();
            final byte[] chunk = new byte[1024];
            for (int read = is.read(chunk); read >= 0; read = is.read(chunk)) {
                buffered.write(chunk, 0, read);
            }
            final byte[] bytes = buffered.toByteArray();

            // first pass over the buffered bytes: paragraph text
            binaryDoc = new HWPFDocument(new ByteArrayInputStream(bytes));
            textExtractor = new WordExtractor(binaryDoc);
            for (final String rawParagraph : textExtractor.getParagraphText()) {
                final String paragraph = rawParagraph.trim();
                if (!onlyWhitepscaes(paragraph)) {
                    m_currentDoc.addParagraph(paragraph);
                }
            }

            // second pass over the buffered bytes: summary information (meta info)
            fileSystem = new POIFSFileSystem(new ByteArrayInputStream(bytes));
            final DirectoryEntry root = fileSystem.getRoot();
            final DocumentEntry summaryEntry =
                (DocumentEntry) root.getEntry(SummaryInformation.DEFAULT_STREAM_NAME);
            final SummaryInformation summary =
                new SummaryInformation(new PropertySet(new DocumentInputStream(summaryEntry)));

            setAuthor(summary.getAuthor());
            setPublicationDate(summary.getCreateDateTime());
        } else if (m_docPath.endsWith(".docx") || m_docPath.endsWith(".docm")) {
            // docx files
            ooxmlDoc = new XWPFDocument(is);
            for (final XWPFParagraph paragraph : ooxmlDoc.getParagraphs()) {
                final String paragraphText = paragraph.getText();
                if (!onlyWhitepscaes(paragraphText)) {
                    m_currentDoc.addParagraph(paragraphText);
                }
            }

            setAuthor(ooxmlDoc.getProperties().getCoreProperties().getCreator());
            setPublicationDate(ooxmlDoc.getProperties().getCoreProperties().getCreated());
        }

        m_currentDoc.createNewSection(SectionAnnotation.CHAPTER);

        // find title: either the file path or the first sentence of the document
        String title = null;
        if (!m_filenameAsTitle) {
            final List<Section> sections = m_currentDoc.getSections();
            if (!sections.isEmpty()) {
                try {
                    title = sections.get(0).getParagraphs().get(0).getSentences().get(0).getText().trim();
                } catch (IndexOutOfBoundsException e) {
                    LOGGER.debug("Parsed word document " + m_docPath + " is empty.");
                    title = "";
                }
            }
        } else {
            title = m_docPath.trim();
        }

        // fall back to the path when the candidate title is rejected
        if (!checkTitle(title)) {
            title = m_docPath.toString();
        }
        m_currentDoc.addTitle(title);

        return m_currentDoc.createDocument();
    } finally {
        is.close();
        if (fileSystem != null) {
            fileSystem.close();
        }
        if (binaryDoc != null) {
            binaryDoc.close();
        }
        if (ooxmlDoc != null) {
            ooxmlDoc.close();
        }
        if (textExtractor != null) {
            textExtractor.close();
        }
    }
}

From source file:org.luwrain.app.preview.FilterPoi.java

License:Open Source License

/**
 * Opens the given Word (.doc) file and stores its paragraph texts in
 * {@code lines}; on success the file name is remembered in {@code fileName}.
 * <p>
 * The file stream is closed even when the document cannot be parsed.
 * 
 * @param fileName path of the Word file to open
 * @throws Exception if the file cannot be read or parsed
 */
public void open(String fileName) throws Exception {
    File docFile = new File(fileName);
    FileInputStream finStream = new FileInputStream(docFile.getAbsolutePath());
    try {
        HWPFDocument doc = new HWPFDocument(finStream);
        WordExtractor wordExtract = new WordExtractor(doc);
        lines = wordExtract.getParagraphText();
    } finally {
        // close on every path; previously a parse failure leaked the stream
        finStream.close();
    }
    this.fileName = fileName;
}

From source file:org.modeshape.sequencer.msoffice.word.WordMetadataReader.java

License:Apache License

/**
 * Reads a Word document from the stream and collects its headings and
 * summary information.
 * <p>
 * A paragraph counts as a heading when its style name starts with
 * {@code HEADER_PREFIX}; the heading level is parsed from the part of the
 * style name that follows the prefix and one separator character. When no
 * level can be parsed the heading is recorded with level 0.
 * 
 * @param stream the stream the Word document is read from
 * @return the metadata holding the headings and the summary information
 * @throws IOException if the document cannot be read
 */
public static WordMetadata instance(InputStream stream) throws IOException {
    WordMetadata metadata = new WordMetadata();
    List<WordMetadata.WordHeading> headings = new ArrayList<WordMetadata.WordHeading>();

    HWPFDocument document = new HWPFDocument(stream);
    Range range = document.getRange();

    StyleSheet stylesheet = document.getStyleSheet();

    // the level number starts after the prefix plus one separator character
    final int levelStart = HEADER_PREFIX.length() + 1;

    for (int i = 0; i < range.numParagraphs(); i++) {
        Paragraph paragraph = range.getParagraph(i);

        String styleName = stylesheet.getStyleDescription(paragraph.getStyleIndex()).getName();

        // unnamed styles and non-heading styles are skipped
        if (styleName == null || !styleName.startsWith(HEADER_PREFIX)) {
            continue;
        }

        // a style name that is exactly the prefix has no level part; treat it
        // like an unparsable level instead of failing on substring()
        String rawLevelNum = styleName.length() > levelStart ? styleName.substring(levelStart).trim() : "";
        int levelNum = 0;

        try {
            levelNum = Integer.parseInt(rawLevelNum);
        } catch (NumberFormatException nfe) {
            log.debug("Could not parse heading level from: " + styleName);
        }

        String text = Paragraph.stripFields(paragraph.text());

        // drop the trailing paragraph mark; endsWith is safe for empty text
        if (text.endsWith("\r")) {
            text = text.substring(0, text.length() - 1);
        }

        headings.add(new WordMetadata.WordHeading(text, levelNum));
    }

    metadata.setHeadings(headings);
    metadata.setMetadata(document.getSummaryInformation());
    return metadata;
}

From source file:org.nuxeo.ecm.platform.template.tests.TestOOoConvert.java

License:Apache License

/**
 * Converts an HTML blob to a Word document with the "any2doc" converter and
 * checks that the resulting .doc file contains the expected text. The test
 * is skipped when the converter is not available. The temporary file and its
 * stream are cleaned up even when an assertion fails.
 */
@Test
public void testOfficeConverter5() throws Exception {
    ConversionService cs = Framework.getService(ConversionService.class);

    BlobHolder bh = getBlobFromPath("data/testMe.html", "text/html");
    String converterName = cs.getConverterName(bh.getBlob().getMimeType(), "application/msword");
    assertEquals("any2doc", converterName);

    boolean isAvailable = cs.isConverterAvailable(converterName).isAvailable();
    assumeTrue(isAvailable);

    BlobHolder result = cs.convert(converterName, bh, null);
    File docFile = Framework.createTempFile("docfile", "doc");
    try {
        result.getBlob().transferTo(docFile);

        FileInputStream in = new FileInputStream(docFile);
        try {
            HWPFDocument doc = new HWPFDocument(in);
            WordExtractor extractor = new WordExtractor(doc);

            String text = extractor.getText();
            assertTrue(text.length() > 0);
            assertTrue(text.contains("Titre 1"));
        } finally {
            in.close();
        }
    } finally {
        // always remove the temporary file, also when the test fails
        docFile.delete();
    }
}

From source file:org.nuxeo.typeDocPkg.WordDoc.java

License:Apache License

/**
 * Opens the file with the given name as an {@code HWPFDocument}.
 * 
 * @param filename path of the Word file to open
 * @return the opened document, or null when any exception occurs (the
 *         exception is logged as an error)
 */
private HWPFDocument getHWPFDocument(String filename) {
    try {
        final POIFSFileSystem fileSystem = new POIFSFileSystem(new FileInputStream(filename));
        return new HWPFDocument(fileSystem);
    } catch (Exception e) {
        log.error("Error during the getHWPFDocument method: ", e);
        return null;
    }
}

From source file:org.opencrx.kernel.text.WordToText.java

License:BSD License

/**
 * Gets the text from a Word document.
 * <p>
 * The paragraph texts returned by {@code getParagraphText} are concatenated
 * in order, without any separator.
 * 
 * @param in The InputStream representing the Word file.
 * @return a Reader over the concatenated paragraph text.
 * @throws ServiceException wrapping any exception raised while reading the
 *         document.
 */
public Reader parse(InputStream in) throws ServiceException {
    try {
        final HWPFDocument wordDocument = new HWPFDocument(HWPFDocument.verifyAndBuildPOIFS(in));
        final StringBuilder joined = new StringBuilder();
        final String[] paragraphTexts = this.getParagraphText(wordDocument);
        for (int i = 0; i < paragraphTexts.length; i++) {
            joined.append(paragraphTexts[i]);
        }
        return new StringReader(joined.toString());
    } catch (Exception e) {
        throw new ServiceException(e);
    }
}