Example usage of the org.apache.poi.xwpf.usermodel.XWPFDocument constructor XWPFDocument(InputStream)

Introduction

On this page you can find example usages of the org.apache.poi.xwpf.usermodel.XWPFDocument constructor XWPFDocument(InputStream).

Prototype

public XWPFDocument(InputStream is) throws IOException

Source Link

Usage

From source file:offishell.word.Word.java

License:MIT License

/**
 * Opens the Word document stored at the given path and records whether its body text uses the
 * top-to-bottom, right-to-left direction.
 *
 * @param path location of the document to load
 * @throws Error if nothing exists at {@code path}
 */
protected Word(Path path) {
    if (Files.notExists(path)) {
        throw new Error(" " + path.toAbsolutePath() + " ?????");
    }

    // The XWPFDocument constructor consumes the stream while it runs, so the stream can be
    // released as soon as the document has been built.
    try (java.io.InputStream in = Files.newInputStream(path)) {
        this.path = path;
        this.calculated = new XWPFDocument(in);

        // Section properties are optional in a document body; without them there is no
        // text direction to inspect.
        CTTextDirection direction = null;
        if (calculated.getDocument().getBody().getSectPr() != null) {
            direction = calculated.getDocument().getBody().getSectPr().getTextDirection();
        }

        this.textIsVerticalAlign = direction != null && direction.getVal() == STTextDirection.TB_RL;
    } catch (IOException e) {
        throw I.quiet(e);
    }
}

From source file:orcamentotraducao.OrcamentoTraducao.java

/**
 * Asks for a Word file name on standard input, extracts the document text and prints the
 * number of words, characters and lines followed by the estimated quote.
 *
 * @param args the command line arguments (not used)
 */
public static void main(String[] args) {
    Scanner scan = new Scanner(System.in);
    System.out.println("Informe o nome do arquivo:");
    String filename = scan.nextLine();

    // The format is recognised by the last three characters of the name ("ocx" for .docx,
    // "doc" for .doc). A name shorter than three characters cannot carry either suffix.
    String typeFile = filename.length() >= 3 ? filename.substring(filename.length() - 3) : "";
    boolean isDocx = typeFile.equals("ocx");
    boolean isDoc = typeFile.equals("doc");
    if (!isDocx && !isDoc) {
        System.out.println("Este formato de arquivo no  suportado\n");
        System.exit(0);
    }

    File file = new File(filename);
    // try-with-resources releases the file on every path, including the .doc branch and failures.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        String allText;
        int lines;

        if (isDocx) {
            XWPFDocument document = new XWPFDocument(fis);
            StringBuilder text = new StringBuilder();
            int paragraphCount = 0;
            for (XWPFParagraph para : document.getParagraphs()) {
                text.append(para.getText()).append(' ');
                paragraphCount++;
            }
            allText = text.toString();
            lines = paragraphCount;
        } else {
            WordExtractor extractor = new WordExtractor(new HWPFDocument(fis));
            allText = extractor.getText();
            // Count paragraphs here as well so .doc files are not always reported with 0 lines.
            lines = extractor.getParagraphText().length;
        }

        String[] allTextExploded = allText.split(" ");
        int words = allTextExploded.length;
        int characters = allText.length();

        System.out.println("H " + words + " palavras");
        System.out.println("H " + characters + " caracteres");
        System.out.println("H " + lines + " linhas");
        System.out.println("O oramento estimado  de R$" + calculate(characters, words, lines));

    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:org.articleEditor.insertContent.POIDocxReader.java

License:Apache License

/**
 * Reads content of specified stream to the document.
 *
 * @param in stream the Word document is parsed from.
 * @param offset value stored in {@code currentOffset} after the body elements have been
 *        iterated.
 * @throws IOException declared because the {@code XWPFDocument(InputStream)} constructor
 *         throws it.
 * @throws BadLocationException declared by this method; NOTE(review): presumably raised while
 *         iterating the body elements - confirm against iteratePart.
 */
public void read(InputStream in, int offset) throws IOException, BadLocationException {
    // Parse the stream and keep the resulting POI document in the poiDocument field.
    poiDocument = new XWPFDocument(in);
    // Hand the body elements of the parsed document to iteratePart.
    iteratePart(poiDocument.getBodyElements());
    // NOTE(review): currentOffset is only assigned after iteratePart has run. If iteratePart is
    // meant to start inserting at `offset`, this assignment probably belongs before that call -
    // confirm before changing the order.
    this.currentOffset = offset;
    // Publish the parsed POI document as the "XWPFDocument" property of the target document.
    document.putProperty("XWPFDocument", poiDocument);
}

From source file:org.ArticleEditor.OptionsView.MenuOptionsTopComponent.java

/**
 * Loads the primary file of the given data object as a Word document.
 *
 * @param dataObject data object whose primary file is parsed
 * @return the parsed document
 * @throws FileNotFoundException if the primary file has no counterpart on disk or cannot be
 *         opened
 * @throws IOException if the file cannot be read or parsed
 */
public XWPFDocument getDocument(DataObject dataObject) throws FileNotFoundException, IOException {
    org.openide.filesystems.FileObject documentFileObject = dataObject.getPrimaryFile();
    File documentFile = FileUtil.toFile(documentFileObject);
    if (documentFile == null) {
        // FileUtil.toFile yields null for file objects that are not backed by a disk file.
        throw new FileNotFoundException("No file on disk backs " + documentFileObject);
    }
    // The constructor reads the stream while it runs, so the file handle can be released as
    // soon as the document has been built.
    try (FileInputStream docxIS = new FileInputStream(documentFile)) {
        return new XWPFDocument(docxIS);
    }
}

From source file:org.crypto.sse.TextExtractPar.java

License:Open Source License

/**
 * Extracts the text of every given file, tokenizes it and builds two multimaps: one from each
 * keyword to the names of the files containing it and one from each file name to its keywords.
 * Every pair is stored at most once.
 *
 * <p>Supported formats are selected by file-name suffix: .docx, .pptx, .xlsx, .doc and .pdf are
 * parsed, media files (.gif, .jpeg, .wmv, .mpeg, .mp4) contribute only their file name, and
 * anything else is read as UTF-8 text. A file that cannot be parsed is reported on standard
 * output and contributes no keywords.
 *
 * @param listOfFile files to process
 * @return a {@code TextExtractPar} built from the keyword-to-files and file-to-keywords multimaps
 * @throws FileNotFoundException if one of the files cannot be opened
 */
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException {

    Multimap<String, String> lookup1 = ArrayListMultimap.create();
    Multimap<String, String> lookup2 = ArrayListMultimap.create();

    for (File file : listOfFile) {

        // Print a progress line when the running counter reaches a percentage boundary.
        for (int j = 0; j < 100; j++) {

            if (counter == (int) ((j + 1) * listOfFile.length / 100)) {
                System.out.println("Number of files read equals " + j + " %");
                break;
            }
        }

        List<String> lines = new ArrayList<String>();
        counter++;
        String fileName = file.getName();
        FileInputStream fis = new FileInputStream(file);

        try {

            ///////////////////// .docx /////////////////////////////

            if (fileName.endsWith(".docx")) {
                try {
                    XWPFDocument doc = new XWPFDocument(fis);
                    XWPFWordExtractor ex = new XWPFWordExtractor(doc);
                    lines.add(ex.getText());
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                }

            }

            ///////////////////// .pptx /////////////////////////////

            else if (fileName.endsWith(".pptx")) {
                try {
                    OPCPackage ppt = OPCPackage.open(fis);
                    XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(ppt);
                    lines.add(xw.getText());
                } catch (XmlException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (OpenXML4JException e) {
                    System.out.println("File not read: " + file.getName());
                }

            }

            ///////////////////// .xlsx /////////////////////////////

            else if (fileName.endsWith(".xlsx")) {
                try {
                    OPCPackage xls = OPCPackage.open(fis);
                    XSSFExcelExtractor xe = new XSSFExcelExtractor(xls);
                    lines.add(xe.getText());
                } catch (InvalidFormatException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (XmlException e) {
                    System.out.println("File not read: " + file.getName());
                } catch (OpenXML4JException e) {
                    System.out.println("File not read: " + file.getName());
                }

            }

            ///////////////////// .doc /////////////////////////////

            else if (fileName.endsWith(".doc")) {
                try {
                    NPOIFSFileSystem fs = new NPOIFSFileSystem(file);
                    try {
                        WordExtractor extractor = new WordExtractor(fs.getRoot());
                        for (String rawText : extractor.getParagraphText()) {
                            lines.add(extractor.stripFields(rawText));
                        }
                    } finally {
                        // The file system was opened on the file itself, so release it here.
                        fs.close();
                    }
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                }

            }

            ///////////////////// .pdf /////////////////////////////

            else if (fileName.endsWith(".pdf")) {
                try {
                    PDFParser parser = new PDFParser(fis);
                    parser.parse();
                    PDDocument pdf = new PDDocument(parser.getDocument());
                    try {
                        lines.add(new PDFTextStripper().getText(pdf));
                    } finally {
                        pdf.close();
                    }
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                }

            }

            ///////////////////// Media files such as .gif, .jpeg, .wmv, .mpeg, .mp4 ///////////

            // A name can only end with ONE of these suffixes, so the checks are OR-ed; joined
            // with AND the branch could never be taken and media files were parsed as text.
            else if (fileName.endsWith(".gif") || fileName.endsWith(".jpeg")
                    || fileName.endsWith(".wmv") || fileName.endsWith(".mpeg")
                    || fileName.endsWith(".mp4")) {

                lines.add(file.getName());

            }

            ///////////////////// raw text extensions /////////////////////////////

            else {
                try {
                    lines = Files.readLines(file, Charsets.UTF_8);
                } catch (IOException e) {
                    System.out.println("File not read: " + file.getName());
                }
            }

        } finally {
            // Release the file handle whichever branch ran; previously only the raw-text branch
            // closed it.
            try {
                fis.close();
            } catch (IOException ioex) {
                // Nothing useful can be done when closing a read-only stream fails.
            }
        }

        ///////////////////// Begin word extraction /////////////////////////////

        for (int i = 0; i < lines.size(); i++) {

            CharArraySet noise = EnglishAnalyzer.getDefaultStopSet();

            // We are using a standard tokenizer that eliminates the stop
            // words. We can use Stemming tokenizer such Porter
            // A set of English noise keywords is used that will eliminates
            // words such as "the, a, etc"

            Analyzer analyzer = new StandardAnalyzer(noise);
            List<String> token = Tokenizer.tokenizeString(analyzer, lines.get(i));
            for (int j = 0; j < token.size(); j++) {

                // Record each keyword only once per file
                if (!lookup2.get(file.getName()).contains(token.get(j))) {
                    lookup2.put(file.getName(), token.get(j));
                }

                // Record each file only once per keyword
                if (!lookup1.get(token.get(j)).contains(file.getName())) {
                    lookup1.put(token.get(j), file.getName());
                }

            }

        }

    }

    return new TextExtractPar(lookup1, lookup2);

}

From source file:org.dspace.submit.step.UploadStep.java

License:BSD License

/**
 * Process the upload of a new file!/*from w  ww  . j av  a2 s  .  co m*/
 * 
 * @param context
 *            current DSpace context
 * @param request
 *            current servlet request object
 * @param response
 *            current servlet response object
 * @param subInfo
 *            submission info object
 * 
 * @return Status or error flag which will be processed by
 *         UI-related code! (if STATUS_COMPLETE or 0 is returned,
 *         no errors occurred!)
 */
public int processUploadFile(Context context, HttpServletRequest request, HttpServletResponse response,
        SubmissionInfo subInfo) throws ServletException, IOException, SQLException, AuthorizeException {
    boolean formatKnown = true;
    boolean fileOK = false;
    BitstreamFormat bf = null;
    Bitstream b = null;

    //NOTE: File should already be uploaded. 
    //Manakin does this automatically via Cocoon.
    //For JSP-UI, the SubmissionController.uploadFiles() does the actual upload

    Enumeration attNames = request.getAttributeNames();

    //loop through our request attributes
    while (attNames.hasMoreElements()) {
        String attr = (String) attNames.nextElement();

        //if this ends with "-path", this attribute
        //represents a newly uploaded file
        if (attr.endsWith("-path")) {
            //strip off the -path to get the actual parameter 
            //that the file was uploaded as
            String param = attr.replace("-path", "");
            String exten = param.substring(param.length() - 3);
            // Load the file's path and input stream and description
            String filePath = (String) request.getAttribute(param + "-path");
            InputStream fileInputStreamTest = (InputStream) request.getAttribute(param + "-inputstream");

            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            byte[] buf = new byte[1024];
            int n = 0;
            while ((n = fileInputStreamTest.read(buf)) >= 0)
                baos.write(buf, 0, n);
            byte[] content = baos.toByteArray();

            InputStream fileInputStream = new ByteArrayInputStream(content);

            InputStream fileInputStreamPdf = new ByteArrayInputStream(content);

            InputStream ifAnsi = new ByteArrayInputStream(content);

            //InputStream fss = fileInputStream.cl

            //attempt to get description from attribute first, then direct from a parameter
            String fileDescription = (String) request.getAttribute(param + "-description");
            if (fileDescription == null || fileDescription.length() == 0) {
                fileDescription = request.getParameter("description");
            }

            // if information wasn't passed by User Interface, we had a problem
            // with the upload
            if (filePath == null || fileInputStream == null) {
                return STATUS_UPLOAD_ERROR;
            }

            if (subInfo == null) {
                // In any event, if we don't have the submission info, the request
                // was malformed
                return STATUS_INTEGRITY_ERROR;
            }

            // Create the bitstream
            Item item = subInfo.getSubmissionItem().getItem();

            // do we already have a bundle?
            Bundle[] bundles = item.getBundles("ORIGINAL");

            if (bundles.length < 1) {
                // set bundle's name to ORIGINAL
                b = item.createSingleBitstream(fileInputStream, "ORIGINAL");
            } else {
                // we have a bundle already, just add bitstream
                b = bundles[0].createBitstream(fileInputStream);
            }

            //fileDescription.op

            if (exten.toLowerCase().equals("pdf")) {
                try {
                    PDFTextStripper pdfStripper = null;
                    PDDocument docum = null;
                    PDFParser parser = new PDFParser(fileInputStreamPdf);
                    COSDocument cosDoc = null;

                    parser.parse();
                    cosDoc = parser.getDocument();
                    pdfStripper = new PDFTextStripper();
                    docum = new PDDocument(cosDoc);
                    //pdfStripper.getText(docum);

                    String parsedText = pdfStripper.getText(docum);
                    Integer fifty = (Integer) Math.round(parsedText.length() / 2);
                    if (fifty < 0) {
                        fifty = fifty * (-1);
                    }
                    Integer toCut = 500;
                    if ((parsedText.length() - fifty) < 500) {
                        toCut = parsedText.length();
                    }

                    log.info("FUCKTHISSHIT: " + fifty + " " + toCut);
                    String subText = parsedText.substring(fifty, fifty + toCut - 1);
                    try {
                        subText = subText.substring(subText.indexOf(".") + 1);
                    } catch (Exception e) {

                    }
                    item.addMetadata("dc", "textpart", null, null, subText + "...");
                    item.update();
                    context.commit();
                    log.info(parsedText);
                } catch (Exception e) {
                    log.info("omgerror: " + e.toString());
                }
            }

            if (exten.toLowerCase().equals("txt")) {
                StringWriter writer = new StringWriter();
                IOUtils.copy(fileInputStreamPdf, writer, "UTF-8");

                String theString = writer.toString();
                if (theString.startsWith("\uFEFF")) {

                } else {
                    StringWriter writerAnsi = new StringWriter();
                    IOUtils.copy(ifAnsi, writerAnsi, "Cp1252");
                    theString = writerAnsi.toString();
                }
                Integer fifty = (Integer) Math.round(theString.length() * (50 / 100.0f));
                Integer toCut = 500;
                if ((theString.length() - fifty) < 500) {
                    toCut = theString.length();
                }
                String subText = theString.substring(fifty, toCut - 1);
                item.addMetadata("dc", "textpart", null, null, subText + "...");
                item.update();
                context.commit();
                log.info(subText);
            }

            log.info("OMGTEST: " + exten);

            if (exten.toLowerCase().equals("doc")) {
                WordExtractor extractor = null;
                try {

                    HWPFDocument document = new HWPFDocument(fileInputStreamPdf);
                    extractor = new WordExtractor(document);
                    String fileData = extractor.getText();
                    Integer fifty = (Integer) Math.round(50 * 100 / fileData.length());
                    Integer toCut = 500;
                    if ((fileData.length() - fifty) < 500) {
                        toCut = fileData.length();
                    }
                    String subText = fileData.substring(fifty, toCut - 1);
                    item.addMetadata("dc", "textpart", null, null, subText + "...");
                    item.update();
                    context.commit();
                } catch (Exception exep) {
                    log.info("OMGTESTIK:" + exep);
                }
            }

            if ((exten.toLowerCase().equals("ocx"))) {
                XWPFDocument document = new XWPFDocument(fileInputStreamPdf);
                XWPFWordExtractor extractor = null;
                extractor = new XWPFWordExtractor(document);

                String text = extractor.getText();
                Integer fifty = (Integer) Math.round(50 * 100 / text.length());
                Integer toCut = 500;
                if ((text.length() - fifty) < 500) {
                    toCut = text.length();
                }
                String subText = text.substring(fifty, toCut - 1);
                item.addMetadata("dc", "textpart", null, null, subText + "...");
                item.update();
                context.commit();
            }

            // Strip all but the last filename. It would be nice
            // to know which OS the file came from.
            String noPath = filePath;

            while (noPath.indexOf('/') > -1) {
                noPath = noPath.substring(noPath.indexOf('/') + 1);
            }

            while (noPath.indexOf('\\') > -1) {
                noPath = noPath.substring(noPath.indexOf('\\') + 1);
            }

            b.setName(noPath);
            b.setSource(filePath);
            b.setDescription(fileDescription);

            // Identify the format
            bf = FormatIdentifier.guessFormat(context, b);
            b.setFormat(bf);

            // Update to DB
            b.update();
            item.update();

            if ((bf != null) && (bf.isInternal())) {
                log.warn("Attempt to upload file format marked as internal system use only");
                backoutBitstream(subInfo, b, item);
                return STATUS_UPLOAD_ERROR;
            }

            // Check for virus
            if (ConfigurationManager.getBooleanProperty("submission-curation", "virus-scan")) {
                Curator curator = new Curator();
                curator.addTask("vscan").curate(item);
                int status = curator.getStatus("vscan");
                if (status == Curator.CURATE_ERROR) {
                    backoutBitstream(subInfo, b, item);
                    return STATUS_VIRUS_CHECKER_UNAVAILABLE;
                } else if (status == Curator.CURATE_FAIL) {
                    backoutBitstream(subInfo, b, item);
                    return STATUS_CONTAINS_VIRUS;
                }
            }

            // If we got this far then everything is more or less ok.

            // Comment - not sure if this is the right place for a commit here
            // but I'm not brave enough to remove it - Robin.
            context.commit();

            // save this bitstream to the submission info, as the
            // bitstream we're currently working with
            subInfo.setBitstream(b);

            //if format was not identified
            if (bf == null) {
                return STATUS_UNKNOWN_FORMAT;
            }

        } //end if attribute ends with "-path"
    } //end while

    return STATUS_COMPLETE;

}

From source file:org.eclipse.sw360.licenseinfo.outputGenerators.DocxGenerator.java

License:Open Source License

/**
 * Renders the license information of a project into a docx file based on the template that
 * belongs to the configured output variant.
 *
 * @param projectLicenseInfoResults license information to render
 * @param projectName name of the project
 * @param projectVersion version of the project
 * @param licenseInfoHeaderText header text passed on to the document
 * @return the bytes of the generated docx file
 * @throws SW360Exception if the template cannot be loaded or the document cannot be generated
 * @throws IllegalArgumentException if the output variant is neither DISCLOSURE nor REPORT
 */
@Override
public byte[] generateOutputFile(Collection<LicenseInfoParsingResult> projectLicenseInfoResults,
        String projectName, String projectVersion, String licenseInfoHeaderText) throws SW360Exception {
    ByteArrayOutputStream docxOutputStream = new ByteArrayOutputStream();
    try {
        // The two variants differ only in the template and in the last fillDocument flag.
        final String templateFile;
        final boolean reportVariant;
        switch (getOutputVariant()) {
        case DISCLOSURE:
            templateFile = DOCX_TEMPLATE_FILE;
            reportVariant = false;
            break;
        case REPORT:
            templateFile = DOCX_TEMPLATE_REPORT_FILE;
            reportVariant = true;
            break;
        default:
            throw new IllegalArgumentException("Unknown generator variant type: " + getOutputVariant());
        }

        Optional<byte[]> docxTemplateFile = CommonUtils.loadResource(DocxGenerator.class, templateFile);
        // Check for presence BEFORE unwrapping: calling get() on an empty Optional fails with an
        // unrelated exception and made the dedicated error below unreachable.
        if (!docxTemplateFile.isPresent()) {
            throw new SW360Exception("Could not load the template for xwpf document: " + templateFile);
        }

        XWPFDocument xwpfDocument = new XWPFDocument(new ByteArrayInputStream(docxTemplateFile.get()));
        fillDocument(xwpfDocument, projectLicenseInfoResults, projectName, projectVersion,
                licenseInfoHeaderText, reportVariant);
        xwpfDocument.write(docxOutputStream);
        docxOutputStream.close();
    } catch (XmlException e) {
        throw new SW360Exception("Got XmlException while generating docx document: " + e.getMessage());
    } catch (IOException e) {
        throw new SW360Exception("Got IOException when generating docx document: " + e.getMessage());
    } catch (TException e) {
        throw new SW360Exception("Error reading sw360 licenses: " + e.getMessage());
    }
    return docxOutputStream.toByteArray();
}

From source file:org.encuestame.business.search.IndexerFile.java

License:Apache License

/**
 * Parse Word Document.
 * <p>
 * Parsing is best effort: a failure is logged and {@code null} is returned.
 *
 * @param file the Word file to parse
 * @return an extractor for the parsed document, or {@code null} if the file could not be parsed
 * @throws POIXMLException declared for callers; parse failures are logged, not rethrown
 * @throws Exception if the file cannot be opened or its stream cannot be closed
 */
public static XWPFWordExtractor parseWordDocument(final File file) throws POIXMLException, Exception {
    InputStream is = new FileInputStream(file);
    XWPFWordExtractor wde = null;
    try {
        XWPFDocument wd = new XWPFDocument(is);
        wde = new XWPFWordExtractor(wd);
        log.debug("Parse Word Document --------------------------> ");
    } catch (Exception e) {
        // Pass the exception itself as well so the stack trace is not lost.
        log.error("ERROR parse Word Document-------->" + e, e);
    } finally {
        // The document is read while it is constructed, so the returned extractor does not
        // need the stream any more.
        is.close();
    }
    return wde;
}

From source file:org.encuestame.business.search.SearchUtils.java

License:Apache License

/**
* Create Document Word.
* <p>
* Text extraction is best effort: when the file cannot be parsed the failure is logged and the
* fields are added with a {@code null} body text.
*
* @param file {@link File} the Word file to index
* @return {@link Document} produced by {@code SearchUtils.addFields} for the file and its text
* @throws POIXMLException declared for callers; parse failures are logged, not rethrown
* @throws Exception if the file cannot be opened or its stream cannot be closed
*/
public static Document createWordDocument(final File file) throws POIXMLException, Exception {
    InputStream is = new FileInputStream(file);
    String bodyText = null;
    try {
        XWPFDocument wd = new XWPFDocument(is);
        XWPFWordExtractor wde = new XWPFWordExtractor(wd);
        bodyText = wde.getText();
    } catch (Exception e) {
        log.debug(e);
    } finally {
        // Release the file handle whether or not parsing succeeded.
        is.close();
    }
    Document doc = SearchUtils.addFields(file, bodyText);
    return doc;
}

From source file:org.exoplatform.services.document.impl.MSXWordDocumentReader.java

License:Open Source License

/**
 * Extracts the plain text of a .docx document. The supplied stream is closed before the method
 * returns, whether or not extraction succeeds.
 * 
 * @param is an input stream with .docx file content.
 * @return the trimmed text of the document, or an empty string when the stream reports no
 *         available bytes.
 * @throws IllegalArgumentException if {@code is} is null.
 * @throws DocumentReadException if the document cannot be opened.
 */
public String getContentAsText(final InputStream is) throws IOException, DocumentReadException {
    if (is == null) {
        throw new IllegalArgumentException("InputStream is null.");
    }
    try {
        // Nothing to parse when the stream reports no available bytes.
        if (is.available() == 0) {
            return "";
        }

        final XWPFDocument parsed;
        try {
            PrivilegedExceptionAction<XWPFDocument> openDocument = new PrivilegedExceptionAction<XWPFDocument>() {
                public XWPFDocument run() throws Exception {
                    return new XWPFDocument(is);
                }
            };
            parsed = SecurityHelper.doPrivilegedIOExceptionAction(openDocument);
        } catch (IOException ioe) {
            throw new DocumentReadException("Can't open message.", ioe);
        } catch (OpenXML4JRuntimeException oxe) {
            throw new DocumentReadException("Can't open message.", oxe);
        }

        final XWPFWordExtractor wordExtractor = new XWPFWordExtractor(parsed);
        PrivilegedAction<String> readText = new PrivilegedAction<String>() {
            public String run() {
                return wordExtractor.getText();
            }
        };
        String extracted = SecurityHelper.doPrivilegedAction(readText);
        return extracted.trim();
    } finally {
        // Closing is best effort; a failure is only traced.
        try {
            is.close();
        } catch (IOException closeFailure) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("An exception occurred: " + closeFailure.getMessage());
            }
        }
    }
}