Example usage for the org.apache.poi.hwpf.HWPFDocument constructor

List of usage examples for the org.apache.poi.hwpf.HWPFDocument constructor

Introduction

On this page you can find example usages of the org.apache.poi.hwpf.HWPFDocument constructor.

Prototype

public HWPFDocument(DirectoryNode directory) throws IOException 

Source Link

Document

This constructor loads a Word document from a specific point in a POIFSFileSystem, probably not the default.

Usage

From source file:javaapplication1.HWPFTest.java

/**
 * Loads the hard-coded Word template, substitutes a fixed set of placeholder
 * tokens through {@code replaceText} and hands the document to {@code saveWord}.
 *
 * <p>I/O failures are not propagated to the caller; their stack trace is printed.
 */
public static void doStuff() {
    String filePath = "D:\\insiders_report4.doc";
    // try-with-resources guarantees the template stream is closed, including
    // when loading the document or one of the replacements fails.
    try (FileInputStream templateStream = new FileInputStream(filePath)) {
        POIFSFileSystem fs = new POIFSFileSystem(templateStream);
        HWPFDocument doc = new HWPFDocument(fs);
        doc = replaceText(doc, "#FIO#", " ? ?");
        doc = replaceText(doc, "#BIN#", "900524300077");
        doc = replaceText(doc, "#INCDAY#", "05");
        doc = replaceText(doc, "#INCMONTH#", "");
        doc = replaceText(doc, "#INCYEAR#", "2016");
        doc = replaceText(doc, "#EXCDAY#", "05");
        doc = replaceText(doc, "#EXCMONTH#", "");
        doc = replaceText(doc, "#EXCYEAR#", "2016");
        doc = replaceText(doc, "#MAINCHIEF#", "  ?");
        saveWord("D:\\result.doc", doc);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:javaapplication1.utils.MyWordToHtml.java

/**
 * Converts the Word document at {@code path + file} to HTML.
 *
 * <p>Every picture of the document is written to {@code path + "pics/"} under the
 * file name suggested by POI, and the serialized HTML is passed to
 * {@code writeFile} together with the target {@code path + "result.html"}.
 *
 * @param path directory prefix; it is concatenated directly with {@code file},
 *             so it must already end with a path separator
 * @param file name of the Word document inside {@code path}
 * @throws Throwable any failure while reading, converting or serializing
 */
public static void convert(String path, String file) throws Throwable {
    Document htmlDocument;
    // The source stream is closed as soon as conversion and picture export are done.
    try (InputStream input = new FileInputStream(path + file)) {
        HWPFDocument wordDocument = new HWPFDocument(input);
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        MyPictureManager pictureManager = new MyPictureManager();
        wordToHtmlConverter.setPicturesManager(pictureManager);

        wordToHtmlConverter.processDocument(wordDocument);
        List<?> pics = wordDocument.getPicturesTable().getAllPictures();
        // NOTE(review): this creates a fixed directory, while the pictures below are
        // written to path + "pics/" -- the two only coincide for one particular
        // value of path; confirm which location is intended.
        File dir = new File("D:\\pics");
        dir.mkdir();
        if (pics != null) {
            for (Object entry : pics) {
                Picture pic = (Picture) entry;
                // Each picture stream is closed right after it has been written.
                try (FileOutputStream pictureOut = new FileOutputStream(
                        path + "pics/" + pic.suggestFullFileName())) {
                    pic.writeImageContent(pictureOut);
                } catch (FileNotFoundException e) {
                    // A picture file that cannot be created is reported and skipped.
                    e.printStackTrace();
                }
            }
        }
        htmlDocument = wordToHtmlConverter.getDocument();
    }

    ByteArrayOutputStream outStream = new ByteArrayOutputStream();
    DOMSource domSource = new DOMSource(htmlDocument);
    StreamResult streamResult = new StreamResult(outStream);

    TransformerFactory tf = TransformerFactory.newInstance();
    Transformer serializer = tf.newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "html");
    serializer.transform(domSource, streamResult);
    outStream.close();

    // Decode with the charset the serializer was told to write; the platform
    // default charset would corrupt non-ASCII text on non-UTF-8 systems.
    String content = outStream.toString("UTF-8");

    writeFile(content, path + "result.html", "UTF-8");
}

From source file:mc.program.Importer.java

/**
 * Reads the .doc file referenced by {@code sourceFile} and appends every
 * whitespace-separated token of its paragraph text to {@code sourceText}.
 *
 * <p>Any exception is printed to standard output instead of being propagated.
 */
public void importDOC() {
    // try-with-resources closes the file stream on every path, not only on success.
    try (FileInputStream fis = new FileInputStream(sourceFile.getAbsolutePath())) {
        HWPFDocument document = new HWPFDocument(fis);
        WordExtractor extractor = new WordExtractor(document);
        try {
            // Extract the text paragraph by paragraph and split it into tokens
            for (String paragraph : extractor.getParagraphText()) {
                try (Scanner scanner = new Scanner(paragraph)) {
                    while (scanner.hasNext()) {
                        sourceText.add(scanner.next());
                    }
                }
            }
        } finally {
            // Release the extractor even if tokenizing or collecting fails.
            extractor.close();
        }
    } catch (Exception ex) {
        System.out.print(ex);
    }
}

From source file:me.philnate.textmanager.utils.WordCount.java

License:Open Source License

/**
 * opens the given file, if it's a .doc or .docx file and returns the number
 * of words within the document/*  w w w  . jav  a2 s  . c o  m*/
 * 
 * @param file
 * @return
 * @throws FileNotFoundException
 * @throws IOException
 */
public static long countFile(File file) throws FileNotFoundException, IOException {
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        if (file.getName().endsWith(".docx")) {
            XWPFDocument document = new XWPFDocument(fis);
            XWPFWordExtractor extractor = new XWPFWordExtractor(document);
            return linecount(extractor.getText());
        } else if (file.getName().endsWith(".doc")) {
            HWPFDocument document = new HWPFDocument(fis);
            WordExtractor extractor = new WordExtractor(document);
            return WordCount.linecount(extractor.getText());
        } else {
            throw new IllegalArgumentException("Can't handle non doc(X) files");
        }
    }
}

From source file:mj.ocraptor.extraction.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Writes the content of the Word document stored under {@code root} to {@code xhtml}.
 *
 * <p>Output order: image text, headers, main paragraphs, text boxes, footnotes,
 * comments, endnotes, footers, pictures not claimed so far, and finally the
 * directory entries of the "ObjectPool" entry. If the document is rejected with
 * {@link OldWordFileFormatException}, the work is delegated to {@code parseWord6}.
 *
 * @param root  directory node the document is loaded from
 * @param xhtml handler receiving the generated elements
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    HWPFDocument doc;
    try {
        doc = new HWPFDocument(root);
    } catch (OldWordFileFormatException oldFormat) {
        // Old format: let parseWord6 handle the whole document instead
        parseWord6(root, xhtml);
        return;
    }

    org.apache.poi.hwpf.extractor.WordExtractor textExtractor =
            new org.apache.poi.hwpf.extractor.WordExtractor(doc);

    // mj
    extractImageText(xhtml, doc);

    HeaderStories stories = new HeaderStories(doc);

    // Pictures are expected to be in order; they may be placed directly
    // or referenced from an anchor
    PicturesTable pictureTable = doc.getPicturesTable();
    PicturesSource pictureSource = new PicturesSource(doc);

    // Headers first (first / even / odd)
    Range[] headerRanges = { stories.getFirstHeaderSubrange(), stories.getEvenHeaderSubrange(),
            stories.getOddHeaderSubrange() };
    handleHeaderFooter(headerRanges, "header", doc, pictureSource, pictureTable, xhtml);

    // Main paragraph text; handleParagraph's return value is added to the index
    // on top of the regular step of one
    Range body = doc.getRange();
    int index = 0;
    while (index < body.numParagraphs()) {
        Paragraph paragraph = body.getParagraph(index);
        index += handleParagraph(paragraph, 0, body, doc, FieldsDocumentPart.MAIN, pictureSource,
                pictureTable, xhtml);
        index++;
    }

    // Remaining text parts, each entry as its own <p>
    for (String text : textExtractor.getMainTextboxText()) {
        xhtml.element("p", text);
    }
    for (String text : textExtractor.getFootnoteText()) {
        xhtml.element("p", text);
    }
    for (String text : textExtractor.getCommentsText()) {
        xhtml.element("p", text);
    }
    for (String text : textExtractor.getEndnoteText()) {
        xhtml.element("p", text);
    }

    // Footers (first / even / odd)
    Range[] footerRanges = { stories.getFirstFooterSubrange(), stories.getEvenFooterSubrange(),
            stories.getOddFooterSubrange() };
    handleHeaderFooter(footerRanges, "footer", doc, pictureSource, pictureTable, xhtml);

    // Pictures that have not been output yet
    Picture unclaimed;
    while ((unclaimed = pictureSource.nextUnclaimed()) != null) {
        handlePictureCharacterRun(null, unclaimed, pictureSource, xhtml);
    }

    // Embedded office documents stored below "ObjectPool"
    try {
        DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry child : objectPool) {
            if (child.getName().startsWith("_") && child instanceof DirectoryEntry) {
                handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
            }
        }
    } catch (FileNotFoundException ignored) {
        // Deliberately ignored: nothing is emitted when the lookup fails this way.
    }
}

From source file:Modelo.EscribirWord.java

/**
 * Fills the inventory sheet template with the values of {@code obj} and hands
 * the resulting document to {@code saveWord}.
 *
 * <p>Each {@code $placeholder} is substituted through {@code replaceText} with
 * the matching getter value of {@code obj}. I/O failures raised inside the
 * method are caught and their stack trace is printed.
 *
 * @param obj source of the values written into the template
 * @throws IOException declared for callers; failures inside the body are caught
 */
public void crearWord(Objeto obj) throws IOException {
    String filePath = "HojaInventarioTemplate.doc";
    String filePathFinal = "HojaInventarioObjeto.doc";

    // try-with-resources closes the template stream even if loading or a
    // replacement fails; previously the stream was never closed.
    try (FileInputStream templateStream = new FileInputStream(filePath)) {
        POIFSFileSystem fs = new POIFSFileSystem(templateStream);
        HWPFDocument doc = new HWPFDocument(fs);

        doc = replaceText(doc, "$nombreObjeto", obj.getNombreObjeto());
        doc = replaceText(doc, "$formaAdquisicion", obj.getFormaAdquisicion());
        doc = replaceText(doc, "$fechaIngreso", obj.getFechaIngreso());
        doc = replaceText(doc, "$numRegistro", obj.getNumRegistro());
        doc = replaceText(doc, "$valorEconomico", obj.getValorEconomico());
        doc = replaceText(doc, "$nombreFuente", obj.getNombreFuente());
        doc = replaceText(doc, "$fechaInventario", obj.getFechaInventario());

        doc = replaceText(doc, "$numCatalogo", obj.getNumCatalogo());

        doc = replaceText(doc, "$numInventario", obj.getNumInventario());
        doc = replaceText(doc, "$otrosNumeros", obj.getOtrosNumeros());
        doc = replaceText(doc, "$direccionFuente", obj.getDireccionFuente());
        doc = replaceText(doc, "$fechaCatalogo", obj.getFechaCatalogo());
        doc = replaceText(doc, "$espesor", obj.getEspesor());
        doc = replaceText(doc, "$alto", obj.getAlto());
        doc = replaceText(doc, "$ancho", obj.getAncho());
        doc = replaceText(doc, "$largo", obj.getLargo());
        doc = replaceText(doc, "$diametro", obj.getDiametro());
        doc = replaceText(doc, "$peso", obj.getPeso());
        doc = replaceText(doc, "$procedencia", obj.getProcedencia());
        doc = replaceText(doc, "$materiaYTecnica", obj.getMateriaYTecnica());
        doc = replaceText(doc, "$numeroNegativo", obj.getNumeroNegativo());
        doc = replaceText(doc, "$autor", obj.getAutor());
        doc = replaceText(doc, "$epoca", obj.getEpoca());
        doc = replaceText(doc, "$descripcion", obj.getDescripcion());
        doc = replaceText(doc, "$documentacion", obj.getDocumentacion());
        doc = replaceText(doc, "$observaciones", obj.getObservaciones());
        doc = replaceText(doc, "$recibio", obj.getRecibio());
        doc = replaceText(doc, "$inventario", obj.getInventario());
        doc = replaceText(doc, "$catalogo", obj.getCatalogo());
        doc = replaceText(doc, "$aprobo", obj.getAprobo());

        saveWord(filePathFinal, doc);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:no.trank.openpipe.parse.ms.WordParser.java

License:Apache License

/**
 * Parses a Word document from {@code data} into a {@link ParserResult}.
 *
 * <p>The result carries the title from the document's summary information (when
 * the document has one), the text cleaned by {@code POIUtils.getCleanText}, and,
 * if {@code data.includeProperties()} is true, the properties returned by
 * {@code POIUtils.getProperties}.
 *
 * @param data supplies the input stream and the include-properties flag
 * @return the populated parser result
 * @throws IOException if the document cannot be read
 */
@Override
public ParserResult parse(ParseData data) throws IOException, ParserException {
    final HWPFDocument doc = new HWPFDocument(data.getInputStream());
    final ParserResultImpl result = new ParserResultImpl();
    // Documents without a summary information stream yield null here; leave the
    // title unset in that case instead of failing with a NullPointerException.
    if (doc.getSummaryInformation() != null) {
        result.setTitle(doc.getSummaryInformation().getTitle());
    }
    final WordExtractor extractor = new WordExtractor(doc);
    result.setText(POIUtils.getCleanText(extractor.getText()));
    if (data.includeProperties()) {
        result.setProperties(POIUtils.getProperties(doc));
    }
    return result;
}

From source file:orcamentotraducao.OrcamentoTraducao.java

/**
 * Asks for a file name on standard input, extracts the text of the named
 * .doc/.docx file and prints its word, character and line counts together with
 * the estimate returned by {@code calculate}.
 *
 * <p>The format is chosen from the last three characters of the name
 * ({@code "ocx"} or {@code "doc"}); anything else is reported as unsupported and
 * the program exits. Lines are only counted for the {@code "ocx"} branch, one
 * per paragraph.
 *
 * @param args the command line arguments (unused)
 */
public static void main(String[] args) {
    Scanner scan = new Scanner(System.in);
    System.out.println("Informe o nome do arquivo:");
    String filename = scan.nextLine();
    // Names shorter than three characters cannot carry a supported suffix; an
    // unconditional substring would throw StringIndexOutOfBoundsException.
    String typeFile = filename.length() >= 3 ? filename.substring(filename.length() - 3) : "";
    // Plain equality: the suffixes are literals, no regular expression is needed.
    boolean isDocx = typeFile.equals("ocx");
    boolean isDoc = typeFile.equals("doc");
    if (!isDocx && !isDoc) {
        System.out.println("Este formato de arquivo no  suportado\n");
        System.exit(0);
    }

    File file = new File(filename);
    // try-with-resources closes the stream for both formats and on failures.
    try (FileInputStream fis = new FileInputStream(file.getAbsolutePath())) {
        String allText = "";
        int lines = 0;

        if (isDocx) {
            XWPFDocument document = new XWPFDocument(fis);

            List<XWPFParagraph> paragraphs = document.getParagraphs();

            // StringBuilder avoids re-copying the accumulated text per paragraph.
            StringBuilder collected = new StringBuilder();
            for (XWPFParagraph para : paragraphs) {
                collected.append(para.getText()).append(" ");
                lines++;
            }
            allText = collected.toString();
        } else if (isDoc) {
            WordExtractor extractor = new WordExtractor(new HWPFDocument(fis));
            allText = extractor.getText();
        }

        String[] allTextExploded = allText.split(" ");
        int words = allTextExploded.length;
        int characters = allText.length();

        System.out.println("H " + words + " palavras");
        System.out.println("H " + characters + " caracteres");
        System.out.println("H " + lines + " linhas");
        System.out.println("O oramento estimado  de R$" + calculate(characters, words, lines));

    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:org.apache.tika.parser.microsoft.WordExtractor.java

License:Apache License

/**
 * Writes the content of the Word document stored under {@code root} to {@code xhtml}.
 *
 * <p>Output order: headers, main paragraphs, text boxes, footnotes, comments,
 * endnotes, footers, pictures not claimed so far, and finally the directory
 * entries of the "ObjectPool" entry. If the document is rejected with
 * {@link OldWordFileFormatException}, the work is delegated to {@code parseWord6}.
 *
 * @param root  directory node the document is loaded from
 * @param xhtml handler receiving the generated elements
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    HWPFDocument doc;
    try {
        doc = new HWPFDocument(root);
    } catch (OldWordFileFormatException oldFormat) {
        // Old format: let parseWord6 handle the whole document instead
        parseWord6(root, xhtml);
        return;
    }
    org.apache.poi.hwpf.extractor.WordExtractor textExtractor =
            new org.apache.poi.hwpf.extractor.WordExtractor(doc);
    HeaderStories stories = new HeaderStories(doc);

    // Pictures are expected to be in order; they may be placed directly
    // or referenced from an anchor
    PicturesTable pictureTable = doc.getPicturesTable();
    PicturesSource pictureSource = new PicturesSource(doc);

    // Headers first (first / even / odd)
    Range[] headerRanges = { stories.getFirstHeaderSubrange(), stories.getEvenHeaderSubrange(),
            stories.getOddHeaderSubrange() };
    handleHeaderFooter(headerRanges, "header", doc, pictureSource, pictureTable, xhtml);

    // Main paragraph text; handleParagraph's return value is added to the index
    // on top of the regular step of one
    Range body = doc.getRange();
    ListManager lists = new ListManager(doc);
    int index = 0;
    while (index < body.numParagraphs()) {
        Paragraph paragraph = body.getParagraph(index);
        index += handleParagraph(paragraph, 0, body, doc, FieldsDocumentPart.MAIN, pictureSource,
                pictureTable, lists, xhtml);
        index++;
    }

    // Remaining text parts, each entry as its own <p>
    for (String text : textExtractor.getMainTextboxText()) {
        xhtml.element("p", text);
    }
    for (String text : textExtractor.getFootnoteText()) {
        xhtml.element("p", text);
    }
    for (String text : textExtractor.getCommentsText()) {
        xhtml.element("p", text);
    }
    for (String text : textExtractor.getEndnoteText()) {
        xhtml.element("p", text);
    }

    // Footers (first / even / odd)
    Range[] footerRanges = { stories.getFirstFooterSubrange(), stories.getEvenFooterSubrange(),
            stories.getOddFooterSubrange() };
    handleHeaderFooter(footerRanges, "footer", doc, pictureSource, pictureTable, xhtml);

    // Pictures that have not been output yet
    Picture unclaimed;
    while ((unclaimed = pictureSource.nextUnclaimed()) != null) {
        handlePictureCharacterRun(null, unclaimed, pictureSource, xhtml);
    }

    // Embedded office documents stored below "ObjectPool"
    try {
        DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry child : objectPool) {
            if (child.getName().startsWith("_") && child instanceof DirectoryEntry) {
                handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
            }
        }
    } catch (FileNotFoundException ignored) {
        // Deliberately ignored: nothing is emitted when the lookup fails this way.
    }
}

From source file:org.docx4j.convert.in.Doc.java

License:Apache License

/**
 * Loads a binary Word document from the given stream and converts it into a
 * freshly created {@code WordprocessingMLPackage}.
 *
 * <p>The stream is read by the {@code HWPFDocument} constructor; this method
 * does not close it. The actual conversion is delegated to the
 * {@code convert(doc, out)} overload.
 *
 * @param in
 *            doc file
 * @return new WordprocessingMLPackage containing the results of the
 *         conversion
 * @throws Exception
 *             if loading the document, creating the package or the
 *             delegated conversion fails
 */
public static WordprocessingMLPackage convert(InputStream in) throws Exception {
    HWPFDocument source = new HWPFDocument(in);
    WordprocessingMLPackage target = WordprocessingMLPackage.createPackage();
    convert(source, target);
    return target;
}