List of usage examples for the org.apache.poi.xwpf.usermodel.XWPFDocument constructor XWPFDocument(InputStream)
public XWPFDocument(InputStream is) throws IOException
From source file:org.sleuthkit.autopsy.modules.embeddedfileextractor.ImageExtractor.java
License:Open Source License
/** * Extract images from docx format files. * * @param af the file from which images are to be extracted. * * @return list of extracted images. Returns null in case no images were * extracted.// w w w . ja va 2s. c o m */ private List<ExtractedImage> extractImagesFromDocx(AbstractFile af) { List<ExtractedImage> listOfExtractedImages; XWPFDocument docx = null; try { docx = new XWPFDocument(new ReadContentInputStream(af)); } catch (Throwable ex) { // instantiating POI containers throw RuntimeExceptions logger.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.docxContainer.init.err", af.getName()), ex); //NON-NLS return null; } List<XWPFPictureData> listOfAllPictures = null; try { listOfAllPictures = docx.getAllPictures(); } catch (Exception ex) { // log internal Java and Apache errors as WARNING logger.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.processing.err", af.getName()), ex); //NON-NLS return null; } // if no images are extracted from the PPT, return null, else initialize // the output folder for image extraction. 
String outputFolderPath; if (listOfAllPictures.isEmpty()) { return null; } else { outputFolderPath = getOutputFolderPath(this.parentFileName); } if (outputFolderPath == null) { logger.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.extractImageFrom.outputPath.exception.msg", af.getName())); //NON-NLS return null; } listOfExtractedImages = new ArrayList<>(); byte[] data = null; for (XWPFPictureData xwpfPicture : listOfAllPictures) { String fileName = xwpfPicture.getFileName(); try { data = xwpfPicture.getData(); } catch (Exception ex) { // log internal Java and Apache errors as WARNING logger.log(Level.WARNING, NbBundle.getMessage(this.getClass(), "EmbeddedFileExtractorIngestModule.ImageExtractor.processing.err", af.getName()), ex); //NON-NLS return null; } writeExtractedImage(Paths.get(outputFolderPath, fileName).toString(), data); listOfExtractedImages.add( new ExtractedImage(fileName, getFileRelativePath(fileName), xwpfPicture.getData().length, af)); } return listOfExtractedImages; }
From source file:org.terrier.indexing.POIDocument.java
License:Mozilla Public License
protected POITextExtractor getExtractor(String filename, InputStream docStream) throws IOException { //Word .doc: if (filename.endsWith(".doc")) { return new WordExtractor(docStream); }//from w w w.j a v a 2s.c om //Word .docx: if (filename.endsWith(".docx")) { return new XWPFWordExtractor(new XWPFDocument(docStream)); } //Powertpoint .ppt: if (filename.endsWith(".ppt")) { return new PowerPointExtractor(docStream); } //Powertpoint .pptx: if (filename.endsWith(".pptx")) { return new XSLFPowerPointExtractor(new XMLSlideShow(docStream)); } //Publisher .pub: if (filename.endsWith(".pub")) { return new PublisherTextExtractor(docStream); } //Excel: .xls: if (filename.endsWith(".xls")) { return new ExcelExtractor(new POIFSFileSystem(docStream)); } //Excel: .xlsx: if (filename.endsWith(".xlsx")) { return new org.apache.poi.xssf.extractor.XSSFExcelExtractor(new XSSFWorkbook(docStream)); } //Visio: .vsd: if (filename.endsWith(".vsd")) { return new VisioTextExtractor(docStream); } return null; }
From source file:org.wandora.utils.MSOfficeBox.java
License:Open Source License
/**
 * Extracts the plain text of a .docx file.
 *
 * @param file the .docx file to read
 * @return the text produced by {@code XWPFWordExtractor}, or {@code null}
 *         if any exception occurs while opening or parsing the file (the
 *         stack trace is printed)
 */
public static String getDocxText(File file) {
    // try-with-resources so the file handle is released even when parsing fails.
    try (FileInputStream in = new FileInputStream(file)) {
        XWPFDocument docx = new XWPFDocument(in);
        XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
        return extractor.getText();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
From source file:org.wso2.carbon.apimgt.impl.indexing.indexer.DocumentIndexer.java
License:Open Source License
/** * Write document content to document artifact as its raw content * * @param registry//from ww w. j a v a 2 s . c o m * @param documentResource * @return * @throws RegistryException * @throws IOException * @throws APIManagementException */ private String fetchDocumentContent(Registry registry, Resource documentResource) throws RegistryException, IOException, APIManagementException { GenericArtifactManager docArtifactManager = APIUtil.getArtifactManager(registry, APIConstants.DOCUMENTATION_KEY); GenericArtifact documentArtifact = docArtifactManager.getGenericArtifact(documentResource.getUUID()); String sourceType = documentArtifact.getAttribute(APIConstants.DOC_SOURCE_TYPE); String contentString = null; if (Documentation.DocumentSourceType.FILE.name().equals(sourceType)) { Association fileAssociations[] = registry.getAssociations(documentResource.getPath(), APIConstants.DOCUMENTATION_FILE_ASSOCIATION); Association fileAssociation; if (fileAssociations.length < 1) { String error = "No document associated to API"; log.error(error); throw new APIManagementException(error); } //a file document can have one file association fileAssociation = fileAssociations[0]; String contentPath = fileAssociation.getDestinationPath(); if (!registry.resourceExists(contentPath)) { String error = "API not found at " + contentPath; log.error(error); throw new APIManagementException(error); } Resource contentResource = registry.get(contentPath); String fileName = ((ResourceImpl) contentResource).getName(); String extension = FilenameUtils.getExtension(fileName); InputStream inputStream = null; try { inputStream = contentResource.getContentStream(); switch (extension) { case APIConstants.PDF_EXTENSION: PDFParser pdfParser = new PDFParser(inputStream); pdfParser.parse(); COSDocument cosDocument = pdfParser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); contentString = stripper.getText(new PDDocument(cosDocument)); break; case APIConstants.DOC_EXTENSION: { POIFSFileSystem 
pfs = new POIFSFileSystem(inputStream); WordExtractor msWord2003Extractor = new WordExtractor(pfs); contentString = msWord2003Extractor.getText(); break; } case APIConstants.DOCX_EXTENSION: XWPFDocument doc = new XWPFDocument(inputStream); XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc); contentString = msWord2007Extractor.getText(); break; case APIConstants.XLS_EXTENSION: { POIFSFileSystem pfs = new POIFSFileSystem(inputStream); ExcelExtractor extractor = new ExcelExtractor(pfs); contentString = extractor.getText(); break; } case APIConstants.XLSX_EXTENSION: XSSFWorkbook xssfSheets = new XSSFWorkbook(inputStream); XSSFExcelExtractor xssfExcelExtractor = new XSSFExcelExtractor(xssfSheets); contentString = xssfExcelExtractor.getText(); break; case APIConstants.PPT_EXTENSION: { POIFSFileSystem fs = new POIFSFileSystem(inputStream); PowerPointExtractor extractor = new PowerPointExtractor(fs); contentString = extractor.getText(); break; } case APIConstants.PPTX_EXTENSION: XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream); XSLFPowerPointExtractor xslfPowerPointExtractor = new XSLFPowerPointExtractor(xmlSlideShow); contentString = xslfPowerPointExtractor.getText(); break; case APIConstants.TXT_EXTENSION: case APIConstants.WSDL_EXTENSION: case APIConstants.XML_DOC_EXTENSION: BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); String line; StringBuilder contentBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { contentBuilder.append(line); } contentString = contentBuilder.toString(); break; } } finally { IOUtils.closeQuietly(inputStream); } } else if (Documentation.DocumentSourceType.INLINE.name().equals(sourceType)) { Association contentAssociations[] = registry.getAssociations(documentResource.getPath(), APIConstants.DOCUMENTATION_CONTENT_ASSOCIATION); Association contentAssociation; //an inline document can have one or no content associations if (contentAssociations.length == 1) { 
contentAssociation = contentAssociations[0]; String contentPath = contentAssociation.getDestinationPath(); if (registry.resourceExists(contentPath)) { Resource contentResource = registry.get(contentPath); InputStream instream = null; BufferedReader reader = null; String line; try { instream = contentResource.getContentStream(); reader = new BufferedReader(new InputStreamReader(instream)); StringBuilder contentBuilder = new StringBuilder(); while ((line = reader.readLine()) != null) { contentBuilder.append(line); } contentString = contentBuilder.toString(); } finally { if (reader != null) { IOUtils.closeQuietly(reader); } } } } } return contentString; }
From source file:org.wso2.carbon.pc.core.DocumentIndexer.java
License:Open Source License
@Override public IndexDocument getIndexedDocument(AsyncIndexer.File2Index fileData) throws SolrException, RegistryException { try {//from www .jav a2 s . c om String wordText = null; try { //Extract MSWord 2003 document files POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data)); WordExtractor msWord2003Extractor = new WordExtractor(fs); wordText = msWord2003Extractor.getText(); } catch (OfficeXmlFileException e) { //if 2003 extraction failed, try with MSWord 2007 document files extractor XWPFDocument doc = new XWPFDocument(new ByteArrayInputStream(fileData.data)); XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc); wordText = msWord2007Extractor.getText(); } catch (Exception e) { //The reason for not throwing an exception is that since this is an indexer that runs in the background //throwing an exception might lead to adverse behaviors in the client side and might lead to //other files not being indexed String msg = "Failed to extract the document while indexing"; log.error(msg, e); } IndexDocument indexDoc = new IndexDocument(fileData.path, wordText, null); Map<String, List<String>> fields = new HashMap<String, List<String>>(); fields.put("path", Arrays.asList(fileData.path)); if (fileData.mediaType != null) { fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList(fileData.mediaType)); } else { fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList("application/pdf")); } indexDoc.setFields(fields); return indexDoc; } catch (IOException e) { String msg = "Failed to write to the index"; log.error(msg, e); throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, msg); } }
From source file:org.wurtele.ifttt.watchers.TrainingScheduleWatcher.java
License:Open Source License
/**
 * Parses a training schedule from a .docx file, stores the resulting entries
 * and pushes a notification to all registered devices.
 * <p>
 * The extracted text is split into lines. A line whose first tab-separated
 * field parses as a "MMM dd, yyyy" date starts a new event; a line starting
 * with a tab is treated as a continuation whose fields are appended to the
 * most recent event. Each event is then mapped onto a
 * {@code TrainingScheduleEntry} by field position.
 * <p>
 * Any exception marks the file as failed: it is logged and the path is added
 * to {@code FAILED}.
 *
 * @param path the Word file to process
 */
private void processWordFile(Path path) {
    try {
        // Close the file stream as soon as the text has been extracted.
        String text;
        try (java.io.InputStream in = Files.newInputStream(path)) {
            XWPFDocument doc = new XWPFDocument(in);
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
            text = extractor.getText();
        }

        List<List<String>> data = new ArrayList<>();
        DateFormat df1 = new SimpleDateFormat("MMM dd, yyyy");
        DateFormat df2 = new SimpleDateFormat("MMM dd, yyyy HH:mm");
        for (String line : text.split("\n")) {
            try {
                df1.parse(line.split("\t")[0]);
                List<String> list = new ArrayList<>();
                list.addAll(Arrays.asList(line.split("\t")));
                data.add(list);
            } catch (ParseException ignored) {
                // Not a date-led line, so it does not start a new event.
            }
            if (line.startsWith("\t")) {
                // Continuation line: its fields belong to the latest event.
                data.get(data.size() - 1).addAll(Arrays.asList(line.substring(1).split("\t")));
            }
        }

        List<TrainingScheduleEntry> entries = new ArrayList<>();
        for (List<String> event : data) {
            TrainingScheduleEntry entry = new TrainingScheduleEntry();
            entry.setStart(df2.parse(event.get(0) + " " + event.get(1)));
            entry.setEnd(df2.parse(event.get(0) + " " + event.get(2)));
            entry.setGroup(event.get(4));
            entry.setTitle(event.get(5));
            // Field 6 loses its first six characters when it is longer than that.
            entry.setNotes(event.get(6).length() > 6 ? event.get(6).substring(6) : event.get(6));
            if (event.size() > 13) {
                // Fields beyond the 13 expected ones are appended to the notes.
                for (int i = 7; i < 7 + event.size() - 13; i++) {
                    entry.setNotes(entry.getNotes() + " " + event.get(i));
                }
            }
            // Trailing fields are addressed from the end of the event.
            entry.setInstructor(event.get(event.size() - 6).trim());
            entry.setUniform(event.get(event.size() - 5));
            entry.setLocation(event.get(event.size() - 2));
            entries.add(entry);
        }

        if (!entries.isEmpty()) {
            Collections.sort(entries);
            try (OutputStream os = Files.newOutputStream(processedPath(path));
                    ObjectOutputStream oos = new ObjectOutputStream(os)) {
                oos.writeObject(entries);
            }
            logger.info("Processed " + path);

            Date start = DateUtils.truncate(entries.get(0).getStart(), Calendar.DATE);
            Date end = DateUtils.truncate(entries.get(entries.size() - 1).getEnd(), Calendar.DATE);
            DateFormat df = new SimpleDateFormat("MMM d, yyyy");
            String payload = APNS.newPayload().category("scheduleCategory")
                    .alertTitle("Training Schedule Received")
                    .alertBody(entries.size() + " events found for "
                            + (start.before(end) ? df.format(start) + " - " + df.format(end)
                                    : df.format(start)))
                    .sound("default")
                    .customField("schedule",
                            path.getParent().getFileName().toString() + "/"
                                    + FilenameUtils.getBaseName(path.getFileName().toString()))
                    .build();
            PushDevices.getDevices().stream().forEach((device) -> {
                PushUtils.getService().push(device, payload);
            });
        }
    } catch (Exception e) {
        logger.error("Failed to process training schedule file: " + path, e);
        FAILED.add(path);
    }
}
From source file:pe.gob.onpe.rae.controller.registro.registroController.java
@RequestMapping(value = "generateFVDoc/{codExpediente}", method = RequestMethod.GET) public void generateFVDoc(HttpServletRequest request, @PathVariable("codExpediente") int codExpediente, HttpServletResponse response) {/*from w w w .ja va 2 s . c o m*/ try { ServletContext sc = request.getSession().getServletContext(); Expediente expediente = new Expediente(codExpediente); expediente = expedienteDAO.find(expediente); Ambito amb = new Ambito(expediente.getAmbito().getId()); amb = ambitoDAO.find(amb); int totalElectoresRemitidos = expedientePadronDAO.getCountByExpediente(expediente); int totalElectoresIncorporados = expedientePadronDAO.getCountByExpedienteAndEstado(expediente, Parametros.ESTADO_ELECTOR_ACTIVO); JsonParser jsonParser = new JsonParser(); JsonObject jsonObject = (JsonObject) jsonParser.parse(amb.getInformacion()); String nombre = jsonObject.get("nombres").toString() + " " + jsonObject.get("apellidoPaterno").toString() + " " + jsonObject.get("apellidoMaterno").toString(); InputStream is = registroController.class.getResourceAsStream("/ejemplo.docx"); XWPFDocument document = new XWPFDocument(is); XWPFHeaderFooterPolicy policy = document.getHeaderFooterPolicy(); if (policy == null) { CTSectPr sectPr = document.getDocument().getBody().addNewSectPr(); policy = new XWPFHeaderFooterPolicy(document, sectPr); } if (policy.getDefaultHeader() == null && policy.getFirstPageHeader() == null && policy.getDefaultFooter() == null) { XWPFFooter footerD = policy.getFooter(1);// createFooter(policy.DEFAULT); XWPFRun run = footerD.getParagraphs().get(0).createRun(); run.setText("usuario"); XWPFParagraph paragraph = footerD.createParagraph(); paragraph.setAlignment(ParagraphAlignment.DISTRIBUTE); run = paragraph.createRun(); run.setFontFamily("Arial"); run.setFontSize(8); run.setText( "Jr.Washington N 1894, Cercado de Lima. 
Central Telefonica: 417-0630 www.onpe.gob.pe informes@onpe.gob.pe"); } XWPFParagraph paragraph = document.createParagraph(); XWPFRun run = paragraph.createRun(); run.setFontSize(11); run.setFontFamily("Arial"); run.setText("Lima,"); run.addBreak(); paragraph = document.createParagraph(); run = paragraph.createRun(); run.setFontSize(11); run.setFontFamily("Arial"); run.setBold(true); run.setText("OFICIO N -2016-GPP/ONPE"); run.setUnderline(UnderlinePatterns.SINGLE); run.addBreak(); paragraph = document.createParagraph(); run = paragraph.createRun(); run.setFontSize(11); run.setFontFamily("Arial"); run.setText("Seor"); XWPFRun run1 = paragraph.createRun(); run1.setFontSize(11); run1.setFontFamily("Arial"); run1.setText(nombre.replace("\"", "")); run1.setBold(true); run1.addBreak(); XWPFRun run2 = paragraph.createRun(); run2.setFontSize(11); run2.setFontFamily("Arial"); run2.setText(jsonObject.get("cargo").toString().replace("\"", "")); run2.addBreak(); run2.setText("Centro Poblado " + amb.getNombreAmbito()); run2.addBreak(); run2.setText("Av. 
28 de Julio S/N Centro Cvico Huacrachuco - Municipalidad Provincial de " + amb.getProvincia()); run2.addBreak(); run2.setText(amb.getDepartamento() + " - " + amb.getProvincia() + " - " + amb.getDistrito()); run2.addBreak(); run2 = paragraph.createRun(); run2.setFontSize(11); run2.setFontFamily("Arial"); run2.setUnderline(UnderlinePatterns.WORDS); run2.setText("Presente"); run2 = paragraph.createRun(); run2.setFontSize(11); run2.setFontFamily("Arial"); run2.setText(".-"); paragraph = document.createParagraph(); run.addBreak(); run = paragraph.createRun(); run.setFontSize(11); run.setFontFamily("Arial"); run.addBreak(); run.setText("Asunto"); run.addTab(); run.addTab(); run.setText(": SOLICITUD DE CREACIN DE MESA DE SUFRAGIO."); run.addBreak(); paragraph = document.createParagraph(); run = paragraph.createRun(); run.setFontSize(11); run.setFontFamily("Arial"); run.setText("Referencia"); run.addTab(); run.setText(": OFICIO N 087-2016/M-CP.CHOCOBAMBA (16AGO2016) - Exp. " + expediente.getExpediente()); run.addBreak(); paragraph = document.createParagraph(); paragraph.setAlignment(ParagraphAlignment.THAI_DISTRIBUTE); run = paragraph.createRun(); run.setFontSize(11); run.setFontFamily("Arial"); run.setText( "Me dirijo a usted con relacin al documento de la referencia con la finalidad de hacer de su " + "conocimiento que se ha cumplido con todos los requisitos que dan inicio al trmite de " + "instalacin de mesas de sufragio en el Centro Poblado " + amb.getNombreAmbito() + ", distrito " + amb.getDistrito() + ", " + "provincia " + amb.getProvincia() + ", departamento " + amb.getDepartamento() + "."); paragraph = document.createParagraph(); paragraph.setAlignment(ParagraphAlignment.THAI_DISTRIBUTE); run = paragraph.createRun(); run.setFontSize(11); run.setFontFamily("Arial"); run.addBreak(); run.setText("Al respecto, el mencionado expediente contiene un listado de electores que solicitan ser " + "parte de la mesa de sufragio de la localidad " + amb.getNombreAmbito() + ", el 
cual, luego de la validacin " + "realizada, se informa que podrn ser incorporados " + totalElectoresIncorporados + " electores del total de " + totalElectoresRemitidos + " registros " + "de electores remitidos. Se adjunta un cuadro resumen con las observaciones mencionadas."); paragraph = document.createParagraph(); paragraph.setAlignment(ParagraphAlignment.THAI_DISTRIBUTE); run = paragraph.createRun(); run.setFontSize(11); run.setFontFamily("Arial"); run.addBreak(); run.setText( "Asimismo, se programar un viaje para la verificacin de rutas, tiempos y servicios de la " + "localidad, la cual se coordinar previamente con las autoridades del centro poblado a fin de " + "programarla adecuadamente; luego de lo cual se emitir un informe de respuesta al " + "resultado de la solicitud, que de ser positivo, conllevara a la instalacin de mesas de sufragio " + "en el centro poblado en mencin, con miras a las "); run = paragraph.createRun(); run.setFontSize(11); run.setFontFamily("Arial"); run.setBold(true); run.setText("Elecciones Regionales y Municipales de 2018."); paragraph = document.createParagraph(); paragraph.setAlignment(ParagraphAlignment.THAI_DISTRIBUTE); run = paragraph.createRun(); run.setFontSize(11); run.setFontFamily("Arial"); run.addBreak(); run.setText("Finalmente, de requerir mayor informacin, agradeceremos se comunique con nosotros al " + "telefono 417-0630 anexo 8484 o al 8481."); paragraph = document.createParagraph(); paragraph.setAlignment(ParagraphAlignment.THAI_DISTRIBUTE); run = paragraph.createRun(); run.setFontSize(11); run.setFontFamily("Arial"); run.addBreak(); run.setText("Sin otro particular."); paragraph = document.createParagraph(); paragraph.setAlignment(ParagraphAlignment.THAI_DISTRIBUTE); run = paragraph.createRun(); run.setFontSize(11); run.setFontFamily("Arial"); run.addBreak(); run.addBreak(); run.setText("Atentamente,"); response.setContentType("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); 
document.write(response.getOutputStream()); } catch (Exception ex) { Logger.getLogger(registroController.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:persistentie.PixelMapper.java
/**
 * Reads a .docx file and stores its words in {@code letterLijst}.
 * <p>
 * Every paragraph is split on single spaces; each resulting word is added
 * followed by a " " element, and a "\n" element is added after each
 * paragraph. If reading fails, the stack trace is printed and whatever was
 * collected so far is returned.
 *
 * @param bestandsNaam path of the .docx file to read
 * @return the (re-initialised) {@code letterLijst} field
 */
public List<String> leesDocFile(String bestandsNaam) {
    letterLijst = new ArrayList<>();
    // try-with-resources so the file handle is released even when parsing fails.
    try (java.io.InputStream in = Files.newInputStream(Paths.get(bestandsNaam))) {
        XWPFDocument document = new XWPFDocument(in);
        for (XWPFParagraph par : document.getParagraphs()) {
            // Split each paragraph on spaces and put every word in the letter list.
            for (String woord : par.getParagraphText().split(" ")) {
                letterLijst.add(woord);
                letterLijst.add(" ");
            }
            letterLijst.add("\n");
        }
    } catch (Exception exep) {
        exep.printStackTrace();
    }
    return letterLijst;
}
From source file:poi.xssf.usermodel.examples.EmbeddedObjects.java
License:Apache License
public static void main(String[] args) throws Exception { OPCPackage pkg = OPCPackage.open(args[0]); XSSFWorkbook workbook = new XSSFWorkbook(pkg); for (PackagePart pPart : workbook.getAllEmbedds()) { String contentType = pPart.getContentType(); // Excel Workbook - either binary or OpenXML if (contentType.equals("application/vnd.ms-excel")) { HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(pPart.getInputStream()); }/*w w w . j a v a 2s. c om*/ // Excel Workbook - OpenXML file format else if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) { XSSFWorkbook embeddedWorkbook = new XSSFWorkbook(pPart.getInputStream()); } // Word Document - binary (OLE2CDF) file format else if (contentType.equals("application/msword")) { HWPFDocument document = new HWPFDocument(pPart.getInputStream()); } // Word Document - OpenXML file format else if (contentType .equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) { XWPFDocument document = new XWPFDocument(pPart.getInputStream()); } // PowerPoint Document - binary file format else if (contentType.equals("application/vnd.ms-powerpoint")) { HSLFSlideShow slideShow = new HSLFSlideShow(pPart.getInputStream()); } // PowerPoint Document - OpenXML file format else if (contentType .equals("application/vnd.openxmlformats-officedocument.presentationml.presentation")) { OPCPackage docPackage = OPCPackage.open(pPart.getInputStream()); XSLFSlideShow slideShow = new XSLFSlideShow(docPackage); } // Any other type of embedded object. else { System.out.println("Unknown Embedded Document: " + contentType); InputStream inputStream = pPart.getInputStream(); } } pkg.close(); }
From source file:ro.dabuno.office.integration.MailMerge.java
private void merge(File wordTemplate, File dataFile, String outputFile) throws Exception { log.info("Merging data from " + wordTemplate + " and " + dataFile + " into " + outputFile); // read the data-rows from the CSV or XLS(X) file Data data = new Data(); data.read(dataFile);//ww w . j a v a 2s . c o m // now open the word file and apply the changes try (InputStream is = new FileInputStream(wordTemplate)) { try (XWPFDocument doc = new XWPFDocument(is)) { // apply the lines and concatenate the results into the document applyLines(data, doc); log.info("Writing overall result to " + outputFile); try (OutputStream out = new FileOutputStream(outputFile)) { doc.write(out); } } } }