List of usage examples for the org.apache.poi.xwpf.usermodel.XWPFDocument constructor XWPFDocument(InputStream)
public XWPFDocument(InputStream is) throws IOException
From source file:offishell.word.Word.java
License:MIT License
/**
 * Loads the Word document stored at {@code path} and records whether its body text direction
 * is {@code TB_RL}.
 *
 * @param path location of the document to load; a file must exist there.
 * @throws Error if nothing exists at {@code path}.
 */
protected Word(Path path) {
    if (Files.notExists(path)) {
        throw new Error(" " + path.toAbsolutePath() + " ?????");
    }

    this.path = path;

    // The document constructor reads the stream but is not responsible for closing it,
    // so the stream is released here as soon as the document has been built.
    try (java.io.InputStream input = Files.newInputStream(path)) {
        this.calculated = new XWPFDocument(input);
    } catch (IOException e) {
        throw I.quiet(e);
    }

    // A body without section properties yields null here; treat that as "not vertical"
    // instead of failing with a NullPointerException.
    org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr section =
            calculated.getDocument().getBody().getSectPr();
    CTTextDirection direction = section == null ? null : section.getTextDirection();
    this.textIsVerticalAlign = direction != null && direction.getVal() == STTextDirection.TB_RL;
}
From source file:orcamentotraducao.OrcamentoTraducao.java
/** * @param args the command line arguments *///from w w w .jav a 2s.c o m public static void main(String[] args) { // TODO code application logic here Scanner scan = new Scanner(System.in); System.out.println("Informe o nome do arquivo:"); String filename = scan.nextLine(); String typeFile = filename.substring(filename.length() - 3, filename.length()); if (!typeFile.matches("ocx") && !typeFile.matches("doc")) { System.out.println("Este formato de arquivo no suportado\n"); System.exit(0); } try { File file = new File(filename); FileInputStream fis = new FileInputStream(file.getAbsolutePath()); String allText = ""; int lines = 0; if (typeFile.matches("ocx")) { XWPFDocument document = new XWPFDocument(fis); List<XWPFParagraph> paragraphs = document.getParagraphs(); for (XWPFParagraph para : paragraphs) { allText += para.getText() + " "; lines++; } fis.close(); } else if (typeFile.matches("doc")) { WordExtractor extractor = new WordExtractor(new HWPFDocument(fis)); allText = extractor.getText(); } String allTextExploded[] = allText.split(" "); int words = allTextExploded.length; int characters = allText.length(); System.out.println("H " + words + " palavras"); System.out.println("H " + characters + " caracteres"); System.out.println("H " + lines + " linhas"); System.out.println("O oramento estimado de R$" + calculate(characters, words, lines)); } catch (Exception e) { e.printStackTrace(); } }
From source file:org.articleEditor.insertContent.POIDocxReader.java
License:Apache License
/**
 * Parses the given stream as a .docx document and walks its body elements.
 *
 * @param in     stream holding the document content.
 * @param offset value stored in {@code currentOffset} after the body has been walked.
 * @throws IOException          if the stream cannot be read as a document.
 * @throws BadLocationException declared for the body walk.
 */
public void read(InputStream in, int offset) throws IOException, BadLocationException {
    this.poiDocument = new XWPFDocument(in);
    iteratePart(this.poiDocument.getBodyElements());

    // NOTE(review): the offset is recorded only after the body has been walked; if the walk
    // is meant to start inserting at this offset, the assignment belongs before it - confirm.
    this.currentOffset = offset;

    // Keep the parsed POI model reachable from the target document.
    document.putProperty("XWPFDocument", this.poiDocument);
}
From source file:org.ArticleEditor.OptionsView.MenuOptionsTopComponent.java
public XWPFDocument getDocument(DataObject dataObject) throws FileNotFoundException, IOException { org.openide.filesystems.FileObject documentFileObject = dataObject.getPrimaryFile(); File documentFile = FileUtil.toFile(documentFileObject); FileInputStream docxIS;//from w w w.j a v a2s . c o m docxIS = new FileInputStream(documentFile); XWPFDocument document = new XWPFDocument(docxIS); return document; }
From source file:org.crypto.sse.TextExtractPar.java
License:Open Source License
private static TextExtractPar extractOneDoc(File[] listOfFile) throws FileNotFoundException { Multimap<String, String> lookup1 = ArrayListMultimap.create(); Multimap<String, String> lookup2 = ArrayListMultimap.create(); for (File file : listOfFile) { for (int j = 0; j < 100; j++) { if (counter == (int) ((j + 1) * listOfFile.length / 100)) { System.out.println("Number of files read equals " + j + " %"); break; }//from w w w.j a va 2 s. c o m } List<String> lines = new ArrayList<String>(); counter++; FileInputStream fis = new FileInputStream(file); // ***********************************************************************************************// ///////////////////// .docx ///////////////////////////// // ***********************************************************************************************// if (file.getName().endsWith(".docx")) { XWPFDocument doc; try { // System.out.println("File read: "+file.getName()); doc = new XWPFDocument(fis); XWPFWordExtractor ex = new XWPFWordExtractor(doc); lines.add(ex.getText()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pptx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pptx")) { OPCPackage ppt; try { // System.out.println("File read: "+file.getName()); ppt = OPCPackage.open(fis); XSLFPowerPointExtractor xw = new XSLFPowerPointExtractor(ppt); lines.add(xw.getText()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // 
***********************************************************************************************// ///////////////////// .xlsx ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".xlsx")) { OPCPackage xls; try { // System.out.println("File read: "+file.getName()); xls = OPCPackage.open(fis); XSSFExcelExtractor xe = new XSSFExcelExtractor(xls); lines.add(xe.getText()); } catch (InvalidFormatException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (IOException e) { System.out.println("File not read: " + file.getName()); } catch (XmlException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } catch (OpenXML4JException e) { System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .doc ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".doc")) { NPOIFSFileSystem fs; try { // System.out.println("File read: "+file.getName()); fs = new NPOIFSFileSystem(file); WordExtractor extractor = new WordExtractor(fs.getRoot()); for (String rawText : extractor.getParagraphText()) { lines.add(extractor.stripFields(rawText)); } } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// .pdf ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".pdf")) { PDFParser parser; try { // System.out.println("File read: "+file.getName()); parser = new 
PDFParser(fis); parser.parse(); COSDocument cd = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); lines.add(stripper.getText(new PDDocument(cd))); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } } // ***********************************************************************************************// ///////////////////// Media Files such as gif, jpeg, .wmv, .mpeg, ///////////////////// .mp4 ///////////////////////////// // ***********************************************************************************************// else if (file.getName().endsWith(".gif") && file.getName().endsWith(".jpeg") && file.getName().endsWith(".wmv") && file.getName().endsWith(".mpeg") && file.getName().endsWith(".mp4")) { lines.add(file.getName()); } // ***********************************************************************************************// ///////////////////// raw text extensions ///////////////////// ///////////////////////////// // ***********************************************************************************************// else { try { // System.out.println("File read: "+file.getName()); lines = Files.readLines(file, Charsets.UTF_8); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("File not read: " + file.getName()); } finally { try { fis.close(); } catch (IOException ioex) { // omitted. } } } // ***********************************************************************************************// ///////////////////// Begin word extraction ///////////////////// ///////////////////////////// // ***********************************************************************************************// int temporaryCounter = 0; // Filter threshold int counterDoc = 0; for (int i = 0; i < lines.size(); i++) { CharArraySet noise = EnglishAnalyzer.getDefaultStopSet(); // We are using a standard tokenizer that eliminates the stop // words. 
We can use Stemming tokenizer such Porter // A set of English noise keywords is used that will eliminates // words such as "the, a, etc" Analyzer analyzer = new StandardAnalyzer(noise); List<String> token = Tokenizer.tokenizeString(analyzer, lines.get(i)); temporaryCounter = temporaryCounter + token.size(); for (int j = 0; j < token.size(); j++) { // Avoid counting occurrences of words in the same file if (!lookup2.get(file.getName()).contains(token.get(j))) { lookup2.put(file.getName(), token.get(j)); } // Avoid counting occurrences of words in the same file if (!lookup1.get(token.get(j)).contains(file.getName())) { lookup1.put(token.get(j), file.getName()); } } } } // System.out.println(lookup.toString()); return new TextExtractPar(lookup1, lookup2); }
From source file:org.dspace.submit.step.UploadStep.java
License:BSD License
/** * Process the upload of a new file!/*from w ww . j av a2 s . co m*/ * * @param context * current DSpace context * @param request * current servlet request object * @param response * current servlet response object * @param subInfo * submission info object * * @return Status or error flag which will be processed by * UI-related code! (if STATUS_COMPLETE or 0 is returned, * no errors occurred!) */ public int processUploadFile(Context context, HttpServletRequest request, HttpServletResponse response, SubmissionInfo subInfo) throws ServletException, IOException, SQLException, AuthorizeException { boolean formatKnown = true; boolean fileOK = false; BitstreamFormat bf = null; Bitstream b = null; //NOTE: File should already be uploaded. //Manakin does this automatically via Cocoon. //For JSP-UI, the SubmissionController.uploadFiles() does the actual upload Enumeration attNames = request.getAttributeNames(); //loop through our request attributes while (attNames.hasMoreElements()) { String attr = (String) attNames.nextElement(); //if this ends with "-path", this attribute //represents a newly uploaded file if (attr.endsWith("-path")) { //strip off the -path to get the actual parameter //that the file was uploaded as String param = attr.replace("-path", ""); String exten = param.substring(param.length() - 3); // Load the file's path and input stream and description String filePath = (String) request.getAttribute(param + "-path"); InputStream fileInputStreamTest = (InputStream) request.getAttribute(param + "-inputstream"); ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte[] buf = new byte[1024]; int n = 0; while ((n = fileInputStreamTest.read(buf)) >= 0) baos.write(buf, 0, n); byte[] content = baos.toByteArray(); InputStream fileInputStream = new ByteArrayInputStream(content); InputStream fileInputStreamPdf = new ByteArrayInputStream(content); InputStream ifAnsi = new ByteArrayInputStream(content); //InputStream fss = fileInputStream.cl //attempt to get 
description from attribute first, then direct from a parameter String fileDescription = (String) request.getAttribute(param + "-description"); if (fileDescription == null || fileDescription.length() == 0) { fileDescription = request.getParameter("description"); } // if information wasn't passed by User Interface, we had a problem // with the upload if (filePath == null || fileInputStream == null) { return STATUS_UPLOAD_ERROR; } if (subInfo == null) { // In any event, if we don't have the submission info, the request // was malformed return STATUS_INTEGRITY_ERROR; } // Create the bitstream Item item = subInfo.getSubmissionItem().getItem(); // do we already have a bundle? Bundle[] bundles = item.getBundles("ORIGINAL"); if (bundles.length < 1) { // set bundle's name to ORIGINAL b = item.createSingleBitstream(fileInputStream, "ORIGINAL"); } else { // we have a bundle already, just add bitstream b = bundles[0].createBitstream(fileInputStream); } //fileDescription.op if (exten.toLowerCase().equals("pdf")) { try { PDFTextStripper pdfStripper = null; PDDocument docum = null; PDFParser parser = new PDFParser(fileInputStreamPdf); COSDocument cosDoc = null; parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); docum = new PDDocument(cosDoc); //pdfStripper.getText(docum); String parsedText = pdfStripper.getText(docum); Integer fifty = (Integer) Math.round(parsedText.length() / 2); if (fifty < 0) { fifty = fifty * (-1); } Integer toCut = 500; if ((parsedText.length() - fifty) < 500) { toCut = parsedText.length(); } log.info("FUCKTHISSHIT: " + fifty + " " + toCut); String subText = parsedText.substring(fifty, fifty + toCut - 1); try { subText = subText.substring(subText.indexOf(".") + 1); } catch (Exception e) { } item.addMetadata("dc", "textpart", null, null, subText + "..."); item.update(); context.commit(); log.info(parsedText); } catch (Exception e) { log.info("omgerror: " + e.toString()); } } if (exten.toLowerCase().equals("txt")) { 
StringWriter writer = new StringWriter(); IOUtils.copy(fileInputStreamPdf, writer, "UTF-8"); String theString = writer.toString(); if (theString.startsWith("\uFEFF")) { } else { StringWriter writerAnsi = new StringWriter(); IOUtils.copy(ifAnsi, writerAnsi, "Cp1252"); theString = writerAnsi.toString(); } Integer fifty = (Integer) Math.round(theString.length() * (50 / 100.0f)); Integer toCut = 500; if ((theString.length() - fifty) < 500) { toCut = theString.length(); } String subText = theString.substring(fifty, toCut - 1); item.addMetadata("dc", "textpart", null, null, subText + "..."); item.update(); context.commit(); log.info(subText); } log.info("OMGTEST: " + exten); if (exten.toLowerCase().equals("doc")) { WordExtractor extractor = null; try { HWPFDocument document = new HWPFDocument(fileInputStreamPdf); extractor = new WordExtractor(document); String fileData = extractor.getText(); Integer fifty = (Integer) Math.round(50 * 100 / fileData.length()); Integer toCut = 500; if ((fileData.length() - fifty) < 500) { toCut = fileData.length(); } String subText = fileData.substring(fifty, toCut - 1); item.addMetadata("dc", "textpart", null, null, subText + "..."); item.update(); context.commit(); } catch (Exception exep) { log.info("OMGTESTIK:" + exep); } } if ((exten.toLowerCase().equals("ocx"))) { XWPFDocument document = new XWPFDocument(fileInputStreamPdf); XWPFWordExtractor extractor = null; extractor = new XWPFWordExtractor(document); String text = extractor.getText(); Integer fifty = (Integer) Math.round(50 * 100 / text.length()); Integer toCut = 500; if ((text.length() - fifty) < 500) { toCut = text.length(); } String subText = text.substring(fifty, toCut - 1); item.addMetadata("dc", "textpart", null, null, subText + "..."); item.update(); context.commit(); } // Strip all but the last filename. It would be nice // to know which OS the file came from. 
String noPath = filePath; while (noPath.indexOf('/') > -1) { noPath = noPath.substring(noPath.indexOf('/') + 1); } while (noPath.indexOf('\\') > -1) { noPath = noPath.substring(noPath.indexOf('\\') + 1); } b.setName(noPath); b.setSource(filePath); b.setDescription(fileDescription); // Identify the format bf = FormatIdentifier.guessFormat(context, b); b.setFormat(bf); // Update to DB b.update(); item.update(); if ((bf != null) && (bf.isInternal())) { log.warn("Attempt to upload file format marked as internal system use only"); backoutBitstream(subInfo, b, item); return STATUS_UPLOAD_ERROR; } // Check for virus if (ConfigurationManager.getBooleanProperty("submission-curation", "virus-scan")) { Curator curator = new Curator(); curator.addTask("vscan").curate(item); int status = curator.getStatus("vscan"); if (status == Curator.CURATE_ERROR) { backoutBitstream(subInfo, b, item); return STATUS_VIRUS_CHECKER_UNAVAILABLE; } else if (status == Curator.CURATE_FAIL) { backoutBitstream(subInfo, b, item); return STATUS_CONTAINS_VIRUS; } } // If we got this far then everything is more or less ok. // Comment - not sure if this is the right place for a commit here // but I'm not brave enough to remove it - Robin. context.commit(); // save this bitstream to the submission info, as the // bitstream we're currently working with subInfo.setBitstream(b); //if format was not identified if (bf == null) { return STATUS_UNKNOWN_FORMAT; } } //end if attribute ends with "-path" } //end while return STATUS_COMPLETE; }
From source file:org.eclipse.sw360.licenseinfo.outputGenerators.DocxGenerator.java
License:Open Source License
/**
 * Renders the license information of a project into a .docx file built from the template
 * that belongs to the current output variant.
 *
 * @param projectLicenseInfoResults parsed license information to put into the document.
 * @param projectName               name of the project.
 * @param projectVersion            version of the project.
 * @param licenseInfoHeaderText     header text for the document.
 * @return the bytes of the generated .docx file.
 * @throws SW360Exception if the template cannot be loaded or the document cannot be produced.
 * @throws IllegalArgumentException if the output variant is neither DISCLOSURE nor REPORT.
 */
@Override
public byte[] generateOutputFile(Collection<LicenseInfoParsingResult> projectLicenseInfoResults,
        String projectName, String projectVersion, String licenseInfoHeaderText) throws SW360Exception {
    ByteArrayOutputStream docxOutputStream = new ByteArrayOutputStream();
    try {
        // The two variants differ only in the template used and the flag passed to fillDocument.
        final String templateFile;
        final boolean isReport;
        switch (getOutputVariant()) {
        case DISCLOSURE:
            templateFile = DOCX_TEMPLATE_FILE;
            isReport = false;
            break;
        case REPORT:
            templateFile = DOCX_TEMPLATE_REPORT_FILE;
            isReport = true;
            break;
        default:
            throw new IllegalArgumentException("Unknown generator variant type: " + getOutputVariant());
        }

        Optional<byte[]> docxTemplateFile = CommonUtils.loadResource(DocxGenerator.class, templateFile);
        // Check for presence before get(): a missing template must surface as the
        // SW360Exception below, not as an exception from the empty Optional.
        if (!docxTemplateFile.isPresent()) {
            throw new SW360Exception("Could not load the template for xwpf document: " + templateFile);
        }

        XWPFDocument xwpfDocument = new XWPFDocument(new ByteArrayInputStream(docxTemplateFile.get()));
        fillDocument(xwpfDocument, projectLicenseInfoResults, projectName, projectVersion,
                licenseInfoHeaderText, isReport);

        xwpfDocument.write(docxOutputStream);
        docxOutputStream.close();
    } catch (XmlException e) {
        throw new SW360Exception("Got XmlException while generating docx document: " + e.getMessage());
    } catch (IOException e) {
        throw new SW360Exception("Got IOException when generating docx document: " + e.getMessage());
    } catch (TException e) {
        throw new SW360Exception("Error reading sw360 licenses: " + e.getMessage());
    }
    return docxOutputStream.toByteArray();
}
From source file:org.encuestame.business.search.IndexerFile.java
License:Apache License
/**
 * Parse Word Document.
 *
 * @param file the .docx file to parse.
 * @return an extractor for the document text, or {@code null} if the file could not be parsed
 *         (the failure is logged).
 * @throws POIXMLException declared for callers; parse failures are logged, not rethrown.
 * @throws Exception if the file cannot be opened or its stream cannot be closed.
 */
public static XWPFWordExtractor parseWordDocument(final File file) throws POIXMLException, Exception {
    // The stream is only needed while the document is being built; close it on every path.
    try (InputStream is = new FileInputStream(file)) {
        try {
            XWPFDocument wd = new XWPFDocument(is);
            XWPFWordExtractor wde = new XWPFWordExtractor(wd);
            log.debug("Parse Word Document --------------------------> ");
            return wde;
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is kept in the log.
            log.error("ERROR parse Word Document-------->", e);
            return null;
        }
    }
}
From source file:org.encuestame.business.search.SearchUtils.java
License:Apache License
/**
 * Create Document Word.
 *
 * @param file {@link File} holding the .docx content.
 * @return {@link Document} built from the file and its extracted text; the text is
 *         {@code null} when the file could not be parsed.
 * @throws POIXMLException declared for callers; parse failures are logged, not rethrown.
 * @throws Exception if the file cannot be opened or its stream cannot be closed.
 */
public static Document createWordDocument(final File file) throws POIXMLException, Exception {
    String bodyText = null;
    // The stream is only needed while the text is extracted; close it on every path.
    try (InputStream is = new FileInputStream(file)) {
        try {
            XWPFDocument wd = new XWPFDocument(is);
            XWPFWordExtractor wde = new XWPFWordExtractor(wd);
            bodyText = wde.getText();
        } catch (Exception e) {
            // Best effort: an unparseable file is still indexed, just without body text.
            log.debug(e);
        }
    }
    Document doc = SearchUtils.addFields(file, bodyText);
    return doc;
}
From source file:org.exoplatform.services.document.impl.MSXWordDocumentReader.java
License:Open Source License
/**
 * Extracts the plain text of a .docx document.
 *
 * @param is an input stream with .docx file content; it is closed before this method returns.
 * @return the trimmed text of the document, or an empty string when the stream reports no
 *         available bytes.
 * @throws IllegalArgumentException if {@code is} is null.
 * @throws DocumentReadException    if the document cannot be opened.
 */
public String getContentAsText(final InputStream is) throws IOException, DocumentReadException {
    if (is == null) {
        throw new IllegalArgumentException("InputStream is null.");
    }

    try {
        if (is.available() == 0) {
            return "";
        }

        // Open the document inside a privileged action.
        final XWPFDocument document;
        try {
            document = SecurityHelper.doPrivilegedIOExceptionAction(new PrivilegedExceptionAction<XWPFDocument>() {
                public XWPFDocument run() throws Exception {
                    return new XWPFDocument(is);
                }
            });
        } catch (IOException e) {
            throw new DocumentReadException("Can't open message.", e);
        } catch (OpenXML4JRuntimeException e) {
            throw new DocumentReadException("Can't open message.", e);
        }

        // Extract the text inside a privileged action as well.
        final XWPFWordExtractor wordExtractor = new XWPFWordExtractor(document);
        String content = SecurityHelper.doPrivilegedAction(new PrivilegedAction<String>() {
            public String run() {
                return wordExtractor.getText();
            }
        });
        return content.trim();
    } finally {
        // The stream was verified non-null above; a failing close is only traced.
        try {
            is.close();
        } catch (IOException e) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("An exception occurred: " + e.getMessage());
            }
        }
    }
}