opennlp.tools.similarity.apps.solr.WordDocBuilderEndNotes.java Source code

Introduction

Here is the source code for opennlp.tools.similarity.apps.solr.WordDocBuilderEndNotes.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.similarity.apps.solr;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.math.BigInteger;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import javax.xml.bind.JAXBException;

import net.billylieurance.azuresearch.AzureSearchImageResult;
import net.billylieurance.azuresearch.AzureSearchResultSet;
import net.billylieurance.azuresearch.AzureSearchWebResult;

import org.apache.commons.lang.StringUtils;
import org.docx4j.XmlUtils;
import org.docx4j.dml.wordprocessingDrawing.Inline;
import org.docx4j.jaxb.Context;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.exceptions.InvalidFormatException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPartAbstractImage;
import org.docx4j.openpackaging.parts.WordprocessingML.EndnotesPart;
import org.docx4j.wml.CTEndnotes;
import org.docx4j.wml.CTFtnEdn;
import org.docx4j.wml.Drawing;
import org.docx4j.wml.P;
import org.docx4j.wml.R;

import opennlp.tools.similarity.apps.BingQueryRunner;
import opennlp.tools.similarity.apps.Fragment;
import opennlp.tools.similarity.apps.HitBase;

public class WordDocBuilderEndNotes extends WordDocBuilderSingleImageSearchCall {

    /**
     * Builds a .docx document from the given hits and writes it to disk.
     *
     * <p>The document consists of the upper-cased {@code title}, then for every hit
     * that has at least one fragment: an optional subtitle derived from the hit
     * title, the cleaned fragment text, an endnote holding the hit URL together
     * with a reference to it in the body, and (best effort) an image. A final
     * "REFERENCES" section lists the title and URL of each such hit.
     *
     * <p>The package is saved twice: to {@code absPath + "written/"} and to
     * {@code /var/www/wrt_latest/}. Failures while building or saving are printed
     * to standard error and do not propagate.
     *
     * @param content hits to render; hits without fragments are skipped
     * @param title   document title, also used (spaces replaced by underscores,
     *                double quotes removed) as the file name; must not be null
     * @return the path under {@code absPath + "written/"} the document is saved
     *         to, returned even if building or saving failed
     */
    public String buildWordDoc(List<HitBase> content, String title) {

        // One sanitised file name shared by both output locations.
        String docFileName = title.replace(' ', '_').replace('\"', ' ').trim() + ".docx";
        String outputDocFinename = absPath + "written/" + docFileName;

        List<String> imageURLs = getAllImageSearchResults(title);
        try {
            WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();

            // Stays null unless the endnotes part was really attached to the package,
            // so the body never references endnotes that do not exist in the file.
            CTEndnotes endnotes = null;
            try {
                EndnotesPart ep = new EndnotesPart();
                CTEndnotes created = Context.getWmlObjectFactory().createCTEndnotes();
                ep.setJaxbElement(created);
                wordMLPackage.getMainDocumentPart().addTargetPart(ep);
                endnotes = created;
            } catch (InvalidFormatException e1) {
                // The document is still produced, just without endnotes.
                e1.printStackTrace();
            }

            wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Title", title.toUpperCase());

            int count = 0;
            BigInteger refId = BigInteger.ONE;
            for (HitBase para : content) {
                if (para.getFragments() == null || para.getFragments().size() < 1) // no found content in this hit
                    continue;
                try {
                    // A hit without a title simply gets no subtitle instead of
                    // losing its text, endnote and image to an exception.
                    String rawTitle = para.getTitle();
                    String processedParaTitle = rawTitle == null ? null : processParagraphTitle(rawTitle);

                    if ((processedParaTitle != null && !processedParaTitle.endsWith(".."))
                            || StringUtils.isAlphanumeric(processedParaTitle)) {
                        wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle",
                                processedParaTitle);
                    }
                    String paraText = processParagraphText(para.getFragments().toString());
                    wordMLPackage.getMainDocumentPart().addParagraphOfText(paraText);

                    if (endnotes != null) {
                        appendEndnoteWithReference(wordMLPackage, endnotes, refId, para.getUrl());
                        // BigInteger is immutable: the result of add() must be kept.
                        // Incrementing only after both the endnote and its reference
                        // were written keeps their ids identical.
                        refId = refId.add(BigInteger.ONE);
                    }

                    try {
                        addImageByImageURLToPackage(count, wordMLPackage, imageURLs);
                    } catch (Exception ignored) {
                        // Images are optional decoration; a missing one is not reported.
                    }
                } catch (Exception e) {
                    // Skip the rest of this hit but keep building the document.
                    e.printStackTrace();
                }
                count++;
            }

            // now add URLs
            wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", "REFERENCES");
            for (HitBase para : content) {
                if (para.getFragments() == null || para.getFragments().size() < 1) // no found content in this hit
                    continue;
                try {
                    wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle", para.getTitle());
                    wordMLPackage.getMainDocumentPart().addParagraphOfText(para.getUrl());
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }

            try {
                wordMLPackage.save(new File(outputDocFinename));
                System.out.println("Finished creating docx =" + outputDocFinename);
            } catch (Exception e) {
                e.printStackTrace();
            }

            // Second copy; the two saves are independent of each other.
            try {
                String fileNameToDownload = "/var/www/wrt_latest/" + docFileName;
                wordMLPackage.save(new File(fileNameToDownload));
                System.out.println("Wrote a doc for download :" + fileNameToDownload);
            } catch (Exception e) {
                e.printStackTrace();
            }

        } catch (Exception e) {
            // Package creation or another unexpected step failed; report and fall through.
            e.printStackTrace();
        }

        return outputDocFinename;
    }

    /**
     * Adds one endnote containing {@code url} to {@code endnotes} and appends a
     * paragraph to the main document part that references it by {@code refId}.
     */
    private static void appendEndnoteWithReference(WordprocessingMLPackage wordMLPackage, CTEndnotes endnotes,
            BigInteger refId, String url) {
        CTFtnEdn endnote = Context.getWmlObjectFactory().createCTFtnEdn();
        endnotes.getEndnote().add(endnote);
        endnote.setId(refId);

        // The URL is escaped because query strings routinely contain '&',
        // which would otherwise make the fragment below unparseable.
        String endnoteBody = "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" ><w:pPr><w:pStyle w:val=\"EndnoteText\"/></w:pPr><w:r><w:rPr>"
                + "<w:rStyle w:val=\"EndnoteReference\"/></w:rPr><w:endnoteRef/></w:r><w:r><w:t xml:space=\"preserve\"> "
                + escapeXmlText(url) + "</w:t></w:r></w:p>";
        try {
            endnote.getEGBlockLevelElts().add(XmlUtils.unmarshalString(endnoteBody));
        } catch (JAXBException e) {
            e.printStackTrace();
        }

        // Add the body text referencing it
        String docBody = "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" ><w:r><w:t>"
                + "</w:t></w:r><w:r><w:rPr><w:rStyle w:val=\"EndnoteReference\"/></w:rPr><w:endnoteReference w:id=\""
                + refId.toString() + "\"/></w:r></w:p>";
        try {
            wordMLPackage.getMainDocumentPart().addParagraph(docBody);
        } catch (JAXBException e) {
            e.printStackTrace();
        }
    }

    /**
     * Escapes the characters that are not allowed verbatim in XML element text.
     * A null input yields an empty string.
     */
    private static String escapeXmlText(String text) {
        if (text == null) {
            return "";
        }
        // '&' must be replaced first so the entities produced below stay intact.
        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
    }

    /**
     * Cleans up raw paragraph text by applying a fixed list of literal
     * substitutions, one after another: square brackets and the " | " separator
     * are removed, and a handful of doubled punctuation sequences are collapsed.
     *
     * @param title the text to clean; must not be null
     * @return the text after all substitutions have been applied in order
     */
    public static String processParagraphText(String title) {
        // Each row is {literal to find, replacement}. Rows are applied from top to
        // bottom, and a later row sees the output of the earlier ones.
        final String[][] substitutions = {
                { "[", "" },
                { "]", "" },
                { " | ", "" },
                { ".,", "." },
                { ".\"", "\"" },
                { ". .", "." },
                { ",.", "." } };

        String cleaned = title;
        for (String[] rule : substitutions) {
            cleaned = cleaned.replace(rule[0], rule[1]);
        }
        return cleaned;
    }

    /**
     * Picks the most title-like piece of a composite page title.
     *
     * <p>The title is split on {@code '-'}, {@code '|'} and {@code '&'}; of the
     * pieces that contain no period, the longest one is returned (the first one
     * wins a tie). Pieces are returned as they appear in the title, including any
     * surrounding whitespace.
     *
     * @param title the raw title, may be null
     * @return the selected piece, or null if {@code title} is null or no piece
     *         without a period exists
     */
    public static String processParagraphTitle(String title) {
        // A missing title yields the same "nothing usable" result as a title whose
        // pieces all contain periods, instead of a NullPointerException.
        if (title == null) {
            return null;
        }

        // Normalise all three delimiters to '&' so a single split is enough.
        String[] titleParts = title.replace('-', '&').replace('|', '&').split("&");

        String bestPart = null;
        for (String candidatePart : titleParts) {
            if (candidatePart.indexOf('.') >= 0) {
                continue; // pieces with periods are never selected
            }
            // Strictly longer only, so the earliest piece wins a tie.
            if (bestPart == null || candidatePart.length() > bestPart.length()) {
                bestPart = candidatePart;
            }
        }
        return bestPart;
    }

    /**
     * Demo entry point: builds ten synthetic hits (title, one fragment and a URL
     * each, all numbered 0..9) and renders them into a document titled
     * "albert einstein".
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String topic = "albert einstein";
        final int hitCount = 10;

        WordDocBuilderEndNotes builder = new WordDocBuilderEndNotes();
        List<HitBase> hits = new ArrayList<HitBase>();

        int index = 0;
        while (index < hitCount) {
            HitBase hit = new HitBase();
            hit.setTitle(topic + " " + index);

            // A single fragment per hit is enough for the hit to be rendered.
            List<Fragment> fragments = new ArrayList<Fragment>();
            fragments.add(new Fragment(" content " + index, 0));
            hit.setFragments(fragments);

            hit.setUrl("http://www." + index + ".com");
            hits.add(hit);
            index++;
        }

        builder.buildWordDoc(hits, topic);
    }
}