Example usage for the org.apache.poi.hwpf.HWPFDocument constructor

Introduction

On this page you can find example usages of the org.apache.poi.hwpf.HWPFDocument constructor.

Prototype

public HWPFDocument(DirectoryNode directory) throws IOException

Source Link

Document

This constructor loads a Word document from a specific directory node within a POIFSFileSystem, which is usually not the root directory (for example, when opening an embedded document).

Usage

From source file:RefDiviedMain.java

License:Creative Commons License

/**
 * Loads the Word document at {@code fileName}, lets {@code Main.readParagraphs} fill
 * {@code refs} with its paragraphs, and then walks {@code refs} once, sorting the entries
 * into the static fields of this class.
 *
 * <p>Entries are trimmed in place and entries shorter than three characters are skipped.
 * The first remaining entry is stored in {@code isOriginal}, the second in {@code title}.
 * After that, the walk is driven by marker entries:
 * <ul>
 *   <li>{@code [a]} ... {@code [/a]}: author entries</li>
 *   <li>{@code key:value} metadata lines (dates, doi, volume, pages, copyright, ...),
 *       validated against simple numeric patterns</li>
 *   <li>{@code Abstract}, {@code [s1]Keywords}, {@code [body]}, {@code Acknowledgement},
 *       {@code [back]}: each collected up to the next section marker</li>
 *   <li>{@code Table} and {@code Figure legends}: entries split on {@code [title]} /
 *       {@code [footnotes]} and {@code [legend]} / {@code [file]}</li>
 *   <li>{@code References}: every remaining entry, joined with up to two following
 *       URL lines</li>
 * </ul>
 *
 * <p>Validation problems are reported through {@code error(...)}. A structural problem
 * (a section whose closing marker never appears, or a table/figure entry with a missing
 * part) stops the walk and is reported in {@code ta} with a descriptive message.
 *
 * @param fileName path of the Word document to read
 */
public static void readMyDocument(String fileName) {

    try {

        POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(fileName));

        HWPFDocument doc = new HWPFDocument(fs);

        // Read the content
        Main.readParagraphs(doc, RefDiviedMain.ta, RefDiviedMain.refs);

        // Compiled once up front instead of once per entry.
        Pattern fullDate = Pattern.compile("^\\s{0,}\\d+/\\d+/\\d+\\s{0,}$");
        Pattern shortDate = Pattern.compile("^\\s{0,}\\d+/\\d+\\s{0,}$");
        Pattern digitsOnly = Pattern.compile("^\\s{0,}\\d+\\s{0,}$");
        Pattern fourDigits = Pattern.compile("^\\s{0,}\\d{4}\\s{0,}$");

        // Counts only the entries that carry content (three characters or more).
        int realIndex = 0;

        try {

            for (int i = 0; i < refs.size(); i++) {

                String temp = refs.get(i).trim();

                System.out.println(i + ":" + temp);

                refs.set(i, temp);

                if (temp.length() < 3) {
                    continue;
                }

                realIndex++;

                ta.append("\n" + "dealing with the " + i + " ref:" + temp);

                if (realIndex == 1) {
                    RefDiviedMain.isOriginal = temp;
                    continue;
                }

                if (realIndex == 2) {
                    title = temp;
                    continue;
                }

                if (temp.startsWith("[a]")) {
                    if (temp.equals("[a]")) {

                        int j = i;

                        while (!refOrFail(j, "[/a]").equals("[/a]")) {

                            String entry = refs.get(j).trim();
                            refs.set(j, entry);

                            if (entry.length() >= 3) {
                                authors.add(entry);
                            }

                            j++;
                        }

                        // i now points at "[/a]"; the loop increment steps past it.
                        i = j;
                        continue;
                    }

                    RefDiviedMain.error("[a] must not be followed by anything.\n" + temp);
                }

                if (temp.startsWith("Manuscript accepted")) {
                    manuscriptDateAccepted = temp.replace("Manuscript accepted:", "");
                    if (fullDate.matcher(manuscriptDateAccepted).find()) {
                        continue;
                    }
                    RefDiviedMain.error(
                            "manuscriptDateAccepted must follow by data format:dd/dd/dddd.\n" + temp);
                }

                if (temp.startsWith("Short title")) {
                    shortTitle = temp.replace("Short title:", "");
                    continue;
                }

                if (temp.startsWith("doi:")) {
                    doi = temp.replace("doi:", "");
                    continue;
                }

                if (temp.startsWith("ppub:")) {
                    ppub = temp.replace("ppub:", "");
                    if (shortDate.matcher(ppub).find()) {
                        continue;
                    }
                    // The ppub pattern accepts two numeric groups, so the message says so.
                    RefDiviedMain.error("ppub data format must be:dd/dddd.\n" + temp);
                }

                if (temp.startsWith("epub:")) {
                    epub = temp.replace("epub:", "");
                    if (fullDate.matcher(epub).find()) {
                        continue;
                    }
                    RefDiviedMain.error("epub data format must be:dd/dd/dddd.\n" + temp);
                }

                if (temp.startsWith("volume:")) {
                    volume = temp.replace("volume:", "");
                    if (digitsOnly.matcher(volume).find()) {
                        continue;
                    }
                    RefDiviedMain.error("must only be digit number:\n" + temp);
                }

                if (temp.startsWith("issue")) {
                    issue = temp.replace("issue:", "");
                    if (digitsOnly.matcher(issue).find()) {
                        continue;
                    }
                    RefDiviedMain.error("must only be digit number:\n" + temp);
                }

                if (temp.startsWith("fpage")) {
                    fpage = temp.replace("fpage:", "");
                    if (digitsOnly.matcher(fpage).find()) {
                        continue;
                    }
                    RefDiviedMain.error("must only be digit number:\n" + temp);
                }

                if (temp.startsWith("lpage")) {
                    lpage = temp.replace("lpage:", "");
                    if (digitsOnly.matcher(lpage).find()) {
                        continue;
                    }
                    RefDiviedMain.error("must only be digit number:\n" + temp);
                }

                if (temp.startsWith("date accepted")) {
                    dateAccepted = temp.replace("date accepted:", "");
                    if (fullDate.matcher(dateAccepted).find()) {
                        continue;
                    }
                    RefDiviedMain.error("data format must be:dd/dd/dddd.\n" + temp);
                }

                if (temp.startsWith("copyright-statement")) {
                    copyrightStat = temp.replace("copyright-statement:", "");
                    continue;
                }

                if (temp.startsWith("copyright-year")) {
                    copyrightYear = temp.replace("copyright-year:", "");
                    if (fourDigits.matcher(copyrightYear).find()) {
                        continue;
                    }
                    RefDiviedMain.error("must only be 4 digit numbers:\n" + temp);
                }

                if (temp.equalsIgnoreCase("Abstract")) {
                    int j = i;
                    while (!refOrFail(j, "[s1]Keywords").trim().equals("[s1]Keywords")) {

                        ta.append("try find out introduction " + refs.get(j) + "\n");

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);

                        if (entry.length() >= 3) {
                            abstractArr.add(entry);
                        }

                        j++;
                    }

                    // Step back so the outer loop visits the terminating marker itself.
                    i = j - 1;
                    continue;
                }

                if (temp.equalsIgnoreCase("[s1]Keywords")) {
                    int j = i;
                    while (!refOrFail(j, "[body]").trim().equals("[body]")) {

                        ta.append("try find out introduction " + refs.get(j) + "\n");

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);

                        if (entry.length() >= 3) {
                            keywordArr.add(entry);
                        }

                        j++;
                    }

                    i = j - 1;
                    continue;
                }

                if (temp.equals("[body]")) {
                    RefDiviedMain.isValidBoday = true;
                    int j = i;
                    while (!refOrFail(j, "Acknowledgement or [back]").equals("Acknowledgement")
                            && !refs.get(j).equals("[back]")) {

                        ta.append("try find out introduction " + refs.get(j) + "\n");

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);

                        if (entry.length() >= 3) {
                            RefDiviedMain.discussion.add(entry);
                        }

                        j++;
                    }

                    i = j - 1;
                    continue;
                }

                if (temp.equals("Acknowledgement")) {

                    int j = i + 1;

                    while (!refOrFail(j, "[back]").equals("[back]")) {

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);

                        if (entry.length() >= 3) {
                            RefDiviedMain.acknowledgement.add(entry);
                        }

                        j++;
                    }

                    i = j - 1;
                    continue;
                }

                if (temp.equals("[back]")) {
                    RefDiviedMain.isValidBack = true;

                    int j = i;

                    while (!refOrFail(j, "Table, Figure legends or References").equals("Table")
                            && !refs.get(j).equals("Figure legends")
                            && !refs.get(j).equals("References")) {

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);

                        if (entry.length() >= 3) {
                            RefDiviedMain.disclosure.add(entry);
                        }

                        j++;
                    }

                    i = j - 1;
                    continue;
                }

                if (temp.equalsIgnoreCase("Table")) {

                    int j = i + 1;

                    while (!refOrFail(j, "Figure legends or References").startsWith("Figure legends")
                            && !refs.get(j).equals("References")) {

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);
                        System.out.println("setup table:" + entry);

                        if (entry.length() < 3) {
                            j++;
                            continue;
                        }

                        if (!entry.contains("[title]")) {
                            RefDiviedMain.error("problem happened around " + entry);
                        }

                        String[] titleSplit = entry.split("\\[title\\]");
                        if (titleSplit.length < 2) {
                            throw new IllegalStateException(
                                    "table entry has no text after [title]: " + entry);
                        }

                        String tableBody;
                        StringBuilder footnotes = new StringBuilder();

                        if (titleSplit[1].contains("footnotes")) {
                            // split() drops trailing empty parts, so either side may be absent.
                            String[] noteSplit = titleSplit[1].split("\\[footnotes\\]");
                            tableBody = noteSplit.length > 0 ? noteSplit[0] : "";
                            if (noteSplit.length > 1) {
                                footnotes.append(noteSplit[1]);
                            }

                            // Following entries belong to the footnotes until the next
                            // table entry or section marker.
                            j++;
                            while (!refOrFail(j, "Figure legends, References or the next Table entry")
                                    .startsWith("Figure legends")
                                    && !refs.get(j).equals("References")
                                    && !refs.get(j).startsWith("Table")) {
                                footnotes.append("aaaaa").append(refs.get(j));
                                j++;
                            }
                            j--;
                        } else {
                            tableBody = titleSplit[1];
                        }

                        List<String> aTable = new ArrayList<String>();
                        aTable.add(titleSplit[0]);
                        aTable.add(tableBody);
                        aTable.add(footnotes.toString());

                        RefDiviedMain.table.add(aTable);

                        j++;
                    }

                    i = j - 1;
                    continue;
                }

                if (temp.equals("Figure legends")) {

                    int j = i + 1;

                    while (!refOrFail(j, "References").startsWith("References")) {

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);

                        if (entry.length() < 3) {
                            j++;
                            continue;
                        }

                        if (!entry.contains("[legend]")) {
                            RefDiviedMain.error("problem happened around " + entry);
                        }

                        String[] legendSplit = entry.split("\\[legend\\]");
                        if (legendSplit.length < 2) {
                            throw new IllegalStateException(
                                    "figure entry has no text after [legend]: " + entry);
                        }

                        String[] fileSplit = legendSplit[1].split("\\[file\\]");
                        if (fileSplit.length < 2) {
                            throw new IllegalStateException(
                                    "figure entry has no [file] part: " + entry);
                        }

                        List<String> aTable = new ArrayList<String>();
                        aTable.add(legendSplit[0]);
                        aTable.add(fileSplit[0]);
                        aTable.add(fileSplit[1]);

                        RefDiviedMain.figure.add(aTable);

                        j++;
                    }

                    i = j - 1;
                    continue;
                }

                if (temp.equals("References")) {
                    RefDiviedMain.isValidRefs = true;

                    i++;

                    // Everything up to the end of the document is a reference.
                    while (i < refs.size()) {

                        System.out.println("adding reference before:" + refs.get(i));
                        String result = refs.get(i);
                        result += "httphttp";

                        String secondString = null;
                        if (i + 1 < refs.size()) {
                            secondString = refs.get(i + 1);
                        }

                        System.out.println("new string 1:" + secondString);
                        String thirdString = null;
                        if (i + 2 < refs.size()) {
                            thirdString = refs.get(i + 2);
                        }
                        System.out.println("new string 2:" + thirdString);

                        // Up to two URL lines directly after a reference are folded into it.
                        if (secondString != null && (secondString.trim().startsWith("http://")
                                || secondString.trim().startsWith("Http://"))) {
                            result += secondString.trim();
                            System.out.println("adding second string:" + result);
                            i++;

                            if (thirdString != null && (thirdString.trim().startsWith("http://")
                                    || thirdString.trim().startsWith("Http://"))) {
                                result += thirdString.trim();
                                System.out.println("adding third string:" + result);
                                i++;
                            }
                        }

                        // Literal replacement: the URL prefixes are not regular expressions.
                        result = result.replace("http://dx.doi.org/", "aaaaadoi");
                        result = result.replace("Http://dx.doi.org/", "aaaaadoi");
                        result = result.replace("http://www.ncbi.nlm.nih.gov/pubmed/", "aaaaapmid");
                        result = result.replace("Http://www.ncbi.nlm.nih.gov/pubmed/", "aaaaapmid");

                        System.out.println("adding reference after:" + result);

                        RefDiviedMain.references.add(result);

                        i++;
                    }
                }
            }

        } catch (Exception e) {

            ta.append("\nerrors happen:\n");
            // Some exceptions carry no message; fall back to the exception itself.
            ta.append((e.getMessage() == null ? e.toString() : e.getMessage()) + "\n");
        }

    } catch (Exception e) {

        ta.append(e.getMessage() == null ? e.toString() : e.getMessage());
    }
}

/**
 * Returns the entry of {@code refs} at {@code index}, or fails with a message naming the
 * marker that was being searched for when the end of {@code refs} has been reached.
 *
 * @param index          position in {@code refs} to read
 * @param expectedMarker description of the marker the caller is scanning for
 * @return the entry at {@code index}, untrimmed
 * @throws IllegalStateException if {@code index} is past the last entry
 */
private static String refOrFail(int index, String expectedMarker) {
    if (index >= refs.size()) {
        throw new IllegalStateException(
                "reached the end of the document while looking for " + expectedMarker);
    }
    return refs.get(index);
}

From source file:NewEmptyJUnitTest.java

/**
 * Tests that we can work with both {@link POIFSFileSystem}
 *  and {@link NPOIFSFileSystem}: the same sample document is opened through each
 *  filesystem and must yield the same extracted text, both when the extractor is
 *  built straight from the directory node and when it is built from a
 *  {@link HWPFDocument}.
 */
public void testDifferentPOIFS() throws Exception {
    POIDataSamples docTests = POIDataSamples.getDocumentInstance();

    // Open the two filesystems
    DirectoryNode[] files = new DirectoryNode[2];
    files[0] = (new POIFSFileSystem(docTests.openResourceAsStream("test2.doc"))).getRoot();
    NPOIFSFileSystem npoifsFileSystem = new NPOIFSFileSystem(docTests.getFile("test2.doc"));

    // Close the NPOIFS filesystem even when one of the assertions below fails.
    try {
        files[1] = npoifsFileSystem.getRoot();

        // Open directly
        for (DirectoryNode dir : files) {
            WordExtractor extractor = new WordExtractor(dir);
            assertEquals(p_text1_block, extractor.getText());
        }

        // Open via a HWPFDocument
        for (DirectoryNode dir : files) {
            HWPFDocument doc = new HWPFDocument(dir);
            WordExtractor extractor = new WordExtractor(doc);
            assertEquals(p_text1_block, extractor.getText());
        }
    } finally {
        npoifsFileSystem.close();
    }
}

From source file:RefSouceOnlyMain.java

License:Creative Commons License

/**
 * Loads the Word document at {@code fileName}, lets {@code Main.readParagraphs} fill
 * {@code refs} with its paragraphs, and then walks {@code refs} once, sorting the entries
 * into the static fields of this class.
 *
 * <p>Entries are trimmed in place and entries shorter than three characters are skipped.
 * The first remaining entry is stored in {@code isOriginal}, the second in {@code title}.
 * After that, the walk is driven by marker entries:
 * <ul>
 *   <li>{@code [a]} ... {@code [/a]}: author entries</li>
 *   <li>{@code key:value} metadata lines (dates, doi, volume, pages, copyright, ...),
 *       validated against simple numeric patterns</li>
 *   <li>{@code Abstract}, {@code [s1]Keywords}, {@code [body]}, {@code Acknowledgement},
 *       {@code [back]}: each collected up to the next section marker</li>
 *   <li>{@code Table} and {@code Figure legends}: entries split on {@code [title]} /
 *       {@code [footnotes]} and {@code [legend]} / {@code [file]}</li>
 *   <li>{@code References}: every remaining entry, joined with up to two following
 *       URL lines</li>
 * </ul>
 *
 * <p>Validation problems are reported through {@code error(...)}. A structural problem
 * (a section whose closing marker never appears, or a table/figure entry with a missing
 * part) stops the walk and is reported in {@code ta} with a descriptive message.
 *
 * @param fileName path of the Word document to read
 */
public static void readMyDocument(String fileName) {

    try {

        POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(fileName));

        HWPFDocument doc = new HWPFDocument(fs);

        // Read the content
        Main.readParagraphs(doc, RefSouceOnlyMain.ta, RefSouceOnlyMain.refs);

        // Compiled once up front instead of once per entry.
        Pattern fullDate = Pattern.compile("^\\s{0,}\\d+/\\d+/\\d+\\s{0,}$");
        Pattern shortDate = Pattern.compile("^\\s{0,}\\d+/\\d+\\s{0,}$");
        Pattern digitsOnly = Pattern.compile("^\\s{0,}\\d+\\s{0,}$");
        Pattern fourDigits = Pattern.compile("^\\s{0,}\\d{4}\\s{0,}$");

        // Counts only the entries that carry content (three characters or more).
        int realIndex = 0;

        try {

            for (int i = 0; i < refs.size(); i++) {

                String temp = refs.get(i).trim();

                System.out.println(i + ":" + temp);

                refs.set(i, temp);

                if (temp.length() < 3) {
                    continue;
                }

                realIndex++;

                ta.append("\n" + "dealing with the " + i + " ref:" + temp);

                if (realIndex == 1) {
                    RefSouceOnlyMain.isOriginal = temp;
                    continue;
                }

                if (realIndex == 2) {
                    title = temp;
                    continue;
                }

                if (temp.startsWith("[a]")) {
                    if (temp.equals("[a]")) {

                        int j = i;

                        while (!refOrFail(j, "[/a]").equals("[/a]")) {

                            String entry = refs.get(j).trim();
                            refs.set(j, entry);

                            if (entry.length() >= 3) {
                                authors.add(entry);
                            }

                            j++;
                        }

                        // i now points at "[/a]"; the loop increment steps past it.
                        i = j;
                        continue;
                    }

                    RefSouceOnlyMain.error("[a] must not be followed by anything.\n" + temp);
                }

                if (temp.startsWith("Manuscript accepted")) {
                    manuscriptDateAccepted = temp.replace("Manuscript accepted:", "");
                    if (fullDate.matcher(manuscriptDateAccepted).find()) {
                        continue;
                    }
                    RefSouceOnlyMain.error(
                            "manuscriptDateAccepted must follow by data format:dd/dd/dddd.\n" + temp);
                }

                if (temp.startsWith("Short title")) {
                    shortTitle = temp.replace("Short title:", "");
                    continue;
                }

                if (temp.startsWith("doi:")) {
                    doi = temp.replace("doi:", "");
                    continue;
                }

                if (temp.startsWith("ppub:")) {
                    ppub = temp.replace("ppub:", "");
                    if (shortDate.matcher(ppub).find()) {
                        continue;
                    }
                    // The ppub pattern accepts two numeric groups, so the message says so.
                    RefSouceOnlyMain.error("ppub data format must be:dd/dddd.\n" + temp);
                }

                if (temp.startsWith("epub:")) {
                    epub = temp.replace("epub:", "");
                    if (fullDate.matcher(epub).find()) {
                        continue;
                    }
                    RefSouceOnlyMain.error("epub data format must be:dd/dd/dddd.\n" + temp);
                }

                if (temp.startsWith("volume:")) {
                    volume = temp.replace("volume:", "");
                    if (digitsOnly.matcher(volume).find()) {
                        continue;
                    }
                    RefSouceOnlyMain.error("must only be digit number:\n" + temp);
                }

                if (temp.startsWith("issue")) {
                    issue = temp.replace("issue:", "");
                    if (digitsOnly.matcher(issue).find()) {
                        continue;
                    }
                    RefSouceOnlyMain.error("must only be digit number:\n" + temp);
                }

                if (temp.startsWith("fpage")) {
                    fpage = temp.replace("fpage:", "");
                    if (digitsOnly.matcher(fpage).find()) {
                        continue;
                    }
                    RefSouceOnlyMain.error("must only be digit number:\n" + temp);
                }

                if (temp.startsWith("lpage")) {
                    lpage = temp.replace("lpage:", "");
                    if (digitsOnly.matcher(lpage).find()) {
                        continue;
                    }
                    RefSouceOnlyMain.error("must only be digit number:\n" + temp);
                }

                if (temp.startsWith("date accepted")) {
                    dateAccepted = temp.replace("date accepted:", "");
                    if (fullDate.matcher(dateAccepted).find()) {
                        continue;
                    }
                    RefSouceOnlyMain.error("data format must be:dd/dd/dddd.\n" + temp);
                }

                if (temp.startsWith("copyright-statement")) {
                    copyrightStat = temp.replace("copyright-statement:", "");
                    continue;
                }

                if (temp.startsWith("copyright-year")) {
                    copyrightYear = temp.replace("copyright-year:", "");
                    if (fourDigits.matcher(copyrightYear).find()) {
                        continue;
                    }
                    RefSouceOnlyMain.error("must only be 4 digit numbers:\n" + temp);
                }

                if (temp.equalsIgnoreCase("Abstract")) {
                    int j = i;
                    while (!refOrFail(j, "[s1]Keywords").trim().equals("[s1]Keywords")) {

                        ta.append("try find out introduction " + refs.get(j) + "\n");

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);

                        if (entry.length() >= 3) {
                            abstractArr.add(entry);
                        }

                        j++;
                    }

                    // Step back so the outer loop visits the terminating marker itself.
                    i = j - 1;
                    continue;
                }

                if (temp.equalsIgnoreCase("[s1]Keywords")) {
                    int j = i;
                    while (!refOrFail(j, "[body]").trim().equals("[body]")) {

                        ta.append("try find out introduction " + refs.get(j) + "\n");

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);

                        if (entry.length() >= 3) {
                            keywordArr.add(entry);
                        }

                        j++;
                    }

                    i = j - 1;
                    continue;
                }

                if (temp.equals("[body]")) {
                    RefSouceOnlyMain.isValidBoday = true;
                    int j = i;
                    while (!refOrFail(j, "Acknowledgement or [back]").equals("Acknowledgement")
                            && !refs.get(j).equals("[back]")) {

                        ta.append("try find out introduction " + refs.get(j) + "\n");

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);

                        if (entry.length() >= 3) {
                            RefSouceOnlyMain.discussion.add(entry);
                        }

                        j++;
                    }

                    i = j - 1;
                    continue;
                }

                if (temp.equals("Acknowledgement")) {

                    int j = i + 1;

                    while (!refOrFail(j, "[back]").equals("[back]")) {

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);

                        if (entry.length() >= 3) {
                            RefSouceOnlyMain.acknowledgement.add(entry);
                        }

                        j++;
                    }

                    i = j - 1;
                    continue;
                }

                if (temp.equals("[back]")) {
                    RefSouceOnlyMain.isValidBack = true;

                    int j = i;

                    while (!refOrFail(j, "Table, Figure legends or References").equals("Table")
                            && !refs.get(j).equals("Figure legends")
                            && !refs.get(j).equals("References")) {

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);

                        if (entry.length() >= 3) {
                            RefSouceOnlyMain.disclosure.add(entry);
                        }

                        j++;
                    }

                    i = j - 1;
                    continue;
                }

                if (temp.equalsIgnoreCase("Table")) {

                    int j = i + 1;

                    while (!refOrFail(j, "Figure legends or References").startsWith("Figure legends")
                            && !refs.get(j).equals("References")) {

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);
                        System.out.println("setup table:" + entry);

                        if (entry.length() < 3) {
                            j++;
                            continue;
                        }

                        if (!entry.contains("[title]")) {
                            RefSouceOnlyMain.error("problem happened around " + entry);
                        }

                        String[] titleSplit = entry.split("\\[title\\]");
                        if (titleSplit.length < 2) {
                            throw new IllegalStateException(
                                    "table entry has no text after [title]: " + entry);
                        }

                        String tableBody;
                        StringBuilder footnotes = new StringBuilder();

                        if (titleSplit[1].contains("footnotes")) {
                            // split() drops trailing empty parts, so either side may be absent.
                            String[] noteSplit = titleSplit[1].split("\\[footnotes\\]");
                            tableBody = noteSplit.length > 0 ? noteSplit[0] : "";
                            if (noteSplit.length > 1) {
                                footnotes.append(noteSplit[1]);
                            }

                            // Following entries belong to the footnotes until the next
                            // table entry or section marker.
                            j++;
                            while (!refOrFail(j, "Figure legends, References or the next Table entry")
                                    .startsWith("Figure legends")
                                    && !refs.get(j).equals("References")
                                    && !refs.get(j).startsWith("Table")) {
                                footnotes.append("aaaaa").append(refs.get(j));
                                j++;
                            }
                            j--;
                        } else {
                            tableBody = titleSplit[1];
                        }

                        List<String> aTable = new ArrayList<String>();
                        aTable.add(titleSplit[0]);
                        aTable.add(tableBody);
                        aTable.add(footnotes.toString());

                        RefSouceOnlyMain.table.add(aTable);

                        j++;
                    }

                    i = j - 1;
                    continue;
                }

                if (temp.equals("Figure legends")) {

                    int j = i + 1;

                    while (!refOrFail(j, "References").startsWith("References")) {

                        String entry = refs.get(j).trim();
                        refs.set(j, entry);

                        if (entry.length() < 3) {
                            j++;
                            continue;
                        }

                        if (!entry.contains("[legend]")) {
                            RefSouceOnlyMain.error("problem happened around " + entry);
                        }

                        String[] legendSplit = entry.split("\\[legend\\]");
                        if (legendSplit.length < 2) {
                            throw new IllegalStateException(
                                    "figure entry has no text after [legend]: " + entry);
                        }

                        String[] fileSplit = legendSplit[1].split("\\[file\\]");
                        if (fileSplit.length < 2) {
                            throw new IllegalStateException(
                                    "figure entry has no [file] part: " + entry);
                        }

                        List<String> aTable = new ArrayList<String>();
                        aTable.add(legendSplit[0]);
                        aTable.add(fileSplit[0]);
                        aTable.add(fileSplit[1]);

                        RefSouceOnlyMain.figure.add(aTable);

                        j++;
                    }

                    i = j - 1;
                    continue;
                }

                if (temp.equals("References")) {
                    RefSouceOnlyMain.isValidRefs = true;

                    i++;

                    // Everything up to the end of the document is a reference.
                    while (i < refs.size()) {

                        System.out.println("adding reference:" + refs.get(i));
                        String result = refs.get(i);
                        result += "httphttp";

                        String secondString = null;
                        if (i + 1 < refs.size()) {
                            secondString = refs.get(i + 1);
                        }

                        System.out.println("new string 1:" + secondString);
                        String thirdString = null;
                        if (i + 2 < refs.size()) {
                            thirdString = refs.get(i + 2);
                        }

                        // Up to two URL lines directly after a reference are folded into it.
                        if (secondString != null && (secondString.trim().startsWith("http://")
                                || secondString.trim().startsWith("Http://"))) {
                            result += secondString.trim();
                            i++;

                            if (thirdString != null && (thirdString.trim().startsWith("http://")
                                    || thirdString.trim().startsWith("Http://"))) {
                                result += thirdString.trim();
                                i++;
                            }
                        }

                        // Literal replacement: the URL prefixes are not regular expressions.
                        result = result.replace("http://dx.doi.org/", "aaaaadoi");
                        result = result.replace("Http://dx.doi.org/", "aaaaadoi");
                        result = result.replace("http://www.ncbi.nlm.nih.gov/pubmed/", "aaaaapmid");
                        result = result.replace("Http://www.ncbi.nlm.nih.gov/pubmed/", "aaaaapmid");

                        RefSouceOnlyMain.references.add(result);

                        i++;
                    }
                }
            }

        } catch (Exception e) {

            ta.append("\nerrors happen:\n");
            // Some exceptions carry no message; fall back to the exception itself.
            ta.append((e.getMessage() == null ? e.toString() : e.getMessage()) + "\n");
        }

    } catch (Exception e) {

        ta.append(e.getMessage() == null ? e.toString() : e.getMessage());
    }
}

/**
 * Returns the entry of {@code refs} at {@code index}, or fails with a message naming the
 * marker that was being searched for when the end of {@code refs} has been reached.
 *
 * @param index          position in {@code refs} to read
 * @param expectedMarker description of the marker the caller is scanning for
 * @return the entry at {@code index}, untrimmed
 * @throws IllegalStateException if {@code index} is past the last entry
 */
private static String refOrFail(int index, String expectedMarker) {
    if (index >= refs.size()) {
        throw new IllegalStateException(
                "reached the end of the document while looking for " + expectedMarker);
    }
    return refs.get(index);
}

From source file:at.tugraz.sss.serv.SSFileU.java

License:Apache License

/**
 * Converts a binary Word document into a PDF, adding one PDF paragraph per paragraph string
 * returned by the Word extractor.
 *
 * <p>The length and text of every paragraph are also printed to standard output.
 *
 * @param docFilePath path handed to {@code openFileForRead} to obtain the Word document
 * @param pdfFilePath path handed to {@code openOrCreateFileWithPathForWrite} for the PDF output
 * @throws Exception if reading the Word document or writing the PDF fails
 */
public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws Exception {

    final Document document = new Document();
    final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath));
    final HWPFDocument word = new HWPFDocument(fs);
    final WordExtractor we = new WordExtractor(word);
    final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath);

    try {
        final PdfWriter writer = PdfWriter.getInstance(document, out);

        document.open();
        writer.setPageEmpty(true);
        document.newPage();
        writer.setPageEmpty(true);

        final String[] paragraphs = we.getParagraphText();

        for (int i = 0; i < paragraphs.length; i++) {
            // Drop line feeds together with any carriage returns directly in front of them.
            paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
            System.out.println("Length:" + paragraphs[i].length());
            System.out.println("Paragraph" + i + ": " + paragraphs[i]);

            // add the paragraph to the document
            document.add(new Paragraph(paragraphs[i]));
        }

        document.close();
    } finally {
        // Release the output file even when the conversion fails part-way. Closing a stream
        // that is already closed has no effect per the Closeable contract.
        out.close();
    }
}

From source file:at.tugraz.sss.serv.util.SSFileU.java

License:Apache License

/**
 * Converts a binary Word document into a PDF, adding one PDF paragraph per paragraph string
 * returned by the Word extractor.
 *
 * <p>The length and text of every paragraph are also printed to standard output. Any failure
 * is handed to {@code SSServErrReg.regErrThrow}.
 *
 * @param docFilePath path handed to {@code openFileForRead} to obtain the Word document
 * @param pdfFilePath path handed to {@code openOrCreateFileWithPathForWrite} for the PDF output
 * @throws SSErr declared by the signature; presumably raised by {@code regErrThrow} — confirm
 */
public static void writePDFFromDoc(final String docFilePath, final String pdfFilePath) throws SSErr {

    try {
        final Document document = new Document();
        final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath));
        final HWPFDocument word = new HWPFDocument(fs);
        final WordExtractor we = new WordExtractor(word);
        final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath);

        try {
            final PdfWriter writer = PdfWriter.getInstance(document, out);

            document.open();
            writer.setPageEmpty(true);
            document.newPage();
            writer.setPageEmpty(true);

            final String[] paragraphs = we.getParagraphText();

            for (int i = 0; i < paragraphs.length; i++) {
                // Drop line feeds together with any carriage returns directly in front of them.
                paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
                System.out.println("Length:" + paragraphs[i].length());
                System.out.println("Paragraph" + i + ": " + paragraphs[i]);

                // add the paragraph to the document
                document.add(new Paragraph(paragraphs[i]));
            }

            document.close();
        } finally {
            // Release the output file even when the conversion fails part-way. Closing a
            // stream that is already closed has no effect per the Closeable contract.
            out.close();
        }
    } catch (Exception error) {
        SSServErrReg.regErrThrow(error);
    }
}

From source file:b01.officeLink.ExtendedWordDocument.java

License:Apache License

public ExtendedWordDocument(InputStream istream) throws IOException {
    try {//from   w w  w  . ja v  a  2  s .  c o  m
        xwpfDocument = new XWPFDocument(istream);
    } catch (Exception e) {
        xwpfDocument = null;
        Globals.logString("Could not read EXCEL file as xlsx\n" + (e != null ? e.getMessage() : ""));
        try {
            hwpfDocument = new HWPFDocument(istream);
        } catch (Exception e1) {
            hwpfDocument = null;
            Globals.logException(e1);
        }
    }
}

From source file:b01.officeLink.ExtendedWordDocument.java

License:Apache License

/**
 * Wraps the Word document stored in the given POIFS file system as an {@code HWPFDocument}.
 *
 * @param pfilesystem file system passed to the {@code HWPFDocument} constructor
 * @throws IOException if the {@code HWPFDocument} constructor fails to load the document
 */
public ExtendedWordDocument(POIFSFileSystem pfilesystem) throws IOException {
    this.hwpfDocument = new HWPFDocument(pfilesystem);
}

From source file:br.com.schumaker.beta.doc.ReadDocMaster.java

/**
 * Prints the trimmed text of every paragraph in a hard-coded Word document whose
 * field-stripped text is longer than ten characters.
 *
 * <p>Failures are reported on standard error instead of being silently discarded.
 *
 * @param args unused
 */
public static void main(String[] args) {
    File file = new File(
            "/users/hudsonschumaker/downloads/Guisi01206us - Jira Guide for P3 PECB enhancement requests.doc");
    try {
        FileInputStream fis = new FileInputStream(file.getAbsolutePath());
        try {
            HWPFDocument doc = new HWPFDocument(fis);
            WordExtractor extractor = new WordExtractor(doc);

            for (String rawText : extractor.getParagraphText()) {
                String text = extractor.stripFields(rawText);
                if (text.length() > 10) {
                    System.out.println(text.trim());
                }
            }
        } finally {
            // Always release the file handle, whether or not parsing succeeded.
            fis.close();
        }
    } catch (Exception exep) {
        System.err.println("Could not read " + file + ": " + exep);
    }
}

From source file:br.com.schumaker.beta.doc.ReadFile.java

/**
 * Prints the full text of a hard-coded Word document to standard output.
 *
 * <p>Failures are reported on standard error instead of being silently discarded.
 *
 * @param args unused
 */
public static void main(String[] args) {
    File file = new File(
            "/users/hudsonschumaker/downloads/Guisi01206us - Jira Guide for P3 PECB enhancement requests.doc");
    try {
        FileInputStream fis = new FileInputStream(file.getAbsolutePath());
        try {
            HWPFDocument doc = new HWPFDocument(fis);
            String text = doc.getDocumentText();
            System.out.println(text);
        } finally {
            // Always release the file handle, whether or not parsing succeeded.
            fis.close();
        }
    } catch (Exception exep) {
        System.err.println("Could not read " + file + ": " + exep);
    }
}

From source file:com.anphat.customer.controller.ExportContractToDocController.java

/**
 * Loads the Word template named by {@code fileName} from {@code Constants.PATH_TEMPLATE} into
 * {@code documentDoc} and stores its list tables in {@code lstTableDoc}.
 *
 * @throws Exception if the template cannot be opened or parsed; the original failure is
 *         attached as the cause
 */
private void getData() throws Exception {
    try {
        final FileInputStream templateStream = new FileInputStream(Constants.PATH_TEMPLATE + fileName);
        try {
            documentDoc = new HWPFDocument(templateStream);
            lstTableDoc = documentDoc.getListTables();
        } finally {
            // Release the template file handle whether or not parsing succeeded.
            templateStream.close();
        }
    } catch (Exception e) {
        // NOTE(review): the message text looks mis-encoded; it is kept unchanged here.
        throw new Exception("Khng ?c c file biu mu", e);
    }
}