opennlp.tools.parse_thicket.apps.SnippetToParagraph.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.tools.parse_thicket.apps.SnippetToParagraph.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.apps;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.logging.Logger;

import org.apache.commons.lang.StringUtils;

import opennlp.tools.similarity.apps.ContentGeneratorSupport;
import opennlp.tools.similarity.apps.Fragment;
import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.RelatedSentenceFinder;
import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.TextProcessor;

/**
 * Turns a search-engine snippet (title + abstract with "..." gaps) back into
 * readable paragraph text by re-locating the snippet fragments inside the
 * original web page and recovering the full sentences they were cut from.
 * <p>
 * All public entry points mutate and return the {@link HitBase} they are
 * given; network failures are handled best-effort (the item is returned with
 * whatever was recovered so far).
 */
public class SnippetToParagraph extends ContentGeneratorSupport /*RelatedSentenceFinder */ {
    private final PageFetcher pFetcher = new PageFetcher();
    // FIX: the logger was previously named "com.become.parse_thicket.apps..."
    // (copy-paste from an unrelated project), so its records never matched
    // this package's logging configuration. Name it after this class.
    private static final Logger LOG = Logger.getLogger(SnippetToParagraph.class.getName());

    /**
     * Recovers full original sentences for each snippet fragment directly
     * from the hit's web page.
     * <p>
     * The snippet's "..." gaps are replaced with a {@code _should_find_orig_}
     * marker; if at least one marker is present, the page is downloaded and
     * split into sentences so each marked fragment can be matched against the
     * full page text. Accepted sentences are attached as {@link Fragment}s.
     *
     * @param item search hit whose title, abstract and URL are read; mutated
     *             in place (original sentences, page content, fragments)
     * @return the same {@code item}; returned early (without fragments) if
     *         the page download or sentence splitting fails
     */
    public HitBase formTextFromOriginalPageGivenSnippetDirect(HitBase item) {

        // put orig sentence in structure
        List<String> origs = new ArrayList<String>();

        item.setOriginalSentences(origs);
        String title = item.getTitle().replace("<b>", " ").replace("</b>", " ").replace("  ", " ").replace("  ",
                " ");
        // generation results for this sentence
        List<Fragment> result = new ArrayList<Fragment>();
        // form plain text from snippet
        String snapshot = item.getAbstractText().replace("<b>", " ").replace("</b>", " ").replace("  ", " ")
                .replace("  ", " ");

        // mark the "..." gaps so we know which fragments must be re-located
        // in the full page text
        String snapshotMarked = snapshot.replace("...", " _should_find_orig_ . _should_find_orig_");
        List<String> fragments = TextProcessor.splitToSentences(snapshotMarked);
        List<String> allFragms = new ArrayList<String>(fragments);

        List<String> sents = new ArrayList<String>();
        String downloadedPage;
        try {
            // only download the page when the snippet actually contained "..."
            // (i.e. the marker substitution changed its length)
            if (snapshotMarked.length() != snapshot.length()) {
                downloadedPage = pFetcher.fetchPage(item.getUrl());
                if (downloadedPage != null && downloadedPage.length() > 100) {
                    item.setPageContent(downloadedPage);
                    String pageContent = Utils.fullStripHTML(item.getPageContent());
                    pageContent = GeneratedSentenceProcessor.normalizeForSentenceSplitting(pageContent);
                    // html breaks are sometimes converted into '  ' (two
                    // spaces), so re-insert a '.' before a capital letter that
                    // follows them, then collapse the artifacts this creates
                    pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")
                            .replace("..", ".").replace(". . .", " ").trim();
                    sents = TextProcessor.splitToSentences(pageContent);
                }
            }
        } catch (Exception e) {
            // best effort: keep whatever we have and return the item as-is
            System.err.println("Problem downloading  the page and splitting into sentences");
            return item;
        }

        for (String fragment : allFragms) {
            String followSent = null;
            if (fragment.length() < 50)
                continue;
            String pageSentence = "";
            // try to find original sentence from webpage
            // (the `sents != null` check of the original was dead code: the
            // list is always initialized above)
            if (fragment.indexOf("_should_find_orig_") > -1 && !sents.isEmpty()) {
                try {
                    String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
                            fragment.replace("_should_find_orig_", ""), sents.toArray(new String[0]));
                    pageSentence = mainAndFollowSent[0];
                    followSent = mainAndFollowSent[1];
                } catch (Exception e) {
                    e.printStackTrace();
                }
            } else {
                // or get original snippet
                pageSentence = fragment;
            }
            if (pageSentence != null)
                pageSentence = pageSentence.replace("_should_find_orig_", "");
            String pageSentenceProc = GeneratedSentenceProcessor.acceptableMinedSentence(pageSentence);
            if (pageSentenceProc != null) {
                pageSentenceProc = GeneratedSentenceProcessor.processSentence(pageSentenceProc);
                if (followSent != null) {
                    pageSentenceProc += " " + GeneratedSentenceProcessor.processSentence(followSent);
                }

                pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
                Fragment f = new Fragment(pageSentenceProc, 1);
                f.setSourceURL(item.getUrl());
                f.fragment = fragment;
                result.add(f);
                System.out.println("Accepted sentence: " + pageSentenceProc + "| with title= " + title);
                System.out.println("For fragment = " + fragment);
            } else
                System.out.println("Rejected sentence due to wrong area at webpage: " + pageSentence);
        }

        item.setFragments(result);
        return item;
    }

    /**
     * Replaces each snippet fragment of the hit with the full original
     * sentence found on the hit's page, falling back to the fragment itself
     * when no page sentence matches.
     *
     * @param item search hit to enrich; its original-sentences list is set
     * @return the same {@code item}
     */
    public HitBase formTextFromOriginalPageGivenSnippet(HitBase item) {

        String[] sents = extractSentencesFromPage(item.getUrl());
        // FIX: extractSentencesFromPage returns null when the download fails;
        // previously the null propagated into the matcher and every fragment
        // died on a (silently caught) NPE. Use an empty array so fragments
        // still fall through to the snippet-text fallback below.
        if (sents == null)
            sents = new String[0];

        String title = item.getTitle().replace("<b>", " ").replace("</b>", " ").replace("  ", " ").replace("  ",
                " ");
        // generation results for this sentence
        List<String> result = new ArrayList<String>();
        // form plain text from snippet
        String snapshot = item.getAbstractText().replace("<b>", " ").replace("</b>", " ").replace("  ", " ")
                .replace("  ", " ").replace("\"", "");

        String snapshotMarked = snapshot.replace(" ...", ".");
        List<String> fragments = TextProcessor.splitToSentences(snapshotMarked);
        // if the sentence splitter failed to find fragments but dots exist,
        // fall back to a crude split on '.'
        if (fragments.size() < 3 && StringUtils.countMatches(snapshotMarked, ".") > 1) {
            snapshotMarked = snapshotMarked.replace("..", "&").replace(".", "&");
            String[] fragmSents = snapshotMarked.split("&");
            fragments = Arrays.asList(fragmSents);
        }

        for (String f : fragments) {
            if (f.length() < 50)
                continue;
            // try to find original sentence from webpage
            try {
                String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(f, sents);
                String pageSentence = mainAndFollowSent[0];
                if (pageSentence != null)
                    result.add(pageSentence);
                else {
                    result.add(f);
                    LOG.info("Could not find the original sentence \n" + f + "\n in the page ");
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        item.setOriginalSentences(result);
        return item;
    }

    /**
     * Filters out null, blank-ish and too-short sentences.
     * <p>
     * FIX: the original also tested {@code s.length() < 20}, which is
     * implied by {@code s.trim().length() < 30} and was therefore dead.
     *
     * @param sents candidate sentences (elements may be null)
     * @return new list containing only sentences with at least 30
     *         non-surrounding-whitespace characters
     */
    public List<String> cleanListOfSents(List<String> sents) {
        List<String> sentsClean = new ArrayList<String>();
        for (String s : sents) {
            if (s == null || s.trim().length() < 30)
                continue;
            sentsClean.add(s);
        }
        return sentsClean;
    }

    /**
     * Removes near-duplicate strings, keeping the earlier occurrence.
     * A pair is considered duplicated when the string-distance similarity
     * exceeds 0.7. Best effort: on any error the partial result is returned.
     *
     * @param hits strings to dedupe (elements may be null/empty; those are
     *             never treated as duplicates of anything)
     * @return deduplicated array, order preserved
     */
    private String[] removeDuplicates(String[] hits) {
        StringDistanceMeasurer meas = new StringDistanceMeasurer();

        List<Integer> idsToRemove = new ArrayList<Integer>();
        List<String> hitsDedup = new ArrayList<String>();
        try {
            for (int i = 0; i < hits.length; i++)
                for (int j = i + 1; j < hits.length; j++) {
                    String title1 = hits[i];
                    String title2 = hits[j];
                    if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
                        continue;
                    if (meas.measureStringDistance(title1, title2) > 0.7) {
                        // dupes found, later list member to be deleted
                        idsToRemove.add(j);
                    }
                }
            for (int i = 0; i < hits.length; i++)
                if (!idsToRemove.contains(i))
                    hitsDedup.add(hits[i]);
            if (hitsDedup.size() < hits.length) {
                System.out.println(
                        "Removed duplicates from relevant search results, including " + hits[idsToRemove.get(0)]);
            }
        } catch (Exception e) {
            System.out.println("Problem removing duplicates from relevant images");
        }

        return hitsDedup.toArray(new String[0]);

    }

    /**
     * Downloads a page and extracts up to 100 of its longest text chunks as
     * candidate sentences.
     *
     * @param url page to fetch
     * @return cleaned sentences, or {@code null} when the page could not be
     *         downloaded (callers must handle null)
     */
    public String[] extractSentencesFromPage(String url) {

        int maxSentsFromPage = 100;

        String downloadedPage = pFetcher.fetchPage(url, 20000);
        if (downloadedPage == null || downloadedPage.length() < 100) {
            return null;
        }
        // FIX: the original additionally fetched the raw HTML of the same URL
        // into a local that was never read — a second full network round-trip
        // per page for nothing. Removed, along with an unused results list.

        // runs of 5+ spaces usually separate layout blocks; turn them into a
        // single '#' separator and split on it
        downloadedPage = downloadedPage.replace("     ", "&");
        downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
        String[] sents = downloadedPage.split("#");
        List<TextChunk> sentsList = new ArrayList<TextChunk>();
        for (String s : sents) {
            s = s.trim().replace("  ", ". ").replace("..", ".").replace(": ", ". ")
                    .replace("- ", ". ").replace(". .", ".").trim();
            sentsList.add(new TextChunk(s, s.length()));
        }

        // keep only the longest chunks: sort ascending by length and take the
        // tail of the list (trailing slots stay null and are filtered below)
        Collections.sort(sentsList, new TextChunkComparable());
        String[] longestSents = new String[maxSentsFromPage];
        int j = 0;
        int initIndex = sentsList.size() - 1 - maxSentsFromPage;
        if (initIndex < 0)
            initIndex = 0;
        for (int i = initIndex; i < sentsList.size() && j < maxSentsFromPage; i++) {
            longestSents[j] = sentsList.get(i).text;
            j++;
        }

        sents = cleanSplitListOfSents(longestSents);

        //sents = removeDuplicates(sents);
        //sents = verifyEnforceStartsUpperCase(sents);

        return sents;
    }

    /**
     * Cleans raw text chunks and splits them into individual sentences.
     * Chunks are rejected when too short, when rejected by
     * {@link GeneratedSentenceProcessor#acceptableMinedSentence(String)}, or
     * when their average sentence/word length suggests boilerplate rather
     * than prose.
     *
     * @param longestSents raw chunks (elements may be null)
     * @return accepted sentences, ASCII-normalized
     */
    protected String[] cleanSplitListOfSents(String[] longestSents) {
        float minFragmentLength = 40, minFragmentLengthSpace = 4;

        List<String> sentsClean = new ArrayList<String>();
        for (String sentenceOrMultSent : longestSents) {
            if (sentenceOrMultSent == null || sentenceOrMultSent.length() < 20)
                continue;
            if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent) == null) {
                System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "
                        + sentenceOrMultSent);
                continue;
            }
            // reject chunks whose average sentence length is tiny,
            // e.g. "aaa. hhh hhh.  kkk . kkk ll hhh. lll kkk n."
            int numOfDots = sentenceOrMultSent.replace('.', '&').split("&").length;
            float avgSentenceLengthInTextPortion = (float) sentenceOrMultSent.length() / (float) numOfDots;
            if (avgSentenceLengthInTextPortion < minFragmentLength)
                continue;
            // reject chunks whose average word length is tiny,
            // e.g. "o oo o ooo o o o ooo oo ooo o o oo"
            numOfDots = sentenceOrMultSent.replace(' ', '&').split("&").length;
            avgSentenceLengthInTextPortion = (float) sentenceOrMultSent.length() / (float) numOfDots;
            if (avgSentenceLengthInTextPortion < minFragmentLengthSpace)
                continue;

            List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);

            // forced split by ',' somewhere in the middle of sentence
            // disused - Feb 26 13
            //furtherSplit = furtherMakeSentencesShorter(furtherSplit);

            // drop the trailing (likely truncated) sentence of each chunk;
            // FIX: guard against an empty split result before remove()
            if (!furtherSplit.isEmpty())
                furtherSplit.remove(furtherSplit.size() - 1);
            for (String s : furtherSplit) {
                if (s.indexOf('|') > -1)
                    continue;
                s = s.replace("<em>", " ").replace("</em>", " ");
                s = Utils.convertToASCII(s);
                sentsClean.add(s);
            }
        }

        return sentsClean.toArray(new String[0]);
    }

    /**
     * Trims each sentence and upper-cases its first character in place.
     * <p>
     * FIX: the original called {@code substring(0, 1)} unconditionally and
     * threw {@link StringIndexOutOfBoundsException} on empty or
     * all-whitespace elements; those are now passed through unchanged.
     *
     * @param sents sentences, modified in place (elements may be null)
     * @return the same array
     */
    private String[] verifyEnforceStartsUpperCase(String[] sents) {
        for (int i = 0; i < sents.length; i++) {
            String s = StringUtils.trim(sents[i]);
            if (s == null || s.isEmpty()) {
                sents[i] = s;
                continue;
            }
            String sFirstChar = s.substring(0, 1);
            if (!sFirstChar.toUpperCase().equals(sFirstChar)) {
                s = sFirstChar.toUpperCase() + s.substring(1);
            }
            sents[i] = s;
        }
        return sents;
    }

    /**
     * Drops boilerplate shop-page features ("Unlimited Free...",
     * "View Larger...", anything mentioning shipping).
     * <p>
     * FIX: the original tested {@code startsWith("View Larger")} twice;
     * the duplicate condition was removed.
     *
     * @param productFeaturesList raw feature strings
     * @return new list with boilerplate removed
     */
    private List<String> cleanProductFeatures(List<String> productFeaturesList) {
        List<String> results = new ArrayList<String>();
        for (String feature : productFeaturesList) {
            if (feature.startsWith("Unlimited Free") || feature.startsWith("View Larger")
                    || feature.indexOf("shipping") > 0)
                continue;
            results.add(feature);
        }
        return results;
    }

    /** A text chunk paired with its length, for length-based sorting. */
    public class TextChunk {
        public TextChunk(String s, int length) {
            this.text = s;
            this.len = length;
        }

        public String text;
        public int len;
    }

    /** Orders {@link TextChunk}s ascending by length. */
    public class TextChunkComparable implements Comparator<TextChunk> {
        public int compare(TextChunk ch1, TextChunk ch2) {
            // Integer.compare replaces the manual three-way if/else chain
            return Integer.compare(ch1.len, ch2.len);
        }
    }

    public static void main(String[] args) {
        // intentionally empty: this class is used as a library component
    }

}