Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package opennlp.tools.apps.review_builder; import java.util.Arrays; import java.util.List; import opennlp.tools.similarity.apps.utils.Utils; import org.apache.commons.lang.StringUtils; public class MinedSentenceProcessor { public static String acceptableMinedSentence(String sent) { // if too many commas => seo text String[] commas = StringUtils.split(sent, ','); String[] spaces = StringUtils.split(sent, ' '); if ((float) commas.length / (float) spaces.length > 0.7) { System.out.println("Rejection: too many commas"); return null; } String[] otherDelimiters = StringUtils.split(sent, '/'); if ((float) otherDelimiters.length / (float) spaces.length > 0.7) { System.out.println("Rejection: too many delimiters"); return null; } otherDelimiters = StringUtils.split(sent, '.'); if ((float) otherDelimiters.length / (float) spaces.length > 0.7) { System.out.println("Rejection: too many delimiters"); return null; } otherDelimiters = StringUtils.split(sent, '!'); if ((float) otherDelimiters.length / (float) spaces.length > 0.7) { System.out.println("Rejection: too many delimiters"); return null; } otherDelimiters = StringUtils.split(sent, '='); if ((float) otherDelimiters.length / (float) spaces.length > 0.7) { System.out.println("Rejection: too many delimiters"); return null; } String[] pipes = StringUtils.split(sent, '|'); if (StringUtils.split(sent, '|').length > 2 || StringUtils.split(sent, '>').length > 2) { System.out.println("Rejection: too many |s or >s "); return null; } String sentTry = sent.toLowerCase(); // if too many long spaces String sentSpaces = sentTry.replace(" ", ""); if (sentSpaces.length() - sentTry.length() > 10) // too many spaces - // suspicious return null; if (sentTry.indexOf("click here") > -1 || sentTry.indexOf(" wikip") > -1 || sentTry.indexOf("copyright") > -1 || sentTry.indexOf("operating hours") > -1 || sentTry.indexOf("days per week") > -1 || sentTry.indexOf("click for") > -1 || sentTry.indexOf("photos") > -1 || sentTry.indexOf("find the latest") > -1 || sentTry.startsWith("subscribe") || sentTry.indexOf("Terms of Service") > -1 || sentTry.indexOf("clicking here") > -1 || sentTry.indexOf("skip to") > -1 || sentTry.indexOf("sidebar") > -1 || sentTry.indexOf("Tags:") > -1 || sentTry.startsWith("Posted by") || sentTry.indexOf("available online") > -1 || sentTry.indexOf("get online") > -1 || sentTry.indexOf("buy online") > -1 || sentTry.indexOf("not valid") > -1 || sentTry.indexOf("discount") > -1 || sentTry.indexOf("official site") > -1 || sentTry.indexOf("this video") > -1 || sentTry.indexOf("this book") > -1 || sentTry.indexOf("this product") > -1 || sentTry.indexOf("paperback") > -1 || sentTry.indexOf("hardcover") > -1 || sentTry.indexOf("audio cd") > -1 || sentTry.indexOf("related searches") > -1 || sentTry.indexOf("permission is granted") > -1 || sentTry.indexOf("[edit") > -1 || sentTry.indexOf("edit categories") > -1 || sentTry.indexOf("free license") > -1 || sentTry.indexOf("permission is granted") > -1 || sentTry.indexOf("under the terms") > -1 || sentTry.indexOf("rights reserved") > -1 || sentTry.indexOf("wikipedia") > -1 || sentTry.endsWith("the") || sentTry.endsWith("the.") || sentTry.startsWith("below") || sentTry.indexOf("recipient of") > -1 || sentTry.indexOf("this message") > -1 || sentTry.indexOf("mailing list") > -1 || sentTry.indexOf("purchase order") > -1 || sentTry.indexOf("mon-fri") > -1 || sentTry.indexOf("email us") > -1 || sentTry.indexOf("privacy pol") > -1 || sentTry.indexOf("back to top") > -1 || sentTry.indexOf("click here") > -1 || sentTry.indexOf("for details") > -1 || sentTry.indexOf("assistance?") > -1 || sentTry.indexOf("chat live") > -1 || sentTry.indexOf("free shipping") > -1 || sentTry.indexOf("company info") > -1 || sentTry.indexOf("satisfaction g") > -1 || sentTry.indexOf("contact us") > -1 || sentTry.startsWith("fax") || sentTry.startsWith("write") || sentTry.startsWith("email") || sentTry.indexOf("conditions") > -1 || sentTry.indexOf("chat live") > -1 || sentTry.startsWith("we ") || sentTry.indexOf("the recipient") > -1 || sentTry.indexOf("day return") > -1 || sentTry.indexOf("days return") > -1 || sentTry.startsWith("fax") || sentTry.indexOf("refund it") > -1 || sentTry.indexOf("your money") > -1 || sentTry.startsWith("free") || sentTry.indexOf("purchase orders") > -1 || sentTry.startsWith("exchange it ") || sentTry.indexOf("return it") > -1 || sentTry.indexOf("credit card") > -1 || sentTry.indexOf("storeshop") > -1 || sentTry.startsWith("find") || sentTry.startsWith("shop") || sentTry.startsWith("unlimited") || sentTry.indexOf("for a limited time") > -1 || sentTry.indexOf("prime members") > -1 || sentTry.indexOf("amazon members") > -1 || sentTry.indexOf("unlimited free") > -1 || sentTry.indexOf("shipping") > -1 || sentTry.startsWith("amazon") // not a script text || sentTry.indexOf("document.body") > -1 || sentTry.indexOf(" var ") > -1 || sentTry.indexOf("search suggestions") > -1 || sentTry.startsWith("Search") ) return null; //Millions of Amazon Prime members enjoy instant videos, free Kindle books and unlimited free two-day shipping. // count symbols indicating wrong parts of page to mine for text // if short and contains too many symbols indicating wrong area: reject String sentWrongSym = sentTry.replace(">", "&&&").replace("", "&&&").replace("|", "&&&") .replace(":", "&&&").replace("/", "&&&").replace("-", "&&&").replace("%", "&&&"); if ((sentWrongSym.length() - sentTry.length()) >= 4 && sentTry.length() < 200) // twice ot more return null; sent = sent.replace('[', ' ').replace(']', ' ').replace("_should_find_orig_", "").replace(". .", ". ") .replace("amp;", " ").replace("1.", " ").replace("2.", " ").replace("3.", " ").replace("4.", " ") .replace("2009", "2011").replace("2008", "2011").replace("2006", "2011").replace("2007", "2011") .replace("VIDEO:", " ").replace("Video:", " ").replace("no comments", " ").replace(" ", " ") .replace(" ", " ").replace("(more.)", "").replace("more.", "").replace("<more>", "") .replace("[more]", "").replace(".,", ".").replace("<", "").replace("p>", "") .replace("product description", ""); // TODO .replace("a.", "."); int endIndex = sent.indexOf(" posted"); if (endIndex > 0) sent = sent.substring(0, endIndex); return sent; } public static String processSentence(String pageSentence) { if (pageSentence == null) return ""; pageSentence = Utils.fullStripHTML(pageSentence); pageSentence = StringUtils.chomp(pageSentence, ".."); pageSentence = StringUtils.chomp(pageSentence, ". ."); pageSentence = StringUtils.chomp(pageSentence, " ."); pageSentence = StringUtils.chomp(pageSentence, "."); pageSentence = StringUtils.chomp(pageSentence, "..."); pageSentence = StringUtils.chomp(pageSentence, " ...."); pageSentence = pageSentence.replace("::", ":").replace(".,", ". ").replace("(.)", ""); pageSentence = pageSentence.trim(); pageSentence = pageSentence.replaceAll("\\s+", " "); // make single // spaces // everywhere String[] pipes = StringUtils.split(pageSentence, '|'); // removed // shorter part // of sentence // at the end // after pipe if (pipes.length == 2 && ((float) pipes[0].length() / (float) pipes[1].length() > 3.0)) { int pipePos = pageSentence.indexOf("|"); if (pipePos > -1) pageSentence = pageSentence.substring(0, pipePos - 1).trim(); } if (!StringUtils.contains(pageSentence, '.') && !StringUtils.contains(pageSentence, '?') && !StringUtils.contains(pageSentence, '!')) pageSentence = pageSentence + ". "; pageSentence = pageSentence.replace(" .", ".").replace("..", ".").trim(); if (!pageSentence.endsWith(".")) pageSentence += ". "; return pageSentence; } public static String normalizeForSentenceSplitting(String pageContent) { pageContent.replace("Jan.", "January").replace("Feb.", "February").replace("Mar.", "March") .replace("Apr.", "April").replace("Jun.", "June").replace("Jul.", "July").replace("Aug.", "August") .replace("Sep.", "September").replace("Oct.", "October").replace("Nov.", "November") .replace("Dec.", "December"); return pageContent; } }