List of usage examples for the org.jsoup.nodes.Element method text()
public String text()
From source file:org.apdplat.superword.extract.SynonymAntonymExtractor.java
/**
 * Extracts the synonyms and antonyms of a word from a page's markup.
 *
 * <p>Each element matched by {@code SYNONYM_ANTONYM_CSS_PATH} is treated as one group. The text
 * selected by {@code TYPE} inside the group decides whether the words selected by {@code WORDS}
 * are added as synonyms or as antonyms. Any exception raised while parsing is logged and whatever
 * has been collected so far is returned.
 *
 * @param html markup of the page to parse
 * @param word the word the result is created for
 * @return the collected result for {@code word}; never {@code null}
 */
public static SynonymAntonym parseSynonymAntonym(String html, String word) {
    SynonymAntonym result = new SynonymAntonym();
    result.setWord(new Word(word, ""));
    try {
        Elements groups = Jsoup.parse(html).select(SYNONYM_ANTONYM_CSS_PATH);
        for (Element group : groups) {
            String type = group.select(TYPE).text().trim();
            LOGGER.debug("type:" + type);
            Elements candidates = group.select(WORDS);
            for (Element candidate : candidates) {
                String w = candidate.text().trim();
                LOGGER.debug("word:" + w);
                if (StringUtils.isBlank(w)) {
                    LOGGER.error("??????" + word);
                    continue;
                }
                // NOTE(review): the two type labels below look like text that lost its original
                // encoding; presumably they were the page's labels for "synonym" and "antonym" -
                // confirm against the upstream source.
                if ("??".equals(type)) {
                    result.addSynonym(new Word(w, ""));
                } else if ("???".equals(type)) {
                    result.addAntonym(new Word(w, ""));
                } else {
                    LOGGER.error("???????" + type);
                }
            }
        }
        LOGGER.info("??????" + result);
    } catch (Exception e) {
        LOGGER.error("??????", e);
    }
    return result;
}
From source file:org.apdplat.superword.tools.Definition.java
/**
 * Extracts the definitions of a word from a dictionary page.
 *
 * <p>{@code Dictionary.OXFORD} and {@code Dictionary.WEBSTER} are delegated to their dedicated
 * parsers. For every other dictionary, {@code cssPath} is split on {@code |} and the alternatives
 * are tried in order; the first alternative that yields at least one definition wins. Definitions
 * are trimmed, and those starting with {@code "?"} are discarded. Any exception is logged and the
 * definitions collected so far are returned.
 *
 * @param html       markup of the page to parse
 * @param cssPath    one or more CSS selectors separated by {@code |}
 * @param word       the word being looked up; only used in the error log message
 * @param dictionary the dictionary the page came from
 * @return the definitions found, possibly empty; never {@code null} on the generic path
 */
public static List<String> parseDefinitionFromHtml(String html, String cssPath, String word,
        Dictionary dictionary) {
    if (dictionary == Dictionary.OXFORD) {
        return parseDefinitionForOxford(html, null);
    }
    if (dictionary == Dictionary.WEBSTER) {
        return parseDefinitionForWebster(html, null);
    }
    List<String> definitions = new ArrayList<>();
    try {
        Document page = Jsoup.parse(html);
        for (String alternative : cssPath.split("\\|")) {
            String selector = alternative.trim();
            if (StringUtils.isBlank(selector)) {
                continue;
            }
            for (Element match : page.select(selector)) {
                String text = match.text();
                if (StringUtils.isBlank(text)) {
                    continue;
                }
                String definition = text.trim();
                if (definition.startsWith("?")) {
                    continue;
                }
                definitions.add(definition);
            }
            // Stop at the first selector alternative that produced anything.
            if (!definitions.isEmpty()) {
                break;
            }
        }
    } catch (Exception e) {
        LOGGER.error("?" + word, e);
    }
    return definitions;
}
From source file:org.apdplat.superword.tools.Definition.java
public static List<String> parseDefinitionForWebster(String html, String cssPath) { List<String> list = new ArrayList<>(); try {/*from ww w . j a v a2s . c o m*/ for (Element element : Jsoup.parse(html) .select("div.tense-box.quick-def-box.simple-def-box.card-box.def-text div.inner-box-wrapper")) { StringBuilder definition = new StringBuilder(); String partOfSpeech = element.select("div.word-attributes span.main-attr em").text().trim(); for (Element defElement : element.select( "div.definition-block.def-text ul.definition-list.no-count li p.definition-inner-item span")) { String def = defElement.text().trim(); if (def.length() < 3) { continue; } if (Character.isAlphabetic(def.charAt(0))) { def = ": " + def; } else { int index = 0; while (!Character.isAlphabetic(def.charAt(++index))) { // } def = ": " + def.substring(index); } definition.append(partOfSpeech).append(" ").append(def); list.add(definition.toString()); definition.setLength(0); } } } catch (Exception e) { LOGGER.error("?", e); } return list; }
From source file:org.apdplat.superword.tools.IPUtils.java
public static List<String> getIPLocation(String ip) { List<String> locations = new ArrayList<>(); try {//w w w. j a v a 2s . c o m Elements elements = Jsoup.parse(new URL("http://ip138.com/ips138.asp?ip=" + ip), 60000).select("ul li"); for (Element element : elements) { String text = element.text(); if (StringUtils.isNotBlank(text)) { String[] attrs = text.split(""); if (attrs != null && attrs.length == 2) { locations.add(attrs[1]); } } } } catch (Exception e) { LOG.error("?IP???", e); } return locations; }
From source file:org.apdplat.superword.tools.PrefixSuffixOptimizer.java
/**
 * Rewrites an element's content so that every purely alphabetic item becomes a link to iciba.com.
 *
 * <p>The element's text is split on commas (periods are treated as commas). Items made only of
 * letters are wrapped in an anchor, additionally wrapped in {@code <strong>} when entirely upper
 * case, and their lower-cased form is added to {@code WORDS}. All other items are kept as plain
 * text. The items are re-joined with {@code ", "} and written back as the element's HTML. The
 * old and new text are printed to standard output.
 *
 * @param element the element whose content is replaced in place
 */
public static void replace(Element element) {
    String oldText = element.text();
    System.out.println("oldText: " + oldText);
    StringBuilder newText = new StringBuilder();
    for (String piece : oldText.trim().replace(".", ",").split(",")) {
        String item = piece.trim();
        if (StringUtils.isAlpha(item)) {
            boolean emphasized = StringUtils.isAllUpperCase(item);
            String open = emphasized
                    ? "<strong><a target=\"_blank\" href=\"http://www.iciba.com/"
                    : "<a target=\"_blank\" href=\"http://www.iciba.com/";
            String close = emphasized ? "</a></strong>" : "</a>";
            newText.append(open).append(item).append("\">").append(item).append(close).append(", ");
            WORDS.add(item.toLowerCase());
        } else {
            // Not a plain word: keep it as-is, without a link.
            newText.append(item).append(", ");
        }
    }
    if (newText.length() <= 2) {
        return;
    }
    // Drop the trailing ", " separator.
    String text = newText.substring(0, newText.length() - 2);
    System.out.println("newText: " + text);
    element.html(text);
}
From source file:org.apdplat.superword.tools.Pronunciation.java
public static List<String> parsePronunciationFromHtml(String html, String cssPath, String word, Dictionary dictionary) {//w w w. ja v a 2 s . c o m List<String> list = new ArrayList<>(); try { for (Element element : Jsoup.parse(html).select(cssPath)) { String pronunciation = element.text(); if (StringUtils.isNotBlank(pronunciation)) { pronunciation = pronunciation.replace("Pronunciation:", ""); pronunciation = pronunciation.trim(); if (!list.contains(pronunciation)) { list.add(pronunciation); } } } } catch (Exception e) { LOGGER.error("?" + word, e); } return list; }
From source file:org.apdplat.superword.tools.ProxyIp.java
private static String getIps(Element element) { StringBuilder ip = new StringBuilder(); Elements all = element.children(); LOGGER.info(""); LOGGER.info("?IP?" + element.text()); AtomicInteger count = new AtomicInteger(); all.forEach(ele -> {// ww w. jav a2s. c o m String html = ele.outerHtml(); LOGGER.info(count.incrementAndGet() + "?" + "HTML" + html.replaceAll("[\n\r]", "")); String text = ele.text(); if (ele.hasAttr("style") && (ele.attr("style").equals("display: none;") || ele.attr("style").equals("display:none;"))) { LOGGER.info("?" + text); } else { if (StringUtils.isNotBlank(text)) { LOGGER.info("?" + text); ip.append(text); } else { LOGGER.info(""); } } }); LOGGER.info("----------------------------------------------------------------"); LOGGER.info("?ip: " + ip); LOGGER.info("----------------------------------------------------------------"); Matcher matcher = IP_PATTERN.matcher(ip.toString()); if (matcher.find()) { String _ip = matcher.group(); LOGGER.info("ip??" + _ip); return _ip; } else { LOGGER.info("ip??" + ip); } return null; }
From source file:org.apdplat.superword.tools.WordClassifier.java
public static void parse(String word, String html, Map<String, List<String>> data) { Document doc = Jsoup.parse(html); Elements es = doc.select(TYPE_CSS_PATH); for (Element e : es) { String type = e.text(); LOGGER.debug("?" + type); if (StringUtils.isNotBlank(type)) { data.putIfAbsent(type, new ArrayList<>()); data.get(type).add(word);//from w w w. j a v a 2 s. c o m } } es = doc.select(UNFOUND_CSS_PATH); for (Element e : es) { String notFound = e.text(); LOGGER.debug("?" + notFound); if (StringUtils.isNotBlank(notFound) && (notFound.contains("?") || notFound.contains("??"))) { NOT_FOUND_WORDS.add(word); } } }
From source file:org.apdplat.superword.tools.WordClassifierForYouDao.java
public static void parse(String word, String html, Map<String, List<String>> data) { Document doc = Jsoup.parse(html); Elements es = doc.select(TYPE_CSS_PATH); for (Element e : es) { String types = e.text(); LOGGER.debug("?" + types); for (String type : types.split("\\s+")) { if (StringUtils.isNotBlank(type)) { data.putIfAbsent(type, new ArrayList<>()); data.get(type).add(word); }//from w w w.j a v a 2 s. com } } es = doc.select(UNFOUND_CSS_PATH); for (Element e : es) { String notFound = e.text(); LOGGER.debug("?" + notFound); if (StringUtils.isNotBlank(notFound) && (notFound.contains("?") || notFound.contains("??"))) { NOT_FOUND_WORDS.add(word); } } }
From source file:org.apdplat.superword.tools.WordsFetcher.java
public static Set<Word> parse(String html) { Set<Word> words = new HashSet<>(); try {//w ww .j a va 2s .c o m for (Element element : Jsoup.parse(html).select(WORD_CSS_PATH)) { String word = element.text().trim(); if (StringUtils.isNotBlank(word) && WordSources.isEnglish(word)) { words.add(new Word(word, "")); LOGGER.debug("???:" + word); } } } catch (Exception e) { LOGGER.error("???", e); } return words; }