Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package com.xue777hua.emails.test; import com.xue777hua.util.Log; import com.xue777hua.util.StringUtils; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringEscapeUtils; /** * * @author morgan */ public class Test { public static void main(String[] args) { String text = "<div id=\"frag_1\" class=\"page_fragment auth_frag\" data-first=\"true\" data-fid=\"1\"><div class=\"module_topic_paths\"></div><h1 class=\"svTitle\" id=\"tm005\">Effect of inulin and pectin on rheological and thermal properties of potato starch paste and gel</h1><ul class=\"authorGroup noCollab\"><li><a href=\"#\" class=\"authorName\" id=\"authname_N41d730a0N3ee493d4\" data-t=\"a\" data-fn=\"Teresa\" data-ln=\"Witczak\" data-pos=\"1\" data-tb=\"\">Teresa Witczak</a><a title=\"Affiliation: a\" href=\"#af005\" class=\"intra_ref auth_aff\" id=\"baf005\"><sup>a</sup></a><sup>, </sup><a title=\"Corresponding author contact information\" href=\"#cor1\" id=\"bcor1\" class=\"intra_ref auth_corr\"><img class=\"imgLazyJSB\" alt=\"Corresponding author contact information\" src=\"/sd/grey_pxl.gif\" data-inlimg=\"/entities/REcor.gif\"><noscript><img alt=\"Corresponding author contact information\" src=\"http://origin-cdn.els-cdn.com/sd/entities/REcor.gif\"></noscript></a><sup>, </sup><a href=\"mailto:t.witczak@ur.krakow.pl\" class=\"auth_mail\"><img class=\"imgLazyJSB\" src=\"/sd/grey_pxl.gif\" alt=\"E-mail the corresponding author\" data-inlimg=\"/entities/REemail.gif\"><noscript><img src=\"http://origin-cdn.els-cdn.com/sd/entities/REemail.gif\" alt=\"E-mail the corresponding author\"></noscript></a>, </li><li><a href=\"#\" class=\"authorName\" id=\"authname_N41d730a0N3ee4953c\" data-t=\"a\" data-fn=\"Mariusz\" data-ln=\"Witczak\" data-pos=\"2\" data-tb=\"\">Mariusz Witczak</a><a title=\"Affiliation: a\" href=\"#af005\" class=\"intra_ref auth_aff\" id=\"baf005\"><sup>a</sup></a>, </li><li><a href=\"#\" class=\"authorName\" id=\"authname_N41d730a0N3ee495f0\" data-t=\"a\" data-fn=\"Rafał\" data-ln=\"Ziobro\" data-pos=\"3\" data-tb=\"\">Rafa Ziobro</a><a title=\"Affiliation: b\" href=\"#af010\" class=\"intra_ref auth_aff\" id=\"baf010\"><sup>b</sup></a></li></ul><!--VALIDHTML--><ul class=\"affiliation\"><li id=\"af005\"><sup>a</sup> <span id=\"\">Department of Engineering and Machinery for Food Industry, University of Agriculture in Krakow, Balicka 122 Str., 30-149 Krakow, Poland</span></li><li id=\"af010\"><sup>b</sup> <span id=\"\">Department of Carbohydrates Technology, University of Agriculture in Krakow, Balicka 122 Str., 30-149 Krakow, Poland</span></li></ul><!--VALIDHTML--><!--VALIDHTML--><dl class=\"articleDates\"><dd>Received 24 May 2013, Revised 1 October 2013, Accepted 1 October 2013, Available online 11 October 2013</dd></dl><!--VALIDHTML--><div class=\"moreInformation\"></div><div id=\"ppvPlaceHolder\" class=\"hidden\"></div><!--VALIDHTML--><div id=\"showMoreButtons\"></div><dl class=\"extLinks\"><dd class=\"doiLink\"></dd><dd class=\"rightsLink\"></dd></dl><div class=\"articleOAlabelForced\"></div><div id=\"refersToAndreferredToBy\"><dl id=\"referredToBy\" class=\"documentThread\"><!--Referred To By--></dl></div><!--FRAGMENTEND--><div class=\"page_fragment_ind auth_frag\" data-id=\"frag_2\"></div></div>"; String authorList = ""; String articleTitle = ""; // ???title Pattern articleTitlePattern = Pattern.compile("<h1.+?svTitle.+?>(.+?)</h1>"); Matcher articleTitleMatcher = articleTitlePattern.matcher(text); while (articleTitleMatcher.find()) { articleTitle = articleTitleMatcher.group(1); articleTitle = StringEscapeUtils.unescapeHtml(articleTitle); articleTitle = StringUtils.stripTags(articleTitle); System.out.println("" + articleTitle); } // ??? Pattern p = Pattern.compile("<ul.+?authorGroup.+?>(.+?)</ul>"); Matcher m = p.matcher(text); while (m.find()) { authorList = m.group(1); } p = Pattern.compile("<li>(.+?)</li>"); m = p.matcher(authorList); while (m.find()) { String authorItem = m.group(1); if (authorItem.contains("mailto")) { Pattern nameEmailPattern = Pattern .compile("data-tb=\"[\\d]{0,}\">(.+?)</a>.*href=\"mailto:(.+?)\" class=\"auth_mail\">"); Matcher nameEmailMatcher = nameEmailPattern.matcher(authorItem); if (nameEmailMatcher.find()) { String name = nameEmailMatcher.group(1); String email = nameEmailMatcher.group(2); name = StringEscapeUtils.unescapeHtml(name); // ???new ArrayList List<String> fieldList = new ArrayList<String>(); fieldList.add(name); fieldList.add(email); fieldList.add(articleTitle); int hashKey = (name + email + articleTitle).hashCode(); System.out.println("?????TitlehashKey:" + name + "|" + email + "|" + articleTitle + "|" + hashKey + ", nameEmailsList"); } } } } }