// Java tutorial
/*
 * ====================================================================
 *
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
* */
package com.da.daum;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import com.da.img.FileHelper;

/**
 * Walks the pages of a one-line ("memo") board, fetches the comments of every
 * listed post and writes each post plus its comments to a text file. All posts
 * are additionally appended to a single {@code _oneLine.txt} summary file.
 *
 * <p>NOTE(review): several URL literals ({@code SO_URL}, {@code host_url}, the
 * login URL) are empty strings in this copy of the file; they look like they
 * were stripped and must be filled in before the class can talk to a server.
 *
 * @author changwng
 */
public class DaumCafeOneLineList {

    private static String SO_URL = "";
    private static String host_url = "";
    private static String photo_url = "http://photo." + SO_URL + "/album/theme/";
    private static String story_url = "http://story." + SO_URL + "/honor/";
    private static String CAFE_NM = "CHILIL";
    private static String SAVE_DIR = "c:/temp/daum";
    private static String STORY_DIR = "c:/temp";
    public static String BOARD_TYPE = "_memo";

    /** Upper bound (exclusive) on the board page number that is visited. */
    private static final int MAX_PAGE = 100000;
    /** Upper bound (exclusive) on the comment page number fetched per post. */
    private static final int MAX_COMMENT_PAGES = 50;

    // Raw maps on purpose: they are handed to DaumCafeOneLineParser, whose
    // parameter types are not visible from this file.
    private Map pageMap = new HashMap();
    private Map commentPageMap = new HashMap();
    private DaumCafeOneLineParser parser = null;

    static Pattern pattern = Pattern.compile("<a[^>]*href=[\"']?([^>\"']+)[\"']?[^>]*>");
    static Pattern pattern_img = Pattern.compile("<img[^>]*src=[\"']?([^>\"']+)[\"']?[^>]*>");
    static Pattern pattern_author = Pattern.compile("<span(.*?)style=\"cursor:hand\">(.*?)<\\/span>",
            Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);

    // Was LogFactory.getLog(Class.class), which logged under java.lang.Class.
    static Log log = LogFactory.getLog(DaumCafeOneLineList.class);

    /**
     * Command-line entry point.
     *
     * @param args optional positional arguments: {@code [0]} start page
     *             (default {@code "1"}), {@code [1]} author id, {@code [2]}
     *             value for {@code STORY_DIR}, {@code [3]} value for
     *             {@code SO_URL}
     */
    public static void main(String[] args) {
        DaumCafeOneLineList cfl = new DaumCafeOneLineList();
        log.warn("Logging Works");
        System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.SimpleLog");
        System.setProperty("org.apache.commons.logging.simplelog.showdatetime", "true");
        System.setProperty("org.apache.commons.logging.simplelog.log.httpclient.wire", "debug");
        // The key here was an empty string, which makes System.setProperty throw
        // IllegalArgumentException. NOTE(review): the original key is unknown;
        // the HttpClient 4.x log category is used as the most plausible value.
        System.setProperty("org.apache.commons.logging.simplelog.log.org.apache.http", "debug");
        try {
            String nPage = "1";
            String p_author_id = "";
            String p_gnum = "";
            if (args.length > 0) {
                nPage = args[0];
            }
            if (args.length > 1) {
                p_author_id = args[1];
            }
            if (args.length > 2) {
                STORY_DIR = args[2];
            }
            if (args.length > 3) {
                // Only SO_URL is updated; host_url, photo_url and story_url keep
                // the values computed when the class was loaded.
                SO_URL = args[3];
            }
            cfl.executeURL(nPage, p_author_id, p_gnum);
        } catch (IOException e) {
            // Also covers ClientProtocolException, which is an IOException.
            log.error("Board download failed", e);
        } catch (URISyntaxException e) {
            log.error("Board download failed: malformed URL", e);
        }
    }

    /**
     * Logs in, then iterates board pages starting at {@code p_page} until a page
     * yields no posts, saving every post with its comments.
     *
     * <p>Failures while downloading are logged and end the run; only problems
     * opening or closing the summary file propagate to the caller.
     *
     * @param p_page      first page to fetch; blank means page 1
     * @param p_author_id not used by this method
     * @param p_gnum      not used by this method
     * @throws IOException if the summary file cannot be opened or closed
     */
    protected void executeURL(String p_page, String p_author_id, String p_gnum)
            throws IOException, ClientProtocolException, URISyntaxException {
        String boardDir = SAVE_DIR + "/" + CAFE_NM + "/" + BOARD_TYPE + "/";
        String output = boardDir + "_oneLine.txt";
        FileHelper.createFile(output);
        // Written as utf-8 so the summary matches the per-post files below
        // (FileWriter would have used the platform default charset).
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "utf-8"));
        HttpClient httpclient = null;
        try {
            httpclient = getPoolHttpClient();
            executeLogin(httpclient);
            this.parser = new DaumCafeOneLineParser();

            // The argument used to be overwritten with "1" before this point,
            // so the requested start page was never honoured.
            int init_page = 1;
            String startPage = StringUtils.stripToEmpty(p_page);
            if (!"".equals(startPage)) {
                init_page = Integer.parseInt(startPage);
            }

            for (int i = init_page; i < MAX_PAGE; i++) {
                List<DaumListVo> lst = getBoardList(httpclient, String.valueOf(i));
                if (lst == null || lst.isEmpty()) {
                    break;
                }
                for (DaumListVo vo : lst) {
                    String comment = getSaveComment(httpclient, vo);
                    String file = boardDir + sanitizeFileName(vo.getIdAlais()) + "_"
                            + StringUtils.defaultString(vo.getCreatYmd()).replace(':', '_') + "_"
                            + vo.getRnum() + ".txt";
                    System.out.println("save file:" + file);
                    String bodyAndComment = vo.getIdAlais() + "|" + vo.getCreatYmd() + "\r\n"
                            + vo.getSubject() + "\r\n\r\n" + comment;
                    FileHelper.createFile(file);
                    FileUtils.writeStringToFile(new File(file), bodyAndComment, "utf-8");
                    bw.write(bodyAndComment);
                }
            }
        } catch (Exception ex) {
            log.error("ERROR: " + ex.getLocalizedMessage(), ex);
        } finally {
            try {
                bw.close();
            } finally {
                // httpclient is still null when getPoolHttpClient() itself failed.
                if (httpclient != null) {
                    httpclient.getConnectionManager().shutdown();
                }
            }
        }
    }

    /**
     * Makes a string safe to use as part of a file name: {@code *} is removed
     * and the characters {@code / \ " ? : < > |} become {@code _}.
     *
     * @param name raw text, may be {@code null}
     * @return the cleaned text, or an empty string for {@code null}
     */
    private static String sanitizeFileName(String name) {
        if (name == null) {
            return "";
        }
        return name.replace("*", "").replaceAll("[/\\\\\"?:<>|]", "_");
    }

    /**
     * Downloads the comment pages of one post and formats them as indented text.
     *
     * @param httpclient client that carries the login session
     * @param vo         post whose comments are fetched (its {@code rnum} is used
     *                   to build the first comment URL)
     * @return one block per comment that has a non-null author alias; empty if
     *         the post has no comments
     */
    private String getSaveComment(HttpClient httpclient, DaumListVo vo)
            throws IOException, ClientProtocolException {
        StringBuilder sb = new StringBuilder();
        commentPageMap = new HashMap();
        for (int x = 1; x < MAX_COMMENT_PAGES; x++) {
            String strUrl;
            if (x == 1) {
                strUrl = host_url + "/" + CAFE_NM + "/" + BOARD_TYPE + "/" + vo.getRnum() + "/comments?page=1";
            } else {
                // The parser fills commentPageMap; a missing entry means there is
                // no further page. Concatenating it blindly produced "...null".
                Object next = commentPageMap.get(String.valueOf(x));
                if (next == null || "".equals(next.toString())) {
                    break;
                }
                strUrl = host_url + next;
            }
            System.out.println("strUrl:" + strUrl);
            if (strUrl.equals("")) {
                break;
            }
            String responseBody = execGetUrl(httpclient, strUrl);
            List<DaumListVo> lstComment =
                    parser.setDaumListVoCommentList(responseBody, vo.getRnum(), commentPageMap);
            if (lstComment == null || lstComment.isEmpty()) {
                break;
            }
            for (DaumListVo commentvo : lstComment) {
                if (null != commentvo.getIdAlais()) {
                    sb.append("\t").append(commentvo.getIdAlais()).append("|").append(commentvo.getCreatYmd())
                            .append("\r\n\t ").append(commentvo.getSubject()).append("\r\n");
                }
            }
        }
        return sb.toString();
    }

    /**
     * Downloads every {@code <img src=...>} target found in {@code responseBody}
     * to {@code SAVE_DIR/BOARD_TYPE/<rnum>_<n>.jpg}.
     *
     * @param httpclient   client used for the downloads
     * @param responseBody HTML to scan for image tags
     * @param vo           post the images belong to
     */
    private void saveImageFile(HttpClient httpclient, String responseBody, DaumListVo vo)
            throws URISyntaxException, IOException, ClientProtocolException {
        Matcher match = pattern_img.matcher(responseBody);
        int idx = 0;
        while (match.find()) {
            // Group 1 is the src attribute value (the pattern's only group).
            String imgUrl = match.group(1);
            if ("".equals(imgUrl)) {
                continue;
            }
            idx++;
            InputStream istream = getDownloadUrlInputStream(httpclient, imgUrl);
            try {
                String output = SAVE_DIR + "/" + BOARD_TYPE + "/" + vo.getRnum() + "_" + String.valueOf(idx) + ".jpg";
                fileDownCopy(output, istream);
            } finally {
                if (istream != null) {
                    istream.close();
                }
            }
        }
    }

    /**
     * Extracts the view body through the parser and strips HTML tags from it.
     * Despite the name nothing is written to disk; {@code file} is unused.
     *
     * @param responseBody raw HTML of the view page
     * @param file         unused
     * @return the tag-free text with CRLF line endings
     */
    private String saveViewFile(String responseBody, String file) throws IOException {
        String viewBody = parser.setDaumView(responseBody);
        viewBody = viewBody.replaceAll("<(/)?([a-zA-Z]*)(\\s[a-zA-Z]*=[^>]*)?(\\s)*(/)?>", "");
        // NOTE(review): this removes every space; the literal may originally
        // have been an HTML non-breaking-space entity. Confirm before relying on it.
        viewBody = viewBody.replaceAll(" ", "");
        // "\r?\n" so that an existing CRLF is not turned into "\r\r\n".
        viewBody = viewBody.replaceAll("\r?\n", "\r\n");
        return viewBody;
    }

    /**
     * Builds an HTTP client backed by a pooling connection manager with http and
     * https schemes registered.
     *
     * @return a new client; the caller shuts down its connection manager
     */
    private HttpClient getPoolHttpClient() {
        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
        schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));

        PoolingClientConnectionManager cm = new PoolingClientConnectionManager(schemeRegistry);
        cm.setMaxTotal(200);
        cm.setDefaultMaxPerRoute(20);
        // Host name was misspelled "locahost", so this route never matched.
        HttpHost localhost = new HttpHost("localhost", 80);
        cm.setMaxPerRoute(new HttpRoute(localhost), 50);

        return new DefaultHttpClient(cm);
    }

    /**
     * Copies {@code istream} into a new file at {@code output} unless that file
     * already exists. Best effort: failures are logged, never thrown.
     *
     * @param output  destination path
     * @param istream content to store; {@code null} is logged and skipped
     */
    private void fileDownCopy(String output, InputStream istream) {
        if (new File(output).exists()) {
            System.out.println("Image File Exist :" + output);
            return;
        }
        if (istream == null) {
            // Previously an empty file was created and the NPE silently swallowed.
            log.warn("No content to save for " + output);
            return;
        }
        try {
            if (!FileHelper.createFile(output)) {
                return;
            }
            System.out.println("save File:" + output);
            FileOutputStream os = new FileOutputStream(output);
            try {
                IOUtils.copy(istream, os);
            } finally {
                os.close();
            }
        } catch (Exception ex) {
            log.error("Failed to save " + output, ex);
        }
    }

    /**
     * Fetches one page of the board listing and lets the parser turn it into
     * value objects.
     *
     * @param httpclient client that carries the login session
     * @param n_page     page number to request
     * @return the posts the parser found on that page
     */
    private List<DaumListVo> getBoardList(HttpClient httpclient, String n_page)
            throws ClientProtocolException, IOException {
        // The parameter separator before "noticeYn" had been corrupted to the
        // "¬" character (presumably "&not" decoded as an HTML entity).
        String strUrl = host_url + "/" + CAFE_NM + "/" + BOARD_TYPE + "?boardType=C&noticeYn=N&page=" + n_page;
        System.out.println("=========================================");
        System.out.println(strUrl);
        System.out.println("=========================================");
        String listBody = execGetUrl(httpclient, strUrl);
        return parser.setDaumListVoList(listBody, this.pageMap);
    }

    /**
     * Rewrites a view URL so that its third and fourth query parts become
     * {@code p_width}/{@code p_height} carrying the values of
     * {@code p_imgwidth}/{@code p_imgheight}.
     *
     * @param viewurl URL to rewrite
     * @return the rewritten URL, or {@code viewurl} unchanged when the expected
     *         markers are missing or the URL does not split into six parts
     */
    private String getViewImageUrlSwitch(String viewurl) {
        if (viewurl == null) {
            return null;
        }
        int widthStart = viewurl.indexOf("p_imgwidth");
        int widthEnd = viewurl.indexOf("&p_imgheight");
        int heightStart = viewurl.indexOf("p_imgheight");
        int heightEnd = viewurl.indexOf("&p_width");
        // Without these guards substring() throws StringIndexOutOfBoundsException.
        if (widthStart < 0 || widthEnd < widthStart || heightStart < 0 || heightEnd < heightStart) {
            return viewurl;
        }
        String p_imgwidth = StringUtils.replace(viewurl.substring(widthStart, widthEnd), "p_imgwidth=", "");
        String p_imgheight = StringUtils.replace(viewurl.substring(heightStart, heightEnd), "p_imgheight=", "");

        String[] aUrl = viewurl.split("&");
        if (aUrl.length == 6) {
            aUrl[2] = "p_width=" + p_imgwidth;
            aUrl[3] = "p_height=" + p_imgheight;
            viewurl = StringUtils.join(aUrl, "&");
        }
        return viewurl;
    }

    /**
     * Executes {@code httpget} against {@code p_url} and returns the handled body.
     *
     * @return the response body, or an empty string when anything fails
     */
    private String getDownloadUrl(HttpClient httpclient, HttpGet httpget, ResponseHandler<String> responseHandler,
            String p_url) throws URISyntaxException, IOException, ClientProtocolException {
        String ret = "";
        try {
            httpget.setURI(new URI(p_url));
            ret = httpclient.execute(httpget, responseHandler);
        } catch (Exception ex) {
            log.error("getDownloadUrl ERROR:" + ex.getLocalizedMessage(), ex);
        }
        return ret;
    }

    /**
     * Opens {@code p_url} and returns the response content stream.
     *
     * @return the content stream, or {@code null} when the request fails or the
     *         response has no entity; the caller closes it
     */
    private InputStream getDownloadUrlInputStream(HttpClient httpclient, String p_url)
            throws URISyntaxException, IOException, ClientProtocolException {
        InputStream is = null;
        try {
            HttpResponse response = httpclient.execute(new HttpGet(p_url));
            HttpEntity resEntity = response.getEntity();
            if (resEntity != null) {
                is = resEntity.getContent();
            }
        } catch (Exception ex) {
            // Was silently swallowed; callers still receive null.
            log.error("Download failed: " + p_url, ex);
        }
        return is;
    }

    /**
     * Posts the login form so that later requests on {@code httpclient} carry
     * the session.
     *
     * @return always {@code null}
     */
    private HttpGet executeLogin(HttpClient httpclient) throws IOException, ClientProtocolException {
        // NOTE(review): empty in this copy of the file; must be set to the real login URL.
        String login_url = "";
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        nvps.add(new BasicNameValuePair("url", ""));
        nvps.add(new BasicNameValuePair("relative", ""));
        nvps.add(new BasicNameValuePair("mobilefull", "1"));
        nvps.add(new BasicNameValuePair("weblogin", "1"));
        // SECURITY: credentials were hard-coded. They can now be supplied with
        // -Ddaum.id / -Ddaum.pw; the literals remain only as a backward-compatible
        // fallback and should be removed from source control.
        nvps.add(new BasicNameValuePair("id", System.getProperty("daum.id", "changwng")));
        nvps.add(new BasicNameValuePair("pw", System.getProperty("daum.pw", "qncjdjssl")));
        nvps.add(new BasicNameValuePair("stln", "on"));
        nvps.add(new BasicNameValuePair("saved_id", "on"));
        execPostWidthParam(httpclient, login_url, nvps);
        return null;
    }

    /**
     * Sends {@code nvps} as a UTF-8 url-encoded form POST to {@code p_url}.
     *
     * @return the response body, or an empty string when the response has no entity
     */
    private String execPostWidthParam(HttpClient httpclient, String p_url, List<NameValuePair> nvps)
            throws IOException, ClientProtocolException {
        HttpPost httpost = new HttpPost(p_url);
        httpost.setEntity(new UrlEncodedFormEntity(nvps, Consts.UTF_8));
        HttpResponse response = httpclient.execute(httpost);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return "";
        }
        try {
            return EntityUtils.toString(entity);
        } finally {
            // Ensures the pooled connection is released even if reading fails.
            EntityUtils.consume(entity);
        }
    }

    /**
     * Performs a GET and returns the body as handled by {@link BasicResponseHandler}.
     *
     * @return the response body, or an empty string when {@code p_url} is not a
     *         syntactically valid URI
     */
    private String execGetUrl(HttpClient httpclient, String p_url) throws IOException, ClientProtocolException {
        URI uri;
        try {
            // Parsed up front: new HttpGet(String) would fail with an unchecked
            // exception before the URISyntaxException handler could run.
            uri = new URI(p_url);
        } catch (URISyntaxException e) {
            log.warn("Skipping malformed URL: " + p_url, e);
            return "";
        }
        ResponseHandler<String> responseHandler = new BasicResponseHandler();
        return httpclient.execute(new HttpGet(uri), responseHandler);
    }

    /**
     * Sends a form POST without any parameters to {@code p_url}.
     *
     * @return the response body, or an empty string when the response has no entity
     */
    private String execPostUrl(HttpClient httpclient, String p_url) throws IOException, ClientProtocolException {
        return execPostWidthParam(httpclient, p_url, new ArrayList<NameValuePair>());
    }

    /**
     * Removes HTML markup from {@code str}: every {@code <...>} tag is dropped,
     * and for {@code <script>} and {@code <style>} elements the enclosed content
     * is dropped as well. An unterminated tag or element discards the rest of
     * the input. Finally every space character is removed from the result.
     *
     * @param str HTML text; {@code null} yields an empty string
     * @return the text outside of tags, script and style elements
     */
    public String htmlRemove(String str) {
        if (str == null) {
            return "";
        }
        StringBuilder text = new StringBuilder(str.length());
        int len = str.length();
        int i = 0;
        while (i < len) {
            char ch = str.charAt(i);
            if (ch != '<') {
                text.append(ch);
                i++;
                continue;
            }
            String closing = null;
            if (str.regionMatches(true, i, "<script", 0, 7)) {
                closing = "</script>";
            } else if (str.regionMatches(true, i, "<style", 0, 6)) {
                closing = "</style>";
            }
            if (closing != null) {
                int end = indexOfIgnoreCase(str, closing, i);
                if (end < 0) {
                    break;
                }
                i = end + closing.length();
            } else {
                int gt = str.indexOf('>', i);
                if (gt < 0) {
                    break;
                }
                i = gt + 1;
            }
        }
        // NOTE(review): as in saveViewFile, this literal may originally have been
        // a non-breaking-space entity rather than a plain space. Confirm.
        return text.toString().replace(" ", "");
    }

    /**
     * Case-insensitive {@code indexOf}.
     *
     * @return index of the first occurrence of {@code needle} in {@code text} at
     *         or after {@code from}, or -1 if there is none
     */
    private static int indexOfIgnoreCase(String text, String needle, int from) {
        int last = text.length() - needle.length();
        for (int i = Math.max(from, 0); i <= last; i++) {
            if (text.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }
}