com.da.img.SoStroyAllList.java Source code

Java tutorial

Introduction

Here is the source code for com.da.img.SoStroyAllList.java

Source

/*
 * ====================================================================
 *
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 *
 */

package com.da.img;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Consts;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

/**
 * http://story.soraspace.info/bank/story_mn.php?p_userid=thskrsns67&p_snum=015&p_num=58109
 * @author changwng
 *
 */

public class SoStroyAllList {

    private static String SO_URL = "soraven.info";
    private static String host_url = "http://www." + SO_URL;
    private static String photo_url = "http://photo." + SO_URL + "/album/theme/";
    private static String story_url = "http://story." + SO_URL + "/honor/";

    private static String SAVE_DIR = "c:/temp/story";
    private static String STORY_DIR = "c:/temp";
    static Pattern pattern = Pattern.compile("<a[^>]*href=[\"']?([^>\"']+)[\"']?[^>]*>");
    static Pattern pattern_img = Pattern.compile("<img[^>]*src=[\"']?([^>\"']+)[\"']?[^>]*>");
    static Pattern pattern_author = Pattern.compile("<span(.*?)style=\"cursor:hand\">(.*?)<\\/span>",
            Pattern.MULTILINE + Pattern.CASE_INSENSITIVE); // all html tag

    public static void main(String[] args) {
        SoStroyAllList cfl = new SoStroyAllList();
        try {
            //   BoardAllList 1 174, 354, c:/temp
            // ba 1 174 354 c:\tmp

            String nPage = "1";
            String p_author_id = "queen201";
            String p_gnum = ""; // ga
            String p_host_url = ""; // ga
            if (args.length > 0) {
                nPage = args[0];
            }
            if (args.length > 1) {
                p_author_id = args[1];
            }
            if (args.length > 2) {
                STORY_DIR = args[2];
            }

            if (args.length > 3) {
                p_host_url = args[3];
                SO_URL = p_host_url;
                host_url = "http://www." + SO_URL;
                photo_url = "http://photo." + SO_URL + "/album/theme/";
                story_url = "http://story." + SO_URL + "/honor/";
            }
            cfl.executeURL(nPage, p_author_id, p_gnum);
            //cfl.executeAuthorList(nPage, p_author_id, p_gnum);
        } catch (ClientProtocolException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (URISyntaxException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }

    protected List<AuthorVo> executeAuthorList(String p_page, String p_author_id, String p_gnum)
            throws IOException, ClientProtocolException, URISyntaxException {
        DefaultHttpClient httpclient = new DefaultHttpClient();
        List<AuthorVo> lst = new ArrayList<AuthorVo>();
        AuthorVo authorVo = new AuthorVo();
        try {
            HttpGet httpget = executeLogin(httpclient);
            ResponseHandler<String> responseHandler = new BasicResponseHandler();
            // http://story.soraspace.info/honor/honor_list_02.php
            // http://story.soraspace.info/honor/honor_list_03.php
            // http://story.soraspace.info/honor/honor_list_04.php
            // http://story.soraspace.info/honor/honor_list_05.php
            int max_page = 3;
            int init_page = 3;
            String listBody = "", strUrl = "";
            for (int i = init_page; i <= max_page; i++) {
                strUrl = story_url + "honor_list_0" + String.valueOf(i) + ".php";
                listBody = getDownloadUrl(httpclient, httpget, responseHandler, strUrl);
                listBody = listBody.substring(listBody.indexOf("<!-- START  ? -->"),
                        listBody.indexOf("<!-- END  ? -->"));
                Matcher match = pattern_author.matcher(listBody);
                while (match.find()) {
                    authorVo = new AuthorVo();
                    //System.out.println(match);
                    String listurl = match.group(0);
                    //System.out.println(listurl);
                    String listurl1 = match.group(1);
                    listurl1 = listurl1.replace("onClick=\"soraShowUserLayer('", "").replace(" ", "");
                    listurl1 = listurl1.substring(0, listurl1.indexOf("'"));
                    System.out.println(listurl1);
                    //System.out.println(listurl1.length());
                    String listurl2 = match.group(2);
                    System.out.println(listurl2);
                    authorVo.setAuthorSpan(listurl);
                    authorVo.setAuthorId(listurl1);
                    authorVo.setAuthorAlias(listurl2);
                    lst.add(authorVo);
                }

            }
        } catch (Exception ex) {
        }
        return lst;
    }

    protected void executeURL(String p_page, String p_author_id, String p_gnum)
            throws IOException, ClientProtocolException, URISyntaxException {
        DefaultHttpClient httpclient = new DefaultHttpClient();
        try {
            List<AuthorVo> lstAuthor = executeAuthorList(p_page, p_author_id, p_gnum);
            HttpGet httpget = executeLogin(httpclient);
            ResponseHandler<String> responseHandler = new BasicResponseHandler();
            String responseBody = "";
            // http://story.soraspace.info/honor/honor_list_02.php
            // http://story.soraspace.info/honor/honor_list_03.php
            // http://story.soraspace.info/honor/honor_list_04.php
            // http://story.soraspace.info/honor/honor_list_05.php
            List<ImageVo> lst = null;
            int max_page = 10000;
            int init_page = 1;
            if (!"".equals(StringUtils.stripToEmpty(p_page))) {
                init_page = Integer.parseInt(p_page);
            }
            // init_page = 17;
            for (AuthorVo vo : lstAuthor) {
                SAVE_DIR = STORY_DIR + "/story/" + vo.getAuthorAlias() + "(" + vo.getAuthorId() + ")";
                p_author_id = vo.getAuthorId();
                for (int i = init_page; i < max_page; i++) {
                    lst = getBoardList(httpclient, httpget, responseHandler, p_gnum, String.valueOf(i),
                            p_author_id);
                    System.out.println("Story Size: " + lst.size());
                    if (lst.size() < 1) {
                        break;
                    }
                }
            }

        } catch (Exception ex) {
            ex.printStackTrace(System.out);
            System.out.println("ERROR: " + ex.getLocalizedMessage());
        } finally {
            // When HttpClient instance is no longer needed,
            // shut down the connection manager to ensure
            // immediate deallocation of all system resources
            httpclient.getConnectionManager().shutdown();
        }
    }

    private void fileDownCopy(String output, InputStream istream) {
        try {
            if (!(new File(output)).exists()) {
                if (FileHelper.createFile(output)) {
                    System.out.println("save File:" + output);
                    FileOutputStream os = new FileOutputStream(output);
                    IOUtils.copy(istream, os);
                    if (os != null) {
                        os.close();
                    }
                }
            } else {
                System.out.println("Image File Exist :" + output);
            }
        } catch (Exception ex) {
        }
    }

    private List<ImageVo> getBoardList(DefaultHttpClient httpclient, HttpGet httpget,
            ResponseHandler<String> responseHandler, String p_gnum, String n_page, String p_author_id)
            throws URISyntaxException, IOException, ClientProtocolException {
        String listBody = "", viewBody = "";

        // String strUrl =
        // "http://photo.soraspace.info/album/theme/pic_list.php?p_page=1&p_sort=D&p_anum=173&p_gnum=351&p_soption=&p_stxt=";
        // thskrsns67
        // bluesman
        /*   String strUrl = "http://photo." + SO_URL
        + "/album/theme/pic_list.php?p_page=" + n_page
        + "&p_sort=D&p_anum=" + p_anum + "&p_gnum=" + p_gnum
        + "&p_soption=&p_stxt=";*/

        if ("".equals(StringUtils.stripToEmpty(p_author_id))) {
            p_author_id = "bluesman";
        }

        String strUrl = "http://story." + SO_URL + "/honor/author_board_list.php?p_page=" + n_page + "&p_userid="
                + p_author_id;
        System.out.println("=========================================");
        System.out.println(strUrl);
        System.out.println("=========================================");
        // httpget.setURI(new URI(strUrl));
        // lip 689576
        // http://photo.soraspace.info/album/theme/pic_list.php?p_page=1&p_sort=D&p_anum=173&p_gnum=481&p_soption=&p_stxt=

        listBody = getDownloadUrl(httpclient, httpget, responseHandler, strUrl);
        /* String file = SAVE_DIR+"/list_"+p_author_id+"_"+n_page;
         System.out.println(file);
         FileHelper.createFile(file+".txt");
         FileUtils.writeStringToFile(new File(file+".txt"), listBody, "utf-8");
         */

        int nlastIdx = listBody.lastIndexOf(
                "<img src=\"http://image." + SO_URL + "/common/btn_brd_next.gif\" align=absmiddle></a>");
        //   System.out.println("nlastIdx:" + nlastIdx);
        Matcher match = pattern.matcher(listBody);
        String listurl = "";
        String viewurl = "";
        String p_num = "";
        List<ImageVo> lst = new ArrayList<ImageVo>();
        ImageVo vo = null;
        while (match.find()) {

            listurl = match.group(1);
            if (listurl.indexOf("author_board_view.php") == 0) // href  ?
            {
                vo = new ImageVo();
                System.out.println(" ba listurl :" + listurl);
                String p_storyname = listurl.substring(listurl.indexOf("p_storyname=") + 12,
                        listurl.indexOf("&p_userid="));
                System.out.println("p_storyname: " + p_storyname);

                p_num = listurl.substring(listurl.indexOf("?") + 1, listurl.indexOf("&"));
                p_num = p_num.replaceAll("p_num=", "");
                System.out.println(" ba p_num :" + p_num);

                vo.setFileName(SAVE_DIR + "/" + p_author_id + "_" + p_storyname + "_" + p_num + ".txt");
                p_storyname = URLEncoder.encode(p_storyname, "euc-kr");
                listurl = listurl.substring(0, listurl.indexOf("&p_grade="));
                listurl = listurl + "&p_storyname=" + p_storyname + "&p_userid=" + p_author_id;
                //URLEncoder.encode(p_url, "utf-8");

                //System.out.println("story_url+listurl: "+story_url+listurl);
                try {
                    viewBody = getDownloadUrl(httpclient, httpget, responseHandler, story_url + listurl);
                    viewBody = viewBody.substring(viewBody.indexOf("<!-- START ? -->"),
                            viewBody.indexOf("<!-- END  -->"));
                    viewBody = htmlRemove(viewBody);
                    viewBody = viewBody.replaceAll("&quot;", "");
                    FileHelper.createFile(vo.getFileName());
                    FileUtils.writeStringToFile(new File(vo.getFileName()), viewBody, "utf-8");
                    // lastindex   
                    if (nlastIdx > -1) {
                        lst.add(vo);
                    }
                } catch (Exception ex) {
                    System.out.println(ex.getLocalizedMessage());
                }

            }
        }
        return lst;
    }

    private String getViewImageUrlSwitch(String viewurl) {

        String p_imgwidth = viewurl.substring(viewurl.indexOf("p_imgwidth"), viewurl.indexOf("&p_imgheight"));
        p_imgwidth = StringUtils.replace(p_imgwidth, "p_imgwidth=", "");
        // System.out.println("p_imgwidth src :"+p_imgwidth);
        String p_imgheight = viewurl.substring(viewurl.indexOf("p_imgheight"), viewurl.indexOf("&p_width"));
        p_imgheight = StringUtils.replace(p_imgheight, "p_imgheight=", "");
        // System.out.println("p_imgheight src :"+p_imgheight);
        String[] aUrl = viewurl.split("&");
        if (aUrl.length == 6) {
            aUrl[2] = "p_width=" + p_imgwidth;
            aUrl[3] = "p_height=" + p_imgheight;
            viewurl = StringUtils.join(aUrl, "&");
        }

        return viewurl;
    }

    private String getDownloadUrl(DefaultHttpClient httpclient, HttpGet httpget,
            ResponseHandler<String> responseHandler, String p_url)
            throws URISyntaxException, IOException, ClientProtocolException {
        // String responseBody;
        // p_url = URLEncoder.encode(p_url, "utf-8");
        // System.out.println("p_url:"+p_url);
        // httpget.setURI(new
        // URI("http://story.soraspace.info/bank/story_mn.php?p_userid=bluesman&p_snum=201&p_num=35821"));
        String ret = "";
        try {
            httpget.setURI(new URI(p_url));
            ret = httpclient.execute(httpget, responseHandler);
        } catch (Exception ex) {
            System.out.println("getDownloadUrl ERROR:" + ex.getLocalizedMessage());
        }
        return ret;
        // return responseBody;
    }

    private InputStream getDownloadUrlInputStream(DefaultHttpClient httpclient, HttpGet httpget, String p_url)
            throws URISyntaxException, IOException, ClientProtocolException {
        // p_url = URLEncoder.encode(p_url, "utf-8");
        // System.out.println("p_url2:"+p_url);
        InputStream is = null;
        try {
            httpget.setURI(new URI(p_url));
            HttpResponse response = httpclient.execute(httpget);
            HttpEntity resEntity = response.getEntity();
            is = resEntity.getContent();
            ;
        } catch (Exception ex) {
        }
        return is;
    }

    private HttpGet executeLogin(DefaultHttpClient httpclient) throws IOException, ClientProtocolException {
        HttpGet httpget = new HttpGet(host_url);
        HttpResponse response = httpclient.execute(httpget);
        HttpEntity entity = response.getEntity();
        System.out.println("Login form get: " + response.getStatusLine());
        EntityUtils.consume(entity);
        System.out.println("Initial set of cookies:");
        List<Cookie> cookies = httpclient.getCookieStore().getCookies();
        if (cookies.isEmpty()) {
            System.out.println("None");
        } else {
            for (int i = 0; i < cookies.size(); i++) {
                System.out.println("- " + cookies.get(i).toString());
            }
        }
        HttpPost httpost = new HttpPost(host_url + "/common/include/login.php");
        Header header1 = new BasicHeader("Accept",
                "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg,application/msword, */*");
        Header header2 = new BasicHeader("Referer", host_url + "/index.php");
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        nvps.add(new BasicNameValuePair("p_userid", "bimohani"));
        nvps.add(new BasicNameValuePair("p_passwd", "cw8904"));
        httpost.setHeader(header1);
        httpost.setHeader(header2);
        httpost.setEntity(new UrlEncodedFormEntity(nvps, Consts.UTF_8));

        response = httpclient.execute(httpost);
        entity = response.getEntity();

        System.out.println("Login form get: " + response.getStatusLine());
        EntityUtils.consume(entity);

        System.out.println("Post logon cookies:");
        cookies = httpclient.getCookieStore().getCookies();
        if (cookies.isEmpty()) {
            System.out.println("None");
        } else {
            for (int i = 0; i < cookies.size(); i++) {
                System.out.println("- " + cookies.get(i).toString());
            }
        }
        return httpget;
    }

    public String htmlRemove(String str) {
        StringBuffer t = new StringBuffer();
        StringBuffer t2 = new StringBuffer();

        char[] c = str.toCharArray();
        char ch;
        int d = 0;
        boolean check = false;
        boolean scriptChkeck = false;
        boolean styleCheck = false;
        for (int i = 0, len = c.length; i < len; i++) {
            ch = c[i];
            if (ch == '<') {
                check = true;
            }

            if (!check & !scriptChkeck && !styleCheck) {

                t.append(ch);
            }

            d++;
            t2.append(ch);
            if (d > 9) {
                t2.delete(0, 1);

            }

            if (!scriptChkeck) {
                if (t2.toString().toLowerCase().indexOf("<script") == 0) {
                    scriptChkeck = true;
                }

            }
            if (scriptChkeck) {
                if (t2.toString().toLowerCase().indexOf("</script>") == 0) {

                    scriptChkeck = false;
                }

            }

            if (!styleCheck) {
                if (t2.toString().toLowerCase().indexOf("<style") == 0) {
                    styleCheck = true;
                }

            }
            if (styleCheck) {

                if (t2.toString().toLowerCase().indexOf("</style>") == 0) {
                    styleCheck = false;
                }

            }

            if (ch == '>') {
                check = false;
            }
        }

        return t.toString();
    }
}