Java tutorial
/* * ComprehensiveSearch2.java * Created on 2011-5-25; Project to Colt2010; $Id: ComprehensiveSearch.java 309 2013-04-25 16:38:44Z tristan $ * * Copyright (c) 2011, Xu Brothers and/or its affiliates. All rights reserved. * Xu Brothers PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. */ package com.waku.mmdataextract; import java.io.BufferedInputStream; import java.io.File; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLConnection; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import org.apache.http.entity.mime.MultipartEntity; import org.apache.http.entity.mime.content.StringBody; import org.apache.log4j.Logger; import org.dom4j.Document; import org.dom4j.Element; import com.waku.common.http.MyHttpClient; /** * @versin $Rev: 309 $, $Date: 2013-04-26 00:38:44 +0800 (, 26 2013) $ * @author Jin */ public class ComprehensiveSearch { static Logger logger = Logger.getLogger(ComprehensiveSearch.class.getName()); private static final String START_ACTION = "http://shouji.gd.chinamobile.com/gdmobile/displaySearch.do?flag=searchForm&imgType=1"; private final static String SEARCH_ACTION = "http://shouji.gd.chinamobile.com/gdmobile/search.do?pageNo=005"; private static List<String> prodIdList = new ArrayList<String>(); @SuppressWarnings("deprecation") private static MultipartEntity getMultipartEntity(String brandId, int pageNumber) { MultipartEntity reqEntity = new MultipartEntity(); try { reqEntity.addPart("flag", new StringBody("search")); reqEntity.addPart("brandId", new StringBody(brandId)); reqEntity.addPart("currentPage", new StringBody(pageNumber + "")); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } return reqEntity; } @SuppressWarnings("unchecked") public static void main(String[] args) { FileWriter fw = null; try { fw = new FileWriter(new File("output/ComprehensiveSearch.csv")); fw.write( ",??,?,?,,?,??,,?,1,2,3,\n"); } catch (IOException e) { e.printStackTrace(); } Document firstPage = MyHttpClient.getAsDom4jDoc(START_ACTION); // System.out.println(doc.asXML()); List<Element> brandOptions = firstPage.selectNodes("//select[@name='brandId']/option"); for (Element brandOption : brandOptions) { String brandId = brandOption.attributeValue("value"); if (!brandId.equalsIgnoreCase("0")) { for (int i = 1; true; i++) { logger.info("Get brandId/page -> " + brandId + "/" + i); if (searchDone(fw, brandId, i)) { break; } } } } try { fw.close(); } catch (IOException e) { e.printStackTrace(); } logger.info("----> Done!"); logger.info("----> Start to compare production search ... "); CompareProductions.start(prodIdList, 0); } @SuppressWarnings("unchecked") private static boolean searchDone(FileWriter fw, String brandId, int i) { Document resultPage = MyHttpClient.getAsDom4jDoc(SEARCH_ACTION, getMultipartEntity(brandId, i)); List<Element> products = resultPage.selectNodes("//tr[@onmouseout]"); logger.info("Get products count -> " + products.size()); for (Element product : products) { List<Element> items = product.elements(); // Remove last col items.remove(items.size() - 1); Element firstItem = items.get(0); String attributeValue = firstItem.attributeValue("onclick"); String productId = attributeValue.substring(attributeValue.indexOf("('") + 2, attributeValue.indexOf("')")); if (prodIdList.contains(productId)) { logger.info("Get product id duplicated -> " + productId); continue; } else { logger.info("Get product id add -> " + productId); prodIdList.add(productId); StringBuilder sb = new StringBuilder(); // Save image here String toFileName = productId + ".gif"; saveImage(firstItem.element("img").attributeValue("src"), toFileName); sb.append(toFileName + ","); items.remove(0); // remove first one for (Element item : items) { sb.append(item.getText() + ","); } logger.info(sb.toString()); sb.append("\n"); try { fw.write(sb.toString()); } catch (IOException e) { e.printStackTrace(); } } } if (products.size() < 20) return true; else return false; } public static void saveImage(String imgSrc, String toFileName) { String toFile = "output/images/" + toFileName; if (new File(toFile).exists()) { logger.info("File already saved ->" + toFile); return; } URL u = null; URLConnection uc = null; InputStream raw = null; InputStream in = null; FileOutputStream out = null; try { int endIndex = imgSrc.lastIndexOf("/") + 1; String encodeFileName = URLEncoder.encode(imgSrc.substring(endIndex), "UTF-8").replaceAll("[+]", "%20"); u = new URL("http://shouji.gd.chinamobile.com" + imgSrc.substring(0, endIndex) + encodeFileName); uc = u.openConnection(); String contentType = uc.getContentType(); int contentLength = uc.getContentLength(); if (contentType.startsWith("text/") || contentLength == -1) { logger.error("This is not a binary file. -> " + imgSrc); } raw = uc.getInputStream(); in = new BufferedInputStream(raw); byte[] data = new byte[contentLength]; int bytesRead = 0; int offset = 0; while (offset < contentLength) { bytesRead = in.read(data, offset, data.length - offset); if (bytesRead == -1) break; offset += bytesRead; } if (offset != contentLength) { logger.error("Only read " + offset + " bytes; Expected " + contentLength + " bytes"); } out = new FileOutputStream(toFile); out.write(data); out.flush(); logger.info("Saved file " + u.toString() + " to " + toFile); } catch (Exception e) { e.printStackTrace(); } finally { try { in.close(); } catch (Exception e) { } try { out.close(); } catch (Exception e) { } } } }