Java tutorial
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package downloadwolkflow;

import MyTest.DownloadFileTest;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls the workflow listing pages of www.myexperiment.org, opens the detail
 * page of every listed workflow and saves the file behind its download link
 * into the local {@code data/} directory.
 *
 * @author AiWangtao
 */
public class getWorkFlowList {

    private static final Logger LOGGER = Logger.getLogger(getWorkFlowList.class.getName());

    /** Scheme and host prepended to the relative detail links found on listing pages. */
    private static final String SITE_ROOT = "http://www.myexperiment.org";

    /** Directory (relative to the working directory) that receives downloaded files. */
    private static final String DATA_DIR = "data";

    /** Chunk size used when copying a download to disk. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /**
     * Walks every listing page returned by {@link #getPageList()} and downloads
     * the workflows listed on it, pausing one second between workflows.
     *
     * <p>A failure on one listing page is logged and the crawl continues with the
     * next page. An interrupt stops the crawl.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String[] pageList = getPageList();
        if (pageList == null) {
            // getPageList() has already logged the cause.
            LOGGER.log(Level.SEVERE, "Could not determine the listing pages; nothing to download");
            return;
        }
        System.out.println(pageList.length);

        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            // Index 0 is intentionally unused: getPageList() fills slots 1..N.
            for (int i = 1; i < pageList.length; i++) {
                System.out.println(pageList[i]);
                System.out.println("---------------------------------------------------------------------------");
                try {
                    crawlListingPage(pageList[i], httpclient);
                } catch (IOException ex) {
                    LOGGER.log(Level.SEVERE, null, ex);
                } catch (InterruptedException ex) {
                    // Restore the flag and stop; every later sleep would fail immediately anyway.
                    Thread.currentThread().interrupt();
                    LOGGER.log(Level.SEVERE, null, ex);
                    return;
                }
            }
        } catch (IOException ex) {
            // Raised by the implicit httpclient.close().
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Builds the URLs of all workflow listing pages.
     *
     * <p>The page count is read from the text of the second-to-last link inside
     * the first {@code div.pagination} element of the workflows index page
     * (presumably the last numbered page, followed by a "next" link - verify
     * against the live markup).
     *
     * @return an array of length {@code pageCount + 1} whose slots
     *         {@code 1..pageCount} hold the page URLs (slot 0 is {@code null}),
     *         or {@code null} if the index page could not be fetched or parsed
     */
    public static String[] getPageList() {
        String[] pageList = null;
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            String mainpage = fetchPage(httpclient, SITE_ROOT + "/workflows");
            Document mainDoc = Jsoup.parse(mainpage);
            Element pageinfo = mainDoc.select("div.pagination ").first();
            if (pageinfo == null) {
                LOGGER.log(Level.SEVERE, "No pagination block found on the workflows index page");
                return null;
            }
            Elements pagesElemenets = pageinfo.select("[href]");
            if (pagesElemenets.size() < 2) {
                LOGGER.log(Level.SEVERE, "Pagination block has fewer than two links; cannot read the page count");
                return null;
            }
            int pageSize = Integer.parseInt(pagesElemenets.get(pagesElemenets.size() - 2).text());
            pageList = new String[pageSize + 1];
            for (int i = 1; i <= pageSize; i++) {
                pageList[i] = SITE_ROOT + "/workflows?page=" + i;
            }
        } catch (IOException | NumberFormatException ex) {
            // NumberFormatException: the link text was not a page number.
            LOGGER.log(Level.SEVERE, null, ex);
        }
        return pageList;
    }

    /**
     * Downloads every workflow listed on a single listing page.
     *
     * @param pageUrl    URL of the listing page
     * @param httpclient client used for all requests
     * @throws IOException          if the listing page cannot be fetched
     * @throws InterruptedException if the thread is interrupted while pausing
     *                              between workflows
     */
    private static void crawlListingPage(String pageUrl, CloseableHttpClient httpclient)
            throws IOException, InterruptedException {
        Document mainDoc = Jsoup.parse(fetchPage(httpclient, pageUrl));
        Elements resultList = mainDoc.select("div.resource_list_item");
        for (Element workflowResult : resultList) {
            Element detailInfo = findDetailLink(workflowResult);
            if (detailInfo == null) {
                LOGGER.log(Level.WARNING, "Listing entry without a title link on {0}; skipped", pageUrl);
                continue;
            }
            String detailUrl = SITE_ROOT + detailInfo.attributes().get("href") + ".html";
            System.out.println(detailUrl);
            downloadWorkFlow(detailUrl, httpclient);
            Thread.sleep(1000);
        }
    }

    /**
     * Locates the title link of one listing entry: the first {@code a} inside the
     * first {@code p.title.inline} inside the first {@code div.main_panel}.
     *
     * @param workflowResult a {@code div.resource_list_item} element
     * @return the link element, or {@code null} if any level of that path is missing
     */
    private static Element findDetailLink(Element workflowResult) {
        Element panel = workflowResult.select("div.main_panel").first();
        if (panel == null) {
            return null;
        }
        Element title = panel.select("p.title.inline").first();
        if (title == null) {
            return null;
        }
        return title.select("a").first();
    }

    /**
     * Opens a workflow detail page, looks up its download link and, if the link
     * contains the text {@code "download"}, saves the linked file.
     *
     * <p>I/O failures are logged, not propagated. An interrupt restores the
     * thread's interrupt flag so the caller can stop.
     *
     * @param detailUrl  URL of the workflow detail page
     * @param httpclient client used for all requests
     */
    private static void downloadWorkFlow(String detailUrl, CloseableHttpClient httpclient) {
        try {
            Document mainDoc = Jsoup.parse(fetchPage(httpclient, detailUrl));
            Element downloadEle = mainDoc.select("div#myexp_content ul li a").first();
            if (downloadEle == null) {
                downloadEle = mainDoc.select("div#myexp_content ul li:nth-child(1) span a").first();
            }
            // A page without any matching link is treated like one without a download.
            String downloadUrl = downloadEle == null ? "" : downloadEle.attributes().get("href");
            Thread.sleep(500);
            if (downloadUrl.contains("download")) {
                downloadFiles(downloadUrl, httpclient);
            } else {
                System.out.println(detailUrl + " do not contain valuable resource");
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Fetches a URL as text and writes it, followed by a line break, to
     * {@code data/<file name>} using the platform default charset.
     *
     * @param downloadUrl URL to fetch; its last path segment becomes the file name
     * @param httpclient  client used for the request
     * @throws IOException if the request fails, the URL yields no usable file
     *                     name, or the file cannot be written
     */
    private static void writeToFile(String downloadUrl, HttpClient httpclient) throws IOException {
        String xml = fetchPage(httpclient, downloadUrl);
        String filename = fileNameFromUrl(downloadUrl);
        if (!isUsableFileName(filename)) {
            throw new IOException("Cannot derive a file name from " + downloadUrl);
        }
        System.out.println(filename);
        try (PrintWriter out = new PrintWriter(new File(ensureDataDir(), filename))) {
            out.println(xml);
        }
    }

    /**
     * Streams the body of a URL into {@code data/<file name>}.
     *
     * <p>Nothing is written when the URL yields no usable file name, the server
     * answers with a non-2xx status, or the response has no body. I/O failures
     * are logged, not propagated.
     *
     * @param downloadUrl URL to fetch; its last path segment becomes the file name
     * @param httpclient  client used for the request
     */
    private static void downloadFiles(String downloadUrl, CloseableHttpClient httpclient) {
        String filename = fileNameFromUrl(downloadUrl);
        if (!isUsableFileName(filename)) {
            LOGGER.log(Level.WARNING, "Cannot derive a file name from {0}; download skipped", downloadUrl);
            return;
        }
        try {
            HttpResponse response = httpclient.execute(newGet(downloadUrl));
            HttpEntity entity = response.getEntity();
            int status = response.getStatusLine().getStatusCode();
            if (status < 200 || status >= 300) {
                // Drain the body so the connection goes back to the pool.
                EntityUtils.consume(entity);
                LOGGER.log(Level.WARNING, "HTTP {0} for {1}; download skipped",
                        new Object[] {status, downloadUrl});
                return;
            }
            if (entity == null) {
                return;
            }
            System.out.println(filename);
            File target = new File(ensureDataDir(), filename);
            // Closing the content stream also releases the underlying connection.
            try (BufferedInputStream bis = new BufferedInputStream(entity.getContent());
                    BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(target))) {
                byte[] buffer = new byte[COPY_BUFFER_SIZE];
                int read;
                while ((read = bis.read(buffer)) != -1) {
                    bos.write(buffer, 0, read);
                }
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Performs a GET request and returns the response body as a string.
     *
     * @param client client used for the request
     * @param url    URL to fetch
     * @return the decoded response body
     * @throws IOException if the URL is malformed, the request fails, or the
     *                     response carries no body
     */
    private static String fetchPage(HttpClient client, String url) throws IOException {
        HttpResponse response = client.execute(newGet(url));
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            throw new IOException("Response without a body for " + url);
        }
        return EntityUtils.toString(entity);
    }

    /**
     * Creates a GET request, turning the unchecked exception thrown for a
     * malformed URL into an {@link IOException} so callers handle it like any
     * other failed request.
     *
     * @param url URL to request
     * @return the request object
     * @throws IOException if {@code url} is not a valid URI
     */
    private static HttpGet newGet(String url) throws IOException {
        try {
            return new HttpGet(url);
        } catch (IllegalArgumentException ex) {
            throw new IOException("Malformed URL: " + url, ex);
        }
    }

    /**
     * Extracts the last path segment of a URL, ignoring the query string and any
     * trailing slashes.
     *
     * @param url URL to inspect
     * @return the last path segment; empty if there is none
     */
    private static String fileNameFromUrl(String url) {
        // Cut the query first so a '/' inside it cannot be mistaken for a path separator.
        int queryStart = url.indexOf('?');
        String path = queryStart >= 0 ? url.substring(0, queryStart) : url;
        int end = path.length();
        while (end > 0 && path.charAt(end - 1) == '/') {
            end--;
        }
        int start = path.lastIndexOf('/', end - 1) + 1;
        return path.substring(start, end);
    }

    /**
     * Tells whether a name can be used as a plain file name inside the data
     * directory.
     *
     * @param name candidate file name
     * @return {@code false} for empty names, {@code "."}, {@code ".."} and names
     *         containing a path separator
     */
    private static boolean isUsableFileName(String name) {
        return !name.isEmpty()
                && !".".equals(name)
                && !"..".equals(name)
                && name.indexOf('/') < 0
                && name.indexOf('\\') < 0;
    }

    /**
     * Returns the data directory, creating it if necessary.
     *
     * @return the data directory
     * @throws IOException if the directory does not exist and cannot be created
     */
    private static File ensureDataDir() throws IOException {
        File dir = new File(DATA_DIR);
        if (!dir.isDirectory() && !dir.mkdirs()) {
            throw new IOException("Cannot create directory " + dir.getAbsolutePath());
        }
        return dir;
    }
}