webscrap.WebScrap.java Source code

Java tutorial

Introduction

Here is the source code for webscrap.WebScrap.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package webscrap;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Scrapes object pages from the metmuseum.org online collection and stores
 * every record whose classification is "Paintings" in {@code Records.json}.
 *
 * <p>The crawl starts at a fixed object page and follows each page's
 * {@code a.next} link until {@value #MAX_RECORDS} paintings have been
 * collected or no further page is linked.
 *
 * @author Namrata
 */
public class WebScrap {

    /** File the collected records are written to, relative to the working directory. */
    private static final String OUTPUT_FILE = "Records.json";

    /** Object page the crawl starts from. */
    private static final String START_URL
            = "http://www.metmuseum.org/collection/the-collection-online/search/11432?pos=1&rpp=30&pg=1&rndkey=20150123&ft=*&deptids=2";

    /** Number of painting records to collect before the crawl stops. */
    private static final int MAX_RECORDS = 500;

    /** Marker in the {@code h2} text; everything before it is taken as the object name. */
    private static final String TITLE_END_MARKER = "Email";

    /** Classification value (ignoring surrounding whitespace) a record must have to be kept. */
    private static final String WANTED_CLASSIFICATION = "Paintings";

    /**
     * Runs the crawl and writes the collected records as one JSON array,
     * followed by a newline, to {@code Records.json}.
     *
     * <p>The output file is opened before crawling starts, so an unwritable
     * location is reported immediately. If a page cannot be fetched the crawl
     * stops, the problem is reported on standard error, and the records
     * gathered up to that point are still written.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) {
        // try-with-resources guarantees the writer is flushed and closed on every path.
        try (FileWriter output = new FileWriter(new File(OUTPUT_FILE))) {
            JSONArray store = new JSONArray();
            try {
                collectPaintings(store);
            } catch (IOException e) {
                System.err.println("Scraping stopped early after " + store.size()
                        + " record(s): " + e);
            }
            output.write(store.toJSONString());
            output.write("\n");
        } catch (IOException e) {
            System.err.println("Could not write " + OUTPUT_FILE + ": " + e);
        }
    }

    /**
     * Walks the collection page by page, appending painting records to
     * {@code store} until it holds {@value #MAX_RECORDS} entries or a page
     * has no usable "next" link.
     *
     * @param store array receiving one {@link JSONObject} per painting
     * @throws IOException if a page cannot be fetched
     */
    @SuppressWarnings("unchecked") // json-simple's JSONArray is a raw ArrayList
    private static void collectPaintings(JSONArray store) throws IOException {
        String next = START_URL;
        while (store.size() < MAX_RECORDS) {
            Document doc = Jsoup.connect(next).get();

            JSONObject record = toPaintingRecord(doc);
            if (record != null) {
                store.add(record);
            }

            // The last page has no "next" anchor; stop instead of dereferencing null.
            Element link = doc.select("a.next").first();
            if (link == null) {
                break;
            }
            next = link.attr("abs:href");
            if (next.isEmpty()) {
                // attr("abs:href") yields "" when the href cannot be made absolute.
                break;
            }
        }
    }

    /**
     * Extracts the record described by one object page.
     *
     * <p>Field values are stored exactly as they appear after the first colon
     * of a tombstone row (including the leading space), so the content of
     * previously produced output files does not change.
     *
     * @param doc parsed object page
     * @return the record, or {@code null} when the page is not classified as a painting
     */
    @SuppressWarnings("unchecked") // json-simple's JSONObject is a raw HashMap
    private static JSONObject toPaintingRecord(Document doc) {
        String name = objectName(doc.getElementsByTag("h2").text());

        // When several images match, the last one wins.
        String imgsrc = "";
        for (Element img : doc.select("div#inner-image-container img")) {
            imgsrc = img.absUrl("src");
        }

        String artistName = "";
        String date = "";
        String medium = "";
        String dim = "";
        String classification = "";
        String creditLine = "";
        String accessNum = "";

        // select("div") also matches the tombstone container itself, which comes
        // first in the result; it is skipped so only the rows inside it are parsed.
        boolean container = true;
        for (Element row : doc.select("div.tombstone").select("div")) {
            if (container) {
                container = false;
                continue;
            }
            String info = row.text();
            int colon = info.indexOf(':');
            if (colon < 0) {
                continue; // not a "Label: value" row
            }
            String label = info.substring(0, colon);
            // Split on the first colon only, so values that themselves contain
            // a colon are kept whole instead of being cut short.
            String value = info.substring(colon + 1);

            // Geography, Culture and Rights and Reproduction are not part of
            // the output and are therefore not captured.
            switch (label) {
            case "Artist":
                artistName = value;
                break;
            case "Date":
                date = value;
                break;
            case "Medium":
                medium = value;
                break;
            case "Dimensions":
                dim = value;
                break;
            case "Classification":
                classification = value;
                break;
            case "Credit Line":
                creditLine = value;
                break;
            case "Accession Number":
                accessNum = value;
                break;
            default:
                break;
            }
        }

        if (!WANTED_CLASSIFICATION.equals(classification.trim())) {
            return null;
        }

        JSONObject record = new JSONObject();
        record.put("Name", name);
        record.put("Artist", artistName);
        record.put("imgURL", imgsrc);
        record.put("Date", date);
        record.put("Medium", medium);
        record.put("Dimension", dim);
        record.put("Classification", classification);
        record.put("Credit_Line", creditLine);
        record.put("Accession_Number", accessNum);
        return record;
    }

    /**
     * Returns the part of the heading text that precedes the "Email" marker,
     * or the whole text when the marker is absent.
     *
     * @param headingText combined text of the page's {@code h2} elements
     * @return the object name portion of the heading
     */
    private static String objectName(String headingText) {
        int cut = headingText.indexOf(TITLE_END_MARKER);
        return cut < 0 ? headingText : headingText.substring(0, cut);
    }
}