com.dajodi.scandic.JSoupScraper.java Source code

Java tutorial

Introduction

Here is the source code for com.dajodi.scandic.JSoupScraper.java

Source

/*
 * Copyright 2012 - Jon DeYoung
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.dajodi.scandic;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.http.protocol.HTTP;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.dajodi.scandic.model.MemberInfo;
import com.dajodi.scandic.model.ScandicStay;

public class JSoupScraper implements HtmlScraper {

    @Override
    public Map<String, String> scrapeFormInputFields(InputStream inStream) {

        try {
            Document doc = Jsoup.parse(inStream, HTTP.UTF_8, "");

            Element form = doc.body().getElementById("aspnetForm");

            Elements inputNodes = form.getElementsByTag("input");
            Map<String, String> inputMap = new HashMap<String, String>();

            for (Element element : inputNodes) {

                String name = element.attr("name");
                String value = element.attr("value");

                if (name != null) {
                    inputMap.put(name, value == null ? "" : value);
                } else {
                    //TODO: remove me
                    Log.d("Something weird");
                }
            }

            doc.empty();
            return inputMap;
        } catch (Exception e) {
            throw new ScandicHtmlException(e);
        }
    }

    @Override
    public MemberInfo scrapeMemberInfo(InputStream inStream) {

        Document doc;
        try {
            doc = Jsoup.parse(inStream, HTTP.UTF_8, "");

            Element accountOverview = doc.getElementById("AccountOverview");

            String points = getStringFromNode(accountOverview, "ctl00_MainBodyRegion_AccountOverview1_totalPoints",
                    "?");
            String membershipNumber = getStringFromNode(accountOverview,
                    "ctl00_MainBodyRegion_AccountOverview1_membershipNo", "?");
            String membershipLevel = getStringFromNode(accountOverview,
                    "ctl00_MainBodyRegion_AccountOverview1_memberLevel", "?");
            String nights = getStringFromNode(accountOverview, "ctl00_MainBodyRegion_AccountOverview1_strngNights",
                    "?");

            int qualNights = Util.UNKNOWN_NIGHTS;

            if ("?".equals(nights)) {
                boolean noTransactions = accountOverview
                        .select("#ctl00_MainBodyRegion_AccountOverview1_NoTransaction").size() == 1;
                if (noTransactions) {
                    // as expected
                    qualNights = Util.NO_NIGHTS;
                } else {
                    Log.d("somethign really strange, number of nights could not be found");
                }
            } else {
                qualNights = Util.parseNumNights(nights);
            }

            MemberInfo.Level level = MemberInfo.Level.fromEnglishText(membershipLevel);

            List<ScandicStay> stays = getStays(accountOverview);

            MemberInfo memberInfo = new MemberInfo();
            memberInfo.setMembershipId(membershipNumber);
            memberInfo.setLevel(level);
            memberInfo.setPoints(Util.parseInt(points, Util.UNKNOWN_POINTS));
            memberInfo.setQualifyingNights(qualNights);
            memberInfo.setStaysLast12Months(stays);
            memberInfo.setLastUpdated(new Date());

            return memberInfo;

        } catch (IOException e) {
            throw new ScandicHtmlException(e);
        }

    }

    private static String getStringFromNode(Element accountOverview, String id, String defaultValue) {
        Element node = accountOverview.getElementById(id);
        if (node == null)
            return defaultValue;
        return Util.trimIfNonNull(node.text());
    }

    private List<ScandicStay> getStays(Element accountOverview) {
        Element tableNode = accountOverview
                .getElementById("ctl00_MainBodyRegion_AccountOverview1_tableTransactions");

        if (tableNode == null) {
            return Collections.emptyList();
        }

        Elements trs = tableNode.getElementsByTag("tr");

        List<ScandicStay> stays = new ArrayList<ScandicStay>();
        int order = 0;
        for (Element tr : trs) {
            if (tr.getElementsByTag("th").isEmpty()) {
                Elements tds = tr.getElementsByTag("td");
                if (tds.size() == 3) {
                    String location = Util.trimIfNonNull(tds.get(0).text());
                    String date = Util.trimIfNonNull(tds.get(1).text());
                    String stayPoints = Util.trimIfNonNull(tds.get(2).text());
                    ScandicStay stay = new ScandicStay();

                    Date[] dates = Util.parseDates(date);
                    int numNights = Util.daysBetween(dates[0], dates[1]);

                    stay.setHotelName(location);
                    stay.setNumPoints(Integer.parseInt(stayPoints));
                    stay.setFromDate(dates[0]);
                    stay.setToDate(dates[1]);
                    stay.setNumNights(numNights);
                    stay.setHtmlOrder(order);
                    stays.add(stay);
                    order++;
                } else {
                    throw new ScandicHtmlException(
                            "unknown table node, html is funky.  could hide row if this is a serious problem.");
                }
            }
        }

        return stays;
    }

}