Example usage for org.jsoup.nodes Element attr

List of usage examples for org.jsoup.nodes Element attr

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.attr.

Prototype

public String attr(String attributeKey) 

Source Link

Document

Get an attribute's value by its key.

Usage

From source file:net.niyonkuru.koodroid.html.SubscribersHandler.java

/**
 * Builds one content-provider operation per subscriber entry found in the document.
 *
 * <p>Starts at the first element matching {@code div#banSelector li:has(div)} and then walks
 * every following sibling element. For each entry the text after the last space is reduced to
 * its digits and used as the subscriber id; the text before it is capitalized word by word and
 * used as the full name. An entry that already exists (per {@code subscriberExists}) becomes an
 * update, otherwise an insert. An entry without an {@code onClick} attribute is treated as the
 * default subscriber and is written to the settings through {@code resolver} immediately.
 *
 * @param doc      the parsed page to read the subscriber list from
 * @param resolver used to check for existing subscribers and to store the default subscriber
 * @return the operations to apply, one per subscriber entry; never empty
 * @throws HandlerException if a subscriber id does not consist of exactly 10 digits, if an
 *                          {@code onClick} value does not contain a {@code /...'} url span, or
 *                          if no subscriber entry was found at all
 */
@Override
public ArrayList<ContentProviderOperation> parse(Document doc, ContentResolver resolver)
        throws HandlerException {
    final ArrayList<ContentProviderOperation> batch = new ArrayList<ContentProviderOperation>();

    Element subscriberLi = doc.select("div#banSelector li:has(div)").first();
    while (subscriberLi != null) {
        String text = subscriberLi.text();

        /* this assumes the name and phone number are separated by a space */
        int separator = text.lastIndexOf(' ') + 1;

        String subscriberId = text.substring(separator).replaceAll("\\D", "");
        if (subscriberId.length() != 10) {
            throw new HandlerException(getString(R.string.parser_error_unexpected_input));
        }

        final ContentProviderOperation.Builder builder;

        final Uri subscriberUri = Subscribers.buildSubscriberUri(subscriberId);
        if (subscriberExists(subscriberUri, resolver)) {
            builder = ContentProviderOperation.newUpdate(subscriberUri);
            builder.withValue(Subscribers.UPDATED, System.currentTimeMillis());
        } else {
            builder = ContentProviderOperation.newInsert(Subscribers.CONTENT_URI);
        }
        builder.withValue(Subscribers.SUBSCRIBER_ID, subscriberId);

        /* capitalize every whitespace-separated part of the name */
        StringBuilder fullName = new StringBuilder();
        for (String name : text.substring(0, separator).split("\\s")) {
            fullName.append(ParserUtils.capitalize(name)).append(' ');
        }
        builder.withValue(Subscribers.SUBSCRIBER_FULL_NAME, fullName.toString().trim());

        if (subscriberLi.hasAttr("onClick")) {
            String switchUrl = subscriberLi.attr("onClick");

            /*
             * extract only the url: from the first '/' up to the last single quote. Report a
             * handler without that span as unexpected input instead of letting substring()
             * fail with an unchecked StringIndexOutOfBoundsException.
             */
            int urlStart = switchUrl.indexOf('/');
            int urlEnd = switchUrl.lastIndexOf('\'');
            if (urlStart < 0 || urlEnd < urlStart) {
                throw new HandlerException(getString(R.string.parser_error_unexpected_input));
            }
            builder.withValue(Subscribers.SUBSCRIBER_SWITCHER, switchUrl.substring(urlStart, urlEnd));
        } else { /* this is the default subscriber as it doesn't have a switcher url */
            ContentValues cv = new ContentValues(1);
            cv.put(Settings.SUBSCRIBER, subscriberId);

            resolver.insert(Settings.CONTENT_URI, cv);
        }
        builder.withValue(Subscribers.SUBSCRIBER_EMAIL, mParent);

        batch.add(builder.build());

        subscriberLi = subscriberLi.nextElementSibling();
    }
    if (batch.size() == 0) {
        throw new HandlerException(getString(R.string.parser_error_unexpected_input));
    }

    JSONObject metadata = new JSONObject();
    try {
        metadata.put("subscribers", batch.size());
        metadata.put("language", getString(R.string.locale));
    } catch (JSONException ignored) {
        /* best effort: the metadata is handed over with whatever could be stored */
    }
    Crittercism.setMetadata(metadata);
    Crittercism.setUsername(mParent);

    return batch;
}

From source file:de.geeksfactory.opacclient.apis.SISIS.java

/**
 * Reads the lent items from the rows matched by {@code .data tr} and appends them to
 * {@code media}.
 *
 * <p>The first matched row is skipped. Parsing stops as soon as a row contains the text
 * "keine Daten". For every other row the title is taken from the {@code strong} element in
 * the second cell; author, deadline, home branch and renewal information are filled in on a
 * best-effort basis, and the item is added even if parsing one of them fails.
 *
 * @param media  list the parsed items are appended to
 * @param doc    the page to parse; its base URI is set to {@code opac_url} so that
 *               {@code abs:href} lookups yield absolute links
 * @param offset prefixed (followed by {@code $}) to the query string stored as prolong data
 */
protected void parse_medialist(List<LentItem> media, Document doc, int offset) {
    Elements copytrs = doc.select(".data tr");
    doc.setBaseUri(opac_url);

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    int trs = copytrs.size();
    if (trs == 1) {
        return;
    }
    assert (trs > 0);
    for (int i = 1; i < trs; i++) {
        Element tr = copytrs.get(i);
        LentItem item = new LentItem();

        if (tr.text().contains("keine Daten")) {
            return;
        }

        item.setTitle(tr.child(1).select("strong").text().trim());
        try {
            // The author follows the first <br> of the second cell. A cell without a <br>
            // simply has no author; it must not abort the rest of the row.
            String[] col1split = tr.child(1).html().split("<br[ /]*>");
            if (col1split.length > 1) {
                item.setAuthor(col1split[1].trim());
            }

            String[] col2split = tr.child(2).html().split("<br[ /]*>");
            String deadline = col2split[0].trim();
            if (deadline.contains("-")) {
                // keep only the part after the dash
                deadline = deadline.split("-")[1].trim();
            }
            try {
                item.setDeadline(fmt.parseLocalDate(deadline).toString());
            } catch (IllegalArgumentException e1) {
                e1.printStackTrace();
            }

            if (col2split.length > 1) {
                item.setHomeBranch(col2split[1].trim());
            }

            Elements links = tr.select("a");
            if (links.size() > 0) {
                for (Element link : links) {
                    String href = link.attr("abs:href");
                    Map<String, String> hrefq = getQueryParamsFirst(href);
                    // Constant-first equals: a link without a methodToCall parameter is
                    // skipped instead of raising a NullPointerException that would stop
                    // the search before the renewal link is reached.
                    if ("renewalPossible".equals(hrefq.get("methodToCall"))) {
                        item.setProlongData(offset + "$" + href.split("\\?")[1]);
                        item.setRenewable(true);
                        break;
                    }
                }
            } else {
                Elements status = tr.select(".textrot, .textgruen, .textdunkelblau");
                if (status.size() > 0) {
                    item.setProlongData("" + status.text());
                    item.setRenewable(false);
                }
            }

        } catch (Exception ex) {
            // best effort: keep the item with whatever could be parsed so far
            ex.printStackTrace();
        }

        media.add(item);
    }
    // NOTE(review): this only holds if media was empty on entry -- confirm against callers
    assert (media.size() == trs - 1);

}

From source file:org.shareok.data.sagedata.SageJournalIssueDateProcessor.java

@SuppressWarnings("empty-statement")
public void retrieveSageJournalVolIssueDates(Map<String, String> processedJournalsMap) {
    List<String> processedJournals = new ArrayList<>();
    //        JSONObject jsonObj = getSavedSageJournalVolIssueDateInformation();
    try {//from  ww w  .j  a va  2s  . co m
        Map<String, Map<String, String>> journalMap = getSavedSageJournalVolIssueDateInformation();
        if (null == journalMap) {
            journalMap = new HashMap<>();
        }
        Document doc = null;
        try {
            doc = Jsoup.connect("http://journals.sagepub.com/action/showPublications?pageSize=20&startPage=199")
                    .userAgent(
                            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36")
                    .cookie("auth", "token").timeout(300000).get();
            Elements trs = doc.select("form#browsePublicationsForm").get(0).select("table").get(0)
                    .select("tbody").get(0).select("tr");
            for (Element tr : trs) {
                Element link = tr.select("td").get(1).select("a").get(0);
                String journalName = link.text();
                String journalLink = SageDataUtil.SAGE_HTTP_PREFIX + link.attr("href");
                String[] linkInfo = journalLink.split("/");
                String journalIssuesLink = SageDataUtil.SAGE_HTTP_PREFIX + "/loi/"
                        + linkInfo[linkInfo.length - 1];
                if (null == journalMap.get(journalName)) {
                    Map<String, String> infoMap = new HashMap<>();
                    infoMap.put("homeLink", journalLink);
                    infoMap.put("issueLink", journalIssuesLink);
                    journalMap.put(journalName, infoMap);
                } else {
                    Map<String, String> infoMap = journalMap.get(journalName);
                    if (null == infoMap.get("homeLink")) {
                        infoMap.put("homeLink", journalLink);
                    }
                    if (null == infoMap.get("issueLink")) {
                        infoMap.put("issueLink", journalIssuesLink);
                    }
                }
            }
            int kk = 0;
            mainLoop: for (String journal : journalMap.keySet()) {
                System.out.println("Print out journal " + journal + " information :");
                if (null != processedJournalsMap && (journal == null ? processedJournalsMap.get(journal) == null
                        : journal.equals(processedJournalsMap.get(journal)))) {
                    System.out.println("Journal : has already been processed!");
                    continue;
                }
                //                    if(journal.contains("Christian Education")){
                //                        System.out.println("Journal name : International Journal of Health Services, cannot be processed!");
                ////                        continue;
                //                    }
                //                    if(journal.contains("Plastic Surgery")){
                //                        System.out.println("Journal name : International Journal of Health Services, cannot be processed!");
                //                        continue;
                //                    }
                Map<String, String> journalInfoMap = journalMap.get(journal);
                for (String key : journalInfoMap.keySet()) {
                    if (key.equals("issueLink")) {
                        Document loiDdoc = null;
                        try {
                            loiDdoc = Jsoup.connect(journalInfoMap.get(key)).userAgent(
                                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36")
                                    .cookie("auth", "token").timeout(300000).get();
                        } catch (HttpStatusException ex) {
                            ex.printStackTrace();
                            break;
                        }
                        Thread.sleep(2200);
                        if (null != loiDdoc) {
                            Map<String, Map<String, String>> dataMap;
                            if (null != journalMap.get(journal).get("data")) {
                                dataMap = DataUtil.getMapFromJson(journalMap.get(journal).get("data"));
                            } else {
                                dataMap = new HashMap<>();
                            }
                            Elements decaseDivs = loiDdoc.select("div.decade");
                            if (null != decaseDivs && decaseDivs.size() > 0) {
                                for (Element decade : decaseDivs) {
                                    Elements yearsDiv = decade.select("div.years").get(0).children();
                                    if (null != yearsDiv && yearsDiv.size() > 0) {
                                        for (Element yearEle : yearsDiv) {
                                            Elements volumesDiv = yearEle.select("div.volumes").get(0)
                                                    .children();
                                            if (null != volumesDiv && volumesDiv.size() > 0) {
                                                for (Element volumeEle : volumesDiv) {
                                                    String volume = volumeEle.select("a").get(0).text().trim()
                                                            .split("Volume")[1].trim();
                                                    Elements issueInfoDivEles = volumeEle
                                                            .select("div.js_issue");
                                                    if (null != issueInfoDivEles
                                                            && issueInfoDivEles.size() > 0) {
                                                        for (Element issueInfoDiv : issueInfoDivEles) {
                                                            String issueText = issueInfoDiv.select("a").get(0)
                                                                    .text();
                                                            issueText = issueText.split(", ")[0]
                                                                    .split("Issue")[1].trim();
                                                            String oldIssueDate = "";
                                                            String issueDate = "";
                                                            if (NO_ARTICLE_PUB_DATE_JOURNALS_LIST
                                                                    .contains(journal)) {
                                                                issueDate = "01 " + issueInfoDiv
                                                                        .select("span.loiIssueCoverDateText")
                                                                        .get(0).text().trim();
                                                                oldIssueDate = issueDate;
                                                                //                                                            if(issueDate.contains("Winter")){
                                                                //                                                                issueDate = issueDate.replaceAll("Winter", "October");
                                                                //                                                            }
                                                                //                                                            if(issueDate.contains("Fall") || issueDate.contains("Autumn")){
                                                                //                                                                issueDate = issueDate.replaceAll("Fall", "September");
                                                                //                                                                issueDate = issueDate.replaceAll("Autumn", "September");
                                                                //                                                            }
                                                                //                                                            if(issueDate.contains("Summer")){
                                                                //                                                                issueDate = issueDate.replaceAll("Summer", "April");
                                                                //                                                            }
                                                                //                                                            if(issueDate.contains("Spring")){
                                                                //                                                                issueDate = issueDate.replaceAll("Spring", "January");
                                                                //                                                            }
                                                                //                                                            try{                                                            
                                                                //                                                                // for date string like "01 July-October 2016"
                                                                //                                                                if(issueDate.contains("-")){
                                                                //                                                                    String[] dateInfo = issueDate.split("-");
                                                                //                                                                    issueDate = dateInfo[0] + " " + dateInfo[1].split(" ")[1];
                                                                //                                                                }
                                                                //                                                                // for date string like "01 July/October 2016"
                                                                //                                                                if(issueDate.contains("/")){
                                                                //                                                                    String[] dataInfo = issueDate.split("/");
                                                                //                                                                    issueDate = dataInfo[0] + " " + dataInfo[1].split(" ")[1];
                                                                //                                                                }
                                                                //                                                            }
                                                                //                                                            catch(ArrayIndexOutOfBoundsException ex){
                                                                //                                                                System.out.println("Journal name: "+journal);
                                                                //                                                                System.out.println("Volume: "+volume+", issue: "+issueText);
                                                                //                                                                System.out.println("This date string cannot be parsed: "+oldIssueDate);
                                                                //                                                                ex.printStackTrace();
                                                                //                                                                continue;
                                                                //                                                            }
                                                                try {
                                                                    issueDate = "01 " + issueInfoDiv.select(
                                                                            "span.loiIssueCoverDateText").get(0)
                                                                            .text().trim();
                                                                    oldIssueDate = issueDate;
                                                                    issueDate = DataHandlersUtil
                                                                            .convertFullMonthDateStringFormat(
                                                                                    issueDate);
                                                                } catch (ParseException ex) {
                                                                    //                                                                if(!journal.contains("OMEGA - Journal of Death and Dying")){
                                                                    //                                                                    continue;
                                                                    //                                                                }
                                                                    System.out.println(
                                                                            "Journal name: " + journal);
                                                                    System.out.println("Volume: " + volume
                                                                            + ", issue: " + issueText);
                                                                    System.out.println(
                                                                            "This date string cannot be parsed: "
                                                                                    + oldIssueDate);
                                                                    ex.printStackTrace();
                                                                    continue;
                                                                }

                                                            } else {
                                                                try {
                                                                    Element issueLinkEle = issueInfoDiv
                                                                            .select("a").get(0);
                                                                    String issueLink = issueLinkEle
                                                                            .attr("href");
                                                                    Document issueDoc = null;
                                                                    try {
                                                                        issueDoc = Jsoup.connect(issueLink)
                                                                                .userAgent(
                                                                                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36")
                                                                                .cookie("auth", "token")
                                                                                .timeout(300000).get();
                                                                    } catch (HttpStatusException ex) {
                                                                        ex.printStackTrace();
                                                                        break mainLoop;
                                                                    }
                                                                    Thread.sleep(2200);
                                                                    Elements articleDivs = issueDoc
                                                                            .select("div.art_title, .linkable");
                                                                    String articleLink = SageDataUtil.SAGE_HTTP_PREFIX
                                                                            + articleDivs.get(0)
                                                                                    .select("a.ref, .nowrap")
                                                                                    .get(0).attr("href");
                                                                    if (articleLink.contains("pdf/")) {
                                                                        System.out.println("journal: " + journal
                                                                                + " volume=" + volume
                                                                                + " issue=" + issueText
                                                                                + " has ONLY PDF links!");
                                                                        try {
                                                                            issueDate = issueInfoDiv.select(
                                                                                    "span.loiIssueCoverDateText")
                                                                                    .get(0).text().trim();
                                                                            oldIssueDate = issueDate;
                                                                            if (issueDate.contains("Winter")) {
                                                                                issueDate = issueDate
                                                                                        .replaceAll("Winter",
                                                                                                "December");
                                                                            }
                                                                            if (issueDate.contains("Fall")
                                                                                    || issueDate.contains(
                                                                                            "Autumn")) {
                                                                                issueDate = issueDate
                                                                                        .replaceAll("Fall",
                                                                                                "September");
                                                                                issueDate = issueDate
                                                                                        .replaceAll("Autumn",
                                                                                                "September");
                                                                            }
                                                                            if (issueDate.contains("Summer")) {
                                                                                issueDate = issueDate
                                                                                        .replaceAll("Summer",
                                                                                                "June");
                                                                            }
                                                                            if (issueDate.contains("Spring")) {
                                                                                issueDate = issueDate
                                                                                        .replaceAll("Spring",
                                                                                                "March");
                                                                            }
                                                                            if (issueDate.contains("/")) {
                                                                                String[] dataInfo = issueDate
                                                                                        .split("/");
                                                                                String dateInfo1 = dataInfo[0]
                                                                                        .trim();
                                                                                String date;
                                                                                String month1;
                                                                                String[] dateInfo1Arr = dateInfo1
                                                                                        .split(" ");
                                                                                if (dateInfo1Arr.length == 2) {
                                                                                    date = dateInfo1Arr[0];
                                                                                    month1 = dateInfo1Arr[1];
                                                                                } else {
                                                                                    date = "01";
                                                                                    month1 = dataInfo[0].trim();
                                                                                }
                                                                                String month2 = dataInfo[1]
                                                                                        .split("\\s+")[0];
                                                                                String year = dataInfo[1]
                                                                                        .split("\\s+")[1];
                                                                                String date1 = DataHandlersUtil
                                                                                        .convertFullMonthDateStringFormat(
                                                                                                date + " "
                                                                                                        + month1
                                                                                                        + " "
                                                                                                        + year);
                                                                                String date2 = DataHandlersUtil
                                                                                        .convertFullMonthDateStringFormat(
                                                                                                date + " "
                                                                                                        + month2
                                                                                                        + " "
                                                                                                        + year);
                                                                                issueDate = date1 + "::"
                                                                                        + date2;
                                                                            }
                                                                            //  The Journal of Psychiatry & Law dd MMMM-MMMM yyyy pattern
                                                                            else if (issueDate.contains("-")) {
                                                                                if (journal.equals(
                                                                                        "OMEGA - Journal of Death and Dying")) {
                                                                                    Document articleDoc = null;
                                                                                    try {
                                                                                        articleDoc = Jsoup
                                                                                                .connect(
                                                                                                        articleLink)
                                                                                                .userAgent(
                                                                                                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36")
                                                                                                .cookie("auth",
                                                                                                        "token")
                                                                                                .timeout(300000)
                                                                                                .get();
                                                                                    } catch (HttpStatusException ex) {
                                                                                        ex.printStackTrace();
                                                                                        break mainLoop;
                                                                                    }
                                                                                    Thread.sleep(2200);
                                                                                    Element pubDateDiv = articleDoc
                                                                                            .select("div.published-dates")
                                                                                            .get(0);
                                                                                    issueDate = pubDateDiv
                                                                                            .text()
                                                                                            .split("Issue published:")[1]
                                                                                                    .trim();
                                                                                    oldIssueDate = issueDate;
                                                                                    issueDate = DataHandlersUtil
                                                                                            .convertFullMonthDateStringFormat(
                                                                                                    issueDate);
                                                                                } else {
                                                                                    String[] dataInfo = issueDate
                                                                                            .split("-");
                                                                                    String dateInfo1 = dataInfo[0]
                                                                                            .trim();
                                                                                    String date;
                                                                                    String month1;
                                                                                    String[] dateInfo1Arr = dateInfo1
                                                                                            .split(" ");
                                                                                    if (dateInfo1Arr.length == 2) {
                                                                                        date = dateInfo1Arr[0]
                                                                                                .trim();
                                                                                        month1 = dateInfo1Arr[1]
                                                                                                .trim();
                                                                                    } else {
                                                                                        date = "01";
                                                                                        month1 = dataInfo[0]
                                                                                                .trim();
                                                                                    }
                                                                                    String month2 = dataInfo[1]
                                                                                            .split("\\s+")[0];
                                                                                    String year = dataInfo[1]
                                                                                            .split("\\s+")[1];
                                                                                    String date1 = DataHandlersUtil
                                                                                            .convertFullMonthDateStringFormat(
                                                                                                    date + " "
                                                                                                            + month1
                                                                                                            + " "
                                                                                                            + year);
                                                                                    String date2 = DataHandlersUtil
                                                                                            .convertFullMonthDateStringFormat(
                                                                                                    date + " "
                                                                                                            + month2
                                                                                                            + " "
                                                                                                            + year);
                                                                                    issueDate = date1 + "::"
                                                                                            + date2;
                                                                                }
                                                                            } else {
                                                                                issueDate = "01 " + issueDate;
                                                                                issueDate = DataHandlersUtil
                                                                                        .convertFullMonthDateStringFormat(
                                                                                                issueDate);
                                                                            }
                                                                        } catch (ParseException
                                                                                | ArrayIndexOutOfBoundsException ex) {
                                                                            System.out.println(
                                                                                    "Journal name: " + journal);
                                                                            System.out.println("Volume: "
                                                                                    + volume + ", issue: "
                                                                                    + issueText);
                                                                            System.out.println(
                                                                                    "This date string cannot be parsed: "
                                                                                            + issueDate);
                                                                            ex.printStackTrace();
                                                                            continue;
                                                                        }
                                                                    } else {
                                                                        Document articleDoc = null;
                                                                        try {
                                                                            articleDoc = Jsoup
                                                                                    .connect(articleLink)
                                                                                    .userAgent(
                                                                                            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36")
                                                                                    .cookie("auth", "token")
                                                                                    .timeout(300000).get();
                                                                        } catch (HttpStatusException ex) {
                                                                            ex.printStackTrace();
                                                                            break mainLoop;
                                                                        }
                                                                        Thread.sleep(2200);
                                                                        Element pubDateDiv = articleDoc
                                                                                .select("div.published-dates")
                                                                                .get(0);
                                                                        issueDate = pubDateDiv.text()
                                                                                .split("Issue published:")[1]
                                                                                        .trim();
                                                                        oldIssueDate = issueDate;
                                                                        issueDate = DataHandlersUtil
                                                                                .convertFullMonthDateStringFormat(
                                                                                        issueDate);
                                                                    }

                                                                } catch (Exception ex) {
                                                                    logger.error(
                                                                            "Cannot get the issue date for journal ="
                                                                                    + journal + " volume="
                                                                                    + volume + " issue="
                                                                                    + issueText + " date="
                                                                                    + oldIssueDate,
                                                                            ex);
                                                                    continue;
                                                                }
                                                            }
                                                            if (DataHandlersUtil.datesCompare(issueDate,
                                                                    "2010-01-01") < 0) {
                                                                if (dataMap.size() > 0) {
                                                                    ObjectMapper mapper = new ObjectMapper();
                                                                    String json = mapper
                                                                            .writeValueAsString(dataMap);
                                                                    journalInfoMap.put("data", json);
                                                                }
                                                                processedJournals.add(journal);
                                                                continue mainLoop;
                                                            }
                                                            try {
                                                                if (null != dataMap && dataMap.size() > 0
                                                                        && null != dataMap.get(volume)
                                                                        && null != dataMap.get(volume)
                                                                                .get(issueText)) {
                                                                    continue;
                                                                } else {
                                                                    Map<String, String> issueMap = dataMap
                                                                            .get(volume);
                                                                    if (null == issueMap) {
                                                                        issueMap = new HashMap<>();
                                                                        issueMap.put(issueText, issueDate);
                                                                        dataMap.put(volume, issueMap);
                                                                    } else {
                                                                        issueMap.put(issueText, issueDate);
                                                                    }
                                                                    System.out.println("This is vol. " + volume
                                                                            + " and issue " + issueText
                                                                            + " and date " + issueDate);
                                                                }
                                                            } catch (Exception ex) {
                                                                System.out.println(
                                                                        "Cannot add the pub date info into data map for vol. "
                                                                                + volume + " and issue "
                                                                                + issueText + " and date "
                                                                                + issueDate);
                                                            }
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }

                                }
                            }
                            if (dataMap.size() > 0) {
                                ObjectMapper mapper = new ObjectMapper();
                                String json = mapper.writeValueAsString(dataMap);
                                journalInfoMap.put("data", json);
                            }
                        }

                    }
                }
                processedJournals.add(journal);
                if (kk > 100) {
                    break;
                }
                kk++;
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        ObjectMapper mapper = new ObjectMapper();
        String json = mapper.writeValueAsString(journalMap);
        String sageJournalIssueDateInfoFilePath = ShareokdataManager.getSageJournalIssueDateInfoFilePath();
        File sageFile = new File(sageJournalIssueDateInfoFilePath);
        if (sageFile.exists()) {
            String sageJournalIssueDateInfoFilePathOld = sageJournalIssueDateInfoFilePath.split("\\.")[0] + "_"
                    + DataHandlersUtil.getCurrentTimeString() + ".json";
            sageFile.renameTo(new File(sageJournalIssueDateInfoFilePathOld));
        }
        DocumentProcessorUtil.outputStringToFile(json,
                ShareokdataManager.getSageJournalIssueDateInfoFilePath());
        System.out.println("processed journals = " + mapper.writeValueAsString(processedJournals));
    } catch (Exception ex) {
        logger.error("Cannot process the issue dates.", ex);
    }
}

From source file:com.vaushell.shaarlijavaapi.ShaarliClient.java

/**
 * Extracts a trimmed piece of text from {@code source} as described by a named template.
 *
 * <p>The template is looked up in {@code templates}. Its {@code cssPath} (if non-empty) selects
 * the first matching descendant, its {@code attribut} (if non-empty) chooses an attribute value
 * instead of the element text, and its {@code regex} (if non-empty) narrows the result to the
 * first match when there is one.</p>
 *
 * @param source       element to extract from; must not be {@code null}
 * @param templateName name of the template to apply
 * @return the extracted, trimmed text, or {@code null} when the CSS path matches nothing or the
 *         resulting text is empty
 * @throws IllegalArgumentException if {@code source} is {@code null} or the template is unknown
 */
private String extract(final Element source, final String templateName) {
    if (source == null) {
        throw new IllegalArgumentException();
    }

    final ShaarliTemplates.Template template = templates.get(templateName);
    if (template == null) {
        throw new IllegalArgumentException("template '" + templateName + "' not found");
    }

    // An empty CSS path means the source element itself is the extraction target.
    Element target = source;
    if (!template.cssPath.isEmpty()) {
        final Elements matches = source.select(template.cssPath);
        if (matches.isEmpty()) {
            return null;
        }
        target = matches.first();
    }

    // Either the element text or one attribute value, depending on the template.
    final String raw = template.attribut.isEmpty() ? target.text() : target.attr(template.attribut);
    if (raw == null) {
        return null;
    }

    String result = raw.trim();
    if (!template.regex.isEmpty()) {
        final Matcher matcher = Pattern.compile(template.regex).matcher(result);
        // When the regex does not match, the unfiltered text is kept.
        if (matcher.find()) {
            result = matcher.group().trim();
        }
    }

    return result.isEmpty() ? null : result;
}

From source file:de.geeksfactory.opacclient.apis.SISIS.java

/**
 * Builds the list of search fields offered by the extended search page.
 *
 * <p>Calls {@code start()} first when the API is not yet initialised, then downloads the search
 * page, turns every option of the {@code searchCategories[0]} select into a text search field
 * and hands every select inside {@code #tab-content} to {@code parseDropdown}.</p>
 *
 * @return the collected search fields
 * @throws IOException   propagated from the helpers this method calls
 * @throws JSONException propagated from the helpers this method calls
 */
public List<SearchField> getSearchFields() throws IOException, JSONException {
    if (!initialised) {
        start();
    }

    final String searchPageUrl = opac_url + "/search.do?methodToCall=switchSearchPage&SearchType=2";
    final Document searchPage = Jsoup.parse(httpGet(searchPageUrl, ENCODING));
    final List<SearchField> result = new ArrayList<>();

    // One free-text field per search category option.
    for (Element categoryOption : searchPage.select("select[name=searchCategories[0]] option")) {
        final TextSearchField textField = new TextSearchField();
        textField.setDisplayName(categoryOption.text());
        textField.setId(categoryOption.attr("value"));
        textField.setHint("");
        result.add(textField);
    }

    // Dropdown fields are appended to the same list by the helper.
    final Elements dropdowns = searchPage.select("#tab-content select");
    for (Element dropdown : dropdowns) {
        parseDropdown(dropdown, result);
    }

    return result;
}

From source file:ExtractorContentTest.java

/**
 * Collects every "Comparison" link reachable from the prefix index, prints how many were found
 * and assembles a "Title ; URL" listing in memory.
 *
 * @throws IOException propagated from {@code _collectAllComparisonOf}
 */
@Test
public void collectAllComparisonOf() throws IOException {

    final List<Element> comparisonLinks = new ArrayList<Element>();
    _collectAllComparisonOf(
            "/w/index.php?title=Special%3APrefixIndex&prefix=Comparison&namespace=0&hideredirects=1",
            comparisonLinks);

    System.err.println("#hrefs=" + comparisonLinks.size());

    final StringBuilder csv = new StringBuilder();
    csv.append("Title ; URL\n"); // header row
    for (Element link : comparisonLinks) {
        final String title = link.attr("title");
        final String target = link.attr("href");
        csv.append("" + title).append(" ; ").append(URL_BASE_NAME + target).append("\n");
    }

    // The listing is only built in memory; writing it out (e.g. to comparisonsData.csv)
    // is currently disabled.

}

From source file:de.geeksfactory.opacclient.apis.SISIS.java

/**
 * Loads the account overview for a user: lent media, orders, pre-bookings, pending fees and
 * the validity date.
 *
 * <p>After logging in, the account pages {@code typ=1} (lent media), {@code typ=6} (orders) and
 * {@code typ=7} (pre-bookings) are fetched. For each of them the links inside the first
 * {@code .box-right} element are inspected and every link with {@code methodToCall=pos} and an
 * {@code anzPos} other than {@code 1} is fetched and parsed as a further result page.</p>
 *
 * @param acc the account to log in with
 * @return the collected account data, or {@code null} when {@code login(acc)} returns false
 * @throws IOException        propagated from the helpers this method calls
 * @throws JSONException      propagated from the helpers this method calls
 * @throws OpacErrorException propagated from the helpers this method calls
 */
@Override
public AccountData account(Account acc) throws IOException, JSONException, OpacErrorException {
    start(); // TODO: Is this necessary?

    // Extracts the number in parentheses from a tab label such as "... (12) ...".
    final String countInParens = ".*\\(([0-9]*)\\).*";
    // Fee label; the umlaut is matched loosely so the check does not depend on the encoding.
    final String feeLabel = "Geb.+hren[^\\(]+\\(([0-9.,]+)[^0-9A-Z]*(|EUR|CHF|Fr)\\)";

    int resultNum;

    if (!login(acc)) {
        return null;
    }

    // Lent media
    String html = httpGet(opac_url + "/userAccount.do?methodToCall=showAccount&typ=1", ENCODING);
    List<LentItem> medien = new ArrayList<>();
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);
    parse_medialist(medien, doc, 1);
    if (doc.select(".box-right").size() > 0) {
        for (Element link : doc.select(".box-right").first().select("a")) {
            String href = link.attr("abs:href");
            Map<String, String> hrefq = getQueryParamsFirst(href);
            if (hrefq == null || hrefq.get("methodToCall") == null) {
                continue;
            }
            if (hrefq.get("methodToCall").equals("pos") && !"1".equals(hrefq.get("anzPos"))) {
                html = httpGet(href, ENCODING);
                parse_medialist(medien, Jsoup.parse(html), Integer.parseInt(hrefq.get("anzPos")));
            }
        }
    }
    if (doc.select("#label1").size() > 0) {
        resultNum = 0;
        String rNum = doc.select("#label1").first().text().trim().replaceAll(countInParens, "$1");
        if (rNum.length() > 0) {
            resultNum = Integer.parseInt(rNum);
        }

        // Sanity check only; active when assertions are enabled.
        assert (resultNum == medien.size());
    }

    // Ordered media ("Bestellungen")
    html = httpGet(opac_url + "/userAccount.do?methodToCall=showAccount&typ=6", ENCODING);
    List<ReservedItem> reserved = new ArrayList<>();
    doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);
    parse_reslist("6", reserved, doc, 1);
    // Remember the label of this page; doc is replaced by the typ=7 page below.
    Elements label6 = doc.select("#label6");
    if (doc.select(".box-right").size() > 0) {
        for (Element link : doc.select(".box-right").first().select("a")) {
            String href = link.attr("abs:href");
            Map<String, String> hrefq = getQueryParamsFirst(href);
            if (hrefq == null || hrefq.get("methodToCall") == null) {
                // NOTE(review): the lent-media loop above uses `continue` here, this one stops
                // at the first such link - confirm which behaviour is intended.
                break;
            }
            if (hrefq.get("methodToCall").equals("pos") && !"1".equals(hrefq.get("anzPos"))) {
                html = httpGet(href, ENCODING);
                parse_reslist("6", reserved, Jsoup.parse(html), Integer.parseInt(hrefq.get("anzPos")));
            }
        }
    }

    // Prebooked media ("Vormerkungen")
    html = httpGet(opac_url + "/userAccount.do?methodToCall=showAccount&typ=7", ENCODING);
    doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);
    parse_reslist("7", reserved, doc, 1);
    if (doc.select(".box-right").size() > 0) {
        for (Element link : doc.select(".box-right").first().select("a")) {
            String href = link.attr("abs:href");
            Map<String, String> hrefq = getQueryParamsFirst(href);
            if (hrefq == null || hrefq.get("methodToCall") == null) {
                // NOTE(review): same `break` vs `continue` question as in the orders loop.
                break;
            }
            if (hrefq.get("methodToCall").equals("pos") && !"1".equals(hrefq.get("anzPos"))) {
                html = httpGet(href, ENCODING);
                parse_reslist("7", reserved, Jsoup.parse(html), Integer.parseInt(hrefq.get("anzPos")));
            }
        }
    }
    if (label6.size() > 0 && doc.select("#label7").size() > 0) {
        // Orders and pre-bookings share one list, so their label counts are summed.
        resultNum = 0;
        String rNum = label6.text().trim().replaceAll(countInParens, "$1");
        if (rNum.length() > 0) {
            resultNum = Integer.parseInt(rNum);
        }
        rNum = doc.select("#label7").text().trim().replaceAll(countInParens, "$1");
        if (rNum.length() > 0) {
            resultNum += Integer.parseInt(rNum);
        }
        assert (resultNum == reserved.size());
    }

    AccountData res = new AccountData(acc.getId());

    if (doc.select("#label8").size() > 0) {
        String text = doc.select("#label8").first().text().trim();
        if (text.matches(feeLabel)) {
            // Keep only "<amount> <currency>".
            text = text.replaceAll(feeLabel, "$1 $2");
            res.setPendingFees(text);
        }
    }

    // Strips everything except digits and dots from the validity box text.
    Pattern p = Pattern.compile("[^0-9.]*", Pattern.MULTILINE);
    // Heading of the annual-fee box ("Jahresgebuehren", spelled with a u-umlaut). The previous
    // literal had lost the umlaut and could never match the real heading; the umlaut is matched
    // loosely here, in the same spirit as the fee label pattern above.
    Pattern annualFeeHeading = Pattern.compile("Jahresgeb.{0,2}hren");
    if (doc.select(".box3").size() > 0) {
        for (Element box : doc.select(".box3")) {
            if (box.select("strong").size() == 1) {
                String text = box.select("strong").text();
                if (annualFeeHeading.matcher(text).matches()) {
                    text = box.text();
                    text = p.matcher(text).replaceAll("");
                    res.setValidUntil(text);
                }
            }

        }
    }

    res.setLent(medien);
    res.setReservations(reserved);
    return res;
}

From source file:ExtractorContentTest.java

/**
 * Walks over all collected "Comparison" pages, computes the statistics of each page that is not
 * excluded and prints them to standard error, followed by the number of pages with at least
 * one table.
 *
 * @throws Exception propagated from the helpers this test calls
 */
@Test
public void testStatistics() throws Exception {

    final List<Element> comparisonLinks = new ArrayList<Element>();
    _collectAllComparisonOf(
            "/w/index.php?title=Special%3APrefixIndex&prefix=Comparison&namespace=0&hideredirects=1",
            comparisonLinks);

    final int wikiPrefixLength = "/wiki/".length();
    int pageIndex = 0; // index of the current comparison page
    int relevantCount = 0;
    for (Element link : comparisonLinks) {
        // The page name is whatever follows the "/wiki/" prefix of the link target.
        final String pageName = link.attr("href").substring(wikiPrefixLength);
        System.err.println("(" + pageIndex + ") " + pageName);
        pageIndex++;

        if (excludePCMs.contains(pageName)) {
            System.err.println("Ignoring");
            continue;
        }

        final PCMStatistic stat = computeStatistic(pageName);

        // The statistics are only exploited by printing them.
        final int tableCount = stat.getNumbersOfTables();
        System.err.println("numbers of tables:" + tableCount);

        if (tableCount > 0) {
            relevantCount++;
        }

        int tableIndex = 1;
        for (CatalogStat catalogStat : stat.getCatalogStats()) {
            System.err.println("table(" + tableIndex + ")");
            tableIndex++;
            System.err.println("#headers=" + catalogStat.getNumbersOfHeaders());
            System.err.println("#products=" + catalogStat.getNumbersOfProduct());
        }
        System.err.println("\n\n\n");

    }

    System.err.println("number of relevant PCMs: " + relevantCount);

}

From source file:ExtractorContentTest.java

/**
 * Collects the "Comparison" article links from an index page and from every following page
 * reached through a "Next page" link.
 *
 * <p>A link is collected when its {@code title} contains "Comparison" and its {@code href}
 * starts with {@code /wiki/}. When a page has several "Next page" links, the last one found
 * is followed.</p>
 *
 * @param url   path of the first index page, appended to {@code URL_BASE_NAME}
 * @param hrefs list that receives the matching link elements
 * @throws IOException if fetching a page fails
 */
private void _collectAllComparisonOf(String url, List<Element> hrefs) throws IOException {

    String pageUrl = url;
    do {
        final Document page = Jsoup.connect("" + URL_BASE_NAME + pageUrl).get();

        Element nextPageLink = null;
        for (Element anchor : page.select("a[href]")) {
            final Element withHref = anchor.getElementsByAttribute("href").first();
            final String title = withHref.attr("title");
            final String target = withHref.attr("href");

            final boolean isComparisonArticle = title.contains("Comparison") && target.startsWith("/wiki/");
            if (isComparisonArticle) {
                hrefs.add(anchor);
            }

            final String label = anchor.text();
            final boolean isNextPage = label.contains("Next page") && target.startsWith("/w/index.php?");
            if (isNextPage) {
                nextPageLink = anchor;
            }
        }

        // Continue with the following index page, if this page linked to one.
        pageUrl = (nextPageLink == null) ? null : nextPageLink.attr("href");
    } while (pageUrl != null);

}

From source file:be.ibridge.kettle.jsoup.JsoupInput.java

/**
 * Builds the output row for the current record ({@code data.recordnr}) and advances the record
 * counter.
 *
 * <p>For every input field the matching element is taken from {@code data.resultList}, rendered
 * according to the field's element/result type, trimmed, converted to the target value type and
 * stored at column {@code data.totalpreviousfields + i}. Afterwards the optional file-related
 * columns (filename, row number, short filename, ...) are written starting at column
 * {@code data.nrInputFields}.</p>
 *
 * @return the populated row
 * @throws KettleException propagated from the value conversion and row helpers
 */
private Object[] buildRow() throws KettleException {
    // Start from an empty row, or from a copy of the incoming row when there is one.
    Object[] outputRowData = buildEmptyRow();

    if (data.readrow != null) {
        outputRowData = data.readrow.clone();
    }

    // Read fields...
    for (int i = 0; i < data.nrInputFields; i++) {
        JsoupInputField field = meta.getInputFields()[i];
        // Column of this field in the output row.
        final int column = data.totalpreviousfields + i;

        // Elements selected for this field; one entry per record.
        Elements jsoupa = data.resultList.get(i);
        String nodevalue = null;
        if (jsoupa != null) {
            Element jo = jsoupa.get(data.recordnr);
            if (jo != null) {

                // Do Element Type
                switch (field.getElementType()) {
                case JsoupInputField.ELEMENT_TYPE_NODE:
                    // Do Result Type
                    switch (field.getResultType()) {
                    case JsoupInputField.RESULT_TYPE_TEXT:
                        nodevalue = jo.text();
                        break;
                    case JsoupInputField.RESULT_TYPE_TYPE_OUTER_HTML:
                        nodevalue = jo.outerHtml();
                        break;
                    case JsoupInputField.RESULT_TYPE_TYPE_INNER_HTML:
                        nodevalue = jo.html();
                        break;
                    default:
                        nodevalue = jo.toString();
                        break;
                    }
                    break;
                case JsoupInputField.ELEMENT_TYPE_ATTRIBUT:
                    nodevalue = jo.attr(field.getAttribute());
                    break;
                default:
                    nodevalue = jo.toString();
                    break;
                }
            }
        }

        // Do trimming
        switch (field.getTrimType()) {
        case JsoupInputField.TYPE_TRIM_LEFT:
            nodevalue = Const.ltrim(nodevalue);
            break;
        case JsoupInputField.TYPE_TRIM_RIGHT:
            nodevalue = Const.rtrim(nodevalue);
            break;
        case JsoupInputField.TYPE_TRIM_BOTH:
            nodevalue = Const.trim(nodevalue);
            break;
        default:
            break;
        }

        if (meta.isInFields()) {
            // Add result field to input stream
            outputRowData = RowDataUtil.addValueData(outputRowData, column, nodevalue);
        }

        // Convert the extracted string to the data type of the output column.
        ValueMetaInterface targetValueMeta = data.outputRowMeta.getValueMeta(column);
        ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta(column);
        outputRowData[column] = targetValueMeta.convertData(sourceValueMeta, nodevalue);

        // A "repeated" field falls back to the previous row's value when it is empty.
        if (field.isRepeated()) {
            if (data.previousRow != null && Const.isEmpty(nodevalue)) {
                outputRowData[column] = data.previousRow[column];
            }
        }
    } // End of loop over fields...

    int rowIndex = data.nrInputFields;

    // See if we need to add the filename to the row...
    if (meta.includeFilename() && !Const.isEmpty(meta.getFilenameField())) {
        outputRowData[rowIndex++] = data.filename;
    }
    // See if we need to add the row number to the row...
    if (meta.includeRowNumber() && !Const.isEmpty(meta.getRowNumberField())) {
        outputRowData[rowIndex++] = Long.valueOf(data.rownr);
    }
    // Possibly add short filename...
    if (meta.getShortFileNameField() != null && meta.getShortFileNameField().length() > 0) {
        outputRowData[rowIndex++] = data.shortFilename;
    }
    // Add Extension
    if (meta.getExtensionField() != null && meta.getExtensionField().length() > 0) {
        outputRowData[rowIndex++] = data.extension;
    }
    // add path
    if (meta.getPathField() != null && meta.getPathField().length() > 0) {
        outputRowData[rowIndex++] = data.path;
    }
    // Add Size
    if (meta.getSizeField() != null && meta.getSizeField().length() > 0) {
        outputRowData[rowIndex++] = Long.valueOf(data.size);
    }
    // add Hidden
    if (meta.isHiddenField() != null && meta.isHiddenField().length() > 0) {
        // NOTE(review): the "hidden" column is derived from data.path, which looks like a
        // copy/paste slip - confirm which field of data holds the hidden flag.
        outputRowData[rowIndex++] = Boolean.valueOf(data.path);
    }
    // Add modification date
    if (meta.getLastModificationDateField() != null && meta.getLastModificationDateField().length() > 0) {
        outputRowData[rowIndex++] = data.lastModificationDateTime;
    }
    // Add Uri
    if (meta.getUriField() != null && meta.getUriField().length() > 0) {
        outputRowData[rowIndex++] = data.uriName;
    }
    // Add RootUri
    if (meta.getRootUriField() != null && meta.getRootUriField().length() > 0) {
        outputRowData[rowIndex++] = data.rootUriName;
    }
    data.recordnr++;

    RowMetaInterface irow = getInputRowMeta();

    // Keep a copy as "previous row" so that a following step cannot change it in between.
    data.previousRow = irow == null ? outputRowData : (Object[]) irow.cloneRow(outputRowData);

    return outputRowData;
}