Example usage for org.jsoup.nodes Element absUrl

List of usage examples for org.jsoup.nodes Element absUrl

Introduction

In this page you can find the example usage for org.jsoup.nodes Element absUrl.

Prototype

public String absUrl(String attributeKey) 

Source Link

Document

Get an absolute URL from a URL attribute that may be relative (i.e.

Usage

From source file:com.spd.ukraine.lucenewebsearch1.web.IndexingController.java

/**
 * Method used to perform recursive creation indexing for a given web page 
 * in search database.//from  w w  w.ja  va 2 s  .  c  o  m
 *
 * @param webPage webPage.url is entered url
 * webPage.title is set
 * @param html Jsoup.Document of entered url
 * @param recursionNumber used to stop recursion at exceeding 
 * MAX_RECURSION_SEARCH_NUMBER
 */
private void indexElements(WebPage webPage, Document html, final int recursionNumber)
        throws IOException, ParseException {
    String title = html.title();
    if (referencedTitles.contains(title.trim())) {
        return;
    }
    referencedTitles.add(title.trim());
    webPage.setTitle(title);
    if (containsPage(webPage)) {
        System.out.println(webPage.getUrl() + " is already indexed");
        return;
    }
    Element prevElement = null;
    Elements elements = html.body().getAllElements(); //.getElementsByTag("a");
    addDoc(webPage, html.text());
    //        for (Element element : elements) {
    ////                System.out.println(element.nodeName() + " element.text() " 
    ////                        + element.text() + " url " 
    ////                        + element.absUrl("href"));
    //            if (element.nodeName().equalsIgnoreCase("body")) {
    //                addDoc(webPage, element.text());
    //                break;
    ////                continue;
    //            }
    //            if (null == prevElement) {
    //                prevElement = element;
    ////            } else if (prevElementContainsElementText(prevElement, element)) {
    ////                continue;
    //            }
    ////            if (null !== webPagesService.findWebPage(element.absUrl("href")))
    //            if (element.text().trim().isEmpty()) {
    //                continue;
    //            }
    ////            StringTokenizer str = new StringTokenizer(element.text());
    ////            str.
    //            addDoc(webPage, element.text());
    //        }
    if (recursionNumber > MAX_RECURSION_SEARCH_NUMBER || referencedSites.size() > MAX_NUMBER_SITES_INDEXED) {
        //            System.out.println(recursionNumber + " " 
        //                    + referencedSites.contains(webPage.getUrl()));
        return;
    }
    elements.parallelStream()
            .filter((Element e) -> e.nodeName().equalsIgnoreCase("a") && null != e.absUrl(HREF)
                    && !e.absUrl(HREF).trim().isEmpty() && !referencedSites.contains(e.absUrl(HREF))
                    && !referencedSites.contains(removeSharpEtc(e.absUrl(HREF))))
            .forEach((Element element) -> {
                WebPage webPage1 = new WebPage(element.absUrl(HREF));
                String url1 = webPage1.getUrl();
                //                    System.out.println(recursionNumber + " recursion for '" 
                //                            + url1 + "'");
                try {
                    Document htmlR = Jsoup.connect(url1).get();
                    indexElements(webPage1, htmlR, recursionNumber + 1);
                } catch (IOException | ParseException e) {
                    System.out.println("Exception " + e.getMessage());
                }
                referencedSites.add(url1);
            });
    //        for (Element element : elements) {
    //            if (!element.nodeName().equalsIgnoreCase("a")) {
    //                continue;
    //            }
    //            WebPage webPage1 = new WebPage(element.absUrl("href"));
    //            if (null == webPage1.getUrl() 
    //                    || webPage1.getUrl().isEmpty()
    //                    || referencedSites.contains(webPage1.getUrl())) {
    //                continue;
    //            }
    //            System.out.println(recursionNumber + "recursion for " 
    //                    + element.absUrl("href"));
    //            try {
    //                Document htmlR = Jsoup.connect(webPage1.getUrl()).get();
    //                webPage1.setTitle(htmlR.title());
    //                indexElements(webPage1, htmlR, recursionNumber + 1);
    //            } catch (IOException e) {
    //                System.out.println("IOException " + e.getMessage());
    //            }
    //            referencedSites.add(webPage1.getUrl());
    //        }
}

From source file:com.aquest.emailmarketing.web.controllers.BroadcastController.java

/**
 * Define content.// w ww  .  j  a  v  a 2 s  .co m
 *
 * @param model the model
 * @param broadcast1 the broadcast1
 * @param result the result
 * @param principal the principal
 * @return the string
 * @throws IOException 
 */
@RequestMapping(value = "/defineContent", method = RequestMethod.POST)
public String defineContent(Model model, @Valid @ModelAttribute("broadcast") Broadcast broadcast1,
        @RequestParam(value = "fromUrl", required = false) String fromUrl,
        @RequestParam(value = "optimize", required = false) boolean optimize,
        @RequestParam(value = "baseurl", required = false) String baseUrl,
        @RequestParam(value = "rel2abs", required = false) boolean rel2abs, BindingResult result,
        Principal principal) throws IOException {
    String htmlBodyPrep = "";
    Broadcast broadcast = broadcastService.getBroadcastById(broadcast1.getId());
    broadcast.setSubject(broadcast1.getSubject());
    if (fromUrl != "") {
        Document doc = Jsoup.connect(fromUrl).get();
        htmlBodyPrep = doc.outerHtml();
        broadcast.setHtmlbody(htmlBodyPrep);
        System.out.println(htmlBodyPrep);
    }
    if (broadcast1.getHtmlbody() != null) {
        htmlBodyPrep = broadcast1.getHtmlbody();
        broadcast.setHtmlbody(htmlBodyPrep);
        System.out.println("Da vidimo: " + htmlBodyPrep);
    }
    if (rel2abs == true) {
        if (baseUrl != null) {
            System.out.println(baseUrl);
            Document doc = Jsoup.parse(broadcast.getHtmlbody(), baseUrl);
            System.out.println(doc.toString());

            Elements images = doc.select("img");
            for (Element e : images) {
                e.attr("src", e.absUrl("src"));
                System.out.println(e.absUrl("src"));
            }
            broadcast.setHtmlbody(doc.outerHtml());
            htmlBodyPrep = doc.outerHtml();
        } else {
            // ovde staviti error handling
        }
    }
    if (optimize == true) {
        //          /* PREMAILER API OPTIONS
        //           * line_length - Line length used by to_plain_text. Boolean, default is 65.
        //             warn_level - What level of CSS compatibility warnings to show (see Warnings).
        //                NONE = 0
        //                SAFE = 1
        //                POOR = 2
        //                RISKY = 3
        //             link_query_string - A string to append to every a href="" link. Do not include the initial ?.
        //             base_url - Used to calculate absolute URLs for local files.
        //             css - Manually specify CSS stylesheets.
        //             css_to_attributes - Copy related CSS attributes into HTML attributes (e.g. background-color to bgcolor)
        //             css_string - Pass CSS as a string
        //             remove_ids - Remove ID attributes whenever possible and convert IDs used as anchors to hashed to avoid collisions in webmail programs. Default is false.
        //             remove_classes - Remove class attributes. Default is false.
        //             remove_comments -  Remove html comments. Default is false.
        //             preserve_styles - Whether to preserve any link rel=stylesheet and style elements. Default is false.
        //             preserve_reset - Whether to preserve styles associated with the MailChimp reset code
        //             with_html_string -  Whether the html param should be treated as a raw string.
        //             verbose - Whether to print errors and warnings to $stderr. Default is false.
        //             adapter - Which HTML parser to use, either :nokogiri or :hpricot. Default is :hpricot.
        //          */
        Premailer premailer = new Premailer();
        PremailerInterface premailerInterface = premailer.getPremailerInstance();

        Map<String, Object> options = new HashMap<String, Object>();
        options.put("with_html_string", true);
        options.put("base_url", fromUrl);
        premailerInterface.init(broadcast.getHtmlbody(), options);
        //premailerInterface.init(htmlBodyPrep, options);
        broadcast.setHtmlbody(premailerInterface.inline_css());
        System.out.println(premailerInterface.inline_css());
        premailer.destroyInstance();
    }
    broadcast.setPlaintext(broadcast1.getPlaintext());
    broadcastService.SaveOrUpdate(broadcast);
    // Find URLs in html body and add tracking code
    Urls urls = new Urls();
    String html = broadcast.getHtmlbody();
    //HashSet to avoid duplicates
    Set<String> urlList = new HashSet<String>();
    Document doc = Jsoup.parse(html);
    Elements links = doc.select("a[href]");
    for (Element link : links) {
        if (link.attr("abs:href").length() > 5) {
            urlList.add(link.attr("abs:href"));
        }
    }
    model.addAttribute("urlList", urlList);
    model.addAttribute("urls", urls);

    // Google Analytics - utmCampaign List
    List<String> utmCampaignList = new ArrayList<String>();
    utmCampaignList.add("[BROADAST_NAME]");
    model.addAttribute("utmCampaignList", utmCampaignList);

    // Google Analytics - utmSource List
    List<String> utmSourceList = new ArrayList<String>();
    utmSourceList.add("[CAMPAIGN_NAME]");
    model.addAttribute("utmSourceList", utmSourceList);

    // Google Analytics - utmContent List
    List<String> utmContentList = new ArrayList<String>();
    utmContentList.add("[EMAIL]");
    //TODO: add all variables from CM_EMAIL_BROADCAST_LIST
    model.addAttribute("utmContentList", utmContentList);

    model.addAttribute("broadcast", broadcast);
    return "tracking";
}

From source file:de.geeksfactory.opacclient.apis.Heidi.java

@Override
public void start() throws IOException {
    String html = httpGet(opac_url + "/search.cgi?art=f", ENCODING, false, cookieStore);
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);/*w w w  . j  a v a 2  s.c om*/
    sessid = null;
    for (Element link : doc.select("a")) {
        String sid = getQueryParamsFirst(link.absUrl("href")).get("sess");
        if (sid != null) {
            sessid = sid;
            break;
        }
    }
    super.start();
}

From source file:dk.dma.msinm.service.MessageService.java

/**
 * Make sure that links are absolute./*w w  w  . jav a  2s . c om*/
 * @param doc the HTML document
 */
protected void externalizeLinks(Document doc, String tag, String attr) {
    Elements elms = doc.select(tag + "[" + attr + "]");
    for (Element e : elms) {

        String url = e.absUrl(attr);
        if (url.length() == 0) {
            // Disable link
            e.attr(attr, "#");
            continue;
        }
        // Update the link to be the absolute link
        e.attr(attr, url);
    }
}

From source file:de.geeksfactory.opacclient.apis.Heidi.java

private SearchRequestResult parse_search(String html, int page) {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);/* www .  j a v  a  2s  .com*/

    int results_total = 0;
    if (doc.select("#heiditreffer").size() > 0) {
        String resstr = doc.select("#heiditreffer").text();
        String resnum = resstr.replaceAll("\\(([0-9.]+)([^0-9]*)\\)", "$1").replace(".", "");
        results_total = Integer.parseInt(resnum);
    }

    Elements table = doc.select("table.treffer tr");
    List<SearchResult> results = new ArrayList<>();
    for (int i = 0; i < table.size(); i++) {
        Element tr = table.get(i);
        SearchResult sr = new SearchResult();

        StringBuilder description = null;
        String author = "";

        for (Element link : tr.select("a")) {
            String kk = getQueryParamsFirst(link.absUrl("href")).get("katkey");
            if (kk != null) {
                sr.setId(kk);
                break;
            }
        }

        if (tr.select("span.Z3988").size() == 1) {
            // Luckily there is a <span class="Z3988"> item which provides
            // data in a standardized format.
            List<NameValuePair> z3988data;
            boolean hastitle = false;
            try {
                description = new StringBuilder();
                z3988data = URLEncodedUtils
                        .parse(new URI("http://dummy/?" + tr.select("span.Z3988").attr("title")), "UTF-8");
                for (NameValuePair nv : z3988data) {
                    if (nv.getValue() != null) {
                        if (!nv.getValue().trim().equals("")) {
                            if (nv.getName().equals("rft.btitle") && !hastitle) {
                                description.append("<b>").append(nv.getValue()).append("</b>");
                                hastitle = true;
                            } else if (nv.getName().equals("rft.atitle") && !hastitle) {
                                description.append("<b>").append(nv.getValue()).append("</b>");
                                hastitle = true;
                            } else if (nv.getName().equals("rft.au")) {
                                author = nv.getValue();
                            } else if (nv.getName().equals("rft.aufirst")) {
                                author = author + ", " + nv.getValue();
                            } else if (nv.getName().equals("rft.aulast")) {
                                author = nv.getValue();
                            } else if (nv.getName().equals("rft.date")) {
                                description.append("<br />").append(nv.getValue());
                            }
                        }
                    }
                }
            } catch (URISyntaxException e) {
                description = null;
            }
        }
        if (!"".equals(author)) {
            author = author + "<br />";
        }
        sr.setInnerhtml(author + description.toString());

        if (tr.select(".kurzstat").size() > 0) {
            String stattext = tr.select(".kurzstat").first().text();
            if (stattext.contains("ausleihbar")) {
                sr.setStatus(Status.GREEN);
            } else if (stattext.contains("online")) {
                sr.setStatus(Status.GREEN);
            } else if (stattext.contains("entliehen")) {
                sr.setStatus(Status.RED);
            } else if (stattext.contains("Prsenznutzung")) {
                sr.setStatus(Status.YELLOW);
            } else if (stattext.contains("bestellen")) {
                sr.setStatus(Status.YELLOW);
            }
        }
        if (tr.select(".typbild").size() > 0) {
            String typtext = tr.select(".typbild").first().text();
            if (typtext.contains("Buch")) {
                sr.setType(MediaType.BOOK);
            } else if (typtext.contains("DVD-ROM")) {
                sr.setType(MediaType.CD_SOFTWARE);
            } else if (typtext.contains("Online-Ressource")) {
                sr.setType(MediaType.EDOC);
            } else if (typtext.contains("DVD")) {
                sr.setType(MediaType.DVD);
            } else if (typtext.contains("Film")) {
                sr.setType(MediaType.MOVIE);
            } else if (typtext.contains("Zeitschrift")) {
                sr.setType(MediaType.MAGAZINE);
            } else if (typtext.contains("Musiknoten")) {
                sr.setType(MediaType.SCORE_MUSIC);
            } else if (typtext.contains("Bildliche Darstellung")) {
                sr.setType(MediaType.ART);
            } else if (typtext.contains("Zeitung")) {
                sr.setType(MediaType.NEWSPAPER);
            } else if (typtext.contains("Karte")) {
                sr.setType(MediaType.MAP);
            } else if (typtext.contains("Mehrteilig")) {
                sr.setType(MediaType.PACKAGE_BOOKS);
            }
        }

        results.add(sr);
    }
    // TODO
    return new SearchRequestResult(results, results_total, page);
}

From source file:crawler.HackerEarthCrawler.java

@Override
public void crawl() {

    int flag = 0;

    //set of urls which should be crawled
    TreeSet<String> linksset = new TreeSet<String>();
    TreeSet<String> tempset = new TreeSet<String>();
    TreeSet<String> tutorialset = new TreeSet<String>();
    //final set of problem urls
    TreeSet<String> problemset = new TreeSet<String>();
    //visited for maintaing status of if url is already crawled or not
    TreeMap<String, Integer> visited = new TreeMap<String, Integer>();

    //add base url
    linksset.add(baseUrl);// w w  w.j av a2 s .  c  om
    //mark base url as not crawled
    visited.put(baseUrl, 0);

    try {
        while (true) {
            flag = 0;
            tempset.clear();

            for (String str : linksset) {
                //check if url is already crawled or not and it has valid domain name
                if ((visited.get(str) == 0) && (str.startsWith("https://www.hackerearth.com/"))) {
                    System.out.println("crawling  " + str);

                    //retriving response of current url as document
                    Document doc = Jsoup.connect(str).timeout(0).userAgent(
                            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0")
                            .referrer("http://www.google.com").ignoreHttpErrors(true).get();
                    //retriving all urls from current page
                    Elements links = doc.select("a[href]");

                    //mark url as crawled
                    visited.put(str, 1);

                    //mark flag as url is crawled
                    flag = 1;
                    //retrive all urls
                    for (Element link : links) {
                        if (link.absUrl("href").endsWith("/tutorial/")) {
                            tutorialset.add(link.absUrl("href"));
                        }
                        //check if url is problem url then add it in problemurlset
                        if (link.absUrl("href").startsWith("https://www.hackerearth.com/")
                                && isProblemUrl(link.absUrl("href"))) {
                            problemset.add(link.absUrl("href"));
                        }
                        //check if url has valid domain and it has problem urls or not
                        if (link.absUrl("href").contains(("https://www.hackerearth.com/"))
                                && isCrawlable(link.absUrl("href"))) {
                            //if link is not visited then mark it as uncrawled
                            if (!visited.containsKey(link.absUrl("href"))) {
                                visited.put(link.absUrl("href"), 0);
                            }
                            //add it in tempsetorary set
                            tempset.add(link.absUrl("href"));
                            //System.out.println("\n  base: "+str+" ::: link  : " + link.absUrl("href"));
                        }
                    }
                }
            }
            //if nothing is left to crawl break the loop
            if (flag == 0) {
                break;
            }
            //add all retrieved links to linksset
            linksset.addAll(tempset);
        }

        System.out.println("\n\ntotal problem urls " + problemset.size());

        int i = 0;
        for (String str : problemset) {
            System.out.println("link " + i + " : " + str);
            i++;
        }

    } catch (IOException ex) {
        Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.SEVERE, null, ex);
    }

    //scrap and store into database
    //for every problem url scrap problem page
    for (String problemUrl : problemset) {

        System.out.println("problemUrl :" + problemUrl);
        try {
            //create problem class to store in database
            Problem problem = new Problem();
            String problemSIOC = "", problemIOC = "";
            String problemTitle = "", problemStatement = "", problemInput = "", problemOutput = "",
                    problemConstraints = "";
            String sampleInput = "", sampleOutput = "";
            String problemExplanation = "";
            //set default timelimit to 1 second
            double problemTimeLimit = 1.0;
            ArrayList<String> tags = new ArrayList<String>();

            //get response for given problem url
            Response response = Jsoup.connect(problemUrl).execute();
            Document doc = response.parse();

            //retrieve problem title from page
            Element elementTitle = doc.getElementsByTag("title").first();
            StringTokenizer stTitle = new StringTokenizer(elementTitle.text(), "|");
            problemTitle = stTitle.nextToken().trim();

            Element content = doc.getElementsByClass("starwars-lab").first();
            problemSIOC = content.text();
            Elements e = content.children();

            //to find problem statement
            String breakloop[] = { "input", "input:", "input :", "input format:", "input format :",
                    "input format", "Input and output", "constraints :", "constraints:", "constraints",
                    "$$Input :$$" };
            flag = 0;
            for (Element p : e) {
                String tempStatement = "";
                for (Element pp : p.getAllElements()) {

                    for (String strbreak : breakloop) {
                        if (StringUtils.equalsIgnoreCase(pp.ownText(), strbreak)) {
                            //System.out.println("strbreak :"+strbreak);

                            tempStatement = p.text().substring(0,
                                    p.text().toLowerCase().indexOf(strbreak.toLowerCase()));
                            // System.out.println("temp "+tempStatement);
                            flag = 1;
                            break;
                        }
                    }
                }

                if (flag == 1) {
                    problemStatement += tempStatement;
                    //remove extra space at end
                    if (tempStatement.length() == 0) {
                        problemStatement = problemStatement.substring(0, problemStatement.length() - 1);
                    }
                    break;
                }
                problemStatement += p.text() + " ";
            }

            System.out.println("problemSIOC :" + problemSIOC);
            System.out.println("problemStatement :" + problemStatement);

            if (problemStatement.length() <= problemSIOC.length()) {
                //remove problem statement from whole text and remove extra spaces at the beginning and the end
                problemIOC = problemSIOC.substring(problemStatement.length()).trim();
            } else {
                problemIOC = "";
            }

            System.out.println("problemIOC :" + problemIOC);

            //keywords for identifying input
            String decideInput[] = { "Input format :", "Input format:", "Input format", "inputformat:",
                    "inputformat :", "inputformat", "input and output", "input :", "input:", "input" };
            //keywords for identifying output
            String decideOutput[] = { "output format :", "output format:", "Output format", "outputformat:",
                    "outputformat :", "outputformat", "output :", "output:", "output" };
            //keywords for identifying constraint
            String decideConstraint[] = { "constraints:", "constraints :", "constraints", "Constraints :",
                    "constraint:", "constraint :", "constraint", "Contraints :" };

            int posin = 0, posoutput = 0, poscon = 0, idxin, idxout, idxcon, flaginput = 0, flagoutput = 0,
                    flagcon = 0, inlen = 0, outlen = 0, conlen = 0;

            //find inputformat position,length of keyword
            for (idxin = 0; idxin < decideInput.length; idxin++) {
                if (StringUtils.containsIgnoreCase(problemIOC, decideInput[idxin])) {

                    posin = problemIOC.toLowerCase().indexOf(decideInput[idxin].toLowerCase());
                    flaginput = 1;
                    inlen = decideInput[idxin].length();

                    //decide it is keyowrd for actucal input or it is "sample input"
                    if (StringUtils.containsIgnoreCase(problemIOC, "sample input")) {
                        if (posin > problemIOC.toLowerCase().indexOf("sample input")) {
                            flaginput = 0;
                            inlen = 0;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
            }

            //find outputformat position,length of keyword
            for (idxout = 0; idxout < decideOutput.length; idxout++) {
                if (StringUtils.containsIgnoreCase(problemIOC, decideOutput[idxout])) {
                    posoutput = problemIOC.toLowerCase().indexOf(decideOutput[idxout].toLowerCase());
                    flagoutput = 1;
                    outlen = decideOutput[idxout].length();
                    break;
                }
            }

            //find constraint position,length of keyword
            for (idxcon = 0; idxcon < decideConstraint.length; idxcon++) {
                if (StringUtils.containsIgnoreCase(problemIOC, decideConstraint[idxcon])) {
                    poscon = problemIOC.toLowerCase().indexOf(decideConstraint[idxcon].toLowerCase());
                    flagcon = 1;
                    conlen = decideConstraint[idxcon].length();
                    break;
                }
            }

            System.out.println("input " + flaginput + " " + inlen + " " + posin);
            System.out.println("output " + flagoutput + " " + outlen + " " + posoutput);
            System.out.println("constraint " + flagcon + " " + conlen + " " + poscon);
            //retrieve problem input and output if present in problem page

            //if input format is present
            if (flaginput == 1) {
                //if input keyword is "input and output" and contraint is present in problem page
                if (idxin == 6 && flagcon == 1) {
                    problemInput = problemIOC.substring(inlen, poscon);
                }
                //if input keyword is "input and output" and contraint is not present in problem page
                else if (idxin == 6 && flagcon == 0) {
                    problemInput = problemIOC.substring(inlen);
                }
                //if output format and constraint is present
                else if (flagoutput == 1 && flagcon == 1) {
                    //if constraint is present before input format
                    if (poscon < posin) {
                        problemInput = problemIOC.substring(posin + inlen, posoutput);
                        problemOutput = problemIOC.substring(posoutput + outlen);
                    }
                    //if constraint is present before sample
                    else if (poscon < posoutput) {
                        problemInput = problemIOC.substring(inlen, poscon);
                        problemOutput = problemIOC.substring(posoutput + outlen);
                    } else {
                        problemInput = problemIOC.substring(inlen, posoutput);
                        problemOutput = problemIOC.substring(posoutput + outlen, poscon);
                    }
                }
                //if constraint is not present
                else if (flagoutput == 1 && flagcon == 0) {
                    problemInput = problemIOC.substring(inlen, posoutput);
                    problemOutput = problemIOC.substring(posoutput + outlen);
                } else if (flagoutput == 0 && flagcon == 1) {
                    if (poscon < posin) {
                        problemInput = problemIOC.substring(posin + inlen);
                    } else {
                        problemInput = problemIOC.substring(poscon + conlen, posin);
                    }
                    problemOutput = "";
                } else {
                    problemInput = problemIOC.substring(inlen);
                    problemOutput = "";
                }
            }
            //if input format and output format is not present
            else {
                problemInput = "";
                problemOutput = "";
            }

            //if constraint is present
            if (flagcon == 1) {
                //if constraint is present before input format
                if (poscon < posin) {
                    problemConstraints = problemIOC.substring(0, posin);
                }
                //if constraint is present before output format
                else if (poscon < posoutput) {
                    problemConstraints = problemIOC.substring(poscon + conlen, posoutput);
                } else {
                    problemConstraints = problemIOC.substring(poscon + conlen);
                }
            }

            System.out.println("problemInput :" + problemInput);
            System.out.println("problemOutput :" + problemOutput);
            System.out.println("problemConstraints :" + problemConstraints);

            //retrieve problem tags from problem page
            Element elementtag = doc.getElementsByClass("problem-tags").first().child(1);
            StringTokenizer st = new StringTokenizer(elementtag.text(), ",");
            while (st.hasMoreTokens()) {
                tags.add(st.nextToken().trim());
            }

            //retrieve sample input sample output if present
            Element elementSIO = doc.getElementsByClass("input-output-container").first();
            //if sample input output is present
            if (elementSIO != null) {
                //find position of sample output
                int soutpos = elementSIO.text().indexOf("SAMPLE OUTPUT");
                sampleInput = elementSIO.text().substring(12, soutpos);
                sampleOutput = elementSIO.text().substring(soutpos + 13);
                System.out.println("Sample input :\n" + sampleInput + "\n\n\n");
                System.out.println("Sample Output :\n" + sampleOutput);
            } else {
                sampleInput = "";
                sampleOutput = "";
            }

            //retrieve problem explanation from problem page if present
            Element elementExplanation = doc.getElementsByClass("standard-margin").first().child(0);
            if (elementExplanation.text().toLowerCase().contains("explanation")) {
                problemExplanation = elementExplanation.nextElementSibling().text();
            }
            System.out.println("Explanation :" + problemExplanation);

            //retrieve timelimit
            Element elementTL = doc.getElementsByClass("problem-guidelines").first().child(0).child(1);
            StringTokenizer stTL = new StringTokenizer(elementTL.ownText(), " ");
            problemTimeLimit = Double.parseDouble(stTL.nextToken());

            //System.out.println("problemTimeLimit :"+problemTimeLimit);
            //set all retrieved information to problem class
            problem.setProblemUrl(problemUrl);
            if (problemTitle.length() == 0) {
                problemTitle = null;
            }
            if (problemStatement.length() == 0) {
                problemStatement = null;
            }
            if (problemInput.length() == 0) {
                problemInput = null;
            }
            if (problemOutput.length() == 0) {
                problemOutput = null;
            }
            if (problemExplanation.length() == 0) {
                problemExplanation = null;
            }
            if (problemConstraints.length() == 0) {
                problemConstraints = null;
            }
            problem.setTitle(problemTitle);
            problem.setProblemUrl(problemUrl);
            problem.setProblemStatement(problemStatement);
            problem.setInputFormat(problemInput);
            problem.setOutputFormat(problemOutput);
            problem.setTimeLimit(problemTimeLimit);
            problem.setExplanation(problemExplanation);
            problem.setConstraints(problemConstraints);

            //set sample input output to problem class
            SampleInputOutput sampleInputOutput = new SampleInputOutput(problem, sampleInput, sampleOutput);
            problem.getSampleInputOutputs().add(sampleInputOutput);
            //set platform as hackerearth
            problem.setPlatform(Platform.HackerEarth);
            for (String strtag : tags) {
                problem.getTags().add(strtag);
            }

            //store in database
            Session session = null;
            Transaction transaction = null;
            try {
                //start session
                session = HibernateUtil.getSessionFactory().openSession();
                transaction = session.beginTransaction();

                //check if problem is already stored in database
                String hql = "FROM Problem p where p.problemUrl = :problem_url";
                Problem oldProblem = (Problem) session.createQuery(hql).setString("problem_url", problemUrl)
                        .uniqueResult();
                String task;

                //if problem is present in database
                if (oldProblem != null) {
                    //update the old problem
                    task = "updated";
                    //retrieve id of old problem
                    problem.setId(oldProblem.getId());
                    session.delete(oldProblem);
                    session.flush();
                    session.save(problem);
                } else {
                    task = "saved";
                    session.save(problem);
                }

                transaction.commit();
                //log the info to console
                Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}",
                        new Object[] { task, problem.getProblemUrl() });
            } catch (HibernateException ee) {
                if (transaction != null) {
                    transaction.rollback();
                }
                Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE,
                        "Cannot Insert/Update problem into databse: " + problemUrl, e);
            } finally {
                //close the session
                if (session != null) {
                    session.close();
                }
            }
        } catch (Exception ee) {
            System.out.println(ee.toString());
        }
    }

    System.out.println("\n\n\n\ntutorial urls\n\n");
    try {

        for (String tutorialurl : tutorialset) {
            //System.out.println(tutorialurl+"\n\n");
            Response tutorialres = Jsoup.connect(tutorialurl).execute();
            Document doc = tutorialres.parse();

            Tutorial tutorial = new Tutorial();
            tutorial.setContent(doc.getElementsByClass("tutorial").first().text());

            tutorial.setName(baseUrl);
            tutorialurl = tutorialurl.substring(0, tutorialurl.length() - 10);
            StringTokenizer tutorialtok = new StringTokenizer(tutorialurl, "/");

            String tempstr = "";
            while (tutorialtok.hasMoreTokens()) {
                tempstr = tutorialtok.nextToken();
            }

            Session session = null;
            Transaction transaction = null;
            try {
                //start session
                session = HibernateUtil.getSessionFactory().openSession();
                transaction = session.beginTransaction();

                //check if problem is already stored in database
                String hql = "FROM Tutorial p where p.name = :name";
                Tutorial oldProblem = (Tutorial) session.createQuery(hql).setString("name", tempstr)
                        .uniqueResult();
                String task;

                //if problem is present in database
                if (oldProblem != null) {
                    //update the old problem
                    task = "updated";
                    //retrieve id of old problem
                    tutorial.setName(oldProblem.getName());
                    session.delete(oldProblem);
                    session.flush();
                    session.save(tutorial);
                } else {
                    task = "saved";
                    tutorial.setName(tempstr);
                    session.save(tutorial);
                }

                transaction.commit();
                //log the info to console
                Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}",
                        new Object[] { task, tutorial.getName() });
            } catch (HibernateException ee) {
                if (transaction != null) {
                    transaction.rollback();
                }
                Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE,
                        "Cannot Insert/Update problem into databse: " + tempstr, ee);
            } finally {
                //close the session
                if (session != null) {
                    session.close();
                }
            }

        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    }
}

From source file:de.geeksfactory.opacclient.apis.Pica.java

protected DetailledItem parse_result(String html) {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);/* w  w w . ja  v a 2  s  . c  om*/

    DetailledItem result = new DetailledItem();
    for (Element a : doc.select("a[href*=PPN")) {
        Map<String, String> hrefq = getQueryParamsFirst(a.absUrl("href"));
        String ppn = hrefq.get("PPN");
        result.setId(ppn);
        break;
    }

    // GET COVER
    if (doc.select("td.preslabel:contains(ISBN) + td.presvalue").size() > 0) {
        Element isbnElement = doc.select("td.preslabel:contains(ISBN) + td.presvalue").first();
        String isbn = "";
        for (Node child : isbnElement.childNodes()) {
            if (child instanceof TextNode) {
                isbn = ((TextNode) child).text().trim();
                break;
            }
        }
        result.setCover(ISBNTools.getAmazonCoverURL(isbn, true));
    }

    // GET TITLE AND SUBTITLE
    String titleAndSubtitle;
    Element titleAndSubtitleElem = null;
    String titleRegex = ".*(Titel|Aufsatz|Zeitschrift|Gesamttitel"
            + "|Title|Article|Periodical|Collective\\stitle" + "|Titre|Article|P.riodique|Titre\\sg.n.ral).*";
    String selector = "td.preslabel:matches(" + titleRegex + ") + td.presvalue";
    if (doc.select(selector).size() > 0) {
        titleAndSubtitleElem = doc.select(selector).first();
        titleAndSubtitle = titleAndSubtitleElem.text().trim();
        int slashPosition = Math.min(titleAndSubtitle.indexOf("/"), titleAndSubtitle.indexOf(":"));
        String title;
        if (slashPosition > 0) {
            title = titleAndSubtitle.substring(0, slashPosition).trim();
            String subtitle = titleAndSubtitle.substring(slashPosition + 1).trim();
            result.addDetail(new Detail(stringProvider.getString(StringProvider.SUBTITLE), subtitle));
        } else {
            title = titleAndSubtitle;
        }
        result.setTitle(title);
    } else {
        result.setTitle("");
    }

    // Details
    int line = 0;
    Elements lines = doc.select("td.preslabel + td.presvalue");
    if (titleAndSubtitleElem != null) {
        lines.remove(titleAndSubtitleElem);
    }
    for (Element element : lines) {
        Element titleElem = element.firstElementSibling();
        String detail = "";
        if (element.select("div").size() > 1 && element.select("div").text().equals(element.text())) {
            boolean first = true;
            for (Element div : element.select("div")) {
                if (!div.text().replace("\u00a0", " ").trim().equals("")) {
                    if (!first) {
                        detail += "\n" + div.text().replace("\u00a0", " ").trim();
                    } else {
                        detail += div.text().replace("\u00a0", " ").trim();
                        first = false;
                    }
                }
            }
        } else {
            detail = element.text().replace("\u00a0", " ").trim();
        }
        String title = titleElem.text().replace("\u00a0", " ").trim();

        if (element.select("hr").size() > 0)
        // after the separator we get the copies
        {
            break;
        }

        if (detail.length() == 0 && title.length() == 0) {
            line++;
            continue;
        }
        if (title.contains(":")) {
            title = title.substring(0, title.indexOf(":")); // remove colon
        }
        result.addDetail(new Detail(title, detail));

        if (element.select("a").size() == 1 && !element.select("a").get(0).text().trim().equals("")) {
            String url = element.select("a").first().absUrl("href");
            if (!url.startsWith(opac_url)) {
                result.addDetail(new Detail(stringProvider.getString(StringProvider.LINK), url));
            }
        }

        line++;
    }
    line++; // next line after separator

    // Copies
    Copy copy = new Copy();
    String location = "";

    // reservation info will be stored as JSON
    JSONArray reservationInfo = new JSONArray();

    while (line < lines.size()) {
        Element element = lines.get(line);
        if (element.select("hr").size() == 0) {
            Element titleElem = element.firstElementSibling();
            String detail = element.text().trim();
            String title = titleElem.text().replace("\u00a0", " ").trim();

            if (detail.length() == 0 && title.length() == 0) {
                line++;
                continue;
            }

            if (title.contains("Standort") || title.contains("Vorhanden in") || title.contains("Location")) {
                location += detail;
            } else if (title.contains("Sonderstandort")) {
                location += " - " + detail;
            } else if (title.contains("Systemstelle") || title.contains("Subject")) {
                copy.setDepartment(detail);
            } else if (title.contains("Fachnummer") || title.contains("locationnumber")) {
                copy.setLocation(detail);
            } else if (title.contains("Signatur") || title.contains("Shelf mark")) {
                copy.setShelfmark(detail);
            } else if (title.contains("Anmerkung")) {
                location += " (" + detail + ")";
            } else if (title.contains("Link")) {
                result.addDetail(new Detail(title.replace(":", "").trim(), detail));
            } else if (title.contains("Status") || title.contains("Ausleihinfo")
                    || title.contains("Ausleihstatus") || title.contains("Request info")) {
                // Find return date
                Pattern pattern = Pattern.compile("(till|bis) (\\d{2}-\\d{2}-\\d{4})");
                Matcher matcher = pattern.matcher(detail);
                if (matcher.find()) {
                    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd-MM-yyyy").withLocale(Locale.GERMAN);
                    try {
                        copy.setStatus(detail.substring(0, matcher.start() - 1).trim());
                        copy.setReturnDate(fmt.parseLocalDate(matcher.group(2)));
                    } catch (IllegalArgumentException e) {
                        e.printStackTrace();
                        copy.setStatus(detail);
                    }
                } else {
                    copy.setStatus(detail);
                }
                // Get reservation info
                if (element.select("a:has(img[src*=inline_arrow])").size() > 0) {
                    Element a = element.select("a:has(img[src*=inline_arrow])").first();
                    boolean multipleCopies = a.text().matches(".*(Exemplare|Volume list).*");
                    JSONObject reservation = new JSONObject();
                    try {
                        reservation.put("multi", multipleCopies);
                        reservation.put("link", _extract_url(a.absUrl("href")));
                        reservation.put("desc", location);
                        reservationInfo.put(reservation);
                    } catch (JSONException e1) {
                        e1.printStackTrace();
                    }
                    result.setReservable(true);
                }
            }
        } else {
            copy.setBranch(location);
            result.addCopy(copy);
            location = "";
            copy = new Copy();
        }
        line++;
    }

    if (copy.notEmpty()) {
        copy.setBranch(location);
        result.addCopy(copy);
    }

    if (reservationInfo.length() == 0) {
        // No reservation info found yet, because we didn't find any copies.
        // If there is a reservation link somewhere in the rows we interpreted
        // as details, we still want to use it.
        if (doc.select("td a:has(img[src*=inline_arrow])").size() > 0) {
            Element a = doc.select("td a:has(img[src*=inline_arrow])").first();
            boolean multipleCopies = a.text().matches(".*(Exemplare|Volume list).*");
            JSONObject reservation = new JSONObject();
            try {
                reservation.put("multi", multipleCopies);
                reservation.put("link", _extract_url(a.attr("href")));
                reservation.put("desc", location);
                reservationInfo.put(reservation);
            } catch (JSONException e1) {
                e1.printStackTrace();
            }
            result.setReservable(true);
        }
    }
    result.setReservation_info(reservationInfo.toString());

    // Volumes
    if (doc.select("a[href^=FAM?PPN=]").size() > 0) {
        String href = doc.select("a[href^=FAM?PPN=]").attr("href");
        String ppn = getQueryParamsFirst(href).get("PPN");
        Map<String, String> data = new HashMap<>();
        data.put("ppn", ppn);
        result.setVolumesearch(data);
    }

    return result;
}

From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java

protected DetailledItem parse_result(String html) {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);/* w ww. ja  v  a  2s.c  o  m*/

    DetailledItem result = new DetailledItem();

    if (doc.select(".detail_cover img").size() == 1) {
        result.setCover(doc.select(".detail_cover img").get(0).attr("src"));
    }

    result.setTitle(doc.select(".detail_titel").text());

    Elements detailtrs = doc.select(".detailzeile table tr");
    for (int i = 0; i < detailtrs.size(); i++) {
        Element tr = detailtrs.get(i);
        if (tr.child(0).hasClass("detail_feld")) {
            String title = tr.child(0).text();
            String content = tr.child(1).text();
            if (title.equals("Gesamtwerk:") || title.equals("Erschienen in:")) {
                try {
                    if (tr.child(1).select("a").size() > 0) {
                        Element link = tr.child(1).select("a").first();
                        List<NameValuePair> query = URLEncodedUtils.parse(new URI(link.absUrl("href")),
                                "UTF-8");
                        for (NameValuePair q : query) {
                            if (q.getName().equals("MedienNr")) {
                                result.setCollectionId(q.getValue());
                            }
                        }
                    }
                } catch (URISyntaxException e) {
                }
            } else {

                if (content.contains("hier klicken") && tr.child(1).select("a").size() > 0) {
                    content += " " + tr.child(1).select("a").first().attr("href");
                }

                result.addDetail(new Detail(title, content));
            }
        }
    }

    Elements detailcenterlinks = doc.select(".detailzeile_center a.detail_link");
    for (int i = 0; i < detailcenterlinks.size(); i++) {
        Element a = detailcenterlinks.get(i);
        result.addDetail(new Detail(a.text().trim(), a.absUrl("href")));
    }

    try {
        JSONObject copymap = new JSONObject();
        if (data.has("copiestable")) {
            copymap = data.getJSONObject("copiestable");
        } else {
            Elements ths = doc.select(".exemplartab .exemplarmenubar th");
            for (int i = 0; i < ths.size(); i++) {
                Element th = ths.get(i);
                String head = th.text().trim();
                if (head.equals("Zweigstelle")) {
                    copymap.put("branch", i);
                } else if (head.equals("Abteilung")) {
                    copymap.put("department", i);
                } else if (head.equals("Bereich") || head.equals("Standort")) {
                    copymap.put("location", i);
                } else if (head.equals("Signatur")) {
                    copymap.put("signature", i);
                } else if (head.equals("Barcode") || head.equals("Medien-Nummer")) {
                    copymap.put("barcode", i);
                } else if (head.equals("Status")) {
                    copymap.put("status", i);
                } else if (head.equals("Frist") || head.matches("Verf.+gbar")) {
                    copymap.put("returndate", i);
                } else if (head.equals("Vorbestellungen") || head.equals("Reservierungen")) {
                    copymap.put("reservations", i);
                }
            }
        }
        Elements exemplartrs = doc.select(".exemplartab .tabExemplar, .exemplartab .tabExemplar_");
        DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
        for (int i = 0; i < exemplartrs.size(); i++) {
            Element tr = exemplartrs.get(i);

            Copy copy = new Copy();

            Iterator<?> keys = copymap.keys();
            while (keys.hasNext()) {
                String key = (String) keys.next();
                int index;
                try {
                    index = copymap.has(key) ? copymap.getInt(key) : -1;
                } catch (JSONException e1) {
                    index = -1;
                }
                if (index >= 0) {
                    try {
                        copy.set(key, tr.child(index).text(), fmt);
                    } catch (IllegalArgumentException e) {
                        e.printStackTrace();
                    }
                }
            }

            result.addCopy(copy);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    try {
        Elements bandtrs = doc.select("table .tabBand a");
        for (int i = 0; i < bandtrs.size(); i++) {
            Element tr = bandtrs.get(i);

            Volume volume = new Volume();
            volume.setId(tr.attr("href").split("=")[1]);
            volume.setTitle(tr.text());
            result.addVolume(volume);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    if (doc.select(".detail_vorbest a").size() == 1) {
        result.setReservable(true);
        result.setReservation_info(doc.select(".detail_vorbest a").attr("href"));
    }
    return result;
}

From source file:de.geeksfactory.opacclient.apis.SISIS.java

protected DetailledItem parse_result(String html) throws IOException {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);/* w w  w  .j a  va 2s. c om*/

    String html2 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showTitleActive", ENCODING);

    Document doc2 = Jsoup.parse(html2);
    doc2.setBaseUri(opac_url);

    String html3 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showAvailabilityActive",
            ENCODING);

    Document doc3 = Jsoup.parse(html3);
    doc3.setBaseUri(opac_url);

    DetailledItem result = new DetailledItem();

    try {
        result.setId(doc.select("#bibtip_id").text().trim());
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    List<String> reservationlinks = new ArrayList<>();
    for (Element link : doc3.select("#vormerkung a, #tab-content a")) {
        String href = link.absUrl("href");
        Map<String, String> hrefq = getQueryParamsFirst(href);
        if (result.getId() == null) {
            // ID retrieval
            String key = hrefq.get("katkey");
            if (key != null) {
                result.setId(key);
                break;
            }
        }

        // Vormerken
        if (hrefq.get("methodToCall") != null) {
            if (hrefq.get("methodToCall").equals("doVormerkung")
                    || hrefq.get("methodToCall").equals("doBestellung")) {
                reservationlinks.add(href.split("\\?")[1]);
            }
        }
    }
    if (reservationlinks.size() == 1) {
        result.setReservable(true);
        result.setReservation_info(reservationlinks.get(0));
    } else if (reservationlinks.size() == 0) {
        result.setReservable(false);
    } else {
        // TODO: Multiple options - handle this case!
    }

    if (doc.select(".data td img").size() == 1) {
        result.setCover(doc.select(".data td img").first().attr("abs:src"));
        try {
            downloadCover(result);
        } catch (Exception e) {

        }
    }

    if (doc.select(".aw_teaser_title").size() == 1) {
        result.setTitle(doc.select(".aw_teaser_title").first().text().trim());
    } else if (doc.select(".data td strong").size() > 0) {
        result.setTitle(doc.select(".data td strong").first().text().trim());
    } else {
        result.setTitle("");
    }
    if (doc.select(".aw_teaser_title_zusatz").size() > 0) {
        result.addDetail(new Detail("Titelzusatz", doc.select(".aw_teaser_title_zusatz").text().trim()));
    }

    String title = "";
    String text = "";
    boolean takeover = false;
    Element detailtrs = doc2.select(".box-container .data td").first();
    for (Node node : detailtrs.childNodes()) {
        if (node instanceof Element) {
            if (((Element) node).tagName().equals("strong")) {
                title = ((Element) node).text().trim();
                text = "";
            } else {
                if (((Element) node).tagName().equals("a")
                        && (((Element) node).text().trim().contains("hier klicken") || title.equals("Link:"))) {
                    text = text + node.attr("href");
                    takeover = true;
                    break;
                }
            }
        } else if (node instanceof TextNode) {
            text = text + ((TextNode) node).text();
        }
    }
    if (!takeover) {
        text = "";
        title = "";
    }

    detailtrs = doc2.select("#tab-content .data td").first();
    if (detailtrs != null) {
        for (Node node : detailtrs.childNodes()) {
            if (node instanceof Element) {
                if (((Element) node).tagName().equals("strong")) {
                    if (!text.equals("") && !title.equals("")) {
                        result.addDetail(new Detail(title.trim(), text.trim()));
                        if (title.equals("Titel:")) {
                            result.setTitle(text.trim());
                        }
                        text = "";
                    }

                    title = ((Element) node).text().trim();
                } else {
                    if (((Element) node).tagName().equals("a")
                            && (((Element) node).text().trim().contains("hier klicken")
                                    || title.equals("Link:"))) {
                        text = text + node.attr("href");
                    } else {
                        text = text + ((Element) node).text();
                    }
                }
            } else if (node instanceof TextNode) {
                text = text + ((TextNode) node).text();
            }
        }
    } else {
        if (doc2.select("#tab-content .fulltitle tr").size() > 0) {
            Elements rows = doc2.select("#tab-content .fulltitle tr");
            for (Element tr : rows) {
                if (tr.children().size() == 2) {
                    Element valcell = tr.child(1);
                    String value = valcell.text().trim();
                    if (valcell.select("a").size() == 1) {
                        value = valcell.select("a").first().absUrl("href");
                    }
                    result.addDetail(new Detail(tr.child(0).text().trim(), value));
                }
            }
        } else {
            result.addDetail(new Detail(stringProvider.getString(StringProvider.ERROR),
                    stringProvider.getString(StringProvider.COULD_NOT_LOAD_DETAIL)));
        }
    }
    if (!text.equals("") && !title.equals("")) {
        result.addDetail(new Detail(title.trim(), text.trim()));
        if (title.equals("Titel:")) {
            result.setTitle(text.trim());
        }
    }
    for (Element link : doc3.select("#tab-content a")) {
        Map<String, String> hrefq = getQueryParamsFirst(link.absUrl("href"));
        if (result.getId() == null) {
            // ID retrieval
            String key = hrefq.get("katkey");
            if (key != null) {
                result.setId(key);
                break;
            }
        }
    }
    for (Element link : doc3.select(".box-container a")) {
        if (link.text().trim().equals("Download")) {
            result.addDetail(
                    new Detail(stringProvider.getString(StringProvider.DOWNLOAD), link.absUrl("href")));
        }
    }

    Map<String, Integer> copy_columnmap = new HashMap<>();
    // Default values
    copy_columnmap.put("barcode", 1);
    copy_columnmap.put("branch", 3);
    copy_columnmap.put("status", 4);
    Elements copy_columns = doc.select("#tab-content .data tr#bg2 th");
    for (int i = 0; i < copy_columns.size(); i++) {
        Element th = copy_columns.get(i);
        String head = th.text().trim();
        if (head.contains("Status")) {
            copy_columnmap.put("status", i);
        }
        if (head.contains("Zweigstelle")) {
            copy_columnmap.put("branch", i);
        }
        if (head.contains("Mediennummer")) {
            copy_columnmap.put("barcode", i);
        }
        if (head.contains("Standort")) {
            copy_columnmap.put("location", i);
        }
        if (head.contains("Signatur")) {
            copy_columnmap.put("signature", i);
        }
    }

    Pattern status_lent = Pattern.compile(
            "^(entliehen) bis ([0-9]{1,2}.[0-9]{1,2}.[0-9]{2," + "4}) \\(gesamte Vormerkungen: ([0-9]+)\\)$");
    Pattern status_and_barcode = Pattern.compile("^(.*) ([0-9A-Za-z]+)$");

    Elements exemplartrs = doc.select("#tab-content .data tr").not("#bg2");
    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
    for (Element tr : exemplartrs) {
        try {
            Copy copy = new Copy();
            Element status = tr.child(copy_columnmap.get("status"));
            Element barcode = tr.child(copy_columnmap.get("barcode"));
            String barcodetext = barcode.text().trim().replace(" Wegweiser", "");

            // STATUS
            String statustext;
            if (status.getElementsByTag("b").size() > 0) {
                statustext = status.getElementsByTag("b").text().trim();
            } else {
                statustext = status.text().trim();
            }
            if (copy_columnmap.get("status").equals(copy_columnmap.get("barcode"))) {
                Matcher matcher1 = status_and_barcode.matcher(statustext);
                if (matcher1.matches()) {
                    statustext = matcher1.group(1);
                    barcodetext = matcher1.group(2);
                }
            }

            Matcher matcher = status_lent.matcher(statustext);
            if (matcher.matches()) {
                copy.setStatus(matcher.group(1));
                copy.setReservations(matcher.group(3));
                copy.setReturnDate(fmt.parseLocalDate(matcher.group(2)));
            } else {
                copy.setStatus(statustext);
            }
            copy.setBarcode(barcodetext);
            if (status.select("a[href*=doVormerkung]").size() == 1) {
                copy.setResInfo(status.select("a[href*=doVormerkung]").attr("href").split("\\?")[1]);
            }

            String branchtext = tr.child(copy_columnmap.get("branch")).text().trim().replace(" Wegweiser", "");
            copy.setBranch(branchtext);

            if (copy_columnmap.containsKey("location")) {
                copy.setLocation(
                        tr.child(copy_columnmap.get("location")).text().trim().replace(" Wegweiser", ""));
            }

            if (copy_columnmap.containsKey("signature")) {
                copy.setShelfmark(
                        tr.child(copy_columnmap.get("signature")).text().trim().replace(" Wegweiser", ""));
            }

            result.addCopy(copy);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    try {
        Element isvolume = null;
        Map<String, String> volume = new HashMap<>();
        Elements links = doc.select(".data td a");
        int elcount = links.size();
        for (int eli = 0; eli < elcount; eli++) {
            List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(links.get(eli).attr("href")), "UTF-8");
            for (NameValuePair nv : anyurl) {
                if (nv.getName().equals("methodToCall") && nv.getValue().equals("volumeSearch")) {
                    isvolume = links.get(eli);
                } else if (nv.getName().equals("catKey")) {
                    volume.put("catKey", nv.getValue());
                } else if (nv.getName().equals("dbIdentifier")) {
                    volume.put("dbIdentifier", nv.getValue());
                }
            }
            if (isvolume != null) {
                volume.put("volume", "true");
                result.setVolumesearch(volume);
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    return result;
}

From source file:com.weavers.duqhan.business.impl.ProductServiceImpl.java

@Override
public void loadTempProducts(List<StatusBean> statusBeans) {
    boolean isSuccess = true;
    String startDate = new Date().toString();
    Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE,
            "(==I==)DATE: " + startDate + "Store product details in temp product table start.....");
    try {/*from   www  . j  a  v  a 2s .com*/
        String status = "";
        for (StatusBean statusBean : statusBeans) {
            status = "Link duplicate";
            Temtproductlinklist temtproductlinklist = temtproductlinklistDao.loadById(statusBean.getId());
            if (temtproductlinklist != null && temtproductlinklist.getStatus() == 0) {
                Product testProduct = productDao.getProductByExternelLink(temtproductlinklist.getLink());
                if (testProduct == null) {
                    String value = "";
                    Elements detailMain;
                    Elements detailSub;
                    Elements specifics;
                    double votes = 0.0;
                    double stars = 0.0;
                    double feedback = 0.0;
                    String url = temtproductlinklist.getLink();
                    try {
                        testProduct = new Product();
                        Product savedTestProduct;

                        //=================== Random sleep START ===================//
                        //                            TimeUnit.SECONDS.sleep(30 + (int) (Math.random() * 100));
                        Random randomObj = new Random();
                        TimeUnit.SECONDS.sleep(randomObj.ints(30, 60).findFirst().getAsInt());
                        //=================== Random sleep END =====================//

                        Document doc = Jsoup.connect(url).get();
                        detailMain = doc.select("#j-detail-page");
                        if (!detailMain.isEmpty()) {

                            //=================== Criteria Block START==================//
                            detailMain = doc.select(".rantings-num");
                            if (!detailMain.isEmpty()) {
                                votes = Double.valueOf(detailMain.text().split(" votes")[0].split("\\(")[1]);
                            }
                            detailMain = doc.select(".percent-num");
                            if (!detailMain.isEmpty()) {
                                stars = Double.valueOf(detailMain.text());
                            }
                            detailMain = doc.select("ul.ui-tab-nav li[data-trigger='feedback'] a");
                            if (!detailMain.isEmpty()) {
                                feedback = Double.valueOf(detailMain.text().split("\\(")[1].split("\\)")[0]);
                            }
                            //=================== Criteria Block END==================//

                            if (votes > 10.0 && stars > 4.0 && feedback > 4.0) {
                                detailMain = doc.select(".detail-wrap .product-name");
                                testProduct.setName(detailMain
                                        .text());/*.substring(0, Math.min(detailMain.text().length(), 50))*/
                                detailMain = doc.select(".detail-wrap .product-name");
                                testProduct.setDescription(detailMain.text());
                                testProduct.setExternalLink(url);
                                testProduct.setVendorId(1l);//??????????????????????

                                //=================== Packaging block START==================//
                                Double weight = 1.0;
                                Double width = 1.0;
                                Double height = 1.0;
                                Double length = 1.0;
                                detailMain = doc.select(
                                        "div#j-product-desc div.pnl-packaging-main ul li.packaging-item");
                                for (Element element : detailMain) {
                                    String packagingTitle = element.select("span.packaging-title").text();
                                    String packagingDesc = element.select("span.packaging-des").text();
                                    if (packagingTitle.trim().equals("Package Weight:")) {
                                        String str = packagingDesc;
                                        str = str.replaceAll("[^.?0-9]+", " ");
                                        if (Arrays.asList(str.trim().split(" ")) != null) {
                                            if (!Arrays.asList(str.trim().split(" ")).isEmpty()) {
                                                try {
                                                    weight = Double.parseDouble(
                                                            Arrays.asList(str.trim().split(" ")).get(0));
                                                } catch (Exception e) {
                                                    weight = 1.0;
                                                }
                                            }
                                        }
                                        System.out.println("weight == " + weight);
                                    } else if (packagingTitle.trim().equals("Package Size:")) {
                                        String str = packagingDesc;
                                        str = str.replaceAll("[^.?0-9]+", " ");
                                        if (Arrays.asList(str.trim().split(" ")) != null) {
                                            if (!Arrays.asList(str.trim().split(" ")).isEmpty()) {
                                                try {
                                                    width = Double.parseDouble(
                                                            Arrays.asList(str.trim().split(" ")).get(0));
                                                    height = Double.parseDouble(
                                                            Arrays.asList(str.trim().split(" ")).get(1));
                                                    length = Double.parseDouble(
                                                            Arrays.asList(str.trim().split(" ")).get(2));
                                                } catch (Exception e) {
                                                    width = 1.0;
                                                    height = 1.0;
                                                    length = 1.0;
                                                }
                                            }
                                        }
                                        System.out.println("width == " + width);
                                        System.out.println("height == " + height);
                                        System.out.println("length == " + length);
                                    }
                                }
                                //=================== Packaging block END==================//

                                //=================== Category block START==================//
                                detailMain = doc.select("div.ui-breadcrumb div.container a");
                                Long productCategoryId = 0L;
                                String parentPath = "";
                                String thisCategory = detailMain.last().text().trim();
                                System.out.println("thisCategory == " + thisCategory);
                                Category parentCategory = new Category();
                                parentCategory.setId(0L);
                                parentCategory.setParentPath("");
                                for (Element element : detailMain) {
                                    String newCategory;
                                    newCategory = element.text().trim();
                                    System.out.println("newCategory======" + newCategory);
                                    if (newCategory.equals("Home") || newCategory.equals("All Categories")) {
                                    } else {
                                        Category category = categoryDao.getCategoryByName(newCategory);
                                        if (category != null) {
                                            if (category.getName().equals(thisCategory)) {
                                                productCategoryId = category.getId();
                                                parentPath = category.getParentPath();
                                            }
                                            parentCategory = category;
                                        } else {
                                            category = new Category();
                                            category.setId(null);
                                            category.setName(newCategory);
                                            category.setParentId(parentCategory.getId());
                                            category.setParentPath(parentCategory.getParentPath()
                                                    + parentCategory.getId() + "=");
                                            category.setQuantity(0);
                                            category.setImgUrl("-");
                                            category.setDisplayText(newCategory);
                                            Category category2 = categoryDao.save(category);
                                            if (category.getName().equals(thisCategory)) {
                                                productCategoryId = category2.getId();
                                                parentPath = category2.getParentPath();
                                            }
                                            parentCategory = category2;
                                        }
                                    }
                                }
                                //=================== Category block END==================//

                                //=============== Specifications block START==============//
                                detailMain = doc.select(".product-property-list .property-item");
                                String specifications = "";
                                for (Element element : detailMain) {
                                    specifications = specifications
                                            + element.select(".propery-title").get(0).text().replace(",", "/")
                                                    .replace(":", "-")
                                            + ":" + element.select(".propery-des").get(0).text()
                                                    .replace(",", "/").replace(":", "-")
                                            + ",";//TODO:, check
                                }
                                //=============== Specifications Block END==============//

                                //=============== Shipping Time Block START==============//
                                String shippingTime = "";
                                detailMain = doc.select(".shipping-days[data-role='delivery-days']");
                                System.out.println("value detailMain" + detailMain.toString());
                                shippingTime = detailMain.text();
                                //=============== Shipping Time Block END==============//

                                //=============== Shipping Cost Block START==============//
                                detailMain = doc.select(".logistics-cost");
                                value = detailMain.text();
                                if (!value.equalsIgnoreCase("Free Shipping")) {
                                    //                                        f = 0.00;
                                } else {
                                    //                                        f = Double.parseDouble(value.replaceAll(".*?([\\d.]+).*", "$1"));
                                }
                                //=============== Shipping Cost Block END==============//

                                //=================Product save 1st START==============//
                                testProduct.setCategoryId(productCategoryId);
                                testProduct.setLastUpdate(new Date());
                                testProduct.setParentPath(parentPath);
                                testProduct.setImgurl("-");
                                testProduct.setProperties("-");
                                testProduct.setProductWidth(width);
                                testProduct.setProductLength(length);
                                testProduct.setProductWeight(weight);
                                testProduct.setProductHeight(height);
                                testProduct.setShippingRate(0.0);
                                testProduct.setShippingTime("45");
                                testProduct.setSpecifications(specifications);
                                savedTestProduct = productDao.save(testProduct);
                                //====================Product save 1st END==============//

                                //========= Property, Property Value, Property Product Map Block START ========//
                                double discountPrice = 0.0;
                                double actualPrice = 0.0;
                                double markupPrice = 0.0;
                                String id = "";
                                String allProperties = "";
                                //------------------------Read Color css START---------------------//
                                specifics = doc.select("#j-product-info-sku dl.p-property-item");
                                Elements cssdetailMain = doc.select("link[href]");
                                Document cssdoc = new Document("");
                                System.out.println(
                                        "====================================================cssdetailMain"
                                                + cssdetailMain.size());
                                for (Element element : cssdetailMain) {
                                    String cssurl = element.attr("abs:href");
                                    if (cssurl.contains("??main-detail")) {
                                        try {
                                            cssdoc = Jsoup.connect(cssurl).get();
                                        } catch (IOException ex) {

                                        }
                                        break;
                                    }
                                }
                                //-----------------------Read Color css END--------------------------//

                                //-----------Product Property, Property Value START--------//
                                Map<String, ProductPropertyvalues> propertyValuesMap = new HashMap<>();
                                if (!specifics.isEmpty()) {
                                    ProductProperties testPorperties;
                                    ProductProperties saveTestPorperties;
                                    ProductPropertyvalues testPropertyValues;
                                    for (Element specific : specifics) {
                                        System.out.println("head  ==== " + specific.select("dt").text());
                                        testPorperties = productPropertiesDao
                                                .loadByName(specific.select("dt").text());
                                        if (testPorperties == null) {
                                            testPorperties = new ProductProperties();
                                            testPorperties.setPropertyName(specific.select("dt").text());
                                            saveTestPorperties = productPropertiesDao.save(testPorperties);
                                        } else {
                                            saveTestPorperties = testPorperties;
                                        }
                                        allProperties = allProperties + saveTestPorperties.getId().toString()
                                                + "-";
                                        detailSub = specific.select("dd ul li");
                                        String valu = "-";
                                        for (Element element : detailSub) {
                                            testPropertyValues = new ProductPropertyvalues();
                                            id = element.select("a[data-sku-id]").attr("data-sku-id").trim();
                                            testPropertyValues.setRefId(id);
                                            if (element.hasClass("item-sku-image")) {
                                                valu = element.select("a img[src]").get(0).absUrl("src")
                                                        .split(".jpg")[0] + ".jpg";
                                                String title = element.select("a img").get(0).attr("title");
                                                String imgUrl = GoogleBucketFileUploader
                                                        .uploadProductImage(valu, savedTestProduct.getId());
                                                valu = "<img src='" + imgUrl + "' title='" + title
                                                        + "' style='height:40px; width:40px;'/>";
                                            } else if (element.hasClass("item-sku-color")) {
                                                String style = cssdoc.html().split("sku-color-" + id)[1]
                                                        .split("}")[0].substring(1);
                                                valu = "<span style='" + style
                                                        + "' ; height:40px; width:40px; display:block;'></span>";
                                            } else {
                                                valu = element.select("a span").toString();
                                            }
                                            System.out.println("valu === " + valu);
                                            testPropertyValues.setProductId(savedTestProduct.getId());
                                            testPropertyValues.setPropertyId(saveTestPorperties.getId());
                                            testPropertyValues.setValueName(valu);
                                            propertyValuesMap.put(id,
                                                    productPropertyvaluesDao.save(testPropertyValues));
                                        }
                                    }
                                    savedTestProduct.setProperties(allProperties);
                                }
                                //-----------Product Property, Property Value END--------//

                                //----------------------Read json START------------------//
                                List<AxpProductDto> axpProductDtos = new ArrayList<>();
                                Elements scripts = doc.select("script"); // Get the script part
                                for (Element script : scripts) {
                                    if (script.html().contains("var skuProducts=")) {
                                        String jsonData = "";
                                        jsonData = script.html().split("var skuProducts=")[1]
                                                .split("var GaData")[0].trim();
                                        jsonData = jsonData.substring(0, jsonData.length() - 1);
                                        Gson gsonObj = new Gson();
                                        axpProductDtos = Arrays
                                                .asList(gsonObj.fromJson(jsonData, AxpProductDto[].class));
                                        break;
                                    }
                                }
                                //----------------------Read json END------------------//

                                //-------------Product Properties Map START------------//
                                for (AxpProductDto thisAxpProductDto : axpProductDtos) {
                                    SkuVal skuVal = thisAxpProductDto.getSkuVal();
                                    if (skuVal.getActSkuCalPrice() != null) {
                                        value = skuVal.getActSkuCalPrice().trim();
                                        discountPrice = CurrencyConverter.usdTOinr(
                                                Double.parseDouble(value.replaceAll(".*?([\\d.]+).*", "$1")));
                                        value = skuVal.getSkuCalPrice().trim();
                                        actualPrice = CurrencyConverter.usdTOinr(
                                                Double.parseDouble(value.replaceAll(".*?([\\d.]+).*", "$1")));
                                        markupPrice = discountPrice * 0.15 + 100;
                                        discountPrice = Math.ceil((discountPrice + markupPrice) / 10) * 10;
                                        actualPrice = Math.round(actualPrice + markupPrice);
                                    } else {
                                        discountPrice = 0.0;
                                        value = skuVal.getSkuCalPrice().trim();
                                        actualPrice = CurrencyConverter.usdTOinr(
                                                Double.parseDouble(value.replaceAll(".*?([\\d.]+).*", "$1")));
                                        markupPrice = actualPrice * 0.15 + 100;
                                        discountPrice = Math.round(actualPrice + markupPrice);
                                        actualPrice = Math.round(actualPrice + markupPrice);
                                    }

                                    ProductPropertiesMap productPropertyMap = new ProductPropertiesMap();
                                    String myPropValueIds = "";
                                    if (thisAxpProductDto.getSkuAttr() != null) {
                                        String[] skuPropIds = thisAxpProductDto.getSkuPropIds().split(",");
                                        for (String skuPropId : skuPropIds) {
                                            myPropValueIds = myPropValueIds
                                                    + propertyValuesMap.get(skuPropId).getId().toString() + "_";
                                        }

                                        productPropertyMap.setPropertyvalueComposition(myPropValueIds);
                                    } else {
                                        productPropertyMap.setPropertyvalueComposition("_");
                                    }
                                    productPropertyMap.setDiscount(discountPrice);
                                    productPropertyMap.setPrice(actualPrice);
                                    productPropertyMap.setProductId(savedTestProduct);
                                    productPropertyMap.setQuantity(5l);
                                    productPropertiesMapDao.save(productPropertyMap);
                                }
                                //-------------Product Properties Map START------------//
                                //========= Property, Property Value, Property Product Map Block END ========//

                                //============= Multiple Image Block START =============//
                                detailMain = doc.select("ul.image-thumb-list span.img-thumb-item img[src]");
                                int flg = 0;
                                String imgUrl = "";
                                for (Element element : detailMain) {
                                    imgUrl = GoogleBucketFileUploader.uploadProductImage(
                                            element.absUrl("src").split(".jpg")[0] + ".jpg",
                                            savedTestProduct.getId());
                                    if (flg == 0) {
                                        flg++;
                                        savedTestProduct.setImgurl(imgUrl);
                                    } else {
                                        ProductImg productImg = new ProductImg();
                                        productImg.setId(null);
                                        productImg.setImgUrl(imgUrl);
                                        productImg.setProductId(savedTestProduct.getId());
                                        productImgDao.save(productImg);
                                    }
                                }
                                //============= Multiple Image Block END =============//

                                //=================Product save final START==============//
                                if (productDao.save(savedTestProduct) != null) {
                                    temtproductlinklist.setStatus(1);//
                                    temtproductlinklistDao.save(temtproductlinklist);
                                    status = "Success";
                                }
                                //=================Product save final START==============//
                            } else {
                                temtproductlinklist.setStatus(2);//
                                temtproductlinklistDao.save(temtproductlinklist);
                                status = "criteria mismatch";
                            }
                        } else {
                            status = "Page not found";
                        }
                    } catch (Exception ex) {
                        System.out.println(
                                "=============================================================Exception1" + ex);
                        temtproductlinklist.setStatus(4);//
                        temtproductlinklistDao.save(temtproductlinklist);
                        System.out.println("Exception === " + ex);
                        status = "Failure";
                        Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, "(==E==)DATE: "
                                + new Date().toString()
                                + "Store product details in temp product table get error in sub process.....\n Link Id: "
                                + statusBean.getId() + "\n Started on" + startDate, ex);
                    }
                } else {
                    temtproductlinklist.setStatus(3);//
                    temtproductlinklistDao.save(temtproductlinklist);
                    status = "Product exsist";
                }
            }
            //                String body = "Id: " + temtproductlinklist.getId() + "<br/> Status: " + status;
            //                MailSender.sendEmail("krisanu.nandi@pkweb.in", "Product captured", body, "subhendu.sett@pkweb.in");
            statusBean.setStatus(status);
        }
        System.out.println("=============================================================status" + status);
    } catch (Exception e) {
        System.out.println("=============================================================Exception2" + e);
        isSuccess = false;
        String body = "(==E==)DATE: " + new Date().toString()
                + "Store product details in temp product table get error.....<br/> Started on" + startDate
                + "<br/>";
        Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, body, e);
        //            MailSender.sendEmail("krisanu.nandi@pkweb.in", "Stopped store product details", body + e.getLocalizedMessage(), "subhendu.sett@pkweb.in");
    }
    if (isSuccess) {
        String body = "(==I==)DATE: " + new Date().toString()
                + "Store product details in temp product table end.....<br/> Started on" + startDate;
        Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, body);
        /*ObjectMapper mapper = new ObjectMapper();
        try {
        MailSender.sendEmail("krisanu.nandi@pkweb.in", "Completed store product details", body + "=============<br/><br/>" + mapper.writeValueAsString(statusBeans), "subhendu.sett@pkweb.in");
        } catch (JsonProcessingException ex) {
        Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, null, ex);
        }*/
    }
    //        return statusBeans;
    System.out.println("=============================================================end");
}