List of usage examples for org.jsoup.nodes Element absUrl
public String absUrl(String attributeKey)
From source file:com.spd.ukraine.lucenewebsearch1.web.IndexingController.java
/** * Method used to perform recursive creation indexing for a given web page * in search database.//from w w w.ja va 2 s . c o m * * @param webPage webPage.url is entered url * webPage.title is set * @param html Jsoup.Document of entered url * @param recursionNumber used to stop recursion at exceeding * MAX_RECURSION_SEARCH_NUMBER */ private void indexElements(WebPage webPage, Document html, final int recursionNumber) throws IOException, ParseException { String title = html.title(); if (referencedTitles.contains(title.trim())) { return; } referencedTitles.add(title.trim()); webPage.setTitle(title); if (containsPage(webPage)) { System.out.println(webPage.getUrl() + " is already indexed"); return; } Element prevElement = null; Elements elements = html.body().getAllElements(); //.getElementsByTag("a"); addDoc(webPage, html.text()); // for (Element element : elements) { //// System.out.println(element.nodeName() + " element.text() " //// + element.text() + " url " //// + element.absUrl("href")); // if (element.nodeName().equalsIgnoreCase("body")) { // addDoc(webPage, element.text()); // break; //// continue; // } // if (null == prevElement) { // prevElement = element; //// } else if (prevElementContainsElementText(prevElement, element)) { //// continue; // } //// if (null !== webPagesService.findWebPage(element.absUrl("href"))) // if (element.text().trim().isEmpty()) { // continue; // } //// StringTokenizer str = new StringTokenizer(element.text()); //// str. 
// addDoc(webPage, element.text()); // } if (recursionNumber > MAX_RECURSION_SEARCH_NUMBER || referencedSites.size() > MAX_NUMBER_SITES_INDEXED) { // System.out.println(recursionNumber + " " // + referencedSites.contains(webPage.getUrl())); return; } elements.parallelStream() .filter((Element e) -> e.nodeName().equalsIgnoreCase("a") && null != e.absUrl(HREF) && !e.absUrl(HREF).trim().isEmpty() && !referencedSites.contains(e.absUrl(HREF)) && !referencedSites.contains(removeSharpEtc(e.absUrl(HREF)))) .forEach((Element element) -> { WebPage webPage1 = new WebPage(element.absUrl(HREF)); String url1 = webPage1.getUrl(); // System.out.println(recursionNumber + " recursion for '" // + url1 + "'"); try { Document htmlR = Jsoup.connect(url1).get(); indexElements(webPage1, htmlR, recursionNumber + 1); } catch (IOException | ParseException e) { System.out.println("Exception " + e.getMessage()); } referencedSites.add(url1); }); // for (Element element : elements) { // if (!element.nodeName().equalsIgnoreCase("a")) { // continue; // } // WebPage webPage1 = new WebPage(element.absUrl("href")); // if (null == webPage1.getUrl() // || webPage1.getUrl().isEmpty() // || referencedSites.contains(webPage1.getUrl())) { // continue; // } // System.out.println(recursionNumber + "recursion for " // + element.absUrl("href")); // try { // Document htmlR = Jsoup.connect(webPage1.getUrl()).get(); // webPage1.setTitle(htmlR.title()); // indexElements(webPage1, htmlR, recursionNumber + 1); // } catch (IOException e) { // System.out.println("IOException " + e.getMessage()); // } // referencedSites.add(webPage1.getUrl()); // } }
From source file:com.aquest.emailmarketing.web.controllers.BroadcastController.java
/** * Define content.// w ww . j a v a 2 s .co m * * @param model the model * @param broadcast1 the broadcast1 * @param result the result * @param principal the principal * @return the string * @throws IOException */ @RequestMapping(value = "/defineContent", method = RequestMethod.POST) public String defineContent(Model model, @Valid @ModelAttribute("broadcast") Broadcast broadcast1, @RequestParam(value = "fromUrl", required = false) String fromUrl, @RequestParam(value = "optimize", required = false) boolean optimize, @RequestParam(value = "baseurl", required = false) String baseUrl, @RequestParam(value = "rel2abs", required = false) boolean rel2abs, BindingResult result, Principal principal) throws IOException { String htmlBodyPrep = ""; Broadcast broadcast = broadcastService.getBroadcastById(broadcast1.getId()); broadcast.setSubject(broadcast1.getSubject()); if (fromUrl != "") { Document doc = Jsoup.connect(fromUrl).get(); htmlBodyPrep = doc.outerHtml(); broadcast.setHtmlbody(htmlBodyPrep); System.out.println(htmlBodyPrep); } if (broadcast1.getHtmlbody() != null) { htmlBodyPrep = broadcast1.getHtmlbody(); broadcast.setHtmlbody(htmlBodyPrep); System.out.println("Da vidimo: " + htmlBodyPrep); } if (rel2abs == true) { if (baseUrl != null) { System.out.println(baseUrl); Document doc = Jsoup.parse(broadcast.getHtmlbody(), baseUrl); System.out.println(doc.toString()); Elements images = doc.select("img"); for (Element e : images) { e.attr("src", e.absUrl("src")); System.out.println(e.absUrl("src")); } broadcast.setHtmlbody(doc.outerHtml()); htmlBodyPrep = doc.outerHtml(); } else { // ovde staviti error handling } } if (optimize == true) { // /* PREMAILER API OPTIONS // * line_length - Line length used by to_plain_text. Boolean, default is 65. // warn_level - What level of CSS compatibility warnings to show (see Warnings). // NONE = 0 // SAFE = 1 // POOR = 2 // RISKY = 3 // link_query_string - A string to append to every a href="" link. Do not include the initial ?. 
// base_url - Used to calculate absolute URLs for local files. // css - Manually specify CSS stylesheets. // css_to_attributes - Copy related CSS attributes into HTML attributes (e.g. background-color to bgcolor) // css_string - Pass CSS as a string // remove_ids - Remove ID attributes whenever possible and convert IDs used as anchors to hashed to avoid collisions in webmail programs. Default is false. // remove_classes - Remove class attributes. Default is false. // remove_comments - Remove html comments. Default is false. // preserve_styles - Whether to preserve any link rel=stylesheet and style elements. Default is false. // preserve_reset - Whether to preserve styles associated with the MailChimp reset code // with_html_string - Whether the html param should be treated as a raw string. // verbose - Whether to print errors and warnings to $stderr. Default is false. // adapter - Which HTML parser to use, either :nokogiri or :hpricot. Default is :hpricot. // */ Premailer premailer = new Premailer(); PremailerInterface premailerInterface = premailer.getPremailerInstance(); Map<String, Object> options = new HashMap<String, Object>(); options.put("with_html_string", true); options.put("base_url", fromUrl); premailerInterface.init(broadcast.getHtmlbody(), options); //premailerInterface.init(htmlBodyPrep, options); broadcast.setHtmlbody(premailerInterface.inline_css()); System.out.println(premailerInterface.inline_css()); premailer.destroyInstance(); } broadcast.setPlaintext(broadcast1.getPlaintext()); broadcastService.SaveOrUpdate(broadcast); // Find URLs in html body and add tracking code Urls urls = new Urls(); String html = broadcast.getHtmlbody(); //HashSet to avoid duplicates Set<String> urlList = new HashSet<String>(); Document doc = Jsoup.parse(html); Elements links = doc.select("a[href]"); for (Element link : links) { if (link.attr("abs:href").length() > 5) { urlList.add(link.attr("abs:href")); } } model.addAttribute("urlList", urlList); 
model.addAttribute("urls", urls); // Google Analytics - utmCampaign List List<String> utmCampaignList = new ArrayList<String>(); utmCampaignList.add("[BROADAST_NAME]"); model.addAttribute("utmCampaignList", utmCampaignList); // Google Analytics - utmSource List List<String> utmSourceList = new ArrayList<String>(); utmSourceList.add("[CAMPAIGN_NAME]"); model.addAttribute("utmSourceList", utmSourceList); // Google Analytics - utmContent List List<String> utmContentList = new ArrayList<String>(); utmContentList.add("[EMAIL]"); //TODO: add all variables from CM_EMAIL_BROADCAST_LIST model.addAttribute("utmContentList", utmContentList); model.addAttribute("broadcast", broadcast); return "tracking"; }
From source file:de.geeksfactory.opacclient.apis.Heidi.java
/**
 * Loads the search form page and extracts the session id from the first link whose
 * query string carries a {@code sess} parameter, then delegates to {@code super.start()}.
 *
 * @throws IOException if the search page cannot be fetched
 */
@Override
public void start() throws IOException {
    Document page = Jsoup.parse(httpGet(opac_url + "/search.cgi?art=f", ENCODING, false, cookieStore));
    // Relative hrefs are resolved against the OPAC base URL.
    page.setBaseUri(opac_url);

    sessid = null;
    Elements anchors = page.select("a");
    // Stop at the first anchor that yields a session id.
    for (int i = 0; i < anchors.size() && sessid == null; i++) {
        sessid = getQueryParamsFirst(anchors.get(i).absUrl("href")).get("sess");
    }

    super.start();
}
From source file:dk.dma.msinm.service.MessageService.java
/** * Make sure that links are absolute./*w w w . jav a 2s . c om*/ * @param doc the HTML document */ protected void externalizeLinks(Document doc, String tag, String attr) { Elements elms = doc.select(tag + "[" + attr + "]"); for (Element e : elms) { String url = e.absUrl(attr); if (url.length() == 0) { // Disable link e.attr(attr, "#"); continue; } // Update the link to be the absolute link e.attr(attr, url); } }
From source file:de.geeksfactory.opacclient.apis.Heidi.java
private SearchRequestResult parse_search(String html, int page) { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url);/* www . j a v a 2s .com*/ int results_total = 0; if (doc.select("#heiditreffer").size() > 0) { String resstr = doc.select("#heiditreffer").text(); String resnum = resstr.replaceAll("\\(([0-9.]+)([^0-9]*)\\)", "$1").replace(".", ""); results_total = Integer.parseInt(resnum); } Elements table = doc.select("table.treffer tr"); List<SearchResult> results = new ArrayList<>(); for (int i = 0; i < table.size(); i++) { Element tr = table.get(i); SearchResult sr = new SearchResult(); StringBuilder description = null; String author = ""; for (Element link : tr.select("a")) { String kk = getQueryParamsFirst(link.absUrl("href")).get("katkey"); if (kk != null) { sr.setId(kk); break; } } if (tr.select("span.Z3988").size() == 1) { // Luckily there is a <span class="Z3988"> item which provides // data in a standardized format. List<NameValuePair> z3988data; boolean hastitle = false; try { description = new StringBuilder(); z3988data = URLEncodedUtils .parse(new URI("http://dummy/?" 
+ tr.select("span.Z3988").attr("title")), "UTF-8"); for (NameValuePair nv : z3988data) { if (nv.getValue() != null) { if (!nv.getValue().trim().equals("")) { if (nv.getName().equals("rft.btitle") && !hastitle) { description.append("<b>").append(nv.getValue()).append("</b>"); hastitle = true; } else if (nv.getName().equals("rft.atitle") && !hastitle) { description.append("<b>").append(nv.getValue()).append("</b>"); hastitle = true; } else if (nv.getName().equals("rft.au")) { author = nv.getValue(); } else if (nv.getName().equals("rft.aufirst")) { author = author + ", " + nv.getValue(); } else if (nv.getName().equals("rft.aulast")) { author = nv.getValue(); } else if (nv.getName().equals("rft.date")) { description.append("<br />").append(nv.getValue()); } } } } } catch (URISyntaxException e) { description = null; } } if (!"".equals(author)) { author = author + "<br />"; } sr.setInnerhtml(author + description.toString()); if (tr.select(".kurzstat").size() > 0) { String stattext = tr.select(".kurzstat").first().text(); if (stattext.contains("ausleihbar")) { sr.setStatus(Status.GREEN); } else if (stattext.contains("online")) { sr.setStatus(Status.GREEN); } else if (stattext.contains("entliehen")) { sr.setStatus(Status.RED); } else if (stattext.contains("Prsenznutzung")) { sr.setStatus(Status.YELLOW); } else if (stattext.contains("bestellen")) { sr.setStatus(Status.YELLOW); } } if (tr.select(".typbild").size() > 0) { String typtext = tr.select(".typbild").first().text(); if (typtext.contains("Buch")) { sr.setType(MediaType.BOOK); } else if (typtext.contains("DVD-ROM")) { sr.setType(MediaType.CD_SOFTWARE); } else if (typtext.contains("Online-Ressource")) { sr.setType(MediaType.EDOC); } else if (typtext.contains("DVD")) { sr.setType(MediaType.DVD); } else if (typtext.contains("Film")) { sr.setType(MediaType.MOVIE); } else if (typtext.contains("Zeitschrift")) { sr.setType(MediaType.MAGAZINE); } else if (typtext.contains("Musiknoten")) { sr.setType(MediaType.SCORE_MUSIC); } 
else if (typtext.contains("Bildliche Darstellung")) { sr.setType(MediaType.ART); } else if (typtext.contains("Zeitung")) { sr.setType(MediaType.NEWSPAPER); } else if (typtext.contains("Karte")) { sr.setType(MediaType.MAP); } else if (typtext.contains("Mehrteilig")) { sr.setType(MediaType.PACKAGE_BOOKS); } } results.add(sr); } // TODO return new SearchRequestResult(results, results_total, page); }
From source file:crawler.HackerEarthCrawler.java
@Override public void crawl() { int flag = 0; //set of urls which should be crawled TreeSet<String> linksset = new TreeSet<String>(); TreeSet<String> tempset = new TreeSet<String>(); TreeSet<String> tutorialset = new TreeSet<String>(); //final set of problem urls TreeSet<String> problemset = new TreeSet<String>(); //visited for maintaing status of if url is already crawled or not TreeMap<String, Integer> visited = new TreeMap<String, Integer>(); //add base url linksset.add(baseUrl);// w w w.j av a2 s . c om //mark base url as not crawled visited.put(baseUrl, 0); try { while (true) { flag = 0; tempset.clear(); for (String str : linksset) { //check if url is already crawled or not and it has valid domain name if ((visited.get(str) == 0) && (str.startsWith("https://www.hackerearth.com/"))) { System.out.println("crawling " + str); //retriving response of current url as document Document doc = Jsoup.connect(str).timeout(0).userAgent( "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0") .referrer("http://www.google.com").ignoreHttpErrors(true).get(); //retriving all urls from current page Elements links = doc.select("a[href]"); //mark url as crawled visited.put(str, 1); //mark flag as url is crawled flag = 1; //retrive all urls for (Element link : links) { if (link.absUrl("href").endsWith("/tutorial/")) { tutorialset.add(link.absUrl("href")); } //check if url is problem url then add it in problemurlset if (link.absUrl("href").startsWith("https://www.hackerearth.com/") && isProblemUrl(link.absUrl("href"))) { problemset.add(link.absUrl("href")); } //check if url has valid domain and it has problem urls or not if (link.absUrl("href").contains(("https://www.hackerearth.com/")) && isCrawlable(link.absUrl("href"))) { //if link is not visited then mark it as uncrawled if (!visited.containsKey(link.absUrl("href"))) { visited.put(link.absUrl("href"), 0); } //add it in tempsetorary set tempset.add(link.absUrl("href")); //System.out.println("\n base: 
"+str+" ::: link : " + link.absUrl("href")); } } } } //if nothing is left to crawl break the loop if (flag == 0) { break; } //add all retrieved links to linksset linksset.addAll(tempset); } System.out.println("\n\ntotal problem urls " + problemset.size()); int i = 0; for (String str : problemset) { System.out.println("link " + i + " : " + str); i++; } } catch (IOException ex) { Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.SEVERE, null, ex); } //scrap and store into database //for every problem url scrap problem page for (String problemUrl : problemset) { System.out.println("problemUrl :" + problemUrl); try { //create problem class to store in database Problem problem = new Problem(); String problemSIOC = "", problemIOC = ""; String problemTitle = "", problemStatement = "", problemInput = "", problemOutput = "", problemConstraints = ""; String sampleInput = "", sampleOutput = ""; String problemExplanation = ""; //set default timelimit to 1 second double problemTimeLimit = 1.0; ArrayList<String> tags = new ArrayList<String>(); //get response for given problem url Response response = Jsoup.connect(problemUrl).execute(); Document doc = response.parse(); //retrieve problem title from page Element elementTitle = doc.getElementsByTag("title").first(); StringTokenizer stTitle = new StringTokenizer(elementTitle.text(), "|"); problemTitle = stTitle.nextToken().trim(); Element content = doc.getElementsByClass("starwars-lab").first(); problemSIOC = content.text(); Elements e = content.children(); //to find problem statement String breakloop[] = { "input", "input:", "input :", "input format:", "input format :", "input format", "Input and output", "constraints :", "constraints:", "constraints", "$$Input :$$" }; flag = 0; for (Element p : e) { String tempStatement = ""; for (Element pp : p.getAllElements()) { for (String strbreak : breakloop) { if (StringUtils.equalsIgnoreCase(pp.ownText(), strbreak)) { //System.out.println("strbreak :"+strbreak); tempStatement 
= p.text().substring(0, p.text().toLowerCase().indexOf(strbreak.toLowerCase())); // System.out.println("temp "+tempStatement); flag = 1; break; } } } if (flag == 1) { problemStatement += tempStatement; //remove extra space at end if (tempStatement.length() == 0) { problemStatement = problemStatement.substring(0, problemStatement.length() - 1); } break; } problemStatement += p.text() + " "; } System.out.println("problemSIOC :" + problemSIOC); System.out.println("problemStatement :" + problemStatement); if (problemStatement.length() <= problemSIOC.length()) { //remove problem statement from whole text and remove extra spaces at the beginning and the end problemIOC = problemSIOC.substring(problemStatement.length()).trim(); } else { problemIOC = ""; } System.out.println("problemIOC :" + problemIOC); //keywords for identifying input String decideInput[] = { "Input format :", "Input format:", "Input format", "inputformat:", "inputformat :", "inputformat", "input and output", "input :", "input:", "input" }; //keywords for identifying output String decideOutput[] = { "output format :", "output format:", "Output format", "outputformat:", "outputformat :", "outputformat", "output :", "output:", "output" }; //keywords for identifying constraint String decideConstraint[] = { "constraints:", "constraints :", "constraints", "Constraints :", "constraint:", "constraint :", "constraint", "Contraints :" }; int posin = 0, posoutput = 0, poscon = 0, idxin, idxout, idxcon, flaginput = 0, flagoutput = 0, flagcon = 0, inlen = 0, outlen = 0, conlen = 0; //find inputformat position,length of keyword for (idxin = 0; idxin < decideInput.length; idxin++) { if (StringUtils.containsIgnoreCase(problemIOC, decideInput[idxin])) { posin = problemIOC.toLowerCase().indexOf(decideInput[idxin].toLowerCase()); flaginput = 1; inlen = decideInput[idxin].length(); //decide it is keyowrd for actucal input or it is "sample input" if (StringUtils.containsIgnoreCase(problemIOC, "sample input")) { if (posin > 
problemIOC.toLowerCase().indexOf("sample input")) { flaginput = 0; inlen = 0; } else { break; } } else { break; } } } //find outputformat position,length of keyword for (idxout = 0; idxout < decideOutput.length; idxout++) { if (StringUtils.containsIgnoreCase(problemIOC, decideOutput[idxout])) { posoutput = problemIOC.toLowerCase().indexOf(decideOutput[idxout].toLowerCase()); flagoutput = 1; outlen = decideOutput[idxout].length(); break; } } //find constraint position,length of keyword for (idxcon = 0; idxcon < decideConstraint.length; idxcon++) { if (StringUtils.containsIgnoreCase(problemIOC, decideConstraint[idxcon])) { poscon = problemIOC.toLowerCase().indexOf(decideConstraint[idxcon].toLowerCase()); flagcon = 1; conlen = decideConstraint[idxcon].length(); break; } } System.out.println("input " + flaginput + " " + inlen + " " + posin); System.out.println("output " + flagoutput + " " + outlen + " " + posoutput); System.out.println("constraint " + flagcon + " " + conlen + " " + poscon); //retrieve problem input and output if present in problem page //if input format is present if (flaginput == 1) { //if input keyword is "input and output" and contraint is present in problem page if (idxin == 6 && flagcon == 1) { problemInput = problemIOC.substring(inlen, poscon); } //if input keyword is "input and output" and contraint is not present in problem page else if (idxin == 6 && flagcon == 0) { problemInput = problemIOC.substring(inlen); } //if output format and constraint is present else if (flagoutput == 1 && flagcon == 1) { //if constraint is present before input format if (poscon < posin) { problemInput = problemIOC.substring(posin + inlen, posoutput); problemOutput = problemIOC.substring(posoutput + outlen); } //if constraint is present before sample else if (poscon < posoutput) { problemInput = problemIOC.substring(inlen, poscon); problemOutput = problemIOC.substring(posoutput + outlen); } else { problemInput = problemIOC.substring(inlen, posoutput); problemOutput = 
problemIOC.substring(posoutput + outlen, poscon); } } //if constraint is not present else if (flagoutput == 1 && flagcon == 0) { problemInput = problemIOC.substring(inlen, posoutput); problemOutput = problemIOC.substring(posoutput + outlen); } else if (flagoutput == 0 && flagcon == 1) { if (poscon < posin) { problemInput = problemIOC.substring(posin + inlen); } else { problemInput = problemIOC.substring(poscon + conlen, posin); } problemOutput = ""; } else { problemInput = problemIOC.substring(inlen); problemOutput = ""; } } //if input format and output format is not present else { problemInput = ""; problemOutput = ""; } //if constraint is present if (flagcon == 1) { //if constraint is present before input format if (poscon < posin) { problemConstraints = problemIOC.substring(0, posin); } //if constraint is present before output format else if (poscon < posoutput) { problemConstraints = problemIOC.substring(poscon + conlen, posoutput); } else { problemConstraints = problemIOC.substring(poscon + conlen); } } System.out.println("problemInput :" + problemInput); System.out.println("problemOutput :" + problemOutput); System.out.println("problemConstraints :" + problemConstraints); //retrieve problem tags from problem page Element elementtag = doc.getElementsByClass("problem-tags").first().child(1); StringTokenizer st = new StringTokenizer(elementtag.text(), ","); while (st.hasMoreTokens()) { tags.add(st.nextToken().trim()); } //retrieve sample input sample output if present Element elementSIO = doc.getElementsByClass("input-output-container").first(); //if sample input output is present if (elementSIO != null) { //find position of sample output int soutpos = elementSIO.text().indexOf("SAMPLE OUTPUT"); sampleInput = elementSIO.text().substring(12, soutpos); sampleOutput = elementSIO.text().substring(soutpos + 13); System.out.println("Sample input :\n" + sampleInput + "\n\n\n"); System.out.println("Sample Output :\n" + sampleOutput); } else { sampleInput = ""; 
sampleOutput = ""; } //retrieve problem explanation from problem page if present Element elementExplanation = doc.getElementsByClass("standard-margin").first().child(0); if (elementExplanation.text().toLowerCase().contains("explanation")) { problemExplanation = elementExplanation.nextElementSibling().text(); } System.out.println("Explanation :" + problemExplanation); //retrieve timelimit Element elementTL = doc.getElementsByClass("problem-guidelines").first().child(0).child(1); StringTokenizer stTL = new StringTokenizer(elementTL.ownText(), " "); problemTimeLimit = Double.parseDouble(stTL.nextToken()); //System.out.println("problemTimeLimit :"+problemTimeLimit); //set all retrieved information to problem class problem.setProblemUrl(problemUrl); if (problemTitle.length() == 0) { problemTitle = null; } if (problemStatement.length() == 0) { problemStatement = null; } if (problemInput.length() == 0) { problemInput = null; } if (problemOutput.length() == 0) { problemOutput = null; } if (problemExplanation.length() == 0) { problemExplanation = null; } if (problemConstraints.length() == 0) { problemConstraints = null; } problem.setTitle(problemTitle); problem.setProblemUrl(problemUrl); problem.setProblemStatement(problemStatement); problem.setInputFormat(problemInput); problem.setOutputFormat(problemOutput); problem.setTimeLimit(problemTimeLimit); problem.setExplanation(problemExplanation); problem.setConstraints(problemConstraints); //set sample input output to problem class SampleInputOutput sampleInputOutput = new SampleInputOutput(problem, sampleInput, sampleOutput); problem.getSampleInputOutputs().add(sampleInputOutput); //set platform as hackerearth problem.setPlatform(Platform.HackerEarth); for (String strtag : tags) { problem.getTags().add(strtag); } //store in database Session session = null; Transaction transaction = null; try { //start session session = HibernateUtil.getSessionFactory().openSession(); transaction = session.beginTransaction(); //check if problem 
is already stored in database String hql = "FROM Problem p where p.problemUrl = :problem_url"; Problem oldProblem = (Problem) session.createQuery(hql).setString("problem_url", problemUrl) .uniqueResult(); String task; //if problem is present in database if (oldProblem != null) { //update the old problem task = "updated"; //retrieve id of old problem problem.setId(oldProblem.getId()); session.delete(oldProblem); session.flush(); session.save(problem); } else { task = "saved"; session.save(problem); } transaction.commit(); //log the info to console Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}", new Object[] { task, problem.getProblemUrl() }); } catch (HibernateException ee) { if (transaction != null) { transaction.rollback(); } Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE, "Cannot Insert/Update problem into databse: " + problemUrl, e); } finally { //close the session if (session != null) { session.close(); } } } catch (Exception ee) { System.out.println(ee.toString()); } } System.out.println("\n\n\n\ntutorial urls\n\n"); try { for (String tutorialurl : tutorialset) { //System.out.println(tutorialurl+"\n\n"); Response tutorialres = Jsoup.connect(tutorialurl).execute(); Document doc = tutorialres.parse(); Tutorial tutorial = new Tutorial(); tutorial.setContent(doc.getElementsByClass("tutorial").first().text()); tutorial.setName(baseUrl); tutorialurl = tutorialurl.substring(0, tutorialurl.length() - 10); StringTokenizer tutorialtok = new StringTokenizer(tutorialurl, "/"); String tempstr = ""; while (tutorialtok.hasMoreTokens()) { tempstr = tutorialtok.nextToken(); } Session session = null; Transaction transaction = null; try { //start session session = HibernateUtil.getSessionFactory().openSession(); transaction = session.beginTransaction(); //check if problem is already stored in database String hql = "FROM Tutorial p where p.name = :name"; Tutorial oldProblem = (Tutorial) 
session.createQuery(hql).setString("name", tempstr) .uniqueResult(); String task; //if problem is present in database if (oldProblem != null) { //update the old problem task = "updated"; //retrieve id of old problem tutorial.setName(oldProblem.getName()); session.delete(oldProblem); session.flush(); session.save(tutorial); } else { task = "saved"; tutorial.setName(tempstr); session.save(tutorial); } transaction.commit(); //log the info to console Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}", new Object[] { task, tutorial.getName() }); } catch (HibernateException ee) { if (transaction != null) { transaction.rollback(); } Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE, "Cannot Insert/Update problem into databse: " + tempstr, ee); } finally { //close the session if (session != null) { session.close(); } } } } catch (Exception e) { System.out.println(e.getMessage()); } }
From source file:de.geeksfactory.opacclient.apis.Pica.java
protected DetailledItem parse_result(String html) { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url);/* w w w . ja v a 2 s . c om*/ DetailledItem result = new DetailledItem(); for (Element a : doc.select("a[href*=PPN")) { Map<String, String> hrefq = getQueryParamsFirst(a.absUrl("href")); String ppn = hrefq.get("PPN"); result.setId(ppn); break; } // GET COVER if (doc.select("td.preslabel:contains(ISBN) + td.presvalue").size() > 0) { Element isbnElement = doc.select("td.preslabel:contains(ISBN) + td.presvalue").first(); String isbn = ""; for (Node child : isbnElement.childNodes()) { if (child instanceof TextNode) { isbn = ((TextNode) child).text().trim(); break; } } result.setCover(ISBNTools.getAmazonCoverURL(isbn, true)); } // GET TITLE AND SUBTITLE String titleAndSubtitle; Element titleAndSubtitleElem = null; String titleRegex = ".*(Titel|Aufsatz|Zeitschrift|Gesamttitel" + "|Title|Article|Periodical|Collective\\stitle" + "|Titre|Article|P.riodique|Titre\\sg.n.ral).*"; String selector = "td.preslabel:matches(" + titleRegex + ") + td.presvalue"; if (doc.select(selector).size() > 0) { titleAndSubtitleElem = doc.select(selector).first(); titleAndSubtitle = titleAndSubtitleElem.text().trim(); int slashPosition = Math.min(titleAndSubtitle.indexOf("/"), titleAndSubtitle.indexOf(":")); String title; if (slashPosition > 0) { title = titleAndSubtitle.substring(0, slashPosition).trim(); String subtitle = titleAndSubtitle.substring(slashPosition + 1).trim(); result.addDetail(new Detail(stringProvider.getString(StringProvider.SUBTITLE), subtitle)); } else { title = titleAndSubtitle; } result.setTitle(title); } else { result.setTitle(""); } // Details int line = 0; Elements lines = doc.select("td.preslabel + td.presvalue"); if (titleAndSubtitleElem != null) { lines.remove(titleAndSubtitleElem); } for (Element element : lines) { Element titleElem = element.firstElementSibling(); String detail = ""; if (element.select("div").size() > 1 && 
element.select("div").text().equals(element.text())) { boolean first = true; for (Element div : element.select("div")) { if (!div.text().replace("\u00a0", " ").trim().equals("")) { if (!first) { detail += "\n" + div.text().replace("\u00a0", " ").trim(); } else { detail += div.text().replace("\u00a0", " ").trim(); first = false; } } } } else { detail = element.text().replace("\u00a0", " ").trim(); } String title = titleElem.text().replace("\u00a0", " ").trim(); if (element.select("hr").size() > 0) // after the separator we get the copies { break; } if (detail.length() == 0 && title.length() == 0) { line++; continue; } if (title.contains(":")) { title = title.substring(0, title.indexOf(":")); // remove colon } result.addDetail(new Detail(title, detail)); if (element.select("a").size() == 1 && !element.select("a").get(0).text().trim().equals("")) { String url = element.select("a").first().absUrl("href"); if (!url.startsWith(opac_url)) { result.addDetail(new Detail(stringProvider.getString(StringProvider.LINK), url)); } } line++; } line++; // next line after separator // Copies Copy copy = new Copy(); String location = ""; // reservation info will be stored as JSON JSONArray reservationInfo = new JSONArray(); while (line < lines.size()) { Element element = lines.get(line); if (element.select("hr").size() == 0) { Element titleElem = element.firstElementSibling(); String detail = element.text().trim(); String title = titleElem.text().replace("\u00a0", " ").trim(); if (detail.length() == 0 && title.length() == 0) { line++; continue; } if (title.contains("Standort") || title.contains("Vorhanden in") || title.contains("Location")) { location += detail; } else if (title.contains("Sonderstandort")) { location += " - " + detail; } else if (title.contains("Systemstelle") || title.contains("Subject")) { copy.setDepartment(detail); } else if (title.contains("Fachnummer") || title.contains("locationnumber")) { copy.setLocation(detail); } else if (title.contains("Signatur") || 
title.contains("Shelf mark")) { copy.setShelfmark(detail); } else if (title.contains("Anmerkung")) { location += " (" + detail + ")"; } else if (title.contains("Link")) { result.addDetail(new Detail(title.replace(":", "").trim(), detail)); } else if (title.contains("Status") || title.contains("Ausleihinfo") || title.contains("Ausleihstatus") || title.contains("Request info")) { // Find return date Pattern pattern = Pattern.compile("(till|bis) (\\d{2}-\\d{2}-\\d{4})"); Matcher matcher = pattern.matcher(detail); if (matcher.find()) { DateTimeFormatter fmt = DateTimeFormat.forPattern("dd-MM-yyyy").withLocale(Locale.GERMAN); try { copy.setStatus(detail.substring(0, matcher.start() - 1).trim()); copy.setReturnDate(fmt.parseLocalDate(matcher.group(2))); } catch (IllegalArgumentException e) { e.printStackTrace(); copy.setStatus(detail); } } else { copy.setStatus(detail); } // Get reservation info if (element.select("a:has(img[src*=inline_arrow])").size() > 0) { Element a = element.select("a:has(img[src*=inline_arrow])").first(); boolean multipleCopies = a.text().matches(".*(Exemplare|Volume list).*"); JSONObject reservation = new JSONObject(); try { reservation.put("multi", multipleCopies); reservation.put("link", _extract_url(a.absUrl("href"))); reservation.put("desc", location); reservationInfo.put(reservation); } catch (JSONException e1) { e1.printStackTrace(); } result.setReservable(true); } } } else { copy.setBranch(location); result.addCopy(copy); location = ""; copy = new Copy(); } line++; } if (copy.notEmpty()) { copy.setBranch(location); result.addCopy(copy); } if (reservationInfo.length() == 0) { // No reservation info found yet, because we didn't find any copies. // If there is a reservation link somewhere in the rows we interpreted // as details, we still want to use it. 
if (doc.select("td a:has(img[src*=inline_arrow])").size() > 0) { Element a = doc.select("td a:has(img[src*=inline_arrow])").first(); boolean multipleCopies = a.text().matches(".*(Exemplare|Volume list).*"); JSONObject reservation = new JSONObject(); try { reservation.put("multi", multipleCopies); reservation.put("link", _extract_url(a.attr("href"))); reservation.put("desc", location); reservationInfo.put(reservation); } catch (JSONException e1) { e1.printStackTrace(); } result.setReservable(true); } } result.setReservation_info(reservationInfo.toString()); // Volumes if (doc.select("a[href^=FAM?PPN=]").size() > 0) { String href = doc.select("a[href^=FAM?PPN=]").attr("href"); String ppn = getQueryParamsFirst(href).get("PPN"); Map<String, String> data = new HashMap<>(); data.put("ppn", ppn); result.setVolumesearch(data); } return result; }
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
/**
 * Parses the HTML of a detail page into a {@link DetailledItem}.
 *
 * <p>Reads, in this order: the cover image, the title, the labelled detail rows, the centered
 * detail links, the copy table, the volume links and the reservation link. Problems while
 * reading the copy table or the volume links are printed and do not abort the parse.
 *
 * @param html HTML source of the detail page
 * @return the item filled with everything that could be read from the page
 */
protected DetailledItem parse_result(String html) {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);

    DetailledItem result = new DetailledItem();

    Elements covers = doc.select(".detail_cover img");
    if (covers.size() == 1) {
        result.setCover(covers.get(0).attr("src"));
    }

    result.setTitle(doc.select(".detail_titel").text());

    for (Element tr : doc.select(".detailzeile table tr")) {
        // Element.child(int) throws for a missing cell, so rows that do not have both a label
        // cell and a value cell are skipped instead of crashing the whole parse.
        if (tr.children().size() < 2 || !tr.child(0).hasClass("detail_feld")) {
            continue;
        }
        String title = tr.child(0).text();
        String content = tr.child(1).text();
        Elements valueLinks = tr.child(1).select("a");

        if (title.equals("Gesamtwerk:") || title.equals("Erschienen in:")) {
            // These rows link to the containing work; its id is the "MedienNr" query parameter.
            if (valueLinks.size() > 0) {
                try {
                    List<NameValuePair> query =
                            URLEncodedUtils.parse(new URI(valueLinks.first().absUrl("href")), "UTF-8");
                    for (NameValuePair q : query) {
                        if (q.getName().equals("MedienNr")) {
                            result.setCollectionId(q.getValue());
                        }
                    }
                } catch (URISyntaxException e) {
                    // A malformed link only means that no collection id is set.
                    e.printStackTrace();
                }
            }
        } else {
            if (content.contains("hier klicken") && valueLinks.size() > 0) {
                content += " " + valueLinks.first().attr("href");
            }
            result.addDetail(new Detail(title, content));
        }
    }

    for (Element a : doc.select(".detailzeile_center a.detail_link")) {
        result.addDetail(new Detail(a.text().trim(), a.absUrl("href")));
    }

    try {
        // Maps a copy field name to the index of the table column that holds it.
        JSONObject copymap;
        if (data.has("copiestable")) {
            copymap = data.getJSONObject("copiestable");
        } else {
            copymap = new JSONObject();
            Elements ths = doc.select(".exemplartab .exemplarmenubar th");
            for (int i = 0; i < ths.size(); i++) {
                String head = ths.get(i).text().trim();
                if (head.equals("Zweigstelle")) {
                    copymap.put("branch", i);
                } else if (head.equals("Abteilung")) {
                    copymap.put("department", i);
                } else if (head.equals("Bereich") || head.equals("Standort")) {
                    copymap.put("location", i);
                } else if (head.equals("Signatur")) {
                    copymap.put("signature", i);
                } else if (head.equals("Barcode") || head.equals("Medien-Nummer")) {
                    copymap.put("barcode", i);
                } else if (head.equals("Status")) {
                    copymap.put("status", i);
                } else if (head.equals("Frist") || head.matches("Verf.+gbar")) {
                    copymap.put("returndate", i);
                } else if (head.equals("Vorbestellungen") || head.equals("Reservierungen")) {
                    copymap.put("reservations", i);
                }
            }
        }

        Elements exemplartrs = doc.select(".exemplartab .tabExemplar, .exemplartab .tabExemplar_");
        DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
        for (Element tr : exemplartrs) {
            Copy copy = new Copy();
            int cellCount = tr.children().size();
            Iterator<?> keys = copymap.keys();
            while (keys.hasNext()) {
                String key = (String) keys.next();
                int index;
                try {
                    index = copymap.getInt(key);
                } catch (JSONException e1) {
                    index = -1;
                }
                // A row with fewer cells than the mapped column keeps its other fields instead of
                // aborting this and all following copies through the outer catch.
                if (index >= 0 && index < cellCount) {
                    try {
                        copy.set(key, tr.child(index).text(), fmt);
                    } catch (IllegalArgumentException e) {
                        e.printStackTrace();
                    }
                }
            }
            result.addCopy(copy);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    try {
        for (Element volumeLink : doc.select("table .tabBand a")) {
            // The volume id is the text after the first "=" of the href; links without one are
            // skipped so that they cannot hide the volumes listed after them.
            String[] hrefParts = volumeLink.attr("href").split("=");
            if (hrefParts.length < 2) {
                continue;
            }
            Volume volume = new Volume();
            volume.setId(hrefParts[1]);
            volume.setTitle(volumeLink.text());
            result.addVolume(volume);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    Elements reservationLinks = doc.select(".detail_vorbest a");
    if (reservationLinks.size() == 1) {
        result.setReservable(true);
        result.setReservation_info(reservationLinks.attr("href"));
    }
    return result;
}
From source file:de.geeksfactory.opacclient.apis.SISIS.java
protected DetailledItem parse_result(String html) throws IOException { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url);/* w w w .j a va 2s. c om*/ String html2 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showTitleActive", ENCODING); Document doc2 = Jsoup.parse(html2); doc2.setBaseUri(opac_url); String html3 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showAvailabilityActive", ENCODING); Document doc3 = Jsoup.parse(html3); doc3.setBaseUri(opac_url); DetailledItem result = new DetailledItem(); try { result.setId(doc.select("#bibtip_id").text().trim()); } catch (Exception ex) { ex.printStackTrace(); } List<String> reservationlinks = new ArrayList<>(); for (Element link : doc3.select("#vormerkung a, #tab-content a")) { String href = link.absUrl("href"); Map<String, String> hrefq = getQueryParamsFirst(href); if (result.getId() == null) { // ID retrieval String key = hrefq.get("katkey"); if (key != null) { result.setId(key); break; } } // Vormerken if (hrefq.get("methodToCall") != null) { if (hrefq.get("methodToCall").equals("doVormerkung") || hrefq.get("methodToCall").equals("doBestellung")) { reservationlinks.add(href.split("\\?")[1]); } } } if (reservationlinks.size() == 1) { result.setReservable(true); result.setReservation_info(reservationlinks.get(0)); } else if (reservationlinks.size() == 0) { result.setReservable(false); } else { // TODO: Multiple options - handle this case! 
} if (doc.select(".data td img").size() == 1) { result.setCover(doc.select(".data td img").first().attr("abs:src")); try { downloadCover(result); } catch (Exception e) { } } if (doc.select(".aw_teaser_title").size() == 1) { result.setTitle(doc.select(".aw_teaser_title").first().text().trim()); } else if (doc.select(".data td strong").size() > 0) { result.setTitle(doc.select(".data td strong").first().text().trim()); } else { result.setTitle(""); } if (doc.select(".aw_teaser_title_zusatz").size() > 0) { result.addDetail(new Detail("Titelzusatz", doc.select(".aw_teaser_title_zusatz").text().trim())); } String title = ""; String text = ""; boolean takeover = false; Element detailtrs = doc2.select(".box-container .data td").first(); for (Node node : detailtrs.childNodes()) { if (node instanceof Element) { if (((Element) node).tagName().equals("strong")) { title = ((Element) node).text().trim(); text = ""; } else { if (((Element) node).tagName().equals("a") && (((Element) node).text().trim().contains("hier klicken") || title.equals("Link:"))) { text = text + node.attr("href"); takeover = true; break; } } } else if (node instanceof TextNode) { text = text + ((TextNode) node).text(); } } if (!takeover) { text = ""; title = ""; } detailtrs = doc2.select("#tab-content .data td").first(); if (detailtrs != null) { for (Node node : detailtrs.childNodes()) { if (node instanceof Element) { if (((Element) node).tagName().equals("strong")) { if (!text.equals("") && !title.equals("")) { result.addDetail(new Detail(title.trim(), text.trim())); if (title.equals("Titel:")) { result.setTitle(text.trim()); } text = ""; } title = ((Element) node).text().trim(); } else { if (((Element) node).tagName().equals("a") && (((Element) node).text().trim().contains("hier klicken") || title.equals("Link:"))) { text = text + node.attr("href"); } else { text = text + ((Element) node).text(); } } } else if (node instanceof TextNode) { text = text + ((TextNode) node).text(); } } } else { if 
(doc2.select("#tab-content .fulltitle tr").size() > 0) { Elements rows = doc2.select("#tab-content .fulltitle tr"); for (Element tr : rows) { if (tr.children().size() == 2) { Element valcell = tr.child(1); String value = valcell.text().trim(); if (valcell.select("a").size() == 1) { value = valcell.select("a").first().absUrl("href"); } result.addDetail(new Detail(tr.child(0).text().trim(), value)); } } } else { result.addDetail(new Detail(stringProvider.getString(StringProvider.ERROR), stringProvider.getString(StringProvider.COULD_NOT_LOAD_DETAIL))); } } if (!text.equals("") && !title.equals("")) { result.addDetail(new Detail(title.trim(), text.trim())); if (title.equals("Titel:")) { result.setTitle(text.trim()); } } for (Element link : doc3.select("#tab-content a")) { Map<String, String> hrefq = getQueryParamsFirst(link.absUrl("href")); if (result.getId() == null) { // ID retrieval String key = hrefq.get("katkey"); if (key != null) { result.setId(key); break; } } } for (Element link : doc3.select(".box-container a")) { if (link.text().trim().equals("Download")) { result.addDetail( new Detail(stringProvider.getString(StringProvider.DOWNLOAD), link.absUrl("href"))); } } Map<String, Integer> copy_columnmap = new HashMap<>(); // Default values copy_columnmap.put("barcode", 1); copy_columnmap.put("branch", 3); copy_columnmap.put("status", 4); Elements copy_columns = doc.select("#tab-content .data tr#bg2 th"); for (int i = 0; i < copy_columns.size(); i++) { Element th = copy_columns.get(i); String head = th.text().trim(); if (head.contains("Status")) { copy_columnmap.put("status", i); } if (head.contains("Zweigstelle")) { copy_columnmap.put("branch", i); } if (head.contains("Mediennummer")) { copy_columnmap.put("barcode", i); } if (head.contains("Standort")) { copy_columnmap.put("location", i); } if (head.contains("Signatur")) { copy_columnmap.put("signature", i); } } Pattern status_lent = Pattern.compile( "^(entliehen) bis ([0-9]{1,2}.[0-9]{1,2}.[0-9]{2," + "4}) 
\\(gesamte Vormerkungen: ([0-9]+)\\)$"); Pattern status_and_barcode = Pattern.compile("^(.*) ([0-9A-Za-z]+)$"); Elements exemplartrs = doc.select("#tab-content .data tr").not("#bg2"); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); for (Element tr : exemplartrs) { try { Copy copy = new Copy(); Element status = tr.child(copy_columnmap.get("status")); Element barcode = tr.child(copy_columnmap.get("barcode")); String barcodetext = barcode.text().trim().replace(" Wegweiser", ""); // STATUS String statustext; if (status.getElementsByTag("b").size() > 0) { statustext = status.getElementsByTag("b").text().trim(); } else { statustext = status.text().trim(); } if (copy_columnmap.get("status").equals(copy_columnmap.get("barcode"))) { Matcher matcher1 = status_and_barcode.matcher(statustext); if (matcher1.matches()) { statustext = matcher1.group(1); barcodetext = matcher1.group(2); } } Matcher matcher = status_lent.matcher(statustext); if (matcher.matches()) { copy.setStatus(matcher.group(1)); copy.setReservations(matcher.group(3)); copy.setReturnDate(fmt.parseLocalDate(matcher.group(2))); } else { copy.setStatus(statustext); } copy.setBarcode(barcodetext); if (status.select("a[href*=doVormerkung]").size() == 1) { copy.setResInfo(status.select("a[href*=doVormerkung]").attr("href").split("\\?")[1]); } String branchtext = tr.child(copy_columnmap.get("branch")).text().trim().replace(" Wegweiser", ""); copy.setBranch(branchtext); if (copy_columnmap.containsKey("location")) { copy.setLocation( tr.child(copy_columnmap.get("location")).text().trim().replace(" Wegweiser", "")); } if (copy_columnmap.containsKey("signature")) { copy.setShelfmark( tr.child(copy_columnmap.get("signature")).text().trim().replace(" Wegweiser", "")); } result.addCopy(copy); } catch (Exception ex) { ex.printStackTrace(); } } try { Element isvolume = null; Map<String, String> volume = new HashMap<>(); Elements links = doc.select(".data td a"); int elcount = 
links.size(); for (int eli = 0; eli < elcount; eli++) { List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(links.get(eli).attr("href")), "UTF-8"); for (NameValuePair nv : anyurl) { if (nv.getName().equals("methodToCall") && nv.getValue().equals("volumeSearch")) { isvolume = links.get(eli); } else if (nv.getName().equals("catKey")) { volume.put("catKey", nv.getValue()); } else if (nv.getName().equals("dbIdentifier")) { volume.put("dbIdentifier", nv.getValue()); } } if (isvolume != null) { volume.put("volume", "true"); result.setVolumesearch(volume); break; } } } catch (Exception e) { e.printStackTrace(); } return result; }
From source file:com.weavers.duqhan.business.impl.ProductServiceImpl.java
@Override public void loadTempProducts(List<StatusBean> statusBeans) { boolean isSuccess = true; String startDate = new Date().toString(); Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, "(==I==)DATE: " + startDate + "Store product details in temp product table start....."); try {/*from www . j a v a 2s .com*/ String status = ""; for (StatusBean statusBean : statusBeans) { status = "Link duplicate"; Temtproductlinklist temtproductlinklist = temtproductlinklistDao.loadById(statusBean.getId()); if (temtproductlinklist != null && temtproductlinklist.getStatus() == 0) { Product testProduct = productDao.getProductByExternelLink(temtproductlinklist.getLink()); if (testProduct == null) { String value = ""; Elements detailMain; Elements detailSub; Elements specifics; double votes = 0.0; double stars = 0.0; double feedback = 0.0; String url = temtproductlinklist.getLink(); try { testProduct = new Product(); Product savedTestProduct; //=================== Random sleep START ===================// // TimeUnit.SECONDS.sleep(30 + (int) (Math.random() * 100)); Random randomObj = new Random(); TimeUnit.SECONDS.sleep(randomObj.ints(30, 60).findFirst().getAsInt()); //=================== Random sleep END =====================// Document doc = Jsoup.connect(url).get(); detailMain = doc.select("#j-detail-page"); if (!detailMain.isEmpty()) { //=================== Criteria Block START==================// detailMain = doc.select(".rantings-num"); if (!detailMain.isEmpty()) { votes = Double.valueOf(detailMain.text().split(" votes")[0].split("\\(")[1]); } detailMain = doc.select(".percent-num"); if (!detailMain.isEmpty()) { stars = Double.valueOf(detailMain.text()); } detailMain = doc.select("ul.ui-tab-nav li[data-trigger='feedback'] a"); if (!detailMain.isEmpty()) { feedback = Double.valueOf(detailMain.text().split("\\(")[1].split("\\)")[0]); } //=================== Criteria Block END==================// if (votes > 10.0 && stars > 4.0 && feedback > 4.0) { detailMain 
= doc.select(".detail-wrap .product-name"); testProduct.setName(detailMain .text());/*.substring(0, Math.min(detailMain.text().length(), 50))*/ detailMain = doc.select(".detail-wrap .product-name"); testProduct.setDescription(detailMain.text()); testProduct.setExternalLink(url); testProduct.setVendorId(1l);//?????????????????????? //=================== Packaging block START==================// Double weight = 1.0; Double width = 1.0; Double height = 1.0; Double length = 1.0; detailMain = doc.select( "div#j-product-desc div.pnl-packaging-main ul li.packaging-item"); for (Element element : detailMain) { String packagingTitle = element.select("span.packaging-title").text(); String packagingDesc = element.select("span.packaging-des").text(); if (packagingTitle.trim().equals("Package Weight:")) { String str = packagingDesc; str = str.replaceAll("[^.?0-9]+", " "); if (Arrays.asList(str.trim().split(" ")) != null) { if (!Arrays.asList(str.trim().split(" ")).isEmpty()) { try { weight = Double.parseDouble( Arrays.asList(str.trim().split(" ")).get(0)); } catch (Exception e) { weight = 1.0; } } } System.out.println("weight == " + weight); } else if (packagingTitle.trim().equals("Package Size:")) { String str = packagingDesc; str = str.replaceAll("[^.?0-9]+", " "); if (Arrays.asList(str.trim().split(" ")) != null) { if (!Arrays.asList(str.trim().split(" ")).isEmpty()) { try { width = Double.parseDouble( Arrays.asList(str.trim().split(" ")).get(0)); height = Double.parseDouble( Arrays.asList(str.trim().split(" ")).get(1)); length = Double.parseDouble( Arrays.asList(str.trim().split(" ")).get(2)); } catch (Exception e) { width = 1.0; height = 1.0; length = 1.0; } } } System.out.println("width == " + width); System.out.println("height == " + height); System.out.println("length == " + length); } } //=================== Packaging block END==================// //=================== Category block START==================// detailMain = doc.select("div.ui-breadcrumb div.container a"); 
Long productCategoryId = 0L; String parentPath = ""; String thisCategory = detailMain.last().text().trim(); System.out.println("thisCategory == " + thisCategory); Category parentCategory = new Category(); parentCategory.setId(0L); parentCategory.setParentPath(""); for (Element element : detailMain) { String newCategory; newCategory = element.text().trim(); System.out.println("newCategory======" + newCategory); if (newCategory.equals("Home") || newCategory.equals("All Categories")) { } else { Category category = categoryDao.getCategoryByName(newCategory); if (category != null) { if (category.getName().equals(thisCategory)) { productCategoryId = category.getId(); parentPath = category.getParentPath(); } parentCategory = category; } else { category = new Category(); category.setId(null); category.setName(newCategory); category.setParentId(parentCategory.getId()); category.setParentPath(parentCategory.getParentPath() + parentCategory.getId() + "="); category.setQuantity(0); category.setImgUrl("-"); category.setDisplayText(newCategory); Category category2 = categoryDao.save(category); if (category.getName().equals(thisCategory)) { productCategoryId = category2.getId(); parentPath = category2.getParentPath(); } parentCategory = category2; } } } //=================== Category block END==================// //=============== Specifications block START==============// detailMain = doc.select(".product-property-list .property-item"); String specifications = ""; for (Element element : detailMain) { specifications = specifications + element.select(".propery-title").get(0).text().replace(",", "/") .replace(":", "-") + ":" + element.select(".propery-des").get(0).text() .replace(",", "/").replace(":", "-") + ",";//TODO:, check } //=============== Specifications Block END==============// //=============== Shipping Time Block START==============// String shippingTime = ""; detailMain = doc.select(".shipping-days[data-role='delivery-days']"); System.out.println("value detailMain" + 
detailMain.toString()); shippingTime = detailMain.text(); //=============== Shipping Time Block END==============// //=============== Shipping Cost Block START==============// detailMain = doc.select(".logistics-cost"); value = detailMain.text(); if (!value.equalsIgnoreCase("Free Shipping")) { // f = 0.00; } else { // f = Double.parseDouble(value.replaceAll(".*?([\\d.]+).*", "$1")); } //=============== Shipping Cost Block END==============// //=================Product save 1st START==============// testProduct.setCategoryId(productCategoryId); testProduct.setLastUpdate(new Date()); testProduct.setParentPath(parentPath); testProduct.setImgurl("-"); testProduct.setProperties("-"); testProduct.setProductWidth(width); testProduct.setProductLength(length); testProduct.setProductWeight(weight); testProduct.setProductHeight(height); testProduct.setShippingRate(0.0); testProduct.setShippingTime("45"); testProduct.setSpecifications(specifications); savedTestProduct = productDao.save(testProduct); //====================Product save 1st END==============// //========= Property, Property Value, Property Product Map Block START ========// double discountPrice = 0.0; double actualPrice = 0.0; double markupPrice = 0.0; String id = ""; String allProperties = ""; //------------------------Read Color css START---------------------// specifics = doc.select("#j-product-info-sku dl.p-property-item"); Elements cssdetailMain = doc.select("link[href]"); Document cssdoc = new Document(""); System.out.println( "====================================================cssdetailMain" + cssdetailMain.size()); for (Element element : cssdetailMain) { String cssurl = element.attr("abs:href"); if (cssurl.contains("??main-detail")) { try { cssdoc = Jsoup.connect(cssurl).get(); } catch (IOException ex) { } break; } } //-----------------------Read Color css END--------------------------// //-----------Product Property, Property Value START--------// Map<String, ProductPropertyvalues> propertyValuesMap = 
new HashMap<>(); if (!specifics.isEmpty()) { ProductProperties testPorperties; ProductProperties saveTestPorperties; ProductPropertyvalues testPropertyValues; for (Element specific : specifics) { System.out.println("head ==== " + specific.select("dt").text()); testPorperties = productPropertiesDao .loadByName(specific.select("dt").text()); if (testPorperties == null) { testPorperties = new ProductProperties(); testPorperties.setPropertyName(specific.select("dt").text()); saveTestPorperties = productPropertiesDao.save(testPorperties); } else { saveTestPorperties = testPorperties; } allProperties = allProperties + saveTestPorperties.getId().toString() + "-"; detailSub = specific.select("dd ul li"); String valu = "-"; for (Element element : detailSub) { testPropertyValues = new ProductPropertyvalues(); id = element.select("a[data-sku-id]").attr("data-sku-id").trim(); testPropertyValues.setRefId(id); if (element.hasClass("item-sku-image")) { valu = element.select("a img[src]").get(0).absUrl("src") .split(".jpg")[0] + ".jpg"; String title = element.select("a img").get(0).attr("title"); String imgUrl = GoogleBucketFileUploader .uploadProductImage(valu, savedTestProduct.getId()); valu = "<img src='" + imgUrl + "' title='" + title + "' style='height:40px; width:40px;'/>"; } else if (element.hasClass("item-sku-color")) { String style = cssdoc.html().split("sku-color-" + id)[1] .split("}")[0].substring(1); valu = "<span style='" + style + "' ; height:40px; width:40px; display:block;'></span>"; } else { valu = element.select("a span").toString(); } System.out.println("valu === " + valu); testPropertyValues.setProductId(savedTestProduct.getId()); testPropertyValues.setPropertyId(saveTestPorperties.getId()); testPropertyValues.setValueName(valu); propertyValuesMap.put(id, productPropertyvaluesDao.save(testPropertyValues)); } } savedTestProduct.setProperties(allProperties); } //-----------Product Property, Property Value END--------// //----------------------Read json 
START------------------// List<AxpProductDto> axpProductDtos = new ArrayList<>(); Elements scripts = doc.select("script"); // Get the script part for (Element script : scripts) { if (script.html().contains("var skuProducts=")) { String jsonData = ""; jsonData = script.html().split("var skuProducts=")[1] .split("var GaData")[0].trim(); jsonData = jsonData.substring(0, jsonData.length() - 1); Gson gsonObj = new Gson(); axpProductDtos = Arrays .asList(gsonObj.fromJson(jsonData, AxpProductDto[].class)); break; } } //----------------------Read json END------------------// //-------------Product Properties Map START------------// for (AxpProductDto thisAxpProductDto : axpProductDtos) { SkuVal skuVal = thisAxpProductDto.getSkuVal(); if (skuVal.getActSkuCalPrice() != null) { value = skuVal.getActSkuCalPrice().trim(); discountPrice = CurrencyConverter.usdTOinr( Double.parseDouble(value.replaceAll(".*?([\\d.]+).*", "$1"))); value = skuVal.getSkuCalPrice().trim(); actualPrice = CurrencyConverter.usdTOinr( Double.parseDouble(value.replaceAll(".*?([\\d.]+).*", "$1"))); markupPrice = discountPrice * 0.15 + 100; discountPrice = Math.ceil((discountPrice + markupPrice) / 10) * 10; actualPrice = Math.round(actualPrice + markupPrice); } else { discountPrice = 0.0; value = skuVal.getSkuCalPrice().trim(); actualPrice = CurrencyConverter.usdTOinr( Double.parseDouble(value.replaceAll(".*?([\\d.]+).*", "$1"))); markupPrice = actualPrice * 0.15 + 100; discountPrice = Math.round(actualPrice + markupPrice); actualPrice = Math.round(actualPrice + markupPrice); } ProductPropertiesMap productPropertyMap = new ProductPropertiesMap(); String myPropValueIds = ""; if (thisAxpProductDto.getSkuAttr() != null) { String[] skuPropIds = thisAxpProductDto.getSkuPropIds().split(","); for (String skuPropId : skuPropIds) { myPropValueIds = myPropValueIds + propertyValuesMap.get(skuPropId).getId().toString() + "_"; } productPropertyMap.setPropertyvalueComposition(myPropValueIds); } else { 
productPropertyMap.setPropertyvalueComposition("_"); } productPropertyMap.setDiscount(discountPrice); productPropertyMap.setPrice(actualPrice); productPropertyMap.setProductId(savedTestProduct); productPropertyMap.setQuantity(5l); productPropertiesMapDao.save(productPropertyMap); } //-------------Product Properties Map START------------// //========= Property, Property Value, Property Product Map Block END ========// //============= Multiple Image Block START =============// detailMain = doc.select("ul.image-thumb-list span.img-thumb-item img[src]"); int flg = 0; String imgUrl = ""; for (Element element : detailMain) { imgUrl = GoogleBucketFileUploader.uploadProductImage( element.absUrl("src").split(".jpg")[0] + ".jpg", savedTestProduct.getId()); if (flg == 0) { flg++; savedTestProduct.setImgurl(imgUrl); } else { ProductImg productImg = new ProductImg(); productImg.setId(null); productImg.setImgUrl(imgUrl); productImg.setProductId(savedTestProduct.getId()); productImgDao.save(productImg); } } //============= Multiple Image Block END =============// //=================Product save final START==============// if (productDao.save(savedTestProduct) != null) { temtproductlinklist.setStatus(1);// temtproductlinklistDao.save(temtproductlinklist); status = "Success"; } //=================Product save final START==============// } else { temtproductlinklist.setStatus(2);// temtproductlinklistDao.save(temtproductlinklist); status = "criteria mismatch"; } } else { status = "Page not found"; } } catch (Exception ex) { System.out.println( "=============================================================Exception1" + ex); temtproductlinklist.setStatus(4);// temtproductlinklistDao.save(temtproductlinklist); System.out.println("Exception === " + ex); status = "Failure"; Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, "(==E==)DATE: " + new Date().toString() + "Store product details in temp product table get error in sub process.....\n Link Id: " + 
statusBean.getId() + "\n Started on" + startDate, ex); } } else { temtproductlinklist.setStatus(3);// temtproductlinklistDao.save(temtproductlinklist); status = "Product exsist"; } } // String body = "Id: " + temtproductlinklist.getId() + "<br/> Status: " + status; // MailSender.sendEmail("krisanu.nandi@pkweb.in", "Product captured", body, "subhendu.sett@pkweb.in"); statusBean.setStatus(status); } System.out.println("=============================================================status" + status); } catch (Exception e) { System.out.println("=============================================================Exception2" + e); isSuccess = false; String body = "(==E==)DATE: " + new Date().toString() + "Store product details in temp product table get error.....<br/> Started on" + startDate + "<br/>"; Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, body, e); // MailSender.sendEmail("krisanu.nandi@pkweb.in", "Stopped store product details", body + e.getLocalizedMessage(), "subhendu.sett@pkweb.in"); } if (isSuccess) { String body = "(==I==)DATE: " + new Date().toString() + "Store product details in temp product table end.....<br/> Started on" + startDate; Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, body); /*ObjectMapper mapper = new ObjectMapper(); try { MailSender.sendEmail("krisanu.nandi@pkweb.in", "Completed store product details", body + "=============<br/><br/>" + mapper.writeValueAsString(statusBeans), "subhendu.sett@pkweb.in"); } catch (JsonProcessingException ex) { Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, null, ex); }*/ } // return statusBeans; System.out.println("=============================================================end"); }