List of usage examples for org.jsoup.nodes Element children
public Elements children()
From source file:de.geeksfactory.opacclient.apis.Zones.java
private DetailledItem parse_result(String id, String html) { Document doc = Jsoup.parse(html); DetailledItem result = new DetailledItem(); result.setTitle(""); boolean title_is_set = false; result.setId(id);/*from www .ja v a2s.co m*/ String detailTrsQuery = version18 ? ".inRoundBox1 table table tr" : ".DetailDataCell table table:not(.inRecordHeader) tr"; Elements detailtrs1 = doc.select(detailTrsQuery); for (int i = 0; i < detailtrs1.size(); i++) { Element tr = detailtrs1.get(i); int s = tr.children().size(); if (tr.child(0).text().trim().equals("Titel") && !title_is_set) { result.setTitle(tr.child(s - 1).text().trim()); title_is_set = true; } else if (s > 1) { Element valchild = tr.child(s - 1); if (valchild.select("table").isEmpty()) { String val = valchild.text().trim(); if (val.length() > 0) { result.addDetail(new Detail(tr.child(0).text().trim(), val)); } } } } for (Element a : doc.select("a.SummaryActionLink")) { if (a.text().contains("Vormerken")) { result.setReservable(true); result.setReservation_info(a.attr("href")); } } Elements detaildiv = doc.select("div.record-item-new"); if (!detaildiv.isEmpty()) { for (int i = 0; i < detaildiv.size(); i++) { Element dd = detaildiv.get(i); String text = ""; for (Node node : dd.childNodes()) { if (node instanceof TextNode) { String snip = ((TextNode) node).text(); if (snip.length() > 0) { text += snip; } } else if (node instanceof Element) { if (((Element) node).tagName().equals("br")) { text += "\n"; } else { String snip = ((Element) node).text().trim(); if (snip.length() > 0) { text += snip; } } } } result.addDetail(new Detail("", text)); } } if (doc.select("span.z3988").size() > 0) { // Sometimes there is a <span class="Z3988"> item which provides // data in a standardized format. 
String z3988data = doc.select("span.z3988").first().attr("title").trim(); for (String pair : z3988data.split("&")) { String[] nv = pair.split("=", 2); if (nv.length == 2) { if (!nv[1].trim().equals("")) { if (nv[0].equals("rft.btitle") && result.getTitle().length() == 0) { result.setTitle(nv[1]); } else if (nv[0].equals("rft.atitle") && result.getTitle().length() == 0) { result.setTitle(nv[1]); } else if (nv[0].equals("rft.au")) { result.addDetail(new Detail("Author", nv[1])); } } } } } // Cover if (doc.select(".BookCover, .LargeBookCover").size() > 0) { result.setCover(doc.select(".BookCover, .LargeBookCover").first().attr("src")); } Elements copydivs = doc.select("div[id^=stock_]"); String pop = ""; for (int i = 0; i < copydivs.size(); i++) { Element div = copydivs.get(i); if (div.attr("id").startsWith("stock_head")) { pop = div.text().trim(); continue; } Copy copy = new Copy(); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); // This is getting very ugly - check if it is valid for libraries which are not Hamburg. 
// Seems to also work in Kiel (Zones 1.8, checked 10.10.2015) int j = 0; for (Node node : div.childNodes()) { try { if (node instanceof Element) { if (((Element) node).tag().getName().equals("br")) { copy.setBranch(pop); result.addCopy(copy); j = -1; } else if (((Element) node).tag().getName().equals("b") && j == 1) { copy.setLocation(((Element) node).text()); } else if (((Element) node).tag().getName().equals("b") && j > 1) { copy.setStatus(((Element) node).text()); } j++; } else if (node instanceof TextNode) { if (j == 0) { copy.setDepartment(((TextNode) node).text()); } if (j == 2) { copy.setBarcode(((TextNode) node).getWholeText().trim().split("\n")[0].trim()); } if (j == 6) { String text = ((TextNode) node).text().trim(); String date = text.substring(text.length() - 10); try { copy.setReturnDate(fmt.parseLocalDate(date)); } catch (IllegalArgumentException e) { e.printStackTrace(); } } j++; } } catch (Exception e) { e.printStackTrace(); } } } return result; }
From source file:crawler.HackerEarthCrawler.java
@Override public void crawl() { int flag = 0; //set of urls which should be crawled TreeSet<String> linksset = new TreeSet<String>(); TreeSet<String> tempset = new TreeSet<String>(); TreeSet<String> tutorialset = new TreeSet<String>(); //final set of problem urls TreeSet<String> problemset = new TreeSet<String>(); //visited for maintaing status of if url is already crawled or not TreeMap<String, Integer> visited = new TreeMap<String, Integer>(); //add base url linksset.add(baseUrl);/*ww w. j a v a 2s.c o m*/ //mark base url as not crawled visited.put(baseUrl, 0); try { while (true) { flag = 0; tempset.clear(); for (String str : linksset) { //check if url is already crawled or not and it has valid domain name if ((visited.get(str) == 0) && (str.startsWith("https://www.hackerearth.com/"))) { System.out.println("crawling " + str); //retriving response of current url as document Document doc = Jsoup.connect(str).timeout(0).userAgent( "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0") .referrer("http://www.google.com").ignoreHttpErrors(true).get(); //retriving all urls from current page Elements links = doc.select("a[href]"); //mark url as crawled visited.put(str, 1); //mark flag as url is crawled flag = 1; //retrive all urls for (Element link : links) { if (link.absUrl("href").endsWith("/tutorial/")) { tutorialset.add(link.absUrl("href")); } //check if url is problem url then add it in problemurlset if (link.absUrl("href").startsWith("https://www.hackerearth.com/") && isProblemUrl(link.absUrl("href"))) { problemset.add(link.absUrl("href")); } //check if url has valid domain and it has problem urls or not if (link.absUrl("href").contains(("https://www.hackerearth.com/")) && isCrawlable(link.absUrl("href"))) { //if link is not visited then mark it as uncrawled if (!visited.containsKey(link.absUrl("href"))) { visited.put(link.absUrl("href"), 0); } //add it in tempsetorary set tempset.add(link.absUrl("href")); //System.out.println("\n base: 
"+str+" ::: link : " + link.absUrl("href")); } } } } //if nothing is left to crawl break the loop if (flag == 0) { break; } //add all retrieved links to linksset linksset.addAll(tempset); } System.out.println("\n\ntotal problem urls " + problemset.size()); int i = 0; for (String str : problemset) { System.out.println("link " + i + " : " + str); i++; } } catch (IOException ex) { Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.SEVERE, null, ex); } //scrap and store into database //for every problem url scrap problem page for (String problemUrl : problemset) { System.out.println("problemUrl :" + problemUrl); try { //create problem class to store in database Problem problem = new Problem(); String problemSIOC = "", problemIOC = ""; String problemTitle = "", problemStatement = "", problemInput = "", problemOutput = "", problemConstraints = ""; String sampleInput = "", sampleOutput = ""; String problemExplanation = ""; //set default timelimit to 1 second double problemTimeLimit = 1.0; ArrayList<String> tags = new ArrayList<String>(); //get response for given problem url Response response = Jsoup.connect(problemUrl).execute(); Document doc = response.parse(); //retrieve problem title from page Element elementTitle = doc.getElementsByTag("title").first(); StringTokenizer stTitle = new StringTokenizer(elementTitle.text(), "|"); problemTitle = stTitle.nextToken().trim(); Element content = doc.getElementsByClass("starwars-lab").first(); problemSIOC = content.text(); Elements e = content.children(); //to find problem statement String breakloop[] = { "input", "input:", "input :", "input format:", "input format :", "input format", "Input and output", "constraints :", "constraints:", "constraints", "$$Input :$$" }; flag = 0; for (Element p : e) { String tempStatement = ""; for (Element pp : p.getAllElements()) { for (String strbreak : breakloop) { if (StringUtils.equalsIgnoreCase(pp.ownText(), strbreak)) { //System.out.println("strbreak :"+strbreak); tempStatement 
= p.text().substring(0, p.text().toLowerCase().indexOf(strbreak.toLowerCase())); // System.out.println("temp "+tempStatement); flag = 1; break; } } } if (flag == 1) { problemStatement += tempStatement; //remove extra space at end if (tempStatement.length() == 0) { problemStatement = problemStatement.substring(0, problemStatement.length() - 1); } break; } problemStatement += p.text() + " "; } System.out.println("problemSIOC :" + problemSIOC); System.out.println("problemStatement :" + problemStatement); if (problemStatement.length() <= problemSIOC.length()) { //remove problem statement from whole text and remove extra spaces at the beginning and the end problemIOC = problemSIOC.substring(problemStatement.length()).trim(); } else { problemIOC = ""; } System.out.println("problemIOC :" + problemIOC); //keywords for identifying input String decideInput[] = { "Input format :", "Input format:", "Input format", "inputformat:", "inputformat :", "inputformat", "input and output", "input :", "input:", "input" }; //keywords for identifying output String decideOutput[] = { "output format :", "output format:", "Output format", "outputformat:", "outputformat :", "outputformat", "output :", "output:", "output" }; //keywords for identifying constraint String decideConstraint[] = { "constraints:", "constraints :", "constraints", "Constraints :", "constraint:", "constraint :", "constraint", "Contraints :" }; int posin = 0, posoutput = 0, poscon = 0, idxin, idxout, idxcon, flaginput = 0, flagoutput = 0, flagcon = 0, inlen = 0, outlen = 0, conlen = 0; //find inputformat position,length of keyword for (idxin = 0; idxin < decideInput.length; idxin++) { if (StringUtils.containsIgnoreCase(problemIOC, decideInput[idxin])) { posin = problemIOC.toLowerCase().indexOf(decideInput[idxin].toLowerCase()); flaginput = 1; inlen = decideInput[idxin].length(); //decide it is keyowrd for actucal input or it is "sample input" if (StringUtils.containsIgnoreCase(problemIOC, "sample input")) { if (posin > 
problemIOC.toLowerCase().indexOf("sample input")) { flaginput = 0; inlen = 0; } else { break; } } else { break; } } } //find outputformat position,length of keyword for (idxout = 0; idxout < decideOutput.length; idxout++) { if (StringUtils.containsIgnoreCase(problemIOC, decideOutput[idxout])) { posoutput = problemIOC.toLowerCase().indexOf(decideOutput[idxout].toLowerCase()); flagoutput = 1; outlen = decideOutput[idxout].length(); break; } } //find constraint position,length of keyword for (idxcon = 0; idxcon < decideConstraint.length; idxcon++) { if (StringUtils.containsIgnoreCase(problemIOC, decideConstraint[idxcon])) { poscon = problemIOC.toLowerCase().indexOf(decideConstraint[idxcon].toLowerCase()); flagcon = 1; conlen = decideConstraint[idxcon].length(); break; } } System.out.println("input " + flaginput + " " + inlen + " " + posin); System.out.println("output " + flagoutput + " " + outlen + " " + posoutput); System.out.println("constraint " + flagcon + " " + conlen + " " + poscon); //retrieve problem input and output if present in problem page //if input format is present if (flaginput == 1) { //if input keyword is "input and output" and contraint is present in problem page if (idxin == 6 && flagcon == 1) { problemInput = problemIOC.substring(inlen, poscon); } //if input keyword is "input and output" and contraint is not present in problem page else if (idxin == 6 && flagcon == 0) { problemInput = problemIOC.substring(inlen); } //if output format and constraint is present else if (flagoutput == 1 && flagcon == 1) { //if constraint is present before input format if (poscon < posin) { problemInput = problemIOC.substring(posin + inlen, posoutput); problemOutput = problemIOC.substring(posoutput + outlen); } //if constraint is present before sample else if (poscon < posoutput) { problemInput = problemIOC.substring(inlen, poscon); problemOutput = problemIOC.substring(posoutput + outlen); } else { problemInput = problemIOC.substring(inlen, posoutput); problemOutput = 
problemIOC.substring(posoutput + outlen, poscon); } } //if constraint is not present else if (flagoutput == 1 && flagcon == 0) { problemInput = problemIOC.substring(inlen, posoutput); problemOutput = problemIOC.substring(posoutput + outlen); } else if (flagoutput == 0 && flagcon == 1) { if (poscon < posin) { problemInput = problemIOC.substring(posin + inlen); } else { problemInput = problemIOC.substring(poscon + conlen, posin); } problemOutput = ""; } else { problemInput = problemIOC.substring(inlen); problemOutput = ""; } } //if input format and output format is not present else { problemInput = ""; problemOutput = ""; } //if constraint is present if (flagcon == 1) { //if constraint is present before input format if (poscon < posin) { problemConstraints = problemIOC.substring(0, posin); } //if constraint is present before output format else if (poscon < posoutput) { problemConstraints = problemIOC.substring(poscon + conlen, posoutput); } else { problemConstraints = problemIOC.substring(poscon + conlen); } } System.out.println("problemInput :" + problemInput); System.out.println("problemOutput :" + problemOutput); System.out.println("problemConstraints :" + problemConstraints); //retrieve problem tags from problem page Element elementtag = doc.getElementsByClass("problem-tags").first().child(1); StringTokenizer st = new StringTokenizer(elementtag.text(), ","); while (st.hasMoreTokens()) { tags.add(st.nextToken().trim()); } //retrieve sample input sample output if present Element elementSIO = doc.getElementsByClass("input-output-container").first(); //if sample input output is present if (elementSIO != null) { //find position of sample output int soutpos = elementSIO.text().indexOf("SAMPLE OUTPUT"); sampleInput = elementSIO.text().substring(12, soutpos); sampleOutput = elementSIO.text().substring(soutpos + 13); System.out.println("Sample input :\n" + sampleInput + "\n\n\n"); System.out.println("Sample Output :\n" + sampleOutput); } else { sampleInput = ""; 
sampleOutput = ""; } //retrieve problem explanation from problem page if present Element elementExplanation = doc.getElementsByClass("standard-margin").first().child(0); if (elementExplanation.text().toLowerCase().contains("explanation")) { problemExplanation = elementExplanation.nextElementSibling().text(); } System.out.println("Explanation :" + problemExplanation); //retrieve timelimit Element elementTL = doc.getElementsByClass("problem-guidelines").first().child(0).child(1); StringTokenizer stTL = new StringTokenizer(elementTL.ownText(), " "); problemTimeLimit = Double.parseDouble(stTL.nextToken()); //System.out.println("problemTimeLimit :"+problemTimeLimit); //set all retrieved information to problem class problem.setProblemUrl(problemUrl); if (problemTitle.length() == 0) { problemTitle = null; } if (problemStatement.length() == 0) { problemStatement = null; } if (problemInput.length() == 0) { problemInput = null; } if (problemOutput.length() == 0) { problemOutput = null; } if (problemExplanation.length() == 0) { problemExplanation = null; } if (problemConstraints.length() == 0) { problemConstraints = null; } problem.setTitle(problemTitle); problem.setProblemUrl(problemUrl); problem.setProblemStatement(problemStatement); problem.setInputFormat(problemInput); problem.setOutputFormat(problemOutput); problem.setTimeLimit(problemTimeLimit); problem.setExplanation(problemExplanation); problem.setConstraints(problemConstraints); //set sample input output to problem class SampleInputOutput sampleInputOutput = new SampleInputOutput(problem, sampleInput, sampleOutput); problem.getSampleInputOutputs().add(sampleInputOutput); //set platform as hackerearth problem.setPlatform(Platform.HackerEarth); for (String strtag : tags) { problem.getTags().add(strtag); } //store in database Session session = null; Transaction transaction = null; try { //start session session = HibernateUtil.getSessionFactory().openSession(); transaction = session.beginTransaction(); //check if problem 
is already stored in database String hql = "FROM Problem p where p.problemUrl = :problem_url"; Problem oldProblem = (Problem) session.createQuery(hql).setString("problem_url", problemUrl) .uniqueResult(); String task; //if problem is present in database if (oldProblem != null) { //update the old problem task = "updated"; //retrieve id of old problem problem.setId(oldProblem.getId()); session.delete(oldProblem); session.flush(); session.save(problem); } else { task = "saved"; session.save(problem); } transaction.commit(); //log the info to console Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}", new Object[] { task, problem.getProblemUrl() }); } catch (HibernateException ee) { if (transaction != null) { transaction.rollback(); } Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE, "Cannot Insert/Update problem into databse: " + problemUrl, e); } finally { //close the session if (session != null) { session.close(); } } } catch (Exception ee) { System.out.println(ee.toString()); } } System.out.println("\n\n\n\ntutorial urls\n\n"); try { for (String tutorialurl : tutorialset) { //System.out.println(tutorialurl+"\n\n"); Response tutorialres = Jsoup.connect(tutorialurl).execute(); Document doc = tutorialres.parse(); Tutorial tutorial = new Tutorial(); tutorial.setContent(doc.getElementsByClass("tutorial").first().text()); tutorial.setName(baseUrl); tutorialurl = tutorialurl.substring(0, tutorialurl.length() - 10); StringTokenizer tutorialtok = new StringTokenizer(tutorialurl, "/"); String tempstr = ""; while (tutorialtok.hasMoreTokens()) { tempstr = tutorialtok.nextToken(); } Session session = null; Transaction transaction = null; try { //start session session = HibernateUtil.getSessionFactory().openSession(); transaction = session.beginTransaction(); //check if problem is already stored in database String hql = "FROM Tutorial p where p.name = :name"; Tutorial oldProblem = (Tutorial) 
session.createQuery(hql).setString("name", tempstr) .uniqueResult(); String task; //if problem is present in database if (oldProblem != null) { //update the old problem task = "updated"; //retrieve id of old problem tutorial.setName(oldProblem.getName()); session.delete(oldProblem); session.flush(); session.save(tutorial); } else { task = "saved"; tutorial.setName(tempstr); session.save(tutorial); } transaction.commit(); //log the info to console Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}", new Object[] { task, tutorial.getName() }); } catch (HibernateException ee) { if (transaction != null) { transaction.rollback(); } Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE, "Cannot Insert/Update problem into databse: " + tempstr, ee); } finally { //close the session if (session != null) { session.close(); } } } } catch (Exception e) { System.out.println(e.getMessage()); } }
From source file:com.kantenkugel.discordbot.jdocparser.JDocParser.java
private static List<DocBlock> getDocBlock(String jdocBase, Element elem, ClassDocumentation reference) { if (elem != null) { String baseLink = JDocUtil.getLink(jdocBase, reference); List<DocBlock> blocks = new ArrayList<>(10); String hashLink = null;/*from w w w. ja v a 2 s. c om*/ for (elem = elem.nextElementSibling(); elem != null; elem = elem.nextElementSibling()) { if (elem.tagName().equals("a")) { hashLink = '#' + elem.attr("name"); } else if (elem.tagName().equals("ul")) { Element tmp = elem.getElementsByTag("h4").first(); String title = JDocUtil.fixSpaces(tmp.text().trim()); String description = "", signature = ""; OrderedMap<String, List<String>> fields = new ListOrderedMap<>(); for (; tmp != null; tmp = tmp.nextElementSibling()) { if (tmp.tagName().equals("pre")) { //contains full signature signature = JDocUtil.fixSpaces(tmp.text().trim()); } else if (tmp.tagName().equals("div") && tmp.className().equals("block")) { //main block of content (description or deprecation) Element deprecationElem = tmp.getElementsByClass("deprecationComment").first(); if (deprecationElem != null) { //deprecation block fields.put("Deprecated:", Collections .singletonList(JDocUtil.formatText(deprecationElem.html(), baseLink))); } else { //description block description = JDocUtil.formatText(tmp.html(), baseLink); } } else if (tmp.tagName().equals("dl")) { //a field String fieldName = null; List<String> fieldValues = new ArrayList<>(); for (Element element : tmp.children()) { if (element.tagName().equals("dt")) { if (fieldName != null) { fields.put(fieldName, fieldValues); fieldValues = new ArrayList<>(); } fieldName = JDocUtil.fixSpaces(element.text().trim()); } else if (element.tagName().equals("dd")) { fieldValues.add(JDocUtil.formatText(element.html(), baseLink)); } } if (fieldName != null) { fields.put(fieldName, fieldValues); } } } blocks.add(new DocBlock(title, hashLink, signature, description, fields)); } } return blocks; } return null; }
From source file:org.abondar.experimental.eventsearch.EventFinder.java
/**
 * Downloads the afisha.yandex.ru page of one event, extracts its data and hands the
 * result to {@code geoCode} and {@code formJson}.
 *
 * <p>Extracted data: description (meta tag whose {@code property} contains
 * {@code og:description}), name (from {@code <title>}), place (any link attribute
 * containing {@code /msk/places/}) and date/time (from {@code tr[id]} rows having an
 * attribute value that contains {@code "f"}).
 *
 * @param eventId id of the event; used to build the page URL and stored on the event
 * @param evType  key into {@code eventTypes} used to look up the event category
 */
public void getEvent(String eventId, String evType) {
    try {
        Document dc = Jsoup.connect("https://afisha.yandex.ru/msk/events/" + eventId + "/").get();
        Event eb = new Event();
        eb.setEventID(eventId);
        eb.setCategory(eventTypes.get(evType));

        for (Element meta : dc.select("meta")) {
            if (meta.attributes().get("property").contains("og:description")) {
                eb.setDescription(meta.attributes().get("content"));
            }
        }

        for (Element title : dc.select("title")) {
            // NOTE(review): the delimiter literal is empty, so indexOf returns 0 and the name is
            // always set to the empty string. The intended separator seems to have been lost;
            // confirm against the upstream source before changing it.
            eb.setName(title.html().substring(0, title.html().indexOf("")));
        }

        for (Element link : dc.select("a[href]")) {
            for (Attribute attr : link.attributes().asList()) {
                if (attr.getValue().contains("/msk/places/")) {
                    eb.setPlace(getEventPlaces(attr.getValue()));
                }
            }
        }

        for (Element row : dc.select("tr[id]")) {
            for (Attribute attr : row.attributes().asList()) {
                if (attr.getValue().contains("f")) {
                    Elements cells = row.children();
                    if (!cells.isEmpty()) {
                        eb.setDate(cells.first().html());
                        // The time is nested four levels deep in the second cell, or in the
                        // third cell when the second one does not have that structure.
                        Element time = nestedFirstChild(cells, 1);
                        if (time == null) {
                            time = nestedFirstChild(cells, 2);
                        }
                        if (time != null) {
                            eb.setTime(time.html());
                        }
                    }
                }
            }
        }

        geoCode(eb);
        formJson(eb);
    } catch (IOException ex) {
        Logger.getLogger(EventFinder.class.getName()).log(Level.SEVERE, null, ex);
    }
}

/**
 * Returns the element reached by following the first child four times from the cell at
 * {@code cellIndex}, or {@code null} if the cell or any level of the chain is missing.
 */
private static Element nestedFirstChild(Elements cells, int cellIndex) {
    if (cellIndex >= cells.size()) {
        return null;
    }
    Element current = cells.get(cellIndex);
    for (int level = 0; level < 4 && current != null; level++) {
        current = current.children().first();
    }
    return current;
}
From source file:com.jimplush.goose.ContentExtractor.java
/** * remove any divs that looks like non-content, clusters of links, or paras with no gusto * * @param node// w w w . jav a2 s .c o m * @return */ private Element cleanupNode(Element node) { if (logger.isDebugEnabled()) { logger.debug("Starting cleanup Node"); } node = addSiblings(node); Elements nodes = node.children(); for (Element e : nodes) { if (e.tagName().equals("p")) { continue; } if (logger.isDebugEnabled()) { logger.debug("CLEANUP NODE: " + e.id() + " class: " + e.attr("class")); } boolean highLinkDensity = isHighLinkDensity(e); if (highLinkDensity) { if (logger.isDebugEnabled()) { logger.debug("REMOVING NODE FOR LINK DENSITY: " + e.id() + " class: " + e.attr("class")); } e.remove(); continue; } // now check for word density // grab all the paragraphs in the children and remove ones that are too small to matter Elements subParagraphs = e.getElementsByTag("p"); for (Element p : subParagraphs) { if (p.text().length() < 25) { p.remove(); } } // now that we've removed shorty paragraphs let's make sure to exclude any first paragraphs that don't have paras as // their next siblings to avoid getting img bylines // first let's remove any element that now doesn't have any p tags at all Elements subParagraphs2 = e.getElementsByTag("p"); if (subParagraphs2.size() == 0 && !e.tagName().equals("td")) { if (logger.isDebugEnabled()) { logger.debug("Removing node because it doesn't have any paragraphs"); } e.remove(); continue; } //if this node has a decent enough gravityScore we should keep it as well, might be content int topNodeScore = getScore(node); int currentNodeScore = getScore(e); float thresholdScore = (float) (topNodeScore * .08); if (logger.isDebugEnabled()) { logger.debug("topNodeScore: " + topNodeScore + " currentNodeScore: " + currentNodeScore + " threshold: " + thresholdScore); } if (currentNodeScore < thresholdScore) { if (!e.tagName().equals("td")) { if (logger.isDebugEnabled()) { logger.debug("Removing node due to low threshold score"); } e.remove(); } 
else { if (logger.isDebugEnabled()) { logger.debug("Not removing TD node"); } } continue; } } return node; }
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
@Override public ProlongResult prolong(String media, Account account, int useraction, String Selection) throws IOException { String command;// w ww . ja v a 2s . c o m // prolong media via http POST // Offenburg: URL is .../opac/verl.C // Hagen: URL is .../opax/renewmedia.C if (opacDir.contains("opax")) { command = "/renewmedia" + opacSuffix; } else { command = "/verl" + opacSuffix; } List<NameValuePair> nameValuePairs = new ArrayList<>(2); nameValuePairs.add(new BasicNameValuePair(media, "YES")); nameValuePairs.add(new BasicNameValuePair("BENUTZER", account.getName())); nameValuePairs.add(new BasicNameValuePair("FUNC", "verl")); nameValuePairs.add(new BasicNameValuePair("LANG", "de")); nameValuePairs.add(new BasicNameValuePair("PASSWORD", account.getPassword())); String html = httpPost(opacUrl + "/" + opacDir + command, new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding()); if (html.contains("no such key")) { html = httpPost(opacUrl + "/" + opacDir + command.replace(".C", ".S"), new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding()); } Document doc = Jsoup.parse(html); // Check result: // First we look for a cell with text "Status" // and store the column number // Then we look in the rows below at this column if // we find any text. Stop at first text we find. 
// This text must start with "verlngert" Elements rowElements = doc.select("table tr"); int statusCol = -1; // Status column not yet found // rows loop for (int i = 0; i < rowElements.size(); i++) { Element tr = rowElements.get(i); Elements tdList = tr.children(); // <th> or <td> // columns loop for (int j = 0; j < tdList.size(); j++) { String cellText = tdList.get(j).text().trim(); if (statusCol < 0) { // we look for cell with text "Status" if (cellText.equals("Status")) { statusCol = j; break; // next row } } else { // we look only at Status column // In "Hagen", there are some extra empty rows below if ((j == statusCol) && (cellText.length() > 0)) { // Status found if (cellText.matches("verl.ngert.*")) { return new ProlongResult(MultiStepResult.Status.OK); } else { return new ProlongResult(MultiStepResult.Status.ERROR, cellText); } } } } // for columns } // for rows return new ProlongResult(MultiStepResult.Status.ERROR, "unknown result"); }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
/**
 * Parses an Untis substitution schedule table
 *
 * <p>Two table layouts are handled. If the "class in extra line" option is set in
 * {@code data}, every {@code td.inline_header} cell names a class and the rows that
 * follow it (up to the next inline header) are that class's substitutions. Otherwise
 * every matching row is one substitution and the affected classes are derived from the
 * "class" column (or {@code defaultClass}).
 *
 * <p>In both layouts the meaning of each cell is taken from the column type at the same
 * index in the {@code PARAM_COLUMNS} JSON array, and a cell whose text continues in the
 * following row(s) is merged with those rows, which are then skipped.
 *
 * @param table        the <code>table</code> Element from the HTML document
 * @param data         {@link SubstitutionScheduleData#getData()}
 * @param day          the {@link SubstitutionScheduleDay} where the substitutions will be stored
 * @param defaultClass the class that should be set if there is no class column in the table
 * @throws JSONException              declared for the JSON accessors used on {@code data}
 * @throws CredentialInvalidException declared by this method; not thrown directly in this body
 */
private void parseSubstitutionScheduleTable(Element table, JSONObject data, SubstitutionScheduleDay day,
        String defaultClass) throws JSONException, CredentialInvalidException {
    if (data.optBoolean(PARAM_CLASS_IN_EXTRA_LINE) || data.optBoolean("class_in_extra_line")) { // backwards compatibility
        // Layout 1: each inline header cell introduces the rows of one class.
        for (Element element : table.select("td.inline_header")) {
            String className = getClassName(element.text(), data);
            if (isValidClass(className)) {
                Element zeile = null;
                try {
                    // first row after the header row
                    zeile = element.parent().nextElementSibling();
                    // NOTE(review): select(...) is compared against null here; jsoup's select
                    // normally returns a (possibly empty) Elements, so this branch may never
                    // run - confirm whether isEmpty() was intended.
                    if (zeile.select("td") == null) {
                        zeile = zeile.nextElementSibling();
                    }
                    // number of upcoming rows already merged into the current substitution
                    int skipLines = 0;
                    // walk the rows until the table ends or the next inline header row is reached
                    while (zeile != null && !zeile.select("td").attr("class").equals("list inline_header")) {
                        if (skipLines > 0) {
                            skipLines--;
                            zeile = zeile.nextElementSibling();
                            continue;
                        }
                        Substitution v = new Substitution();
                        // i is the index into the configured column types
                        int i = 0;
                        for (Element spalte : zeile.select("td")) {
                            String text = spalte.text();
                            if (isEmpty(text)) {
                                i++;
                                continue;
                            }

                            // Look ahead: while the next row has the same number of cells and
                            // its whole text equals the text of the cell in this column, that
                            // row only continues this cell; append it and remember to skip it.
                            int skipLinesForThisColumn = 0;
                            Element nextLine = zeile.nextElementSibling();
                            boolean continueSkippingLines = true;
                            while (continueSkippingLines) {
                                if (nextLine != null && nextLine.children().size() == zeile.children().size()) {
                                    Element columnInNextLine = nextLine.child(spalte.elementSiblingIndex());
                                    if (columnInNextLine.text().replaceAll("\u00A0", "").trim()
                                            .equals(nextLine.text().replaceAll("\u00A0", "").trim())) {
                                        // Continued in the next line
                                        text += " " + columnInNextLine.text();
                                        skipLinesForThisColumn++;
                                        nextLine = nextLine.nextElementSibling();
                                    } else {
                                        continueSkippingLines = false;
                                    }
                                } else {
                                    continueSkippingLines = false;
                                }
                            }
                            // the row-level skip count is the maximum over all columns
                            if (skipLinesForThisColumn > skipLines) skipLines = skipLinesForThisColumn;

                            String type = data.getJSONArray(PARAM_COLUMNS).getString(i);
                            switch (type) {
                            case "lesson":
                                v.setLesson(text);
                                break;
                            case "subject":
                                handleSubject(v, spalte);
                                break;
                            case "previousSubject":
                                v.setPreviousSubject(text);
                                break;
                            case "type":
                                v.setType(text);
                                v.setColor(colorProvider.getColor(text));
                                break;
                            case "type-entfall":
                                // an "x" in this column marks a cancelled lesson ("Entfall")
                                if (text.equals("x")) {
                                    v.setType("Entfall");
                                    v.setColor(colorProvider.getColor("Entfall"));
                                } else {
                                    v.setType("Vertretung");
                                    v.setColor(colorProvider.getColor("Vertretung"));
                                }
                                break;
                            case "room":
                                handleRoom(v, spalte);
                                break;
                            case "teacher":
                                handleTeacher(v, spalte, data);
                                break;
                            case "previousTeacher":
                                v.setPreviousTeachers(splitTeachers(text, data));
                                break;
                            case "desc":
                                v.setDesc(text);
                                break;
                            case "desc-type":
                                // the description doubles as the source for the type
                                v.setDesc(text);
                                String recognizedType = recognizeType(text);
                                v.setType(recognizedType);
                                v.setColor(colorProvider.getColor(recognizedType));
                                break;
                            case "previousRoom":
                                v.setPreviousRoom(text);
                                break;
                            case "substitutionFrom":
                                v.setSubstitutionFrom(text);
                                break;
                            case "teacherTo":
                                v.setTeacherTo(text);
                                break;
                            case "ignore":
                                break;
                            case "date": // used by UntisSubstitutionParser
                                break;
                            default:
                                throw new IllegalArgumentException("Unknown column type: " + type);
                            }
                            i++;
                        }

                        autoDetectType(data, zeile, v);

                        v.getClasses().add(className);

                        // rows without a lesson are not stored
                        if (v.getLesson() != null && !v.getLesson().equals("")) {
                            day.addSubstitution(v);
                        }

                        zeile = zeile.nextElementSibling();
                    }
                } catch (Throwable e) {
                    // Any failure (including the IllegalArgumentException thrown above for an
                    // unknown column type) only aborts the rows of this class header.
                    e.printStackTrace();
                }
            }
        }
    } else {
        // Layout 2: one substitution per row, classes come from a column.
        // hasType: whether an explicit "type" column is configured
        boolean hasType = false;
        for (int i = 0; i < data.getJSONArray(PARAM_COLUMNS).length(); i++) {
            if (data.getJSONArray(PARAM_COLUMNS).getString(i).equals("type")) {
                hasType = true;
            }
        }
        // number of upcoming rows already merged into the previous substitution
        int skipLines = 0;
        for (Element zeile : table.select("tr.list.odd:not(:has(td.inline_header)), "
                + "tr.list.even:not(:has(td.inline_header)), " + "tr:has(td[align=center]):gt(0)")) {
            if (skipLines > 0) {
                skipLines--;
                continue;
            }

            Substitution v = new Substitution();
            String klassen = defaultClass != null ? defaultClass : "";
            // i is the index into the configured column types
            int i = 0;
            for (Element spalte : zeile.select("td")) {
                String text = spalte.text();

                String type = data.getJSONArray(PARAM_COLUMNS).getString(i);
                // empty cells are skipped, except in a "type-entfall" column where an empty
                // cell still has to be evaluated
                if (isEmpty(text) && !type.equals("type-entfall")) {
                    i++;
                    continue;
                }

                // Same look-ahead as in the first layout: merge rows that only continue
                // the text of this column.
                int skipLinesForThisColumn = 0;
                Element nextLine = zeile.nextElementSibling();
                boolean continueSkippingLines = true;
                while (continueSkippingLines) {
                    if (nextLine != null && nextLine.children().size() == zeile.children().size()) {
                        Element columnInNextLine = nextLine.child(spalte.elementSiblingIndex());
                        if (columnInNextLine.text().replaceAll("\u00A0", "").trim()
                                .equals(nextLine.text().replaceAll("\u00A0", "").trim())) {
                            // Continued in the next line
                            text += " " + columnInNextLine.text();
                            skipLinesForThisColumn++;
                            nextLine = nextLine.nextElementSibling();
                        } else {
                            continueSkippingLines = false;
                        }
                    } else {
                        continueSkippingLines = false;
                    }
                }
                if (skipLinesForThisColumn > skipLines) skipLines = skipLinesForThisColumn;

                switch (type) {
                case "lesson":
                    v.setLesson(text);
                    break;
                case "subject":
                    handleSubject(v, spalte);
                    break;
                case "previousSubject":
                    v.setPreviousSubject(text);
                    break;
                case "type":
                    v.setType(text);
                    v.setColor(colorProvider.getColor(text));
                    break;
                case "type-entfall":
                    // "x" marks a cancelled lesson; otherwise default to "Vertretung" only
                    // when no explicit "type" column is configured
                    if (text.equals("x")) {
                        v.setType("Entfall");
                        v.setColor(colorProvider.getColor("Entfall"));
                    } else if (!hasType) {
                        v.setType("Vertretung");
                        v.setColor(colorProvider.getColor("Vertretung"));
                    }
                    break;
                case "room":
                    handleRoom(v, spalte);
                    break;
                case "previousRoom":
                    v.setPreviousRoom(text);
                    break;
                case "desc":
                    v.setDesc(text);
                    break;
                case "desc-type":
                    v.setDesc(text);
                    String recognizedType = recognizeType(text);
                    v.setType(recognizedType);
                    v.setColor(colorProvider.getColor(recognizedType));
                    break;
                case "teacher":
                    handleTeacher(v, spalte, data);
                    break;
                case "previousTeacher":
                    v.setPreviousTeachers(splitTeachers(text, data));
                    break;
                case "substitutionFrom":
                    v.setSubstitutionFrom(text);
                    break;
                case "teacherTo":
                    v.setTeacherTo(text);
                    break;
                case "class":
                    klassen = getClassName(text, data);
                    break;
                case "ignore":
                    break;
                case "date": // used by UntisSubstitutionParser
                    break;
                default:
                    throw new IllegalArgumentException("Unknown column type: " + type);
                }
                i++;
            }

            // rows without a lesson are not stored
            if (v.getLesson() == null || v.getLesson().equals("")) {
                continue;
            }

            autoDetectType(data, zeile, v);

            List<String> affectedClasses;

            // Detect things like "7"
            Pattern singlePattern = Pattern.compile("(\\d+)");
            Matcher singleMatcher = singlePattern.matcher(klassen);

            // Detect things like "5-12"
            Pattern rangePattern = Pattern.compile("(\\d+) ?- ?(\\d+)");
            Matcher rangeMatcher = rangePattern.matcher(klassen);

            // extracts the leading number of a class name
            Pattern pattern2 = Pattern.compile("^(\\d+).*");

            if (rangeMatcher.matches()) {
                // a range: every known class whose leading number lies within [min, max]
                affectedClasses = new ArrayList<>();
                int min = Integer.parseInt(rangeMatcher.group(1));
                int max = Integer.parseInt(rangeMatcher.group(2));
                try {
                    for (String klasse : getAllClasses()) {
                        Matcher matcher2 = pattern2.matcher(klasse);
                        if (matcher2.matches()) {
                            int num = Integer.parseInt(matcher2.group(1));
                            if (min <= num && num <= max) affectedClasses.add(klasse);
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            } else if (singleMatcher.matches()) {
                // a single number: every known class whose leading number equals it
                affectedClasses = new ArrayList<>();
                int grade = Integer.parseInt(singleMatcher.group(1));
                try {
                    for (String klasse : getAllClasses()) {
                        Matcher matcher2 = pattern2.matcher(klasse);
                        if (matcher2.matches() && grade == Integer.parseInt(matcher2.group(1))) {
                            affectedClasses.add(klasse);
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            } else {
                if (data.optBoolean(PARAM_CLASSES_SEPARATED, true)
                        && data.optBoolean("classes_separated", true)) { // backwards compatibility
                    // classes are listed separated by ", "
                    affectedClasses = Arrays.asList(klassen.split(", "));
                } else {
                    // classes are not separated: a known class is affected if its characters
                    // occur in order in the text (each character followed by ".*")
                    affectedClasses = new ArrayList<>();
                    try {
                        for (String klasse : getAllClasses()) { // TODO: is there a better way?
                            StringBuilder regex = new StringBuilder();
                            for (char character : klasse.toCharArray()) {
                                // only '?' is escaped before the character goes into the regex
                                if (character == '?') {
                                    regex.append("\\?");
                                } else {
                                    regex.append(character);
                                }
                                regex.append(".*");
                            }
                            if (klassen.matches(regex.toString())) {
                                affectedClasses.add(klasse);
                            }
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }

            for (String klasse : affectedClasses) {
                if (isValidClass(klasse)) {
                    v.getClasses().add(klasse);
                }
            }

            if (data.optBoolean(PARAM_MERGE_WITH_DIFFERENT_TYPE, false)) {
                // Merge mode: if an equal substitution (ignoring the type) already exists,
                // do not add a second one; a "Vertretung" type overrides the existing type.
                boolean found = false;
                for (Substitution subst : day.getSubstitutions()) {
                    if (subst.equalsExcludingType(v)) {
                        found = true;

                        if (v.getType().equals("Vertretung")) {
                            subst.setType("Vertretung");
                            subst.setColor(colorProvider.getColor("Vertretung"));
                        }

                        break;
                    }
                }
                if (!found) {
                    day.addSubstitution(v);
                }
            } else {
                day.addSubstitution(v);
            }
        }
    }
}
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
private DetailledItem parse_result(String html) { DetailledItem item = new DetailledItem(); Document document = Jsoup.parse(html); Elements rows = document.select("html body form table tr"); // Elements rows = document.select("html body div form table tr"); // Element rowReverseSubject = null; Detail detail = null;// ww w . ja v a 2 s . c om // prepare copiestable Copy copy_last_content = null; int copy_row = 0; String[] copy_keys = new String[] { "barcode", "branch", "department", "location", "status", "returndate", "reservations" }; int[] copy_map = new int[] { 3, 1, -1, 1, 4, -1, -1 }; try { JSONObject map = data.getJSONObject("copiestable"); for (int i = 0; i < copy_keys.length; i++) { if (map.has(copy_keys[i])) { copy_map[i] = map.getInt(copy_keys[i]); } } } catch (Exception e) { // "copiestable" is optional } DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); // go through all rows for (Element row : rows) { Elements columns = row.children(); if (columns.size() == 2) { // HTML tag " " is encoded as 0xA0 String firstColumn = columns.get(0).text().replace("\u00a0", " ").trim(); String secondColumn = columns.get(1).text().replace("\u00a0", " ").trim(); if (firstColumn.length() > 0) { // 1st column is category if (firstColumn.equalsIgnoreCase("titel")) { detail = null; item.setTitle(secondColumn); } else { if (secondColumn.contains("hier klicken") && columns.get(1).select("a").size() > 0) { secondColumn += " " + columns.get(1).select("a").first().attr("href"); } detail = new Detail(firstColumn, secondColumn); item.getDetails().add(detail); } } else { // 1st column is empty, so it is an extension to last // category if (detail != null) { String content = detail.getContent() + "\n" + secondColumn; detail.setContent(content); } else { // detail==0, so it's the first row // check if there is an amazon image if (columns.get(0).select("a img[src]").size() > 0) { item.setCover(columns.get(0).select("a img").first().attr("src")); } 
} } } else if (columns.size() > 3) { // This is the second section: the copies in stock ("Exemplare") // With reverse layout: first row is headline, skipped via // (copy_row > 0) if (copy_row > 0) { Copy copy = new Copy(); for (int j = 0; j < copy_keys.length; j++) { int col = copy_map[j]; if (col > -1) { String text = ""; if (copy_keys[j].equals("branch")) { // for "Standort" only use ownText() to suppress // Link "Wegweiser" text = columns.get(col).ownText().replace("\u00a0", " ").trim(); } if (text.length() == 0) { // text of children text = columns.get(col).text().replace("\u00a0", " ").trim(); } if (text.length() == 0) { // empty table cell, take the one above // this is sometimes the case for "Standort" if (copy_keys[j].equals("status")) { // but do it not for Status text = " "; } else { if (copy_last_content != null) { text = copy_last_content.get(copy_keys[j]); } else { text = ""; } } } if (copy_keys[j].equals("reservations")) { text = text.replace("Vorgemerkt: ", "").replace("Vorbestellt: ", ""); } try { copy.set(copy_keys[j], text, fmt); } catch (IllegalArgumentException e) { e.printStackTrace(); } } } if (copy.getBranch() != null && copy.getLocation() != null && copy.getLocation().equals(copy.getBranch())) { copy.setLocation(null); } item.addCopy(copy); copy_last_content = copy; } // ignore 1st row copy_row++; } // if columns.size } // for rows item.setReservable(true); // We cannot check if media is reservable if (opacDir.contains("opax")) { if (document.select("input[type=checkbox]").size() > 0) { item.setReservation_info(document.select("input[type=checkbox]").first().attr("name")); } else if (document.select("a[href^=reserv" + opacSuffix + "]").size() > 0) { String href = document.select("a[href^=reserv" + opacSuffix + "]").first().attr("href"); item.setReservation_info(href.substring(href.indexOf("resF_"))); } else { item.setReservable(false); } } else { item.setReservation_info(document.select("input[name=ID]").attr("value")); } return item; }
From source file:me.vertretungsplan.parser.UntisInfoParser.java
/**
 * Logs in, reads the list of weeks from the navigation bar and parses the schedule page(s)
 * of every week into one {@link SubstitutionSchedule}.
 *
 * <p>When single-class mode is configured ({@code PARAM_SINGLE_CLASSES}, or the legacy
 * "single_classes" key) or the schedule type is "timetable", one page per class is loaded
 * for each week; HTTP 500 responses for individual classes are skipped. Otherwise one page
 * per week is loaded, and an {@link HttpResponseException} is only rethrown if no week at
 * all could be parsed.
 *
 * @return the schedule, including the class and teacher lists and the website URL
 * @throws IOException if the navigation bar contains no week selector, or on a failed request
 * @throws JSONException declared by the interface; may be raised by the called helpers
 * @throws CredentialInvalidException declared by the interface; may be raised by the
 *         called helpers
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {
    new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);

    // Strip non-breaking spaces only (as entity and as raw U+00A0). Removing ordinary
    // spaces would fuse tag names with their attributes and break the selectors below.
    Document navbarDoc = Jsoup.parse(getNavbarDoc().replace("&nbsp;", "").replace("\u00a0", ""));
    Element select = navbarDoc.select("select[name=week]").first();
    if (select == null) {
        // first() returns null on an empty result; fail with a checked, descriptive
        // error instead of a NullPointerException further down.
        throw new IOException("Navigation bar contains no week selector (select[name=week])");
    }

    SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData);

    String lastChange = lastChangeFromDescription(navbarDoc.select(".description").text());
    if (lastChange == null) {
        // The navigation bar carries no "Stand:" marker; try the title frame instead.
        try {
            String infoHtml = httpGet(baseUrl + "/frames/title.htm", data.optString(PARAM_ENCODING, null));
            lastChange = lastChangeFromDescription(Jsoup.parse(infoHtml).select(".description").text());
        } catch (Exception ignored) {
            // best effort: the schedule is still usable without a last-change date
            lastChange = null;
        }
        if (lastChange == null) {
            lastChange = "";
        }
    }

    boolean onePagePerClass =
            data.optBoolean(PARAM_SINGLE_CLASSES, data.optBoolean("single_classes", false)) // backwards compatibility
                    || data.optString(PARAM_SCHEDULE_TYPE, "substitution").equals("timetable");

    int successfulWeeks = 0;
    HttpResponseException lastException = null;
    for (Element option : select.children()) {
        String week = option.attr("value");
        String weekName = option.text();
        if (onePagePerClass) {
            int classNumber = 0;
            for (String klasse : getAllClasses()) {
                classNumber++; // class pages are numbered starting at 1
                String url = getScheduleUrl(week, classNumber, data);
                try {
                    parsePage(v, lastChange, klasse, url, weekName);
                } catch (HttpResponseException e) {
                    if (e.getStatusCode() != 500) {
                        throw e;
                    }
                    // HTTP 500 occurs in Hannover_MMBS: skip this class
                }
            }
            successfulWeeks++;
        } else {
            String url = getScheduleUrl(week, 0, data);
            try {
                parsePage(v, lastChange, null, url, weekName);
                successfulWeeks++;
            } catch (HttpResponseException e) {
                // remember the failure; it only matters if no week succeeds
                lastException = e;
            }
        }
    }
    if (successfulWeeks == 0 && lastException != null) {
        throw lastException;
    }

    v.setClasses(getAllClasses());
    v.setTeachers(getAllTeachers());
    v.setWebsite(baseUrl + "/default.htm");
    return v;
}

/**
 * Returns the trimmed text following the "Stand:" marker, or {@code null} when the marker
 * is absent. An explicit check is needed because {@code indexOf} returns -1 for a missing
 * marker, which would otherwise yield an arbitrary tail of the text.
 */
private static String lastChangeFromDescription(String description) {
    final String marker = "Stand:";
    int pos = description.indexOf(marker);
    if (pos < 0) {
        return null;
    }
    return description.substring(pos + marker.length()).trim();
}
From source file:de.geeksfactory.opacclient.apis.SISIS.java
protected void parse_reslist(String type, List<ReservedItem> reservations, Document doc, int offset) { Elements copytrs = doc.select(".data tr"); doc.setBaseUri(opac_url);//from w w w.j av a2 s . c o m int trs = copytrs.size(); if (trs == 1) { return; } assert (trs > 0); for (int i = 1; i < trs; i++) { Element tr = copytrs.get(i); ReservedItem item = new ReservedItem(); if (tr.text().contains("keine Daten") || tr.children().size() == 1) { return; } item.setTitle(tr.child(1).select("strong").text().trim()); try { String[] rowsplit1 = tr.child(1).html().split("<br[ /]*>"); String[] rowsplit2 = tr.child(2).html().split("<br[ /]*>"); if (rowsplit1.length > 1) item.setAuthor(rowsplit1[1].trim()); if (rowsplit2.length > 2) item.setBranch(rowsplit2[2].trim()); if (rowsplit2.length > 2) item.setStatus(rowsplit2[0].trim()); if (tr.select("a").size() == 1) { item.setCancelData(type + "$" + offset + "$" + tr.select("a").attr("abs:href").split("\\?")[1]); } } catch (Exception e) { e.printStackTrace(); } reservations.add(item); } assert (reservations.size() == trs - 1); }