List of usage examples for org.jsoup.nodes Element getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:jp.mau.twappremover.MainActivity.java
private void getApps() { _apps.clear();/*from ww w . j a v a2 s .c o m*/ HttpGet request = new HttpGet(APP_PAGE); request.addHeader("User-Agent", USER_AGENT); request.addHeader("Cookie", "_twitter_sess=" + _session_id + "; auth_token=" + _cookie_auth); try { String result = _client.execute(request, new ResponseHandler<String>() { @Override public String handleResponse(HttpResponse response) throws ClientProtocolException, IOException { switch (response.getStatusLine().getStatusCode()) { case HttpStatus.SC_OK: return EntityUtils.toString(response.getEntity(), "UTF-8"); case HttpStatus.SC_NOT_FOUND: throw new RuntimeException("not found"); default: throw new RuntimeException("error"); } } }); Document doc = null; doc = Jsoup.parse(result); // parse top page and get authenticity token Elements forms = doc.getElementsByTag("form"); for (Element e : forms) { Elements auths = e.getElementsByAttributeValue("name", "authenticity_token"); if (auths.size() > 0) { _auth_token = auths.get(0).attr("value"); break; } } Elements apps = doc.getElementsByClass("app"); for (Element e : apps) { LinkedApp app = new LinkedApp(); if (e.getElementsByTag("strong").size() > 0) app.name = e.getElementsByTag("strong").get(0).text(); if (e.getElementsByClass("creator").size() > 0) app.creator = e.getElementsByClass("creator").get(0).text(); if (e.getElementsByClass("description").size() > 0) app.desc = e.getElementsByClass("description").get(0).text(); if (e.getElementsByClass("app-img").size() > 0) app.imgUrl = e.getElementsByClass("app-img").get(0).attr("src"); if (e.getElementsByClass("revoke").size() > 0) { String tmp = e.getElementsByClass("revoke").get(0).attr("id"); app.revokeId = tmp.replaceAll(KEY_HEADER_REVOKE, ""); } else { // revoke id ????(facebook????????) continue; } _apps.add(app); } _handler.post(new Runnable() { @Override public void run() { _appadapter.notifyDataSetChanged(); } }); } catch (Exception ex) { ex.printStackTrace(); } }
From source file:com.jimplush.goose.ContentExtractor.java
/** * we could have long articles that have tons of paragraphs so if we tried to calculate the base score against * the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring * of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of * 100 then 100 should be our base./* w w w. jav a2 s.c o m*/ * * @param topNode * @return */ private int getBaselineScoreForSiblings(Element topNode) { int base = 100000; int numberOfParagraphs = 0; int scoreOfParagraphs = 0; Elements nodesToCheck = topNode.getElementsByTag("p"); for (Element node : nodesToCheck) { String nodeText = node.text(); WordStats wordStats = StopWords.getStopWordCount(nodeText); boolean highLinkDensity = isHighLinkDensity(node); if (wordStats.getStopWordCount() > 2 && !highLinkDensity) { numberOfParagraphs++; scoreOfParagraphs += wordStats.getStopWordCount(); } } if (numberOfParagraphs > 0) { base = scoreOfParagraphs / numberOfParagraphs; if (logger.isDebugEnabled()) { logger.debug("The base score for siblings to beat is: " + base + " NumOfParas: " + numberOfParagraphs + " scoreOfAll: " + scoreOfParagraphs); } } return base; }
From source file:us.colloquy.index.IndexHandler.java
public void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory, boolean useOnlyNumber) { ///Documents/Tolstoy/diaries Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory); List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> { return String.valueOf(path).endsWith(".ncx"); })) {//from w w w . j a va 2 s .c o m stream.forEach(results::add); // String joined = stream // .sorted() // .map(String::valueOf) // .collect(Collectors.joining("; ")); // // System.out.println("\nFound: " + joined); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); try { for (Path res : results) { Path parent = res.getParent(); // System.out.println("---------------------------------------------"); // System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); String title = ""; for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { title = child.text(); // System.out.println("Title: " + title); } } for (Element element : doc.getElementsByTag("navPoint")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { String label = child.text(); if (StringUtils.isNotEmpty(label)) { if (label.matches("?")) { System.out.println("------------------"); } String url = child.getElementsByTag("content").attr("src"); if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + url.replaceAll("#.*","")); } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url) && useOnlyNumber) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + url.replaceAll("#.*","")); } else { // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src")); } } } } } } catch (Exception e) { e.printStackTrace(); } // System.out.println("Size: " + uriList.size()); // for (DocumentPointer pointer : uriList) // { // //parse and // System.out.println(pointer.getSourse() + "\t" + pointer.getUri()); // } }
From source file:com.jimplush.goose.ContentExtractor.java
/** * adds any siblings that may have a decent score to this node * * @param node/* w w w . ja v a 2 s. co m*/ * @return */ private Element addSiblings(Element node) { if (logger.isDebugEnabled()) { logger.debug("Starting to add siblings"); } int baselineScoreForSiblingParagraphs = getBaselineScoreForSiblings(node); Element currentSibling = node.previousElementSibling(); while (currentSibling != null) { if (logger.isDebugEnabled()) { logger.debug("SIBLINGCHECK: " + debugNode(currentSibling)); } if (currentSibling.tagName().equals("p")) { node.child(0).before(currentSibling.outerHtml()); currentSibling = currentSibling.previousElementSibling(); continue; } // check for a paraph embedded in a containing element int insertedSiblings = 0; Elements potentialParagraphs = currentSibling.getElementsByTag("p"); if (potentialParagraphs.first() == null) { currentSibling = currentSibling.previousElementSibling(); continue; } for (Element firstParagraph : potentialParagraphs) { WordStats wordStats = StopWords.getStopWordCount(firstParagraph.text()); int paragraphScore = wordStats.getStopWordCount(); if ((float) (baselineScoreForSiblingParagraphs * .30) < paragraphScore) { if (logger.isDebugEnabled()) { logger.debug("This node looks like a good sibling, adding it"); } node.child(insertedSiblings).before("<p>" + firstParagraph.text() + "<p>"); insertedSiblings++; } } currentSibling = currentSibling.previousElementSibling(); } return node; }
From source file:com.jimplush.goose.ContentExtractor.java
/** * remove any divs that looks like non-content, clusters of links, or paras with no gusto * * @param node//from w w w .ja va 2s . c o m * @return */ private Element cleanupNode(Element node) { if (logger.isDebugEnabled()) { logger.debug("Starting cleanup Node"); } node = addSiblings(node); Elements nodes = node.children(); for (Element e : nodes) { if (e.tagName().equals("p")) { continue; } if (logger.isDebugEnabled()) { logger.debug("CLEANUP NODE: " + e.id() + " class: " + e.attr("class")); } boolean highLinkDensity = isHighLinkDensity(e); if (highLinkDensity) { if (logger.isDebugEnabled()) { logger.debug("REMOVING NODE FOR LINK DENSITY: " + e.id() + " class: " + e.attr("class")); } e.remove(); continue; } // now check for word density // grab all the paragraphs in the children and remove ones that are too small to matter Elements subParagraphs = e.getElementsByTag("p"); for (Element p : subParagraphs) { if (p.text().length() < 25) { p.remove(); } } // now that we've removed shorty paragraphs let's make sure to exclude any first paragraphs that don't have paras as // their next siblings to avoid getting img bylines // first let's remove any element that now doesn't have any p tags at all Elements subParagraphs2 = e.getElementsByTag("p"); if (subParagraphs2.size() == 0 && !e.tagName().equals("td")) { if (logger.isDebugEnabled()) { logger.debug("Removing node because it doesn't have any paragraphs"); } e.remove(); continue; } //if this node has a decent enough gravityScore we should keep it as well, might be content int topNodeScore = getScore(node); int currentNodeScore = getScore(e); float thresholdScore = (float) (topNodeScore * .08); if (logger.isDebugEnabled()) { logger.debug("topNodeScore: " + topNodeScore + " currentNodeScore: " + currentNodeScore + " threshold: " + thresholdScore); } if (currentNodeScore < thresholdScore) { if (!e.tagName().equals("td")) { if (logger.isDebugEnabled()) { logger.debug("Removing node due to low threshold score"); } e.remove(); } else { if (logger.isDebugEnabled()) { logger.debug("Not removing TD node"); } } continue; } } return node; }
From source file:ExtractorContentTest.java
private void treatSection(Element section, List<Catalog> catalogs) { // 1. get section name // FIXME what is it does not exist? // FIXME can be "h3" Elements sect2 = section.getElementsByTag("h2"); String s2 = null;/*from w ww .j a va 2 s . c o m*/ if (!sect2.isEmpty()) s2 = sect2.first().text(); // FIXME what about more than 1 ? String s3 = null; Elements sect3 = section.getElementsByTag("h3"); if (!sect3.isEmpty()) s3 = sect3.first().text(); String dt = null; Elements sectDT = section.getElementsByTag("p"); if (!sectDT.isEmpty()) { String contentDT = sectDT.first().text(); if (contentDT.startsWith(";")) dt = contentDT.replaceAll(";", ""); } // FIXME can be subsection // FIXME (1. optional step) some comments // 2. retrieve tabular Elements tables = section.getElementsByTag("table"); //if (!tables.isEmpty()) //System.err.println("\n****** " + s2 + " " + s3 + " *******\n"); for (Element table : tables) { // (0. optional step) act as subviewname Elements caption = table.select("caption"); String captionName = null; if (!caption.isEmpty()) captionName = caption.first().text(); /*** * Headers */ // List<Header> rHeaders = collectHeaders(table); boolean sortable = !table.select("[class=sortable wikitable]").isEmpty() || !table.select("[class=wikitable sortable]").isEmpty(); // FIXME: other cases Elements heads = table.select("thead"); if (sortable && (!heads.isEmpty())) { rHeaders = collectHeaders(heads.first()); } // 2 treat row Catalog product = null; Tree<String> structuralInformation = mkStructuralInformation(s2, s3, dt, captionName); if (sortable) { product = treatRows(table.select("tbody").first(), structuralInformation, rHeaders, sortable); } else product = treatRows(table, structuralInformation, rHeaders, sortable); catalogs.add(product); // } // set the "ID" / names // clean up for (Catalog catalog : catalogs) { for (Product p : catalog) { Header primaryHeader = p.getHeaders().get(0); p.setName(p.getValue(primaryHeader.getName())); } } }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
protected DetailledItem parse_result(String html) throws IOException { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url);//from ww w. j a va 2 s .c o m String html2 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showTitleActive", ENCODING); Document doc2 = Jsoup.parse(html2); doc2.setBaseUri(opac_url); String html3 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showAvailabilityActive", ENCODING); Document doc3 = Jsoup.parse(html3); doc3.setBaseUri(opac_url); DetailledItem result = new DetailledItem(); try { result.setId(doc.select("#bibtip_id").text().trim()); } catch (Exception ex) { ex.printStackTrace(); } List<String> reservationlinks = new ArrayList<>(); for (Element link : doc3.select("#vormerkung a, #tab-content a")) { String href = link.absUrl("href"); Map<String, String> hrefq = getQueryParamsFirst(href); if (result.getId() == null) { // ID retrieval String key = hrefq.get("katkey"); if (key != null) { result.setId(key); break; } } // Vormerken if (hrefq.get("methodToCall") != null) { if (hrefq.get("methodToCall").equals("doVormerkung") || hrefq.get("methodToCall").equals("doBestellung")) { reservationlinks.add(href.split("\\?")[1]); } } } if (reservationlinks.size() == 1) { result.setReservable(true); result.setReservation_info(reservationlinks.get(0)); } else if (reservationlinks.size() == 0) { result.setReservable(false); } else { // TODO: Multiple options - handle this case! } if (doc.select(".data td img").size() == 1) { result.setCover(doc.select(".data td img").first().attr("abs:src")); try { downloadCover(result); } catch (Exception e) { } } if (doc.select(".aw_teaser_title").size() == 1) { result.setTitle(doc.select(".aw_teaser_title").first().text().trim()); } else if (doc.select(".data td strong").size() > 0) { result.setTitle(doc.select(".data td strong").first().text().trim()); } else { result.setTitle(""); } if (doc.select(".aw_teaser_title_zusatz").size() > 0) { result.addDetail(new Detail("Titelzusatz", doc.select(".aw_teaser_title_zusatz").text().trim())); } String title = ""; String text = ""; boolean takeover = false; Element detailtrs = doc2.select(".box-container .data td").first(); for (Node node : detailtrs.childNodes()) { if (node instanceof Element) { if (((Element) node).tagName().equals("strong")) { title = ((Element) node).text().trim(); text = ""; } else { if (((Element) node).tagName().equals("a") && (((Element) node).text().trim().contains("hier klicken") || title.equals("Link:"))) { text = text + node.attr("href"); takeover = true; break; } } } else if (node instanceof TextNode) { text = text + ((TextNode) node).text(); } } if (!takeover) { text = ""; title = ""; } detailtrs = doc2.select("#tab-content .data td").first(); if (detailtrs != null) { for (Node node : detailtrs.childNodes()) { if (node instanceof Element) { if (((Element) node).tagName().equals("strong")) { if (!text.equals("") && !title.equals("")) { result.addDetail(new Detail(title.trim(), text.trim())); if (title.equals("Titel:")) { result.setTitle(text.trim()); } text = ""; } title = ((Element) node).text().trim(); } else { if (((Element) node).tagName().equals("a") && (((Element) node).text().trim().contains("hier klicken") || title.equals("Link:"))) { text = text + node.attr("href"); } else { text = text + ((Element) node).text(); } } } else if (node instanceof TextNode) { text = text + ((TextNode) node).text(); } } } else { if (doc2.select("#tab-content .fulltitle tr").size() > 0) { Elements rows = doc2.select("#tab-content .fulltitle tr"); for (Element tr : rows) { if (tr.children().size() == 2) { Element valcell = tr.child(1); String value = valcell.text().trim(); if (valcell.select("a").size() == 1) { value = valcell.select("a").first().absUrl("href"); } result.addDetail(new Detail(tr.child(0).text().trim(), value)); } } } else { result.addDetail(new Detail(stringProvider.getString(StringProvider.ERROR), stringProvider.getString(StringProvider.COULD_NOT_LOAD_DETAIL))); } } if (!text.equals("") && !title.equals("")) { result.addDetail(new Detail(title.trim(), text.trim())); if (title.equals("Titel:")) { result.setTitle(text.trim()); } } for (Element link : doc3.select("#tab-content a")) { Map<String, String> hrefq = getQueryParamsFirst(link.absUrl("href")); if (result.getId() == null) { // ID retrieval String key = hrefq.get("katkey"); if (key != null) { result.setId(key); break; } } } for (Element link : doc3.select(".box-container a")) { if (link.text().trim().equals("Download")) { result.addDetail( new Detail(stringProvider.getString(StringProvider.DOWNLOAD), link.absUrl("href"))); } } Map<String, Integer> copy_columnmap = new HashMap<>(); // Default values copy_columnmap.put("barcode", 1); copy_columnmap.put("branch", 3); copy_columnmap.put("status", 4); Elements copy_columns = doc.select("#tab-content .data tr#bg2 th"); for (int i = 0; i < copy_columns.size(); i++) { Element th = copy_columns.get(i); String head = th.text().trim(); if (head.contains("Status")) { copy_columnmap.put("status", i); } if (head.contains("Zweigstelle")) { copy_columnmap.put("branch", i); } if (head.contains("Mediennummer")) { copy_columnmap.put("barcode", i); } if (head.contains("Standort")) { copy_columnmap.put("location", i); } if (head.contains("Signatur")) { copy_columnmap.put("signature", i); } } Pattern status_lent = Pattern.compile( "^(entliehen) bis ([0-9]{1,2}.[0-9]{1,2}.[0-9]{2," + "4}) \\(gesamte Vormerkungen: ([0-9]+)\\)$"); Pattern status_and_barcode = Pattern.compile("^(.*) ([0-9A-Za-z]+)$"); Elements exemplartrs = doc.select("#tab-content .data tr").not("#bg2"); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); for (Element tr : exemplartrs) { try { Copy copy = new Copy(); Element status = tr.child(copy_columnmap.get("status")); Element barcode = tr.child(copy_columnmap.get("barcode")); String barcodetext = barcode.text().trim().replace(" Wegweiser", ""); // STATUS String statustext; if (status.getElementsByTag("b").size() > 0) { statustext = status.getElementsByTag("b").text().trim(); } else { statustext = status.text().trim(); } if (copy_columnmap.get("status").equals(copy_columnmap.get("barcode"))) { Matcher matcher1 = status_and_barcode.matcher(statustext); if (matcher1.matches()) { statustext = matcher1.group(1); barcodetext = matcher1.group(2); } } Matcher matcher = status_lent.matcher(statustext); if (matcher.matches()) { copy.setStatus(matcher.group(1)); copy.setReservations(matcher.group(3)); copy.setReturnDate(fmt.parseLocalDate(matcher.group(2))); } else { copy.setStatus(statustext); } copy.setBarcode(barcodetext); if (status.select("a[href*=doVormerkung]").size() == 1) { copy.setResInfo(status.select("a[href*=doVormerkung]").attr("href").split("\\?")[1]); } String branchtext = tr.child(copy_columnmap.get("branch")).text().trim().replace(" Wegweiser", ""); copy.setBranch(branchtext); if (copy_columnmap.containsKey("location")) { copy.setLocation( tr.child(copy_columnmap.get("location")).text().trim().replace(" Wegweiser", "")); } if (copy_columnmap.containsKey("signature")) { copy.setShelfmark( tr.child(copy_columnmap.get("signature")).text().trim().replace(" Wegweiser", "")); } result.addCopy(copy); } catch (Exception ex) { ex.printStackTrace(); } } try { Element isvolume = null; Map<String, String> volume = new HashMap<>(); Elements links = doc.select(".data td a"); int elcount = links.size(); for (int eli = 0; eli < elcount; eli++) { List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(links.get(eli).attr("href")), "UTF-8"); for (NameValuePair nv : anyurl) { if (nv.getName().equals("methodToCall") && nv.getValue().equals("volumeSearch")) { isvolume = links.get(eli); } else if (nv.getName().equals("catKey")) { volume.put("catKey", nv.getValue()); } else if (nv.getName().equals("dbIdentifier")) { volume.put("dbIdentifier", nv.getValue()); } } if (isvolume != null) { volume.put("volume", "true"); result.setVolumesearch(volume); break; } } } catch (Exception e) { e.printStackTrace(); } return result; }
From source file:com.salsaberries.narchiver.Trawler.java
/** * Logs into the site./*from ww w . j a v a2s. c om*/ * * @return * @throws TrawlException */ private boolean login() throws TrawlException { --loginAttempts; if (loginAttempts < 0) { logger.error("Warning! Exceeded maximum number of login attempts! Program is now exiting."); throw new TrawlException("Maximum login attempts exceeded."); } logger.info("Attempting to log in at " + baseURL + site.getString("LOGIN_URL")); try { // follow redirects until you get it right HttpRequest httpRequest; HttpMessage httpGet; String url = baseURL + site.getString("LOGIN_URL"); while (true) { httpGet = new HttpMessage(HttpType.GET); httpGet.setUrl(url); httpGet.initializeDefaultHeaders(site); httpGet.addCookieHeaders(cookies); httpRequest = new HttpRequest(httpGet); if (httpRequest.getStatusCode() != 200) { getTempCookies(httpRequest.getHeaders()); // Find the header I want boolean found = false; for (Header h : httpRequest.getHeaders()) { if (h.getName().equals("Location")) { url = h.getValue(); found = true; } } if (!found) { throw new TrawlException("Redirect loop."); } } else { break; } } // Get headers ArrayList<Header> headers = httpRequest.getHeaders(); // Parse the cookies getTempCookies(headers); String body = httpRequest.getHtml(); Document doc = Jsoup.parse(body); Elements logins = doc.getElementsByAttributeValue("action", site.getString("LOGIN_SUBMIT")); if (logins.isEmpty()) { logins = doc.getElementsByAttributeValue("action", site.getString("BASE_URL") + site.getString("LOGIN_SUBMIT")); } if (logins.isEmpty()) { logins = doc.getElementsByAttributeValue("method", "POST"); } if (logins.isEmpty()) { throw new TrawlException("Failed to find login form!"); } if (logins.size() > 1) { logger.warn("Found multiple login forms. Picking the first one..."); } Element login = logins.get(0); // Extract the captcha image if appropriate String captchaResult = ""; if (!site.getString("CAPTCHA").equals("")) { // Download the captcha image HttpMessage getCaptcha = new HttpMessage(HttpType.GET); getCaptcha.setImage(true); if (!site.isNull("CAPTCHA_IMAGE")) { getCaptcha.setUrl(baseURL + site.getString("CAPTCHA_IMAGE")); getCaptcha.initializeDefaultImageHeaders(site); getCaptcha.addHeader(new Header("Referrer", baseURL + site.getString("LOGIN_URL"))); getCaptcha.addCookieHeaders(cookies); // Send it to deathbycaptcha SocketClient client = new SocketClient("njanetos", "2point7182"); HttpRequest image = new HttpRequest(getCaptcha); ByteArrayOutputStream os = new ByteArrayOutputStream(); ImageIO.write(image.getImage(), "png", os); Captcha result = client.decode(os.toByteArray()); captchaResult = result.toString(); } else { // Just try to get the image Elements captchas = login.getElementsByTag("img"); if (captchas.size() != 1) { throw new TrawlException( "Failed to find captcha, but the initialization file says there should be one."); } Element captchaImage = captchas.get(0); // Does it contain base64? if (captchaImage.attr("src").contains("base64")) { String src = captchaImage.attr("src").split(",")[1]; byte image[] = Base64.decodeBase64(src); ByteArrayOutputStream os = new ByteArrayOutputStream(); os.write(image); SocketClient client = new SocketClient("njanetos", "2point7182"); Captcha result = client.decode(os.toByteArray()); captchaResult = result.toString(); } else { if (captchaImage.attr("src").contains(baseURL)) { getCaptcha.setUrl(captchaImage.attr("src")); } else { getCaptcha.setUrl(baseURL + captchaImage.attr("src")); } getCaptcha.initializeDefaultImageHeaders(site); getCaptcha.addHeader(new Header("Referrer", baseURL + site.getString("LOGIN_URL"))); getCaptcha.addCookieHeaders(cookies); // Send it to deathbycaptcha SocketClient client = new SocketClient("njanetos", "2point7182"); HttpRequest image = new HttpRequest(getCaptcha); ByteArrayOutputStream os = new ByteArrayOutputStream(); ImageIO.write(image.getImage(), "png", os); Captcha result = client.decode(os.toByteArray()); captchaResult = result.toString(); } } logger.info("Decoded captcha: " + captchaResult); } // Grab any hidden fields Elements hidden = login.getElementsByAttributeValue("type", "hidden"); // Build the post response HttpMessage httpPost = new HttpMessage(HttpType.POST); httpPost.initializeDefaultHeaders(site); httpPost.addCookieHeaders(cookies); // TODO: Read this from the html! httpPost.setUrl(baseURL + site.getString("LOGIN_SUBMIT")); httpPost.appendContent(site.getString("USERNAME_FIELD"), site.getString("USERNAME")); httpPost.appendContent(site.getString("PASSWORD_FIELD"), site.getString("PASSWORD")); if (!captchaResult.equals("")) { httpPost.appendContent(site.getString("CAPTCHA_FIELD"), captchaResult); } for (int i = 0; i < hidden.size(); ++i) { httpPost.appendContent(hidden.get(i).attr("name"), hidden.get(i).attr("value")); } // Add the submit info Element submit = login.getElementsByAttributeValue("type", "submit").get(0); httpPost.appendContent(submit.attr("name"), submit.attr("value")); // Add the referrer httpPost.addHeader(new Header("Referer", baseURL + site.getString("LOGIN_URL"))); // Log in HttpRequest response = new HttpRequest(httpPost); headers = response.getHeaders(); // Add any relevant cookies getTempCookies(headers); logger.info("Successfully logged in, response code: " + response.getStatusCode()); // Were we redirected? If so, visit the redirection URL before continuing. if (response.getStatusCode() == 302) { // Send a GET request to the redirection URL before continuing. httpGet = new HttpMessage(HttpType.GET); httpGet.initializeDefaultHeaders(site); httpGet.addHeader(new Header("Referer", baseURL + site.getString("LOGIN_URL"))); String redirectionURL = getRedirectionURL(headers); httpGet.setUrl(redirectionURL); httpGet.addCookieHeaders(cookies); httpRequest = new HttpRequest(httpGet); logger.debug("Visited redirected page. Status code " + httpRequest.getStatusCode()); } } catch (ConnectionException | MalformedURLException | ProtocolException ex) { // Did not successfully log in logger.error(ex.getMessage()); return false; } catch (IOException ex) { // Did not successfully log in logger.error(ex.getMessage()); return false; } catch (Exception | InterruptedException ex) { // Did not successfully log in logger.error(ex.getMessage()); return false; } // Did we successfully log in? Then return true. return true; }
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Retrieve the text located in //ww w . j a v a 2 s . co m * a list (UL or OL) HTML element. * * @param element * Element to be processed. * @param rawStr * Current raw text string. * @param linkedStr * Current text with hyperlinks. * @param ordered * Whether the list is numbered or not. */ private void processListElement(Element element, StringBuilder rawStr, StringBuilder linkedStr, boolean ordered) { // possibly remove the last new line character char c = rawStr.charAt(rawStr.length() - 1); if (c == '\n') { rawStr.deleteCharAt(rawStr.length() - 1); linkedStr.deleteCharAt(linkedStr.length() - 1); } // possibly remove preceeding space c = rawStr.charAt(rawStr.length() - 1); if (c == ' ') { rawStr.deleteCharAt(rawStr.length() - 1); linkedStr.deleteCharAt(linkedStr.length() - 1); } // possibly add a column c = rawStr.charAt(rawStr.length() - 1); if (c != '.' && c != ':' && c != ';') { rawStr.append(":"); linkedStr.append(":"); } // process each list element int count = 1; for (Element listElt : element.getElementsByTag(XmlNames.ELT_LI)) { // add leading space rawStr.append(" "); linkedStr.append(" "); // possibly add number if (ordered) { rawStr.append(count + ") "); linkedStr.append(count + ") "); } count++; // get text and links processTextElement(listElt, rawStr, linkedStr); // possibly remove the last new line character c = rawStr.charAt(rawStr.length() - 1); if (c == '\n') { rawStr.deleteCharAt(rawStr.length() - 1); linkedStr.deleteCharAt(linkedStr.length() - 1); } // add final separator rawStr.append(";"); linkedStr.append(";"); } // possibly remove last separator c = rawStr.charAt(rawStr.length() - 1); if (c == ';') { rawStr.deleteCharAt(rawStr.length() - 1); linkedStr.deleteCharAt(linkedStr.length() - 1); c = rawStr.charAt(rawStr.length() - 1); if (c != '.') { rawStr.append("."); linkedStr.append("."); } rawStr.append("\n"); linkedStr.append("\n"); } }
From source file:Leitura.Ecobertura.java
public void escreveTxt() throws IOException { //mtodo para pegar os nomes dos mtodos declarados String auxLinha = null;//from w ww . j a v a 2 s . co m char aux[] = null; StringBuffer sbClasse = new StringBuffer(); StringBuffer sbLinha = new StringBuffer(); StringBuffer sbMetodo = new StringBuffer(); String metodoTemp; boolean controleClasse = false; // Pega somente os elementos com tag "tr" Elements elements = document.getElementsByTag("tr"); for (Element children : elements) { if (StringUtils.isBlank(children.text())) { continue; } children.getElementsByClass("comment").remove(); // System.out.println(children.text()); //----------------- Dispensa Comentrios ----------------- //auxLinha = children.getElementsByTag("span").eq(0).text(); /*if (auxLinha.contains("/*")) { comentario = true; } else if(auxLinha.contains("//")){ comentario = true; controle = true; // controla comentrio com // } if (auxLinha.contains("*//*")) { comentario = false; }else if(auxLinha.contains("\n") && controle == true){ comentario = false; controle = false; }*/ //------------------ Fim dispensa comentrios -------------- // if (comentario == false) { //--------------------- verifica as linhas do cdigo ------------------- if (StringUtils.isNotBlank(children.getElementsByClass("numLine").text())) { aux = children.getElementsByClass("numLine").text().toCharArray(); for (int i = 0; i < aux.length; i++) { //System.out.println("["+aux[i]+"]"); if (aux[i] >= 48 && aux[i] <= 57) { // pega o nmero da linha sbLinha.append(aux[i]); } } auxLinha = sbLinha.toString(); if (StringUtils.isNotBlank(auxLinha)) { // transforma a linha para inteiro qtdeLinhas = Integer.parseInt(auxLinha); } sbLinha.delete(0, sbLinha.length()); } // ------------------- Fim linhas --------------------------------- Elements pre = children.getElementsByTag("pre"); for (Element element : pre) { String tagMetodo = element.getElementsByTag("span").eq(0).text(); //------------------------- Verifica classe ------------------------- if (element.getElementsByTag("span").text().contains("class")) { element.select("span.keyword").remove(); if (controleClasse == false) { classe = element.text().trim(); aux = classe.toCharArray(); for (int j = 0; j < aux.length; j++) { if ((65 <= aux[j]) && (aux[j] <= 90) || (aux[j] >= 97) && (aux[j] <= 122) || (aux[j] == 95)) { sbClasse.append(aux[j]); //System.out.println(j + ", " + sbClasse); if (j < aux.length - 1) { // System.out.println("size: "+aux.length+" j: "+j); if ((aux[j + 1] == ' ') || (aux[j + 1] == '{') || (aux[j + 1] == '<')) { // System.out.println("entrei"); if ((j + 1) < aux.length - 1) { for (int k = j++; k < aux.length; k++) { aux[k] = ' '; } } } } } } excluiLinhas.add(qtdeLinhas); classe = sbClasse.toString().replaceAll("\r", "").replaceAll("\t", "").replaceAll("\n", ""); controleClasse = true; } // System.out.println("Classe: " + classe); } //------------------------------- Fim verifica classe------------------------------ //------------------------------ Verifica mtodo ---------------------------------- //else if (tagMetodo.equals("privtate") || tagMetodo.equals("public") || tagMetodo.equals("protected")) { else if (element.getElementsByTag("span").text().contains("privtate") || element.getElementsByTag("span").text().contains("public") || element.getElementsByTag("span").text().contains("protected") || element.getElementsByTag("span").text().contains("static") || element.getElementsByTag("span").text().contains("final") || element.getElementsByTag("span").text().contains("native") || element.getElementsByTag("span").text().contains("synchronized") || element.getElementsByTag("span").text().contains("abstract") || element.getElementsByTag("span").text().contains("threadsafe") || element.getElementsByTag("span").text().contains("transient")) { element.select("span.keyword").remove(); if (!element.text().contains("=") && !element.text().contains(".") && !element.text().contains("@")) { String[] s = element.text().split(" "); for (int i = 0; i < s.length; i++) { if (s[i].contains("(")) { aux = s[i].toCharArray(); for (int j = 0; j < aux.length; j++) { if (aux[j] == '(') { for (int k = j; k < aux.length; k++) { aux[k] = ' '; } break; } sbMetodo.append(aux[j]); } metodoTemp = sbMetodo.toString(); if (!metodoTemp.isEmpty()) { metodo = metodoTemp.replaceAll("\r", "").replaceAll("\t", "").replaceAll("\n", ""); sbMetodo.delete(0, aux.length); informacoes = new Informacoes(classe, metodo, Integer.parseInt(auxLinha)); inf.add(informacoes); } } } } } // --------------------------- Fim Verifica Mtodo ------------------------------------ } // } } /* for(int i=0; i<inf.size(); i++){ System.out.println("Classe:"+inf.get(i).getClasse()+" Metodo:"+inf.get(i).getMetodo()+" Linha: "+inf.get(i).getLinha()); } // /* for(Map.Entry<String,Informacoes> entry : inf.entrySet()) { String key = entry.getKey(); int value = entry.getValue().getLinha(); String metodov = entry.getValue().getMetodo(); String classev = entry.getValue().getClasse(); System.out.println(key + " => " + classev+ " => " +metodov+ " => " +value); }*/ }