Example usage for org.jsoup.nodes Element getElementsByTag

Introduction

On this page you can find example usage for org.jsoup.nodes Element getElementsByTag.

Prototype

public Elements getElementsByTag(String tagName)

Source Link

Document

Finds elements, including and recursively under this element, with the specified tag name.

Usage

From source file:jp.mau.twappremover.MainActivity.java

/**
 * Reloads the list of linked applications.
 * <p>
 * Requests {@code APP_PAGE} with the stored session cookies, parses the returned HTML,
 * remembers the authenticity token of the first form that carries one, rebuilds
 * {@code _apps} from every element of class {@code app} that has a revoke id, and finally
 * posts a task to {@code _handler} that notifies the adapter. Any exception is printed
 * and otherwise ignored.
 */
private void getApps() {
    _apps.clear();

    HttpGet pageRequest = new HttpGet(APP_PAGE);
    pageRequest.addHeader("User-Agent", USER_AGENT);
    pageRequest.addHeader("Cookie", "_twitter_sess=" + _session_id + "; auth_token=" + _cookie_auth);

    try {
        String html = _client.execute(pageRequest, new ResponseHandler<String>() {
            @Override
            public String handleResponse(HttpResponse response) throws ClientProtocolException, IOException {
                // only a 200 response yields a body; everything else is reported as a failure
                int status = response.getStatusLine().getStatusCode();
                if (status == HttpStatus.SC_OK) {
                    return EntityUtils.toString(response.getEntity(), "UTF-8");
                }
                if (status == HttpStatus.SC_NOT_FOUND) {
                    throw new RuntimeException("not found");
                }
                throw new RuntimeException("error");
            }
        });

        Document page = Jsoup.parse(html);

        // the authenticity token is taken from the first form that contains such a field
        for (Element form : page.getElementsByTag("form")) {
            Elements tokenFields = form.getElementsByAttributeValue("name", "authenticity_token");
            if (!tokenFields.isEmpty()) {
                _auth_token = tokenFields.get(0).attr("value");
                break;
            }
        }

        for (Element entry : page.getElementsByClass("app")) {
            LinkedApp linked = new LinkedApp();

            Elements names = entry.getElementsByTag("strong");
            if (!names.isEmpty()) {
                linked.name = names.get(0).text();
            }

            Elements creators = entry.getElementsByClass("creator");
            if (!creators.isEmpty()) {
                linked.creator = creators.get(0).text();
            }

            Elements descriptions = entry.getElementsByClass("description");
            if (!descriptions.isEmpty()) {
                linked.desc = descriptions.get(0).text();
            }

            Elements images = entry.getElementsByClass("app-img");
            if (!images.isEmpty()) {
                linked.imgUrl = images.get(0).attr("src");
            }

            Elements revokes = entry.getElementsByClass("revoke");
            if (revokes.isEmpty()) {
                // no revoke id on this entry, so it is not listed (the original comment here
                // was mis-encoded; it appears to mention Facebook as an example)
                continue;
            }
            String revokeAttr = revokes.get(0).attr("id");
            linked.revokeId = revokeAttr.replaceAll(KEY_HEADER_REVOKE, "");

            _apps.add(linked);
        }

        _handler.post(new Runnable() {
            @Override
            public void run() {
                _appadapter.notifyDataSetChanged();
            }
        });
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * we could have long articles that have tons of paragraphs so if we tried to calculate the base score against
 * the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring
 * of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of
 * 100 then 100 should be our base./*  w  w w. jav  a2 s.c o  m*/
 *
 * @param topNode
 * @return
 */
private int getBaselineScoreForSiblings(Element topNode) {

    int base = 100000;

    int numberOfParagraphs = 0;
    int scoreOfParagraphs = 0;

    Elements nodesToCheck = topNode.getElementsByTag("p");

    for (Element node : nodesToCheck) {

        String nodeText = node.text();
        WordStats wordStats = StopWords.getStopWordCount(nodeText);
        boolean highLinkDensity = isHighLinkDensity(node);

        if (wordStats.getStopWordCount() > 2 && !highLinkDensity) {

            numberOfParagraphs++;
            scoreOfParagraphs += wordStats.getStopWordCount();
        }

    }

    if (numberOfParagraphs > 0) {
        base = scoreOfParagraphs / numberOfParagraphs;
        if (logger.isDebugEnabled()) {
            logger.debug("The base score for siblings to beat is: " + base + " NumOfParas: "
                    + numberOfParagraphs + " scoreOfAll: " + scoreOfParagraphs);
        }
    }

    return base;

}

From source file:us.colloquy.index.IndexHandler.java

/**
 * Scans a directory tree for {@code .ncx} files and adds a {@link DocumentPointer} to
 * {@code uriList} for every navigation entry whose label matches the expected pattern.
 * <p>
 * Files are searched up to six levels below {@code letterDirectory}. For each file the
 * title is the text of the last child of the last {@code docTitle} element; each child of
 * each {@code navPoint} element is then checked. I/O and parsing failures are printed and
 * otherwise ignored.
 *
 * @param uriList         receives the pointers that were found
 * @param letterDirectory root directory to search
 * @param useOnlyNumber   when {@code true}, labels that merely contain a 1-3 digit number
 *                        are accepted as well
 */
public void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory, boolean useOnlyNumber) {
    Path root = FileSystems.getDefault().getPath(letterDirectory);

    List<Path> ncxFiles = new ArrayList<>();

    int maxDepth = 6;

    try (Stream<Path> found = Files.find(root, maxDepth,
            (path, attr) -> String.valueOf(path).endsWith(".ncx"))) {
        found.forEach(ncxFiles::add);
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + ncxFiles.size());

    try {
        for (Path ncx : ncxFiles) {
            Path folder = ncx.getParent();

            // jsoup is used to read the navigation file
            Document doc = Jsoup.parse(ncx.toFile(), "UTF-8");

            // later children overwrite earlier ones, so the last one wins
            String title = "";
            for (Element docTitle : doc.getElementsByTag("docTitle")) {
                for (Element part : docTitle.children()) {
                    title = part.text();
                }
            }

            for (Element navPoint : doc.getElementsByTag("navPoint")) {
                for (Element child : navPoint.children()) {
                    String label = child.text();

                    if (!StringUtils.isNotEmpty(label)) {
                        continue;
                    }

                    // NOTE(review): the literals "?" below and "[?--?]" further down look
                    // mis-encoded (non-ASCII characters replaced by '?'). As written, "?" is a
                    // dangling quantifier and "[?--?]" appears to be an illegal character range,
                    // so matches() would throw PatternSyntaxException - restore the original
                    // characters from the upstream source.
                    if (label.matches("?")) {
                        System.out.println("------------------");
                    }

                    String url = child.getElementsByTag("content").attr("src");

                    boolean numberAndText = label.matches(".*\\d{1,3}.*[?--?]+.*")
                            && StringUtils.isNotEmpty(url);
                    boolean numberOnly = !numberAndText && label.matches(".*\\d{1,3}.*")
                            && StringUtils.isNotEmpty(url) && useOnlyNumber;

                    if (numberAndText || numberOnly) {
                        // the fragment part ("#...") of the source reference is dropped
                        String target = folder.toString() + File.separator + url.replaceAll("#.*", "");
                        uriList.add(new DocumentPointer(target, title));
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * adds any siblings that may have a decent score to this node
 *
 * @param node/*  w  w w .  ja v a 2  s.  co  m*/
 * @return
 */
private Element addSiblings(Element node) {
    if (logger.isDebugEnabled()) {
        logger.debug("Starting to add siblings");
    }
    int baselineScoreForSiblingParagraphs = getBaselineScoreForSiblings(node);

    Element currentSibling = node.previousElementSibling();
    while (currentSibling != null) {
        if (logger.isDebugEnabled()) {
            logger.debug("SIBLINGCHECK: " + debugNode(currentSibling));
        }

        if (currentSibling.tagName().equals("p")) {

            node.child(0).before(currentSibling.outerHtml());
            currentSibling = currentSibling.previousElementSibling();
            continue;
        }

        // check for a paraph embedded in a containing element
        int insertedSiblings = 0;
        Elements potentialParagraphs = currentSibling.getElementsByTag("p");
        if (potentialParagraphs.first() == null) {
            currentSibling = currentSibling.previousElementSibling();
            continue;
        }
        for (Element firstParagraph : potentialParagraphs) {
            WordStats wordStats = StopWords.getStopWordCount(firstParagraph.text());

            int paragraphScore = wordStats.getStopWordCount();

            if ((float) (baselineScoreForSiblingParagraphs * .30) < paragraphScore) {
                if (logger.isDebugEnabled()) {
                    logger.debug("This node looks like a good sibling, adding it");
                }
                node.child(insertedSiblings).before("<p>" + firstParagraph.text() + "<p>");
                insertedSiblings++;
            }

        }

        currentSibling = currentSibling.previousElementSibling();
    }
    return node;

}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * Strips children of the top node that look like non-content: clusters of links,
 * containers without meaningful paragraphs, and low-scoring elements.
 * <p>
 * Siblings are merged in first via {@code addSiblings}. Direct {@code p} children are
 * always kept. Every other child is removed when it is link-heavy, when it has no
 * paragraph left after dropping those shorter than 25 characters, or when its score is
 * below 8% of the top node's score; {@code td} elements are exempt from the last two rules.
 *
 * @param node the top node to clean
 * @return the node returned by {@code addSiblings}, after cleanup
 */
private Element cleanupNode(Element node) {
    if (logger.isDebugEnabled()) {
        logger.debug("Starting cleanup Node");
    }

    node = addSiblings(node);

    for (Element child : node.children()) {
        // plain paragraphs are content, leave them alone
        if (child.tagName().equals("p")) {
            continue;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("CLEANUP  NODE: " + child.id() + " class: " + child.attr("class"));
        }

        if (isHighLinkDensity(child)) {
            if (logger.isDebugEnabled()) {
                logger.debug("REMOVING  NODE FOR LINK DENSITY: " + child.id() + " class: " + child.attr("class"));
            }
            child.remove();
            continue;
        }

        // word density: drop nested paragraphs that are too short to matter
        for (Element shortCandidate : child.getElementsByTag("p")) {
            if (shortCandidate.text().length() < 25) {
                shortCandidate.remove();
            }
        }

        // with the short paragraphs gone, an element without any paragraph left is most
        // likely something like an image byline; table cells are kept regardless
        Elements remainingParagraphs = child.getElementsByTag("p");
        if (remainingParagraphs.size() == 0 && !child.tagName().equals("td")) {
            if (logger.isDebugEnabled()) {
                logger.debug("Removing node because it doesn't have any paragraphs");
            }
            child.remove();
            continue;
        }

        // an element with a decent score relative to the top node may still be content
        int topNodeScore = getScore(node);
        int currentNodeScore = getScore(child);
        float thresholdScore = (float) (topNodeScore * .08);
        if (logger.isDebugEnabled()) {
            logger.debug("topNodeScore: " + topNodeScore + " currentNodeScore: " + currentNodeScore
                    + " threshold: " + thresholdScore);
        }
        if (currentNodeScore < thresholdScore) {
            if (child.tagName().equals("td")) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Not removing TD node");
                }
            } else {
                if (logger.isDebugEnabled()) {
                    logger.debug("Removing node due to low threshold score");
                }
                child.remove();
            }
        }
    }

    return node;
}

From source file:ExtractorContentTest.java

/**
 * Turns every table found under a page section into a {@code Catalog} and appends it to
 * {@code catalogs}, then (re)assigns product names for all catalogs in the list.
 *
 * @param section  element containing the section's headings, paragraphs and tables
 * @param catalogs list the extracted catalogs are appended to
 */
private void treatSection(Element section, List<Catalog> catalogs) {

    // 1. section names: text of the first h2 / first h3 under the section, null when absent
    // FIXME the heading may be missing altogether, may only exist as h3, and there may be
    //       more than one h2
    Elements h2s = section.getElementsByTag("h2");
    String s2 = h2s.isEmpty() ? null : h2s.first().text();

    Elements h3s = section.getElementsByTag("h3");
    String s3 = h3s.isEmpty() ? null : h3s.first().text();

    // a first paragraph that starts with ';' is kept as dt, with every ';' removed
    String dt = null;
    Elements paragraphs = section.getElementsByTag("p");
    if (!paragraphs.isEmpty()) {
        String firstParagraph = paragraphs.first().text();
        if (firstParagraph.startsWith(";")) {
            dt = firstParagraph.replaceAll(";", "");
        }
    }

    // FIXME the section can be a subsection; optional leading comments are not handled

    // 2. one catalog per table
    for (Element table : section.getElementsByTag("table")) {

        // optional caption, passed on as part of the structural information
        Elements captions = table.select("caption");
        String captionName = captions.isEmpty() ? null : captions.first().text();

        // headers are collected from the whole table first ...
        List<Header> rHeaders = collectHeaders(table);

        boolean sortable = !(table.select("[class=sortable wikitable]").isEmpty()
                && table.select("[class=wikitable sortable]").isEmpty());

        // ... and replaced by those of the thead for sortable tables that have one
        // FIXME: other cases
        Elements heads = table.select("thead");
        if (sortable && !heads.isEmpty()) {
            rHeaders = collectHeaders(heads.first());
        }

        // rows come from the tbody for sortable tables, from the table itself otherwise
        Tree<String> structuralInformation = mkStructuralInformation(s2, s3, dt, captionName);
        Element rowSource = sortable ? table.select("tbody").first() : table;
        Catalog product = treatRows(rowSource, structuralInformation, rHeaders, sortable);
        catalogs.add(product);
    }

    // name every product after its value in the first header's column
    for (Catalog catalog : catalogs) {
        for (Product item : catalog) {
            Header primaryHeader = item.getHeaders().get(0);
            item.setName(item.getValue(primaryHeader.getName()));
        }
    }

}

From source file:de.geeksfactory.opacclient.apis.SISIS.java

/**
 * Builds a {@link DetailledItem} from the detail view of a single hit.
 * <p>
 * In addition to the given page, two further tabs of the same hit are requested through
 * {@code httpGet}: {@code showTitleActive} (parsed into {@code doc2}, source of the detail
 * fields) and {@code showAvailabilityActive} (parsed into {@code doc3}, source of the
 * reservation and download links). Id, cover, title, details, copies and volume-search
 * information are then filled in step by step; failures while reading a single copy row or
 * the volume links are printed and skipped.
 *
 * @param html HTML of the detail page that was already loaded
 * @return the populated item
 * @throws IOException presumably when one of the additional tab requests fails
 */
protected DetailledItem parse_result(String html) throws IOException {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);

    String html2 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showTitleActive", ENCODING);

    Document doc2 = Jsoup.parse(html2);
    doc2.setBaseUri(opac_url);

    String html3 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showAvailabilityActive",
            ENCODING);

    Document doc3 = Jsoup.parse(html3);
    doc3.setBaseUri(opac_url);

    DetailledItem result = new DetailledItem();

    try {
        result.setId(doc.select("#bibtip_id").text().trim());
    } catch (Exception ex) {
        ex.printStackTrace();
    }

    // collect reservation / order links from the availability tab
    List<String> reservationlinks = new ArrayList<>();
    for (Element link : doc3.select("#vormerkung a, #tab-content a")) {
        String href = link.absUrl("href");
        Map<String, String> hrefq = getQueryParamsFirst(href);
        if (result.getId() == null) {
            // ID retrieval
            String key = hrefq.get("katkey");
            if (key != null) {
                result.setId(key);
                // NOTE(review): this break also ends the search for reservation links - confirm intended
                break;
            }
        }

        // reservation ("Vormerkung") or order ("Bestellung") link: keep its query string
        if (hrefq.get("methodToCall") != null) {
            if (hrefq.get("methodToCall").equals("doVormerkung")
                    || hrefq.get("methodToCall").equals("doBestellung")) {
                reservationlinks.add(href.split("\\?")[1]);
            }
        }
    }
    if (reservationlinks.size() == 1) {
        result.setReservable(true);
        result.setReservation_info(reservationlinks.get(0));
    } else if (reservationlinks.size() == 0) {
        result.setReservable(false);
    } else {
        // TODO: Multiple options - handle this case!
    }

    if (doc.select(".data td img").size() == 1) {
        result.setCover(doc.select(".data td img").first().attr("abs:src"));
        try {
            downloadCover(result);
        } catch (Exception ignored) {
            // a cover that cannot be downloaded is deliberately ignored; the item stays usable
        }
    }

    if (doc.select(".aw_teaser_title").size() == 1) {
        result.setTitle(doc.select(".aw_teaser_title").first().text().trim());
    } else if (doc.select(".data td strong").size() > 0) {
        result.setTitle(doc.select(".data td strong").first().text().trim());
    } else {
        result.setTitle("");
    }
    if (doc.select(".aw_teaser_title_zusatz").size() > 0) {
        result.addDetail(new Detail("Titelzusatz", doc.select(".aw_teaser_title_zusatz").text().trim()));
    }

    // First pass over the box-container cell: title/text are only carried over into the
    // second pass when a link ("hier klicken" or a "Link:" field) was found.
    String title = "";
    String text = "";
    boolean takeover = false;
    Element detailtrs = doc2.select(".box-container .data td").first();
    // first() is null when the selector matches nothing; the second lookup below already
    // checks for that, so guard here as well instead of failing with a NullPointerException
    if (detailtrs != null) {
        for (Node node : detailtrs.childNodes()) {
            if (node instanceof Element) {
                if (((Element) node).tagName().equals("strong")) {
                    title = ((Element) node).text().trim();
                    text = "";
                } else {
                    if (((Element) node).tagName().equals("a")
                            && (((Element) node).text().trim().contains("hier klicken") || title.equals("Link:"))) {
                        text = text + node.attr("href");
                        takeover = true;
                        break;
                    }
                }
            } else if (node instanceof TextNode) {
                text = text + ((TextNode) node).text();
            }
        }
    }
    if (!takeover) {
        text = "";
        title = "";
    }

    // Second pass: each <strong> starts a new field, everything up to the next <strong> is its value
    detailtrs = doc2.select("#tab-content .data td").first();
    if (detailtrs != null) {
        for (Node node : detailtrs.childNodes()) {
            if (node instanceof Element) {
                if (((Element) node).tagName().equals("strong")) {
                    if (!text.equals("") && !title.equals("")) {
                        result.addDetail(new Detail(title.trim(), text.trim()));
                        if (title.equals("Titel:")) {
                            result.setTitle(text.trim());
                        }
                        text = "";
                    }

                    title = ((Element) node).text().trim();
                } else {
                    if (((Element) node).tagName().equals("a")
                            && (((Element) node).text().trim().contains("hier klicken")
                                    || title.equals("Link:"))) {
                        text = text + node.attr("href");
                    } else {
                        text = text + ((Element) node).text();
                    }
                }
            } else if (node instanceof TextNode) {
                text = text + ((TextNode) node).text();
            }
        }
    } else {
        // alternative layout: a two-column table of field name / value
        if (doc2.select("#tab-content .fulltitle tr").size() > 0) {
            Elements rows = doc2.select("#tab-content .fulltitle tr");
            for (Element tr : rows) {
                if (tr.children().size() == 2) {
                    Element valcell = tr.child(1);
                    String value = valcell.text().trim();
                    if (valcell.select("a").size() == 1) {
                        value = valcell.select("a").first().absUrl("href");
                    }
                    result.addDetail(new Detail(tr.child(0).text().trim(), value));
                }
            }
        } else {
            result.addDetail(new Detail(stringProvider.getString(StringProvider.ERROR),
                    stringProvider.getString(StringProvider.COULD_NOT_LOAD_DETAIL)));
        }
    }
    // flush the field that was still being collected when the loop ended
    if (!text.equals("") && !title.equals("")) {
        result.addDetail(new Detail(title.trim(), text.trim()));
        if (title.equals("Titel:")) {
            result.setTitle(text.trim());
        }
    }
    for (Element link : doc3.select("#tab-content a")) {
        Map<String, String> hrefq = getQueryParamsFirst(link.absUrl("href"));
        if (result.getId() == null) {
            // ID retrieval
            String key = hrefq.get("katkey");
            if (key != null) {
                result.setId(key);
                break;
            }
        }
    }
    for (Element link : doc3.select(".box-container a")) {
        if (link.text().trim().equals("Download")) {
            result.addDetail(
                    new Detail(stringProvider.getString(StringProvider.DOWNLOAD), link.absUrl("href")));
        }
    }

    // Column positions of the copy table; the defaults are overridden by whatever the
    // header row announces.
    Map<String, Integer> copy_columnmap = new HashMap<>();
    // Default values
    copy_columnmap.put("barcode", 1);
    copy_columnmap.put("branch", 3);
    copy_columnmap.put("status", 4);
    Elements copy_columns = doc.select("#tab-content .data tr#bg2 th");
    for (int i = 0; i < copy_columns.size(); i++) {
        Element th = copy_columns.get(i);
        String head = th.text().trim();
        if (head.contains("Status")) {
            copy_columnmap.put("status", i);
        }
        if (head.contains("Zweigstelle")) {
            copy_columnmap.put("branch", i);
        }
        if (head.contains("Mediennummer")) {
            copy_columnmap.put("barcode", i);
        }
        if (head.contains("Standort")) {
            copy_columnmap.put("location", i);
        }
        if (head.contains("Signatur")) {
            copy_columnmap.put("signature", i);
        }
    }

    // group 1: status word, group 2: return date, group 3: number of reservations
    Pattern status_lent = Pattern.compile(
            "^(entliehen) bis ([0-9]{1,2}.[0-9]{1,2}.[0-9]{2," + "4}) \\(gesamte Vormerkungen: ([0-9]+)\\)$");
    // used when status and barcode share one column: trailing alphanumeric token is the barcode
    Pattern status_and_barcode = Pattern.compile("^(.*) ([0-9A-Za-z]+)$");

    Elements exemplartrs = doc.select("#tab-content .data tr").not("#bg2");
    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
    for (Element tr : exemplartrs) {
        try {
            Copy copy = new Copy();
            Element status = tr.child(copy_columnmap.get("status"));
            Element barcode = tr.child(copy_columnmap.get("barcode"));
            String barcodetext = barcode.text().trim().replace(" Wegweiser", "");

            // STATUS
            String statustext;
            if (status.getElementsByTag("b").size() > 0) {
                statustext = status.getElementsByTag("b").text().trim();
            } else {
                statustext = status.text().trim();
            }
            if (copy_columnmap.get("status").equals(copy_columnmap.get("barcode"))) {
                Matcher matcher1 = status_and_barcode.matcher(statustext);
                if (matcher1.matches()) {
                    statustext = matcher1.group(1);
                    barcodetext = matcher1.group(2);
                }
            }

            Matcher matcher = status_lent.matcher(statustext);
            if (matcher.matches()) {
                copy.setStatus(matcher.group(1));
                copy.setReservations(matcher.group(3));
                copy.setReturnDate(fmt.parseLocalDate(matcher.group(2)));
            } else {
                copy.setStatus(statustext);
            }
            copy.setBarcode(barcodetext);
            if (status.select("a[href*=doVormerkung]").size() == 1) {
                copy.setResInfo(status.select("a[href*=doVormerkung]").attr("href").split("\\?")[1]);
            }

            String branchtext = tr.child(copy_columnmap.get("branch")).text().trim().replace(" Wegweiser", "");
            copy.setBranch(branchtext);

            if (copy_columnmap.containsKey("location")) {
                copy.setLocation(
                        tr.child(copy_columnmap.get("location")).text().trim().replace(" Wegweiser", ""));
            }

            if (copy_columnmap.containsKey("signature")) {
                copy.setShelfmark(
                        tr.child(copy_columnmap.get("signature")).text().trim().replace(" Wegweiser", ""));
            }

            result.addCopy(copy);
        } catch (Exception ex) {
            // a row that does not fit the expected layout is skipped
            ex.printStackTrace();
        }
    }

    // Look for a "volumeSearch" link; the parameters seen up to and including that link
    // are handed over as volume-search information.
    try {
        Element isvolume = null;
        Map<String, String> volume = new HashMap<>();
        Elements links = doc.select(".data td a");
        int elcount = links.size();
        for (int eli = 0; eli < elcount; eli++) {
            List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(links.get(eli).attr("href")), "UTF-8");
            for (NameValuePair nv : anyurl) {
                if (nv.getName().equals("methodToCall") && nv.getValue().equals("volumeSearch")) {
                    isvolume = links.get(eli);
                } else if (nv.getName().equals("catKey")) {
                    volume.put("catKey", nv.getValue());
                } else if (nv.getName().equals("dbIdentifier")) {
                    volume.put("dbIdentifier", nv.getValue());
                }
            }
            if (isvolume != null) {
                volume.put("volume", "true");
                result.setVolumesearch(volume);
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    return result;
}

From source file:com.salsaberries.narchiver.Trawler.java

/**
 * Logs into the site./*from  ww  w  .  j  a  v a2s.  c  om*/
 *
 * @return
 * @throws TrawlException
 */
private boolean login() throws TrawlException {
    --loginAttempts;

    if (loginAttempts < 0) {
        logger.error("Warning! Exceeded maximum number of login attempts! Program is now exiting.");
        throw new TrawlException("Maximum login attempts exceeded.");
    }

    logger.info("Attempting to log in at " + baseURL + site.getString("LOGIN_URL"));

    try {

        // follow redirects until you get it right
        HttpRequest httpRequest;
        HttpMessage httpGet;
        String url = baseURL + site.getString("LOGIN_URL");

        while (true) {
            httpGet = new HttpMessage(HttpType.GET);
            httpGet.setUrl(url);
            httpGet.initializeDefaultHeaders(site);
            httpGet.addCookieHeaders(cookies);

            httpRequest = new HttpRequest(httpGet);

            if (httpRequest.getStatusCode() != 200) {
                getTempCookies(httpRequest.getHeaders());

                // Find the header I want
                boolean found = false;
                for (Header h : httpRequest.getHeaders()) {
                    if (h.getName().equals("Location")) {
                        url = h.getValue();
                        found = true;
                    }
                }

                if (!found) {
                    throw new TrawlException("Redirect loop.");
                }

            } else {
                break;
            }

        }

        // Get headers
        ArrayList<Header> headers = httpRequest.getHeaders();
        // Parse the cookies
        getTempCookies(headers);

        String body = httpRequest.getHtml();
        Document doc = Jsoup.parse(body);
        Elements logins = doc.getElementsByAttributeValue("action", site.getString("LOGIN_SUBMIT"));

        if (logins.isEmpty()) {
            logins = doc.getElementsByAttributeValue("action",
                    site.getString("BASE_URL") + site.getString("LOGIN_SUBMIT"));
        }
        if (logins.isEmpty()) {
            logins = doc.getElementsByAttributeValue("method", "POST");
        }

        if (logins.isEmpty()) {
            throw new TrawlException("Failed to find login form!");
        }
        if (logins.size() > 1) {
            logger.warn("Found multiple login forms. Picking the first one...");
        }

        Element login = logins.get(0);

        // Extract the captcha image if appropriate
        String captchaResult = "";
        if (!site.getString("CAPTCHA").equals("")) {
            // Download the captcha image
            HttpMessage getCaptcha = new HttpMessage(HttpType.GET);
            getCaptcha.setImage(true);
            if (!site.isNull("CAPTCHA_IMAGE")) {
                getCaptcha.setUrl(baseURL + site.getString("CAPTCHA_IMAGE"));

                getCaptcha.initializeDefaultImageHeaders(site);
                getCaptcha.addHeader(new Header("Referrer", baseURL + site.getString("LOGIN_URL")));
                getCaptcha.addCookieHeaders(cookies);

                // Send it to deathbycaptcha
                SocketClient client = new SocketClient("njanetos", "2point7182");
                HttpRequest image = new HttpRequest(getCaptcha);
                ByteArrayOutputStream os = new ByteArrayOutputStream();
                ImageIO.write(image.getImage(), "png", os);
                Captcha result = client.decode(os.toByteArray());
                captchaResult = result.toString();
            } else {
                // Just try to get the image
                Elements captchas = login.getElementsByTag("img");

                if (captchas.size() != 1) {
                    throw new TrawlException(
                            "Failed to find captcha, but the initialization file says there should be one.");
                }

                Element captchaImage = captchas.get(0);

                // Does it contain base64?
                if (captchaImage.attr("src").contains("base64")) {
                    String src = captchaImage.attr("src").split(",")[1];

                    byte image[] = Base64.decodeBase64(src);
                    ByteArrayOutputStream os = new ByteArrayOutputStream();
                    os.write(image);

                    SocketClient client = new SocketClient("njanetos", "2point7182");

                    Captcha result = client.decode(os.toByteArray());
                    captchaResult = result.toString();

                } else {
                    if (captchaImage.attr("src").contains(baseURL)) {
                        getCaptcha.setUrl(captchaImage.attr("src"));
                    } else {
                        getCaptcha.setUrl(baseURL + captchaImage.attr("src"));
                    }

                    getCaptcha.initializeDefaultImageHeaders(site);
                    getCaptcha.addHeader(new Header("Referrer", baseURL + site.getString("LOGIN_URL")));
                    getCaptcha.addCookieHeaders(cookies);

                    // Send it to deathbycaptcha
                    SocketClient client = new SocketClient("njanetos", "2point7182");
                    HttpRequest image = new HttpRequest(getCaptcha);
                    ByteArrayOutputStream os = new ByteArrayOutputStream();
                    ImageIO.write(image.getImage(), "png", os);
                    Captcha result = client.decode(os.toByteArray());
                    captchaResult = result.toString();
                }
            }

            logger.info("Decoded captcha: " + captchaResult);
        }

        // Grab any hidden fields
        Elements hidden = login.getElementsByAttributeValue("type", "hidden");

        // Build the post response
        HttpMessage httpPost = new HttpMessage(HttpType.POST);
        httpPost.initializeDefaultHeaders(site);
        httpPost.addCookieHeaders(cookies);
        // TODO: Read this from the html!
        httpPost.setUrl(baseURL + site.getString("LOGIN_SUBMIT"));

        httpPost.appendContent(site.getString("USERNAME_FIELD"), site.getString("USERNAME"));
        httpPost.appendContent(site.getString("PASSWORD_FIELD"), site.getString("PASSWORD"));
        if (!captchaResult.equals("")) {
            httpPost.appendContent(site.getString("CAPTCHA_FIELD"), captchaResult);
        }

        for (int i = 0; i < hidden.size(); ++i) {
            httpPost.appendContent(hidden.get(i).attr("name"), hidden.get(i).attr("value"));
        }

        // Add the submit info
        Element submit = login.getElementsByAttributeValue("type", "submit").get(0);
        httpPost.appendContent(submit.attr("name"), submit.attr("value"));

        // Add the referrer
        httpPost.addHeader(new Header("Referer", baseURL + site.getString("LOGIN_URL")));

        // Log in
        HttpRequest response = new HttpRequest(httpPost);
        headers = response.getHeaders();
        // Add any relevant cookies
        getTempCookies(headers);
        logger.info("Successfully logged in, response code: " + response.getStatusCode());

        // Were we redirected? If so, visit the redirection URL before continuing. 
        if (response.getStatusCode() == 302) {
            // Send a GET request to the redirection URL before continuing. 
            httpGet = new HttpMessage(HttpType.GET);
            httpGet.initializeDefaultHeaders(site);
            httpGet.addHeader(new Header("Referer", baseURL + site.getString("LOGIN_URL")));
            String redirectionURL = getRedirectionURL(headers);
            httpGet.setUrl(redirectionURL);
            httpGet.addCookieHeaders(cookies);

            httpRequest = new HttpRequest(httpGet);
            logger.debug("Visited redirected page. Status code " + httpRequest.getStatusCode());
        }

    } catch (ConnectionException | MalformedURLException | ProtocolException ex) {
        // Did not successfully log in
        logger.error(ex.getMessage());
        return false;
    } catch (IOException ex) {
        // Did not successfully log in
        logger.error(ex.getMessage());
        return false;
    } catch (Exception | InterruptedException ex) {
        // Did not successfully log in
        logger.error(ex.getMessage());
        return false;
    }

    // Did we successfully log in? Then return true.
    return true;

}

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Retrieve the text located in
 * a list (UL or OL) HTML element.
 * <p>
 * The text collected so far is first tidied up (a trailing new line and a trailing space
 * are removed, a colon is added unless the text already ends with '.', ':' or ';'), then
 * every list item is appended, separated by ';', and the whole list is closed with a
 * period and a new line. Both buffers are always modified in parallel. Empty buffers are
 * tolerated: nothing is trimmed and no colon is added in that case.
 *
 * @param element
 *       Element to be processed.
 * @param rawStr
 *       Current raw text string.
 * @param linkedStr
 *       Current text with hyperlinks.
 * @param ordered
 *       Whether the list is numbered or not.
 */
private void processListElement(Element element, StringBuilder rawStr, StringBuilder linkedStr,
        boolean ordered) {
    // possibly remove the last new line character
    // (length checks: charAt/deleteCharAt throw on an empty buffer)
    if (rawStr.length() > 0 && rawStr.charAt(rawStr.length() - 1) == '\n') {
        rawStr.deleteCharAt(rawStr.length() - 1);
        if (linkedStr.length() > 0) {
            linkedStr.deleteCharAt(linkedStr.length() - 1);
        }
    }

    // possibly remove preceding space
    if (rawStr.length() > 0 && rawStr.charAt(rawStr.length() - 1) == ' ') {
        rawStr.deleteCharAt(rawStr.length() - 1);
        if (linkedStr.length() > 0) {
            linkedStr.deleteCharAt(linkedStr.length() - 1);
        }
    }

    // possibly add a colon (only when there is preceding text to introduce the list)
    if (rawStr.length() > 0) {
        char last = rawStr.charAt(rawStr.length() - 1);
        if (last != '.' && last != ':' && last != ';') {
            rawStr.append(":");
            linkedStr.append(":");
        }
    }

    // process each list element
    int count = 1;
    for (Element listElt : element.getElementsByTag(XmlNames.ELT_LI)) {
        // add leading space
        rawStr.append(" ");
        linkedStr.append(" ");

        // possibly add number
        if (ordered) {
            rawStr.append(count + ") ");
            linkedStr.append(count + ") ");
        }
        count++;

        // get text and links
        processTextElement(listElt, rawStr, linkedStr);

        // possibly remove the last new line character
        if (rawStr.length() > 0 && rawStr.charAt(rawStr.length() - 1) == '\n') {
            rawStr.deleteCharAt(rawStr.length() - 1);
            if (linkedStr.length() > 0) {
                linkedStr.deleteCharAt(linkedStr.length() - 1);
            }
        }

        // add final separator
        rawStr.append(";");
        linkedStr.append(";");
    }

    // possibly remove last separator
    if (rawStr.length() > 0 && rawStr.charAt(rawStr.length() - 1) == ';') {
        rawStr.deleteCharAt(rawStr.length() - 1);
        if (linkedStr.length() > 0) {
            linkedStr.deleteCharAt(linkedStr.length() - 1);
        }
        // close the list with a period unless the last item already ends with one
        if (rawStr.length() == 0 || rawStr.charAt(rawStr.length() - 1) != '.') {
            rawStr.append(".");
            linkedStr.append(".");
        }
        rawStr.append("\n");
        linkedStr.append("\n");
    }
}

From source file:Leitura.Ecobertura.java

/**
 * Collects the names of the declared methods found in {@code document}.
 * <p>
 * Every non-blank {@code <tr>} row is scanned (presumably rows of a Cobertura
 * HTML source report, judging by the class name — confirm):
 * <ul>
 * <li>elements with class {@code comment} are removed from the row;</li>
 * <li>the digits of the {@code numLine} cell, when present, become the current
 * line number and are stored in {@code qtdeLinhas};</li>
 * <li>for each {@code <pre>} whose span text contains {@code "class"}, the
 * keyword spans are removed and, the first time only, the class name is stored
 * in {@code classe} and the current {@code qtdeLinhas} is added to
 * {@code excluiLinhas};</li>
 * <li>otherwise, if the span text contains a modifier keyword, the keyword
 * spans are removed and, unless the remaining text contains '=', '.' or '@',
 * every space-separated token containing '(' yields an {@code Informacoes}
 * (class, method name, current line) appended to {@code inf}.</li>
 * </ul>
 *
 * @throws IOException declared by the signature; nothing in this method visibly throws it.
 */
public void escreveTxt() throws IOException {
    boolean controleClasse = false; // true once the class name has been captured
    int linhaAtual = 0; // last line number successfully read from a "numLine" cell
    boolean linhaConhecida = false; // whether linhaAtual holds a real line number yet

    // Only the "tr" elements are of interest
    for (Element children : document.getElementsByTag("tr")) {
        if (StringUtils.isBlank(children.text())) {
            continue;
        }
        children.getElementsByClass("comment").remove();

        // ------------------- line number of this row -------------------
        // Keep only the digits of the cell; when there are none, the previous
        // line number stays in effect instead of failing later on an empty string.
        String digitos = somenteDigitos(children.getElementsByClass("numLine").text());
        if (!digitos.isEmpty()) {
            linhaAtual = Integer.parseInt(digitos);
            linhaConhecida = true;
            qtdeLinhas = linhaAtual;
        }

        for (Element element : children.getElementsByTag("pre")) {
            // Read once, before the keyword spans are removed below.
            String textoSpans = element.getElementsByTag("span").text();

            if (textoSpans.contains("class")) {
                // ------------------- class declaration -------------------
                element.select("span.keyword").remove();
                if (!controleClasse) {
                    excluiLinhas.add(qtdeLinhas);
                    classe = extraiNomeClasse(element.text().trim());
                    controleClasse = true;
                }
            } else if (contemModificador(textoSpans)) {
                // ------------------- method declaration -------------------
                element.select("span.keyword").remove();
                String texto = element.text();
                // Assignments, qualified calls and annotations are not declarations.
                if (texto.contains("=") || texto.contains(".") || texto.contains("@")) {
                    continue;
                }
                for (String token : texto.split(" ")) {
                    int parentese = token.indexOf('(');
                    if (parentese <= 0) {
                        // no '(' in this token, or nothing before it
                        continue;
                    }
                    metodo = token.substring(0, parentese).replace("\r", "").replace("\t", "")
                            .replace("\n", "");
                    if (!linhaConhecida) {
                        // no line number has been read yet, so there is nothing valid to record
                        continue;
                    }
                    informacoes = new Informacoes(classe, metodo, linhaAtual);
                    inf.add(informacoes);
                }
            }
        }
    }
}

/**
 * Returns the ASCII digits of the given text, in order, dropping every other character.
 */
private static String somenteDigitos(String texto) {
    StringBuilder digitos = new StringBuilder();
    for (int i = 0; i < texto.length(); i++) {
        char ch = texto.charAt(i);
        if (ch >= '0' && ch <= '9') {
            digitos.append(ch);
        }
    }
    return digitos.toString();
}

/**
 * Extracts the class name from the text of a class declaration line: ASCII letters
 * and underscores are kept (any other character is skipped) until a kept character
 * is directly followed by ' ', '{' or '<'.
 * NOTE(review): digits are skipped, so a name such as "Base64" yields "Base" — this
 * mirrors the previous behavior; confirm whether it is intended.
 */
private static String extraiNomeClasse(String texto) {
    StringBuilder nome = new StringBuilder();
    for (int i = 0; i < texto.length(); i++) {
        char ch = texto.charAt(i);
        boolean letra = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_';
        if (!letra) {
            continue;
        }
        nome.append(ch);
        if (i + 1 < texto.length()) {
            char proximo = texto.charAt(i + 1);
            if (proximo == ' ' || proximo == '{' || proximo == '<') {
                break; // end of the name
            }
        }
    }
    return nome.toString();
}

/**
 * Tells whether the given text contains one of the modifier keywords used to
 * recognize a method declaration line.
 */
private static boolean contemModificador(String texto) {
    String[] modificadores = { "private", "public", "protected", "static", "final", "native",
            "synchronized", "abstract", "threadsafe", "transient" };
    for (String modificador : modificadores) {
        if (texto.contains(modificador)) {
            return true;
        }
    }
    return false;
}